diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/regex.c | 310 | ||||
| -rw-r--r-- | src/regex.h | 14 | ||||
| -rw-r--r-- | src/syntax.c | 96 |
3 files changed, 181 insertions, 239 deletions
diff --git a/src/regex.c b/src/regex.c index 1f2a1f086de..3a25835f452 100644 --- a/src/regex.c +++ b/src/regex.c | |||
| @@ -1969,29 +1969,96 @@ struct range_table_work_area | |||
| 1969 | 1969 | ||
| 1970 | #if ! WIDE_CHAR_SUPPORT | 1970 | #if ! WIDE_CHAR_SUPPORT |
| 1971 | 1971 | ||
| 1972 | /* Map a string to the char class it names (if any). */ | 1972 | /* Parse a character class, i.e. string such as "[:name:]". *strp |
| 1973 | points to the string to be parsed and limit is length, in bytes, of | ||
| 1974 | that string. | ||
| 1975 | |||
| 1976 | If *strp points to a string that begins with "[:name:]", where name is | ||
| 1977 | a non-empty sequence of lower case letters, *strp will be advanced past the | ||
| 1978 | closing square bracket and RECC_* constant which maps to the name will be | ||
| 1979 | returned. If name is not a valid character class name, zero (RECC_ERROR) | ||
| 1980 | is returned. | ||
| 1981 | |||
| 1982 | Otherwise, if *strp doesn’t begin with "[:name:]", -1 is returned. | ||
| 1983 | |||
| 1984 | The function can be used on ASCII and multibyte (UTF-8-encoded) strings. | ||
| 1985 | */ | ||
| 1973 | re_wctype_t | 1986 | re_wctype_t |
| 1974 | re_wctype (const_re_char *str) | 1987 | re_wctype_parse (const unsigned char **strp, unsigned limit) |
| 1975 | { | 1988 | { |
| 1976 | const char *string = (const char *) str; | 1989 | const char *beg = (const char *)*strp, *it; |
| 1977 | if (STREQ (string, "alnum")) return RECC_ALNUM; | 1990 | |
| 1978 | else if (STREQ (string, "alpha")) return RECC_ALPHA; | 1991 | if (limit < 4 || beg[0] != '[' || beg[1] != ':') |
| 1979 | else if (STREQ (string, "word")) return RECC_WORD; | 1992 | return -1; |
| 1980 | else if (STREQ (string, "ascii")) return RECC_ASCII; | 1993 | |
| 1981 | else if (STREQ (string, "nonascii")) return RECC_NONASCII; | 1994 | beg += 2; /* skip opening ‘[:’ */ |
| 1982 | else if (STREQ (string, "graph")) return RECC_GRAPH; | 1995 | limit -= 3; /* opening ‘[:’ and half of closing ‘:]’; --limit handles rest */ |
| 1983 | else if (STREQ (string, "lower")) return RECC_LOWER; | 1996 | for (it = beg; it[0] != ':' || it[1] != ']'; ++it) |
| 1984 | else if (STREQ (string, "print")) return RECC_PRINT; | 1997 | if (!--limit) |
| 1985 | else if (STREQ (string, "punct")) return RECC_PUNCT; | 1998 | return -1; |
| 1986 | else if (STREQ (string, "space")) return RECC_SPACE; | 1999 | |
| 1987 | else if (STREQ (string, "upper")) return RECC_UPPER; | 2000 | *strp = (const unsigned char *)(it + 2); |
| 1988 | else if (STREQ (string, "unibyte")) return RECC_UNIBYTE; | 2001 | |
| 1989 | else if (STREQ (string, "multibyte")) return RECC_MULTIBYTE; | 2002 | /* Sort tests in the length=five case by frequency of the classes to minimise |
| 1990 | else if (STREQ (string, "digit")) return RECC_DIGIT; | 2003 | number of times we fail the comparison. The frequencies of character class |
| 1991 | else if (STREQ (string, "xdigit")) return RECC_XDIGIT; | 2004 | names used in Emacs sources as of 2016-07-27: |
| 1992 | else if (STREQ (string, "cntrl")) return RECC_CNTRL; | 2005 | |
| 1993 | else if (STREQ (string, "blank")) return RECC_BLANK; | 2006 | $ find \( -name \*.c -o -name \*.el \) -exec grep -h '\[:[a-z]*:]' {} + | |
| 1994 | else return 0; | 2007 | sed 's/]/]\n/g' |grep -o '\[:[a-z]*:]' |sort |uniq -c |sort -nr |
| 2008 | 213 [:alnum:] | ||
| 2009 | 104 [:alpha:] | ||
| 2010 | 62 [:space:] | ||
| 2011 | 39 [:digit:] | ||
| 2012 | 36 [:blank:] | ||
| 2013 | 26 [:word:] | ||
| 2014 | 26 [:upper:] | ||
| 2015 | 21 [:lower:] | ||
| 2016 | 10 [:xdigit:] | ||
| 2017 | 10 [:punct:] | ||
| 2018 | 10 [:ascii:] | ||
| 2019 | 4 [:nonascii:] | ||
| 2020 | 4 [:graph:] | ||
| 2021 | 2 [:print:] | ||
| 2022 | 2 [:cntrl:] | ||
| 2023 | 1 [:ff:] | ||
| 2024 | |||
| 2025 | If you update this list, consider also updating the chain of or’ed conditions | ||
| 2026 | in the execute_charset function. | ||
| 2027 | */ | ||
| 2028 | |||
| 2029 | switch (it - beg) { | ||
| 2030 | case 4: | ||
| 2031 | if (!memcmp (beg, "word", 4)) return RECC_WORD; | ||
| 2032 | break; | ||
| 2033 | case 5: | ||
| 2034 | if (!memcmp (beg, "alnum", 5)) return RECC_ALNUM; | ||
| 2035 | if (!memcmp (beg, "alpha", 5)) return RECC_ALPHA; | ||
| 2036 | if (!memcmp (beg, "space", 5)) return RECC_SPACE; | ||
| 2037 | if (!memcmp (beg, "digit", 5)) return RECC_DIGIT; | ||
| 2038 | if (!memcmp (beg, "blank", 5)) return RECC_BLANK; | ||
| 2039 | if (!memcmp (beg, "upper", 5)) return RECC_UPPER; | ||
| 2040 | if (!memcmp (beg, "lower", 5)) return RECC_LOWER; | ||
| 2041 | if (!memcmp (beg, "punct", 5)) return RECC_PUNCT; | ||
| 2042 | if (!memcmp (beg, "ascii", 5)) return RECC_ASCII; | ||
| 2043 | if (!memcmp (beg, "graph", 5)) return RECC_GRAPH; | ||
| 2044 | if (!memcmp (beg, "print", 5)) return RECC_PRINT; | ||
| 2045 | if (!memcmp (beg, "cntrl", 5)) return RECC_CNTRL; | ||
| 2046 | break; | ||
| 2047 | case 6: | ||
| 2048 | if (!memcmp (beg, "xdigit", 6)) return RECC_XDIGIT; | ||
| 2049 | break; | ||
| 2050 | case 7: | ||
| 2051 | if (!memcmp (beg, "unibyte", 7)) return RECC_UNIBYTE; | ||
| 2052 | break; | ||
| 2053 | case 8: | ||
| 2054 | if (!memcmp (beg, "nonascii", 8)) return RECC_NONASCII; | ||
| 2055 | break; | ||
| 2056 | case 9: | ||
| 2057 | if (!memcmp (beg, "multibyte", 9)) return RECC_MULTIBYTE; | ||
| 2058 | break; | ||
| 2059 | } | ||
| 2060 | |||
| 2061 | return RECC_ERROR; | ||
| 1995 | } | 2062 | } |
| 1996 | 2063 | ||
| 1997 | /* True if CH is in the char class CC. */ | 2064 | /* True if CH is in the char class CC. */ |
| @@ -2776,10 +2843,74 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax, | |||
| 2776 | { | 2843 | { |
| 2777 | boolean escaped_char = false; | 2844 | boolean escaped_char = false; |
| 2778 | const unsigned char *p2 = p; | 2845 | const unsigned char *p2 = p; |
| 2846 | re_wctype_t cc; | ||
| 2779 | re_wchar_t ch; | 2847 | re_wchar_t ch; |
| 2780 | 2848 | ||
| 2781 | if (p == pend) FREE_STACK_RETURN (REG_EBRACK); | 2849 | if (p == pend) FREE_STACK_RETURN (REG_EBRACK); |
| 2782 | 2850 | ||
| 2851 | /* See if we're at the beginning of a possible character | ||
| 2852 | class. */ | ||
| 2853 | if (syntax & RE_CHAR_CLASSES && | ||
| 2854 | (cc = re_wctype_parse(&p, pend - p)) != -1) | ||
| 2855 | { | ||
| 2856 | if (cc == 0) | ||
| 2857 | FREE_STACK_RETURN (REG_ECTYPE); | ||
| 2858 | |||
| 2859 | if (p == pend) | ||
| 2860 | FREE_STACK_RETURN (REG_EBRACK); | ||
| 2861 | |||
| 2862 | #ifndef emacs | ||
| 2863 | for (ch = 0; ch < (1 << BYTEWIDTH); ++ch) | ||
| 2864 | if (re_iswctype (btowc (ch), cc)) | ||
| 2865 | { | ||
| 2866 | c = TRANSLATE (ch); | ||
| 2867 | if (c < (1 << BYTEWIDTH)) | ||
| 2868 | SET_LIST_BIT (c); | ||
| 2869 | } | ||
| 2870 | #else /* emacs */ | ||
| 2871 | /* Most character classes in a multibyte match just set | ||
| 2872 | a flag. Exceptions are is_blank, is_digit, is_cntrl, and | ||
| 2873 | is_xdigit, since they can only match ASCII characters. | ||
| 2874 | We don't need to handle them for multibyte. They are | ||
| 2875 | distinguished by a negative wctype. */ | ||
| 2876 | |||
| 2877 | /* Setup the gl_state object to its buffer-defined value. | ||
| 2878 | This hardcodes the buffer-global syntax-table for ASCII | ||
| 2879 | chars, while the other chars will obey syntax-table | ||
| 2880 | properties. It's not ideal, but it's the way it's been | ||
| 2881 | done until now. */ | ||
| 2882 | SETUP_BUFFER_SYNTAX_TABLE (); | ||
| 2883 | |||
| 2884 | for (ch = 0; ch < 256; ++ch) | ||
| 2885 | { | ||
| 2886 | c = RE_CHAR_TO_MULTIBYTE (ch); | ||
| 2887 | if (! CHAR_BYTE8_P (c) | ||
| 2888 | && re_iswctype (c, cc)) | ||
| 2889 | { | ||
| 2890 | SET_LIST_BIT (ch); | ||
| 2891 | c1 = TRANSLATE (c); | ||
| 2892 | if (c1 == c) | ||
| 2893 | continue; | ||
| 2894 | if (ASCII_CHAR_P (c1)) | ||
| 2895 | SET_LIST_BIT (c1); | ||
| 2896 | else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0) | ||
| 2897 | SET_LIST_BIT (c1); | ||
| 2898 | } | ||
| 2899 | } | ||
| 2900 | SET_RANGE_TABLE_WORK_AREA_BIT | ||
| 2901 | (range_table_work, re_wctype_to_bit (cc)); | ||
| 2902 | #endif /* emacs */ | ||
| 2903 | /* In most cases the matching rule for char classes only | ||
| 2904 | uses the syntax table for multibyte chars, so that the | ||
| 2905 | content of the syntax-table is not hardcoded in the | ||
| 2906 | range_table. SPACE and WORD are the two exceptions. */ | ||
| 2907 | if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD))) | ||
| 2908 | bufp->used_syntax = 1; | ||
| 2909 | |||
| 2910 | /* Repeat the loop. */ | ||
| 2911 | continue; | ||
| 2912 | } | ||
| 2913 | |||
| 2783 | /* Don't translate yet. The range TRANSLATE(X..Y) cannot | 2914 | /* Don't translate yet. The range TRANSLATE(X..Y) cannot |
| 2784 | always be determined from TRANSLATE(X) and TRANSLATE(Y) | 2915 | always be determined from TRANSLATE(X) and TRANSLATE(Y) |
| 2785 | So the translation is done later in a loop. Example: | 2916 | So the translation is done later in a loop. Example: |
| @@ -2803,119 +2934,6 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax, | |||
| 2803 | break; | 2934 | break; |
| 2804 | } | 2935 | } |
| 2805 | 2936 | ||
| 2806 | /* See if we're at the beginning of a possible character | ||
| 2807 | class. */ | ||
| 2808 | |||
| 2809 | if (!escaped_char && | ||
| 2810 | syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') | ||
| 2811 | { | ||
| 2812 | /* Leave room for the null. */ | ||
| 2813 | unsigned char str[CHAR_CLASS_MAX_LENGTH + 1]; | ||
| 2814 | const unsigned char *class_beg; | ||
| 2815 | |||
| 2816 | PATFETCH (c); | ||
| 2817 | c1 = 0; | ||
| 2818 | class_beg = p; | ||
| 2819 | |||
| 2820 | /* If pattern is `[[:'. */ | ||
| 2821 | if (p == pend) FREE_STACK_RETURN (REG_EBRACK); | ||
| 2822 | |||
| 2823 | for (;;) | ||
| 2824 | { | ||
| 2825 | PATFETCH (c); | ||
| 2826 | if ((c == ':' && *p == ']') || p == pend) | ||
| 2827 | break; | ||
| 2828 | if (c1 < CHAR_CLASS_MAX_LENGTH) | ||
| 2829 | str[c1++] = c; | ||
| 2830 | else | ||
| 2831 | /* This is in any case an invalid class name. */ | ||
| 2832 | str[0] = '\0'; | ||
| 2833 | } | ||
| 2834 | str[c1] = '\0'; | ||
| 2835 | |||
| 2836 | /* If isn't a word bracketed by `[:' and `:]': | ||
| 2837 | undo the ending character, the letters, and | ||
| 2838 | leave the leading `:' and `[' (but set bits for | ||
| 2839 | them). */ | ||
| 2840 | if (c == ':' && *p == ']') | ||
| 2841 | { | ||
| 2842 | re_wctype_t cc = re_wctype (str); | ||
| 2843 | |||
| 2844 | if (cc == 0) | ||
| 2845 | FREE_STACK_RETURN (REG_ECTYPE); | ||
| 2846 | |||
| 2847 | /* Throw away the ] at the end of the character | ||
| 2848 | class. */ | ||
| 2849 | PATFETCH (c); | ||
| 2850 | |||
| 2851 | if (p == pend) FREE_STACK_RETURN (REG_EBRACK); | ||
| 2852 | |||
| 2853 | #ifndef emacs | ||
| 2854 | for (ch = 0; ch < (1 << BYTEWIDTH); ++ch) | ||
| 2855 | if (re_iswctype (btowc (ch), cc)) | ||
| 2856 | { | ||
| 2857 | c = TRANSLATE (ch); | ||
| 2858 | if (c < (1 << BYTEWIDTH)) | ||
| 2859 | SET_LIST_BIT (c); | ||
| 2860 | } | ||
| 2861 | #else /* emacs */ | ||
| 2862 | /* Most character classes in a multibyte match | ||
| 2863 | just set a flag. Exceptions are is_blank, | ||
| 2864 | is_digit, is_cntrl, and is_xdigit, since | ||
| 2865 | they can only match ASCII characters. We | ||
| 2866 | don't need to handle them for multibyte. | ||
| 2867 | They are distinguished by a negative wctype. */ | ||
| 2868 | |||
| 2869 | /* Setup the gl_state object to its buffer-defined | ||
| 2870 | value. This hardcodes the buffer-global | ||
| 2871 | syntax-table for ASCII chars, while the other chars | ||
| 2872 | will obey syntax-table properties. It's not ideal, | ||
| 2873 | but it's the way it's been done until now. */ | ||
| 2874 | SETUP_BUFFER_SYNTAX_TABLE (); | ||
| 2875 | |||
| 2876 | for (ch = 0; ch < 256; ++ch) | ||
| 2877 | { | ||
| 2878 | c = RE_CHAR_TO_MULTIBYTE (ch); | ||
| 2879 | if (! CHAR_BYTE8_P (c) | ||
| 2880 | && re_iswctype (c, cc)) | ||
| 2881 | { | ||
| 2882 | SET_LIST_BIT (ch); | ||
| 2883 | c1 = TRANSLATE (c); | ||
| 2884 | if (c1 == c) | ||
| 2885 | continue; | ||
| 2886 | if (ASCII_CHAR_P (c1)) | ||
| 2887 | SET_LIST_BIT (c1); | ||
| 2888 | else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0) | ||
| 2889 | SET_LIST_BIT (c1); | ||
| 2890 | } | ||
| 2891 | } | ||
| 2892 | SET_RANGE_TABLE_WORK_AREA_BIT | ||
| 2893 | (range_table_work, re_wctype_to_bit (cc)); | ||
| 2894 | #endif /* emacs */ | ||
| 2895 | /* In most cases the matching rule for char classes | ||
| 2896 | only uses the syntax table for multibyte chars, | ||
| 2897 | so that the content of the syntax-table is not | ||
| 2898 | hardcoded in the range_table. SPACE and WORD are | ||
| 2899 | the two exceptions. */ | ||
| 2900 | if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD))) | ||
| 2901 | bufp->used_syntax = 1; | ||
| 2902 | |||
| 2903 | /* Repeat the loop. */ | ||
| 2904 | continue; | ||
| 2905 | } | ||
| 2906 | else | ||
| 2907 | { | ||
| 2908 | /* Go back to right after the "[:". */ | ||
| 2909 | p = class_beg; | ||
| 2910 | SET_LIST_BIT ('['); | ||
| 2911 | |||
| 2912 | /* Because the `:' may start the range, we | ||
| 2913 | can't simply set bit and repeat the loop. | ||
| 2914 | Instead, just set it to C and handle below. */ | ||
| 2915 | c = ':'; | ||
| 2916 | } | ||
| 2917 | } | ||
| 2918 | |||
| 2919 | if (p < pend && p[0] == '-' && p[1] != ']') | 2937 | if (p < pend && p[0] == '-' && p[1] != ']') |
| 2920 | { | 2938 | { |
| 2921 | 2939 | ||
| @@ -4659,28 +4677,8 @@ execute_charset (const_re_char **pp, unsigned c, unsigned corig, bool unibyte) | |||
| 4659 | re_wchar_t range_start, range_end; | 4677 | re_wchar_t range_start, range_end; |
| 4660 | 4678 | ||
| 4661 | /* Sort tests by the most commonly used classes with some adjustment to which | 4679 | /* Sort tests by the most commonly used classes with some adjustment to which |
| 4662 | tests are easiest to perform. Frequencies of character class names used in | 4680 | tests are easiest to perform. Take a look at the comment in re_wctype_parse |
| 4663 | Emacs sources as of 2016-07-15: | 4681 | for the table with frequencies of character class names. */ |
| 4664 | |||
| 4665 | $ find \( -name \*.c -o -name \*.el \) -exec grep -h '\[:[a-z]*:]' {} + | | ||
| 4666 | sed 's/]/]\n/g' |grep -o '\[:[a-z]*:]' |sort |uniq -c |sort -nr | ||
| 4667 | 213 [:alnum:] | ||
| 4668 | 104 [:alpha:] | ||
| 4669 | 62 [:space:] | ||
| 4670 | 39 [:digit:] | ||
| 4671 | 36 [:blank:] | ||
| 4672 | 26 [:upper:] | ||
| 4673 | 24 [:word:] | ||
| 4674 | 21 [:lower:] | ||
| 4675 | 10 [:punct:] | ||
| 4676 | 10 [:ascii:] | ||
| 4677 | 9 [:xdigit:] | ||
| 4678 | 4 [:nonascii:] | ||
| 4679 | 4 [:graph:] | ||
| 4680 | 2 [:print:] | ||
| 4681 | 2 [:cntrl:] | ||
| 4682 | 1 [:ff:] | ||
| 4683 | */ | ||
| 4684 | 4682 | ||
| 4685 | if ((class_bits & BIT_MULTIBYTE) || | 4683 | if ((class_bits & BIT_MULTIBYTE) || |
| 4686 | (class_bits & BIT_ALNUM && ISALNUM (c)) || | 4684 | (class_bits & BIT_ALNUM && ISALNUM (c)) || |
diff --git a/src/regex.h b/src/regex.h index 817167a07ca..01b659addbb 100644 --- a/src/regex.h +++ b/src/regex.h | |||
| @@ -585,25 +585,13 @@ extern void regfree (regex_t *__preg); | |||
| 585 | /* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */ | 585 | /* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */ |
| 586 | # include <wchar.h> | 586 | # include <wchar.h> |
| 587 | # include <wctype.h> | 587 | # include <wctype.h> |
| 588 | #endif | ||
| 589 | 588 | ||
| 590 | #if WIDE_CHAR_SUPPORT | ||
| 591 | /* The GNU C library provides support for user-defined character classes | ||
| 592 | and the functions from ISO C amendment 1. */ | ||
| 593 | # ifdef CHARCLASS_NAME_MAX | ||
| 594 | # define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX | ||
| 595 | # else | ||
| 596 | /* This shouldn't happen but some implementation might still have this | ||
| 597 | problem. Use a reasonable default value. */ | ||
| 598 | # define CHAR_CLASS_MAX_LENGTH 256 | ||
| 599 | # endif | ||
| 600 | typedef wctype_t re_wctype_t; | 589 | typedef wctype_t re_wctype_t; |
| 601 | typedef wchar_t re_wchar_t; | 590 | typedef wchar_t re_wchar_t; |
| 602 | # define re_wctype wctype | 591 | # define re_wctype wctype |
| 603 | # define re_iswctype iswctype | 592 | # define re_iswctype iswctype |
| 604 | # define re_wctype_to_bit(cc) 0 | 593 | # define re_wctype_to_bit(cc) 0 |
| 605 | #else | 594 | #else |
| 606 | # define CHAR_CLASS_MAX_LENGTH 9 /* Namely, `multibyte'. */ | ||
| 607 | # ifndef emacs | 595 | # ifndef emacs |
| 608 | # define btowc(c) c | 596 | # define btowc(c) c |
| 609 | # endif | 597 | # endif |
| @@ -621,7 +609,7 @@ typedef enum { RECC_ERROR = 0, | |||
| 621 | } re_wctype_t; | 609 | } re_wctype_t; |
| 622 | 610 | ||
| 623 | extern char re_iswctype (int ch, re_wctype_t cc); | 611 | extern char re_iswctype (int ch, re_wctype_t cc); |
| 624 | extern re_wctype_t re_wctype (const unsigned char* str); | 612 | extern re_wctype_t re_wctype_parse (const unsigned char **strp, unsigned limit); |
| 625 | 613 | ||
| 626 | typedef int re_wchar_t; | 614 | typedef int re_wchar_t; |
| 627 | 615 | ||
diff --git a/src/syntax.c b/src/syntax.c index f8d987b377c..667de402ec4 100644 --- a/src/syntax.c +++ b/src/syntax.c | |||
| @@ -1691,44 +1691,22 @@ skip_chars (bool forwardp, Lisp_Object string, Lisp_Object lim, | |||
| 1691 | /* At first setup fastmap. */ | 1691 | /* At first setup fastmap. */ |
| 1692 | while (i_byte < size_byte) | 1692 | while (i_byte < size_byte) |
| 1693 | { | 1693 | { |
| 1694 | c = str[i_byte++]; | 1694 | if (handle_iso_classes) |
| 1695 | |||
| 1696 | if (handle_iso_classes && c == '[' | ||
| 1697 | && i_byte < size_byte | ||
| 1698 | && str[i_byte] == ':') | ||
| 1699 | { | 1695 | { |
| 1700 | const unsigned char *class_beg = str + i_byte + 1; | 1696 | const unsigned char *ch = str + i_byte; |
| 1701 | const unsigned char *class_end = class_beg; | 1697 | re_wctype_t cc = re_wctype_parse (&ch, size_byte - i_byte); |
| 1702 | const unsigned char *class_limit = str + size_byte - 2; | ||
| 1703 | /* Leave room for the null. */ | ||
| 1704 | unsigned char class_name[CHAR_CLASS_MAX_LENGTH + 1]; | ||
| 1705 | re_wctype_t cc; | ||
| 1706 | |||
| 1707 | if (class_limit - class_beg > CHAR_CLASS_MAX_LENGTH) | ||
| 1708 | class_limit = class_beg + CHAR_CLASS_MAX_LENGTH; | ||
| 1709 | |||
| 1710 | while (class_end < class_limit | ||
| 1711 | && *class_end >= 'a' && *class_end <= 'z') | ||
| 1712 | class_end++; | ||
| 1713 | |||
| 1714 | if (class_end == class_beg | ||
| 1715 | || *class_end != ':' || class_end[1] != ']') | ||
| 1716 | goto not_a_class_name; | ||
| 1717 | |||
| 1718 | memcpy (class_name, class_beg, class_end - class_beg); | ||
| 1719 | class_name[class_end - class_beg] = 0; | ||
| 1720 | |||
| 1721 | cc = re_wctype (class_name); | ||
| 1722 | if (cc == 0) | 1698 | if (cc == 0) |
| 1723 | error ("Invalid ISO C character class"); | 1699 | error ("Invalid ISO C character class"); |
| 1724 | 1700 | if (cc != -1) | |
| 1725 | iso_classes = Fcons (make_number (cc), iso_classes); | 1701 | { |
| 1726 | 1702 | iso_classes = Fcons (make_number (cc), iso_classes); | |
| 1727 | i_byte = class_end + 2 - str; | 1703 | i_byte = ch - str; |
| 1728 | continue; | 1704 | continue; |
| 1705 | } | ||
| 1729 | } | 1706 | } |
| 1730 | 1707 | ||
| 1731 | not_a_class_name: | 1708 | c = str[i_byte++]; |
| 1709 | |||
| 1732 | if (c == '\\') | 1710 | if (c == '\\') |
| 1733 | { | 1711 | { |
| 1734 | if (i_byte == size_byte) | 1712 | if (i_byte == size_byte) |
| @@ -1808,54 +1786,32 @@ skip_chars (bool forwardp, Lisp_Object string, Lisp_Object lim, | |||
| 1808 | while (i_byte < size_byte) | 1786 | while (i_byte < size_byte) |
| 1809 | { | 1787 | { |
| 1810 | int leading_code = str[i_byte]; | 1788 | int leading_code = str[i_byte]; |
| 1811 | c = STRING_CHAR_AND_LENGTH (str + i_byte, len); | ||
| 1812 | i_byte += len; | ||
| 1813 | 1789 | ||
| 1814 | if (handle_iso_classes && c == '[' | 1790 | if (handle_iso_classes) |
| 1815 | && i_byte < size_byte | ||
| 1816 | && STRING_CHAR (str + i_byte) == ':') | ||
| 1817 | { | 1791 | { |
| 1818 | const unsigned char *class_beg = str + i_byte + 1; | 1792 | const unsigned char *ch = str + i_byte; |
| 1819 | const unsigned char *class_end = class_beg; | 1793 | re_wctype_t cc = re_wctype_parse (&ch, size_byte - i_byte); |
| 1820 | const unsigned char *class_limit = str + size_byte - 2; | ||
| 1821 | /* Leave room for the null. */ | ||
| 1822 | unsigned char class_name[CHAR_CLASS_MAX_LENGTH + 1]; | ||
| 1823 | re_wctype_t cc; | ||
| 1824 | |||
| 1825 | if (class_limit - class_beg > CHAR_CLASS_MAX_LENGTH) | ||
| 1826 | class_limit = class_beg + CHAR_CLASS_MAX_LENGTH; | ||
| 1827 | |||
| 1828 | while (class_end < class_limit | ||
| 1829 | && *class_end >= 'a' && *class_end <= 'z') | ||
| 1830 | class_end++; | ||
| 1831 | |||
| 1832 | if (class_end == class_beg | ||
| 1833 | || *class_end != ':' || class_end[1] != ']') | ||
| 1834 | goto not_a_class_name_multibyte; | ||
| 1835 | |||
| 1836 | memcpy (class_name, class_beg, class_end - class_beg); | ||
| 1837 | class_name[class_end - class_beg] = 0; | ||
| 1838 | |||
| 1839 | cc = re_wctype (class_name); | ||
| 1840 | if (cc == 0) | 1794 | if (cc == 0) |
| 1841 | error ("Invalid ISO C character class"); | 1795 | error ("Invalid ISO C character class"); |
| 1842 | 1796 | if (cc != -1) | |
| 1843 | iso_classes = Fcons (make_number (cc), iso_classes); | 1797 | { |
| 1844 | 1798 | iso_classes = Fcons (make_number (cc), iso_classes); | |
| 1845 | i_byte = class_end + 2 - str; | 1799 | i_byte = ch - str; |
| 1846 | continue; | 1800 | continue; |
| 1801 | } | ||
| 1847 | } | 1802 | } |
| 1848 | 1803 | ||
| 1849 | not_a_class_name_multibyte: | 1804 | if (leading_code== '\\') |
| 1850 | if (c == '\\') | ||
| 1851 | { | 1805 | { |
| 1852 | if (i_byte == size_byte) | 1806 | if (++i_byte == size_byte) |
| 1853 | break; | 1807 | break; |
| 1854 | 1808 | ||
| 1855 | leading_code = str[i_byte]; | 1809 | leading_code = str[i_byte]; |
| 1856 | c = STRING_CHAR_AND_LENGTH (str + i_byte, len); | ||
| 1857 | i_byte += len; | ||
| 1858 | } | 1810 | } |
| 1811 | c = STRING_CHAR_AND_LENGTH (str + i_byte, len); | ||
| 1812 | i_byte += len; | ||
| 1813 | |||
| 1814 | |||
| 1859 | /* Treat `-' as range character only if another character | 1815 | /* Treat `-' as range character only if another character |
| 1860 | follows. */ | 1816 | follows. */ |
| 1861 | if (i_byte + 1 < size_byte | 1817 | if (i_byte + 1 < size_byte |