diff options
| author | Kenichi Handa | 2007-02-15 11:23:52 +0000 |
|---|---|---|
| committer | Kenichi Handa | 2007-02-15 11:23:52 +0000 |
| commit | cf9c99bcf551d4a50b029f6fc766b7403c846dd5 (patch) | |
| tree | 32465f5419610ab16c4b2d96f992b390d66883ea /src | |
| parent | b13abc5d94ee9e85223cb620b91206bda70d7685 (diff) | |
| download | emacs-cf9c99bcf551d4a50b029f6fc766b7403c846dd5.tar.gz emacs-cf9c99bcf551d4a50b029f6fc766b7403c846dd5.zip | |
(RE_STRING_CHAR, RE_STRING_CHAR_AND_LENGTH): New arg
multibyte. Callers changed.
(RE_CHAR_TO_MULTIBYTE, RE_CHAR_TO_UNIBYTE): New macros.
(MAKE_CHAR_MULTIBYTE, MAKE_CHAR_UNIBYTE): Deleted. Callers
changed to use RE_CHAR_TO_MULTIBYTE and RE_CHAR_TO_UNIBYTE
respectively.
(SETUP_ASCII_RANGE, SETUP_UNIBYTE_RANGE): New macros.
(SETUP_MULTIBYTE_RANGE): Generate more compact range_table.
(regex_compile): Make the compiled pattern usable both for
multibyte and unibyte targets.
(analyse_first): Make the fastmap usable both for multibyte and
unibyte targets.
(TRANSLATE_VIA_MULTIBYTE): Deleted.
(re_match_2_internal): Pay attention to the case that the
multibyteness of bufp and target may be different.
Diffstat (limited to 'src')
| -rw-r--r-- | src/regex.c | 453 |
1 files changed, 300 insertions, 153 deletions
diff --git a/src/regex.c b/src/regex.c index 177908cb751..782f758468f 100644 --- a/src/regex.c +++ b/src/regex.c | |||
| @@ -145,11 +145,18 @@ | |||
| 145 | 145 | ||
| 146 | # define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte) | 146 | # define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte) |
| 147 | # define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte) | 147 | # define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte) |
| 148 | # define RE_STRING_CHAR(p, s) \ | 148 | # define RE_STRING_CHAR(p, s, multibyte) \ |
| 149 | (multibyte ? (STRING_CHAR (p, s)) : (*(p))) | 149 | (multibyte ? (STRING_CHAR (p, s)) : (*(p))) |
| 150 | # define RE_STRING_CHAR_AND_LENGTH(p, s, len) \ | 150 | # define RE_STRING_CHAR_AND_LENGTH(p, s, len, multibyte) \ |
| 151 | (multibyte ? (STRING_CHAR_AND_LENGTH (p, s, len)) : ((len) = 1, *(p))) | 151 | (multibyte ? (STRING_CHAR_AND_LENGTH (p, s, len)) : ((len) = 1, *(p))) |
| 152 | 152 | ||
| 153 | # define RE_CHAR_TO_MULTIBYTE(c) unibyte_to_multibyte_table[(c)] | ||
| 154 | |||
| 155 | # define RE_CHAR_TO_UNIBYTE(c) \ | ||
| 156 | (ASCII_CHAR_P (c) ? (c) \ | ||
| 157 | : CHAR_BYTE8_P (c) ? CHAR_TO_BYTE8 (c) \ | ||
| 158 | : multibyte_char_to_unibyte_safe (c)) | ||
| 159 | |||
| 153 | /* Set C a (possibly converted to multibyte) character before P. P | 160 | /* Set C a (possibly converted to multibyte) character before P. P |
| 154 | points into a string which is the virtual concatenation of STR1 | 161 | points into a string which is the virtual concatenation of STR1 |
| 155 | (which ends at END1) or STR2 (which ends at END2). */ | 162 | (which ends at END1) or STR2 (which ends at END2). */ |
| @@ -165,7 +172,7 @@ | |||
| 165 | else \ | 172 | else \ |
| 166 | { \ | 173 | { \ |
| 167 | (c = ((p) == (str2) ? (end1) : (p))[-1]); \ | 174 | (c = ((p) == (str2) ? (end1) : (p))[-1]); \ |
| 168 | MAKE_CHAR_MULTIBYTE (c); \ | 175 | (c) = RE_CHAR_TO_MULTIBYTE (c); \ |
| 169 | } \ | 176 | } \ |
| 170 | } while (0) | 177 | } while (0) |
| 171 | 178 | ||
| @@ -174,12 +181,12 @@ | |||
| 174 | # define GET_CHAR_AFTER(c, p, len) \ | 181 | # define GET_CHAR_AFTER(c, p, len) \ |
| 175 | do { \ | 182 | do { \ |
| 176 | if (multibyte) \ | 183 | if (multibyte) \ |
| 177 | c = STRING_CHAR_AND_LENGTH (p, 0, len); \ | 184 | (c) = STRING_CHAR_AND_LENGTH (p, 0, len); \ |
| 178 | else \ | 185 | else \ |
| 179 | { \ | 186 | { \ |
| 180 | c = *p; \ | 187 | (c) = *p; \ |
| 181 | len = 1; \ | 188 | len = 1; \ |
| 182 | MAKE_CHAR_MULTIBYTE (c); \ | 189 | (c) = RE_CHAR_TO_MULTIBYTE (c); \ |
| 183 | } \ | 190 | } \ |
| 184 | } while (0) | 191 | } while (0) |
| 185 | 192 | ||
| @@ -301,10 +308,12 @@ enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 }; | |||
| 301 | # define MULTIBYTE_FORM_LENGTH(p, s) (1) | 308 | # define MULTIBYTE_FORM_LENGTH(p, s) (1) |
| 302 | # define PREV_CHAR_BOUNDARY(p, limit) ((p)--) | 309 | # define PREV_CHAR_BOUNDARY(p, limit) ((p)--) |
| 303 | # define STRING_CHAR(p, s) (*(p)) | 310 | # define STRING_CHAR(p, s) (*(p)) |
| 304 | # define RE_STRING_CHAR STRING_CHAR | 311 | # define RE_STRING_CHAR(p, s, multibyte) STRING_CHAR ((p), (s)) |
| 305 | # define CHAR_STRING(c, s) (*(s) = (c), 1) | 312 | # define CHAR_STRING(c, s) (*(s) = (c), 1) |
| 306 | # define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p)) | 313 | # define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p)) |
| 307 | # define RE_STRING_CHAR_AND_LENGTH STRING_CHAR_AND_LENGTH | 314 | # define RE_STRING_CHAR_AND_LENGTH(p, s, multibyte) STRING_CHAR_AND_LENGTH ((p), (s)) |
| 315 | # define RE_CHAR_TO_MULTIBYTE(c) (c) | ||
| 316 | # define RE_CHAR_TO_UNIBYTE(c) (c) | ||
| 308 | # define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \ | 317 | # define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \ |
| 309 | (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1))) | 318 | (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1))) |
| 310 | # define GET_CHAR_AFTER(c, p, len) \ | 319 | # define GET_CHAR_AFTER(c, p, len) \ |
| @@ -312,8 +321,6 @@ enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 }; | |||
| 312 | # define MAKE_CHAR(charset, c1, c2) (c1) | 321 | # define MAKE_CHAR(charset, c1, c2) (c1) |
| 313 | # define BYTE8_TO_CHAR(c) (c) | 322 | # define BYTE8_TO_CHAR(c) (c) |
| 314 | # define CHAR_BYTE8_P(c) (0) | 323 | # define CHAR_BYTE8_P(c) (0) |
| 315 | # define MAKE_CHAR_MULTIBYTE(c) (c) | ||
| 316 | # define MAKE_CHAR_UNIBYTE(c) (c) | ||
| 317 | # define CHAR_LEADING_CODE(c) (c) | 324 | # define CHAR_LEADING_CODE(c) (c) |
| 318 | 325 | ||
| 319 | #endif /* not emacs */ | 326 | #endif /* not emacs */ |
| @@ -1761,7 +1768,7 @@ static int analyse_first _RE_ARGS ((re_char *p, re_char *pend, | |||
| 1761 | do { \ | 1768 | do { \ |
| 1762 | int len; \ | 1769 | int len; \ |
| 1763 | if (p == pend) return REG_EEND; \ | 1770 | if (p == pend) return REG_EEND; \ |
| 1764 | c = RE_STRING_CHAR_AND_LENGTH (p, pend - p, len); \ | 1771 | c = RE_STRING_CHAR_AND_LENGTH (p, pend - p, len, multibyte); \ |
| 1765 | p += len; \ | 1772 | p += len; \ |
| 1766 | } while (0) | 1773 | } while (0) |
| 1767 | 1774 | ||
| @@ -2019,32 +2026,107 @@ struct range_table_work_area | |||
| 2019 | 2026 | ||
| 2020 | #ifdef emacs | 2027 | #ifdef emacs |
| 2021 | 2028 | ||
| 2022 | /* Store characters in the rage range C0 to C1 in WORK_AREA while | 2029 | /* Store characters in the range FROM to TO in the bitmap at B (for |
| 2023 | translating them and paying attention to the continuity of | 2030 | ASCII and unibyte characters) and WORK_AREA (for multibyte |
| 2024 | translated characters. | 2031 | characters) while translating them and paying attention to the |
| 2032 | continuity of translated characters. | ||
| 2025 | 2033 | ||
| 2026 | Implementation note: It is better to implement this fairly big | 2034 | Implementation note: It is better to implement these fairly big |
| 2027 | macro by a function, but it's not that easy because macros called | 2035 | macros by a function, but it's not that easy because macros called |
| 2028 | in this macro assume various local variables already declared. */ | 2036 | in this macro assume various local variables already declared. */ |
| 2029 | 2037 | ||
| 2030 | #define SETUP_MULTIBYTE_RANGE(work_area, c0, c1) \ | 2038 | /* Both FROM and TO are ASCII characters. */ |
| 2031 | do { \ | 2039 | |
| 2032 | re_wchar_t c, t, t_last; \ | 2040 | #define SETUP_ASCII_RANGE(work_area, FROM, TO) \ |
| 2033 | int n; \ | 2041 | do { \ |
| 2034 | \ | 2042 | int C0, C1; \ |
| 2035 | c = (c0); \ | 2043 | \ |
| 2036 | t_last = multibyte ? TRANSLATE (c) : TRANSLATE (MAKE_CHAR_MULTIBYTE (c)); \ | 2044 | for (C0 = (FROM); C0 <= (TO); C0++) \ |
| 2037 | for (c++, n = 1; c <= (c1); c++, n++) \ | 2045 | { \ |
| 2038 | { \ | 2046 | C1 = TRANSLATE (C0); \ |
| 2039 | t = multibyte ? TRANSLATE (c) : TRANSLATE (MAKE_CHAR_MULTIBYTE (c)); \ | 2047 | if (! ASCII_CHAR_P (C1)) \ |
| 2040 | if (t_last + n == t) \ | 2048 | { \ |
| 2041 | continue; \ | 2049 | SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1); \ |
| 2042 | SET_RANGE_TABLE_WORK_AREA ((work_area), t_last, t_last + n - 1); \ | 2050 | if ((C1 = RE_CHAR_TO_UNIBYTE (C1)) < 0) \ |
| 2043 | t_last = t; \ | 2051 | C1 = C0; \ |
| 2044 | n = 0; \ | 2052 | } \ |
| 2045 | } \ | 2053 | SET_LIST_BIT (C1); \ |
| 2046 | if (n > 0) \ | 2054 | } \ |
| 2047 | SET_RANGE_TABLE_WORK_AREA ((work_area), t_last, t_last + n - 1); \ | 2055 | } while (0) |
| 2056 | |||
| 2057 | |||
| 2058 | /* Both FROM and TO are unibyte characters (0x80..0xFF). */ | ||
| 2059 | |||
| 2060 | #define SETUP_UNIBYTE_RANGE(work_area, FROM, TO) \ | ||
| 2061 | do { \ | ||
| 2062 | int C0, C1, C2, I; \ | ||
| 2063 | int USED = RANGE_TABLE_WORK_USED (work_area); \ | ||
| 2064 | \ | ||
| 2065 | for (C0 = (FROM); C0 <= (TO); C0++) \ | ||
| 2066 | { \ | ||
| 2067 | C1 = RE_CHAR_TO_MULTIBYTE (C0); \ | ||
| 2068 | if (CHAR_BYTE8_P (C1)) \ | ||
| 2069 | SET_LIST_BIT (C0); \ | ||
| 2070 | else \ | ||
| 2071 | { \ | ||
| 2072 | C2 = TRANSLATE (C1); \ | ||
| 2073 | if (C2 == C1 \ | ||
| 2074 | || (C1 = RE_CHAR_TO_UNIBYTE (C2)) < 0) \ | ||
| 2075 | C1 = C0; \ | ||
| 2076 | SET_LIST_BIT (C1); \ | ||
| 2077 | for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \ | ||
| 2078 | { \ | ||
| 2079 | int from = RANGE_TABLE_WORK_ELT (work_area, I); \ | ||
| 2080 | int to = RANGE_TABLE_WORK_ELT (work_area, I + 1); \ | ||
| 2081 | \ | ||
| 2082 | if (C2 >= from - 1 && C2 <= to + 1) \ | ||
| 2083 | { \ | ||
| 2084 | if (C2 == from - 1) \ | ||
| 2085 | RANGE_TABLE_WORK_ELT (work_area, I)--; \ | ||
| 2086 | else if (C2 == to + 1) \ | ||
| 2087 | RANGE_TABLE_WORK_ELT (work_area, I + 1)++; \ | ||
| 2088 | break; \ | ||
| 2089 | } \ | ||
| 2090 | } \ | ||
| 2091 | if (I < USED) \ | ||
| 2092 | SET_RANGE_TABLE_WORK_AREA ((work_area), C2, C2); \ | ||
| 2093 | } \ | ||
| 2094 | } \ | ||
| 2095 | } while (0) | ||
| 2096 | |||
| 2097 | |||
| 2098 | /* Both FROM and TO are multibyte characters. */ | ||
| 2099 | |||
| 2100 | #define SETUP_MULTIBYTE_RANGE(work_area, FROM, TO) \ | ||
| 2101 | do { \ | ||
| 2102 | int C0, C1, C2, I, USED = RANGE_TABLE_WORK_USED (work_area); \ | ||
| 2103 | \ | ||
| 2104 | SET_RANGE_TABLE_WORK_AREA ((work_area), (FROM), (TO)); \ | ||
| 2105 | for (C0 = (FROM); C0 <= (TO); C0++) \ | ||
| 2106 | { \ | ||
| 2107 | C1 = TRANSLATE (C0); \ | ||
| 2108 | if ((C2 = RE_CHAR_TO_UNIBYTE (C1)) >= 0 \ | ||
| 2109 | || (C1 != C0 && (C2 = RE_CHAR_TO_UNIBYTE (C0)) >= 0)) \ | ||
| 2110 | SET_LIST_BIT (C2); \ | ||
| 2111 | if (C1 >= (FROM) && C1 <= (TO)) \ | ||
| 2112 | continue; \ | ||
| 2113 | for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \ | ||
| 2114 | { \ | ||
| 2115 | int from = RANGE_TABLE_WORK_ELT (work_area, I); \ | ||
| 2116 | int to = RANGE_TABLE_WORK_ELT (work_area, I + 1); \ | ||
| 2117 | \ | ||
| 2118 | if (C1 >= from - 1 && C1 <= to + 1) \ | ||
| 2119 | { \ | ||
| 2120 | if (C1 == from - 1) \ | ||
| 2121 | RANGE_TABLE_WORK_ELT (work_area, I)--; \ | ||
| 2122 | else if (C1 == to + 1) \ | ||
| 2123 | RANGE_TABLE_WORK_ELT (work_area, I + 1)++; \ | ||
| 2124 | break; \ | ||
| 2125 | } \ | ||
| 2126 | } \ | ||
| 2127 | if (I < USED) \ | ||
| 2128 | SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1); \ | ||
| 2129 | } \ | ||
| 2048 | } while (0) | 2130 | } while (0) |
| 2049 | 2131 | ||
| 2050 | #endif /* emacs */ | 2132 | #endif /* emacs */ |
| @@ -2904,6 +2986,7 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 2904 | { | 2986 | { |
| 2905 | boolean escaped_char = false; | 2987 | boolean escaped_char = false; |
| 2906 | const unsigned char *p2 = p; | 2988 | const unsigned char *p2 = p; |
| 2989 | re_wchar_t ch, c2; | ||
| 2907 | 2990 | ||
| 2908 | if (p == pend) FREE_STACK_RETURN (REG_EBRACK); | 2991 | if (p == pend) FREE_STACK_RETURN (REG_EBRACK); |
| 2909 | 2992 | ||
| @@ -2966,7 +3049,6 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 2966 | them). */ | 3049 | them). */ |
| 2967 | if (c == ':' && *p == ']') | 3050 | if (c == ':' && *p == ']') |
| 2968 | { | 3051 | { |
| 2969 | re_wchar_t ch; | ||
| 2970 | re_wctype_t cc; | 3052 | re_wctype_t cc; |
| 2971 | int limit; | 3053 | int limit; |
| 2972 | 3054 | ||
| @@ -2981,41 +3063,41 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 2981 | 3063 | ||
| 2982 | if (p == pend) FREE_STACK_RETURN (REG_EBRACK); | 3064 | if (p == pend) FREE_STACK_RETURN (REG_EBRACK); |
| 2983 | 3065 | ||
| 2984 | /* Most character classes in a multibyte match | 3066 | #ifndef emacs |
| 2985 | just set a flag. Exceptions are is_blank, | 3067 | for (ch = 0; ch < (1 << BYTEWIDTH); ++ch) |
| 2986 | is_digit, is_cntrl, and is_xdigit, since | ||
| 2987 | they can only match ASCII characters. We | ||
| 2988 | don't need to handle them for multibyte. | ||
| 2989 | They are distinguished by a negative wctype. */ | ||
| 2990 | |||
| 2991 | for (ch = 0; ch < 128; ++ch) | ||
| 2992 | if (re_iswctype (btowc (ch), cc)) | 3068 | if (re_iswctype (btowc (ch), cc)) |
| 2993 | { | 3069 | { |
| 2994 | c = TRANSLATE (ch); | 3070 | c = TRANSLATE (ch); |
| 2995 | if (c < (1 << BYTEWIDTH)) | 3071 | if (c < (1 << BYTEWIDTH)) |
| 2996 | SET_LIST_BIT (c); | 3072 | SET_LIST_BIT (c); |
| 2997 | } | 3073 | } |
| 3074 | #else /* emacs */ | ||
| 3075 | /* Most character classes in a multibyte match | ||
| 3076 | just set a flag. Exceptions are is_blank, | ||
| 3077 | is_digit, is_cntrl, and is_xdigit, since | ||
| 3078 | they can only match ASCII characters. We | ||
| 3079 | don't need to handle them for multibyte. | ||
| 3080 | They are distinguished by a negative wctype. */ | ||
| 2998 | 3081 | ||
| 2999 | if (target_multibyte) | 3082 | for (ch = 0; ch < 256; ++ch) |
| 3000 | { | 3083 | { |
| 3001 | SET_RANGE_TABLE_WORK_AREA_BIT | 3084 | c = RE_CHAR_TO_MULTIBYTE (ch); |
| 3002 | (range_table_work, re_wctype_to_bit (cc)); | 3085 | if (! CHAR_BYTE8_P (c) |
| 3003 | } | 3086 | && re_iswctype (c, cc)) |
| 3004 | else | ||
| 3005 | { | ||
| 3006 | for (ch = 0; ch < (1 << BYTEWIDTH); ++ch) | ||
| 3007 | { | 3087 | { |
| 3008 | c = ch; | 3088 | SET_LIST_BIT (ch); |
| 3009 | MAKE_CHAR_MULTIBYTE (c); | 3089 | c1 = TRANSLATE (c); |
| 3010 | if (re_iswctype (btowc (c), cc)) | 3090 | if (c1 == c) |
| 3011 | { | 3091 | continue; |
| 3012 | c = TRANSLATE (c); | 3092 | if (ASCII_CHAR_P (c1)) |
| 3013 | MAKE_CHAR_UNIBYTE (c); | 3093 | SET_LIST_BIT (c1); |
| 3014 | SET_LIST_BIT (c); | 3094 | else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0) |
| 3015 | } | 3095 | SET_LIST_BIT (c1); |
| 3016 | } | 3096 | } |
| 3017 | } | 3097 | } |
| 3018 | 3098 | SET_RANGE_TABLE_WORK_AREA_BIT | |
| 3099 | (range_table_work, re_wctype_to_bit (cc)); | ||
| 3100 | #endif /* emacs */ | ||
| 3019 | /* In most cases the matching rule for char classes | 3101 | /* In most cases the matching rule for char classes |
| 3020 | only uses the syntax table for multibyte chars, | 3102 | only uses the syntax table for multibyte chars, |
| 3021 | so that the content of the syntax-table it is not | 3103 | so that the content of the syntax-table it is not |
| @@ -3048,51 +3130,63 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 3048 | 3130 | ||
| 3049 | /* Fetch the character which ends the range. */ | 3131 | /* Fetch the character which ends the range. */ |
| 3050 | PATFETCH (c1); | 3132 | PATFETCH (c1); |
| 3051 | if (c > c1) | 3133 | #ifdef emacs |
| 3052 | { | 3134 | if (CHAR_BYTE8_P (c1) |
| 3053 | if (syntax & RE_NO_EMPTY_RANGES) | 3135 | && ! ASCII_CHAR_P (c) && ! CHAR_BYTE8_P (c)) |
| 3054 | FREE_STACK_RETURN (REG_ERANGEX); | 3136 | /* Treat the range from a multibyte character to |
| 3055 | /* Else, repeat the loop. */ | 3137 | raw-byte character as empty. */ |
| 3056 | } | 3138 | c = c1 + 1; |
| 3139 | #endif /* emacs */ | ||
| 3057 | } | 3140 | } |
| 3058 | else | 3141 | else |
| 3059 | /* Range from C to C. */ | 3142 | /* Range from C to C. */ |
| 3060 | c1 = c; | 3143 | c1 = c; |
| 3061 | 3144 | ||
| 3062 | #ifndef emacs | 3145 | if (c > c1) |
| 3063 | c = TRANSLATE (c); | ||
| 3064 | c1 = TRANSLATE (c1); | ||
| 3065 | /* Set the range into bitmap */ | ||
| 3066 | for (; c <= c1; c++) | ||
| 3067 | SET_LIST_BIT (TRANSLATE (c)); | ||
| 3068 | #else /* not emacs */ | ||
| 3069 | if (target_multibyte) | ||
| 3070 | { | 3146 | { |
| 3071 | if (c1 >= 128) | 3147 | if (syntax & RE_NO_EMPTY_RANGES) |
| 3072 | { | 3148 | FREE_STACK_RETURN (REG_ERANGEX); |
| 3073 | re_wchar_t c0 = MAX (c, 128); | 3149 | /* Else, repeat the loop. */ |
| 3074 | |||
| 3075 | SETUP_MULTIBYTE_RANGE (range_table_work, c0, c1); | ||
| 3076 | c1 = 127; | ||
| 3077 | } | ||
| 3078 | for (; c <= c1; c++) | ||
| 3079 | SET_LIST_BIT (TRANSLATE (c)); | ||
| 3080 | } | 3150 | } |
| 3081 | else | 3151 | else |
| 3082 | { | 3152 | { |
| 3083 | re_wchar_t c0; | 3153 | #ifndef emacs |
| 3084 | 3154 | /* Set the range into bitmap */ | |
| 3085 | for (; c <= c1; c++) | 3155 | for (; c <= c1; c++) |
| 3086 | { | 3156 | { |
| 3087 | c0 = c; | 3157 | ch = TRANSLATE (c); |
| 3088 | if (! multibyte) | 3158 | if (ch < (1 << BYTEWIDTH)) |
| 3089 | MAKE_CHAR_MULTIBYTE (c0); | 3159 | SET_LIST_BIT (ch); |
| 3090 | c0 = TRANSLATE (c0); | 3160 | } |
| 3091 | MAKE_CHAR_UNIBYTE (c0); | 3161 | #else /* emacs */ |
| 3092 | SET_LIST_BIT (c0); | 3162 | if (c < 128) |
| 3163 | { | ||
| 3164 | ch = MIN (127, c1); | ||
| 3165 | SETUP_ASCII_RANGE (range_table_work, c, ch); | ||
| 3166 | c = ch + 1; | ||
| 3167 | if (CHAR_BYTE8_P (c1)) | ||
| 3168 | c = BYTE8_TO_CHAR (128); | ||
| 3169 | } | ||
| 3170 | if (c <= c1) | ||
| 3171 | { | ||
| 3172 | if (CHAR_BYTE8_P (c)) | ||
| 3173 | { | ||
| 3174 | c = CHAR_TO_BYTE8 (c); | ||
| 3175 | c1 = CHAR_TO_BYTE8 (c1); | ||
| 3176 | for (; c <= c1; c++) | ||
| 3177 | SET_LIST_BIT (c); | ||
| 3178 | } | ||
| 3179 | else if (multibyte) | ||
| 3180 | { | ||
| 3181 | SETUP_MULTIBYTE_RANGE (range_table_work, c, c1); | ||
| 3182 | } | ||
| 3183 | else | ||
| 3184 | { | ||
| 3185 | SETUP_UNIBYTE_RANGE (range_table_work, c, c1); | ||
| 3186 | } | ||
| 3093 | } | 3187 | } |
| 3188 | #endif /* emacs */ | ||
| 3094 | } | 3189 | } |
| 3095 | #endif /* not emacs */ | ||
| 3096 | } | 3190 | } |
| 3097 | 3191 | ||
| 3098 | /* Discard any (non)matching list bytes that are all 0 at the | 3192 | /* Discard any (non)matching list bytes that are all 0 at the |
| @@ -3677,17 +3771,22 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 3677 | { | 3771 | { |
| 3678 | int len; | 3772 | int len; |
| 3679 | 3773 | ||
| 3680 | if (! multibyte) | 3774 | if (multibyte) |
| 3681 | MAKE_CHAR_MULTIBYTE (c); | ||
| 3682 | c = TRANSLATE (c); | ||
| 3683 | if (target_multibyte) | ||
| 3684 | { | 3775 | { |
| 3776 | c = TRANSLATE (c); | ||
| 3685 | len = CHAR_STRING (c, b); | 3777 | len = CHAR_STRING (c, b); |
| 3686 | b += len; | 3778 | b += len; |
| 3687 | } | 3779 | } |
| 3688 | else | 3780 | else |
| 3689 | { | 3781 | { |
| 3690 | MAKE_CHAR_UNIBYTE (c); | 3782 | c1 = RE_CHAR_TO_MULTIBYTE (c); |
| 3783 | if (! CHAR_BYTE8_P (c1)) | ||
| 3784 | { | ||
| 3785 | re_wchar_t c2 = TRANSLATE (c1); | ||
| 3786 | |||
| 3787 | if (c1 != c2 && (c1 = RE_CHAR_TO_UNIBYTE (c2)) >= 0) | ||
| 3788 | c = c1; | ||
| 3789 | } | ||
| 3691 | *b++ = c; | 3790 | *b++ = c; |
| 3692 | len = 1; | 3791 | len = 1; |
| 3693 | } | 3792 | } |
| @@ -3714,11 +3813,6 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 3714 | /* We have succeeded; set the length of the buffer. */ | 3813 | /* We have succeeded; set the length of the buffer. */ |
| 3715 | bufp->used = b - bufp->buffer; | 3814 | bufp->used = b - bufp->buffer; |
| 3716 | 3815 | ||
| 3717 | #ifdef emacs | ||
| 3718 | /* Now the buffer is adjusted for the multibyteness of a target. */ | ||
| 3719 | bufp->multibyte = bufp->target_multibyte; | ||
| 3720 | #endif | ||
| 3721 | |||
| 3722 | #ifdef DEBUG | 3816 | #ifdef DEBUG |
| 3723 | if (debug > 0) | 3817 | if (debug > 0) |
| 3724 | { | 3818 | { |
| @@ -3964,11 +4058,23 @@ analyse_first (p, pend, fastmap, multibyte) | |||
| 3964 | 4058 | ||
| 3965 | case exactn: | 4059 | case exactn: |
| 3966 | if (fastmap) | 4060 | if (fastmap) |
| 3967 | /* If multibyte is nonzero, the first byte of each | 4061 | { |
| 3968 | character is an ASCII or a leading code. Otherwise, | 4062 | /* If multibyte is nonzero, the first byte of each |
| 3969 | each byte is a character. Thus, this works in both | 4063 | character is an ASCII or a leading code. Otherwise, |
| 3970 | cases. */ | 4064 | each byte is a character. Thus, this works in both |
| 3971 | fastmap[p[1]] = 1; | 4065 | cases. */ |
| 4066 | fastmap[p[1]] = 1; | ||
| 4067 | if (! multibyte) | ||
| 4068 | { | ||
| 4069 | /* For the case of matching this unibyte regex | ||
| 4070 | against multibyte, we must set a leading code of | ||
| 4071 | the corresponding multibyte character. */ | ||
| 4072 | int c = RE_CHAR_TO_MULTIBYTE (p[1]); | ||
| 4073 | |||
| 4074 | if (! CHAR_BYTE8_P (c)) | ||
| 4075 | fastmap[CHAR_LEADING_CODE (c)] = 1; | ||
| 4076 | } | ||
| 4077 | } | ||
| 3972 | break; | 4078 | break; |
| 3973 | 4079 | ||
| 3974 | 4080 | ||
| @@ -3983,12 +4089,8 @@ analyse_first (p, pend, fastmap, multibyte) | |||
| 3983 | if (!fastmap) break; | 4089 | if (!fastmap) break; |
| 3984 | { | 4090 | { |
| 3985 | /* Chars beyond end of bitmap are possible matches. */ | 4091 | /* Chars beyond end of bitmap are possible matches. */ |
| 3986 | /* In a multibyte case, the bitmap is used only for ASCII | ||
| 3987 | characters. */ | ||
| 3988 | int limit = multibyte ? 128 : (1 << BYTEWIDTH); | ||
| 3989 | |||
| 3990 | for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH; | 4092 | for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH; |
| 3991 | j < limit; j++) | 4093 | j < (1 << BYTEWIDTH); j++) |
| 3992 | fastmap[j] = 1; | 4094 | fastmap[j] = 1; |
| 3993 | } | 4095 | } |
| 3994 | 4096 | ||
| @@ -4031,7 +4133,7 @@ analyse_first (p, pend, fastmap, multibyte) | |||
| 4031 | 4133 | ||
| 4032 | /* Extract the number of ranges in range table into COUNT. */ | 4134 | /* Extract the number of ranges in range table into COUNT. */ |
| 4033 | EXTRACT_NUMBER_AND_INCR (count, p); | 4135 | EXTRACT_NUMBER_AND_INCR (count, p); |
| 4034 | for (; count > 0; count--, p += 2 * 3) /* XXX */ | 4136 | for (; count > 0; count--, p += 3) |
| 4035 | { | 4137 | { |
| 4036 | /* Extract the start and end of each range. */ | 4138 | /* Extract the start and end of each range. */ |
| 4037 | EXTRACT_CHARACTER (c, p); | 4139 | EXTRACT_CHARACTER (c, p); |
| @@ -4329,9 +4431,8 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop) | |||
| 4329 | int total_size = size1 + size2; | 4431 | int total_size = size1 + size2; |
| 4330 | int endpos = startpos + range; | 4432 | int endpos = startpos + range; |
| 4331 | boolean anchored_start; | 4433 | boolean anchored_start; |
| 4332 | /* Nonzero if BUFP is setup for multibyte characters. We are sure | 4434 | /* Nonzero if we are searching multibyte string. */ |
| 4333 | that it is the same as RE_TARGET_MULTIBYTE_P (bufp). */ | 4435 | const boolean multibyte = RE_TARGET_MULTIBYTE_P (bufp); |
| 4334 | const boolean multibyte = RE_MULTIBYTE_P (bufp); | ||
| 4335 | 4436 | ||
| 4336 | /* Check for out-of-range STARTPOS. */ | 4437 | /* Check for out-of-range STARTPOS. */ |
| 4337 | if (startpos < 0 || startpos > total_size) | 4438 | if (startpos < 0 || startpos > total_size) |
| @@ -4437,10 +4538,14 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop) | |||
| 4437 | else | 4538 | else |
| 4438 | while (range > lim) | 4539 | while (range > lim) |
| 4439 | { | 4540 | { |
| 4541 | register re_wchar_t ch, translated; | ||
| 4542 | |||
| 4440 | buf_ch = *d; | 4543 | buf_ch = *d; |
| 4441 | MAKE_CHAR_MULTIBYTE (buf_ch); | 4544 | ch = RE_CHAR_TO_MULTIBYTE (buf_ch); |
| 4442 | buf_ch = RE_TRANSLATE (translate, buf_ch); | 4545 | translated = RE_TRANSLATE (translate, ch); |
| 4443 | MAKE_CHAR_UNIBYTE (buf_ch); | 4546 | if (translated != ch |
| 4547 | && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0) | ||
| 4548 | buf_ch = ch; | ||
| 4444 | if (fastmap[buf_ch]) | 4549 | if (fastmap[buf_ch]) |
| 4445 | break; | 4550 | break; |
| 4446 | d++; | 4551 | d++; |
| @@ -4484,7 +4589,15 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop) | |||
| 4484 | } | 4589 | } |
| 4485 | else | 4590 | else |
| 4486 | { | 4591 | { |
| 4487 | if (! fastmap[TRANSLATE (*d)]) | 4592 | register re_wchar_t ch, translated; |
| 4593 | |||
| 4594 | buf_ch = *d; | ||
| 4595 | ch = RE_CHAR_TO_MULTIBYTE (buf_ch); | ||
| 4596 | translated = TRANSLATE (ch); | ||
| 4597 | if (translated != ch | ||
| 4598 | && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0) | ||
| 4599 | buf_ch = ch; | ||
| 4600 | if (! fastmap[TRANSLATE (buf_ch)]) | ||
| 4488 | goto advance; | 4601 | goto advance; |
| 4489 | } | 4602 | } |
| 4490 | } | 4603 | } |
| @@ -4765,11 +4878,11 @@ mutually_exclusive_p (bufp, p1, p2) | |||
| 4765 | { | 4878 | { |
| 4766 | register re_wchar_t c | 4879 | register re_wchar_t c |
| 4767 | = (re_opcode_t) *p2 == endline ? '\n' | 4880 | = (re_opcode_t) *p2 == endline ? '\n' |
| 4768 | : RE_STRING_CHAR (p2 + 2, pend - p2 - 2); | 4881 | : RE_STRING_CHAR (p2 + 2, pend - p2 - 2, multibyte); |
| 4769 | 4882 | ||
| 4770 | if ((re_opcode_t) *p1 == exactn) | 4883 | if ((re_opcode_t) *p1 == exactn) |
| 4771 | { | 4884 | { |
| 4772 | if (c != RE_STRING_CHAR (p1 + 2, pend - p1 - 2)) | 4885 | if (c != RE_STRING_CHAR (p1 + 2, pend - p1 - 2, multibyte)) |
| 4773 | { | 4886 | { |
| 4774 | DEBUG_PRINT3 (" '%c' != '%c' => fast loop.\n", c, p1[2]); | 4887 | DEBUG_PRINT3 (" '%c' != '%c' => fast loop.\n", c, p1[2]); |
| 4775 | return 1; | 4888 | return 1; |
| @@ -4993,23 +5106,6 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop) | |||
| 4993 | } | 5106 | } |
| 4994 | WEAK_ALIAS (__re_match_2, re_match_2) | 5107 | WEAK_ALIAS (__re_match_2, re_match_2) |
| 4995 | 5108 | ||
| 4996 | #ifdef emacs | ||
| 4997 | #define TRANSLATE_VIA_MULTIBYTE(c) \ | ||
| 4998 | do { \ | ||
| 4999 | if (multibyte) \ | ||
| 5000 | (c) = TRANSLATE (c); \ | ||
| 5001 | else \ | ||
| 5002 | { \ | ||
| 5003 | MAKE_CHAR_MULTIBYTE (c); \ | ||
| 5004 | (c) = TRANSLATE (c); \ | ||
| 5005 | MAKE_CHAR_UNIBYTE (c); \ | ||
| 5006 | } \ | ||
| 5007 | } while (0) | ||
| 5008 | |||
| 5009 | #else | ||
| 5010 | #define TRANSLATE_VIA_MULTIBYTE(c) ((c) = TRANSLATE (c)) | ||
| 5011 | #endif | ||
| 5012 | |||
| 5013 | 5109 | ||
| 5014 | /* This is a separate function so that we can force an alloca cleanup | 5110 | /* This is a separate function so that we can force an alloca cleanup |
| 5015 | afterwards. */ | 5111 | afterwards. */ |
| @@ -5050,10 +5146,12 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) | |||
| 5050 | /* We use this to map every character in the string. */ | 5146 | /* We use this to map every character in the string. */ |
| 5051 | RE_TRANSLATE_TYPE translate = bufp->translate; | 5147 | RE_TRANSLATE_TYPE translate = bufp->translate; |
| 5052 | 5148 | ||
| 5053 | /* Nonzero if BUFP is setup for multibyte characters. We are sure | 5149 | /* Nonzero if BUFP is setup from a multibyte regex. */ |
| 5054 | that it is the same as RE_TARGET_MULTIBYTE_P (bufp). */ | ||
| 5055 | const boolean multibyte = RE_MULTIBYTE_P (bufp); | 5150 | const boolean multibyte = RE_MULTIBYTE_P (bufp); |
| 5056 | 5151 | ||
| 5152 | /* Nonzero if STRING1/STRING2 are multibyte. */ | ||
| 5153 | const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp); | ||
| 5154 | |||
| 5057 | /* Failure point stack. Each place that can handle a failure further | 5155 | /* Failure point stack. Each place that can handle a failure further |
| 5058 | down the line pushes a failure point on this stack. It consists of | 5156 | down the line pushes a failure point on this stack. It consists of |
| 5059 | regstart, and regend for all registers corresponding to | 5157 | regstart, and regend for all registers corresponding to |
| @@ -5433,14 +5531,20 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) | |||
| 5433 | while (--mcnt); | 5531 | while (--mcnt); |
| 5434 | #else /* emacs */ | 5532 | #else /* emacs */ |
| 5435 | /* The cost of testing `translate' is comparatively small. */ | 5533 | /* The cost of testing `translate' is comparatively small. */ |
| 5436 | if (multibyte) | 5534 | if (target_multibyte) |
| 5437 | do | 5535 | do |
| 5438 | { | 5536 | { |
| 5439 | int pat_charlen, buf_charlen; | 5537 | int pat_charlen, buf_charlen; |
| 5440 | unsigned int pat_ch, buf_ch; | 5538 | int pat_ch, buf_ch; |
| 5441 | 5539 | ||
| 5442 | PREFETCH (); | 5540 | PREFETCH (); |
| 5443 | pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen); | 5541 | if (multibyte) |
| 5542 | pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen); | ||
| 5543 | else | ||
| 5544 | { | ||
| 5545 | pat_ch = RE_CHAR_TO_MULTIBYTE (*p); | ||
| 5546 | pat_charlen = 1; | ||
| 5547 | } | ||
| 5444 | buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen); | 5548 | buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen); |
| 5445 | 5549 | ||
| 5446 | if (TRANSLATE (buf_ch) != pat_ch) | 5550 | if (TRANSLATE (buf_ch) != pat_ch) |
| @@ -5457,16 +5561,38 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) | |||
| 5457 | else | 5561 | else |
| 5458 | do | 5562 | do |
| 5459 | { | 5563 | { |
| 5460 | unsigned int buf_ch; | 5564 | int pat_charlen, buf_charlen; |
| 5565 | int pat_ch, buf_ch; | ||
| 5461 | 5566 | ||
| 5462 | PREFETCH (); | 5567 | PREFETCH (); |
| 5463 | buf_ch = *d++; | 5568 | if (multibyte) |
| 5464 | TRANSLATE_VIA_MULTIBYTE (buf_ch); | 5569 | { |
| 5465 | if (buf_ch != *p++) | 5570 | pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen); |
| 5571 | if (CHAR_BYTE8_P (pat_ch)) | ||
| 5572 | pat_ch = CHAR_TO_BYTE8 (pat_ch); | ||
| 5573 | else | ||
| 5574 | pat_ch = RE_CHAR_TO_UNIBYTE (pat_ch); | ||
| 5575 | } | ||
| 5576 | else | ||
| 5577 | { | ||
| 5578 | pat_ch = *p; | ||
| 5579 | pat_charlen = 1; | ||
| 5580 | } | ||
| 5581 | buf_ch = RE_CHAR_TO_MULTIBYTE (*d); | ||
| 5582 | if (! CHAR_BYTE8_P (buf_ch)) | ||
| 5583 | { | ||
| 5584 | buf_ch = TRANSLATE (buf_ch); | ||
| 5585 | buf_ch = RE_CHAR_TO_UNIBYTE (buf_ch); | ||
| 5586 | if (buf_ch < 0) | ||
| 5587 | buf_ch = *d; | ||
| 5588 | } | ||
| 5589 | if (buf_ch != pat_ch) | ||
| 5466 | { | 5590 | { |
| 5467 | d = dfail; | 5591 | d = dfail; |
| 5468 | goto fail; | 5592 | goto fail; |
| 5469 | } | 5593 | } |
| 5594 | p += pat_charlen; | ||
| 5595 | d++; | ||
| 5470 | } | 5596 | } |
| 5471 | while (--mcnt); | 5597 | while (--mcnt); |
| 5472 | #endif | 5598 | #endif |
| @@ -5482,7 +5608,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) | |||
| 5482 | DEBUG_PRINT1 ("EXECUTING anychar.\n"); | 5608 | DEBUG_PRINT1 ("EXECUTING anychar.\n"); |
| 5483 | 5609 | ||
| 5484 | PREFETCH (); | 5610 | PREFETCH (); |
| 5485 | buf_ch = RE_STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen); | 5611 | buf_ch = RE_STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen, |
| 5612 | target_multibyte); | ||
| 5486 | buf_ch = TRANSLATE (buf_ch); | 5613 | buf_ch = TRANSLATE (buf_ch); |
| 5487 | 5614 | ||
| 5488 | if ((!(bufp->syntax & RE_DOT_NEWLINE) | 5615 | if ((!(bufp->syntax & RE_DOT_NEWLINE) |
| @@ -5526,10 +5653,30 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) | |||
| 5526 | } | 5653 | } |
| 5527 | 5654 | ||
| 5528 | PREFETCH (); | 5655 | PREFETCH (); |
| 5529 | c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len); | 5656 | c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len, target_multibyte); |
| 5530 | TRANSLATE_VIA_MULTIBYTE (c); /* The character to match. */ | 5657 | if (target_multibyte) |
| 5658 | { | ||
| 5659 | int c1; | ||
| 5531 | 5660 | ||
| 5532 | if (! multibyte || IS_REAL_ASCII (c)) | 5661 | c = TRANSLATE (c); |
| 5662 | c1 = RE_CHAR_TO_UNIBYTE (c); | ||
| 5663 | if (c1 >= 0) | ||
| 5664 | c = c1; | ||
| 5665 | } | ||
| 5666 | else | ||
| 5667 | { | ||
| 5668 | int c1 = RE_CHAR_TO_MULTIBYTE (c); | ||
| 5669 | |||
| 5670 | if (! CHAR_BYTE8_P (c1)) | ||
| 5671 | { | ||
| 5672 | c1 = TRANSLATE (c1); | ||
| 5673 | c1 = RE_CHAR_TO_UNIBYTE (c1); | ||
| 5674 | if (c1 >= 0) | ||
| 5675 | c = c1; | ||
| 5676 | } | ||
| 5677 | } | ||
| 5678 | |||
| 5679 | if (c < (1 << BYTEWIDTH)) | ||
| 5533 | { /* Lookup bitmap. */ | 5680 | { /* Lookup bitmap. */ |
| 5534 | /* Cast to `unsigned' instead of `unsigned char' in | 5681 | /* Cast to `unsigned' instead of `unsigned char' in |
| 5535 | case the bit list is a full 32 bytes long. */ | 5682 | case the bit list is a full 32 bytes long. */ |
| @@ -6096,7 +6243,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) | |||
| 6096 | UPDATE_SYNTAX_TABLE (charpos); | 6243 | UPDATE_SYNTAX_TABLE (charpos); |
| 6097 | #endif | 6244 | #endif |
| 6098 | PREFETCH (); | 6245 | PREFETCH (); |
| 6099 | c2 = RE_STRING_CHAR (d, dend - d); | 6246 | c2 = RE_STRING_CHAR (d, dend - d, target_multibyte); |
| 6100 | s2 = SYNTAX (c2); | 6247 | s2 = SYNTAX (c2); |
| 6101 | 6248 | ||
| 6102 | /* Case 2: S2 is neither Sword nor Ssymbol. */ | 6249 | /* Case 2: S2 is neither Sword nor Ssymbol. */ |
| @@ -6149,7 +6296,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) | |||
| 6149 | if (!AT_STRINGS_END (d)) | 6296 | if (!AT_STRINGS_END (d)) |
| 6150 | { | 6297 | { |
| 6151 | PREFETCH_NOLIMIT (); | 6298 | PREFETCH_NOLIMIT (); |
| 6152 | c2 = RE_STRING_CHAR (d, dend - d); | 6299 | c2 = RE_STRING_CHAR (d, dend - d, target_multibyte); |
| 6153 | #ifdef emacs | 6300 | #ifdef emacs |
| 6154 | UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1); | 6301 | UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1); |
| 6155 | #endif | 6302 | #endif |