aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorKenichi Handa2007-02-15 11:23:52 +0000
committerKenichi Handa2007-02-15 11:23:52 +0000
commitcf9c99bcf551d4a50b029f6fc766b7403c846dd5 (patch)
tree32465f5419610ab16c4b2d96f992b390d66883ea /src
parentb13abc5d94ee9e85223cb620b91206bda70d7685 (diff)
downloademacs-cf9c99bcf551d4a50b029f6fc766b7403c846dd5.tar.gz
emacs-cf9c99bcf551d4a50b029f6fc766b7403c846dd5.zip
(RE_STRING_CHAR, RE_STRING_CHAR_AND_LENGTH): New arg
multibyte. Callers changed. (RE_CHAR_TO_MULTIBYTE, RE_CHAR_TO_UNIBYTE): New macros. (MAKE_CHAR_MULTIBYTE, MAKE_CHAR_UNIBYTE): Deleted. Callers changed to use RE_CHAR_TO_MULTIBYTE and RE_CHAR_TO_UNIBYTE respectively. (SETUP_ASCII_RANGE, SETUP_UNIBYTE_RANGE): New macros. (SETUP_MULTIBYTE_RANGE): Generate more compact range_table. (regex_compile): Make the compiled pattern usable both for multibyte and unibyte targets. (analyse_first): Make the fastmap usable both for multibyte and unibyte targets. (TRANSLATE_VIA_MULTIBYTE): Deleted. (re_match_2_internal): Pay attention to the case that the multibyteness of bufp and target may be different.
Diffstat (limited to 'src')
-rw-r--r--src/regex.c453
1 files changed, 300 insertions, 153 deletions
diff --git a/src/regex.c b/src/regex.c
index 177908cb751..782f758468f 100644
--- a/src/regex.c
+++ b/src/regex.c
@@ -145,11 +145,18 @@
145 145
146# define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte) 146# define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
147# define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte) 147# define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
148# define RE_STRING_CHAR(p, s) \ 148# define RE_STRING_CHAR(p, s, multibyte) \
149 (multibyte ? (STRING_CHAR (p, s)) : (*(p))) 149 (multibyte ? (STRING_CHAR (p, s)) : (*(p)))
150# define RE_STRING_CHAR_AND_LENGTH(p, s, len) \ 150# define RE_STRING_CHAR_AND_LENGTH(p, s, len, multibyte) \
151 (multibyte ? (STRING_CHAR_AND_LENGTH (p, s, len)) : ((len) = 1, *(p))) 151 (multibyte ? (STRING_CHAR_AND_LENGTH (p, s, len)) : ((len) = 1, *(p)))
152 152
153# define RE_CHAR_TO_MULTIBYTE(c) unibyte_to_multibyte_table[(c)]
154
155# define RE_CHAR_TO_UNIBYTE(c) \
156 (ASCII_CHAR_P (c) ? (c) \
157 : CHAR_BYTE8_P (c) ? CHAR_TO_BYTE8 (c) \
158 : multibyte_char_to_unibyte_safe (c))
159
153/* Set C a (possibly converted to multibyte) character before P. P 160/* Set C a (possibly converted to multibyte) character before P. P
154 points into a string which is the virtual concatenation of STR1 161 points into a string which is the virtual concatenation of STR1
155 (which ends at END1) or STR2 (which ends at END2). */ 162 (which ends at END1) or STR2 (which ends at END2). */
@@ -165,7 +172,7 @@
165 else \ 172 else \
166 { \ 173 { \
167 (c = ((p) == (str2) ? (end1) : (p))[-1]); \ 174 (c = ((p) == (str2) ? (end1) : (p))[-1]); \
168 MAKE_CHAR_MULTIBYTE (c); \ 175 (c) = RE_CHAR_TO_MULTIBYTE (c); \
169 } \ 176 } \
170 } while (0) 177 } while (0)
171 178
@@ -174,12 +181,12 @@
174# define GET_CHAR_AFTER(c, p, len) \ 181# define GET_CHAR_AFTER(c, p, len) \
175 do { \ 182 do { \
176 if (multibyte) \ 183 if (multibyte) \
177 c = STRING_CHAR_AND_LENGTH (p, 0, len); \ 184 (c) = STRING_CHAR_AND_LENGTH (p, 0, len); \
178 else \ 185 else \
179 { \ 186 { \
180 c = *p; \ 187 (c) = *p; \
181 len = 1; \ 188 len = 1; \
182 MAKE_CHAR_MULTIBYTE (c); \ 189 (c) = RE_CHAR_TO_MULTIBYTE (c); \
183 } \ 190 } \
184 } while (0) 191 } while (0)
185 192
@@ -301,10 +308,12 @@ enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
301# define MULTIBYTE_FORM_LENGTH(p, s) (1) 308# define MULTIBYTE_FORM_LENGTH(p, s) (1)
302# define PREV_CHAR_BOUNDARY(p, limit) ((p)--) 309# define PREV_CHAR_BOUNDARY(p, limit) ((p)--)
303# define STRING_CHAR(p, s) (*(p)) 310# define STRING_CHAR(p, s) (*(p))
304# define RE_STRING_CHAR STRING_CHAR 311# define RE_STRING_CHAR(p, s, multibyte) STRING_CHAR ((p), (s))
305# define CHAR_STRING(c, s) (*(s) = (c), 1) 312# define CHAR_STRING(c, s) (*(s) = (c), 1)
306# define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p)) 313# define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p))
307# define RE_STRING_CHAR_AND_LENGTH STRING_CHAR_AND_LENGTH 314# define RE_STRING_CHAR_AND_LENGTH(p, s, multibyte) STRING_CHAR_AND_LENGTH ((p), (s))
315# define RE_CHAR_TO_MULTIBYTE(c) (c)
316# define RE_CHAR_TO_UNIBYTE(c) (c)
308# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \ 317# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
309 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1))) 318 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
310# define GET_CHAR_AFTER(c, p, len) \ 319# define GET_CHAR_AFTER(c, p, len) \
@@ -312,8 +321,6 @@ enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
312# define MAKE_CHAR(charset, c1, c2) (c1) 321# define MAKE_CHAR(charset, c1, c2) (c1)
313# define BYTE8_TO_CHAR(c) (c) 322# define BYTE8_TO_CHAR(c) (c)
314# define CHAR_BYTE8_P(c) (0) 323# define CHAR_BYTE8_P(c) (0)
315# define MAKE_CHAR_MULTIBYTE(c) (c)
316# define MAKE_CHAR_UNIBYTE(c) (c)
317# define CHAR_LEADING_CODE(c) (c) 324# define CHAR_LEADING_CODE(c) (c)
318 325
319#endif /* not emacs */ 326#endif /* not emacs */
@@ -1761,7 +1768,7 @@ static int analyse_first _RE_ARGS ((re_char *p, re_char *pend,
1761 do { \ 1768 do { \
1762 int len; \ 1769 int len; \
1763 if (p == pend) return REG_EEND; \ 1770 if (p == pend) return REG_EEND; \
1764 c = RE_STRING_CHAR_AND_LENGTH (p, pend - p, len); \ 1771 c = RE_STRING_CHAR_AND_LENGTH (p, pend - p, len, multibyte); \
1765 p += len; \ 1772 p += len; \
1766 } while (0) 1773 } while (0)
1767 1774
@@ -2019,32 +2026,107 @@ struct range_table_work_area
2019 2026
2020#ifdef emacs 2027#ifdef emacs
2021 2028
2022/* Store characters in the rage range C0 to C1 in WORK_AREA while 2029/* Store characters in the range FROM to TO in the bitmap at B (for
2023 translating them and paying attention to the continuity of 2030 ASCII and unibyte characters) and WORK_AREA (for multibyte
2024 translated characters. 2031 characters) while translating them and paying attention to the
2032 continuity of translated characters.
2025 2033
2026 Implementation note: It is better to implement this fairly big 2034 Implementation note: It is better to implement these fairly big
2027 macro by a function, but it's not that easy because macros called 2035 macros by a function, but it's not that easy because macros called
2028 in this macro assume various local variables already declared. */ 2036 in this macro assume various local variables already declared. */
2029 2037
2030#define SETUP_MULTIBYTE_RANGE(work_area, c0, c1) \ 2038/* Both FROM and TO are ASCII characters. */
2031 do { \ 2039
2032 re_wchar_t c, t, t_last; \ 2040#define SETUP_ASCII_RANGE(work_area, FROM, TO) \
2033 int n; \ 2041 do { \
2034 \ 2042 int C0, C1; \
2035 c = (c0); \ 2043 \
2036 t_last = multibyte ? TRANSLATE (c) : TRANSLATE (MAKE_CHAR_MULTIBYTE (c)); \ 2044 for (C0 = (FROM); C0 <= (TO); C0++) \
2037 for (c++, n = 1; c <= (c1); c++, n++) \ 2045 { \
2038 { \ 2046 C1 = TRANSLATE (C0); \
2039 t = multibyte ? TRANSLATE (c) : TRANSLATE (MAKE_CHAR_MULTIBYTE (c)); \ 2047 if (! ASCII_CHAR_P (C1)) \
2040 if (t_last + n == t) \ 2048 { \
2041 continue; \ 2049 SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1); \
2042 SET_RANGE_TABLE_WORK_AREA ((work_area), t_last, t_last + n - 1); \ 2050 if ((C1 = RE_CHAR_TO_UNIBYTE (C1)) < 0) \
2043 t_last = t; \ 2051 C1 = C0; \
2044 n = 0; \ 2052 } \
2045 } \ 2053 SET_LIST_BIT (C1); \
2046 if (n > 0) \ 2054 } \
2047 SET_RANGE_TABLE_WORK_AREA ((work_area), t_last, t_last + n - 1); \ 2055 } while (0)
2056
2057
2058/* Both FROM and TO are unibyte characters (0x80..0xFF). */
2059
2060#define SETUP_UNIBYTE_RANGE(work_area, FROM, TO) \
2061 do { \
2062 int C0, C1, C2, I; \
2063 int USED = RANGE_TABLE_WORK_USED (work_area); \
2064 \
2065 for (C0 = (FROM); C0 <= (TO); C0++) \
2066 { \
2067 C1 = RE_CHAR_TO_MULTIBYTE (C0); \
2068 if (CHAR_BYTE8_P (C1)) \
2069 SET_LIST_BIT (C0); \
2070 else \
2071 { \
2072 C2 = TRANSLATE (C1); \
2073 if (C2 == C1 \
2074 || (C1 = RE_CHAR_TO_UNIBYTE (C2)) < 0) \
2075 C1 = C0; \
2076 SET_LIST_BIT (C1); \
2077 for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
2078 { \
2079 int from = RANGE_TABLE_WORK_ELT (work_area, I); \
2080 int to = RANGE_TABLE_WORK_ELT (work_area, I + 1); \
2081 \
2082 if (C2 >= from - 1 && C2 <= to + 1) \
2083 { \
2084 if (C2 == from - 1) \
2085 RANGE_TABLE_WORK_ELT (work_area, I)--; \
2086 else if (C2 == to + 1) \
2087 RANGE_TABLE_WORK_ELT (work_area, I + 1)++; \
2088 break; \
2089 } \
2090 } \
2091 if (I < USED) \
2092 SET_RANGE_TABLE_WORK_AREA ((work_area), C2, C2); \
2093 } \
2094 } \
2095 } while (0)
2096
2097
2098/* Both FROM and TO are multibyte characters. */
2099
2100#define SETUP_MULTIBYTE_RANGE(work_area, FROM, TO) \
2101 do { \
2102 int C0, C1, C2, I, USED = RANGE_TABLE_WORK_USED (work_area); \
2103 \
2104 SET_RANGE_TABLE_WORK_AREA ((work_area), (FROM), (TO)); \
2105 for (C0 = (FROM); C0 <= (TO); C0++) \
2106 { \
2107 C1 = TRANSLATE (C0); \
2108 if ((C2 = RE_CHAR_TO_UNIBYTE (C1)) >= 0 \
2109 || (C1 != C0 && (C2 = RE_CHAR_TO_UNIBYTE (C0)) >= 0)) \
2110 SET_LIST_BIT (C2); \
2111 if (C1 >= (FROM) && C1 <= (TO)) \
2112 continue; \
2113 for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
2114 { \
2115 int from = RANGE_TABLE_WORK_ELT (work_area, I); \
2116 int to = RANGE_TABLE_WORK_ELT (work_area, I + 1); \
2117 \
2118 if (C1 >= from - 1 && C1 <= to + 1) \
2119 { \
2120 if (C1 == from - 1) \
2121 RANGE_TABLE_WORK_ELT (work_area, I)--; \
2122 else if (C1 == to + 1) \
2123 RANGE_TABLE_WORK_ELT (work_area, I + 1)++; \
2124 break; \
2125 } \
2126 } \
2127 if (I < USED) \
2128 SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1); \
2129 } \
2048 } while (0) 2130 } while (0)
2049 2131
2050#endif /* emacs */ 2132#endif /* emacs */
@@ -2904,6 +2986,7 @@ regex_compile (pattern, size, syntax, bufp)
2904 { 2986 {
2905 boolean escaped_char = false; 2987 boolean escaped_char = false;
2906 const unsigned char *p2 = p; 2988 const unsigned char *p2 = p;
2989 re_wchar_t ch, c2;
2907 2990
2908 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 2991 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2909 2992
@@ -2966,7 +3049,6 @@ regex_compile (pattern, size, syntax, bufp)
2966 them). */ 3049 them). */
2967 if (c == ':' && *p == ']') 3050 if (c == ':' && *p == ']')
2968 { 3051 {
2969 re_wchar_t ch;
2970 re_wctype_t cc; 3052 re_wctype_t cc;
2971 int limit; 3053 int limit;
2972 3054
@@ -2981,41 +3063,41 @@ regex_compile (pattern, size, syntax, bufp)
2981 3063
2982 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 3064 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2983 3065
2984 /* Most character classes in a multibyte match 3066#ifndef emacs
2985 just set a flag. Exceptions are is_blank, 3067 for (ch = 0; ch < (1 << BYTEWIDTH); ++ch)
2986 is_digit, is_cntrl, and is_xdigit, since
2987 they can only match ASCII characters. We
2988 don't need to handle them for multibyte.
2989 They are distinguished by a negative wctype. */
2990
2991 for (ch = 0; ch < 128; ++ch)
2992 if (re_iswctype (btowc (ch), cc)) 3068 if (re_iswctype (btowc (ch), cc))
2993 { 3069 {
2994 c = TRANSLATE (ch); 3070 c = TRANSLATE (ch);
2995 if (c < (1 << BYTEWIDTH)) 3071 if (c < (1 << BYTEWIDTH))
2996 SET_LIST_BIT (c); 3072 SET_LIST_BIT (c);
2997 } 3073 }
3074#else /* emacs */
3075 /* Most character classes in a multibyte match
3076 just set a flag. Exceptions are is_blank,
3077 is_digit, is_cntrl, and is_xdigit, since
3078 they can only match ASCII characters. We
3079 don't need to handle them for multibyte.
3080 They are distinguished by a negative wctype. */
2998 3081
2999 if (target_multibyte) 3082 for (ch = 0; ch < 256; ++ch)
3000 { 3083 {
3001 SET_RANGE_TABLE_WORK_AREA_BIT 3084 c = RE_CHAR_TO_MULTIBYTE (ch);
3002 (range_table_work, re_wctype_to_bit (cc)); 3085 if (! CHAR_BYTE8_P (c)
3003 } 3086 && re_iswctype (c, cc))
3004 else
3005 {
3006 for (ch = 0; ch < (1 << BYTEWIDTH); ++ch)
3007 { 3087 {
3008 c = ch; 3088 SET_LIST_BIT (ch);
3009 MAKE_CHAR_MULTIBYTE (c); 3089 c1 = TRANSLATE (c);
3010 if (re_iswctype (btowc (c), cc)) 3090 if (c1 == c)
3011 { 3091 continue;
3012 c = TRANSLATE (c); 3092 if (ASCII_CHAR_P (c1))
3013 MAKE_CHAR_UNIBYTE (c); 3093 SET_LIST_BIT (c1);
3014 SET_LIST_BIT (c); 3094 else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0)
3015 } 3095 SET_LIST_BIT (c1);
3016 } 3096 }
3017 } 3097 }
3018 3098 SET_RANGE_TABLE_WORK_AREA_BIT
3099 (range_table_work, re_wctype_to_bit (cc));
3100#endif /* emacs */
3019 /* In most cases the matching rule for char classes 3101 /* In most cases the matching rule for char classes
3020 only uses the syntax table for multibyte chars, 3102 only uses the syntax table for multibyte chars,
3021 so that the content of the syntax-table it is not 3103 so that the content of the syntax-table it is not
@@ -3048,51 +3130,63 @@ regex_compile (pattern, size, syntax, bufp)
3048 3130
3049 /* Fetch the character which ends the range. */ 3131 /* Fetch the character which ends the range. */
3050 PATFETCH (c1); 3132 PATFETCH (c1);
3051 if (c > c1) 3133#ifdef emacs
3052 { 3134 if (CHAR_BYTE8_P (c1)
3053 if (syntax & RE_NO_EMPTY_RANGES) 3135 && ! ASCII_CHAR_P (c) && ! CHAR_BYTE8_P (c))
3054 FREE_STACK_RETURN (REG_ERANGEX); 3136 /* Treat the range from a multibyte character to
3055 /* Else, repeat the loop. */ 3137 raw-byte character as empty. */
3056 } 3138 c = c1 + 1;
3139#endif /* emacs */
3057 } 3140 }
3058 else 3141 else
3059 /* Range from C to C. */ 3142 /* Range from C to C. */
3060 c1 = c; 3143 c1 = c;
3061 3144
3062#ifndef emacs 3145 if (c > c1)
3063 c = TRANSLATE (c);
3064 c1 = TRANSLATE (c1);
3065 /* Set the range into bitmap */
3066 for (; c <= c1; c++)
3067 SET_LIST_BIT (TRANSLATE (c));
3068#else /* not emacs */
3069 if (target_multibyte)
3070 { 3146 {
3071 if (c1 >= 128) 3147 if (syntax & RE_NO_EMPTY_RANGES)
3072 { 3148 FREE_STACK_RETURN (REG_ERANGEX);
3073 re_wchar_t c0 = MAX (c, 128); 3149 /* Else, repeat the loop. */
3074
3075 SETUP_MULTIBYTE_RANGE (range_table_work, c0, c1);
3076 c1 = 127;
3077 }
3078 for (; c <= c1; c++)
3079 SET_LIST_BIT (TRANSLATE (c));
3080 } 3150 }
3081 else 3151 else
3082 { 3152 {
3083 re_wchar_t c0; 3153#ifndef emacs
3084 3154 /* Set the range into bitmap */
3085 for (; c <= c1; c++) 3155 for (; c <= c1; c++)
3086 { 3156 {
3087 c0 = c; 3157 ch = TRANSLATE (c);
3088 if (! multibyte) 3158 if (ch < (1 << BYTEWIDTH))
3089 MAKE_CHAR_MULTIBYTE (c0); 3159 SET_LIST_BIT (ch);
3090 c0 = TRANSLATE (c0); 3160 }
3091 MAKE_CHAR_UNIBYTE (c0); 3161#else /* emacs */
3092 SET_LIST_BIT (c0); 3162 if (c < 128)
3163 {
3164 ch = MIN (127, c1);
3165 SETUP_ASCII_RANGE (range_table_work, c, ch);
3166 c = ch + 1;
3167 if (CHAR_BYTE8_P (c1))
3168 c = BYTE8_TO_CHAR (128);
3169 }
3170 if (c <= c1)
3171 {
3172 if (CHAR_BYTE8_P (c))
3173 {
3174 c = CHAR_TO_BYTE8 (c);
3175 c1 = CHAR_TO_BYTE8 (c1);
3176 for (; c <= c1; c++)
3177 SET_LIST_BIT (c);
3178 }
3179 else if (multibyte)
3180 {
3181 SETUP_MULTIBYTE_RANGE (range_table_work, c, c1);
3182 }
3183 else
3184 {
3185 SETUP_UNIBYTE_RANGE (range_table_work, c, c1);
3186 }
3093 } 3187 }
3188#endif /* emacs */
3094 } 3189 }
3095#endif /* not emacs */
3096 } 3190 }
3097 3191
3098 /* Discard any (non)matching list bytes that are all 0 at the 3192 /* Discard any (non)matching list bytes that are all 0 at the
@@ -3677,17 +3771,22 @@ regex_compile (pattern, size, syntax, bufp)
3677 { 3771 {
3678 int len; 3772 int len;
3679 3773
3680 if (! multibyte) 3774 if (multibyte)
3681 MAKE_CHAR_MULTIBYTE (c);
3682 c = TRANSLATE (c);
3683 if (target_multibyte)
3684 { 3775 {
3776 c = TRANSLATE (c);
3685 len = CHAR_STRING (c, b); 3777 len = CHAR_STRING (c, b);
3686 b += len; 3778 b += len;
3687 } 3779 }
3688 else 3780 else
3689 { 3781 {
3690 MAKE_CHAR_UNIBYTE (c); 3782 c1 = RE_CHAR_TO_MULTIBYTE (c);
3783 if (! CHAR_BYTE8_P (c1))
3784 {
3785 re_wchar_t c2 = TRANSLATE (c1);
3786
3787 if (c1 != c2 && (c1 = RE_CHAR_TO_UNIBYTE (c2)) >= 0)
3788 c = c1;
3789 }
3691 *b++ = c; 3790 *b++ = c;
3692 len = 1; 3791 len = 1;
3693 } 3792 }
@@ -3714,11 +3813,6 @@ regex_compile (pattern, size, syntax, bufp)
3714 /* We have succeeded; set the length of the buffer. */ 3813 /* We have succeeded; set the length of the buffer. */
3715 bufp->used = b - bufp->buffer; 3814 bufp->used = b - bufp->buffer;
3716 3815
3717#ifdef emacs
3718 /* Now the buffer is adjusted for the multibyteness of a target. */
3719 bufp->multibyte = bufp->target_multibyte;
3720#endif
3721
3722#ifdef DEBUG 3816#ifdef DEBUG
3723 if (debug > 0) 3817 if (debug > 0)
3724 { 3818 {
@@ -3964,11 +4058,23 @@ analyse_first (p, pend, fastmap, multibyte)
3964 4058
3965 case exactn: 4059 case exactn:
3966 if (fastmap) 4060 if (fastmap)
3967 /* If multibyte is nonzero, the first byte of each 4061 {
3968 character is an ASCII or a leading code. Otherwise, 4062 /* If multibyte is nonzero, the first byte of each
3969 each byte is a character. Thus, this works in both 4063 character is an ASCII or a leading code. Otherwise,
3970 cases. */ 4064 each byte is a character. Thus, this works in both
3971 fastmap[p[1]] = 1; 4065 cases. */
4066 fastmap[p[1]] = 1;
4067 if (! multibyte)
4068 {
4069 /* For the case of matching this unibyte regex
4070 against multibyte, we must set a leading code of
4071 the corresponding multibyte character. */
4072 int c = RE_CHAR_TO_MULTIBYTE (p[1]);
4073
4074 if (! CHAR_BYTE8_P (c))
4075 fastmap[CHAR_LEADING_CODE (c)] = 1;
4076 }
4077 }
3972 break; 4078 break;
3973 4079
3974 4080
@@ -3983,12 +4089,8 @@ analyse_first (p, pend, fastmap, multibyte)
3983 if (!fastmap) break; 4089 if (!fastmap) break;
3984 { 4090 {
3985 /* Chars beyond end of bitmap are possible matches. */ 4091 /* Chars beyond end of bitmap are possible matches. */
3986 /* In a multibyte case, the bitmap is used only for ASCII
3987 characters. */
3988 int limit = multibyte ? 128 : (1 << BYTEWIDTH);
3989
3990 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH; 4092 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
3991 j < limit; j++) 4093 j < (1 << BYTEWIDTH); j++)
3992 fastmap[j] = 1; 4094 fastmap[j] = 1;
3993 } 4095 }
3994 4096
@@ -4031,7 +4133,7 @@ analyse_first (p, pend, fastmap, multibyte)
4031 4133
4032 /* Extract the number of ranges in range table into COUNT. */ 4134 /* Extract the number of ranges in range table into COUNT. */
4033 EXTRACT_NUMBER_AND_INCR (count, p); 4135 EXTRACT_NUMBER_AND_INCR (count, p);
4034 for (; count > 0; count--, p += 2 * 3) /* XXX */ 4136 for (; count > 0; count--, p += 3)
4035 { 4137 {
4036 /* Extract the start and end of each range. */ 4138 /* Extract the start and end of each range. */
4037 EXTRACT_CHARACTER (c, p); 4139 EXTRACT_CHARACTER (c, p);
@@ -4329,9 +4431,8 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
4329 int total_size = size1 + size2; 4431 int total_size = size1 + size2;
4330 int endpos = startpos + range; 4432 int endpos = startpos + range;
4331 boolean anchored_start; 4433 boolean anchored_start;
4332 /* Nonzero if BUFP is setup for multibyte characters. We are sure 4434 /* Nonzero if we are searching multibyte string. */
4333 that it is the same as RE_TARGET_MULTIBYTE_P (bufp). */ 4435 const boolean multibyte = RE_TARGET_MULTIBYTE_P (bufp);
4334 const boolean multibyte = RE_MULTIBYTE_P (bufp);
4335 4436
4336 /* Check for out-of-range STARTPOS. */ 4437 /* Check for out-of-range STARTPOS. */
4337 if (startpos < 0 || startpos > total_size) 4438 if (startpos < 0 || startpos > total_size)
@@ -4437,10 +4538,14 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
4437 else 4538 else
4438 while (range > lim) 4539 while (range > lim)
4439 { 4540 {
4541 register re_wchar_t ch, translated;
4542
4440 buf_ch = *d; 4543 buf_ch = *d;
4441 MAKE_CHAR_MULTIBYTE (buf_ch); 4544 ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4442 buf_ch = RE_TRANSLATE (translate, buf_ch); 4545 translated = RE_TRANSLATE (translate, ch);
4443 MAKE_CHAR_UNIBYTE (buf_ch); 4546 if (translated != ch
4547 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4548 buf_ch = ch;
4444 if (fastmap[buf_ch]) 4549 if (fastmap[buf_ch])
4445 break; 4550 break;
4446 d++; 4551 d++;
@@ -4484,7 +4589,15 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
4484 } 4589 }
4485 else 4590 else
4486 { 4591 {
4487 if (! fastmap[TRANSLATE (*d)]) 4592 register re_wchar_t ch, translated;
4593
4594 buf_ch = *d;
4595 ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4596 translated = TRANSLATE (ch);
4597 if (translated != ch
4598 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4599 buf_ch = ch;
4600 if (! fastmap[TRANSLATE (buf_ch)])
4488 goto advance; 4601 goto advance;
4489 } 4602 }
4490 } 4603 }
@@ -4765,11 +4878,11 @@ mutually_exclusive_p (bufp, p1, p2)
4765 { 4878 {
4766 register re_wchar_t c 4879 register re_wchar_t c
4767 = (re_opcode_t) *p2 == endline ? '\n' 4880 = (re_opcode_t) *p2 == endline ? '\n'
4768 : RE_STRING_CHAR (p2 + 2, pend - p2 - 2); 4881 : RE_STRING_CHAR (p2 + 2, pend - p2 - 2, multibyte);
4769 4882
4770 if ((re_opcode_t) *p1 == exactn) 4883 if ((re_opcode_t) *p1 == exactn)
4771 { 4884 {
4772 if (c != RE_STRING_CHAR (p1 + 2, pend - p1 - 2)) 4885 if (c != RE_STRING_CHAR (p1 + 2, pend - p1 - 2, multibyte))
4773 { 4886 {
4774 DEBUG_PRINT3 (" '%c' != '%c' => fast loop.\n", c, p1[2]); 4887 DEBUG_PRINT3 (" '%c' != '%c' => fast loop.\n", c, p1[2]);
4775 return 1; 4888 return 1;
@@ -4993,23 +5106,6 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
4993} 5106}
4994WEAK_ALIAS (__re_match_2, re_match_2) 5107WEAK_ALIAS (__re_match_2, re_match_2)
4995 5108
4996#ifdef emacs
4997#define TRANSLATE_VIA_MULTIBYTE(c) \
4998 do { \
4999 if (multibyte) \
5000 (c) = TRANSLATE (c); \
5001 else \
5002 { \
5003 MAKE_CHAR_MULTIBYTE (c); \
5004 (c) = TRANSLATE (c); \
5005 MAKE_CHAR_UNIBYTE (c); \
5006 } \
5007 } while (0)
5008
5009#else
5010#define TRANSLATE_VIA_MULTIBYTE(c) ((c) = TRANSLATE (c))
5011#endif
5012
5013 5109
5014/* This is a separate function so that we can force an alloca cleanup 5110/* This is a separate function so that we can force an alloca cleanup
5015 afterwards. */ 5111 afterwards. */
@@ -5050,10 +5146,12 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
5050 /* We use this to map every character in the string. */ 5146 /* We use this to map every character in the string. */
5051 RE_TRANSLATE_TYPE translate = bufp->translate; 5147 RE_TRANSLATE_TYPE translate = bufp->translate;
5052 5148
5053 /* Nonzero if BUFP is setup for multibyte characters. We are sure 5149 /* Nonzero if BUFP is setup from a multibyte regex. */
5054 that it is the same as RE_TARGET_MULTIBYTE_P (bufp). */
5055 const boolean multibyte = RE_MULTIBYTE_P (bufp); 5150 const boolean multibyte = RE_MULTIBYTE_P (bufp);
5056 5151
5152 /* Nonzero if STRING1/STRING2 are multibyte. */
5153 const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
5154
5057 /* Failure point stack. Each place that can handle a failure further 5155 /* Failure point stack. Each place that can handle a failure further
5058 down the line pushes a failure point on this stack. It consists of 5156 down the line pushes a failure point on this stack. It consists of
5059 regstart, and regend for all registers corresponding to 5157 regstart, and regend for all registers corresponding to
@@ -5433,14 +5531,20 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
5433 while (--mcnt); 5531 while (--mcnt);
5434#else /* emacs */ 5532#else /* emacs */
5435 /* The cost of testing `translate' is comparatively small. */ 5533 /* The cost of testing `translate' is comparatively small. */
5436 if (multibyte) 5534 if (target_multibyte)
5437 do 5535 do
5438 { 5536 {
5439 int pat_charlen, buf_charlen; 5537 int pat_charlen, buf_charlen;
5440 unsigned int pat_ch, buf_ch; 5538 int pat_ch, buf_ch;
5441 5539
5442 PREFETCH (); 5540 PREFETCH ();
5443 pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen); 5541 if (multibyte)
5542 pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
5543 else
5544 {
5545 pat_ch = RE_CHAR_TO_MULTIBYTE (*p);
5546 pat_charlen = 1;
5547 }
5444 buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen); 5548 buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
5445 5549
5446 if (TRANSLATE (buf_ch) != pat_ch) 5550 if (TRANSLATE (buf_ch) != pat_ch)
@@ -5457,16 +5561,38 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
5457 else 5561 else
5458 do 5562 do
5459 { 5563 {
5460 unsigned int buf_ch; 5564 int pat_charlen, buf_charlen;
5565 int pat_ch, buf_ch;
5461 5566
5462 PREFETCH (); 5567 PREFETCH ();
5463 buf_ch = *d++; 5568 if (multibyte)
5464 TRANSLATE_VIA_MULTIBYTE (buf_ch); 5569 {
5465 if (buf_ch != *p++) 5570 pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
5571 if (CHAR_BYTE8_P (pat_ch))
5572 pat_ch = CHAR_TO_BYTE8 (pat_ch);
5573 else
5574 pat_ch = RE_CHAR_TO_UNIBYTE (pat_ch);
5575 }
5576 else
5577 {
5578 pat_ch = *p;
5579 pat_charlen = 1;
5580 }
5581 buf_ch = RE_CHAR_TO_MULTIBYTE (*d);
5582 if (! CHAR_BYTE8_P (buf_ch))
5583 {
5584 buf_ch = TRANSLATE (buf_ch);
5585 buf_ch = RE_CHAR_TO_UNIBYTE (buf_ch);
5586 if (buf_ch < 0)
5587 buf_ch = *d;
5588 }
5589 if (buf_ch != pat_ch)
5466 { 5590 {
5467 d = dfail; 5591 d = dfail;
5468 goto fail; 5592 goto fail;
5469 } 5593 }
5594 p += pat_charlen;
5595 d++;
5470 } 5596 }
5471 while (--mcnt); 5597 while (--mcnt);
5472#endif 5598#endif
@@ -5482,7 +5608,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
5482 DEBUG_PRINT1 ("EXECUTING anychar.\n"); 5608 DEBUG_PRINT1 ("EXECUTING anychar.\n");
5483 5609
5484 PREFETCH (); 5610 PREFETCH ();
5485 buf_ch = RE_STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen); 5611 buf_ch = RE_STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen,
5612 target_multibyte);
5486 buf_ch = TRANSLATE (buf_ch); 5613 buf_ch = TRANSLATE (buf_ch);
5487 5614
5488 if ((!(bufp->syntax & RE_DOT_NEWLINE) 5615 if ((!(bufp->syntax & RE_DOT_NEWLINE)
@@ -5526,10 +5653,30 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
5526 } 5653 }
5527 5654
5528 PREFETCH (); 5655 PREFETCH ();
5529 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len); 5656 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len, target_multibyte);
5530 TRANSLATE_VIA_MULTIBYTE (c); /* The character to match. */ 5657 if (target_multibyte)
5658 {
5659 int c1;
5531 5660
5532 if (! multibyte || IS_REAL_ASCII (c)) 5661 c = TRANSLATE (c);
5662 c1 = RE_CHAR_TO_UNIBYTE (c);
5663 if (c1 >= 0)
5664 c = c1;
5665 }
5666 else
5667 {
5668 int c1 = RE_CHAR_TO_MULTIBYTE (c);
5669
5670 if (! CHAR_BYTE8_P (c1))
5671 {
5672 c1 = TRANSLATE (c1);
5673 c1 = RE_CHAR_TO_UNIBYTE (c1);
5674 if (c1 >= 0)
5675 c = c1;
5676 }
5677 }
5678
5679 if (c < (1 << BYTEWIDTH))
5533 { /* Lookup bitmap. */ 5680 { /* Lookup bitmap. */
5534 /* Cast to `unsigned' instead of `unsigned char' in 5681 /* Cast to `unsigned' instead of `unsigned char' in
5535 case the bit list is a full 32 bytes long. */ 5682 case the bit list is a full 32 bytes long. */
@@ -6096,7 +6243,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
6096 UPDATE_SYNTAX_TABLE (charpos); 6243 UPDATE_SYNTAX_TABLE (charpos);
6097#endif 6244#endif
6098 PREFETCH (); 6245 PREFETCH ();
6099 c2 = RE_STRING_CHAR (d, dend - d); 6246 c2 = RE_STRING_CHAR (d, dend - d, target_multibyte);
6100 s2 = SYNTAX (c2); 6247 s2 = SYNTAX (c2);
6101 6248
6102 /* Case 2: S2 is neither Sword nor Ssymbol. */ 6249 /* Case 2: S2 is neither Sword nor Ssymbol. */
@@ -6149,7 +6296,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
6149 if (!AT_STRINGS_END (d)) 6296 if (!AT_STRINGS_END (d))
6150 { 6297 {
6151 PREFETCH_NOLIMIT (); 6298 PREFETCH_NOLIMIT ();
6152 c2 = RE_STRING_CHAR (d, dend - d); 6299 c2 = RE_STRING_CHAR (d, dend - d, target_multibyte);
6153#ifdef emacs 6300#ifdef emacs
6154 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1); 6301 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
6155#endif 6302#endif