aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorKenichi Handa2003-05-30 07:00:29 +0000
committerKenichi Handa2003-05-30 07:00:29 +0000
commit6fdd04b0986362911b55e6dc5308f951debf21a2 (patch)
treed4b75740d30704d398c682e8d08c72e4ce035e89 /src
parent09d1b24e07787b770002ea01b034bc4c4cf62699 (diff)
downloademacs-6fdd04b0986362911b55e6dc5308f951debf21a2.tar.gz
emacs-6fdd04b0986362911b55e6dc5308f951debf21a2.zip
(GET_CHAR_BEFORE_2): Check multibyte, not
target_multibyte. Even in a unibyte case, return a converted multibyte char. (GET_CHAR_AFTER): New macro. (PATFETCH): Translate via multibyte char. (HANDLE_UNIBYTE_RANGE): Delete this macro. (SETUP_MULTIBYTE_RANGE): New macro. (regex_compile): Set up compiled code so that its multibyteness matches that of a target. Fix the handling of "[X-YZ]" using SETUP_MULTIBYTE_RANGE. (analyse_first) <charset>: For filling fastmap for all multibyte characters, don't check by BASE_LEADING_CODE_P. (re_search_2): Don't check RE_TARGET_MULTIBYTE_P (bufp). It is the same as RE_MULTIBYTE_P (bufp) now. (mutually_exclusive_p): Check by (! multibyte || IS_REAL_ASCII (c)). (TARGET_CHAR_AND_LENGTH): Delete this macro. (TRANSLATE_VIA_MULTIBYTE): New macro. (re_match_2_internal): Don't check RE_TARGET_MULTIBYTE_P (bufp). It is the same as RE_MULTIBYTE_P (bufp) now. <exactn>: Translate via multibyte. <anychar>: Fetch a character by RE_STRING_CHAR_AND_LENGTH. Don't translate it. <charset, charset_not>: Fetch a character by RE_STRING_CHAR_AND_LENGTH. Translate via multibyte. <duplicate>: Call bcmp_translate with the last arg `multibyte'. <wordbound, notwordbound, wordbeg, wordend, syntaxspec, notsyntaxspec, categoryspec, notcategoryspec>: Fetch a character by GET_CHAR_AFTER. (bcmp_translate): Likewise.
Diffstat (limited to 'src')
-rw-r--r--src/regex.c490
1 files changed, 227 insertions, 263 deletions
diff --git a/src/regex.c b/src/regex.c
index 8cbc5f7949a..bea8433153d 100644
--- a/src/regex.c
+++ b/src/regex.c
@@ -152,12 +152,12 @@
152# define RE_STRING_CHAR_AND_LENGTH(p, s, len) \ 152# define RE_STRING_CHAR_AND_LENGTH(p, s, len) \
153 (multibyte ? (STRING_CHAR_AND_LENGTH (p, s, len)) : ((len) = 1, *(p))) 153 (multibyte ? (STRING_CHAR_AND_LENGTH (p, s, len)) : ((len) = 1, *(p)))
154 154
155/* Set C a (possibly multibyte) character before P. P points into a 155/* Set C a (possibly converted to multibyte) character before P. P
156 string which is the virtual concatenation of STR1 (which ends at 156 points into a string which is the virtual concatenation of STR1
157 END1) or STR2 (which ends at END2). */ 157 (which ends at END1) or STR2 (which ends at END2). */
158# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \ 158# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
159 do { \ 159 do { \
160 if (target_multibyte) \ 160 if (multibyte) \
161 { \ 161 { \
162 re_char *dtemp = (p) == (str2) ? (end1) : (p); \ 162 re_char *dtemp = (p) == (str2) ? (end1) : (p); \
163 re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \ 163 re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
@@ -167,11 +167,24 @@
167 else \ 167 else \
168 { \ 168 { \
169 (c = ((p) == (str2) ? (end1) : (p))[-1]); \ 169 (c = ((p) == (str2) ? (end1) : (p))[-1]); \
170 if (multibyte) \ 170 MAKE_CHAR_MULTIBYTE (c); \
171 MAKE_CHAR_MULTIBYTE (c); \
172 } \ 171 } \
173 } while (0) 172 } while (0)
174 173
174/* Set C a (possibly converted to multibyte) character at P, and set
175 LEN to the byte length of that character. */
176# define GET_CHAR_AFTER(c, p, len) \
177 do { \
178 if (multibyte) \
179 c = STRING_CHAR_AND_LENGTH (p, 0, len); \
180 else \
181 { \
182 c = *p; \
183 len = 1; \
184 MAKE_CHAR_MULTIBYTE (c); \
185 } \
186 } while (0)
187
175 188
176#else /* not emacs */ 189#else /* not emacs */
177 190
@@ -251,10 +264,13 @@ enum syntaxcode { Swhitespace = 0, Sword = 1 };
251# define RE_STRING_CHAR_AND_LENGTH STRING_CHAR_AND_LENGTH 264# define RE_STRING_CHAR_AND_LENGTH STRING_CHAR_AND_LENGTH
252# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \ 265# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
253 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1))) 266 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
267# define GET_CHAR_AFTER(c, p, len) \
268 (c = *p, len = 1)
254# define MAKE_CHAR(charset, c1, c2) (c1) 269# define MAKE_CHAR(charset, c1, c2) (c1)
255# define BYTE8_TO_CHAR(c) (c) 270# define BYTE8_TO_CHAR(c) (c)
256# define CHAR_BYTE8_P(c) (0) 271# define CHAR_BYTE8_P(c) (0)
257# define MAKE_CHAR_MULTIBYTE(c) 0 272# define MAKE_CHAR_MULTIBYTE(c) (c)
273# define MAKE_CHAR_UNIBYTE(c) (c)
258# define CHAR_LEADING_CODE(c) (c) 274# define CHAR_LEADING_CODE(c) (c)
259#endif /* not emacs */ 275#endif /* not emacs */
260 276
@@ -1676,6 +1692,8 @@ static int analyse_first _RE_ARGS ((re_char *p, re_char *pend,
1676 if (! multibyte) \ 1692 if (! multibyte) \
1677 MAKE_CHAR_MULTIBYTE (c); \ 1693 MAKE_CHAR_MULTIBYTE (c); \
1678 c = TRANSLATE (c); \ 1694 c = TRANSLATE (c); \
1695 if (! target_multibyte) \
1696 MAKE_CHAR_UNIBYTE (c); \
1679 } while (0) 1697 } while (0)
1680 1698
1681/* Fetch the next character in the uncompiled pattern, with no 1699/* Fetch the next character in the uncompiled pattern, with no
@@ -1933,46 +1951,27 @@ struct range_table_work_area
1933 not that easy because macros called within it assumes various 1951 not that easy because macros called within it assumes various
1934 variables being defined. */ 1952 variables being defined. */
1935 1953
1936#define HANDLE_UNIBYTE_RANGE(work_area, c1, c2) \ 1954#define SETUP_MULTIBYTE_RANGE(work_area, c0, c1) \
1937 do { \ 1955 do { \
1938 int char_table[257]; \ 1956 re_wchar_t c, t, t_last; \
1939 int i, j, c; \ 1957 int n; \
1940 \ 1958 \
1941 char_table[(c1) - 1] = -2; /* head sentinel */ \ 1959 c = (c0); \
1942 for (i = (c1); i <= (c2); i++) \ 1960 t_last = multibyte ? TRANSLATE (c) : TRANSLATE (MAKE_CHAR_MULTIBYTE (c)); \
1943 char_table[i] = TRANSLATE (unibyte_char_to_multibyte (i)); \ 1961 for (c++, n = 1; c <= (c1); c++, n++) \
1944 char_table[i] = MAX_CHAR + 2; /* tail sentinel */ \ 1962 { \
1945 \ 1963 t = multibyte ? TRANSLATE (c) : TRANSLATE (MAKE_CHAR_MULTIBYTE (c)); \
1946 /* As the number of data is small (at most 128) and we can expect \ 1964 if (t_last + n == t) \
1947 that data in char_table are mostly sorted, we use fairly simple \ 1965 continue; \
1948 `insertion sort'. */ \ 1966 SET_RANGE_TABLE_WORK_AREA ((work_area), t_last, t_last + n - 1); \
1949 for (i = (c1) + 1; i <= (c2); i++) \ 1967 t_last = t; \
1950 { \ 1968 n = 1; \
1951 c = char_table[i]; \ 1969 } \
1952 j = i; \ 1970 if (n > 0) \
1953 while (char_table[j - 1] > c) \ 1971 SET_RANGE_TABLE_WORK_AREA ((work_area), t_last, t_last + n - 1); \
1954 char_table[j] = char_table[j - 1], j--; \
1955 char_table[j] = c; \
1956 } \
1957 \
1958 for (i = (c1); i <= (c2); i++) \
1959 { \
1960 c = char_table[i]; \
1961 if (! IS_REAL_ASCII (c)) \
1962 break; \
1963 SET_LIST_BIT (c); \
1964 } \
1965 while (i <= (c2)) \
1966 { \
1967 c = char_table[i]; \
1968 for (j = i + 1; j <= (c2); j++) \
1969 if (char_table[j] - c != j - i) \
1970 break; \
1971 SET_RANGE_TABLE_WORK_AREA ((work_area), c, char_table[j - 1]); \
1972 i = j; \
1973 } \
1974 } while (0) 1972 } while (0)
1975 1973
1974
1976#endif /* emacs */ 1975#endif /* emacs */
1977 1976
1978/* Get the next unsigned number in the uncompiled pattern. */ 1977/* Get the next unsigned number in the uncompiled pattern. */
@@ -2258,6 +2257,9 @@ regex_compile (pattern, size, syntax, bufp)
2258 /* If the object matched can contain multibyte characters. */ 2257 /* If the object matched can contain multibyte characters. */
2259 const boolean multibyte = RE_MULTIBYTE_P (bufp); 2258 const boolean multibyte = RE_MULTIBYTE_P (bufp);
2260 2259
2260 /* If a target can contain multibyte characters. */
2261 const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
2262
2261#ifdef DEBUG 2263#ifdef DEBUG
2262 debug++; 2264 debug++;
2263 DEBUG_PRINT1 ("\nCompiling pattern: "); 2265 DEBUG_PRINT1 ("\nCompiling pattern: ");
@@ -2572,10 +2574,6 @@ regex_compile (pattern, size, syntax, bufp)
2572 break; 2574 break;
2573 } 2575 }
2574 2576
2575 /* What should we do for the character which is
2576 greater than 0x7F, but not BASE_LEADING_CODE_P?
2577 XXX */
2578
2579 /* See if we're at the beginning of a possible character 2577 /* See if we're at the beginning of a possible character
2580 class. */ 2578 class. */
2581 2579
@@ -2671,65 +2669,41 @@ regex_compile (pattern, size, syntax, bufp)
2671 2669
2672 /* Fetch the character which ends the range. */ 2670 /* Fetch the character which ends the range. */
2673 PATFETCH_RAW (c1); 2671 PATFETCH_RAW (c1);
2674#ifdef emacs 2672 if (c > c1)
2675 if (multibyte)
2676 {
2677 c = TRANSLATE (c);
2678 c1 = TRANSLATE (c1);
2679 if (! IS_REAL_ASCII (c1))
2680 {
2681 SET_RANGE_TABLE_WORK_AREA (range_table_work,
2682 c, c1);
2683 c1 = 127;
2684 }
2685 }
2686 else
2687 { 2673 {
2688 if (! IS_REAL_ASCII (c1)) 2674 if (syntax & RE_NO_EMPTY_RANGES)
2689 { 2675 FREE_STACK_RETURN (REG_ERANGE);
2690 int c2 = MAX (c, 128); 2676 /* Else, repeat the loop. */
2691
2692 HANDLE_UNIBYTE_RANGE (range_table_work, c2, c1);
2693 c1 = 127;
2694 }
2695 } 2677 }
2696#endif
2697 } 2678 }
2698 else 2679 else
2680 c1 = c;
2681#ifndef emacs
2682 c = TRANSLATE (c);
2683 c1 = TRANSLATE (c1);
2684#else /* not emacs */
2685 if (target_multibyte)
2699 { 2686 {
2700 /* Range from C to C. */ 2687 if (! IS_REAL_ASCII (c1))
2701 if (! multibyte)
2702 MAKE_CHAR_MULTIBYTE (c);
2703 c = TRANSLATE (c);
2704 if (IS_REAL_ASCII (c))
2705 c1 = c;
2706 else
2707 { 2688 {
2708 SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c); 2689 re_wchar_t c0 = MAX (c, 128);
2709 c = -1; /* Suppress setting bitmap. */ 2690
2691 SETUP_MULTIBYTE_RANGE (range_table_work, c0, c1);
2692 c1 = MIN (127, c1);
2710 } 2693 }
2711 } 2694 }
2712 2695 else
2713 /* Set the range into bitmap */
2714 if (c >= 0)
2715 { 2696 {
2716 re_wchar_t this_char; 2697 if (multibyte)
2717 int range_start = c, range_end = c1;
2718
2719 /* If the start is after the end, the range is empty. */
2720 if (range_start > range_end)
2721 {
2722 if (syntax & RE_NO_EMPTY_RANGES)
2723 FREE_STACK_RETURN (REG_ERANGE);
2724 /* Else, repeat the loop. */
2725 }
2726 else
2727 { 2698 {
2728 for (this_char = range_start; this_char <= range_end; 2699 MAKE_CHAR_UNIBYTE (c);
2729 this_char++) 2700 MAKE_CHAR_UNIBYTE (c1);
2730 SET_LIST_BIT (TRANSLATE (this_char));
2731 } 2701 }
2732 } 2702 }
2703#endif /* not emacs */
2704 /* Set the range into bitmap */
2705 for (; c <= c1; c++)
2706 SET_LIST_BIT (TRANSLATE (c));
2733 } 2707 }
2734 2708
2735 /* Discard any (non)matching list bytes that are all 0 at the 2709 /* Discard any (non)matching list bytes that are all 0 at the
@@ -3264,7 +3238,11 @@ regex_compile (pattern, size, syntax, bufp)
3264 /* You might think it would be useful for \ to mean 3238 /* You might think it would be useful for \ to mean
3265 not to translate; but if we don't translate it 3239 not to translate; but if we don't translate it
3266 it will never match anything. */ 3240 it will never match anything. */
3241 /* Actually we don't have to translate it now, because
3242 it is anyway translated later. */
3243#if 0
3267 c = TRANSLATE (c); 3244 c = TRANSLATE (c);
3245#endif
3268 goto normal_char; 3246 goto normal_char;
3269 } 3247 }
3270 break; 3248 break;
@@ -3307,8 +3285,17 @@ regex_compile (pattern, size, syntax, bufp)
3307 if (! multibyte) 3285 if (! multibyte)
3308 MAKE_CHAR_MULTIBYTE (c); 3286 MAKE_CHAR_MULTIBYTE (c);
3309 c = TRANSLATE (c); 3287 c = TRANSLATE (c);
3310 len = CHAR_STRING (c, b); 3288 if (target_multibyte)
3311 b += len; 3289 {
3290 len = CHAR_STRING (c, b);
3291 b += len;
3292 }
3293 else
3294 {
3295 MAKE_CHAR_UNIBYTE (c);
3296 *b++ = c;
3297 len = 1;
3298 }
3312 (*pending_exact) += len; 3299 (*pending_exact) += len;
3313 } 3300 }
3314 3301
@@ -3334,6 +3321,11 @@ regex_compile (pattern, size, syntax, bufp)
3334 /* We have succeeded; set the length of the buffer. */ 3321 /* We have succeeded; set the length of the buffer. */
3335 bufp->used = b - bufp->buffer; 3322 bufp->used = b - bufp->buffer;
3336 3323
3324#ifdef emacs
3325 /* Now the buffer is adjusted for the multibyteness of a target. */
3326 bufp->multibyte = bufp->target_multibyte;
3327#endif
3328
3337#ifdef DEBUG 3329#ifdef DEBUG
3338 if (debug > 0) 3330 if (debug > 0)
3339 { 3331 {
@@ -3513,8 +3505,6 @@ group_in_compile_stack (compile_stack, regnum)
3513 bother filling it up (obviously) and only return whether the 3505 bother filling it up (obviously) and only return whether the
3514 pattern could potentially match the empty string. 3506 pattern could potentially match the empty string.
3515 3507
3516 MULTIBYTE is always 1 for Emacs, and 0 otherwise.
3517
3518 Return 1 if p..pend might match the empty string. 3508 Return 1 if p..pend might match the empty string.
3519 Return 0 if p..pend matches at least one char. 3509 Return 0 if p..pend matches at least one char.
3520 Return -1 if fastmap was not updated accurately. */ 3510 Return -1 if fastmap was not updated accurately. */
@@ -3600,7 +3590,8 @@ analyse_first (p, pend, fastmap, multibyte)
3600 if (!fastmap) break; 3590 if (!fastmap) break;
3601 { 3591 {
3602 /* Chars beyond end of bitmap are possible matches. */ 3592 /* Chars beyond end of bitmap are possible matches. */
3603 /* Emacs uses the bitmap only for ASCII characters. */ 3593 /* In a multibyte case, the bitmap is used only for ASCII
3594 characters. */
3604 int limit = multibyte ? 128 : (1 << BYTEWIDTH); 3595 int limit = multibyte ? 128 : (1 << BYTEWIDTH);
3605 3596
3606 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH; 3597 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
@@ -3623,14 +3614,12 @@ analyse_first (p, pend, fastmap, multibyte)
3623 || (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2]) 3614 || (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3624 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0)) 3615 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
3625 /* If we can match a character class, we can match 3616 /* If we can match a character class, we can match
3626 any character set. */ 3617 any multibyte characters. */
3627 { 3618 {
3628 set_fastmap_for_multibyte_characters:
3629 if (match_any_multibyte_characters == false) 3619 if (match_any_multibyte_characters == false)
3630 { 3620 {
3631 for (j = 0x80; j < 0x100; j++) /* XXX */ 3621 for (j = 0x80; j < (1 << BYTEWIDTH); j++)
3632 if (BASE_LEADING_CODE_P (j)) 3622 fastmap[j] = 1;
3633 fastmap[j] = 1;
3634 match_any_multibyte_characters = true; 3623 match_any_multibyte_characters = true;
3635 } 3624 }
3636 } 3625 }
@@ -3688,9 +3677,16 @@ analyse_first (p, pend, fastmap, multibyte)
3688 fastmap[j] = 1; 3677 fastmap[j] = 1;
3689 3678
3690 if (multibyte) 3679 if (multibyte)
3691 /* Any character set can possibly contain a character 3680 {
3692 whose category is K (or not). */ 3681 /* Any character set can possibly contain a character
3693 goto set_fastmap_for_multibyte_characters; 3682 whose category is K (or not). */
3683 if (match_any_multibyte_characters == false)
3684 {
3685 for (j = 0x80; j < (1 << BYTEWIDTH); j++)
3686 fastmap[j] = 1;
3687 match_any_multibyte_characters = true;
3688 }
3689 }
3694 break; 3690 break;
3695 3691
3696 /* All cases after this match the empty string. These end with 3692 /* All cases after this match the empty string. These end with
@@ -3942,15 +3938,9 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
3942 int total_size = size1 + size2; 3938 int total_size = size1 + size2;
3943 int endpos = startpos + range; 3939 int endpos = startpos + range;
3944 boolean anchored_start; 3940 boolean anchored_start;
3945 3941 /* Nonzero if BUFP is setup for multibyte characters. We are sure
3946 /* Nonzero if BUFP is setup for multibyte characters. */ 3942 that it is the same as RE_TARGET_MULTIBYTE_P (bufp). */
3947#ifdef emacs 3943 const boolean multibyte = RE_MULTIBYTE_P (bufp);
3948 const boolean multibyte = 1;
3949#else
3950 const boolean multibyte = 0;
3951#endif
3952 /* Nonzero if STR1 and STR2 contains multibyte characters. */
3953 const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
3954 3944
3955 /* Check for out-of-range STARTPOS. */ 3945 /* Check for out-of-range STARTPOS. */
3956 if (startpos < 0 || startpos > total_size) 3946 if (startpos < 0 || startpos > total_size)
@@ -4039,59 +4029,57 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
4039 inside the loop. */ 4029 inside the loop. */
4040 if (RE_TRANSLATE_P (translate)) 4030 if (RE_TRANSLATE_P (translate))
4041 { 4031 {
4042 if (target_multibyte) 4032 if (multibyte)
4043 while (range > lim) 4033 while (range > lim)
4044 { 4034 {
4045 int buf_charlen; 4035 int buf_charlen;
4046 4036
4047 buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim, 4037 buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim,
4048 buf_charlen); 4038 buf_charlen);
4049
4050 buf_ch = RE_TRANSLATE (translate, buf_ch); 4039 buf_ch = RE_TRANSLATE (translate, buf_ch);
4051 if (fastmap[CHAR_LEADING_CODE (buf_ch)]) 4040 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
4052 break; 4041 break;
4053
4054 range -= buf_charlen; 4042 range -= buf_charlen;
4055 d += buf_charlen; 4043 d += buf_charlen;
4056 } 4044 }
4057 else if (multibyte) 4045 else
4058 while (range > lim) 4046 while (range > lim)
4059 { 4047 {
4060 buf_ch = *d; 4048 buf_ch = *d;
4049#ifdef emacs
4061 MAKE_CHAR_MULTIBYTE (buf_ch); 4050 MAKE_CHAR_MULTIBYTE (buf_ch);
4062 buf_ch = RE_TRANSLATE (translate, buf_ch); 4051#endif
4063 if (fastmap[CHAR_LEADING_CODE (buf_ch)]) 4052 buf_ch = RE_TRANSLATE (buf_ch);
4053#ifdef emacs
4054 MAKE_CHAR_UNIBYTE (buf_ch);
4055#endif
4056 if (fastmap[buf_ch])
4064 break; 4057 break;
4065
4066 d++; 4058 d++;
4067 range--; 4059 range--;
4068 } 4060 }
4061 }
4062 else
4063 {
4064 if (multibyte)
4065 while (range > lim)
4066 {
4067 int buf_charlen;
4068
4069 buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim,
4070 buf_charlen);
4071 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
4072 break;
4073 range -= buf_charlen;
4074 d += buf_charlen;
4075 }
4069 else 4076 else
4070 while (range > lim 4077 while (range > lim && !fastmap[*d])
4071 && !fastmap[RE_TRANSLATE (translate, *d)])
4072 { 4078 {
4073 d++; 4079 d++;
4074 range--; 4080 range--;
4075 } 4081 }
4076 } 4082 }
4077 else if (multibyte && ! target_multibyte)
4078 while (range > lim)
4079 {
4080 buf_ch = *d;
4081 MAKE_CHAR_MULTIBYTE (buf_ch);
4082 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
4083 break;
4084
4085 d++;
4086 range--;
4087 }
4088 else
4089 while (range > lim && !fastmap[*d])
4090 {
4091 d++;
4092 range--;
4093 }
4094
4095 startpos += irange - range; 4083 startpos += irange - range;
4096 } 4084 }
4097 else /* Searching backwards. */ 4085 else /* Searching backwards. */
@@ -4102,14 +4090,7 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
4102 4090
4103 if (multibyte) 4091 if (multibyte)
4104 { 4092 {
4105 /* Case of Emacs. */ 4093 buf_ch = STRING_CHAR (d, room);
4106 if (target_multibyte)
4107 buf_ch = RE_STRING_CHAR (d, room);
4108 else
4109 {
4110 buf_ch = *d;
4111 MAKE_CHAR_MULTIBYTE (buf_ch);
4112 }
4113 buf_ch = TRANSLATE (buf_ch); 4094 buf_ch = TRANSLATE (buf_ch);
4114 if (! fastmap[CHAR_LEADING_CODE (buf_ch)]) 4095 if (! fastmap[CHAR_LEADING_CODE (buf_ch)])
4115 goto advance; 4096 goto advance;
@@ -4147,7 +4128,7 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
4147 else if (range > 0) 4128 else if (range > 0)
4148 { 4129 {
4149 /* Update STARTPOS to the next character boundary. */ 4130 /* Update STARTPOS to the next character boundary. */
4150 if (target_multibyte) 4131 if (multibyte)
4151 { 4132 {
4152 re_char *p = POS_ADDR_VSTRING (startpos); 4133 re_char *p = POS_ADDR_VSTRING (startpos);
4153 re_char *pend = STOP_ADDR_VSTRING (startpos); 4134 re_char *pend = STOP_ADDR_VSTRING (startpos);
@@ -4170,7 +4151,7 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
4170 startpos--; 4151 startpos--;
4171 4152
4172 /* Update STARTPOS to the previous character boundary. */ 4153 /* Update STARTPOS to the previous character boundary. */
4173 if (target_multibyte) 4154 if (multibyte)
4174 { 4155 {
4175 re_char *p = POS_ADDR_VSTRING (startpos); 4156 re_char *p = POS_ADDR_VSTRING (startpos);
4176 int len = 0; 4157 int len = 0;
@@ -4178,20 +4159,10 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
4178 /* Find the head of multibyte form. */ 4159 /* Find the head of multibyte form. */
4179 while (!CHAR_HEAD_P (*p)) 4160 while (!CHAR_HEAD_P (*p))
4180 p--, len++; 4161 p--, len++;
4181 4162 range += len;
4182 /* Adjust it. */ 4163 if (range > 0)
4183#if 0 /* XXX */ 4164 break;
4184 if (MULTIBYTE_FORM_LENGTH (p, len + 1) != (len + 1)) 4165 startpos -= len;
4185 ;
4186 else
4187#endif
4188 {
4189 range += len;
4190 if (range > 0)
4191 break;
4192
4193 startpos -= len;
4194 }
4195 } 4166 }
4196 } 4167 }
4197 } 4168 }
@@ -4424,7 +4395,7 @@ mutually_exclusive_p (bufp, p1, p2)
4424 4395
4425 /* Test if C is listed in charset (or charset_not) 4396 /* Test if C is listed in charset (or charset_not)
4426 at `p1'. */ 4397 at `p1'. */
4427 if (SINGLE_BYTE_CHAR_P (c)) 4398 if (! multibyte || IS_REAL_ASCII (c))
4428 { 4399 {
4429 if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH 4400 if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
4430 && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) 4401 && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
@@ -4467,9 +4438,10 @@ mutually_exclusive_p (bufp, p1, p2)
4467 size of bitmap table of P1 is extracted by 4438 size of bitmap table of P1 is extracted by
4468 using macro `CHARSET_BITMAP_SIZE'. 4439 using macro `CHARSET_BITMAP_SIZE'.
4469 4440
4470 Since we know that all the character listed in 4441 In a multibyte case, we know that all the character
4471 P2 is ASCII, it is enough to test only bitmap 4442 listed in P2 is ASCII. In a unibyte case, P1 has only a
4472 table of P1. */ 4443 bitmap table. So, in both cases, it is enough to test
4444 only the bitmap table of P1. */
4473 4445
4474 if ((re_opcode_t) *p1 == charset) 4446 if ((re_opcode_t) *p1 == charset)
4475 { 4447 {
@@ -4628,13 +4600,20 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
4628WEAK_ALIAS (__re_match_2, re_match_2) 4600WEAK_ALIAS (__re_match_2, re_match_2)
4629 4601
4630#ifdef emacs 4602#ifdef emacs
4631#define TARGET_CHAR_AND_LENGTH(d, len, actual_len) \ 4603#define TRANSLATE_VIA_MULTIBYTE(c) \
4632 (target_multibyte \ 4604 do { \
4633 ? STRING_CHAR_AND_LENGTH (d, len, actual_len) \ 4605 if (multibyte) \
4634 : (actual_len = 1, unibyte_char_to_multibyte (*d))) 4606 (c) = TRANSLATE (c); \
4607 else \
4608 { \
4609 MAKE_CHAR_MULTIBYTE (c); \
4610 (c) = TRANSLATE (c); \
4611 MAKE_CHAR_UNIBYTE (c); \
4612 } \
4613 } while (0)
4614
4635#else 4615#else
4636#define TARGET_CHAR_AND_LENGTH(d, len, actual_len) \ 4616#define TRANSLATE_VIA_MULTIBYTE(c) ((c) = TRANSLATE (c))
4637 (actual_len = 1, *d)
4638#endif 4617#endif
4639 4618
4640 4619
@@ -4677,14 +4656,9 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
4677 /* We use this to map every character in the string. */ 4656 /* We use this to map every character in the string. */
4678 RE_TRANSLATE_TYPE translate = bufp->translate; 4657 RE_TRANSLATE_TYPE translate = bufp->translate;
4679 4658
4680 /* Nonzero if BUFP is setup for multibyte characters. */ 4659 /* Nonzero if BUFP is setup for multibyte characters. We are sure
4681#ifdef emacs 4660 that it is the same as RE_TARGET_MULTIBYTE_P (bufp). */
4682 const boolean multibyte = 1; 4661 const boolean multibyte = RE_MULTIBYTE_P (bufp);
4683#else
4684 const boolean multibyte = 0;
4685#endif
4686 /* Nonzero if STR1 and STR2 contains multibyte characters. */
4687 const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
4688 4662
4689 /* Failure point stack. Each place that can handle a failure further 4663 /* Failure point stack. Each place that can handle a failure further
4690 down the line pushes a failure point on this stack. It consists of 4664 down the line pushes a failure point on this stack. It consists of
@@ -5037,82 +5011,73 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
5037 /* Remember the start point to rollback upon failure. */ 5011 /* Remember the start point to rollback upon failure. */
5038 dfail = d; 5012 dfail = d;
5039 5013
5014#ifndef emacs
5040 /* This is written out as an if-else so we don't waste time 5015 /* This is written out as an if-else so we don't waste time
5041 testing `translate' inside the loop. */ 5016 testing `translate' inside the loop. */
5042 if (RE_TRANSLATE_P (translate)) 5017 if (RE_TRANSLATE_P (translate))
5043 { 5018 do
5044 if (multibyte) 5019 {
5045 do 5020 PREFETCH ();
5046 { 5021 if (RE_TRANSLATE (translate, *d) != *p++)
5047 int pat_charlen, buf_charlen;
5048 unsigned int pat_ch, buf_ch;
5049
5050 PREFETCH ();
5051 pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
5052 buf_ch = TARGET_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
5053
5054 if (RE_TRANSLATE (translate, buf_ch)
5055 != pat_ch)
5056 {
5057 d = dfail;
5058 goto fail;
5059 }
5060
5061 p += pat_charlen;
5062 d += buf_charlen;
5063 mcnt -= pat_charlen;
5064 }
5065 while (mcnt > 0);
5066 else
5067 do
5068 { 5022 {
5069 PREFETCH (); 5023 d = dfail;
5070 if (RE_TRANSLATE (translate, *d) != *p++) 5024 goto fail;
5071 {
5072 d = dfail;
5073 goto fail;
5074 }
5075 d++;
5076 } 5025 }
5077 while (--mcnt); 5026 d++;
5078 } 5027 }
5028 while (--mcnt);
5079 else 5029 else
5080 { 5030 do
5081 if (multibyte == target_multibyte) 5031 {
5082 do 5032 PREFETCH ();
5033 if (*d++ != *p++)
5083 { 5034 {
5084 PREFETCH (); 5035 d = dfail;
5085 if (*d++ != *p++) 5036 goto fail;
5086 {
5087 d = dfail;
5088 goto fail;
5089 }
5090 } 5037 }
5091 while (--mcnt); 5038 }
5092 else /* i.e. multibyte && ! target_multibyte */ 5039 while (--mcnt);
5093 do 5040#else /* emacs */
5041 /* The cost of testing `translate' is comparatively small. */
5042 if (multibyte)
5043 do
5044 {
5045 int pat_charlen, buf_charlen;
5046 unsigned int pat_ch, buf_ch;
5047
5048 PREFETCH ();
5049 pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
5050 buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
5051
5052 if (TRANSLATE (buf_ch) != pat_ch)
5094 { 5053 {
5095 int pat_charlen, buf_charlen; 5054 d = dfail;
5096 unsigned int pat_ch, buf_ch; 5055 goto fail;
5056 }
5097 5057
5098 PREFETCH (); 5058 p += pat_charlen;
5099 pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen); 5059 d += buf_charlen;
5100 buf_ch = TARGET_CHAR_AND_LENGTH (d, dend - d, buf_charlen); 5060 mcnt -= pat_charlen;
5061 }
5062 while (mcnt > 0);
5063 else
5064 do
5065 {
5066 unsigned int buf_ch;
5101 5067
5102 if (pat_ch != buf_ch) 5068 PREFETCH ();
5103 { 5069 buf_ch = *d++;
5104 d = dfail; 5070 TRANSLATE_VIA_MULTIBYTE (buf_ch);
5105 goto fail; 5071 if (buf_ch != *p++)
5106 } 5072 {
5107 p += pat_charlen; 5073 d = dfail;
5108 d += buf_charlen; 5074 goto fail;
5109 mcnt -= pat_charlen;
5110 } 5075 }
5111 while (mcnt > 0); 5076 }
5112 } 5077 while (--mcnt);
5078#endif
5113 break; 5079 break;
5114 5080
5115
5116 /* Match any character except possibly a newline or a null. */ 5081 /* Match any character except possibly a newline or a null. */
5117 case anychar: 5082 case anychar:
5118 { 5083 {
@@ -5122,8 +5087,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
5122 DEBUG_PRINT1 ("EXECUTING anychar.\n"); 5087 DEBUG_PRINT1 ("EXECUTING anychar.\n");
5123 5088
5124 PREFETCH (); 5089 PREFETCH ();
5125 buf_ch = TARGET_CHAR_AND_LENGTH (d, dend - d, buf_charlen); 5090 buf_ch = RE_STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
5126 buf_ch = TRANSLATE (buf_ch);
5127 5091
5128 if ((!(bufp->syntax & RE_DOT_NEWLINE) 5092 if ((!(bufp->syntax & RE_DOT_NEWLINE)
5129 && buf_ch == '\n') 5093 && buf_ch == '\n')
@@ -5166,8 +5130,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
5166 } 5130 }
5167 5131
5168 PREFETCH (); 5132 PREFETCH ();
5169 c = TARGET_CHAR_AND_LENGTH (d, dend - d, len); 5133 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
5170 c = TRANSLATE (c); /* The character to match. */ 5134 TRANSLATE_VIA_MULTIBYTE (c); /* The character to match. */
5171 5135
5172 if (! multibyte || IS_REAL_ASCII (c)) 5136 if (! multibyte || IS_REAL_ASCII (c))
5173 { /* Lookup bitmap. */ 5137 { /* Lookup bitmap. */
@@ -5309,7 +5273,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
5309 /* Compare that many; failure if mismatch, else move 5273 /* Compare that many; failure if mismatch, else move
5310 past them. */ 5274 past them. */
5311 if (RE_TRANSLATE_P (translate) 5275 if (RE_TRANSLATE_P (translate)
5312 ? bcmp_translate (d, d2, mcnt, translate, target_multibyte) 5276 ? bcmp_translate (d, d2, mcnt, translate, multibyte)
5313 : memcmp (d, d2, mcnt)) 5277 : memcmp (d, d2, mcnt))
5314 { 5278 {
5315 d = dfail; 5279 d = dfail;
@@ -5596,7 +5560,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
5596 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1); 5560 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
5597#endif 5561#endif
5598 PREFETCH_NOLIMIT (); 5562 PREFETCH_NOLIMIT ();
5599 c2 = TARGET_CHAR_AND_LENGTH (d, dend - d, dummy); 5563 GET_CHAR_AFTER (c2, d, dummy);
5600 s2 = SYNTAX (c2); 5564 s2 = SYNTAX (c2);
5601 5565
5602 if (/* Case 2: Only one of S1 and S2 is Sword. */ 5566 if (/* Case 2: Only one of S1 and S2 is Sword. */
@@ -5632,7 +5596,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
5632 UPDATE_SYNTAX_TABLE (charpos); 5596 UPDATE_SYNTAX_TABLE (charpos);
5633#endif 5597#endif
5634 PREFETCH (); 5598 PREFETCH ();
5635 c2 = TARGET_CHAR_AND_LENGTH (d, dend - d, dummy); 5599 GET_CHAR_AFTER (c2, d, dummy);
5636 s2 = SYNTAX (c2); 5600 s2 = SYNTAX (c2);
5637 5601
5638 /* Case 2: S2 is not Sword. */ 5602 /* Case 2: S2 is not Sword. */
@@ -5687,7 +5651,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
5687 if (!AT_STRINGS_END (d)) 5651 if (!AT_STRINGS_END (d))
5688 { 5652 {
5689 PREFETCH_NOLIMIT (); 5653 PREFETCH_NOLIMIT ();
5690 c2 = TARGET_CHAR_AND_LENGTH (d, dend - d, dummy); 5654 GET_CHAR_AFTER (c2, d, dummy);
5691#ifdef emacs 5655#ifdef emacs
5692 UPDATE_SYNTAX_TABLE_FORWARD (charpos); 5656 UPDATE_SYNTAX_TABLE_FORWARD (charpos);
5693#endif 5657#endif
@@ -5718,7 +5682,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
5718 int len; 5682 int len;
5719 re_wchar_t c; 5683 re_wchar_t c;
5720 5684
5721 c = TARGET_CHAR_AND_LENGTH (d, dend - d, len); 5685 GET_CHAR_AFTER (c, d, len);
5722 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not) 5686 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
5723 goto fail; 5687 goto fail;
5724 d += len; 5688 d += len;
@@ -5754,7 +5718,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
5754 int len; 5718 int len;
5755 re_wchar_t c; 5719 re_wchar_t c;
5756 5720
5757 c = TARGET_CHAR_AND_LENGTH (d, dend - d, len); 5721 GET_CHAR_AFTER (c, d, len);
5758 5722
5759 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not) 5723 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
5760 goto fail; 5724 goto fail;
@@ -5830,11 +5794,11 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
5830 bytes; nonzero otherwise. */ 5794 bytes; nonzero otherwise. */
5831 5795
5832static int 5796static int
5833bcmp_translate (s1, s2, len, translate, target_multibyte) 5797bcmp_translate (s1, s2, len, translate, multibyte)
5834 re_char *s1, *s2; 5798 re_char *s1, *s2;
5835 register int len; 5799 register int len;
5836 RE_TRANSLATE_TYPE translate; 5800 RE_TRANSLATE_TYPE translate;
5837 const int target_multibyte; 5801 const int multibyte;
5838{ 5802{
5839 register re_char *p1 = s1, *p2 = s2; 5803 register re_char *p1 = s1, *p2 = s2;
5840 re_char *p1_end = s1 + len; 5804 re_char *p1_end = s1 + len;
@@ -5847,8 +5811,8 @@ bcmp_translate (s1, s2, len, translate, target_multibyte)
5847 int p1_charlen, p2_charlen; 5811 int p1_charlen, p2_charlen;
5848 re_wchar_t p1_ch, p2_ch; 5812 re_wchar_t p1_ch, p2_ch;
5849 5813
5850 p1_ch = TARGET_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen); 5814 GET_CHAR_AFTER (p1_ch, p1, p1_charlen);
5851 p2_ch = TARGET_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen); 5815 GET_CHAR_AFTER (p2_ch, p2, p2_charlen);
5852 5816
5853 if (RE_TRANSLATE (translate, p1_ch) 5817 if (RE_TRANSLATE (translate, p1_ch)
5854 != RE_TRANSLATE (translate, p2_ch)) 5818 != RE_TRANSLATE (translate, p2_ch))