aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorKenichi Handa2002-09-03 04:09:40 +0000
committerKenichi Handa2002-09-03 04:09:40 +0000
commitbf2164799abc98b4aaafd9340ed77cdf2aa18370 (patch)
tree87ad1469a02332ac5c26d2592b4d8f72d8f73511 /src
parent66f089b2750903fdfbf01a33da0bbbef736e6464 (diff)
downloademacs-bf2164799abc98b4aaafd9340ed77cdf2aa18370.tar.gz
emacs-bf2164799abc98b4aaafd9340ed77cdf2aa18370.zip
* regex.c (RE_TARGET_MULTIBYTE_P): New macro.
(GET_CHAR_BEFORE_2): Check target_multibyte, not multibyte. If that is zero, convert an eight-bit char to multibyte. (MAKE_CHAR_MULTIBYTE, CHAR_LEADING_CODE): New dummy macros for the non-emacs case. (PATFETCH): Convert an eight-bit char to multibyte. (HANDLE_UNIBYTE_RANGE): New macro. (regex_compile): Set up the compiled pattern for multibyte chars even if the given regex string is unibyte. Use PATFETCH_RAW instead of PATFETCH in many places. To handle `charset' specification of unibyte, call HANDLE_UNIBYTE_RANGE. Use bitmap only for ASCII chars. (analyse_first) <exactn>: Simplified because the compiled pattern is multibyte. <charset_not>: Set up fastmap from bitmap only for ASCII chars. <charset>: Use CHAR_LEADING_CODE to get leading codes. <categoryspec>: If multibyte, set up fastmap only for ASCII chars here. (re_compile_fastmap) [emacs]: Call analyse_first with the arg multibyte always 1. (re_search_2): In emacs, set the local variable multibyte to 1, otherwise to 0. New local variable target_multibyte. Check it to decide the multibyteness of STR1 and STR2. If target_multibyte is zero, convert unibyte chars to multibyte before translating and checking fastmap. (TARGET_CHAR_AND_LENGTH): New macro. (re_match_2_internal): In emacs, set the local variable multibyte to 1, otherwise to 0. New local variable target_multibyte. Check it to decide the multibyteness of STR1 and STR2. Use TARGET_CHAR_AND_LENGTH to fetch a character from D. <charset, charset_not>: If multibyte is nonzero, check the bitmap only for ASCII chars. Call bcmp_translate with target_multibyte, not with multibyte. <begline>: Declare the local variable C as `unsigned'. (bcmp_translate): Change the last arg name to target_multibyte.
Diffstat (limited to 'src')
-rw-r--r--src/regex.c389
1 files changed, 270 insertions, 119 deletions
diff --git a/src/regex.c b/src/regex.c
index 9974b2d41ec..ee190497e5c 100644
--- a/src/regex.c
+++ b/src/regex.c
@@ -146,6 +146,7 @@
146# define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object))) 146# define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
147 147
148# define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte) 148# define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
149# define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
149# define RE_STRING_CHAR(p, s) \ 150# define RE_STRING_CHAR(p, s) \
150 (multibyte ? (STRING_CHAR (p, s)) : (*(p))) 151 (multibyte ? (STRING_CHAR (p, s)) : (*(p)))
151# define RE_STRING_CHAR_AND_LENGTH(p, s, len) \ 152# define RE_STRING_CHAR_AND_LENGTH(p, s, len) \
@@ -154,17 +155,21 @@
154/* Set C a (possibly multibyte) character before P. P points into a 155/* Set C a (possibly multibyte) character before P. P points into a
155 string which is the virtual concatenation of STR1 (which ends at 156 string which is the virtual concatenation of STR1 (which ends at
156 END1) or STR2 (which ends at END2). */ 157 END1) or STR2 (which ends at END2). */
157# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \ 158# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
158 do { \ 159 do { \
159 if (multibyte) \ 160 if (target_multibyte) \
160 { \ 161 { \
161 re_char *dtemp = (p) == (str2) ? (end1) : (p); \ 162 re_char *dtemp = (p) == (str2) ? (end1) : (p); \
162 re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \ 163 re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
163 while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp)); \ 164 while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp)); \
164 c = STRING_CHAR (dtemp, (p) - dtemp); \ 165 c = STRING_CHAR (dtemp, (p) - dtemp); \
165 } \ 166 } \
166 else \ 167 else \
167 (c = ((p) == (str2) ? (end1) : (p))[-1]); \ 168 { \
169 (c = ((p) == (str2) ? (end1) : (p))[-1]); \
170 if (multibyte) \
171 MAKE_CHAR_MULTIBYTE (c); \
172 } \
168 } while (0) 173 } while (0)
169 174
170 175
@@ -233,6 +238,7 @@ enum syntaxcode { Swhitespace = 0, Sword = 1 };
233# define CHARSET_LEADING_CODE_BASE(c) 0 238# define CHARSET_LEADING_CODE_BASE(c) 0
234# define MAX_MULTIBYTE_LENGTH 1 239# define MAX_MULTIBYTE_LENGTH 1
235# define RE_MULTIBYTE_P(x) 0 240# define RE_MULTIBYTE_P(x) 0
241# define RE_TARGET_MULTIBYTE_P(x) 0
236# define WORD_BOUNDARY_P(c1, c2) (0) 242# define WORD_BOUNDARY_P(c1, c2) (0)
237# define CHAR_HEAD_P(p) (1) 243# define CHAR_HEAD_P(p) (1)
238# define SINGLE_BYTE_CHAR_P(c) (1) 244# define SINGLE_BYTE_CHAR_P(c) (1)
@@ -248,6 +254,8 @@ enum syntaxcode { Swhitespace = 0, Sword = 1 };
248# define MAKE_CHAR(charset, c1, c2) (c1) 254# define MAKE_CHAR(charset, c1, c2) (c1)
249# define BYTE8_TO_CHAR(c) (c) 255# define BYTE8_TO_CHAR(c) (c)
250# define CHAR_BYTE8_P(c) (0) 256# define CHAR_BYTE8_P(c) (0)
257# define MAKE_CHAR_MULTIBYTE(c) 0
258# define CHAR_LEADING_CODE(c) (c)
251#endif /* not emacs */ 259#endif /* not emacs */
252 260
253#ifndef RE_TRANSLATE 261#ifndef RE_TRANSLATE
@@ -1665,6 +1673,8 @@ static int analyse_first _RE_ARGS ((re_char *p, re_char *pend,
1665#define PATFETCH(c) \ 1673#define PATFETCH(c) \
1666 do { \ 1674 do { \
1667 PATFETCH_RAW (c); \ 1675 PATFETCH_RAW (c); \
1676 if (! multibyte) \
1677 MAKE_CHAR_MULTIBYTE (c); \
1668 c = TRANSLATE (c); \ 1678 c = TRANSLATE (c); \
1669 } while (0) 1679 } while (0)
1670 1680
@@ -1917,6 +1927,54 @@ struct range_table_work_area
1917#define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH)) 1927#define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
1918 1928
1919 1929
1930#ifdef emacs
1931
1932/* It is better to implement this jumbo macro by a function, but it's
1933 not that easy because macros called within it assumes various
1934 variables being defined. */
1935
1936#define HANDLE_UNIBYTE_RANGE(work_area, c1, c2) \
1937 do { \
1938 int char_table[257]; \
1939 int i, j, c; \
1940 \
1941 char_table[(c1) - 1] = -2; /* head sentinel */ \
1942 for (i = (c1); i <= (c2); i++) \
1943 char_table[i] = TRANSLATE (unibyte_char_to_multibyte (i)); \
1944 char_table[i] = MAX_CHAR + 2; /* tail sentinel */ \
1945 \
1946 /* As the number of data is small (at most 128) and we can expect \
1947 that data in char_table are mostly sorted, we use fairly simple \
1948 `insertion sort'. */ \
1949 for (i = (c1) + 1; i <= (c2); i++) \
1950 { \
1951 c = char_table[i]; \
1952 j = i; \
1953 while (char_table[j - 1] > c) \
1954 char_table[j] = char_table[j - 1], j--; \
1955 char_table[j] = c; \
1956 } \
1957 \
1958 for (i = (c1); i <= (c2); i++) \
1959 { \
1960 c = char_table[i]; \
1961 if (! IS_REAL_ASCII (c)) \
1962 break; \
1963 SET_LIST_BIT (c); \
1964 } \
1965 while (i <= (c2)) \
1966 { \
1967 c = char_table[i]; \
1968 for (j = i + 1; j <= (c2); j++) \
1969 if (char_table[j] - c != j - i) \
1970 break; \
1971 SET_RANGE_TABLE_WORK_AREA ((work_area), c, char_table[j - 1]); \
1972 i = j; \
1973 } \
1974 } while (0)
1975
1976#endif /* emacs */
1977
1920/* Get the next unsigned number in the uncompiled pattern. */ 1978/* Get the next unsigned number in the uncompiled pattern. */
1921#define GET_UNSIGNED_NUMBER(num) \ 1979#define GET_UNSIGNED_NUMBER(num) \
1922 do { if (p != pend) \ 1980 do { if (p != pend) \
@@ -2264,7 +2322,7 @@ regex_compile (pattern, size, syntax, bufp)
2264 /* Loop through the uncompiled pattern until we're at the end. */ 2322 /* Loop through the uncompiled pattern until we're at the end. */
2265 while (p != pend) 2323 while (p != pend)
2266 { 2324 {
2267 PATFETCH (c); 2325 PATFETCH_RAW (c);
2268 2326
2269 switch (c) 2327 switch (c)
2270 { 2328 {
@@ -2346,15 +2404,15 @@ regex_compile (pattern, size, syntax, bufp)
2346 if (p+1 == pend) 2404 if (p+1 == pend)
2347 FREE_STACK_RETURN (REG_EESCAPE); 2405 FREE_STACK_RETURN (REG_EESCAPE);
2348 if (p[1] == '+' || p[1] == '?') 2406 if (p[1] == '+' || p[1] == '?')
2349 PATFETCH (c); /* Gobble up the backslash. */ 2407 PATFETCH_RAW (c); /* Gobble up the backslash. */
2350 else 2408 else
2351 break; 2409 break;
2352 } 2410 }
2353 else 2411 else
2354 break; 2412 break;
2355 /* If we get here, we found another repeat character. */ 2413 /* If we get here, we found another repeat character. */
2356 PATFETCH (c); 2414 PATFETCH_RAW (c);
2357 } 2415 }
2358 2416
2359 /* Star, etc. applied to an empty pattern is equivalent 2417 /* Star, etc. applied to an empty pattern is equivalent
2360 to an empty pattern. */ 2418 to an empty pattern. */
@@ -2495,14 +2553,14 @@ regex_compile (pattern, size, syntax, bufp)
2495 2553
2496 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 2554 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2497 2555
2498 PATFETCH (c); 2556 PATFETCH_RAW (c);
2499 2557
2500 /* \ might escape characters inside [...] and [^...]. */ 2558 /* \ might escape characters inside [...] and [^...]. */
2501 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') 2559 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2502 { 2560 {
2503 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); 2561 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2504 2562
2505 PATFETCH (c); 2563 PATFETCH_RAW (c);
2506 escaped_char = true; 2564 escaped_char = true;
2507 } 2565 }
2508 else 2566 else
@@ -2528,7 +2586,7 @@ regex_compile (pattern, size, syntax, bufp)
2528 unsigned char str[CHAR_CLASS_MAX_LENGTH + 1]; 2586 unsigned char str[CHAR_CLASS_MAX_LENGTH + 1];
2529 const unsigned char *class_beg; 2587 const unsigned char *class_beg;
2530 2588
2531 PATFETCH (c); 2589 PATFETCH_RAW (c);
2532 c1 = 0; 2590 c1 = 0;
2533 class_beg = p; 2591 class_beg = p;
2534 2592
@@ -2537,7 +2595,7 @@ regex_compile (pattern, size, syntax, bufp)
2537 2595
2538 for (;;) 2596 for (;;)
2539 { 2597 {
2540 PATFETCH (c); 2598 PATFETCH_RAW (c);
2541 if ((c == ':' && *p == ']') || p == pend) 2599 if ((c == ':' && *p == ']') || p == pend)
2542 break; 2600 break;
2543 if (c1 < CHAR_CLASS_MAX_LENGTH) 2601 if (c1 < CHAR_CLASS_MAX_LENGTH)
@@ -2564,7 +2622,7 @@ regex_compile (pattern, size, syntax, bufp)
2564 2622
2565 /* Throw away the ] at the end of the character 2623 /* Throw away the ] at the end of the character
2566 class. */ 2624 class. */
2567 PATFETCH (c); 2625 PATFETCH_RAW (c);
2568 2626
2569 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 2627 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2570 2628
@@ -2573,17 +2631,20 @@ regex_compile (pattern, size, syntax, bufp)
2573 is_digit, is_cntrl, and is_xdigit, since 2631 is_digit, is_cntrl, and is_xdigit, since
2574 they can only match ASCII characters. We 2632 they can only match ASCII characters. We
2575 don't need to handle them for multibyte. 2633 don't need to handle them for multibyte.
2576 They are distinguished by a negative wctype. */ 2634 They are distinguished by a negative wctype.
2577 2635 We need this only for Emacs. */
2578 if (multibyte) 2636#ifdef emacs
2579 SET_RANGE_TABLE_WORK_AREA_BIT (range_table_work, 2637 SET_RANGE_TABLE_WORK_AREA_BIT (range_table_work,
2580 re_wctype_to_bit (cc)); 2638 re_wctype_to_bit (cc));
2639#endif
2581 2640
2582 for (ch = 0; ch < 1 << BYTEWIDTH; ++ch) 2641 for (ch = 0; ch < 1 << BYTEWIDTH; ++ch)
2583 { 2642 {
2584 int translated = TRANSLATE (ch); 2643 MAKE_CHAR_MULTIBYTE (ch);
2585 if (re_iswctype (btowc (ch), cc)) 2644 ch = TRANSLATE (ch);
2586 SET_LIST_BIT (translated); 2645 if (IS_REAL_ASCII (ch)
2646 & re_iswctype (btowc (ch), cc))
2647 SET_LIST_BIT (ch);
2587 } 2648 }
2588 2649
2589 /* Repeat the loop. */ 2650 /* Repeat the loop. */
@@ -2606,35 +2667,51 @@ regex_compile (pattern, size, syntax, bufp)
2606 { 2667 {
2607 2668
2608 /* Discard the `-'. */ 2669 /* Discard the `-'. */
2609 PATFETCH (c1); 2670 PATFETCH_RAW (c1);
2610 2671
2611 /* Fetch the character which ends the range. */ 2672 /* Fetch the character which ends the range. */
2612 PATFETCH (c1); 2673 PATFETCH_RAW (c1);
2613 2674#ifdef emacs
2614 if (SINGLE_BYTE_CHAR_P (c) 2675 if (multibyte)
2615 && ! SINGLE_BYTE_CHAR_P (c1)) 2676 {
2677 c = TRANSLATE (c);
2678 c1 = TRANSLATE (c1);
2679 if (! IS_REAL_ASCII (c1))
2680 {
2681 SET_RANGE_TABLE_WORK_AREA (range_table_work,
2682 c, c1);
2683 c1 = 127;
2684 }
2685 }
2686 else
2616 { 2687 {
2617 /* Handle a range starting with a character 2688 if (! IS_REAL_ASCII (c1))
2618 fitting in a bitmap to a character not 2689 {
2619 fitting in a bitmap (thus require range 2690 int c2 = MAX (c, 128);
2620 table). We use both a bitmap (for the 2691
2621 range from C to 255) and a range table (for 2692 HANDLE_UNIBYTE_RANGE (range_table_work, c2, c1);
2622 the remaining range). Here, we setup only 2693 c1 = 127;
2623 a range table. A bitmap is setup later. */ 2694 }
2624 re_wchar_t c2
2625 = CHAR_BYTE8_P (c1) ? BYTE8_TO_CHAR (0x80) : 256;
2626
2627 SET_RANGE_TABLE_WORK_AREA (range_table_work, c2, c1);
2628 c1 = 255;
2629 } 2695 }
2696#endif
2630 } 2697 }
2631 else 2698 else
2632 /* Range from C to C. */ 2699 {
2633 c1 = c; 2700 /* Range from C to C. */
2701 if (! multibyte)
2702 MAKE_CHAR_MULTIBYTE (c);
2703 c = TRANSLATE (c);
2704 if (IS_REAL_ASCII (c))
2705 c1 = c;
2706 else
2707 {
2708 SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c);
2709 c = -1; /* Suppress setting bitmap. */
2710 }
2711 }
2634 2712
2635 /* Set the range ... */ 2713 /* Set the range into bitmap */
2636 if (SINGLE_BYTE_CHAR_P (c)) 2714 if (c >= 0)
2637 /* ... into bitmap. */
2638 { 2715 {
2639 re_wchar_t this_char; 2716 re_wchar_t this_char;
2640 int range_start = c, range_end = c1; 2717 int range_start = c, range_end = c1;
@@ -2653,9 +2730,6 @@ regex_compile (pattern, size, syntax, bufp)
2653 SET_LIST_BIT (TRANSLATE (this_char)); 2730 SET_LIST_BIT (TRANSLATE (this_char));
2654 } 2731 }
2655 } 2732 }
2656 else
2657 /* ... into range table. */
2658 SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1);
2659 } 2733 }
2660 2734
2661 /* Discard any (non)matching list bytes that are all 0 at the 2735 /* Discard any (non)matching list bytes that are all 0 at the
@@ -2750,7 +2824,7 @@ regex_compile (pattern, size, syntax, bufp)
2750 /* Look for a special (?...) construct */ 2824 /* Look for a special (?...) construct */
2751 if ((syntax & RE_SHY_GROUPS) && *p == '?') 2825 if ((syntax & RE_SHY_GROUPS) && *p == '?')
2752 { 2826 {
2753 PATFETCH (c); /* Gobble up the '?'. */ 2827 PATFETCH_RAW (c); /* Gobble up the '?'. */
2754 PATFETCH (c); 2828 PATFETCH (c);
2755 switch (c) 2829 switch (c)
2756 { 2830 {
@@ -3230,10 +3304,10 @@ regex_compile (pattern, size, syntax, bufp)
3230 { 3304 {
3231 int len; 3305 int len;
3232 3306
3233 if (multibyte) 3307 if (! multibyte)
3234 len = CHAR_STRING (c, b); 3308 MAKE_CHAR_MULTIBYTE (c);
3235 else 3309 c = TRANSLATE (c);
3236 *b = c, len = 1; 3310 len = CHAR_STRING (c, b);
3237 b += len; 3311 b += len;
3238 (*pending_exact) += len; 3312 (*pending_exact) += len;
3239 } 3313 }
@@ -3439,6 +3513,8 @@ group_in_compile_stack (compile_stack, regnum)
3439 bother filling it up (obviously) and only return whether the 3513 bother filling it up (obviously) and only return whether the
3440 pattern could potentially match the empty string. 3514 pattern could potentially match the empty string.
3441 3515
3516 MULTIBYTE is always 1 for Emacs, and 0 otherwise.
3517
3442 Return 1 if p..pend might match the empty string. 3518 Return 1 if p..pend might match the empty string.
3443 Return 0 if p..pend matches at least one char. 3519 Return 0 if p..pend matches at least one char.
3444 Return -1 if fastmap was not updated accurately. */ 3520 Return -1 if fastmap was not updated accurately. */
@@ -3505,14 +3581,11 @@ analyse_first (p, pend, fastmap, multibyte)
3505 3581
3506 case exactn: 3582 case exactn:
3507 if (fastmap) 3583 if (fastmap)
3508 { 3584 /* If multibyte is nonzero, the first byte of each
3509 int c = RE_STRING_CHAR (p + 1, pend - p); 3585 character is an ASCII or a leading code. Otherwise,
3510 3586 each byte is a character. Thus, this works in both
3511 if (SINGLE_BYTE_CHAR_P (c)) 3587 cases. */
3512 fastmap[c] = 1; 3588 fastmap[p[1]] = 1;
3513 else
3514 fastmap[p[1]] = 1;
3515 }
3516 break; 3589 break;
3517 3590
3518 3591
@@ -3524,14 +3597,17 @@ analyse_first (p, pend, fastmap, multibyte)
3524 3597
3525 3598
3526 case charset_not: 3599 case charset_not:
3527 /* Chars beyond end of bitmap are possible matches.
3528 All the single-byte codes can occur in multibyte buffers.
3529 So any that are not listed in the charset
3530 are possible matches, even in multibyte buffers. */
3531 if (!fastmap) break; 3600 if (!fastmap) break;
3532 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH; 3601 {
3533 j < (1 << BYTEWIDTH); j++) 3602 /* Chars beyond end of bitmap are possible matches. */
3534 fastmap[j] = 1; 3603 /* Emacs uses the bitmap only for ASCII characters. */
3604 int limit = multibyte ? 128 : (1 << BYTEWIDTH);
3605
3606 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
3607 j < limit; j++)
3608 fastmap[j] = 1;
3609 }
3610
3535 /* Fallthrough */ 3611 /* Fallthrough */
3536 case charset: 3612 case charset:
3537 if (!fastmap) break; 3613 if (!fastmap) break;
@@ -3542,7 +3618,7 @@ analyse_first (p, pend, fastmap, multibyte)
3542 fastmap[j] = 1; 3618 fastmap[j] = 1;
3543 3619
3544 if ((not && multibyte) 3620 if ((not && multibyte)
3545 /* Any character set can possibly contain a character 3621 /* Any leading code can possibly start a character
3546 which doesn't match the specified set of characters. */ 3622 which doesn't match the specified set of characters. */
3547 || (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2]) 3623 || (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3548 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0)) 3624 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
@@ -3562,11 +3638,10 @@ analyse_first (p, pend, fastmap, multibyte)
3562 else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2]) 3638 else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3563 && match_any_multibyte_characters == false) 3639 && match_any_multibyte_characters == false)
3564 { 3640 {
3565 /* Set fastmap[I] to 1 where I is a base leading code of each 3641 /* Set fastmap[I] to 1 where I is a leading code of each
3566 multibyte characer in the range table. */ 3642 multibyte characer in the range table. */
3567 int c, count; 3643 int c, count;
3568 unsigned char buf1[MAX_MULTIBYTE_LENGTH]; 3644 unsigned char lc1, lc2;
3569 unsigned char buf2[MAX_MULTIBYTE_LENGTH];
3570 3645
3571 /* Make P points the range table. `+ 2' is to skip flag 3646 /* Make P points the range table. `+ 2' is to skip flag
3572 bits for a character class. */ 3647 bits for a character class. */
@@ -3578,11 +3653,11 @@ analyse_first (p, pend, fastmap, multibyte)
3578 { 3653 {
3579 /* Extract the start and end of each range. */ 3654 /* Extract the start and end of each range. */
3580 EXTRACT_CHARACTER (c, p); 3655 EXTRACT_CHARACTER (c, p);
3581 CHAR_STRING (c, buf1); 3656 lc1 = CHAR_LEADING_CODE (c);
3582 p += 3; 3657 p += 3;
3583 EXTRACT_CHARACTER (c, p); 3658 EXTRACT_CHARACTER (c, p);
3584 CHAR_STRING (c, buf2); 3659 lc2 = CHAR_LEADING_CODE (c);
3585 for (j = buf1[0]; j <= buf2[0]; j++) 3660 for (j = lc1; j <= lc2; j++)
3586 fastmap[j] = 1; 3661 fastmap[j] = 1;
3587 } 3662 }
3588 } 3663 }
@@ -3608,7 +3683,7 @@ analyse_first (p, pend, fastmap, multibyte)
3608 if (!fastmap) break; 3683 if (!fastmap) break;
3609 not = (re_opcode_t)p[-1] == notcategoryspec; 3684 not = (re_opcode_t)p[-1] == notcategoryspec;
3610 k = *p++; 3685 k = *p++;
3611 for (j = 0; j < (1 << BYTEWIDTH); j++) 3686 for (j = (multibyte ? 127 : (1 << BYTEWIDTH)); j >= 0; j--)
3612 if ((CHAR_HAS_CATEGORY (j, k)) ^ not) 3687 if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
3613 fastmap[j] = 1; 3688 fastmap[j] = 1;
3614 3689
@@ -3754,7 +3829,15 @@ re_compile_fastmap (bufp)
3754 bufp->fastmap_accurate = 1; /* It will be when we're done. */ 3829 bufp->fastmap_accurate = 1; /* It will be when we're done. */
3755 3830
3756 analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used, 3831 analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used,
3757 fastmap, RE_MULTIBYTE_P (bufp)); 3832 fastmap,
3833#ifdef emacs
3834 /* The compiled pattern buffer is always
3835 setup for multibyte characters. */
3836 1
3837#else
3838 0
3839#endif
3840 );
3758 bufp->can_be_null = (analysis != 0); 3841 bufp->can_be_null = (analysis != 0);
3759 return 0; 3842 return 0;
3760} /* re_compile_fastmap */ 3843} /* re_compile_fastmap */
@@ -3860,8 +3943,14 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
3860 int endpos = startpos + range; 3943 int endpos = startpos + range;
3861 boolean anchored_start; 3944 boolean anchored_start;
3862 3945
3863 /* Nonzero if we have to concern multibyte character. */ 3946 /* Nonzero if BUFP is setup for multibyte characters. */
3864 const boolean multibyte = RE_MULTIBYTE_P (bufp); 3947#ifdef emacs
3948 const boolean multibyte = 1;
3949#else
3950 const boolean multibyte = 0;
3951#endif
3952 /* Nonzero if STR1 and STR2 contains multibyte characters. */
3953 const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
3865 3954
3866 /* Check for out-of-range STARTPOS. */ 3955 /* Check for out-of-range STARTPOS. */
3867 if (startpos < 0 || startpos > total_size) 3956 if (startpos < 0 || startpos > total_size)
@@ -3950,7 +4039,7 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
3950 inside the loop. */ 4039 inside the loop. */
3951 if (RE_TRANSLATE_P (translate)) 4040 if (RE_TRANSLATE_P (translate))
3952 { 4041 {
3953 if (multibyte) 4042 if (target_multibyte)
3954 while (range > lim) 4043 while (range > lim)
3955 { 4044 {
3956 int buf_charlen; 4045 int buf_charlen;
@@ -3959,13 +4048,24 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
3959 buf_charlen); 4048 buf_charlen);
3960 4049
3961 buf_ch = RE_TRANSLATE (translate, buf_ch); 4050 buf_ch = RE_TRANSLATE (translate, buf_ch);
3962 if (buf_ch >= 0400 4051 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
3963 || fastmap[buf_ch])
3964 break; 4052 break;
3965 4053
3966 range -= buf_charlen; 4054 range -= buf_charlen;
3967 d += buf_charlen; 4055 d += buf_charlen;
3968 } 4056 }
4057 else if (multibyte)
4058 while (range > lim)
4059 {
4060 buf_ch = *d;
4061 MAKE_CHAR_MULTIBYTE (buf_ch);
4062 buf_ch = RE_TRANSLATE (translate, buf_ch);
4063 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
4064 break;
4065
4066 d++;
4067 range--;
4068 }
3969 else 4069 else
3970 while (range > lim 4070 while (range > lim
3971 && !fastmap[RE_TRANSLATE (translate, *d)]) 4071 && !fastmap[RE_TRANSLATE (translate, *d)])
@@ -3974,6 +4074,16 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
3974 range--; 4074 range--;
3975 } 4075 }
3976 } 4076 }
4077 else if (multibyte && ! target_multibyte)
4078 {
4079 buf_ch = *d;
4080 MAKE_CHAR_MULTIBYTE (buf_ch);
4081 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
4082 break;
4083
4084 d++;
4085 range--;
4086 }
3977 else 4087 else
3978 while (range > lim && !fastmap[*d]) 4088 while (range > lim && !fastmap[*d])
3979 { 4089 {
@@ -3989,10 +4099,11 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
3989 ? size2 + size1 - startpos 4099 ? size2 + size1 - startpos
3990 : size1 - startpos); 4100 : size1 - startpos);
3991 buf_ch = RE_STRING_CHAR (d, room); 4101 buf_ch = RE_STRING_CHAR (d, room);
4102 if (! target_multibyte)
4103 MAKE_CHAR_MULTIBYTE (buf_ch);
3992 buf_ch = TRANSLATE (buf_ch); 4104 buf_ch = TRANSLATE (buf_ch);
3993 4105
3994 if (! (buf_ch >= 0400 4106 if (! fastmap[CHAR_LEADING_CODE (buf_ch)])
3995 || fastmap[buf_ch]))
3996 goto advance; 4107 goto advance;
3997 } 4108 }
3998 } 4109 }
@@ -4022,7 +4133,7 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
4022 else if (range > 0) 4133 else if (range > 0)
4023 { 4134 {
4024 /* Update STARTPOS to the next character boundary. */ 4135 /* Update STARTPOS to the next character boundary. */
4025 if (multibyte) 4136 if (target_multibyte)
4026 { 4137 {
4027 re_char *p = POS_ADDR_VSTRING (startpos); 4138 re_char *p = POS_ADDR_VSTRING (startpos);
4028 re_char *pend = STOP_ADDR_VSTRING (startpos); 4139 re_char *pend = STOP_ADDR_VSTRING (startpos);
@@ -4045,7 +4156,7 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
4045 startpos--; 4156 startpos--;
4046 4157
4047 /* Update STARTPOS to the previous character boundary. */ 4158 /* Update STARTPOS to the previous character boundary. */
4048 if (multibyte) 4159 if (target_multibyte)
4049 { 4160 {
4050 re_char *p = POS_ADDR_VSTRING (startpos); 4161 re_char *p = POS_ADDR_VSTRING (startpos);
4051 int len = 0; 4162 int len = 0;
@@ -4502,6 +4613,17 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
4502} 4613}
4503WEAK_ALIAS (__re_match_2, re_match_2) 4614WEAK_ALIAS (__re_match_2, re_match_2)
4504 4615
4616#ifdef emacs
4617#define TARGET_CHAR_AND_LENGTH(d, len, actual_len) \
4618 (target_multibyte \
4619 ? STRING_CHAR_AND_LENGTH (d, len, actual_len) \
4620 : (actual_len = 1, unibyte_char_to_multibyte (*d)))
4621#else
4622#define TARGET_CHAR_AND_LENGTH(d, len, actual_len) \
4623 (actual_len = 1, *d)
4624#endif
4625
4626
4505/* This is a separate function so that we can force an alloca cleanup 4627/* This is a separate function so that we can force an alloca cleanup
4506 afterwards. */ 4628 afterwards. */
4507static int 4629static int
@@ -4541,8 +4663,14 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
4541 /* We use this to map every character in the string. */ 4663 /* We use this to map every character in the string. */
4542 RE_TRANSLATE_TYPE translate = bufp->translate; 4664 RE_TRANSLATE_TYPE translate = bufp->translate;
4543 4665
4544 /* Nonzero if we have to concern multibyte character. */ 4666 /* Nonzero if BUFP is setup for multibyte characters. */
4545 const boolean multibyte = RE_MULTIBYTE_P (bufp); 4667#ifdef emacs
4668 const boolean multibyte = 1;
4669#else
4670 const boolean multibyte = 0;
4671#endif
4672 /* Nonzero if STR1 and STR2 contains multibyte characters. */
4673 const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
4546 4674
4547 /* Failure point stack. Each place that can handle a failure further 4675 /* Failure point stack. Each place that can handle a failure further
4548 down the line pushes a failure point on this stack. It consists of 4676 down the line pushes a failure point on this stack. It consists of
@@ -4907,7 +5035,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
4907 5035
4908 PREFETCH (); 5036 PREFETCH ();
4909 pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen); 5037 pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
4910 buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen); 5038 buf_ch = TARGET_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
4911 5039
4912 if (RE_TRANSLATE (translate, buf_ch) 5040 if (RE_TRANSLATE (translate, buf_ch)
4913 != pat_ch) 5041 != pat_ch)
@@ -4936,16 +5064,37 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
4936 } 5064 }
4937 else 5065 else
4938 { 5066 {
4939 do 5067 if (multibyte == target_multibyte)
4940 { 5068 do
4941 PREFETCH (); 5069 {
4942 if (*d++ != *p++) 5070 PREFETCH ();
4943 { 5071 if (*d++ != *p++)
4944 d = dfail; 5072 {
4945 goto fail; 5073 d = dfail;
4946 } 5074 goto fail;
4947 } 5075 }
4948 while (--mcnt); 5076 }
5077 while (--mcnt);
5078 else /* i.e. multibyte && ! target_multibyte */
5079 do
5080 {
5081 int pat_charlen, buf_charlen;
5082 unsigned int pat_ch, buf_ch;
5083
5084 PREFETCH ();
5085 pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
5086 buf_ch = TARGET_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
5087
5088 if (pat_ch != buf_ch)
5089 {
5090 d = dfail;
5091 goto fail;
5092 }
5093 p += pat_charlen;
5094 d += buf_charlen;
5095 mcnt -= pat_charlen;
5096 }
5097 while (mcnt > 0);
4949 } 5098 }
4950 break; 5099 break;
4951 5100
@@ -4959,7 +5108,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
4959 DEBUG_PRINT1 ("EXECUTING anychar.\n"); 5108 DEBUG_PRINT1 ("EXECUTING anychar.\n");
4960 5109
4961 PREFETCH (); 5110 PREFETCH ();
4962 buf_ch = RE_STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen); 5111 buf_ch = TARGET_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
4963 buf_ch = TRANSLATE (buf_ch); 5112 buf_ch = TRANSLATE (buf_ch);
4964 5113
4965 if ((!(bufp->syntax & RE_DOT_NEWLINE) 5114 if ((!(bufp->syntax & RE_DOT_NEWLINE)
@@ -5003,10 +5152,10 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
5003 } 5152 }
5004 5153
5005 PREFETCH (); 5154 PREFETCH ();
5006 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len); 5155 c = TARGET_CHAR_AND_LENGTH (d, dend - d, len);
5007 c = TRANSLATE (c); /* The character to match. */ 5156 c = TRANSLATE (c); /* The character to match. */
5008 5157
5009 if (SINGLE_BYTE_CHAR_P (c)) 5158 if (! multibyte || IS_REAL_ASCII (c))
5010 { /* Lookup bitmap. */ 5159 { /* Lookup bitmap. */
5011 /* Cast to `unsigned' instead of `unsigned char' in 5160 /* Cast to `unsigned' instead of `unsigned char' in
5012 case the bit list is a full 32 bytes long. */ 5161 case the bit list is a full 32 bytes long. */
@@ -5146,7 +5295,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
5146 /* Compare that many; failure if mismatch, else move 5295 /* Compare that many; failure if mismatch, else move
5147 past them. */ 5296 past them. */
5148 if (RE_TRANSLATE_P (translate) 5297 if (RE_TRANSLATE_P (translate)
5149 ? bcmp_translate (d, d2, mcnt, translate, multibyte) 5298 ? bcmp_translate (d, d2, mcnt, translate, target_multibyte)
5150 : memcmp (d, d2, mcnt)) 5299 : memcmp (d, d2, mcnt))
5151 { 5300 {
5152 d = dfail; 5301 d = dfail;
@@ -5169,7 +5318,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
5169 } 5318 }
5170 else 5319 else
5171 { 5320 {
5172 unsigned char c; 5321 unsigned c;
5173 GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2); 5322 GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
5174 if (c == '\n') 5323 if (c == '\n')
5175 break; 5324 break;
@@ -5421,6 +5570,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
5421 is the character at D, and S2 is the syntax of C2. */ 5570 is the character at D, and S2 is the syntax of C2. */
5422 re_wchar_t c1, c2; 5571 re_wchar_t c1, c2;
5423 int s1, s2; 5572 int s1, s2;
5573 int dummy;
5424#ifdef emacs 5574#ifdef emacs
5425 int offset = PTR_TO_OFFSET (d - 1); 5575 int offset = PTR_TO_OFFSET (d - 1);
5426 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset); 5576 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
@@ -5432,7 +5582,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
5432 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1); 5582 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
5433#endif 5583#endif
5434 PREFETCH_NOLIMIT (); 5584 PREFETCH_NOLIMIT ();
5435 c2 = RE_STRING_CHAR (d, dend - d); 5585 c2 = TARGET_CHAR_AND_LENGTH (d, dend - d, dummy);
5436 s2 = SYNTAX (c2); 5586 s2 = SYNTAX (c2);
5437 5587
5438 if (/* Case 2: Only one of S1 and S2 is Sword. */ 5588 if (/* Case 2: Only one of S1 and S2 is Sword. */
@@ -5461,13 +5611,14 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
5461 is the character at D, and S2 is the syntax of C2. */ 5611 is the character at D, and S2 is the syntax of C2. */
5462 re_wchar_t c1, c2; 5612 re_wchar_t c1, c2;
5463 int s1, s2; 5613 int s1, s2;
5614 int dummy;
5464#ifdef emacs 5615#ifdef emacs
5465 int offset = PTR_TO_OFFSET (d); 5616 int offset = PTR_TO_OFFSET (d);
5466 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset); 5617 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5467 UPDATE_SYNTAX_TABLE (charpos); 5618 UPDATE_SYNTAX_TABLE (charpos);
5468#endif 5619#endif
5469 PREFETCH (); 5620 PREFETCH ();
5470 c2 = RE_STRING_CHAR (d, dend - d); 5621 c2 = TARGET_CHAR_AND_LENGTH (d, dend - d, dummy);
5471 s2 = SYNTAX (c2); 5622 s2 = SYNTAX (c2);
5472 5623
5473 /* Case 2: S2 is not Sword. */ 5624 /* Case 2: S2 is not Sword. */
@@ -5505,6 +5656,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
5505 is the character at D, and S2 is the syntax of C2. */ 5656 is the character at D, and S2 is the syntax of C2. */
5506 re_wchar_t c1, c2; 5657 re_wchar_t c1, c2;
5507 int s1, s2; 5658 int s1, s2;
5659 int dummy;
5508#ifdef emacs 5660#ifdef emacs
5509 int offset = PTR_TO_OFFSET (d) - 1; 5661 int offset = PTR_TO_OFFSET (d) - 1;
5510 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset); 5662 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
@@ -5521,7 +5673,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
5521 if (!AT_STRINGS_END (d)) 5673 if (!AT_STRINGS_END (d))
5522 { 5674 {
5523 PREFETCH_NOLIMIT (); 5675 PREFETCH_NOLIMIT ();
5524 c2 = RE_STRING_CHAR (d, dend - d); 5676 c2 = TARGET_CHAR_AND_LENGTH (d, dend - d, dummy);
5525#ifdef emacs 5677#ifdef emacs
5526 UPDATE_SYNTAX_TABLE_FORWARD (charpos); 5678 UPDATE_SYNTAX_TABLE_FORWARD (charpos);
5527#endif 5679#endif
@@ -5552,8 +5704,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
5552 int len; 5704 int len;
5553 re_wchar_t c; 5705 re_wchar_t c;
5554 5706
5555 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len); 5707 c = TARGET_CHAR_AND_LENGTH (d, dend - d, len);
5556
5557 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not) 5708 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
5558 goto fail; 5709 goto fail;
5559 d += len; 5710 d += len;
@@ -5589,7 +5740,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
5589 int len; 5740 int len;
5590 re_wchar_t c; 5741 re_wchar_t c;
5591 5742
5592 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len); 5743 c = TARGET_CHAR_AND_LENGTH (d, dend - d, len);
5593 5744
5594 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not) 5745 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
5595 goto fail; 5746 goto fail;
@@ -5665,11 +5816,11 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
5665 bytes; nonzero otherwise. */ 5816 bytes; nonzero otherwise. */
5666 5817
5667static int 5818static int
5668bcmp_translate (s1, s2, len, translate, multibyte) 5819bcmp_translate (s1, s2, len, translate, target_multibyte)
5669 re_char *s1, *s2; 5820 re_char *s1, *s2;
5670 register int len; 5821 register int len;
5671 RE_TRANSLATE_TYPE translate; 5822 RE_TRANSLATE_TYPE translate;
5672 const int multibyte; 5823 const int target_multibyte;
5673{ 5824{
5674 register re_char *p1 = s1, *p2 = s2; 5825 register re_char *p1 = s1, *p2 = s2;
5675 re_char *p1_end = s1 + len; 5826 re_char *p1_end = s1 + len;
@@ -5682,8 +5833,8 @@ bcmp_translate (s1, s2, len, translate, multibyte)
5682 int p1_charlen, p2_charlen; 5833 int p1_charlen, p2_charlen;
5683 re_wchar_t p1_ch, p2_ch; 5834 re_wchar_t p1_ch, p2_ch;
5684 5835
5685 p1_ch = RE_STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen); 5836 p1_ch = TARGET_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen);
5686 p2_ch = RE_STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen); 5837 p2_ch = TARGET_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen);
5687 5838
5688 if (RE_TRANSLATE (translate, p1_ch) 5839 if (RE_TRANSLATE (translate, p1_ch)
5689 != RE_TRANSLATE (translate, p2_ch)) 5840 != RE_TRANSLATE (translate, p2_ch))