diff options
| author | Richard M. Stallman | 1998-04-03 07:33:13 +0000 |
|---|---|---|
| committer | Richard M. Stallman | 1998-04-03 07:33:13 +0000 |
| commit | e934739e6253aa592cf74f705efc70df9a66cf6d (patch) | |
| tree | d6fbafc52f65ca412991c1ec9df447212ca59043 | |
| parent | c1a0049caa974fbe33698052ecb5e8a0ee8d469f (diff) | |
| download | emacs-e934739e6253aa592cf74f705efc70df9a66cf6d.tar.gz emacs-e934739e6253aa592cf74f705efc70df9a66cf6d.zip | |
(compile_range): Unused function deleted.
(regex_compile): Special handling for range \177-\377.
(regex_compile): Cast args to TRANSLATE to unsigned char.
(re_search_2): Fix forward scan handling multibyte.
Recognize that nonascii characters are not in the fastmap.
Handle fetching multibyte characters for backward scan.
(re_match_2_internal): Handle multibyte and translation
in exactn and anychar.
(bcmp_translate): Handle multibyte chars for translation.
(TRANSLATE): Don't cast to unsigned char.
(PATFETCH): Use RE_TRANSLATE to translate.
| -rw-r--r-- | src/regex.c | 230 |
1 files changed, 133 insertions, 97 deletions
diff --git a/src/regex.c b/src/regex.c index a997402a15d..a26c0f57a65 100644 --- a/src/regex.c +++ b/src/regex.c | |||
| @@ -168,7 +168,7 @@ init_syntax_once () | |||
| 168 | 168 | ||
| 169 | #define SYNTAX(c) re_syntax_table[c] | 169 | #define SYNTAX(c) re_syntax_table[c] |
| 170 | 170 | ||
| 171 | /* Dummy macro for non emacs environments. */ | 171 | /* Dummy macros for non-Emacs environments. */ |
| 172 | #define BASE_LEADING_CODE_P(c) (0) | 172 | #define BASE_LEADING_CODE_P(c) (0) |
| 173 | #define WORD_BOUNDARY_P(c1, c2) (0) | 173 | #define WORD_BOUNDARY_P(c1, c2) (0) |
| 174 | #define CHAR_HEAD_P(p) (1) | 174 | #define CHAR_HEAD_P(p) (1) |
| @@ -1539,7 +1539,7 @@ static reg_errcode_t compile_range (); | |||
| 1539 | #define PATFETCH(c) \ | 1539 | #define PATFETCH(c) \ |
| 1540 | do {if (p == pend) return REG_EEND; \ | 1540 | do {if (p == pend) return REG_EEND; \ |
| 1541 | c = (unsigned char) *p++; \ | 1541 | c = (unsigned char) *p++; \ |
| 1542 | if (translate) c = (unsigned char) translate[c]; \ | 1542 | if (translate) c = RE_TRANSLATE (translate, c); \ |
| 1543 | } while (0) | 1543 | } while (0) |
| 1544 | #endif | 1544 | #endif |
| 1545 | 1545 | ||
| @@ -1560,7 +1560,7 @@ static reg_errcode_t compile_range (); | |||
| 1560 | when we use a character as a subscript we must make it unsigned. */ | 1560 | when we use a character as a subscript we must make it unsigned. */ |
| 1561 | #ifndef TRANSLATE | 1561 | #ifndef TRANSLATE |
| 1562 | #define TRANSLATE(d) \ | 1562 | #define TRANSLATE(d) \ |
| 1563 | (translate ? (unsigned char) RE_TRANSLATE (translate, (unsigned char) (d)) : (d)) | 1563 | (translate ? (unsigned) RE_TRANSLATE (translate, (unsigned) (d)) : (d)) |
| 1564 | #endif | 1564 | #endif |
| 1565 | 1565 | ||
| 1566 | 1566 | ||
| @@ -2107,9 +2107,10 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 2107 | incremented `p', by the way, to be the character after | 2107 | incremented `p', by the way, to be the character after |
| 2108 | the `*'. Do we have to do something analogous here | 2108 | the `*'. Do we have to do something analogous here |
| 2109 | for null bytes, because of RE_DOT_NOT_NULL? */ | 2109 | for null bytes, because of RE_DOT_NOT_NULL? */ |
| 2110 | if (TRANSLATE (*(p - 2)) == TRANSLATE ('.') | 2110 | if (TRANSLATE ((unsigned char)*(p - 2)) == TRANSLATE ('.') |
| 2111 | && zero_times_ok | 2111 | && zero_times_ok |
| 2112 | && p < pend && TRANSLATE (*p) == TRANSLATE ('\n') | 2112 | && p < pend |
| 2113 | && TRANSLATE ((unsigned char)*p) == TRANSLATE ('\n') | ||
| 2113 | && !(syntax & RE_DOT_NEWLINE)) | 2114 | && !(syntax & RE_DOT_NEWLINE)) |
| 2114 | { /* We have .*\n. */ | 2115 | { /* We have .*\n. */ |
| 2115 | STORE_JUMP (jump, b, laststart); | 2116 | STORE_JUMP (jump, b, laststart); |
| @@ -2333,7 +2334,18 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 2333 | p += len; | 2334 | p += len; |
| 2334 | } | 2335 | } |
| 2335 | 2336 | ||
| 2336 | if (!SAME_CHARSET_P (c, c1)) | 2337 | if (SINGLE_BYTE_CHAR_P (c) |
| 2338 | && ! SINGLE_BYTE_CHAR_P (c1)) | ||
| 2339 | { | ||
| 2340 | /* Handle a range such as \177-\377 in multibyte mode. | ||
| 2341 | Split that into two ranges,, | ||
| 2342 | the low one ending at 0237, and the high one | ||
| 2343 | starting at ...040. */ | ||
| 2344 | int c1_base = (c1 & ~0177) | 040; | ||
| 2345 | SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1); | ||
| 2346 | c1 = 0237; | ||
| 2347 | } | ||
| 2348 | else if (!SAME_CHARSET_P (c, c1)) | ||
| 2337 | FREE_STACK_RETURN (REG_ERANGE); | 2349 | FREE_STACK_RETURN (REG_ERANGE); |
| 2338 | } | 2350 | } |
| 2339 | else | 2351 | else |
| @@ -2359,8 +2371,8 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 2359 | for (this_char = range_start; this_char <= range_end; | 2371 | for (this_char = range_start; this_char <= range_end; |
| 2360 | this_char++) | 2372 | this_char++) |
| 2361 | SET_LIST_BIT (TRANSLATE (this_char)); | 2373 | SET_LIST_BIT (TRANSLATE (this_char)); |
| 2374 | } | ||
| 2362 | } | 2375 | } |
| 2363 | } | ||
| 2364 | else | 2376 | else |
| 2365 | /* ... into range table. */ | 2377 | /* ... into range table. */ |
| 2366 | SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1); | 2378 | SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1); |
| @@ -2913,8 +2925,8 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 2913 | /* Here, C may translated, therefore C may not equal to *P1. */ | 2925 | /* Here, C may translated, therefore C may not equal to *P1. */ |
| 2914 | while (1) | 2926 | while (1) |
| 2915 | { | 2927 | { |
| 2916 | BUF_PUSH (c); | 2928 | BUF_PUSH (c); |
| 2917 | (*pending_exact)++; | 2929 | (*pending_exact)++; |
| 2918 | if (++p1 == p) | 2930 | if (++p1 == p) |
| 2919 | break; | 2931 | break; |
| 2920 | 2932 | ||
| @@ -3121,64 +3133,6 @@ group_in_compile_stack (compile_stack, regnum) | |||
| 3121 | 3133 | ||
| 3122 | return false; | 3134 | return false; |
| 3123 | } | 3135 | } |
| 3124 | |||
| 3125 | |||
| 3126 | /* Read the ending character of a range (in a bracket expression) from the | ||
| 3127 | uncompiled pattern *P_PTR (which ends at PEND). We assume the | ||
| 3128 | starting character is in `P[-2]'. (`P[-1]' is the character `-'.) | ||
| 3129 | Then we set the translation of all bits between the starting and | ||
| 3130 | ending characters (inclusive) in the compiled pattern B. | ||
| 3131 | |||
| 3132 | Return an error code. | ||
| 3133 | |||
| 3134 | We use these short variable names so we can use the same macros as | ||
| 3135 | `regex_compile' itself. */ | ||
| 3136 | |||
| 3137 | static reg_errcode_t | ||
| 3138 | compile_range (p_ptr, pend, translate, syntax, b) | ||
| 3139 | const char **p_ptr, *pend; | ||
| 3140 | RE_TRANSLATE_TYPE translate; | ||
| 3141 | reg_syntax_t syntax; | ||
| 3142 | unsigned char *b; | ||
| 3143 | { | ||
| 3144 | unsigned this_char; | ||
| 3145 | |||
| 3146 | const char *p = *p_ptr; | ||
| 3147 | int range_start, range_end; | ||
| 3148 | |||
| 3149 | if (p == pend) | ||
| 3150 | return REG_ERANGE; | ||
| 3151 | |||
| 3152 | /* Even though the pattern is a signed `char *', we need to fetch | ||
| 3153 | with unsigned char *'s; if the high bit of the pattern character | ||
| 3154 | is set, the range endpoints will be negative if we fetch using a | ||
| 3155 | signed char *. | ||
| 3156 | |||
| 3157 | We also want to fetch the endpoints without translating them; the | ||
| 3158 | appropriate translation is done in the bit-setting loop below. */ | ||
| 3159 | /* The SVR4 compiler on the 3B2 had trouble with unsigned const char *. */ | ||
| 3160 | range_start = ((const unsigned char *) p)[-2]; | ||
| 3161 | range_end = ((const unsigned char *) p)[0]; | ||
| 3162 | |||
| 3163 | /* Have to increment the pointer into the pattern string, so the | ||
| 3164 | caller isn't still at the ending character. */ | ||
| 3165 | (*p_ptr)++; | ||
| 3166 | |||
| 3167 | /* If the start is after the end, the range is empty. */ | ||
| 3168 | if (range_start > range_end) | ||
| 3169 | return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR; | ||
| 3170 | |||
| 3171 | /* Here we see why `this_char' has to be larger than an `unsigned | ||
| 3172 | char' -- the range is inclusive, so if `range_end' == 0xff | ||
| 3173 | (assuming 8-bit characters), we would otherwise go into an infinite | ||
| 3174 | loop, since all characters <= 0xff. */ | ||
| 3175 | for (this_char = range_start; this_char <= range_end; this_char++) | ||
| 3176 | { | ||
| 3177 | SET_LIST_BIT (TRANSLATE (this_char)); | ||
| 3178 | } | ||
| 3179 | |||
| 3180 | return REG_NOERROR; | ||
| 3181 | } | ||
| 3182 | 3136 | ||
| 3183 | /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in | 3137 | /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in |
| 3184 | BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible | 3138 | BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible |
| @@ -3812,24 +3766,45 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) | |||
| 3812 | the first null string. */ | 3766 | the first null string. */ |
| 3813 | if (fastmap && startpos < total_size && !bufp->can_be_null) | 3767 | if (fastmap && startpos < total_size && !bufp->can_be_null) |
| 3814 | { | 3768 | { |
| 3769 | register const char *d; | ||
| 3770 | register unsigned int buf_ch; | ||
| 3771 | |||
| 3772 | d = POS_ADDR_VSTRING (startpos); | ||
| 3773 | |||
| 3815 | if (range > 0) /* Searching forwards. */ | 3774 | if (range > 0) /* Searching forwards. */ |
| 3816 | { | 3775 | { |
| 3817 | register const char *d; | ||
| 3818 | register int lim = 0; | 3776 | register int lim = 0; |
| 3819 | int irange = range; | 3777 | int irange = range; |
| 3820 | 3778 | ||
| 3821 | if (startpos < size1 && startpos + range >= size1) | 3779 | if (startpos < size1 && startpos + range >= size1) |
| 3822 | lim = range - (size1 - startpos); | 3780 | lim = range - (size1 - startpos); |
| 3823 | 3781 | ||
| 3824 | d = POS_ADDR_VSTRING (startpos); | ||
| 3825 | |||
| 3826 | /* Written out as an if-else to avoid testing `translate' | 3782 | /* Written out as an if-else to avoid testing `translate' |
| 3827 | inside the loop. */ | 3783 | inside the loop. */ |
| 3828 | if (translate) | 3784 | if (translate) |
| 3829 | while (range > lim | 3785 | { |
| 3830 | && !fastmap[(unsigned char) | 3786 | if (multibyte) |
| 3831 | RE_TRANSLATE (translate, (unsigned char) *d++)]) | 3787 | while (range > lim) |
| 3832 | range--; | 3788 | { |
| 3789 | int buf_charlen; | ||
| 3790 | |||
| 3791 | buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim, | ||
| 3792 | buf_charlen); | ||
| 3793 | |||
| 3794 | buf_ch = RE_TRANSLATE (translate, buf_ch); | ||
| 3795 | if (buf_ch >= 0400 | ||
| 3796 | || fastmap[buf_ch]) | ||
| 3797 | break; | ||
| 3798 | |||
| 3799 | range -= buf_charlen; | ||
| 3800 | d += buf_charlen; | ||
| 3801 | } | ||
| 3802 | else | ||
| 3803 | while (range > lim | ||
| 3804 | && !fastmap[(unsigned char) | ||
| 3805 | RE_TRANSLATE (translate, (unsigned char) *d++)]) | ||
| 3806 | range--; | ||
| 3807 | } | ||
| 3833 | else | 3808 | else |
| 3834 | while (range > lim && !fastmap[(unsigned char) *d++]) | 3809 | while (range > lim && !fastmap[(unsigned char) *d++]) |
| 3835 | range--; | 3810 | range--; |
| @@ -3838,11 +3813,16 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) | |||
| 3838 | } | 3813 | } |
| 3839 | else /* Searching backwards. */ | 3814 | else /* Searching backwards. */ |
| 3840 | { | 3815 | { |
| 3841 | register char c = (size1 == 0 || startpos >= size1 | 3816 | int room = (size1 == 0 || startpos >= size1 |
| 3842 | ? string2[startpos - size1] | 3817 | ? size2 + size1 - startpos |
| 3843 | : string1[startpos]); | 3818 | : size1 - startpos); |
| 3819 | |||
| 3820 | buf_ch = STRING_CHAR (d, room); | ||
| 3821 | if (translate) | ||
| 3822 | buf_ch = RE_TRANSLATE (translate, buf_ch); | ||
| 3844 | 3823 | ||
| 3845 | if (!fastmap[(unsigned char) TRANSLATE (c)]) | 3824 | if (! (buf_ch >= 0400 |
| 3825 | || fastmap[buf_ch])) | ||
| 3846 | goto advance; | 3826 | goto advance; |
| 3847 | } | 3827 | } |
| 3848 | } | 3828 | } |
| @@ -4515,14 +4495,36 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) | |||
| 4515 | testing `translate' inside the loop. */ | 4495 | testing `translate' inside the loop. */ |
| 4516 | if (translate) | 4496 | if (translate) |
| 4517 | { | 4497 | { |
| 4518 | do | 4498 | #ifdef emacs |
| 4519 | { | 4499 | if (multibyte) |
| 4520 | PREFETCH (); | 4500 | do |
| 4521 | if ((unsigned char) RE_TRANSLATE (translate, (unsigned char) *d++) | 4501 | { |
| 4522 | != (unsigned char) *p++) | 4502 | int pat_charlen, buf_charlen; |
| 4523 | goto fail; | 4503 | int pat_ch, buf_ch; |
| 4524 | } | 4504 | |
| 4525 | while (--mcnt); | 4505 | PREFETCH (); |
| 4506 | pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen); | ||
| 4507 | buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen); | ||
| 4508 | |||
| 4509 | if (RE_TRANSLATE (translate, buf_ch) | ||
| 4510 | != pat_ch) | ||
| 4511 | goto fail; | ||
| 4512 | |||
| 4513 | p += pat_charlen; | ||
| 4514 | d += buf_charlen; | ||
| 4515 | mcnt -= pat_charlen; | ||
| 4516 | } | ||
| 4517 | while (mcnt > 0); | ||
| 4518 | else | ||
| 4519 | #endif /* not emacs */ | ||
| 4520 | do | ||
| 4521 | { | ||
| 4522 | PREFETCH (); | ||
| 4523 | if ((unsigned char) RE_TRANSLATE (translate, (unsigned char) *d++) | ||
| 4524 | != (unsigned char) *p++) | ||
| 4525 | goto fail; | ||
| 4526 | } | ||
| 4527 | while (--mcnt); | ||
| 4526 | } | 4528 | } |
| 4527 | else | 4529 | else |
| 4528 | { | 4530 | { |
| @@ -4539,17 +4541,36 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) | |||
| 4539 | 4541 | ||
| 4540 | /* Match any character except possibly a newline or a null. */ | 4542 | /* Match any character except possibly a newline or a null. */ |
| 4541 | case anychar: | 4543 | case anychar: |
| 4542 | DEBUG_PRINT1 ("EXECUTING anychar.\n"); | 4544 | { |
| 4545 | int buf_charlen; | ||
| 4546 | int buf_ch; | ||
| 4543 | 4547 | ||
| 4544 | PREFETCH (); | 4548 | DEBUG_PRINT1 ("EXECUTING anychar.\n"); |
| 4545 | 4549 | ||
| 4546 | if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n') | 4550 | PREFETCH (); |
| 4547 | || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000')) | ||
| 4548 | goto fail; | ||
| 4549 | 4551 | ||
| 4550 | SET_REGS_MATCHED (); | 4552 | #ifdef emacs |
| 4551 | DEBUG_PRINT2 (" Matched `%d'.\n", *d); | 4553 | if (multibyte) |
| 4552 | d += multibyte ? MULTIBYTE_FORM_LENGTH (d, dend - d) : 1; | 4554 | buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen); |
| 4555 | else | ||
| 4556 | #endif /* not emacs */ | ||
| 4557 | { | ||
| 4558 | buf_ch = *d; | ||
| 4559 | buf_charlen = 1; | ||
| 4560 | } | ||
| 4561 | |||
| 4562 | buf_ch = TRANSLATE (buf_ch); | ||
| 4563 | |||
| 4564 | if ((!(bufp->syntax & RE_DOT_NEWLINE) | ||
| 4565 | && buf_ch == '\n') | ||
| 4566 | || ((bufp->syntax & RE_DOT_NOT_NULL) | ||
| 4567 | && buf_ch == '\000')) | ||
| 4568 | goto fail; | ||
| 4569 | |||
| 4570 | SET_REGS_MATCHED (); | ||
| 4571 | DEBUG_PRINT2 (" Matched `%d'.\n", *d); | ||
| 4572 | d += buf_charlen; | ||
| 4573 | } | ||
| 4553 | break; | 4574 | break; |
| 4554 | 4575 | ||
| 4555 | 4576 | ||
| @@ -5926,12 +5947,27 @@ bcmp_translate (s1, s2, len, translate) | |||
| 5926 | RE_TRANSLATE_TYPE translate; | 5947 | RE_TRANSLATE_TYPE translate; |
| 5927 | { | 5948 | { |
| 5928 | register unsigned char *p1 = s1, *p2 = s2; | 5949 | register unsigned char *p1 = s1, *p2 = s2; |
| 5929 | while (len) | 5950 | unsigned char *p1_end = s1 + len; |
| 5951 | unsigned char *p2_end = s2 + len; | ||
| 5952 | |||
| 5953 | while (p1 != p1_end && p2 != p2_end) | ||
| 5930 | { | 5954 | { |
| 5931 | if (RE_TRANSLATE (translate, *p1++) != RE_TRANSLATE (translate, *p2++)) | 5955 | int p1_charlen, p2_charlen; |
| 5956 | int p1_ch, p2_ch; | ||
| 5957 | |||
| 5958 | p1_ch = STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen); | ||
| 5959 | p2_ch = STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen); | ||
| 5960 | |||
| 5961 | if (RE_TRANSLATE (translate, p1_ch) | ||
| 5962 | != RE_TRANSLATE (translate, p2_ch)) | ||
| 5932 | return 1; | 5963 | return 1; |
| 5933 | len--; | 5964 | |
| 5965 | p1 += p1_charlen, p2 += p2_charlen; | ||
| 5934 | } | 5966 | } |
| 5967 | |||
| 5968 | if (p1 != p1_end || p2 != p2_end) | ||
| 5969 | return 1; | ||
| 5970 | |||
| 5935 | return 0; | 5971 | return 0; |
| 5936 | } | 5972 | } |
| 5937 | 5973 | ||