diff options
| author | Stefan Monnier | 2000-04-02 23:56:46 +0000 |
|---|---|---|
| committer | Stefan Monnier | 2000-04-02 23:56:46 +0000 |
| commit | 2d1675e45c46d97aec4c6af28a0719778f79b8da (patch) | |
| tree | 11c6492fbe96211f75cc8f4ac37e7e0cde1538bb /src | |
| parent | 096540869a2be7676946ef9c4ee52e5cedb5c28a (diff) | |
| download | emacs-2d1675e45c46d97aec4c6af28a0719778f79b8da.tar.gz emacs-2d1675e45c46d97aec4c6af28a0719778f79b8da.zip | |
* regex.c (PTR_TO_OFFSET) [!emacs]: Remove.
(RE_MULTIBYTE_P, RE_STRING_CHAR_AND_LENGTH): New macros.
(GET_CHAR_BEFORE_2): Moved from charset.h, and fixed a minor bug when
P is exactly at the boundary between str1 and str2 (P == str2).
(MAX_MULTIBYTE_LENGTH, CHAR_STRING) [!emacs]: Provide trivial default.
(PATFETCH): Use `TRANSLATE'.
(PATFETCH_RAW): Fetch multibyte char if applicable.
(PATUNFETCH): Remove.
(regex_compile): Rely on PATFETCH to do most of the multibyte magic.
When writing a char, write it directly into the pattern buffer rather
than going needlessly through a temp char-array.
(re_match_2_internal): Similarly, rely on RE_STRING_CHAR to do the
multibyte magic and remove the useless `#ifdef emacs'.
(bcmp_translate): Don't compare as multibyte chars when in a unibyte
buffer.
* regex.h (struct re_pattern_buffer): Make field `multibyte'
conditional on `emacs'.
* charset.h (GET_CHAR_BEFORE_2): Moved to regex.c.
Diffstat (limited to 'src')
| -rw-r--r-- | src/ChangeLog | 23 | ||||
| -rw-r--r-- | src/charset.h | 12 | ||||
| -rw-r--r-- | src/regex.c | 216 | ||||
| -rw-r--r-- | src/regex.h | 2 |
4 files changed, 110 insertions, 143 deletions
diff --git a/src/ChangeLog b/src/ChangeLog index de883b830a4..9f3f20bbb3e 100644 --- a/src/ChangeLog +++ b/src/ChangeLog | |||
| @@ -1,3 +1,26 @@ | |||
| 1 | 2000-04-02 Stefan Monnier <monnier@cs.yale.edu> | ||
| 2 | |||
| 3 | * regex.c (PTR_TO_OFFSET) [!emacs]: Remove. | ||
| 4 | (RE_MULTIBYTE_P, RE_STRING_CHAR_AND_LENGTH): New macros. | ||
| 5 | (GET_CHAR_BEFORE_2): Moved from charset.h plus fixed minor bug when | ||
| 6 | we are between str1 and str2. | ||
| 7 | (MAX_MULTIBYTE_LENGTH, CHAR_STRING) [!emacs]: Provide trivial default. | ||
| 8 | (PATFETCH): Use `TRANSLATE'. | ||
| 9 | (PATFETCH_RAW): Fetch multibyte char if applicable. | ||
| 10 | (PATUNFETCH): Remove. | ||
| 11 | (regex_compile): Rely on PATFETCH to do most of the multibyte magic. | ||
| 12 | When writing a char, write it directly into the pattern buffer rather | ||
| 13 | than going needlessly through a temp char-array. | ||
| 14 | (re_match_2_internal): Similarly, rely on RE_STRING_CHAR to do the | ||
| 15 | multibyte magic and remove the useless `#ifdef emacs'. | ||
| 16 | (bcmp_translate): Don't compare as multibyte chars when in a unibyte | ||
| 17 | buffer. | ||
| 18 | |||
| 19 | * regex.h (struct re_pattern_buffer): Make field `multibyte' | ||
| 20 | conditional on `emacs'. | ||
| 21 | |||
| 22 | * charset.h (GET_CHAR_BEFORE_2): Moved to regex.c. | ||
| 23 | |||
| 1 | 2000-04-01 Ken Raeburn <raeburn@gnu.org> | 24 | 2000-04-01 Ken Raeburn <raeburn@gnu.org> |
| 2 | 25 | ||
| 3 | * alloc.c (MARK_STRING, UNMARK_STRING, STRING_MARKED_P): Expand | 26 | * alloc.c (MARK_STRING, UNMARK_STRING, STRING_MARKED_P): Expand |
diff --git a/src/charset.h b/src/charset.h index 3acc447c5f8..d9257b8955a 100644 --- a/src/charset.h +++ b/src/charset.h | |||
| @@ -577,18 +577,6 @@ else | |||
| 577 | ? 1 \ | 577 | ? 1 \ |
| 578 | : multibyte_form_length (str, len)) | 578 | : multibyte_form_length (str, len)) |
| 579 | 579 | ||
| 580 | /* Set C a (possibly multibyte) character before P. P points into a | ||
| 581 | string which is the virtual concatenation of STR1 (which ends at | ||
| 582 | END1) or STR2 (which ends at END2). */ | ||
| 583 | |||
| 584 | #define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \ | ||
| 585 | do { \ | ||
| 586 | const unsigned char *dtemp = (p); \ | ||
| 587 | const unsigned char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \ | ||
| 588 | while (dtemp-- > dlimit && *dtemp >= 0xA0); \ | ||
| 589 | c = STRING_CHAR (dtemp, p - dtemp); \ | ||
| 590 | } while (0) | ||
| 591 | |||
| 592 | #ifdef emacs | 580 | #ifdef emacs |
| 593 | 581 | ||
| 594 | /* Increase the buffer byte position POS_BYTE of the current buffer to | 582 | /* Increase the buffer byte position POS_BYTE of the current buffer to |
diff --git a/src/regex.c b/src/regex.c index 911daed209d..9a56db728e5 100644 --- a/src/regex.c +++ b/src/regex.c | |||
| @@ -20,7 +20,6 @@ | |||
| 20 | USA. */ | 20 | USA. */ |
| 21 | 21 | ||
| 22 | /* TODO: | 22 | /* TODO: |
| 23 | - clean up multibyte issues | ||
| 24 | - structure the opcode space into opcode+flag. | 23 | - structure the opcode space into opcode+flag. |
| 25 | - merge with glibc's regex.[ch] | 24 | - merge with glibc's regex.[ch] |
| 26 | */ | 25 | */ |
| @@ -37,8 +36,6 @@ | |||
| 37 | /* Converts the pointer to the char to BEG-based offset from the start. */ | 36 | /* Converts the pointer to the char to BEG-based offset from the start. */ |
| 38 | #define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d)) | 37 | #define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d)) |
| 39 | #define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object))) | 38 | #define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object))) |
| 40 | #else | ||
| 41 | #define PTR_TO_OFFSET(d) 0 | ||
| 42 | #endif | 39 | #endif |
| 43 | 40 | ||
| 44 | #ifdef HAVE_CONFIG_H | 41 | #ifdef HAVE_CONFIG_H |
| @@ -79,8 +76,28 @@ | |||
| 79 | #define realloc xrealloc | 76 | #define realloc xrealloc |
| 80 | #define free xfree | 77 | #define free xfree |
| 81 | 78 | ||
| 79 | #define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte) | ||
| 82 | #define RE_STRING_CHAR(p, s) \ | 80 | #define RE_STRING_CHAR(p, s) \ |
| 83 | (multibyte ? (STRING_CHAR (p, s)) : (*(p))) | 81 | (multibyte ? (STRING_CHAR (p, s)) : (*(p))) |
| 82 | #define RE_STRING_CHAR_AND_LENGTH(p, s, len) \ | ||
| 83 | (multibyte ? (STRING_CHAR_AND_LENGTH (p, s, len)) : ((len) = 1, *(p))) | ||
| 84 | |||
| 85 | /* Set C a (possibly multibyte) character before P. P points into a | ||
| 86 | string which is the virtual concatenation of STR1 (which ends at | ||
| 87 | END1) or STR2 (which ends at END2). */ | ||
| 88 | #define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \ | ||
| 89 | do { \ | ||
| 90 | if (multibyte) \ | ||
| 91 | { \ | ||
| 92 | re_char *dtemp = (p) == (str2) ? (end1) : (p); \ | ||
| 93 | re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \ | ||
| 94 | while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp)); \ | ||
| 95 | c = STRING_CHAR (dtemp, (p) - dtemp); \ | ||
| 96 | } \ | ||
| 97 | else \ | ||
| 98 | (c = ((p) == (str2) ? (end1) : (p))[-1]); \ | ||
| 99 | } while (0) | ||
| 100 | |||
| 84 | 101 | ||
| 85 | #else /* not emacs */ | 102 | #else /* not emacs */ |
| 86 | 103 | ||
| @@ -181,6 +198,8 @@ init_syntax_once () | |||
| 181 | #define BASE_LEADING_CODE_P(c) (0) | 198 | #define BASE_LEADING_CODE_P(c) (0) |
| 182 | #define CHAR_CHARSET(c) 0 | 199 | #define CHAR_CHARSET(c) 0 |
| 183 | #define CHARSET_LEADING_CODE_BASE(c) 0 | 200 | #define CHARSET_LEADING_CODE_BASE(c) 0 |
| 201 | #define MAX_MULTIBYTE_LENGTH 1 | ||
| 202 | #define RE_MULTIBYTE_P(x) 0 | ||
| 184 | #define WORD_BOUNDARY_P(c1, c2) (0) | 203 | #define WORD_BOUNDARY_P(c1, c2) (0) |
| 185 | #define CHAR_HEAD_P(p) (1) | 204 | #define CHAR_HEAD_P(p) (1) |
| 186 | #define SINGLE_BYTE_CHAR_P(c) (1) | 205 | #define SINGLE_BYTE_CHAR_P(c) (1) |
| @@ -188,7 +207,9 @@ init_syntax_once () | |||
| 188 | #define MULTIBYTE_FORM_LENGTH(p, s) (1) | 207 | #define MULTIBYTE_FORM_LENGTH(p, s) (1) |
| 189 | #define STRING_CHAR(p, s) (*(p)) | 208 | #define STRING_CHAR(p, s) (*(p)) |
| 190 | #define RE_STRING_CHAR STRING_CHAR | 209 | #define RE_STRING_CHAR STRING_CHAR |
| 210 | #define CHAR_STRING(c, s) (*(s) = (c), 1) | ||
| 191 | #define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p)) | 211 | #define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p)) |
| 212 | #define RE_STRING_CHAR_AND_LENGTH STRING_CHAR_AND_LENGTH | ||
| 192 | #define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \ | 213 | #define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \ |
| 193 | (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1))) | 214 | (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1))) |
| 194 | #endif /* not emacs */ | 215 | #endif /* not emacs */ |
| @@ -1550,19 +1571,19 @@ static int analyse_first _RE_ARGS((unsigned char *p, unsigned char *pend, | |||
| 1550 | #define PATFETCH(c) \ | 1571 | #define PATFETCH(c) \ |
| 1551 | do { \ | 1572 | do { \ |
| 1552 | PATFETCH_RAW (c); \ | 1573 | PATFETCH_RAW (c); \ |
| 1553 | if (RE_TRANSLATE_P (translate)) c = RE_TRANSLATE (translate, c); \ | 1574 | c = TRANSLATE (c); \ |
| 1554 | } while (0) | 1575 | } while (0) |
| 1555 | 1576 | ||
| 1556 | /* Fetch the next character in the uncompiled pattern, with no | 1577 | /* Fetch the next character in the uncompiled pattern, with no |
| 1557 | translation. */ | 1578 | translation. */ |
| 1558 | #define PATFETCH_RAW(c) \ | 1579 | #define PATFETCH_RAW(c) \ |
| 1559 | do {if (p == pend) return REG_EEND; \ | 1580 | do { \ |
| 1560 | c = *p++; \ | 1581 | int len; \ |
| 1582 | if (p == pend) return REG_EEND; \ | ||
| 1583 | c = RE_STRING_CHAR_AND_LENGTH (p, pend - p, len); \ | ||
| 1584 | p += len; \ | ||
| 1561 | } while (0) | 1585 | } while (0) |
| 1562 | 1586 | ||
| 1563 | /* Go backwards one character in the pattern. */ | ||
| 1564 | #define PATUNFETCH p-- | ||
| 1565 | |||
| 1566 | 1587 | ||
| 1567 | /* If `translate' is non-null, return translate[D], else just D. We | 1588 | /* If `translate' is non-null, return translate[D], else just D. We |
| 1568 | cast the subscript to translate because some data is declared as | 1589 | cast the subscript to translate because some data is declared as |
| @@ -1957,6 +1978,9 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 1957 | /* Work area for range table of charset. */ | 1978 | /* Work area for range table of charset. */ |
| 1958 | struct range_table_work_area range_table_work; | 1979 | struct range_table_work_area range_table_work; |
| 1959 | 1980 | ||
| 1981 | /* If the object matched can contain multibyte characters. */ | ||
| 1982 | const boolean multibyte = RE_MULTIBYTE_P (bufp); | ||
| 1983 | |||
| 1960 | #ifdef DEBUG | 1984 | #ifdef DEBUG |
| 1961 | debug++; | 1985 | debug++; |
| 1962 | DEBUG_PRINT1 ("\nCompiling pattern: "); | 1986 | DEBUG_PRINT1 ("\nCompiling pattern: "); |
| @@ -1994,14 +2018,6 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 1994 | /* Always count groups, whether or not bufp->no_sub is set. */ | 2018 | /* Always count groups, whether or not bufp->no_sub is set. */ |
| 1995 | bufp->re_nsub = 0; | 2019 | bufp->re_nsub = 0; |
| 1996 | 2020 | ||
| 1997 | #ifdef emacs | ||
| 1998 | /* bufp->multibyte is set before regex_compile is called, so don't alter | ||
| 1999 | it. */ | ||
| 2000 | #else /* not emacs */ | ||
| 2001 | /* Nothing is recognized as a multibyte character. */ | ||
| 2002 | bufp->multibyte = 0; | ||
| 2003 | #endif | ||
| 2004 | |||
| 2005 | #if !defined (emacs) && !defined (SYNTAX_TABLE) | 2021 | #if !defined (emacs) && !defined (SYNTAX_TABLE) |
| 2006 | /* Initialize the syntax table. */ | 2022 | /* Initialize the syntax table. */ |
| 2007 | init_syntax_once (); | 2023 | init_syntax_once (); |
| @@ -2254,8 +2270,8 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 2254 | /* Read in characters and ranges, setting map bits. */ | 2270 | /* Read in characters and ranges, setting map bits. */ |
| 2255 | for (;;) | 2271 | for (;;) |
| 2256 | { | 2272 | { |
| 2257 | int len; | ||
| 2258 | boolean escaped_char = false; | 2273 | boolean escaped_char = false; |
| 2274 | const unsigned char *p2 = p; | ||
| 2259 | 2275 | ||
| 2260 | if (p == pend) FREE_STACK_RETURN (REG_EBRACK); | 2276 | if (p == pend) FREE_STACK_RETURN (REG_EBRACK); |
| 2261 | 2277 | ||
| @@ -2274,19 +2290,10 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 2274 | /* Could be the end of the bracket expression. If it's | 2290 | /* Could be the end of the bracket expression. If it's |
| 2275 | not (i.e., when the bracket expression is `[]' so | 2291 | not (i.e., when the bracket expression is `[]' so |
| 2276 | far), the ']' character bit gets set way below. */ | 2292 | far), the ']' character bit gets set way below. */ |
| 2277 | if (c == ']' && p != p1 + 1) | 2293 | if (c == ']' && p2 != p1) |
| 2278 | break; | 2294 | break; |
| 2279 | } | 2295 | } |
| 2280 | 2296 | ||
| 2281 | /* If C indicates start of multibyte char, get the | ||
| 2282 | actual character code in C, and set the pattern | ||
| 2283 | pointer P to the next character boundary. */ | ||
| 2284 | if (bufp->multibyte && BASE_LEADING_CODE_P (c)) | ||
| 2285 | { | ||
| 2286 | PATUNFETCH; | ||
| 2287 | c = STRING_CHAR_AND_LENGTH (p, pend - p, len); | ||
| 2288 | p += len; | ||
| 2289 | } | ||
| 2290 | /* What should we do for the character which is | 2297 | /* What should we do for the character which is |
| 2291 | greater than 0x7F, but not BASE_LEADING_CODE_P? | 2298 | greater than 0x7F, but not BASE_LEADING_CODE_P? |
| 2292 | XXX */ | 2299 | XXX */ |
| @@ -2294,8 +2301,8 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 2294 | /* See if we're at the beginning of a possible character | 2301 | /* See if we're at the beginning of a possible character |
| 2295 | class. */ | 2302 | class. */ |
| 2296 | 2303 | ||
| 2297 | else if (!escaped_char && | 2304 | if (!escaped_char && |
| 2298 | syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') | 2305 | syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') |
| 2299 | { | 2306 | { |
| 2300 | /* Leave room for the null. */ | 2307 | /* Leave room for the null. */ |
| 2301 | char str[CHAR_CLASS_MAX_LENGTH + 1]; | 2308 | char str[CHAR_CLASS_MAX_LENGTH + 1]; |
| @@ -2358,7 +2365,7 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 2358 | they can only match ASCII characters. We | 2365 | they can only match ASCII characters. We |
| 2359 | don't need to handle them for multibyte. */ | 2366 | don't need to handle them for multibyte. */ |
| 2360 | 2367 | ||
| 2361 | if (bufp->multibyte) | 2368 | if (multibyte) |
| 2362 | { | 2369 | { |
| 2363 | int bit = 0; | 2370 | int bit = 0; |
| 2364 | 2371 | ||
| @@ -2435,12 +2442,6 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 2435 | 2442 | ||
| 2436 | /* Fetch the character which ends the range. */ | 2443 | /* Fetch the character which ends the range. */ |
| 2437 | PATFETCH (c1); | 2444 | PATFETCH (c1); |
| 2438 | if (bufp->multibyte && BASE_LEADING_CODE_P (c1)) | ||
| 2439 | { | ||
| 2440 | PATUNFETCH; | ||
| 2441 | c1 = STRING_CHAR_AND_LENGTH (p, pend - p, len); | ||
| 2442 | p += len; | ||
| 2443 | } | ||
| 2444 | 2445 | ||
| 2445 | if (SINGLE_BYTE_CHAR_P (c) | 2446 | if (SINGLE_BYTE_CHAR_P (c) |
| 2446 | && ! SINGLE_BYTE_CHAR_P (c1)) | 2447 | && ! SINGLE_BYTE_CHAR_P (c1)) |
| @@ -3028,16 +3029,6 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 3028 | default: | 3029 | default: |
| 3029 | /* Expects the character in `c'. */ | 3030 | /* Expects the character in `c'. */ |
| 3030 | normal_char: | 3031 | normal_char: |
| 3031 | p1 = p - 1; /* P1 points the head of C. */ | ||
| 3032 | #ifdef emacs | ||
| 3033 | if (bufp->multibyte) | ||
| 3034 | { | ||
| 3035 | c = STRING_CHAR (p1, pend - p1); | ||
| 3036 | c = TRANSLATE (c); | ||
| 3037 | /* Set P to the next character boundary. */ | ||
| 3038 | p += MULTIBYTE_FORM_LENGTH (p1, pend - p1) - 1; | ||
| 3039 | } | ||
| 3040 | #endif | ||
| 3041 | /* If no exactn currently being built. */ | 3032 | /* If no exactn currently being built. */ |
| 3042 | if (!pending_exact | 3033 | if (!pending_exact |
| 3043 | 3034 | ||
| @@ -3045,7 +3036,7 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 3045 | || pending_exact + *pending_exact + 1 != b | 3036 | || pending_exact + *pending_exact + 1 != b |
| 3046 | 3037 | ||
| 3047 | /* We have only one byte following the exactn for the count. */ | 3038 | /* We have only one byte following the exactn for the count. */ |
| 3048 | || *pending_exact >= (1 << BYTEWIDTH) - (p - p1) | 3039 | || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH |
| 3049 | 3040 | ||
| 3050 | /* If followed by a repetition operator. */ | 3041 | /* If followed by a repetition operator. */ |
| 3051 | || (p != pend && (*p == '*' || *p == '^')) | 3042 | || (p != pend && (*p == '*' || *p == '^')) |
| @@ -3065,24 +3056,13 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 3065 | pending_exact = b - 1; | 3056 | pending_exact = b - 1; |
| 3066 | } | 3057 | } |
| 3067 | 3058 | ||
| 3068 | #ifdef emacs | 3059 | GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH); |
| 3069 | if (! SINGLE_BYTE_CHAR_P (c)) | 3060 | { |
| 3070 | { | 3061 | int len = CHAR_STRING (c, b); |
| 3071 | unsigned char str[MAX_MULTIBYTE_LENGTH]; | 3062 | b += len; |
| 3072 | int i = CHAR_STRING (c, str); | 3063 | (*pending_exact) += len; |
| 3073 | int j; | 3064 | } |
| 3074 | for (j = 0; j < i; j++) | 3065 | |
| 3075 | { | ||
| 3076 | BUF_PUSH (str[j]); | ||
| 3077 | (*pending_exact)++; | ||
| 3078 | } | ||
| 3079 | } | ||
| 3080 | else | ||
| 3081 | #endif | ||
| 3082 | { | ||
| 3083 | BUF_PUSH (c); | ||
| 3084 | (*pending_exact)++; | ||
| 3085 | } | ||
| 3086 | break; | 3066 | break; |
| 3087 | } /* switch (c) */ | 3067 | } /* switch (c) */ |
| 3088 | } /* while p != pend */ | 3068 | } /* while p != pend */ |
| @@ -3616,7 +3596,7 @@ re_compile_fastmap (bufp) | |||
| 3616 | bufp->fastmap_accurate = 1; /* It will be when we're done. */ | 3596 | bufp->fastmap_accurate = 1; /* It will be when we're done. */ |
| 3617 | 3597 | ||
| 3618 | analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used, | 3598 | analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used, |
| 3619 | fastmap, bufp->multibyte); | 3599 | fastmap, RE_MULTIBYTE_P (bufp)); |
| 3620 | if (analysis < -1) | 3600 | if (analysis < -1) |
| 3621 | return analysis; | 3601 | return analysis; |
| 3622 | bufp->can_be_null = (analysis != 0); | 3602 | bufp->can_be_null = (analysis != 0); |
| @@ -3723,7 +3703,7 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop) | |||
| 3723 | int anchored_start = 0; | 3703 | int anchored_start = 0; |
| 3724 | 3704 | ||
| 3725 | /* Nonzero if we have to concern multibyte character. */ | 3705 | /* Nonzero if we have to concern multibyte character. */ |
| 3726 | const boolean multibyte = bufp->multibyte; | 3706 | const boolean multibyte = RE_MULTIBYTE_P (bufp); |
| 3727 | 3707 | ||
| 3728 | /* Check for out-of-range STARTPOS. */ | 3708 | /* Check for out-of-range STARTPOS. */ |
| 3729 | if (startpos < 0 || startpos > total_size) | 3709 | if (startpos < 0 || startpos > total_size) |
| @@ -3850,11 +3830,11 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop) | |||
| 3850 | } | 3830 | } |
| 3851 | else /* Searching backwards. */ | 3831 | else /* Searching backwards. */ |
| 3852 | { | 3832 | { |
| 3853 | buf_ch = STRING_CHAR (d, (startpos >= size1 | 3833 | int room = (startpos >= size1 |
| 3854 | ? size2 + size1 - startpos | 3834 | ? size2 + size1 - startpos |
| 3855 | : size1 - startpos)); | 3835 | : size1 - startpos); |
| 3856 | if (RE_TRANSLATE_P (translate)) | 3836 | buf_ch = RE_STRING_CHAR (d, room); |
| 3857 | buf_ch = RE_TRANSLATE (translate, buf_ch); | 3837 | buf_ch = TRANSLATE (buf_ch); |
| 3858 | 3838 | ||
| 3859 | if (! (buf_ch >= 0400 | 3839 | if (! (buf_ch >= 0400 |
| 3860 | || fastmap[buf_ch])) | 3840 | || fastmap[buf_ch])) |
| @@ -3940,7 +3920,10 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop) | |||
| 3940 | 3920 | ||
| 3941 | /* Declarations and macros for re_match_2. */ | 3921 | /* Declarations and macros for re_match_2. */ |
| 3942 | 3922 | ||
| 3943 | static int bcmp_translate (); | 3923 | static int bcmp_translate _RE_ARGS((re_char *s1, re_char *s2, |
| 3924 | register int len, | ||
| 3925 | RE_TRANSLATE_TYPE translate, | ||
| 3926 | const int multibyte)); | ||
| 3944 | 3927 | ||
| 3945 | /* This converts PTR, a pointer into one of the search strings `string1' | 3928 | /* This converts PTR, a pointer into one of the search strings `string1' |
| 3946 | and `string2' into an offset from the beginning of that string. */ | 3929 | and `string2' into an offset from the beginning of that string. */ |
| @@ -4093,7 +4076,7 @@ mutually_exclusive_p (bufp, p1, p2) | |||
| 4093 | unsigned char *p1, *p2; | 4076 | unsigned char *p1, *p2; |
| 4094 | { | 4077 | { |
| 4095 | re_opcode_t op2; | 4078 | re_opcode_t op2; |
| 4096 | const boolean multibyte = bufp->multibyte; | 4079 | const boolean multibyte = RE_MULTIBYTE_P (bufp); |
| 4097 | unsigned char *pend = bufp->buffer + bufp->used; | 4080 | unsigned char *pend = bufp->buffer + bufp->used; |
| 4098 | 4081 | ||
| 4099 | assert (p1 >= bufp->buffer && p1 < pend | 4082 | assert (p1 >= bufp->buffer && p1 < pend |
| @@ -4373,7 +4356,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) | |||
| 4373 | RE_TRANSLATE_TYPE translate = bufp->translate; | 4356 | RE_TRANSLATE_TYPE translate = bufp->translate; |
| 4374 | 4357 | ||
| 4375 | /* Nonzero if we have to concern multibyte character. */ | 4358 | /* Nonzero if we have to concern multibyte character. */ |
| 4376 | const boolean multibyte = bufp->multibyte; | 4359 | const boolean multibyte = RE_MULTIBYTE_P (bufp); |
| 4377 | 4360 | ||
| 4378 | /* Failure point stack. Each place that can handle a failure further | 4361 | /* Failure point stack. Each place that can handle a failure further |
| 4379 | down the line pushes a failure point on this stack. It consists of | 4362 | down the line pushes a failure point on this stack. It consists of |
| @@ -4721,7 +4704,6 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) | |||
| 4721 | testing `translate' inside the loop. */ | 4704 | testing `translate' inside the loop. */ |
| 4722 | if (RE_TRANSLATE_P (translate)) | 4705 | if (RE_TRANSLATE_P (translate)) |
| 4723 | { | 4706 | { |
| 4724 | #ifdef emacs | ||
| 4725 | if (multibyte) | 4707 | if (multibyte) |
| 4726 | do | 4708 | do |
| 4727 | { | 4709 | { |
| @@ -4745,7 +4727,6 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) | |||
| 4745 | } | 4727 | } |
| 4746 | while (mcnt > 0); | 4728 | while (mcnt > 0); |
| 4747 | else | 4729 | else |
| 4748 | #endif /* not emacs */ | ||
| 4749 | do | 4730 | do |
| 4750 | { | 4731 | { |
| 4751 | PREFETCH (); | 4732 | PREFETCH (); |
| @@ -4783,17 +4764,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) | |||
| 4783 | DEBUG_PRINT1 ("EXECUTING anychar.\n"); | 4764 | DEBUG_PRINT1 ("EXECUTING anychar.\n"); |
| 4784 | 4765 | ||
| 4785 | PREFETCH (); | 4766 | PREFETCH (); |
| 4786 | 4767 | buf_ch = RE_STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen); | |
| 4787 | #ifdef emacs | ||
| 4788 | if (multibyte) | ||
| 4789 | buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen); | ||
| 4790 | else | ||
| 4791 | #endif /* not emacs */ | ||
| 4792 | { | ||
| 4793 | buf_ch = *d; | ||
| 4794 | buf_charlen = 1; | ||
| 4795 | } | ||
| 4796 | |||
| 4797 | buf_ch = TRANSLATE (buf_ch); | 4768 | buf_ch = TRANSLATE (buf_ch); |
| 4798 | 4769 | ||
| 4799 | if ((!(bufp->syntax & RE_DOT_NEWLINE) | 4770 | if ((!(bufp->syntax & RE_DOT_NEWLINE) |
| @@ -4828,27 +4799,20 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) | |||
| 4828 | 4799 | ||
| 4829 | DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : ""); | 4800 | DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : ""); |
| 4830 | 4801 | ||
| 4831 | PREFETCH (); | ||
| 4832 | c = *d; | ||
| 4833 | |||
| 4834 | range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]); | 4802 | range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]); |
| 4835 | 4803 | ||
| 4836 | #ifdef emacs | ||
| 4837 | if (range_table_exists) | 4804 | if (range_table_exists) |
| 4838 | { | 4805 | { |
| 4839 | range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */ | 4806 | range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */ |
| 4840 | EXTRACT_NUMBER_AND_INCR (count, range_table); | 4807 | EXTRACT_NUMBER_AND_INCR (count, range_table); |
| 4841 | } | 4808 | } |
| 4842 | 4809 | ||
| 4843 | if (multibyte && BASE_LEADING_CODE_P (c)) | 4810 | PREFETCH (); |
| 4844 | c = STRING_CHAR_AND_LENGTH (d, dend - d, len); | 4811 | c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len); |
| 4845 | #endif /* emacs */ | 4812 | c = TRANSLATE (c); /* The character to match. */ |
| 4846 | 4813 | ||
| 4847 | if (SINGLE_BYTE_CHAR_P (c)) | 4814 | if (SINGLE_BYTE_CHAR_P (c)) |
| 4848 | { /* Lookup bitmap. */ | 4815 | { /* Lookup bitmap. */ |
| 4849 | c = TRANSLATE (c); /* The character to match. */ | ||
| 4850 | len = 1; | ||
| 4851 | |||
| 4852 | /* Cast to `unsigned' instead of `unsigned char' in | 4816 | /* Cast to `unsigned' instead of `unsigned char' in |
| 4853 | case the bit list is a full 32 bytes long. */ | 4817 | case the bit list is a full 32 bytes long. */ |
| 4854 | if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH) | 4818 | if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH) |
| @@ -4994,7 +4958,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) | |||
| 4994 | /* Compare that many; failure if mismatch, else move | 4958 | /* Compare that many; failure if mismatch, else move |
| 4995 | past them. */ | 4959 | past them. */ |
| 4996 | if (RE_TRANSLATE_P (translate) | 4960 | if (RE_TRANSLATE_P (translate) |
| 4997 | ? bcmp_translate (d, d2, mcnt, translate) | 4961 | ? bcmp_translate (d, d2, mcnt, translate, multibyte) |
| 4998 | : bcmp (d, d2, mcnt)) | 4962 | : bcmp (d, d2, mcnt)) |
| 4999 | { | 4963 | { |
| 5000 | d = dfail; | 4964 | d = dfail; |
| @@ -5263,18 +5227,17 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) | |||
| 5263 | is the character at D, and S2 is the syntax of C2. */ | 5227 | is the character at D, and S2 is the syntax of C2. */ |
| 5264 | int c1, c2, s1, s2; | 5228 | int c1, c2, s1, s2; |
| 5265 | #ifdef emacs | 5229 | #ifdef emacs |
| 5266 | int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d - 1)); | 5230 | int offset = PTR_TO_OFFSET (d - 1); |
| 5231 | int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset); | ||
| 5267 | UPDATE_SYNTAX_TABLE (charpos); | 5232 | UPDATE_SYNTAX_TABLE (charpos); |
| 5268 | #endif | 5233 | #endif |
| 5269 | /* FIXME: This does a STRING_CHAR even for unibyte buffers. */ | ||
| 5270 | GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2); | 5234 | GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2); |
| 5271 | s1 = SYNTAX (c1); | 5235 | s1 = SYNTAX (c1); |
| 5272 | #ifdef emacs | 5236 | #ifdef emacs |
| 5273 | UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1); | 5237 | UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1); |
| 5274 | #endif | 5238 | #endif |
| 5275 | PREFETCH (); | 5239 | PREFETCH (); |
| 5276 | /* FIXME: This does a STRING_CHAR even for unibyte buffers. */ | 5240 | c2 = RE_STRING_CHAR (d, dend - d); |
| 5277 | c2 = STRING_CHAR (d, dend - d); | ||
| 5278 | s2 = SYNTAX (c2); | 5241 | s2 = SYNTAX (c2); |
| 5279 | 5242 | ||
| 5280 | if (/* Case 2: Only one of S1 and S2 is Sword. */ | 5243 | if (/* Case 2: Only one of S1 and S2 is Sword. */ |
| @@ -5303,12 +5266,12 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) | |||
| 5303 | is the character at D, and S2 is the syntax of C2. */ | 5266 | is the character at D, and S2 is the syntax of C2. */ |
| 5304 | int c1, c2, s1, s2; | 5267 | int c1, c2, s1, s2; |
| 5305 | #ifdef emacs | 5268 | #ifdef emacs |
| 5306 | int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d)); | 5269 | int offset = PTR_TO_OFFSET (d); |
| 5270 | int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset); | ||
| 5307 | UPDATE_SYNTAX_TABLE (charpos); | 5271 | UPDATE_SYNTAX_TABLE (charpos); |
| 5308 | #endif | 5272 | #endif |
| 5309 | PREFETCH (); | 5273 | PREFETCH (); |
| 5310 | /* FIXME: This does a STRING_CHAR even for unibyte buffers. */ | 5274 | c2 = RE_STRING_CHAR (d, dend - d); |
| 5311 | c2 = STRING_CHAR (d, dend - d); | ||
| 5312 | s2 = SYNTAX (c2); | 5275 | s2 = SYNTAX (c2); |
| 5313 | 5276 | ||
| 5314 | /* Case 2: S2 is not Sword. */ | 5277 | /* Case 2: S2 is not Sword. */ |
| @@ -5346,7 +5309,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) | |||
| 5346 | is the character at D, and S2 is the syntax of C2. */ | 5309 | is the character at D, and S2 is the syntax of C2. */ |
| 5347 | int c1, c2, s1, s2; | 5310 | int c1, c2, s1, s2; |
| 5348 | #ifdef emacs | 5311 | #ifdef emacs |
| 5349 | int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d) - 1); | 5312 | int offset = PTR_TO_OFFSET (d) - 1; |
| 5313 | int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset); | ||
| 5350 | UPDATE_SYNTAX_TABLE (charpos); | 5314 | UPDATE_SYNTAX_TABLE (charpos); |
| 5351 | #endif | 5315 | #endif |
| 5352 | GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2); | 5316 | GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2); |
| @@ -5360,8 +5324,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) | |||
| 5360 | if (!AT_STRINGS_END (d)) | 5324 | if (!AT_STRINGS_END (d)) |
| 5361 | { | 5325 | { |
| 5362 | PREFETCH (); | 5326 | PREFETCH (); |
| 5363 | /* FIXME: This does a STRING_CHAR even for unibyte buffers. */ | 5327 | c2 = RE_STRING_CHAR (d, dend - d); |
| 5364 | c2 = STRING_CHAR (d, dend - d); | ||
| 5365 | #ifdef emacs | 5328 | #ifdef emacs |
| 5366 | UPDATE_SYNTAX_TABLE_FORWARD (charpos); | 5329 | UPDATE_SYNTAX_TABLE_FORWARD (charpos); |
| 5367 | #endif | 5330 | #endif |
| @@ -5383,20 +5346,15 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) | |||
| 5383 | PREFETCH (); | 5346 | PREFETCH (); |
| 5384 | #ifdef emacs | 5347 | #ifdef emacs |
| 5385 | { | 5348 | { |
| 5386 | int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d)); | 5349 | int offset = PTR_TO_OFFSET (d); |
| 5350 | int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset); | ||
| 5387 | UPDATE_SYNTAX_TABLE (pos1); | 5351 | UPDATE_SYNTAX_TABLE (pos1); |
| 5388 | } | 5352 | } |
| 5389 | #endif | 5353 | #endif |
| 5390 | { | 5354 | { |
| 5391 | int c, len; | 5355 | int c, len; |
| 5392 | 5356 | ||
| 5393 | if (multibyte) | 5357 | c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len); |
| 5394 | /* we must concern about multibyte form, ... */ | ||
| 5395 | c = STRING_CHAR_AND_LENGTH (d, dend - d, len); | ||
| 5396 | else | ||
| 5397 | /* everything should be handled as ASCII, even though it | ||
| 5398 | looks like multibyte form. */ | ||
| 5399 | c = *d, len = 1; | ||
| 5400 | 5358 | ||
| 5401 | if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not) | 5359 | if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not) |
| 5402 | goto fail; | 5360 | goto fail; |
| @@ -5431,11 +5389,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) | |||
| 5431 | PREFETCH (); | 5389 | PREFETCH (); |
| 5432 | { | 5390 | { |
| 5433 | int c, len; | 5391 | int c, len; |
| 5434 | 5392 | c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len); | |
| 5435 | if (multibyte) | ||
| 5436 | c = STRING_CHAR_AND_LENGTH (d, dend - d, len); | ||
| 5437 | else | ||
| 5438 | c = *d, len = 1; | ||
| 5439 | 5393 | ||
| 5440 | if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not) | 5394 | if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not) |
| 5441 | goto fail; | 5395 | goto fail; |
| @@ -5512,23 +5466,23 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) | |||
| 5512 | bytes; nonzero otherwise. */ | 5466 | bytes; nonzero otherwise. */ |
| 5513 | 5467 | ||
| 5514 | static int | 5468 | static int |
| 5515 | bcmp_translate (s1, s2, len, translate) | 5469 | bcmp_translate (s1, s2, len, translate, multibyte) |
| 5516 | unsigned char *s1, *s2; | 5470 | re_char *s1, *s2; |
| 5517 | register int len; | 5471 | register int len; |
| 5518 | RE_TRANSLATE_TYPE translate; | 5472 | RE_TRANSLATE_TYPE translate; |
| 5473 | const int multibyte; | ||
| 5519 | { | 5474 | { |
| 5520 | register unsigned char *p1 = s1, *p2 = s2; | 5475 | register re_char *p1 = s1, *p2 = s2; |
| 5521 | unsigned char *p1_end = s1 + len; | 5476 | re_char *p1_end = s1 + len; |
| 5522 | unsigned char *p2_end = s2 + len; | 5477 | re_char *p2_end = s2 + len; |
| 5523 | 5478 | ||
| 5524 | while (p1 != p1_end && p2 != p2_end) | 5479 | while (p1 != p1_end && p2 != p2_end) |
| 5525 | { | 5480 | { |
| 5526 | int p1_charlen, p2_charlen; | 5481 | int p1_charlen, p2_charlen; |
| 5527 | int p1_ch, p2_ch; | 5482 | int p1_ch, p2_ch; |
| 5528 | 5483 | ||
| 5529 | /* FIXME: This assumes `multibyte = true'. */ | 5484 | p1_ch = RE_STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen); |
| 5530 | p1_ch = STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen); | 5485 | p2_ch = RE_STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen); |
| 5531 | p2_ch = STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen); | ||
| 5532 | 5486 | ||
| 5533 | if (RE_TRANSLATE (translate, p1_ch) | 5487 | if (RE_TRANSLATE (translate, p1_ch) |
| 5534 | != RE_TRANSLATE (translate, p2_ch)) | 5488 | != RE_TRANSLATE (translate, p2_ch)) |
diff --git a/src/regex.h b/src/regex.h index 9ebc4e0bf22..9ee2060a1ed 100644 --- a/src/regex.h +++ b/src/regex.h | |||
| @@ -362,9 +362,11 @@ struct re_pattern_buffer | |||
| 362 | /* If true, an anchor at a newline matches. */ | 362 | /* If true, an anchor at a newline matches. */ |
| 363 | unsigned newline_anchor : 1; | 363 | unsigned newline_anchor : 1; |
| 364 | 364 | ||
| 365 | #ifdef emacs | ||
| 365 | /* If true, multi-byte form in the `buffer' should be recognized as a | 366 | /* If true, multi-byte form in the `buffer' should be recognized as a |
| 366 | multibyte character. */ | 367 | multibyte character. */ |
| 367 | unsigned multibyte : 1; | 368 | unsigned multibyte : 1; |
| 369 | #endif | ||
| 368 | 370 | ||
| 369 | /* [[[end pattern_buffer]]] */ | 371 | /* [[[end pattern_buffer]]] */ |
| 370 | }; | 372 | }; |