diff options
| author | Stefan Monnier | 2002-08-23 22:21:51 +0000 |
|---|---|---|
| committer | Stefan Monnier | 2002-08-23 22:21:51 +0000 |
| commit | 365958144ea38255d543a4232b926ca81e849fa9 (patch) | |
| tree | 43beeeefed478bcbfac634c44348351456decaff /src | |
| parent | d846a776e1043ad6d23a71a8daf42cc8b197c4f9 (diff) | |
| download | emacs-365958144ea38255d543a4232b926ca81e849fa9.tar.gz emacs-365958144ea38255d543a4232b926ca81e849fa9.zip | |
(PATFETCH): Remove the translating fetch.
(PATFETCH_RAW): Rename to PATFETCH.
(set_image_of_range): New fun.
(SET_RANGE_TABLE_WORK_AREA): Use it.
(regex_compile): Don't translate the pattern chars so eagerly.
Only do it when inserting an `exactn' bytecode or when handling a char-range.
(mutually_exclusive_p): Avoid empty statement.
Diffstat (limited to 'src')
| -rw-r--r-- | src/ChangeLog | 22 | ||||
| -rw-r--r-- | src/regex.c | 76 |
2 files changed, 66 insertions, 32 deletions
diff --git a/src/ChangeLog b/src/ChangeLog index 6dcc95b7f8d..c6180468193 100644 --- a/src/ChangeLog +++ b/src/ChangeLog | |||
| @@ -1,3 +1,14 @@ | |||
| 1 | 2002-08-23 Stefan Monnier <monnier@cs.yale.edu> | ||
| 2 | |||
| 3 | * regex.c (PATFETCH): Remove the translating fetch. | ||
| 4 | (PATFETCH_RAW): Rename to PATFETCH. | ||
| 5 | (set_image_of_range): New fun. | ||
| 6 | (SET_RANGE_TABLE_WORK_AREA): Use it. | ||
| 7 | (regex_compile): Don't translate the pattern chars so eagerly. | ||
| 8 | Only do it when inserting an `exactn' bytecode or when handling | ||
| 9 | a char-range. | ||
| 10 | (mutually_exclusive_p): Avoid empty statement. | ||
| 11 | |||
| 1 | 2002-08-22 Kim F. Storm <storm@cua.dk> | 12 | 2002-08-22 Kim F. Storm <storm@cua.dk> |
| 2 | 13 | ||
| 3 | * xdisp.c (redisplay_window): Do not `goto try_to_scroll' when we | 14 | * xdisp.c (redisplay_window): Do not `goto try_to_scroll' when we |
| @@ -511,11 +522,10 @@ | |||
| 511 | (parse_solitary_modifier, Fexecute_extended_command): Likewise. | 522 | (parse_solitary_modifier, Fexecute_extended_command): Likewise. |
| 512 | * textprop.c (validate_interval_range, interval_of): Likewise. | 523 | * textprop.c (validate_interval_range, interval_of): Likewise. |
| 513 | 524 | ||
| 514 | * fontset.c (Fset_fontset_font): Use SDATA instead of | 525 | * fontset.c (Fset_fontset_font): Use SDATA instead of XSTRING()->data. |
| 515 | XSTRING()->data. | ||
| 516 | 526 | ||
| 517 | * charset.h (FETCH_STRING_CHAR_ADVANCE, | 527 | * charset.h (FETCH_STRING_CHAR_ADVANCE) |
| 518 | FETCH_STRING_CHAR_ADVANCE_NO_CHECK): Use SBYTES instead of | 528 | (FETCH_STRING_CHAR_ADVANCE_NO_CHECK): Use SBYTES instead of |
| 519 | XSTRING()->size_byte. | 529 | XSTRING()->size_byte. |
| 520 | 530 | ||
| 521 | * lisp.h (SDATA, SREF): Produce rvalue. | 531 | * lisp.h (SDATA, SREF): Produce rvalue. |
| @@ -524,8 +534,8 @@ | |||
| 524 | * buffer.c (Fother_buffer): Use SREF when retrieving a byte from | 534 | * buffer.c (Fother_buffer): Use SREF when retrieving a byte from |
| 525 | a string. | 535 | a string. |
| 526 | * casefiddle.c (casify_object): Use SSET. | 536 | * casefiddle.c (casify_object): Use SSET. |
| 527 | * charset.h (FETCH_STRING_CHAR_ADVANCE, | 537 | * charset.h (FETCH_STRING_CHAR_ADVANCE) |
| 528 | FETCH_STRING_CHAR_ADVANCE_NO_CHECK): Use SDATA when getting | 538 | (FETCH_STRING_CHAR_ADVANCE_NO_CHECK): Use SDATA when getting |
| 529 | address of string contents. | 539 | address of string contents. |
| 530 | * data.c (Faref): Use SDATA. | 540 | * data.c (Faref): Use SDATA. |
| 531 | (Faset): Use SDATA, SSET. | 541 | (Faset): Use SDATA, SSET. |
diff --git a/src/regex.c b/src/regex.c index 591d6f14e12..e01259cc85a 100644 --- a/src/regex.c +++ b/src/regex.c | |||
| @@ -19,7 +19,9 @@ | |||
| 19 | Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, | 19 | Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, |
| 20 | USA. */ | 20 | USA. */ |
| 21 | 21 | ||
| 22 | /* TODO: | 22 | /* BUGS: |
| 23 | - (x?)*y\1z should match both xxxxyxz and xxxyz. | ||
| 24 | TODO: | ||
| 23 | - structure the opcode space into opcode+flag. | 25 | - structure the opcode space into opcode+flag. |
| 24 | - merge with glibc's regex.[ch]. | 26 | - merge with glibc's regex.[ch]. |
| 25 | - replace (succeed_n + jump_n + set_number_at) with something that doesn't | 27 | - replace (succeed_n + jump_n + set_number_at) with something that doesn't |
| @@ -1682,17 +1684,9 @@ static re_char *skip_one_char _RE_ARGS ((re_char *p)); | |||
| 1682 | static int analyse_first _RE_ARGS ((re_char *p, re_char *pend, | 1684 | static int analyse_first _RE_ARGS ((re_char *p, re_char *pend, |
| 1683 | char *fastmap, const int multibyte)); | 1685 | char *fastmap, const int multibyte)); |
| 1684 | 1686 | ||
| 1685 | /* Fetch the next character in the uncompiled pattern---translating it | ||
| 1686 | if necessary. */ | ||
| 1687 | #define PATFETCH(c) \ | ||
| 1688 | do { \ | ||
| 1689 | PATFETCH_RAW (c); \ | ||
| 1690 | c = TRANSLATE (c); \ | ||
| 1691 | } while (0) | ||
| 1692 | |||
| 1693 | /* Fetch the next character in the uncompiled pattern, with no | 1687 | /* Fetch the next character in the uncompiled pattern, with no |
| 1694 | translation. */ | 1688 | translation. */ |
| 1695 | #define PATFETCH_RAW(c) \ | 1689 | #define PATFETCH(c) \ |
| 1696 | do { \ | 1690 | do { \ |
| 1697 | int len; \ | 1691 | int len; \ |
| 1698 | if (p == pend) return REG_EEND; \ | 1692 | if (p == pend) return REG_EEND; \ |
| @@ -1914,12 +1908,13 @@ struct range_table_work_area | |||
| 1914 | #define BIT_UPPER 0x10 | 1908 | #define BIT_UPPER 0x10 |
| 1915 | #define BIT_MULTIBYTE 0x20 | 1909 | #define BIT_MULTIBYTE 0x20 |
| 1916 | 1910 | ||
| 1917 | /* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */ | 1911 | /* Set a range START..END to WORK_AREA. |
| 1918 | #define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \ | 1912 | The range is passed through TRANSLATE, so START and END |
| 1919 | do { \ | 1913 | should be untranslated. */ |
| 1920 | EXTEND_RANGE_TABLE_WORK_AREA ((work_area), 2); \ | 1914 | #define SET_RANGE_TABLE_WORK_AREA(work_area, start, end) \ |
| 1921 | (work_area).table[(work_area).used++] = (range_start); \ | 1915 | do { \ |
| 1922 | (work_area).table[(work_area).used++] = (range_end); \ | 1916 | EXTEND_RANGE_TABLE_WORK_AREA ((work_area), 2); \ |
| 1917 | set_image_of_range (&work_area, start, end, translate); \ | ||
| 1923 | } while (0) | 1918 | } while (0) |
| 1924 | 1919 | ||
| 1925 | /* Free allocated memory for WORK_AREA. */ | 1920 | /* Free allocated memory for WORK_AREA. */ |
| @@ -2077,6 +2072,31 @@ re_wctype_to_bit (cc) | |||
| 2077 | } | 2072 | } |
| 2078 | #endif | 2073 | #endif |
| 2079 | 2074 | ||
| 2075 | |||
| 2076 | |||
| 2077 | /* We need to find the image of the range start..end when passed through | ||
| 2078 | TRANSLATE. This is not necessarily TRANSLATE(start)..TRANSLATE(end) | ||
| 2079 | and is not even necessarily contiguous. | ||
| 2080 | We approximate it with the smallest contiguous range that contains | ||
| 2081 | all the chars we need. */ | ||
| 2082 | static void | ||
| 2083 | set_image_of_range (work_area, start, end, translate) | ||
| 2084 | RE_TRANSLATE_TYPE translate; | ||
| 2085 | struct range_table_work_area *work_area; | ||
| 2086 | re_wchar_t start, end; | ||
| 2087 | { | ||
| 2088 | re_wchar_t cmin = TRANSLATE (start), cmax = TRANSLATE (end); | ||
| 2089 | if (RE_TRANSLATE_P (translate)) | ||
| 2090 | for (; start <= end; start++) | ||
| 2091 | { | ||
| 2092 | re_wchar_t c = TRANSLATE (start); | ||
| 2093 | cmin = MIN (cmin, c); | ||
| 2094 | cmax = MAX (cmax, c); | ||
| 2095 | } | ||
| 2096 | work_area->table[work_area->used++] = (cmin); | ||
| 2097 | work_area->table[work_area->used++] = (cmax); | ||
| 2098 | } | ||
| 2099 | |||
| 2080 | /* Explicit quit checking is only used on NTemacs. */ | 2100 | /* Explicit quit checking is only used on NTemacs. */ |
| 2081 | #if defined WINDOWSNT && defined emacs && defined QUIT | 2101 | #if defined WINDOWSNT && defined emacs && defined QUIT |
| 2082 | extern int immediate_quit; | 2102 | extern int immediate_quit; |
| @@ -2525,6 +2545,10 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 2525 | 2545 | ||
| 2526 | if (p == pend) FREE_STACK_RETURN (REG_EBRACK); | 2546 | if (p == pend) FREE_STACK_RETURN (REG_EBRACK); |
| 2527 | 2547 | ||
| 2548 | /* Don't translate yet. The range TRANSLATE(X..Y) cannot | ||
| 2549 | always be determined from TRANSLATE(X) and TRANSLATE(Y). | ||
| 2550 | So the translation is done later in a loop. Example: | ||
| 2551 | (let ((case-fold-search t)) (string-match "[A-_]" "A")) */ | ||
| 2528 | PATFETCH (c); | 2552 | PATFETCH (c); |
| 2529 | 2553 | ||
| 2530 | /* \ might escape characters inside [...] and [^...]. */ | 2554 | /* \ might escape characters inside [...] and [^...]. */ |
| @@ -2584,7 +2608,7 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 2584 | them). */ | 2608 | them). */ |
| 2585 | if (c == ':' && *p == ']') | 2609 | if (c == ':' && *p == ']') |
| 2586 | { | 2610 | { |
| 2587 | int ch; | 2611 | re_wchar_t ch; |
| 2588 | re_wctype_t cc; | 2612 | re_wctype_t cc; |
| 2589 | 2613 | ||
| 2590 | cc = re_wctype (str); | 2614 | cc = re_wctype (str); |
| @@ -2653,8 +2677,8 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 2653 | starting at the smallest character in | 2677 | starting at the smallest character in |
| 2654 | the charset of C1 and ending at C1. */ | 2678 | the charset of C1 and ending at C1. */ |
| 2655 | int charset = CHAR_CHARSET (c1); | 2679 | int charset = CHAR_CHARSET (c1); |
| 2656 | int c2 = MAKE_CHAR (charset, 0, 0); | 2680 | re_wchar_t c2 = MAKE_CHAR (charset, 0, 0); |
| 2657 | 2681 | ||
| 2658 | SET_RANGE_TABLE_WORK_AREA (range_table_work, | 2682 | SET_RANGE_TABLE_WORK_AREA (range_table_work, |
| 2659 | c2, c1); | 2683 | c2, c1); |
| 2660 | c1 = 0377; | 2684 | c1 = 0377; |
| @@ -2672,7 +2696,7 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 2672 | /* ... into bitmap. */ | 2696 | /* ... into bitmap. */ |
| 2673 | { | 2697 | { |
| 2674 | re_wchar_t this_char; | 2698 | re_wchar_t this_char; |
| 2675 | int range_start = c, range_end = c1; | 2699 | re_wchar_t range_start = c, range_end = c1; |
| 2676 | 2700 | ||
| 2677 | /* If the start is after the end, the range is empty. */ | 2701 | /* If the start is after the end, the range is empty. */ |
| 2678 | if (range_start > range_end) | 2702 | if (range_start > range_end) |
| @@ -2769,7 +2793,7 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 2769 | /* Do not translate the character after the \, so that we can | 2793 | /* Do not translate the character after the \, so that we can |
| 2770 | distinguish, e.g., \B from \b, even if we normally would | 2794 | distinguish, e.g., \B from \b, even if we normally would |
| 2771 | translate, e.g., B to b. */ | 2795 | translate, e.g., B to b. */ |
| 2772 | PATFETCH_RAW (c); | 2796 | PATFETCH (c); |
| 2773 | 2797 | ||
| 2774 | switch (c) | 2798 | switch (c) |
| 2775 | { | 2799 | { |
| @@ -3129,13 +3153,13 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 3129 | 3153 | ||
| 3130 | case 'c': | 3154 | case 'c': |
| 3131 | laststart = b; | 3155 | laststart = b; |
| 3132 | PATFETCH_RAW (c); | 3156 | PATFETCH (c); |
| 3133 | BUF_PUSH_2 (categoryspec, c); | 3157 | BUF_PUSH_2 (categoryspec, c); |
| 3134 | break; | 3158 | break; |
| 3135 | 3159 | ||
| 3136 | case 'C': | 3160 | case 'C': |
| 3137 | laststart = b; | 3161 | laststart = b; |
| 3138 | PATFETCH_RAW (c); | 3162 | PATFETCH (c); |
| 3139 | BUF_PUSH_2 (notcategoryspec, c); | 3163 | BUF_PUSH_2 (notcategoryspec, c); |
| 3140 | break; | 3164 | break; |
| 3141 | #endif /* emacs */ | 3165 | #endif /* emacs */ |
| @@ -3225,7 +3249,6 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 3225 | /* You might think it would be useful for \ to mean | 3249 | /* You might think it would be useful for \ to mean |
| 3226 | not to translate; but if we don't translate it | 3250 | not to translate; but if we don't translate it |
| 3227 | it will never match anything. */ | 3251 | it will never match anything. */ |
| 3228 | c = TRANSLATE (c); | ||
| 3229 | goto normal_char; | 3252 | goto normal_char; |
| 3230 | } | 3253 | } |
| 3231 | break; | 3254 | break; |
| @@ -3234,7 +3257,7 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 3234 | default: | 3257 | default: |
| 3235 | /* Expects the character in `c'. */ | 3258 | /* Expects the character in `c'. */ |
| 3236 | normal_char: | 3259 | normal_char: |
| 3237 | /* If no exactn currently being built. */ | 3260 | /* If no exactn currently being built. */ |
| 3238 | if (!pending_exact | 3261 | if (!pending_exact |
| 3239 | 3262 | ||
| 3240 | /* If last exactn not at current position. */ | 3263 | /* If last exactn not at current position. */ |
| @@ -3265,6 +3288,7 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 3265 | { | 3288 | { |
| 3266 | int len; | 3289 | int len; |
| 3267 | 3290 | ||
| 3291 | c = TRANSLATE (c); | ||
| 3268 | if (multibyte) | 3292 | if (multibyte) |
| 3269 | len = CHAR_STRING (c, b); | 3293 | len = CHAR_STRING (c, b); |
| 3270 | else | 3294 | else |
| @@ -4427,7 +4451,7 @@ mutually_exclusive_p (bufp, p1, p2) | |||
| 4427 | they don't overlap. The union of the two sets of excluded | 4451 | they don't overlap. The union of the two sets of excluded |
| 4428 | chars should cover all possible chars, which, as a matter of | 4452 | chars should cover all possible chars, which, as a matter of |
| 4429 | fact, is virtually impossible in multibyte buffers. */ | 4453 | fact, is virtually impossible in multibyte buffers. */ |
| 4430 | ; | 4454 | break; |
| 4431 | } | 4455 | } |
| 4432 | break; | 4456 | break; |
| 4433 | 4457 | ||