aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorStefan Monnier2000-03-27 22:26:54 +0000
committerStefan Monnier2000-03-27 22:26:54 +0000
commited0767d80f1d3ae00b4a89576b009e44398cc1ba (patch)
tree3628c75f0a930d59d45d3ba1b0dc9c38b1c4e7a5 /src
parente11e7e46bcd0f5310d5a8ed56908993d8662866f (diff)
downloademacs-ed0767d80f1d3ae00b4a89576b009e44398cc1ba.tar.gz
emacs-ed0767d80f1d3ae00b4a89576b009e44398cc1ba.zip
(REGEX_FREE_STACK, RESET_FAIL_STACK): Make them usable as an expression.
(enum re_opcode_t): Update description of succeed_n. (PATFETCH): Always define. (regex_compile): Use lookahead rather than PATUNFETCH (for repetition operators, char classes, shy-groups and intervals). Optimize special cases of intervals so as to only use succeed_n and jump_n when really needed. (re_compile_fastmap): Simplify handling of jump_n and succeed_n now that we don't have to handle the special cases any more. Simplify on_failure_jump handling as well.
Diffstat (limited to 'src')
-rw-r--r--src/ChangeLog14
-rw-r--r--src/regex.c227
2 files changed, 118 insertions, 123 deletions
diff --git a/src/ChangeLog b/src/ChangeLog
index 3a01bac0160..d3ea1be941d 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,17 @@
12000-03-27 Stefan Monnier <monnier@cs.yale.edu>
2
3 * regex.c (REGEX_FREE_STACK, RESET_FAIL_STACK): Make them usable as
4 an expression.
5 (enum re_opcode_t): Update description of succeed_n.
6 (PATFETCH): Always define.
7 (regex_compile): Use lookahead rather than PATUNFETCH (for repetition
8 operators, char classes, shy-groups and intervals).
9 Optimize special cases of intervals so as to only use succeed_n and
10 jump_n when really needed.
11 (re_compile_fastmap): Simplify handling of jump_n and succeed_n now
12 that we don't have to handle the special cases any more.
13 Simplify on_failure_jump handling as well.
14
12000-03-28 Jason Rumney <jasonr@gnu.org> 152000-03-28 Jason Rumney <jasonr@gnu.org>
2 16
3 * lread.c (Fload): Move safe_p definition to above #ifdef DOS_NT. 17 * lread.c (Fload): Move safe_p definition to above #ifdef DOS_NT.
diff --git a/src/regex.c b/src/regex.c
index 671f6c152f3..f7ca3a7b8b4 100644
--- a/src/regex.c
+++ b/src/regex.c
@@ -21,7 +21,6 @@
21 21
22/* TODO: 22/* TODO:
23 - use analyze_first to optimize non-empty loops 23 - use analyze_first to optimize non-empty loops
24 - optimize succeed_n and jump_n away when possible
25 - clean up multibyte issues 24 - clean up multibyte issues
26 - structure the opcode space into opcode+flag. 25 - structure the opcode space into opcode+flag.
27 - merge with glic's regex.[ch] 26 - merge with glic's regex.[ch]
@@ -411,7 +410,7 @@ char *alloca ();
411#define REGEX_REALLOCATE_STACK(source, osize, nsize) \ 410#define REGEX_REALLOCATE_STACK(source, osize, nsize) \
412 REGEX_REALLOCATE (source, osize, nsize) 411 REGEX_REALLOCATE (source, osize, nsize)
413/* No need to explicitly free anything. */ 412/* No need to explicitly free anything. */
414#define REGEX_FREE_STACK(arg) 413#define REGEX_FREE_STACK(arg) ((void)0)
415 414
416#endif /* not REGEX_MALLOC */ 415#endif /* not REGEX_MALLOC */
417#endif /* not using relocating allocator */ 416#endif /* not using relocating allocator */
@@ -546,7 +545,9 @@ typedef enum
546 on_failure_jump_smart, 545 on_failure_jump_smart,
547 546
548 /* Followed by two-byte relative address and two-byte number n. 547 /* Followed by two-byte relative address and two-byte number n.
549 After matching N times, jump to the address upon failure. */ 548 After matching N times, jump to the address upon failure.
549 Does not work if N starts at 0: use on_failure_jump_loop
550 instead. */
550 succeed_n, 551 succeed_n,
551 552
552 /* Followed by two-byte relative address, and two-byte number n. 553 /* Followed by two-byte relative address, and two-byte number n.
@@ -1289,7 +1290,7 @@ typedef struct
1289 fail_stack.frame = 0; \ 1290 fail_stack.frame = 0; \
1290 } while (0) 1291 } while (0)
1291 1292
1292#define RESET_FAIL_STACK() 1293#define RESET_FAIL_STACK() ((void)0)
1293#endif 1294#endif
1294 1295
1295 1296
@@ -1546,13 +1547,11 @@ static unsigned char *skip_one_char _RE_ARGS((unsigned char *p));
1546 if necessary. Also cast from a signed character in the constant 1547 if necessary. Also cast from a signed character in the constant
1547 string passed to us by the user to an unsigned char that we can use 1548 string passed to us by the user to an unsigned char that we can use
1548 as an array index (in, e.g., `translate'). */ 1549 as an array index (in, e.g., `translate'). */
1549#ifndef PATFETCH
1550#define PATFETCH(c) \ 1550#define PATFETCH(c) \
1551 do { \ 1551 do { \
1552 PATFETCH_RAW (c); \ 1552 PATFETCH_RAW (c); \
1553 if (RE_TRANSLATE_P (translate)) c = RE_TRANSLATE (translate, c); \ 1553 if (RE_TRANSLATE_P (translate)) c = RE_TRANSLATE (translate, c); \
1554 } while (0) 1554 } while (0)
1555#endif
1556 1555
1557/* Fetch the next character in the uncompiled pattern, with no 1556/* Fetch the next character in the uncompiled pattern, with no
1558 translation. */ 1557 translation. */
@@ -2103,35 +2102,24 @@ regex_compile (pattern, size, syntax, bufp)
2103 2102
2104 if (p == pend) 2103 if (p == pend)
2105 break; 2104 break;
2106 2105 else if (*p == '*'
2107 PATFETCH (c); 2106 || (!(syntax & RE_BK_PLUS_QM)
2108 2107 && (*p == '+' || *p == '?')))
2109 if (c == '*'
2110 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
2111 ; 2108 ;
2112 2109 else if (syntax & RE_BK_PLUS_QM && *p == '\\')
2113 else if (syntax & RE_BK_PLUS_QM && c == '\\')
2114 { 2110 {
2115 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); 2111 if (p+1 == pend)
2116 2112 FREE_STACK_RETURN (REG_EESCAPE);
2117 PATFETCH (c1); 2113 if (p[1] == '+' || p[1] == '?')
2118 if (!(c1 == '+' || c1 == '?')) 2114 PATFETCH (c); /* Gobble up the backslash. */
2119 { 2115 else
2120 PATUNFETCH; 2116 break;
2121 PATUNFETCH;
2122 break;
2123 }
2124
2125 c = c1;
2126 } 2117 }
2127 else 2118 else
2128 { 2119 break;
2129 PATUNFETCH;
2130 break;
2131 }
2132
2133 /* If we get here, we found another repeat character. */ 2120 /* If we get here, we found another repeat character. */
2134 } 2121 PATFETCH (c);
2122 }
2135 2123
2136 /* Star, etc. applied to an empty pattern is equivalent 2124 /* Star, etc. applied to an empty pattern is equivalent
2137 to an empty pattern. */ 2125 to an empty pattern. */
@@ -2306,9 +2294,11 @@ regex_compile (pattern, size, syntax, bufp)
2306 { 2294 {
2307 /* Leave room for the null. */ 2295 /* Leave room for the null. */
2308 char str[CHAR_CLASS_MAX_LENGTH + 1]; 2296 char str[CHAR_CLASS_MAX_LENGTH + 1];
2297 const unsigned char *class_beg;
2309 2298
2310 PATFETCH (c); 2299 PATFETCH (c);
2311 c1 = 0; 2300 c1 = 0;
2301 class_beg = p;
2312 2302
2313 /* If pattern is `[[:'. */ 2303 /* If pattern is `[[:'. */
2314 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 2304 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
@@ -2421,9 +2411,8 @@ regex_compile (pattern, size, syntax, bufp)
2421 } 2411 }
2422 else 2412 else
2423 { 2413 {
2424 c1++; 2414 /* Go back to right after the "[:". */
2425 while (c1--) 2415 p = class_beg;
2426 PATUNFETCH;
2427 SET_LIST_BIT ('['); 2416 SET_LIST_BIT ('[');
2428 2417
2429 /* Because the `:' may starts the range, we 2418 /* Because the `:' may starts the range, we
@@ -2584,9 +2573,9 @@ regex_compile (pattern, size, syntax, bufp)
2584 if (p+1 < pend) 2573 if (p+1 < pend)
2585 { 2574 {
2586 /* Look for a special (?...) construct */ 2575 /* Look for a special (?...) construct */
2587 PATFETCH (c); 2576 if ((syntax & RE_SHY_GROUPS) && *p == '?')
2588 if ((syntax & RE_SHY_GROUPS) && c == '?')
2589 { 2577 {
2578 PATFETCH (c); /* Gobble up the '?'. */
2590 PATFETCH (c); 2579 PATFETCH (c);
2591 switch (c) 2580 switch (c)
2592 { 2581 {
@@ -2596,7 +2585,6 @@ regex_compile (pattern, size, syntax, bufp)
2596 FREE_STACK_RETURN (REG_BADPAT); 2585 FREE_STACK_RETURN (REG_BADPAT);
2597 } 2586 }
2598 } 2587 }
2599 else PATUNFETCH;
2600 } 2588 }
2601 2589
2602 if (!shy) 2590 if (!shy)
@@ -2744,8 +2732,9 @@ regex_compile (pattern, size, syntax, bufp)
2744 if (!(syntax & RE_INTERVALS) 2732 if (!(syntax & RE_INTERVALS)
2745 /* If we're at `\{' and it's not the open-interval 2733 /* If we're at `\{' and it's not the open-interval
2746 operator. */ 2734 operator. */
2747 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 2735 || (syntax & RE_NO_BK_BRACES)
2748 || (p - 2 == pattern && p == pend)) 2736 /* What is that? -sm */
2737 /* || (p - 2 == pattern && p == pend) */)
2749 goto normal_backslash; 2738 goto normal_backslash;
2750 2739
2751 handle_interval: 2740 handle_interval:
@@ -2755,7 +2744,7 @@ regex_compile (pattern, size, syntax, bufp)
2755 /* At least (most) this many matches must be made. */ 2744 /* At least (most) this many matches must be made. */
2756 int lower_bound = 0, upper_bound = -1; 2745 int lower_bound = 0, upper_bound = -1;
2757 2746
2758 beg_interval = p - 1; 2747 beg_interval = p;
2759 2748
2760 if (p == pend) 2749 if (p == pend)
2761 { 2750 {
@@ -2768,16 +2757,13 @@ regex_compile (pattern, size, syntax, bufp)
2768 GET_UNSIGNED_NUMBER (lower_bound); 2757 GET_UNSIGNED_NUMBER (lower_bound);
2769 2758
2770 if (c == ',') 2759 if (c == ',')
2771 { 2760 GET_UNSIGNED_NUMBER (upper_bound);
2772 GET_UNSIGNED_NUMBER (upper_bound);
2773 if (upper_bound < 0) upper_bound = RE_DUP_MAX;
2774 }
2775 else 2761 else
2776 /* Interval such as `{1}' => match exactly once. */ 2762 /* Interval such as `{1}' => match exactly once. */
2777 upper_bound = lower_bound; 2763 upper_bound = lower_bound;
2778 2764
2779 if (lower_bound < 0 || upper_bound > RE_DUP_MAX 2765 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
2780 || lower_bound > upper_bound) 2766 || (upper_bound >= 0 && lower_bound > upper_bound))
2781 { 2767 {
2782 if (syntax & RE_NO_BK_BRACES) 2768 if (syntax & RE_NO_BK_BRACES)
2783 goto unfetch_interval; 2769 goto unfetch_interval;
@@ -2813,15 +2799,13 @@ regex_compile (pattern, size, syntax, bufp)
2813 goto unfetch_interval; 2799 goto unfetch_interval;
2814 } 2800 }
2815 2801
2816 /* If the upper bound is zero, don't want to succeed at
2817 all; jump from `laststart' to `b + 3', which will be
2818 the end of the buffer after we insert the jump. */
2819 if (upper_bound == 0) 2802 if (upper_bound == 0)
2820 { 2803 /* If the upper bound is zero, just drop the sub pattern
2821 GET_BUFFER_SPACE (3); 2804 altogether. */
2822 INSERT_JUMP (jump, laststart, b + 3); 2805 b = laststart;
2823 b += 3; 2806 else if (lower_bound == 1 && upper_bound == 1)
2824 } 2807 /* Just match it once: nothing to do here. */
2808 ;
2825 2809
2826 /* Otherwise, we have a nontrivial interval. When 2810 /* Otherwise, we have a nontrivial interval. When
2827 we're all done, the pattern will look like: 2811 we're all done, the pattern will look like:
@@ -2835,28 +2819,49 @@ regex_compile (pattern, size, syntax, bufp)
2835 else 2819 else
2836 { /* If the upper bound is > 1, we need to insert 2820 { /* If the upper bound is > 1, we need to insert
2837 more at the end of the loop. */ 2821 more at the end of the loop. */
2838 unsigned nbytes = 10 + (upper_bound > 1) * 10; 2822 unsigned int nbytes = (upper_bound < 0 ? 3
2839 2823 : upper_bound > 1 ? 5 : 0);
2840 GET_BUFFER_SPACE (nbytes); 2824 unsigned int startoffset = 0;
2841 2825
2842 /* Initialize lower bound of the `succeed_n', even 2826 GET_BUFFER_SPACE (20); /* We might use less. */
2843 though it will be set during matching by its 2827
2844 attendant `set_number_at' (inserted next), 2828 if (lower_bound == 0)
2845 because `re_compile_fastmap' needs to know. 2829 {
2846 Jump to the `jump_n' we might insert below. */ 2830 /* A succeed_n that starts with 0 is really a
2847 INSERT_JUMP2 (succeed_n, laststart, 2831 a simple on_failure_jump_loop. */
2848 b + 5 + (upper_bound > 1) * 5, 2832 INSERT_JUMP (on_failure_jump_loop, laststart,
2849 lower_bound); 2833 b + 3 + nbytes);
2850 b += 5; 2834 b += 3;
2851 2835 }
2852 /* Code to initialize the lower bound. Insert 2836 else
2853 before the `succeed_n'. The `5' is the last two 2837 {
2854 bytes of this `set_number_at', plus 3 bytes of 2838 /* Initialize lower bound of the `succeed_n', even
2855 the following `succeed_n'. */ 2839 though it will be set during matching by its
2856 insert_op2 (set_number_at, laststart, 5, lower_bound, b); 2840 attendant `set_number_at' (inserted next),
2857 b += 5; 2841 because `re_compile_fastmap' needs to know.
2858 2842 Jump to the `jump_n' we might insert below. */
2859 if (upper_bound > 1) 2843 INSERT_JUMP2 (succeed_n, laststart,
2844 b + 5 + nbytes,
2845 lower_bound);
2846 b += 5;
2847
2848 /* Code to initialize the lower bound. Insert
2849 before the `succeed_n'. The `5' is the last two
2850 bytes of this `set_number_at', plus 3 bytes of
2851 the following `succeed_n'. */
2852 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
2853 b += 5;
2854 startoffset += 5;
2855 }
2856
2857 if (upper_bound < 0)
2858 {
2859 /* A negative upper bound stands for infinity,
2860 in which case it degenerates to a plain jump. */
2861 STORE_JUMP (jump, b, laststart + startoffset);
2862 b += 3;
2863 }
2864 else if (upper_bound > 1)
2860 { /* More than one repetition is allowed, so 2865 { /* More than one repetition is allowed, so
2861 append a backward jump to the `succeed_n' 2866 append a backward jump to the `succeed_n'
2862 that starts this interval. 2867 that starts this interval.
@@ -2864,7 +2869,7 @@ regex_compile (pattern, size, syntax, bufp)
2864 When we've reached this during matching, 2869 When we've reached this during matching,
2865 we'll have matched the interval once, so 2870 we'll have matched the interval once, so
2866 jump back only `upper_bound - 1' times. */ 2871 jump back only `upper_bound - 1' times. */
2867 STORE_JUMP2 (jump_n, b, laststart + 5, 2872 STORE_JUMP2 (jump_n, b, laststart + startoffset,
2868 upper_bound - 1); 2873 upper_bound - 1);
2869 b += 5; 2874 b += 5;
2870 2875
@@ -2899,14 +2904,15 @@ regex_compile (pattern, size, syntax, bufp)
2899 beg_interval = NULL; 2904 beg_interval = NULL;
2900 2905
2901 /* normal_char and normal_backslash need `c'. */ 2906 /* normal_char and normal_backslash need `c'. */
2902 PATFETCH (c); 2907 c = '{';
2903 2908
2904 if (!(syntax & RE_NO_BK_BRACES)) 2909 if (!(syntax & RE_NO_BK_BRACES))
2905 { 2910 {
2906 if (p > pattern && p[-1] == '\\') 2911 assert (p > pattern && p[-1] == '\\');
2907 goto normal_backslash; 2912 goto normal_backslash;
2908 } 2913 }
2909 goto normal_char; 2914 else
2915 goto normal_char;
2910 2916
2911#ifdef emacs 2917#ifdef emacs
2912 /* There is no way to specify the before_dot and after_dot 2918 /* There is no way to specify the before_dot and after_dot
@@ -3311,9 +3317,6 @@ re_compile_fastmap (bufp)
3311 match the empty string. */ 3317 match the empty string. */
3312 boolean path_can_be_null = true; 3318 boolean path_can_be_null = true;
3313 3319
3314 /* We aren't doing a `succeed_n' to begin with. */
3315 boolean succeed_n_p = false;
3316
3317 /* If all elements for base leading-codes in fastmap is set, this 3320 /* If all elements for base leading-codes in fastmap is set, this
3318 flag is set true. */ 3321 flag is set true. */
3319 boolean match_any_multibyte_characters = false; 3322 boolean match_any_multibyte_characters = false;
@@ -3353,7 +3356,7 @@ re_compile_fastmap (bufp)
3353 as used for the *? operator. */ 3356 as used for the *? operator. */
3354 unsigned char *p1 = p; 3357 unsigned char *p1 = p;
3355 3358
3356 if (p == pend || *p == succeed) 3359 if (p >= pend || *p == succeed)
3357 { 3360 {
3358 /* We have reached the (effective) end of pattern. */ 3361 /* We have reached the (effective) end of pattern. */
3359 if (!PATTERN_STACK_EMPTY ()) 3362 if (!PATTERN_STACK_EMPTY ())
@@ -3510,7 +3513,6 @@ re_compile_fastmap (bufp)
3510 continue; 3513 continue;
3511 3514
3512 3515
3513 case jump_n:
3514 case jump: 3516 case jump:
3515 EXTRACT_NUMBER_AND_INCR (j, p); 3517 EXTRACT_NUMBER_AND_INCR (j, p);
3516 if (j < 0) 3518 if (j < 0)
@@ -3539,51 +3541,30 @@ re_compile_fastmap (bufp)
3539 case on_failure_jump_nastyloop: 3541 case on_failure_jump_nastyloop:
3540 case on_failure_jump_loop: 3542 case on_failure_jump_loop:
3541 case on_failure_jump_smart: 3543 case on_failure_jump_smart:
3542 handle_on_failure_jump:
3543 EXTRACT_NUMBER_AND_INCR (j, p); 3544 EXTRACT_NUMBER_AND_INCR (j, p);
3544
3545 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
3546 end of the pattern. We don't want to push such a point,
3547 since when we restore it above, entering the switch will
3548 increment `p' past the end of the pattern. We don't need
3549 to push such a point since we obviously won't find any more
3550 fastmap entries beyond `pend'. Such a pattern can match
3551 the null string, though. */
3552 if (p + j <= p1) 3545 if (p + j <= p1)
3553 /* Backward jump to be ignored. */ 3546 ; /* Backward jump to be ignored. */
3554 ; 3547 else if (!PUSH_PATTERN_OP (p + j, fail_stack))
3555 else if (p + j < pend) 3548 return (RESET_FAIL_STACK (), -2);
3556 {
3557 if (!PUSH_PATTERN_OP (p + j, fail_stack))
3558 {
3559 RESET_FAIL_STACK ();
3560 return -2;
3561 }
3562 }
3563 else
3564 bufp->can_be_null = 1;
3565
3566 if (succeed_n_p)
3567 {
3568 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
3569 succeed_n_p = false;
3570 }
3571
3572 continue; 3549 continue;
3573 3550
3574 3551
3552 case jump_n:
3553 /* This code simply does not properly handle forward jump_n. */
3554 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); assert (j < 0));
3555 p += 4;
3556 /* jump_n can either jump or fall through. The (backward) jump
3557 case has already been handled, so we only need to look at the
3558 fallthrough case. */
3559 continue;
3560
3575 case succeed_n: 3561 case succeed_n:
3576 /* Get to the number of times to succeed. */ 3562 /* If N == 0, it should be an on_failure_jump_loop instead. */
3577 p += 2; 3563 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0));
3578 3564 p += 4;
3579 /* Increment p past the n for when k != 0. */ 3565 /* We only care about one iteration of the loop, so we don't
3580 EXTRACT_NUMBER_AND_INCR (k, p); 3566 need to consider the case where this behaves like an
3581 if (k == 0) 3567 on_failure_jump. */
3582 {
3583 p -= 4;
3584 succeed_n_p = true; /* Spaghetti code alert. */
3585 goto handle_on_failure_jump;
3586 }
3587 continue; 3568 continue;
3588 3569
3589 3570