aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--src/regex-emacs.c127
1 files changed, 86 insertions, 41 deletions
diff --git a/src/regex-emacs.c b/src/regex-emacs.c
index 55d0a6e8df8..06befe1b189 100644
--- a/src/regex-emacs.c
+++ b/src/regex-emacs.c
@@ -1923,12 +1923,22 @@ regex_compile (re_char *pattern, ptrdiff_t size,
1923 1923
1924 if (!zero_times_ok && simple) 1924 if (!zero_times_ok && simple)
1925 { /* Since simple * loops can be made faster by using 1925 { /* Since simple * loops can be made faster by using
1926 on_failure_keep_string_jump, we turn simple P+ 1926 on_failure_keep_string_jump, we turn P+ into PP*
1927 into PP* if P is simple. */ 1927 if P is simple.
1928 We can't use `top: <BODY>; OFJS exit; J top; exit:`
1929 because the OFJS needs to be at the beginning
1930 so we can replace
1931 top: OFJS exit; <BODY>; J top; exit
1932 with
1933 OFKSJ exit; loop: <BODY>; J loop; exit
1934 i.e. a single OFJ at the beginning of the loop
1935 rather than once per iteration. */
1928 unsigned char *p1, *p2; 1936 unsigned char *p1, *p2;
1929 startoffset = b - laststart; 1937 startoffset = b - laststart;
1930 GET_BUFFER_SPACE (startoffset); 1938 GET_BUFFER_SPACE (startoffset);
1931 p1 = b; p2 = laststart; 1939 p1 = b; p2 = laststart;
1940 /* We presume that the code skipped
1941 by `skip_one_char` is position-independent. */
1932 while (p2 < p1) 1942 while (p2 < p1)
1933 *b++ = *p2++; 1943 *b++ = *p2++;
1934 zero_times_ok = 1; 1944 zero_times_ok = 1;
@@ -3068,8 +3078,10 @@ analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte)
3068 continue; 3078 continue;
3069 3079
3070 case succeed_n: 3080 case succeed_n:
3071 /* If N == 0, it should be an on_failure_jump_loop instead. */ 3081 /* If N == 0, it should be an on_failure_jump_loop instead.
3072 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); eassert (j > 0)); 3082 `j` can be negative because `EXTRACT_NUMBER` extracts a
3083 signed number whereas `succeed_n` treats it as unsigned. */
3084 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); eassert (j != 0));
3073 p += 4; 3085 p += 4;
3074 /* We only care about one iteration of the loop, so we don't 3086 /* We only care about one iteration of the loop, so we don't
3075 need to consider the case where this behaves like an 3087 need to consider the case where this behaves like an
@@ -3743,20 +3755,32 @@ mutually_exclusive_charset (struct re_pattern_buffer *bufp, re_char *p1,
3743} 3755}
3744 3756
3745/* True if "p1 matches something" implies "p2 fails". */ 3757/* True if "p1 matches something" implies "p2 fails". */
3746/* Avoiding inf-loops: 3758/* We're trying to follow all paths reachable from `p2`, but since some
3747 We're trying to follow all paths reachable from `p2`, but since some
3748 loops can match the empty string, this can loop back to `p2`. 3759 loops can match the empty string, this can loop back to `p2`.
3749 To avoid inf-looping, we keep track of points that have been considered 3760
3750 "already". Instead of keeping a list of such points, `done_beg` and 3761 To avoid inf-looping, we take advantage of the fact that
3751 `done_end` delimit a chunk of bytecode we already considered. 3762 the bytecode we generate is made of syntactically nested loops, more
3752 To guarantee termination, a lexical ordering between `done_*` and `p2` 3763 specifically, every loop has a single entry point and single exit point.
3753 should be obeyed: 3764
3754 At each recursion, either `done_beg` gets smaller, 3765 The function takes 2 more arguments (`loop_entry` and `loop_exit`).
3755 or `done_beg` is unchanged and `done_end` gets larger 3766 `loop_entry` points to the sole entry point of the current loop and
3756 or they're both unchanged and `p2` gets larger. */ 3767 `loop_exit` points to its sole exit point (when non-NULL).
3768
3769 Jumps outside of `loop_entry..exit` should not occur.
3770 The function can assume that `loop_exit` is "mutually exclusive".
3771 The same holds for `loop_entry` except when `p2 == loop_entry`.
3772
3773 To guarantee termination, recursive calls should make sure that either
3774 `loop_entry` is larger, or it's unchanged but `p2` is larger.
3775
3776 FIXME: This is failsafe (can't return true when it shouldn't)
3777 but it could be too conservative if we start generating bytecode
3778 with a different shape, so maybe we should bite the bullet and
3779 replace done_beg/end with an actual list of positions we've
3780 already processed. */
3757static bool 3781static bool
3758mutually_exclusive_aux (struct re_pattern_buffer *bufp, re_char *p1, 3782mutually_exclusive_aux (struct re_pattern_buffer *bufp, re_char *p1,
3759 re_char *p2, re_char *done_beg, re_char *done_end) 3783 re_char *p2, re_char *loop_entry, re_char *loop_exit)
3760{ 3784{
3761 re_opcode_t op2; 3785 re_opcode_t op2;
3762 unsigned char *pend = bufp->buffer + bufp->used; 3786 unsigned char *pend = bufp->buffer + bufp->used;
@@ -3765,8 +3789,15 @@ mutually_exclusive_aux (struct re_pattern_buffer *bufp, re_char *p1,
3765 eassert (p1 >= bufp->buffer && p1 < pend 3789 eassert (p1 >= bufp->buffer && p1 < pend
3766 && p2 >= bufp->buffer && p2 <= pend); 3790 && p2 >= bufp->buffer && p2 <= pend);
3767 3791
3768 eassert (done_beg <= done_end); 3792 if (p2 == loop_exit)
3769 eassert (done_end <= p2); 3793 return true; /* Presumably already checked elsewhere. */
3794 eassert (loop_entry && p2 >= loop_entry);
3795 if (p2 < loop_entry || (loop_exit && p2 > loop_exit))
3796 /* The assumptions about the shape of the code aren't true :-( */
3797#ifdef ENABLE_CHECKING
3798 error ("Broken assumption in regex.c:mutually_exclusive_aux");
3799#endif
3800 return false;
3770 3801
3771 /* Skip over open/close-group commands. 3802 /* Skip over open/close-group commands.
3772 If what follows this loop is a ...+ construct, 3803 If what follows this loop is a ...+ construct,
@@ -3858,29 +3889,43 @@ mutually_exclusive_aux (struct re_pattern_buffer *bufp, re_char *p1,
3858 int mcnt; 3889 int mcnt;
3859 p2++; 3890 p2++;
3860 EXTRACT_NUMBER_AND_INCR (mcnt, p2); 3891 EXTRACT_NUMBER_AND_INCR (mcnt, p2);
3861 re_char *p2_other = p2 + mcnt; 3892 re_char *p2_other = p2 + mcnt, *tmp;
3862 3893 /* For `+` loops, we often have an `on_failure_jump` that skips forward
3863 /* When we jump backward we bump `done_end` up to `p3` under 3894 over a subsequent `jump` for lack of an `on_failure_dont_jump`
3864 the assumption that any other position between `done_end` 3895 kind of thing. Recognize this pattern since that subsequent
3865 and `p3` is either: 3896 `jump` is the one that jumps to the loop-entry. */
3866 - checked by the other call to RECURSE. 3897 if ((re_opcode_t) p2[0] == jump && mcnt == 3)
3867 - not reachable from here (e.g. for positions before the 3898 {
3868 `on_failure_jump`), or at least not without first 3899 EXTRACT_NUMBER (mcnt, p2 + 1);
3869 jumping before `done_beg`. 3900 p2 += mcnt + 3;
3870 This should hold because our state machines are not arbitrary: 3901 }
3871 they consists of syntaxically nested loops with limited 3902
3872 control flow. 3903 /* We have to check that both destinations are safe.
3873 FIXME: This can fail (i.e. return true when it shouldn't) 3904 Arrange for `p2` to be the smaller of the two. */
3874 if we start generating bytecode with a different shape, 3905 if (p2 > p2_other)
3875 so maybe we should bite the bullet and replace done_beg/end 3906 (tmp = p2, p2 = p2_other, p2_other = tmp);
3876 with an actual list of positions we've already processed. */ 3907
3877#define RECURSE(p3) \ 3908 if (p2_other <= p2_orig /* Both destinations go backward! */
3878 ((p3) < done_beg ? mutually_exclusive_aux (bufp, p1, p3, p3, p3) \ 3909 || !mutually_exclusive_aux (bufp, p1, p2_other,
3879 : (p3) <= done_end ? true \ 3910 loop_entry, loop_exit))
3880 : mutually_exclusive_aux (bufp, p1, p3, done_beg, \ 3911 return false;
3881 (p3) > p2_orig ? done_end : (p3))) 3912
3882 3913 /* Now that we know that `p2_other` is a safe (i.e. mutually-exclusive)
3883 return RECURSE (p2) && RECURSE (p2_other); 3914 position, let's check `p2`. */
3915 if (p2 == loop_entry)
3916 /* If we jump backward to the entry point of the current loop
3917 it means it's a zero-length cycle through that loop, so
3918 this cycle itself does not break mutual-exclusion. */
3919 return true;
3920 else if (p2 > p2_orig)
3921 /* Boring forward jump. */
3922 return mutually_exclusive_aux (bufp, p1, p2, loop_entry, loop_exit);
3923 else if (loop_entry < p2 && p2 < p2_orig)
3924 /* We jump backward to a new loop, nested within the current one.
3925 `p2` is the entry point and `p2_other` the exit of that inner. */
3926 return mutually_exclusive_aux (bufp, p1, p2, p2, p2_other);
3927 else
3928 return false;
3884 } 3929 }
3885 3930
3886 default: 3931 default:
@@ -3895,7 +3940,7 @@ static bool
3895mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1, 3940mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1,
3896 re_char *p2) 3941 re_char *p2)
3897{ 3942{
3898 return mutually_exclusive_aux (bufp, p1, p2, p2, p2); 3943 return mutually_exclusive_aux (bufp, p1, p2, bufp->buffer, NULL);
3899} 3944}
3900 3945
3901/* Matching routines. */ 3946/* Matching routines. */