diff options
| author | Stefan Monnier | 2023-09-21 11:47:31 -0400 |
|---|---|---|
| committer | Stefan Monnier | 2023-09-21 11:47:31 -0400 |
| commit | 424b35fa24f07f85c5287c682e140c1c8daa25d0 (patch) | |
| tree | dbf8c1b632dbb82e0ae736aa916ef622d1f58b25 | |
| parent | 57c6c067d35e519bc3787966cd6346904bc75e16 (diff) | |
| download | emacs-424b35fa24f07f85c5287c682e140c1c8daa25d0.tar.gz emacs-424b35fa24f07f85c5287c682e140c1c8daa25d0.zip | |
regex-emacs.c (mutually_exclusive_aux): Rework again
Rework the way we handle loops. This new code does not really work
better than the previous one, but it has the advantage of being "fail
safe" and also that we can dynamically check if our assumptions about
the shape of the bytecode are satisfied or not.
* src/regex-emacs.c (mutually_exclusive_aux): Replace `done_beg` and
`done_end` with `loop_beg` and `loop_end`.
(mutually_exclusive_p): Adjust accordingly.
(analyze_first): Fix incorrect assertion.
| -rw-r--r-- | src/regex-emacs.c | 127 |
1 files changed, 86 insertions, 41 deletions
diff --git a/src/regex-emacs.c b/src/regex-emacs.c index 55d0a6e8df8..06befe1b189 100644 --- a/src/regex-emacs.c +++ b/src/regex-emacs.c | |||
| @@ -1923,12 +1923,22 @@ regex_compile (re_char *pattern, ptrdiff_t size, | |||
| 1923 | 1923 | ||
| 1924 | if (!zero_times_ok && simple) | 1924 | if (!zero_times_ok && simple) |
| 1925 | { /* Since simple * loops can be made faster by using | 1925 | { /* Since simple * loops can be made faster by using |
| 1926 | on_failure_keep_string_jump, we turn simple P+ | 1926 | on_failure_keep_string_jump, we turn P+ into PP* |
| 1927 | into PP* if P is simple. */ | 1927 | if P is simple. |
| 1928 | We can't use `top: <BODY>; OFJS exit; J top; exit:` | ||
| 1929 | because the OFJS needs to be at the beginning | ||
| 1930 | so we can replace | ||
| 1931 | top: OFJS exit; <BODY>; J top; exit | ||
| 1932 | with | ||
| 1933 | OFKSJ exit; loop: <BODY>; J loop; exit | ||
| 1934 | i.e. a single OFJ at the beginning of the loop | ||
| 1935 | rather than once per iteration. */ | ||
| 1928 | unsigned char *p1, *p2; | 1936 | unsigned char *p1, *p2; |
| 1929 | startoffset = b - laststart; | 1937 | startoffset = b - laststart; |
| 1930 | GET_BUFFER_SPACE (startoffset); | 1938 | GET_BUFFER_SPACE (startoffset); |
| 1931 | p1 = b; p2 = laststart; | 1939 | p1 = b; p2 = laststart; |
| 1940 | /* We presume that the code skipped | ||
| 1941 | by `skip_one_char` is position-independent. */ | ||
| 1932 | while (p2 < p1) | 1942 | while (p2 < p1) |
| 1933 | *b++ = *p2++; | 1943 | *b++ = *p2++; |
| 1934 | zero_times_ok = 1; | 1944 | zero_times_ok = 1; |
| @@ -3068,8 +3078,10 @@ analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte) | |||
| 3068 | continue; | 3078 | continue; |
| 3069 | 3079 | ||
| 3070 | case succeed_n: | 3080 | case succeed_n: |
| 3071 | /* If N == 0, it should be an on_failure_jump_loop instead. */ | 3081 | /* If N == 0, it should be an on_failure_jump_loop instead. |
| 3072 | DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); eassert (j > 0)); | 3082 | `j` can be negative because `EXTRACT_NUMBER` extracts a |
| 3083 | signed number whereas `succeed_n` treats it as unsigned. */ | ||
| 3084 | DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); eassert (j != 0)); | ||
| 3073 | p += 4; | 3085 | p += 4; |
| 3074 | /* We only care about one iteration of the loop, so we don't | 3086 | /* We only care about one iteration of the loop, so we don't |
| 3075 | need to consider the case where this behaves like an | 3087 | need to consider the case where this behaves like an |
| @@ -3743,20 +3755,32 @@ mutually_exclusive_charset (struct re_pattern_buffer *bufp, re_char *p1, | |||
| 3743 | } | 3755 | } |
| 3744 | 3756 | ||
| 3745 | /* True if "p1 matches something" implies "p2 fails". */ | 3757 | /* True if "p1 matches something" implies "p2 fails". */ |
| 3746 | /* Avoiding inf-loops: | 3758 | /* We're trying to follow all paths reachable from `p2`, but since some |
| 3747 | We're trying to follow all paths reachable from `p2`, but since some | ||
| 3748 | loops can match the empty string, this can loop back to `p2`. | 3759 | loops can match the empty string, this can loop back to `p2`. |
| 3749 | To avoid inf-looping, we keep track of points that have been considered | 3760 | |
| 3750 | "already". Instead of keeping a list of such points, `done_beg` and | 3761 | To avoid inf-looping, we take advantage of the fact that |
| 3751 | `done_end` delimit a chunk of bytecode we already considered. | 3762 | the bytecode we generate is made of syntactically nested loops, more |
| 3752 | To guarantee termination, a lexical ordering between `done_*` and `p2` | 3763 | specifically, every loop has a single entry point and single exit point. |
| 3753 | should be obeyed: | 3764 | |
| 3754 | At each recursion, either `done_beg` gets smaller, | 3765 | The function takes 2 more arguments (`loop_entry` and `loop_exit`). |
| 3755 | or `done_beg` is unchanged and `done_end` gets larger | 3766 | `loop_entry` points to the sole entry point of the current loop and |
| 3756 | or they're both unchanged and `p2` gets larger. */ | 3767 | `loop_exit` points to its sole exit point (when non-NULL). |
| 3768 | |||
| 3769 | Jumps outside of `loop_entry..exit` should not occur. | ||
| 3770 | The function can assume that `loop_exit` is "mutually exclusive". | ||
| 3771 | The same holds for `loop_entry` except when `p2 == loop_entry`. | ||
| 3772 | |||
| 3773 | To guarantee termination, recursive calls should make sure that either | ||
| 3774 | `loop_entry` is larger, or it's unchanged but `p2` is larger. | ||
| 3775 | |||
| 3776 | FIXME: This is failsafe (can't return true when it shouldn't) | ||
| 3777 | but it could be too conservative if we start generating bytecode | ||
| 3778 | with a different shape, so maybe we should bite the bullet and | ||
| 3779 | replace done_beg/end with an actual list of positions we've | ||
| 3780 | already processed. */ | ||
| 3757 | static bool | 3781 | static bool |
| 3758 | mutually_exclusive_aux (struct re_pattern_buffer *bufp, re_char *p1, | 3782 | mutually_exclusive_aux (struct re_pattern_buffer *bufp, re_char *p1, |
| 3759 | re_char *p2, re_char *done_beg, re_char *done_end) | 3783 | re_char *p2, re_char *loop_entry, re_char *loop_exit) |
| 3760 | { | 3784 | { |
| 3761 | re_opcode_t op2; | 3785 | re_opcode_t op2; |
| 3762 | unsigned char *pend = bufp->buffer + bufp->used; | 3786 | unsigned char *pend = bufp->buffer + bufp->used; |
| @@ -3765,8 +3789,15 @@ mutually_exclusive_aux (struct re_pattern_buffer *bufp, re_char *p1, | |||
| 3765 | eassert (p1 >= bufp->buffer && p1 < pend | 3789 | eassert (p1 >= bufp->buffer && p1 < pend |
| 3766 | && p2 >= bufp->buffer && p2 <= pend); | 3790 | && p2 >= bufp->buffer && p2 <= pend); |
| 3767 | 3791 | ||
| 3768 | eassert (done_beg <= done_end); | 3792 | if (p2 == loop_exit) |
| 3769 | eassert (done_end <= p2); | 3793 | return true; /* Presumably already checked elsewhere. */ |
| 3794 | eassert (loop_entry && p2 >= loop_entry); | ||
| 3795 | if (p2 < loop_entry || (loop_exit && p2 > loop_exit)) | ||
| 3796 | /* The assumptions about the shape of the code aren't true :-( */ | ||
| 3797 | #ifdef ENABLE_CHECKING | ||
| 3798 | error ("Broken assumption in regex.c:mutually_exclusive_aux"); | ||
| 3799 | #endif | ||
| 3800 | return false; | ||
| 3770 | 3801 | ||
| 3771 | /* Skip over open/close-group commands. | 3802 | /* Skip over open/close-group commands. |
| 3772 | If what follows this loop is a ...+ construct, | 3803 | If what follows this loop is a ...+ construct, |
| @@ -3858,29 +3889,43 @@ mutually_exclusive_aux (struct re_pattern_buffer *bufp, re_char *p1, | |||
| 3858 | int mcnt; | 3889 | int mcnt; |
| 3859 | p2++; | 3890 | p2++; |
| 3860 | EXTRACT_NUMBER_AND_INCR (mcnt, p2); | 3891 | EXTRACT_NUMBER_AND_INCR (mcnt, p2); |
| 3861 | re_char *p2_other = p2 + mcnt; | 3892 | re_char *p2_other = p2 + mcnt, *tmp; |
| 3862 | 3893 | /* For `+` loops, we often have an `on_failure_jump` that skips forward | |
| 3863 | /* When we jump backward we bump `done_end` up to `p3` under | 3894 | over a subsequent `jump` for lack of an `on_failure_dont_jump` |
| 3864 | the assumption that any other position between `done_end` | 3895 | kind of thing. Recognize this pattern since that subsequent |
| 3865 | and `p3` is either: | 3896 | `jump` is the one that jumps to the loop-entry. */ |
| 3866 | - checked by the other call to RECURSE. | 3897 | if ((re_opcode_t) p2[0] == jump && mcnt == 3) |
| 3867 | - not reachable from here (e.g. for positions before the | 3898 | { |
| 3868 | `on_failure_jump`), or at least not without first | 3899 | EXTRACT_NUMBER (mcnt, p2 + 1); |
| 3869 | jumping before `done_beg`. | 3900 | p2 += mcnt + 3; |
| 3870 | This should hold because our state machines are not arbitrary: | 3901 | } |
| 3871 | they consists of syntaxically nested loops with limited | 3902 | |
| 3872 | control flow. | 3903 | /* We have to check that both destinations are safe. |
| 3873 | FIXME: This can fail (i.e. return true when it shouldn't) | 3904 | Arrange for `p2` to be the smaller of the two. */ |
| 3874 | if we start generating bytecode with a different shape, | 3905 | if (p2 > p2_other) |
| 3875 | so maybe we should bite the bullet and replace done_beg/end | 3906 | (tmp = p2, p2 = p2_other, p2_other = tmp); |
| 3876 | with an actual list of positions we've already processed. */ | 3907 | |
| 3877 | #define RECURSE(p3) \ | 3908 | if (p2_other <= p2_orig /* Both destinations go backward! */ |
| 3878 | ((p3) < done_beg ? mutually_exclusive_aux (bufp, p1, p3, p3, p3) \ | 3909 | || !mutually_exclusive_aux (bufp, p1, p2_other, |
| 3879 | : (p3) <= done_end ? true \ | 3910 | loop_entry, loop_exit)) |
| 3880 | : mutually_exclusive_aux (bufp, p1, p3, done_beg, \ | 3911 | return false; |
| 3881 | (p3) > p2_orig ? done_end : (p3))) | 3912 | |
| 3882 | 3913 | /* Now that we know that `p2_other` is a safe (i.e. mutually-exclusive) | |
| 3883 | return RECURSE (p2) && RECURSE (p2_other); | 3914 | position, let's check `p2`. */ |
| 3915 | if (p2 == loop_entry) | ||
| 3916 | /* If we jump backward to the entry point of the current loop | ||
| 3917 | it means it's a zero-length cycle through that loop, so | ||
| 3918 | this cycle itself does not break mutual-exclusion. */ | ||
| 3919 | return true; | ||
| 3920 | else if (p2 > p2_orig) | ||
| 3921 | /* Boring forward jump. */ | ||
| 3922 | return mutually_exclusive_aux (bufp, p1, p2, loop_entry, loop_exit); | ||
| 3923 | else if (loop_entry < p2 && p2 < p2_orig) | ||
| 3924 | /* We jump backward to a new loop, nested within the current one. | ||
| 3925 | `p2` is the entry point and `p2_other` the exit of that inner. */ | ||
| 3926 | return mutually_exclusive_aux (bufp, p1, p2, p2, p2_other); | ||
| 3927 | else | ||
| 3928 | return false; | ||
| 3884 | } | 3929 | } |
| 3885 | 3930 | ||
| 3886 | default: | 3931 | default: |
| @@ -3895,7 +3940,7 @@ static bool | |||
| 3895 | mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1, | 3940 | mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1, |
| 3896 | re_char *p2) | 3941 | re_char *p2) |
| 3897 | { | 3942 | { |
| 3898 | return mutually_exclusive_aux (bufp, p1, p2, p2, p2); | 3943 | return mutually_exclusive_aux (bufp, p1, p2, bufp->buffer, NULL); |
| 3899 | } | 3944 | } |
| 3900 | 3945 | ||
| 3901 | /* Matching routines. */ | 3946 | /* Matching routines. */ |