diff options
| -rw-r--r-- | src/regex-emacs.c | 127 |
1 files changed, 86 insertions, 41 deletions
diff --git a/src/regex-emacs.c b/src/regex-emacs.c index 55d0a6e8df8..06befe1b189 100644 --- a/src/regex-emacs.c +++ b/src/regex-emacs.c | |||
| @@ -1923,12 +1923,22 @@ regex_compile (re_char *pattern, ptrdiff_t size, | |||
| 1923 | 1923 | ||
| 1924 | if (!zero_times_ok && simple) | 1924 | if (!zero_times_ok && simple) |
| 1925 | { /* Since simple * loops can be made faster by using | 1925 | { /* Since simple * loops can be made faster by using |
| 1926 | on_failure_keep_string_jump, we turn simple P+ | 1926 | on_failure_keep_string_jump, we turn P+ into PP* |
| 1927 | into PP* if P is simple. */ | 1927 | if P is simple. |
| 1928 | We can't use `top: <BODY>; OFJS exit; J top; exit:` | ||
| 1929 | because the OFJS needs to be at the beginning | ||
| 1930 | so we can replace | ||
| 1931 | top: OFJS exit; <BODY>; J top; exit | ||
| 1932 | with | ||
| 1933 | OFKSJ exit; loop: <BODY>; J loop; exit | ||
| 1934 | i.e. a single OFJ at the beginning of the loop | ||
| 1935 | rather than once per iteration. */ | ||
| 1928 | unsigned char *p1, *p2; | 1936 | unsigned char *p1, *p2; |
| 1929 | startoffset = b - laststart; | 1937 | startoffset = b - laststart; |
| 1930 | GET_BUFFER_SPACE (startoffset); | 1938 | GET_BUFFER_SPACE (startoffset); |
| 1931 | p1 = b; p2 = laststart; | 1939 | p1 = b; p2 = laststart; |
| 1940 | /* We presume that the code skipped | ||
| 1941 | by `skip_one_char` is position-independent. */ | ||
| 1932 | while (p2 < p1) | 1942 | while (p2 < p1) |
| 1933 | *b++ = *p2++; | 1943 | *b++ = *p2++; |
| 1934 | zero_times_ok = 1; | 1944 | zero_times_ok = 1; |
| @@ -3068,8 +3078,10 @@ analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte) | |||
| 3068 | continue; | 3078 | continue; |
| 3069 | 3079 | ||
| 3070 | case succeed_n: | 3080 | case succeed_n: |
| 3071 | /* If N == 0, it should be an on_failure_jump_loop instead. */ | 3081 | /* If N == 0, it should be an on_failure_jump_loop instead. |
| 3072 | DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); eassert (j > 0)); | 3082 | `j` can be negative because `EXTRACT_NUMBER` extracts a |
| 3083 | signed number whereas `succeed_n` treats it as unsigned. */ | ||
| 3084 | DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); eassert (j != 0)); | ||
| 3073 | p += 4; | 3085 | p += 4; |
| 3074 | /* We only care about one iteration of the loop, so we don't | 3086 | /* We only care about one iteration of the loop, so we don't |
| 3075 | need to consider the case where this behaves like an | 3087 | need to consider the case where this behaves like an |
| @@ -3743,20 +3755,32 @@ mutually_exclusive_charset (struct re_pattern_buffer *bufp, re_char *p1, | |||
| 3743 | } | 3755 | } |
| 3744 | 3756 | ||
| 3745 | /* True if "p1 matches something" implies "p2 fails". */ | 3757 | /* True if "p1 matches something" implies "p2 fails". */ |
| 3746 | /* Avoiding inf-loops: | 3758 | /* We're trying to follow all paths reachable from `p2`, but since some |
| 3747 | We're trying to follow all paths reachable from `p2`, but since some | ||
| 3748 | loops can match the empty string, this can loop back to `p2`. | 3759 | loops can match the empty string, this can loop back to `p2`. |
| 3749 | To avoid inf-looping, we keep track of points that have been considered | 3760 | |
| 3750 | "already". Instead of keeping a list of such points, `done_beg` and | 3761 | To avoid inf-looping, we take advantage of the fact that |
| 3751 | `done_end` delimit a chunk of bytecode we already considered. | 3762 | the bytecode we generate is made of syntactically nested loops, more |
| 3752 | To guarantee termination, a lexical ordering between `done_*` and `p2` | 3763 | specifically, every loop has a single entry point and single exit point. |
| 3753 | should be obeyed: | 3764 | |
| 3754 | At each recursion, either `done_beg` gets smaller, | 3765 | The function takes 2 more arguments (`loop_entry` and `loop_exit`). |
| 3755 | or `done_beg` is unchanged and `done_end` gets larger | 3766 | `loop_entry` points to the sole entry point of the current loop and |
| 3756 | or they're both unchanged and `p2` gets larger. */ | 3767 | `loop_exit` points to its sole exit point (when non-NULL). |
| 3768 | |||
| 3769 | Jumps outside of `loop_entry..exit` should not occur. | ||
| 3770 | The function can assume that `loop_exit` is "mutually exclusive". | ||
| 3771 | The same holds for `loop_entry` except when `p2 == loop_entry`. | ||
| 3772 | |||
| 3773 | To guarantee termination, recursive calls should make sure that either | ||
| 3774 | `loop_entry` is larger, or it's unchanged but `p2` is larger. | ||
| 3775 | |||
| 3776 | FIXME: This is failsafe (can't return true when it shouldn't) | ||
| 3777 | but it could be too conservative if we start generating bytecode | ||
| 3778 | with a different shape, so maybe we should bite the bullet and | ||
| 3779 | replace done_beg/end with an actual list of positions we've | ||
| 3780 | already processed. */ | ||
| 3757 | static bool | 3781 | static bool |
| 3758 | mutually_exclusive_aux (struct re_pattern_buffer *bufp, re_char *p1, | 3782 | mutually_exclusive_aux (struct re_pattern_buffer *bufp, re_char *p1, |
| 3759 | re_char *p2, re_char *done_beg, re_char *done_end) | 3783 | re_char *p2, re_char *loop_entry, re_char *loop_exit) |
| 3760 | { | 3784 | { |
| 3761 | re_opcode_t op2; | 3785 | re_opcode_t op2; |
| 3762 | unsigned char *pend = bufp->buffer + bufp->used; | 3786 | unsigned char *pend = bufp->buffer + bufp->used; |
| @@ -3765,8 +3789,15 @@ mutually_exclusive_aux (struct re_pattern_buffer *bufp, re_char *p1, | |||
| 3765 | eassert (p1 >= bufp->buffer && p1 < pend | 3789 | eassert (p1 >= bufp->buffer && p1 < pend |
| 3766 | && p2 >= bufp->buffer && p2 <= pend); | 3790 | && p2 >= bufp->buffer && p2 <= pend); |
| 3767 | 3791 | ||
| 3768 | eassert (done_beg <= done_end); | 3792 | if (p2 == loop_exit) |
| 3769 | eassert (done_end <= p2); | 3793 | return true; /* Presumably already checked elsewhere. */ |
| 3794 | eassert (loop_entry && p2 >= loop_entry); | ||
| 3795 | if (p2 < loop_entry || (loop_exit && p2 > loop_exit)) | ||
| 3796 | /* The assumptions about the shape of the code aren't true :-( */ | ||
| 3797 | #ifdef ENABLE_CHECKING | ||
| 3798 | error ("Broken assumption in regex.c:mutually_exclusive_aux"); | ||
| 3799 | #endif | ||
| 3800 | return false; | ||
| 3770 | 3801 | ||
| 3771 | /* Skip over open/close-group commands. | 3802 | /* Skip over open/close-group commands. |
| 3772 | If what follows this loop is a ...+ construct, | 3803 | If what follows this loop is a ...+ construct, |
| @@ -3858,29 +3889,43 @@ mutually_exclusive_aux (struct re_pattern_buffer *bufp, re_char *p1, | |||
| 3858 | int mcnt; | 3889 | int mcnt; |
| 3859 | p2++; | 3890 | p2++; |
| 3860 | EXTRACT_NUMBER_AND_INCR (mcnt, p2); | 3891 | EXTRACT_NUMBER_AND_INCR (mcnt, p2); |
| 3861 | re_char *p2_other = p2 + mcnt; | 3892 | re_char *p2_other = p2 + mcnt, *tmp; |
| 3862 | 3893 | /* For `+` loops, we often have an `on_failure_jump` that skips forward | |
| 3863 | /* When we jump backward we bump `done_end` up to `p3` under | 3894 | over a subsequent `jump` for lack of an `on_failure_dont_jump` |
| 3864 | the assumption that any other position between `done_end` | 3895 | kind of thing. Recognize this pattern since that subsequent |
| 3865 | and `p3` is either: | 3896 | `jump` is the one that jumps to the loop-entry. */ |
| 3866 | - checked by the other call to RECURSE. | 3897 | if ((re_opcode_t) p2[0] == jump && mcnt == 3) |
| 3867 | - not reachable from here (e.g. for positions before the | 3898 | { |
| 3868 | `on_failure_jump`), or at least not without first | 3899 | EXTRACT_NUMBER (mcnt, p2 + 1); |
| 3869 | jumping before `done_beg`. | 3900 | p2 += mcnt + 3; |
| 3870 | This should hold because our state machines are not arbitrary: | 3901 | } |
| 3871 | they consists of syntaxically nested loops with limited | 3902 | |
| 3872 | control flow. | 3903 | /* We have to check that both destinations are safe. |
| 3873 | FIXME: This can fail (i.e. return true when it shouldn't) | 3904 | Arrange for `p2` to be the smaller of the two. */ |
| 3874 | if we start generating bytecode with a different shape, | 3905 | if (p2 > p2_other) |
| 3875 | so maybe we should bite the bullet and replace done_beg/end | 3906 | (tmp = p2, p2 = p2_other, p2_other = tmp); |
| 3876 | with an actual list of positions we've already processed. */ | 3907 | |
| 3877 | #define RECURSE(p3) \ | 3908 | if (p2_other <= p2_orig /* Both destinations go backward! */ |
| 3878 | ((p3) < done_beg ? mutually_exclusive_aux (bufp, p1, p3, p3, p3) \ | 3909 | || !mutually_exclusive_aux (bufp, p1, p2_other, |
| 3879 | : (p3) <= done_end ? true \ | 3910 | loop_entry, loop_exit)) |
| 3880 | : mutually_exclusive_aux (bufp, p1, p3, done_beg, \ | 3911 | return false; |
| 3881 | (p3) > p2_orig ? done_end : (p3))) | 3912 | |
| 3882 | 3913 | /* Now that we know that `p2_other` is a safe (i.e. mutually-exclusive) | |
| 3883 | return RECURSE (p2) && RECURSE (p2_other); | 3914 | position, let's check `p2`. */ |
| 3915 | if (p2 == loop_entry) | ||
| 3916 | /* If we jump backward to the entry point of the current loop | ||
| 3917 | it means it's a zero-length cycle through that loop, so | ||
| 3918 | this cycle itself does not break mutual-exclusion. */ | ||
| 3919 | return true; | ||
| 3920 | else if (p2 > p2_orig) | ||
| 3921 | /* Boring forward jump. */ | ||
| 3922 | return mutually_exclusive_aux (bufp, p1, p2, loop_entry, loop_exit); | ||
| 3923 | else if (loop_entry < p2 && p2 < p2_orig) | ||
| 3924 | /* We jump backward to a new loop, nested within the current one. | ||
| 3925 | `p2` is the entry point and `p2_other` the exit of that inner. */ | ||
| 3926 | return mutually_exclusive_aux (bufp, p1, p2, p2, p2_other); | ||
| 3927 | else | ||
| 3928 | return false; | ||
| 3884 | } | 3929 | } |
| 3885 | 3930 | ||
| 3886 | default: | 3931 | default: |
| @@ -3895,7 +3940,7 @@ static bool | |||
| 3895 | mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1, | 3940 | mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1, |
| 3896 | re_char *p2) | 3941 | re_char *p2) |
| 3897 | { | 3942 | { |
| 3898 | return mutually_exclusive_aux (bufp, p1, p2, p2, p2); | 3943 | return mutually_exclusive_aux (bufp, p1, p2, bufp->buffer, NULL); |
| 3899 | } | 3944 | } |
| 3900 | 3945 | ||
| 3901 | /* Matching routines. */ | 3946 | /* Matching routines. */ |