regex-emacs.c (mutually_exclusive_aux): Rework again

Rework the way we handle loops. This new code does not really work better than the previous one, but it has the advantage of being "fail safe" and also that we can dynamically check if our assumptions about the shape of the bytecode are satisfied or not. * src/regex-emacs.c (mutually_exclusive_aux): Replace `done_beg` and `done_end` with `loop_beg` and `loop_end`. (mutually_exclusive_p): Adjust accordingly. (analyze_first): Fix incorrect assertion.
author: Stefan Monnier 2023-09-21 11:47:31 -0400
committer: Stefan Monnier 2023-09-21 11:47:31 -0400
commit: 424b35fa24f07f85c5287c682e140c1c8daa25d0 (patch)
tree: dbf8c1b632dbb82e0ae736aa916ef622d1f58b25
parent: 57c6c067d35e519bc3787966cd6346904bc75e16 (diff)
download: emacs-424b35fa24f07f85c5287c682e140c1c8daa25d0.tar.gz
emacs-424b35fa24f07f85c5287c682e140c1c8daa25d0.zip
1 files changed, 86 insertions, 41 deletions
diff --git a/src/regex-emacs.c b/src/regex-emacs.c
index 55d0a6e8df8..06befe1b189 100644
--- a/src/regex-emacs.c
+++ b/src/regex-emacs.c
@@ -1923,12 +1923,22 @@ regex_compile (re_char *pattern, ptrdiff_t size,
                    if (!zero_times_ok && simple)
                      { /* Since simple * loops can be made faster by using
-                           on_failure_keep_string_jump, we turn simple P+
+                           on_failure_keep_string_jump, we turn P+ into PP*
-                           into PP* if P is simple.  */
+                           if P is simple.
+                           We can't use `top: <BODY>; OFJS exit; J top; exit:`
+                           because the OFJS needs to be at the beginning
+                           so we can replace
+                               top: OFJS exit; <BODY>; J top; exit
+                           with
+                               OFKSJ exit; loop: <BODY>; J loop; exit
+                           i.e. a single OFJ at the beginning of the loop
+                           rather than once per iteration.  */
                        unsigned char *p1, *p2;
                        startoffset = b - laststart;
                        GET_BUFFER_SPACE (startoffset);
                        p1 = b; p2 = laststart;
+                        /* We presume that the code skipped
+                           by `skip_one_char` is position-independent.  */
                        while (p2 < p1)
                          *b++ = *p2++;
                        zero_times_ok = 1;
@@ -3068,8 +3078,10 @@ analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte)
          continue;
        case succeed_n:
-          /* If N == 0, it should be an on_failure_jump_loop instead.  */
+          /* If N == 0, it should be an on_failure_jump_loop instead.
-          DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); eassert (j > 0));
+             `j` can be negative because `EXTRACT_NUMBER` extracts a
+             signed number whereas `succeed_n` treats it as unsigned.  */
+          DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); eassert (j != 0));
          p += 4;
          /* We only care about one iteration of the loop, so we don't
             need to consider the case where this behaves like an
@@ -3743,20 +3755,32 @@ mutually_exclusive_charset (struct re_pattern_buffer *bufp, re_char *p1,
 }
 /* True if "p1 matches something" implies "p2 fails".  */
-/* Avoiding inf-loops:
+/* We're trying to follow all paths reachable from `p2`, but since some
-   We're trying to follow all paths reachable from `p2`, but since some
   loops can match the empty string, this can loop back to `p2`.
-   To avoid inf-looping, we keep track of points that have been considered
-   "already".  Instead of keeping a list of such points, `done_beg` and
+   To avoid inf-looping, we take advantage of the fact that
-   `done_end` delimit a chunk of bytecode we already considered.
+   the bytecode we generate is made of syntactically nested loops, more
-   To guarantee termination, a lexical ordering between `done_*` and `p2`
+   specifically, every loop has a single entry point and single exit point.
-   should be obeyed:
-       At each recursion, either `done_beg` gets smaller,
+   The function takes 2 more arguments (`loop_entry` and `loop_exit`).
-       or `done_beg` is unchanged and `done_end` gets larger
+   `loop_entry` points to the sole entry point of the current loop and
-       or they're both unchanged and `p2` gets larger.  */
+   `loop_exit` points to its sole exit point (when non-NULL).
+   Jumps outside of `loop_entry..exit` should not occur.
+   The function can assume that `loop_exit` is "mutually exclusive".
+   The same holds for `loop_entry` except when `p2 == loop_entry`.
+   To guarantee termination, recursive calls should make sure that either
+   `loop_entry` is larger, or it's unchanged but `p2` is larger.
+   FIXME: This is failsafe (can't return true when it shouldn't)
+   but it could be too conservative if we start generating bytecode
+   with a different shape, so maybe we should bite the bullet and
+   replace done_beg/end with an actual list of positions we've
+   already processed.  */
 static bool
 mutually_exclusive_aux (struct re_pattern_buffer *bufp, re_char *p1,
-                        re_char *p2, re_char *done_beg, re_char *done_end)
+                        re_char *p2, re_char *loop_entry, re_char *loop_exit)
 {
  re_opcode_t op2;
  unsigned char *pend = bufp->buffer + bufp->used;
@@ -3765,8 +3789,15 @@ mutually_exclusive_aux (struct re_pattern_buffer *bufp, re_char *p1,
  eassert (p1 >= bufp->buffer && p1 < pend
           && p2 >= bufp->buffer && p2 <= pend);
-  eassert (done_beg <= done_end);
+  if (p2 == loop_exit)
-  eassert (done_end <= p2);
+    return true;          /* Presumably already checked elsewhere.  */
+  eassert (loop_entry && p2 >= loop_entry);
+  if (p2 < loop_entry || (loop_exit && p2 > loop_exit))
+    /* The assumptions about the shape of the code aren't true :-(  */
+#ifdef ENABLE_CHECKING
+    error ("Broken assumption in regex.c:mutually_exclusive_aux");
+#endif
+    return false;
  /* Skip over open/close-group commands.
     If what follows this loop is a ...+ construct,
@@ -3858,29 +3889,43 @@ mutually_exclusive_aux (struct re_pattern_buffer *bufp, re_char *p1,
        int mcnt;
        p2++;
        EXTRACT_NUMBER_AND_INCR (mcnt, p2);
-        re_char *p2_other = p2 + mcnt;
+        re_char *p2_other = p2 + mcnt, *tmp;
+        /* For `+` loops, we often have an `on_failure_jump` that skips forward
-        /* When we jump backward we bump `done_end` up to `p3` under
+           over a subsequent `jump` for lack of an `on_failure_dont_jump`
-           the assumption that any other position between `done_end`
+           kind of thing.  Recognize this pattern since that subsequent
-           and `p3` is either:
+           `jump` is the one that jumps to the loop-entry.  */
-           - checked by the other call to RECURSE.
+        if ((re_opcode_t) p2[0] == jump && mcnt == 3)
-           - not reachable from here (e.g. for positions before the
+          {
-             `on_failure_jump`), or at least not without first
+            EXTRACT_NUMBER (mcnt, p2 + 1);
-             jumping before `done_beg`.
+            p2 += mcnt + 3;
-           This should hold because our state machines are not arbitrary:
+          }
-           they consists of syntaxically nested loops with limited
-           control flow.
+        /* We have to check that both destinations are safe.
-           FIXME: This can fail (i.e. return true when it shouldn't)
+           Arrange for `p2` to be the smaller of the two.  */
-           if we start generating bytecode with a different shape,
+        if (p2 > p2_other)
-           so maybe we should bite the bullet and replace done_beg/end
+          (tmp = p2, p2 = p2_other, p2_other = tmp);
-           with an actual list of positions we've already processed.  */
-#define RECURSE(p3)                                                \
+        if (p2_other <= p2_orig /* Both destinations go backward!  */
-  ((p3) < done_beg ? mutually_exclusive_aux (bufp, p1, p3, p3, p3) \
+            || !mutually_exclusive_aux (bufp, p1, p2_other,
-   : (p3) <= done_end ? true                                       \
+                                        loop_entry, loop_exit))
-   : mutually_exclusive_aux (bufp, p1, p3, done_beg,               \
+          return false;
-                             (p3) > p2_orig ? done_end : (p3)))
+        /* Now that we know that `p2_other` is a safe (i.e. mutually-exclusive)
-        return RECURSE (p2) && RECURSE (p2_other);
+           position, let's check `p2`.  */
+        if (p2 == loop_entry)
+          /* If we jump backward to the entry point of the current loop
+             it means it's a zero-length cycle through that loop, so
+             this cycle itself does not break mutual-exclusion.  */
+          return true;
+        else if (p2 > p2_orig)
+          /* Boring forward jump.  */
+          return mutually_exclusive_aux (bufp, p1, p2, loop_entry, loop_exit);
+        else if (loop_entry < p2 && p2 < p2_orig)
+          /* We jump backward to a new loop, nested within the current one.
+             `p2` is the entry point and `p2_other` the exit of that inner.  */
+          return mutually_exclusive_aux (bufp, p1, p2, p2, p2_other);
+        else
+          return false;
      }
    default:
@@ -3895,7 +3940,7 @@ static bool
 mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1,
                      re_char *p2)
 {
-  return mutually_exclusive_aux (bufp, p1, p2, p2, p2);
+  return mutually_exclusive_aux (bufp, p1, p2, bufp->buffer, NULL);
 }
 /* Matching routines.  */
author	Stefan Monnier	2023-09-21 11:47:31 -0400
committer	Stefan Monnier	2023-09-21 11:47:31 -0400
commit	424b35fa24f07f85c5287c682e140c1c8daa25d0 (patch)
tree	dbf8c1b632dbb82e0ae736aa916ef622d1f58b25
parent	57c6c067d35e519bc3787966cd6346904bc75e16 (diff)
download	emacs-424b35fa24f07f85c5287c682e140c1c8daa25d0.tar.gz emacs-424b35fa24f07f85c5287c682e140c1c8daa25d0.zip

diff --git a/src/regex-emacs.c b/src/regex-emacs.c index 55d0a6e8df8..06befe1b189 100644 --- a/src/regex-emacs.c +++ b/src/regex-emacs.c
@@ -1923,12 +1923,22 @@ regex_compile (re_char *pattern, ptrdiff_t size,
1923		1923
1924	if (!zero_times_ok && simple)	1924	if (!zero_times_ok && simple)
1925	{ /* Since simple * loops can be made faster by using	1925	{ /* Since simple * loops can be made faster by using
1926	on_failure_keep_string_jump, we turn simple P+	1926	on_failure_keep_string_jump, we turn P+ into PP*
1927	into PP* if P is simple. */	1927	if P is simple.
		1928	We can't use `top: <BODY>; OFJS exit; J top; exit:`
		1929	because the OFJS needs to be at the beginning
		1930	so we can replace
		1931	top: OFJS exit; <BODY>; J top; exit
		1932	with
		1933	OFKSJ exit; loop: <BODY>; J loop; exit
		1934	i.e. a single OFJ at the beginning of the loop
		1935	rather than once per iteration. */
1928	unsigned char p1, p2;	1936	unsigned char p1, p2;
1929	startoffset = b - laststart;	1937	startoffset = b - laststart;
1930	GET_BUFFER_SPACE (startoffset);	1938	GET_BUFFER_SPACE (startoffset);
1931	p1 = b; p2 = laststart;	1939	p1 = b; p2 = laststart;
		1940	/* We presume that the code skipped
		1941	by `skip_one_char` is position-independent. */
1932	while (p2 < p1)	1942	while (p2 < p1)
1933	b++ = p2++;	1943	b++ = p2++;
1934	zero_times_ok = 1;	1944	zero_times_ok = 1;
@@ -3068,8 +3078,10 @@ analyze_first (re_char p, re_char pend, char *fastmap, bool multibyte)
3068	continue;	3078	continue;
3069		3079
3070	case succeed_n:	3080	case succeed_n:
3071	/* If N == 0, it should be an on_failure_jump_loop instead. */	3081	/* If N == 0, it should be an on_failure_jump_loop instead.
3072	DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); eassert (j > 0));	3082	`j` can be negative because `EXTRACT_NUMBER` extracts a
		3083	signed number whereas `succeed_n` treats it as unsigned. */
		3084	DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); eassert (j != 0));
3073	p += 4;	3085	p += 4;
3074	/* We only care about one iteration of the loop, so we don't	3086	/* We only care about one iteration of the loop, so we don't
3075	need to consider the case where this behaves like an	3087	need to consider the case where this behaves like an
@@ -3743,20 +3755,32 @@ mutually_exclusive_charset (struct re_pattern_buffer bufp, re_char p1,
3743	}	3755	}
3744		3756
3745	/* True if "p1 matches something" implies "p2 fails". */	3757	/* True if "p1 matches something" implies "p2 fails". */
3746	/* Avoiding inf-loops:	3758	/* We're trying to follow all paths reachable from `p2`, but since some
3747	We're trying to follow all paths reachable from `p2`, but since some
3748	loops can match the empty string, this can loop back to `p2`.	3759	loops can match the empty string, this can loop back to `p2`.
3749	To avoid inf-looping, we keep track of points that have been considered	3760
3750	"already". Instead of keeping a list of such points, `done_beg` and	3761	To avoid inf-looping, we take advantage of the fact that
3751	`done_end` delimit a chunk of bytecode we already considered.	3762	the bytecode we generate is made of syntactically nested loops, more
3752	To guarantee termination, a lexical ordering between `done_*` and `p2`	3763	specifically, every loop has a single entry point and single exit point.
3753	should be obeyed:	3764
3754	At each recursion, either `done_beg` gets smaller,	3765	The function takes 2 more arguments (`loop_entry` and `loop_exit`).
3755	or `done_beg` is unchanged and `done_end` gets larger	3766	`loop_entry` points to the sole entry point of the current loop and
3756	or they're both unchanged and `p2` gets larger. */	3767	`loop_exit` points to its sole exit point (when non-NULL).
		3768
		3769	Jumps outside of `loop_entry..exit` should not occur.
		3770	The function can assume that `loop_exit` is "mutually exclusive".
		3771	The same holds for `loop_entry` except when `p2 == loop_entry`.
		3772
		3773	To guarantee termination, recursive calls should make sure that either
		3774	`loop_entry` is larger, or it's unchanged but `p2` is larger.
		3775
		3776	FIXME: This is failsafe (can't return true when it shouldn't)
		3777	but it could be too conservative if we start generating bytecode
		3778	with a different shape, so maybe we should bite the bullet and
		3779	replace done_beg/end with an actual list of positions we've
		3780	already processed. */
3757	static bool	3781	static bool
3758	mutually_exclusive_aux (struct re_pattern_buffer bufp, re_char p1,	3782	mutually_exclusive_aux (struct re_pattern_buffer bufp, re_char p1,
3759	re_char p2, re_char done_beg, re_char *done_end)	3783	re_char p2, re_char loop_entry, re_char *loop_exit)
3760	{	3784	{
3761	re_opcode_t op2;	3785	re_opcode_t op2;
3762	unsigned char *pend = bufp->buffer + bufp->used;	3786	unsigned char *pend = bufp->buffer + bufp->used;
@@ -3765,8 +3789,15 @@ mutually_exclusive_aux (struct re_pattern_buffer bufp, re_char p1,
3765	eassert (p1 >= bufp->buffer && p1 < pend	3789	eassert (p1 >= bufp->buffer && p1 < pend
3766	&& p2 >= bufp->buffer && p2 <= pend);	3790	&& p2 >= bufp->buffer && p2 <= pend);
3767		3791
3768	eassert (done_beg <= done_end);	3792	if (p2 == loop_exit)
3769	eassert (done_end <= p2);	3793	return true; /* Presumably already checked elsewhere. */
		3794	eassert (loop_entry && p2 >= loop_entry);
		3795	if (p2 < loop_entry \|\| (loop_exit && p2 > loop_exit))
		3796	/* The assumptions about the shape of the code aren't true :-( */
		3797	#ifdef ENABLE_CHECKING
		3798	error ("Broken assumption in regex.c:mutually_exclusive_aux");
		3799	#endif
		3800	return false;
3770		3801
3771	/* Skip over open/close-group commands.	3802	/* Skip over open/close-group commands.
3772	If what follows this loop is a ...+ construct,	3803	If what follows this loop is a ...+ construct,
@@ -3858,29 +3889,43 @@ mutually_exclusive_aux (struct re_pattern_buffer bufp, re_char p1,
3858	int mcnt;	3889	int mcnt;
3859	p2++;	3890	p2++;
3860	EXTRACT_NUMBER_AND_INCR (mcnt, p2);	3891	EXTRACT_NUMBER_AND_INCR (mcnt, p2);
3861	re_char *p2_other = p2 + mcnt;	3892	re_char p2_other = p2 + mcnt, tmp;
3862		3893	/* For `+` loops, we often have an `on_failure_jump` that skips forward
3863	/* When we jump backward we bump `done_end` up to `p3` under	3894	over a subsequent `jump` for lack of an `on_failure_dont_jump`
3864	the assumption that any other position between `done_end`	3895	kind of thing. Recognize this pattern since that subsequent
3865	and `p3` is either:	3896	`jump` is the one that jumps to the loop-entry. */
3866	- checked by the other call to RECURSE.	3897	if ((re_opcode_t) p2[0] == jump && mcnt == 3)
3867	- not reachable from here (e.g. for positions before the	3898	{
3868	`on_failure_jump`), or at least not without first	3899	EXTRACT_NUMBER (mcnt, p2 + 1);
3869	jumping before `done_beg`.	3900	p2 += mcnt + 3;
3870	This should hold because our state machines are not arbitrary:	3901	}
3871	they consists of syntaxically nested loops with limited	3902
3872	control flow.	3903	/* We have to check that both destinations are safe.
3873	FIXME: This can fail (i.e. return true when it shouldn't)	3904	Arrange for `p2` to be the smaller of the two. */
3874	if we start generating bytecode with a different shape,	3905	if (p2 > p2_other)
3875	so maybe we should bite the bullet and replace done_beg/end	3906	(tmp = p2, p2 = p2_other, p2_other = tmp);
3876	with an actual list of positions we've already processed. */	3907
3877	#define RECURSE(p3) \	3908	if (p2_other <= p2_orig /* Both destinations go backward! */
3878	((p3) < done_beg ? mutually_exclusive_aux (bufp, p1, p3, p3, p3) \	3909	\|\| !mutually_exclusive_aux (bufp, p1, p2_other,
3879	: (p3) <= done_end ? true \	3910	loop_entry, loop_exit))
3880	: mutually_exclusive_aux (bufp, p1, p3, done_beg, \	3911	return false;
3881	(p3) > p2_orig ? done_end : (p3)))	3912
3882		3913	/* Now that we know that `p2_other` is a safe (i.e. mutually-exclusive)
3883	return RECURSE (p2) && RECURSE (p2_other);	3914	position, let's check `p2`. */
		3915	if (p2 == loop_entry)
		3916	/* If we jump backward to the entry point of the current loop
		3917	it means it's a zero-length cycle through that loop, so
		3918	this cycle itself does not break mutual-exclusion. */
		3919	return true;
		3920	else if (p2 > p2_orig)
		3921	/* Boring forward jump. */
		3922	return mutually_exclusive_aux (bufp, p1, p2, loop_entry, loop_exit);
		3923	else if (loop_entry < p2 && p2 < p2_orig)
		3924	/* We jump backward to a new loop, nested within the current one.
		3925	`p2` is the entry point and `p2_other` the exit of that inner. */
		3926	return mutually_exclusive_aux (bufp, p1, p2, p2, p2_other);
		3927	else
		3928	return false;
3884	}	3929	}
3885		3930
3886	default:	3931	default:
@@ -3895,7 +3940,7 @@ static bool
3895	mutually_exclusive_p (struct re_pattern_buffer bufp, re_char p1,	3940	mutually_exclusive_p (struct re_pattern_buffer bufp, re_char p1,
3896	re_char *p2)	3941	re_char *p2)
3897	{	3942	{
3898	return mutually_exclusive_aux (bufp, p1, p2, p2, p2);	3943	return mutually_exclusive_aux (bufp, p1, p2, bufp->buffer, NULL);
3899	}	3944	}
3900		3945
3901	/* Matching routines. */	3946	/* Matching routines. */