aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMattias Engdegård2023-06-20 12:12:50 +0200
committerMattias Engdegård2023-06-21 18:00:26 +0200
commitbe91192ecb1e0dff794582cd463f0a6480d160ef (patch)
tree156031723d39088ce4e2dd52a3a7b7068fbc5880
parentdae8aab52874441a70a94435d50f25b27301d9b0 (diff)
downloademacs-be91192ecb1e0dff794582cd463f0a6480d160ef.tar.gz
emacs-be91192ecb1e0dff794582cd463f0a6480d160ef.zip
Straighten regexp postfix operator after zero-width assertion parse
The zero-width assertions \` \' \b \B were parsed in a sloppy way so that a following postfix repetition operator could yield surprising results. For instance, "\\b*" would act as "\\b\\*", and "xy\\b*" would act as "\\(?:xy\\b\\)*". Except for \` and ^, any following postfix operator now applies to the zero-width assertion itself only, which is predictable and consistent with other assertions, if useless in practice. For historical compatibility, an operator character following \` and ^ always becomes a literal. (Bug#64128) * src/regex-emacs.c (regex_compile): Set `laststart` appropriately for each zero-width assertion instead of leaving it with whatever value it had before. Remove a redundant condition. * test/src/regex-emacs-tests.el (regexp-tests-zero-width-assertion-repetition): New test. * doc/lispref/searching.texi (Regexp Special): Say that repetition operators are not special after \`, and that they work as expected after other backslash escapes. * etc/NEWS: Announce.
-rw-r--r--doc/lispref/searching.texi6
-rw-r--r--etc/NEWS8
-rw-r--r--src/regex-emacs.c15
-rw-r--r--test/src/regex-emacs-tests.el66
4 files changed, 88 insertions, 7 deletions
diff --git a/doc/lispref/searching.texi b/doc/lispref/searching.texi
index 28230cea643..7c9893054d9 100644
--- a/doc/lispref/searching.texi
+++ b/doc/lispref/searching.texi
@@ -546,15 +546,11 @@ example, the regular expression that matches the @samp{\} character is
546 546
547For historical compatibility, a repetition operator is treated as ordinary 547For historical compatibility, a repetition operator is treated as ordinary
548if it appears at the start of a regular expression 548if it appears at the start of a regular expression
549or after @samp{^}, @samp{\(}, @samp{\(?:} or @samp{\|}. 549or after @samp{^}, @samp{\`}, @samp{\(}, @samp{\(?:} or @samp{\|}.
550For example, @samp{*foo} is treated as @samp{\*foo}, and 550For example, @samp{*foo} is treated as @samp{\*foo}, and
551@samp{two\|^\@{2\@}} is treated as @samp{two\|^@{2@}}. 551@samp{two\|^\@{2\@}} is treated as @samp{two\|^@{2@}}.
552It is poor practice to depend on this behavior; use proper backslash 552It is poor practice to depend on this behavior; use proper backslash
553escaping anyway, regardless of where the repetition operator appears. 553escaping anyway, regardless of where the repetition operator appears.
554Also, a repetition operator should not immediately follow a backslash escape
555that matches only empty strings, as Emacs has bugs in this area.
556For example, it is unwise to use @samp{\b*}, which can be omitted
557without changing the documented meaning of the regular expression.
558 554
559As a @samp{\} is not special inside a bracket expression, it can 555As a @samp{\} is not special inside a bracket expression, it can
560never remove the special meaning of @samp{-}, @samp{^} or @samp{]}. 556never remove the special meaning of @samp{-}, @samp{^} or @samp{]}.
diff --git a/etc/NEWS b/etc/NEWS
index d703b7e77be..7552640663f 100644
--- a/etc/NEWS
+++ b/etc/NEWS
@@ -475,6 +475,14 @@ symbol, and either that symbol is ':eval' and the second element of
475the list evaluates to 'nil' or the symbol's value as a variable is 475the list evaluates to 'nil' or the symbol's value as a variable is
476'nil' or void. 476'nil' or void.
477 477
478+++
479** Regexp zero-width assertions followed by operators are better defined.
480Previously, regexps such as "xy\\B*" would have ill-defined behaviour.
481Now any operator following a zero-width assertion applies to that
482assertion only (which is useless). For historical compatibility, an
483operator character following '^' or '\`' becomes literal, but we
484advise against relying on this.
485
478 486
479* Lisp Changes in Emacs 30.1 487* Lisp Changes in Emacs 30.1
480 488
diff --git a/src/regex-emacs.c b/src/regex-emacs.c
index fea34df991b..9e298b81ebb 100644
--- a/src/regex-emacs.c
+++ b/src/regex-emacs.c
@@ -1716,7 +1716,8 @@ regex_compile (re_char *pattern, ptrdiff_t size,
1716 1716
1717 /* Address of start of the most recently finished expression. 1717 /* Address of start of the most recently finished expression.
1718 This tells, e.g., postfix * where to find the start of its 1718 This tells, e.g., postfix * where to find the start of its
1719 operand. Reset at the beginning of groups and alternatives. */ 1719 operand. Reset at the beginning of groups and alternatives,
1720 and after ^ and \` for dusty-deck compatibility. */
1720 unsigned char *laststart = 0; 1721 unsigned char *laststart = 0;
1721 1722
1722 /* Address of beginning of regexp, or inside of last group. */ 1723 /* Address of beginning of regexp, or inside of last group. */
@@ -1847,12 +1848,16 @@ regex_compile (re_char *pattern, ptrdiff_t size,
1847 case '^': 1848 case '^':
1848 if (! (p == pattern + 1 || at_begline_loc_p (pattern, p))) 1849 if (! (p == pattern + 1 || at_begline_loc_p (pattern, p)))
1849 goto normal_char; 1850 goto normal_char;
1851 /* Special case for compatibility: postfix ops after ^ become
1852 literals. */
1853 laststart = 0;
1850 BUF_PUSH (begline); 1854 BUF_PUSH (begline);
1851 break; 1855 break;
1852 1856
1853 case '$': 1857 case '$':
1854 if (! (p == pend || at_endline_loc_p (p, pend))) 1858 if (! (p == pend || at_endline_loc_p (p, pend)))
1855 goto normal_char; 1859 goto normal_char;
1860 laststart = b;
1856 BUF_PUSH (endline); 1861 BUF_PUSH (endline);
1857 break; 1862 break;
1858 1863
@@ -1892,7 +1897,7 @@ regex_compile (re_char *pattern, ptrdiff_t size,
1892 1897
1893 /* Star, etc. applied to an empty pattern is equivalent 1898 /* Star, etc. applied to an empty pattern is equivalent
1894 to an empty pattern. */ 1899 to an empty pattern. */
1895 if (!laststart || laststart == b) 1900 if (laststart == b)
1896 break; 1901 break;
1897 1902
1898 /* Now we know whether or not zero matches is allowed 1903 /* Now we know whether or not zero matches is allowed
@@ -2544,18 +2549,24 @@ regex_compile (re_char *pattern, ptrdiff_t size,
2544 break; 2549 break;
2545 2550
2546 case 'b': 2551 case 'b':
2552 laststart = b;
2547 BUF_PUSH (wordbound); 2553 BUF_PUSH (wordbound);
2548 break; 2554 break;
2549 2555
2550 case 'B': 2556 case 'B':
2557 laststart = b;
2551 BUF_PUSH (notwordbound); 2558 BUF_PUSH (notwordbound);
2552 break; 2559 break;
2553 2560
2554 case '`': 2561 case '`':
2562 /* Special case for compatibility: postfix ops after \` become
2563 literals, as for ^ (see above). */
2564 laststart = 0;
2555 BUF_PUSH (begbuf); 2565 BUF_PUSH (begbuf);
2556 break; 2566 break;
2557 2567
2558 case '\'': 2568 case '\'':
2569 laststart = b;
2559 BUF_PUSH (endbuf); 2570 BUF_PUSH (endbuf);
2560 break; 2571 break;
2561 2572
diff --git a/test/src/regex-emacs-tests.el b/test/src/regex-emacs-tests.el
index 52d43775b8e..08a93dbf30e 100644
--- a/test/src/regex-emacs-tests.el
+++ b/test/src/regex-emacs-tests.el
@@ -883,4 +883,70 @@ This evaluates the TESTS test cases from glibc."
883 (should (looking-at "x*\\(=\\|:\\)*")) 883 (should (looking-at "x*\\(=\\|:\\)*"))
884 (should (looking-at "x*=*?")))) 884 (should (looking-at "x*=*?"))))
885 885
886(ert-deftest regexp-tests-zero-width-assertion-repetition ()
887 ;; Check compatibility behaviour with repetition operators after
888 ;; certain zero-width assertions (bug#64128).
889
890 ;; This function is just to hide ugly regexps from relint so that it
891 ;; doesn't complain about them.
892 (cl-flet ((smatch (re str) (string-match re str)))
893 ;; Postfix operators after ^ and \` become literals, for historical
894 ;; compatibility. Only the first character of a lazy operator (like *?)
895 ;; becomes a literal.
896 (should (equal (smatch "^*a" "x\n*a") 2))
897 (should (equal (smatch "^*?a" "x\n*a") 2))
898 (should (equal (smatch "^*?a" "x\na") 2))
899 (should (equal (smatch "^*?a" "x\n**a") nil))
900
901 (should (equal (smatch "\\`*a" "*a") 0))
902 (should (equal (smatch "\\`*?a" "*a") 0))
903 (should (equal (smatch "\\`*?a" "a") 0))
904 (should (equal (smatch "\\`*?a" "**a") nil))
905
906 ;; Other zero-width assertions are treated as normal elements, so postfix
907 ;; operators apply to them alone (which is pointless but valid).
908 (should (equal (smatch "\\b*!" "*!") 1))
909 (should (equal (smatch "!\\b+;" "!;") nil))
910 (should (equal (smatch "!\\b+a" "!a") 0))
911
912 (should (equal (smatch "\\B*!" "*!") 1))
913 (should (equal (smatch "!\\B+;" "!;") 0))
914 (should (equal (smatch "!\\B+a" "!a") nil))
915
916 (should (equal (smatch "\\<*b" "*b") 1))
917 (should (equal (smatch "a\\<*b" "ab") 0))
918 (should (equal (smatch ";\\<*b" ";b") 0))
919 (should (equal (smatch "a\\<+b" "ab") nil))
920 (should (equal (smatch ";\\<+b" ";b") 0))
921
922 (should (equal (smatch "\\>*;" "*;") 1))
923 (should (equal (smatch "a\\>*b" "ab") 0))
924 (should (equal (smatch "a\\>*;" "a;") 0))
925 (should (equal (smatch "a\\>+b" "ab") nil))
926 (should (equal (smatch "a\\>+;" "a;") 0))
927
928 (should (equal (smatch "a\\'" "ab") nil))
929 (should (equal (smatch "b\\'" "ab") 1))
930 (should (equal (smatch "a\\'*b" "ab") 0))
931 (should (equal (smatch "a\\'+" "ab") nil))
932 (should (equal (smatch "b\\'+" "ab") 1))
933 (should (equal (smatch "\\'+" "+") 1))
934
935 (should (equal (smatch "\\_<*b" "*b") 1))
936 (should (equal (smatch "a\\_<*b" "ab") 0))
937 (should (equal (smatch " \\_<*b" " b") 0))
938 (should (equal (smatch "a\\_<+b" "ab") nil))
939 (should (equal (smatch " \\_<+b" " b") 0))
940
941 (should (equal (smatch "\\_>*;" "*;") 1))
942 (should (equal (smatch "a\\_>*b" "ab") 0))
943 (should (equal (smatch "a\\_>* " "a ") 0))
944 (should (equal (smatch "a\\_>+b" "ab") nil))
945 (should (equal (smatch "a\\_>+ " "a ") 0))
946
947 (should (equal (smatch "\\=*b" "*b") 1))
948 (should (equal (smatch "a\\=*b" "a*b") nil))
949 (should (equal (smatch "a\\=*b" "ab") 0))
950 ))
951
886;;; regex-emacs-tests.el ends here 952;;; regex-emacs-tests.el ends here