diff options
| author | Mattias Engdegård | 2023-06-20 12:12:50 +0200 |
|---|---|---|
| committer | Mattias Engdegård | 2023-06-21 18:00:26 +0200 |
| commit | be91192ecb1e0dff794582cd463f0a6480d160ef (patch) | |
| tree | 156031723d39088ce4e2dd52a3a7b7068fbc5880 | |
| parent | dae8aab52874441a70a94435d50f25b27301d9b0 (diff) | |
| download | emacs-be91192ecb1e0dff794582cd463f0a6480d160ef.tar.gz emacs-be91192ecb1e0dff794582cd463f0a6480d160ef.zip | |
Straighten regexp postfix operator after zero-width assertion parse
The zero-width assertions \` \' \b \B were parsed in a sloppy way so
that a following postfix repetition operator could yield surprising
results. For instance, "\\b*" would act as "\\b\\*", and "xy\\b*"
would act as "\\(?:xy\\b\\)*".
Except for \` and ^, any following postfix operator now applies to the
zero-width assertion itself only, which is predictable and consistent
with other assertions, if useless in practice.
For historical compatibility, an operator character following \` and ^
always becomes a literal. (Bug#64128)
* src/regex-emacs.c (regex_compile):
Set `laststart` appropriately for each zero-width assertion instead
of leaving it with whatever value it had before.
Remove a redundant condition.
* test/src/regex-emacs-tests.el
(regexp-tests-zero-width-assertion-repetition): New test.
* doc/lispref/searching.texi (Regexp Special):
Say that repetition operators are not special after \`,
and that they work as expected after other backslash escapes.
* etc/NEWS: Announce.
| -rw-r--r-- | doc/lispref/searching.texi | 6 | ||||
| -rw-r--r-- | etc/NEWS | 8 | ||||
| -rw-r--r-- | src/regex-emacs.c | 15 | ||||
| -rw-r--r-- | test/src/regex-emacs-tests.el | 66 |
4 files changed, 88 insertions, 7 deletions
diff --git a/doc/lispref/searching.texi b/doc/lispref/searching.texi index 28230cea643..7c9893054d9 100644 --- a/doc/lispref/searching.texi +++ b/doc/lispref/searching.texi | |||
| @@ -546,15 +546,11 @@ example, the regular expression that matches the @samp{\} character is | |||
| 546 | 546 | ||
| 547 | For historical compatibility, a repetition operator is treated as ordinary | 547 | For historical compatibility, a repetition operator is treated as ordinary |
| 548 | if it appears at the start of a regular expression | 548 | if it appears at the start of a regular expression |
| 549 | or after @samp{^}, @samp{\(}, @samp{\(?:} or @samp{\|}. | 549 | or after @samp{^}, @samp{\`}, @samp{\(}, @samp{\(?:} or @samp{\|}. |
| 550 | For example, @samp{*foo} is treated as @samp{\*foo}, and | 550 | For example, @samp{*foo} is treated as @samp{\*foo}, and |
| 551 | @samp{two\|^\@{2\@}} is treated as @samp{two\|^@{2@}}. | 551 | @samp{two\|^\@{2\@}} is treated as @samp{two\|^@{2@}}. |
| 552 | It is poor practice to depend on this behavior; use proper backslash | 552 | It is poor practice to depend on this behavior; use proper backslash |
| 553 | escaping anyway, regardless of where the repetition operator appears. | 553 | escaping anyway, regardless of where the repetition operator appears. |
| 554 | Also, a repetition operator should not immediately follow a backslash escape | ||
| 555 | that matches only empty strings, as Emacs has bugs in this area. | ||
| 556 | For example, it is unwise to use @samp{\b*}, which can be omitted | ||
| 557 | without changing the documented meaning of the regular expression. | ||
| 558 | 554 | ||
| 559 | As a @samp{\} is not special inside a bracket expression, it can | 555 | As a @samp{\} is not special inside a bracket expression, it can |
| 560 | never remove the special meaning of @samp{-}, @samp{^} or @samp{]}. | 556 | never remove the special meaning of @samp{-}, @samp{^} or @samp{]}. |
| @@ -475,6 +475,14 @@ symbol, and either that symbol is ':eval' and the second element of | |||
| 475 | the list evaluates to 'nil' or the symbol's value as a variable is | 475 | the list evaluates to 'nil' or the symbol's value as a variable is |
| 476 | 'nil' or void. | 476 | 'nil' or void. |
| 477 | 477 | ||
| 478 | +++ | ||
| 479 | ** Regexp zero-width assertions followed by operators are better defined. | ||
| 480 | Previously, regexps such as "xy\\B*" would have ill-defined behaviour. | ||
| 481 | Now any operator following a zero-width assertion applies to that | ||
| 482 | assertion only (which is useless). For historical compatibility, an | ||
| 483 | operator character following '^' or '\`' becomes literal, but we | ||
| 484 | advise against relying on this. | ||
| 485 | |||
| 478 | 486 | ||
| 479 | * Lisp Changes in Emacs 30.1 | 487 | * Lisp Changes in Emacs 30.1 |
| 480 | 488 | ||
diff --git a/src/regex-emacs.c b/src/regex-emacs.c index fea34df991b..9e298b81ebb 100644 --- a/src/regex-emacs.c +++ b/src/regex-emacs.c | |||
| @@ -1716,7 +1716,8 @@ regex_compile (re_char *pattern, ptrdiff_t size, | |||
| 1716 | 1716 | ||
| 1717 | /* Address of start of the most recently finished expression. | 1717 | /* Address of start of the most recently finished expression. |
| 1718 | This tells, e.g., postfix * where to find the start of its | 1718 | This tells, e.g., postfix * where to find the start of its |
| 1719 | operand. Reset at the beginning of groups and alternatives. */ | 1719 | operand. Reset at the beginning of groups and alternatives, |
| 1720 | and after ^ and \` for dusty-deck compatibility. */ | ||
| 1720 | unsigned char *laststart = 0; | 1721 | unsigned char *laststart = 0; |
| 1721 | 1722 | ||
| 1722 | /* Address of beginning of regexp, or inside of last group. */ | 1723 | /* Address of beginning of regexp, or inside of last group. */ |
| @@ -1847,12 +1848,16 @@ regex_compile (re_char *pattern, ptrdiff_t size, | |||
| 1847 | case '^': | 1848 | case '^': |
| 1848 | if (! (p == pattern + 1 || at_begline_loc_p (pattern, p))) | 1849 | if (! (p == pattern + 1 || at_begline_loc_p (pattern, p))) |
| 1849 | goto normal_char; | 1850 | goto normal_char; |
| 1851 | /* Special case for compatibility: postfix ops after ^ become | ||
| 1852 | literals. */ | ||
| 1853 | laststart = 0; | ||
| 1850 | BUF_PUSH (begline); | 1854 | BUF_PUSH (begline); |
| 1851 | break; | 1855 | break; |
| 1852 | 1856 | ||
| 1853 | case '$': | 1857 | case '$': |
| 1854 | if (! (p == pend || at_endline_loc_p (p, pend))) | 1858 | if (! (p == pend || at_endline_loc_p (p, pend))) |
| 1855 | goto normal_char; | 1859 | goto normal_char; |
| 1860 | laststart = b; | ||
| 1856 | BUF_PUSH (endline); | 1861 | BUF_PUSH (endline); |
| 1857 | break; | 1862 | break; |
| 1858 | 1863 | ||
| @@ -1892,7 +1897,7 @@ regex_compile (re_char *pattern, ptrdiff_t size, | |||
| 1892 | 1897 | ||
| 1893 | /* Star, etc. applied to an empty pattern is equivalent | 1898 | /* Star, etc. applied to an empty pattern is equivalent |
| 1894 | to an empty pattern. */ | 1899 | to an empty pattern. */ |
| 1895 | if (!laststart || laststart == b) | 1900 | if (laststart == b) |
| 1896 | break; | 1901 | break; |
| 1897 | 1902 | ||
| 1898 | /* Now we know whether or not zero matches is allowed | 1903 | /* Now we know whether or not zero matches is allowed |
| @@ -2544,18 +2549,24 @@ regex_compile (re_char *pattern, ptrdiff_t size, | |||
| 2544 | break; | 2549 | break; |
| 2545 | 2550 | ||
| 2546 | case 'b': | 2551 | case 'b': |
| 2552 | laststart = b; | ||
| 2547 | BUF_PUSH (wordbound); | 2553 | BUF_PUSH (wordbound); |
| 2548 | break; | 2554 | break; |
| 2549 | 2555 | ||
| 2550 | case 'B': | 2556 | case 'B': |
| 2557 | laststart = b; | ||
| 2551 | BUF_PUSH (notwordbound); | 2558 | BUF_PUSH (notwordbound); |
| 2552 | break; | 2559 | break; |
| 2553 | 2560 | ||
| 2554 | case '`': | 2561 | case '`': |
| 2562 | /* Special case for compatibility: postfix ops after \` become | ||
| 2563 | literals, as for ^ (see above). */ | ||
| 2564 | laststart = 0; | ||
| 2555 | BUF_PUSH (begbuf); | 2565 | BUF_PUSH (begbuf); |
| 2556 | break; | 2566 | break; |
| 2557 | 2567 | ||
| 2558 | case '\'': | 2568 | case '\'': |
| 2569 | laststart = b; | ||
| 2559 | BUF_PUSH (endbuf); | 2570 | BUF_PUSH (endbuf); |
| 2560 | break; | 2571 | break; |
| 2561 | 2572 | ||
diff --git a/test/src/regex-emacs-tests.el b/test/src/regex-emacs-tests.el index 52d43775b8e..08a93dbf30e 100644 --- a/test/src/regex-emacs-tests.el +++ b/test/src/regex-emacs-tests.el | |||
| @@ -883,4 +883,70 @@ This evaluates the TESTS test cases from glibc." | |||
| 883 | (should (looking-at "x*\\(=\\|:\\)*")) | 883 | (should (looking-at "x*\\(=\\|:\\)*")) |
| 884 | (should (looking-at "x*=*?")))) | 884 | (should (looking-at "x*=*?")))) |
| 885 | 885 | ||
| 886 | (ert-deftest regexp-tests-zero-width-assertion-repetition () | ||
| 887 | ;; Check compatibility behaviour with repetition operators after | ||
| 888 | ;; certain zero-width assertions (bug#64128). | ||
| 889 | |||
| 890 | ;; This function is just to hide ugly regexps from relint so that it | ||
| 891 | ;; doesn't complain about them. | ||
| 892 | (cl-flet ((smatch (re str) (string-match re str))) | ||
| 893 | ;; Postfix operators after ^ and \` become literals, for historical | ||
| 894 | ;; compatibility. Only the first character of a lazy operator (like *?) | ||
| 895 | ;; becomes a literal. | ||
| 896 | (should (equal (smatch "^*a" "x\n*a") 2)) | ||
| 897 | (should (equal (smatch "^*?a" "x\n*a") 2)) | ||
| 898 | (should (equal (smatch "^*?a" "x\na") 2)) | ||
| 899 | (should (equal (smatch "^*?a" "x\n**a") nil)) | ||
| 900 | |||
| 901 | (should (equal (smatch "\\`*a" "*a") 0)) | ||
| 902 | (should (equal (smatch "\\`*?a" "*a") 0)) | ||
| 903 | (should (equal (smatch "\\`*?a" "a") 0)) | ||
| 904 | (should (equal (smatch "\\`*?a" "**a") nil)) | ||
| 905 | |||
| 906 | ;; Other zero-width assertions are treated as normal elements, so postfix | ||
| 907 | ;; operators apply to them alone (which is pointless but valid). | ||
| 908 | (should (equal (smatch "\\b*!" "*!") 1)) | ||
| 909 | (should (equal (smatch "!\\b+;" "!;") nil)) | ||
| 910 | (should (equal (smatch "!\\b+a" "!a") 0)) | ||
| 911 | |||
| 912 | (should (equal (smatch "\\B*!" "*!") 1)) | ||
| 913 | (should (equal (smatch "!\\B+;" "!;") 0)) | ||
| 914 | (should (equal (smatch "!\\B+a" "!a") nil)) | ||
| 915 | |||
| 916 | (should (equal (smatch "\\<*b" "*b") 1)) | ||
| 917 | (should (equal (smatch "a\\<*b" "ab") 0)) | ||
| 918 | (should (equal (smatch ";\\<*b" ";b") 0)) | ||
| 919 | (should (equal (smatch "a\\<+b" "ab") nil)) | ||
| 920 | (should (equal (smatch ";\\<+b" ";b") 0)) | ||
| 921 | |||
| 922 | (should (equal (smatch "\\>*;" "*;") 1)) | ||
| 923 | (should (equal (smatch "a\\>*b" "ab") 0)) | ||
| 924 | (should (equal (smatch "a\\>*;" "a;") 0)) | ||
| 925 | (should (equal (smatch "a\\>+b" "ab") nil)) | ||
| 926 | (should (equal (smatch "a\\>+;" "a;") 0)) | ||
| 927 | |||
| 928 | (should (equal (smatch "a\\'" "ab") nil)) | ||
| 929 | (should (equal (smatch "b\\'" "ab") 1)) | ||
| 930 | (should (equal (smatch "a\\'*b" "ab") 0)) | ||
| 931 | (should (equal (smatch "a\\'+" "ab") nil)) | ||
| 932 | (should (equal (smatch "b\\'+" "ab") 1)) | ||
| 933 | (should (equal (smatch "\\'+" "+") 1)) | ||
| 934 | |||
| 935 | (should (equal (smatch "\\_<*b" "*b") 1)) | ||
| 936 | (should (equal (smatch "a\\_<*b" "ab") 0)) | ||
| 937 | (should (equal (smatch " \\_<*b" " b") 0)) | ||
| 938 | (should (equal (smatch "a\\_<+b" "ab") nil)) | ||
| 939 | (should (equal (smatch " \\_<+b" " b") 0)) | ||
| 940 | |||
| 941 | (should (equal (smatch "\\_>*;" "*;") 1)) | ||
| 942 | (should (equal (smatch "a\\_>*b" "ab") 0)) | ||
| 943 | (should (equal (smatch "a\\_>* " "a ") 0)) | ||
| 944 | (should (equal (smatch "a\\_>+b" "ab") nil)) | ||
| 945 | (should (equal (smatch "a\\_>+ " "a ") 0)) | ||
| 946 | |||
| 947 | (should (equal (smatch "\\=*b" "*b") 1)) | ||
| 948 | (should (equal (smatch "a\\=*b" "a*b") nil)) | ||
| 949 | (should (equal (smatch "a\\=*b" "ab") 0)) | ||
| 950 | )) | ||
| 951 | |||
| 886 | ;;; regex-emacs-tests.el ends here | 952 | ;;; regex-emacs-tests.el ends here |