diff options
| author | Mattias EngdegÄrd | 2019-02-24 22:12:52 +0100 |
|---|---|---|
| committer | Mattias EngdegÄrd | 2019-03-02 15:35:28 +0100 |
| commit | da758046da74e33273265cd2e72a8aa1a0c9c7e3 (patch) | |
| tree | 4337523f0b56c12d69f27a91ee0a1b61376c0e7e | |
| parent | dbffbe08815644fd30404891ef81496277ed27da (diff) | |
| download | emacs-da758046da74e33273265cd2e72a8aa1a0c9c7e3.tar.gz emacs-da758046da74e33273265cd2e72a8aa1a0c9c7e3.zip | |
rx: fix `or' ordering by adding argument to regexp-opt
The rx `or' form may reorder its arguments in an unpredictable way,
contrary to user expectation, since it sometimes uses `regexp-opt'.
Add a NOREORDER option to `regexp-opt' for preventing it from
producing a reordered regexp (Bug#34641).
* doc/lispref/searching.texi (Regular Expression Functions):
* etc/NEWS (Lisp Changes in Emacs 27.1):
Describe the new regexp-opt NOREORDER argument.
* lisp/emacs-lisp/regexp-opt.el (regexp-opt): Add NOREORDER.
Make no attempt at regexp improvement if the set of strings contains
a prefix of another string.
(regexp-opt--contains-prefix): New.
* lisp/emacs-lisp/rx.el (rx-or): Call regexp-opt with NOREORDER.
* test/lisp/emacs-lisp/rx-tests.el: Test rx `or' form match order.
| -rw-r--r-- | doc/lispref/searching.texi | 13 | ||||
| -rw-r--r-- | etc/NEWS | 7 | ||||
| -rw-r--r-- | lisp/emacs-lisp/regexp-opt.el | 38 | ||||
| -rw-r--r-- | lisp/emacs-lisp/rx.el | 2 | ||||
| -rw-r--r-- | test/lisp/emacs-lisp/rx-tests.el | 13 |
5 files changed, 65 insertions, 8 deletions
diff --git a/doc/lispref/searching.texi b/doc/lispref/searching.texi index cfbd2449b13..fb7f48474d5 100644 --- a/doc/lispref/searching.texi +++ b/doc/lispref/searching.texi | |||
| @@ -950,7 +950,7 @@ whitespace: | |||
| 950 | @end defun | 950 | @end defun |
| 951 | 951 | ||
| 952 | @cindex optimize regexp | 952 | @cindex optimize regexp |
| 953 | @defun regexp-opt strings &optional paren | 953 | @defun regexp-opt strings &optional paren noreorder |
| 954 | This function returns an efficient regular expression that will match | 954 | This function returns an efficient regular expression that will match |
| 955 | any of the strings in the list @var{strings}. This is useful when you | 955 | any of the strings in the list @var{strings}. This is useful when you |
| 956 | need to make matching or searching as fast as possible---for example, | 956 | need to make matching or searching as fast as possible---for example, |
| @@ -985,8 +985,15 @@ if it is necessary to ensure that a postfix operator appended to | |||
| 985 | it will apply to the whole expression. | 985 | it will apply to the whole expression. |
| 986 | @end table | 986 | @end table |
| 987 | 987 | ||
| 988 | The resulting regexp of @code{regexp-opt} is equivalent to but usually | 988 | The optional argument @var{noreorder}, if @code{nil} or omitted, |
| 989 | more efficient than that of a simplified version: | 989 | allows the returned regexp to match the strings in any order. If |
| 990 | non-@code{nil}, the match is guaranteed to be performed in the order | ||
| 991 | given, as if the strings were made into a regexp by joining them with | ||
| 992 | the @samp{\|} operator. | ||
| 993 | |||
| 994 | Up to reordering, the resulting regexp of @code{regexp-opt} is | ||
| 995 | equivalent to but usually more efficient than that of a simplified | ||
| 996 | version: | ||
| 990 | 997 | ||
| 991 | @example | 998 | @example |
| 992 | (defun simplified-regexp-opt (strings &optional paren) | 999 | (defun simplified-regexp-opt (strings &optional paren) |
| @@ -1642,6 +1642,13 @@ MS-Windows. | |||
| 1642 | ** New module environment function 'process_input' to process user | 1642 | ** New module environment function 'process_input' to process user |
| 1643 | input while module code is running. | 1643 | input while module code is running. |
| 1644 | 1644 | ||
| 1645 | +++ | ||
| 1646 | ** The function 'regexp-opt' accepts an additional optional argument. | ||
| 1647 | By default, the regexp returned by 'regexp-opt' may match the strings | ||
| 1648 | in any order. If the new third argument is non-nil, the match is | ||
| 1649 | guaranteed to be performed in the order given, as if the strings were | ||
| 1650 | made into a regexp by joining them with '\|'. | ||
| 1651 | |||
| 1645 | 1652 | ||
| 1646 | * Changes in Emacs 27.1 on Non-Free Operating Systems | 1653 | * Changes in Emacs 27.1 on Non-Free Operating Systems |
| 1647 | 1654 | ||
diff --git a/lisp/emacs-lisp/regexp-opt.el b/lisp/emacs-lisp/regexp-opt.el index 63786c1508c..d0c5f2d3fc4 100644 --- a/lisp/emacs-lisp/regexp-opt.el +++ b/lisp/emacs-lisp/regexp-opt.el | |||
| @@ -84,7 +84,7 @@ | |||
| 84 | ;;; Code: | 84 | ;;; Code: |
| 85 | 85 | ||
| 86 | ;;;###autoload | 86 | ;;;###autoload |
| 87 | (defun regexp-opt (strings &optional paren) | 87 | (defun regexp-opt (strings &optional paren noreorder) |
| 88 | "Return a regexp to match a string in the list STRINGS. | 88 | "Return a regexp to match a string in the list STRINGS. |
| 89 | Each string should be unique in STRINGS and should not contain | 89 | Each string should be unique in STRINGS and should not contain |
| 90 | any regexps, quoted or not. Optional PAREN specifies how the | 90 | any regexps, quoted or not. Optional PAREN specifies how the |
| @@ -111,8 +111,14 @@ nil | |||
| 111 | necessary to ensure that a postfix operator appended to it will | 111 | necessary to ensure that a postfix operator appended to it will |
| 112 | apply to the whole expression. | 112 | apply to the whole expression. |
| 113 | 113 | ||
| 114 | The resulting regexp is equivalent to but usually more efficient | 114 | The optional argument NOREORDER, if nil or omitted, allows the |
| 115 | than that of a simplified version: | 115 | returned regexp to match the strings in any order. If non-nil, |
| 116 | the match is guaranteed to be performed in the order given, as if | ||
| 117 | the strings were made into a regexp by joining them with the | ||
| 118 | `\\|' operator. | ||
| 119 | |||
| 120 | Up to reordering, the resulting regexp is equivalent to but | ||
| 121 | usually more efficient than that of a simplified version: | ||
| 116 | 122 | ||
| 117 | (defun simplified-regexp-opt (strings &optional paren) | 123 | (defun simplified-regexp-opt (strings &optional paren) |
| 118 | (let ((parens | 124 | (let ((parens |
| @@ -133,7 +139,15 @@ than that of a simplified version: | |||
| 133 | (open (cond ((stringp paren) paren) (paren "\\("))) | 139 | (open (cond ((stringp paren) paren) (paren "\\("))) |
| 134 | (sorted-strings (delete-dups | 140 | (sorted-strings (delete-dups |
| 135 | (sort (copy-sequence strings) 'string-lessp))) | 141 | (sort (copy-sequence strings) 'string-lessp))) |
| 136 | (re (regexp-opt-group sorted-strings (or open t) (not open)))) | 142 | (re |
| 143 | ;; If NOREORDER is non-nil and the list contains a prefix | ||
| 144 | ;; of another string, we give up all attempts at optimisation. | ||
| 145 | ;; There is plenty of room for improvement (Bug#34641). | ||
| 146 | (if (and noreorder (regexp-opt--contains-prefix sorted-strings)) | ||
| 147 | (concat (or open "\\(?:") | ||
| 148 | (mapconcat #'regexp-quote strings "\\|") | ||
| 149 | "\\)") | ||
| 150 | (regexp-opt-group sorted-strings (or open t) (not open))))) | ||
| 137 | (cond ((eq paren 'words) | 151 | (cond ((eq paren 'words) |
| 138 | (concat "\\<" re "\\>")) | 152 | (concat "\\<" re "\\>")) |
| 139 | ((eq paren 'symbols) | 153 | ((eq paren 'symbols) |
| @@ -313,6 +327,22 @@ CHARS should be a list of characters." | |||
| 313 | (concat "[" dash caret "]")) | 327 | (concat "[" dash caret "]")) |
| 314 | (concat "[" bracket charset caret dash "]")))) | 328 | (concat "[" bracket charset caret dash "]")))) |
| 315 | 329 | ||
| 330 | |||
| 331 | (defun regexp-opt--contains-prefix (strings) | ||
| 332 | "Whether STRINGS contains a proper prefix of one of its other elements. | ||
| 333 | STRINGS must be a list of sorted strings without duplicates." | ||
| 334 | (let ((s strings)) | ||
| 335 | ;; In a lexicographically sorted list, a string always immediately | ||
| 336 | ;; succeeds one of its prefixes. | ||
| 337 | (while (and (cdr s) | ||
| 338 | (not (string-equal | ||
| 339 | (car s) | ||
| 340 | (substring (cadr s) 0 (min (length (car s)) | ||
| 341 | (length (cadr s))))))) | ||
| 342 | (setq s (cdr s))) | ||
| 343 | (cdr s))) | ||
| 344 | |||
| 345 | |||
| 316 | (provide 'regexp-opt) | 346 | (provide 'regexp-opt) |
| 317 | 347 | ||
| 318 | ;;; regexp-opt.el ends here | 348 | ;;; regexp-opt.el ends here |
diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el index 715cd608c46..ca756efb493 100644 --- a/lisp/emacs-lisp/rx.el +++ b/lisp/emacs-lisp/rx.el | |||
| @@ -393,7 +393,7 @@ FORM is of the form `(and FORM1 ...)'." | |||
| 393 | (rx-group-if | 393 | (rx-group-if |
| 394 | (if (memq nil (mapcar 'stringp (cdr form))) | 394 | (if (memq nil (mapcar 'stringp (cdr form))) |
| 395 | (mapconcat (lambda (x) (rx-form x '|)) (cdr form) "\\|") | 395 | (mapconcat (lambda (x) (rx-form x '|)) (cdr form) "\\|") |
| 396 | (regexp-opt (cdr form))) | 396 | (regexp-opt (cdr form) nil t)) |
| 397 | (and (memq rx-parent '(: * t)) rx-parent))) | 397 | (and (memq rx-parent '(: * t)) rx-parent))) |
| 398 | 398 | ||
| 399 | 399 | ||
diff --git a/test/lisp/emacs-lisp/rx-tests.el b/test/lisp/emacs-lisp/rx-tests.el index e14feda347f..fa3d9b0d5ea 100644 --- a/test/lisp/emacs-lisp/rx-tests.el +++ b/test/lisp/emacs-lisp/rx-tests.el | |||
| @@ -92,5 +92,18 @@ | |||
| 92 | (*? "e") (+? "f") (\?? "g") (?? "h")))) | 92 | (*? "e") (+? "f") (\?? "g") (?? "h")))) |
| 93 | "a*b+c?d?e*?f+?g??h??"))) | 93 | "a*b+c?d?e*?f+?g??h??"))) |
| 94 | 94 | ||
| 95 | (ert-deftest rx-or () | ||
| 96 | ;; Test or-pattern reordering (Bug#34641). | ||
| 97 | (let ((s "abc")) | ||
| 98 | (should (equal (and (string-match (rx (or "abc" "ab" "a")) s) | ||
| 99 | (match-string 0 s)) | ||
| 100 | "abc")) | ||
| 101 | (should (equal (and (string-match (rx (or "ab" "abc" "a")) s) | ||
| 102 | (match-string 0 s)) | ||
| 103 | "ab")) | ||
| 104 | (should (equal (and (string-match (rx (or "a" "ab" "abc")) s) | ||
| 105 | (match-string 0 s)) | ||
| 106 | "a")))) | ||
| 107 | |||
| 95 | (provide 'rx-tests) | 108 | (provide 'rx-tests) |
| 96 | ;; rx-tests.el ends here. | 109 | ;; rx-tests.el ends here. |