aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMattias EngdegÄrd2019-02-24 22:12:52 +0100
committerMattias EngdegÄrd2019-03-02 15:35:28 +0100
commitda758046da74e33273265cd2e72a8aa1a0c9c7e3 (patch)
tree4337523f0b56c12d69f27a91ee0a1b61376c0e7e
parentdbffbe08815644fd30404891ef81496277ed27da (diff)
downloademacs-da758046da74e33273265cd2e72a8aa1a0c9c7e3.tar.gz
emacs-da758046da74e33273265cd2e72a8aa1a0c9c7e3.zip
rx: fix `or' ordering by adding argument to regexp-opt
The rx `or' form may reorder its arguments in an unpredictable way, contrary to user expectation, since it sometimes uses `regexp-opt'. Add a NOREORDER option to `regexp-opt' for preventing it from producing a reordered regexp (Bug#34641). * doc/lispref/searching.texi (Regular Expression Functions): * etc/NEWS (Lisp Changes in Emacs 27.1): Describe the new regexp-opt NOREORDER argument. * lisp/emacs-lisp/regexp-opt.el (regexp-opt): Add NOREORDER. Make no attempt at regexp improvement if the set of strings contains a prefix of another string. (regexp-opt--contains-prefix): New. * lisp/emacs-lisp/rx.el (rx-or): Call regexp-opt with NOREORDER. * test/lisp/emacs-lisp/rx-tests.el: Test rx `or' form match order.
-rw-r--r--doc/lispref/searching.texi13
-rw-r--r--etc/NEWS7
-rw-r--r--lisp/emacs-lisp/regexp-opt.el38
-rw-r--r--lisp/emacs-lisp/rx.el2
-rw-r--r--test/lisp/emacs-lisp/rx-tests.el13
5 files changed, 65 insertions, 8 deletions
diff --git a/doc/lispref/searching.texi b/doc/lispref/searching.texi
index cfbd2449b13..fb7f48474d5 100644
--- a/doc/lispref/searching.texi
+++ b/doc/lispref/searching.texi
@@ -950,7 +950,7 @@ whitespace:
950@end defun 950@end defun
951 951
952@cindex optimize regexp 952@cindex optimize regexp
953@defun regexp-opt strings &optional paren 953@defun regexp-opt strings &optional paren noreorder
954This function returns an efficient regular expression that will match 954This function returns an efficient regular expression that will match
955any of the strings in the list @var{strings}. This is useful when you 955any of the strings in the list @var{strings}. This is useful when you
956need to make matching or searching as fast as possible---for example, 956need to make matching or searching as fast as possible---for example,
@@ -985,8 +985,15 @@ if it is necessary to ensure that a postfix operator appended to
985it will apply to the whole expression. 985it will apply to the whole expression.
986@end table 986@end table
987 987
988The resulting regexp of @code{regexp-opt} is equivalent to but usually 988The optional argument @var{noreorder}, if @code{nil} or omitted,
989more efficient than that of a simplified version: 989allows the returned regexp to match the strings in any order. If
990non-@code{nil}, the match is guaranteed to be performed in the order
991given, as if the strings were made into a regexp by joining them with
992the @samp{\|} operator.
993
994Up to reordering, the resulting regexp of @code{regexp-opt} is
995equivalent to but usually more efficient than that of a simplified
996version:
990 997
991@example 998@example
992(defun simplified-regexp-opt (strings &optional paren) 999(defun simplified-regexp-opt (strings &optional paren)
diff --git a/etc/NEWS b/etc/NEWS
index 29ed7ab4819..7c95988ff52 100644
--- a/etc/NEWS
+++ b/etc/NEWS
@@ -1642,6 +1642,13 @@ MS-Windows.
1642** New module environment function 'process_input' to process user 1642** New module environment function 'process_input' to process user
1643input while module code is running. 1643input while module code is running.
1644 1644
1645+++
1646** The function 'regexp-opt' accepts an additional optional argument.
1647By default, the regexp returned by 'regexp-opt' may match the strings
1648in any order. If the new third argument is non-nil, the match is
1649guaranteed to be performed in the order given, as if the strings were
1650made into a regexp by joining them with '\|'.
1651
1645 1652
1646* Changes in Emacs 27.1 on Non-Free Operating Systems 1653* Changes in Emacs 27.1 on Non-Free Operating Systems
1647 1654
diff --git a/lisp/emacs-lisp/regexp-opt.el b/lisp/emacs-lisp/regexp-opt.el
index 63786c1508c..d0c5f2d3fc4 100644
--- a/lisp/emacs-lisp/regexp-opt.el
+++ b/lisp/emacs-lisp/regexp-opt.el
@@ -84,7 +84,7 @@
84;;; Code: 84;;; Code:
85 85
86;;;###autoload 86;;;###autoload
87(defun regexp-opt (strings &optional paren) 87(defun regexp-opt (strings &optional paren noreorder)
88 "Return a regexp to match a string in the list STRINGS. 88 "Return a regexp to match a string in the list STRINGS.
89Each string should be unique in STRINGS and should not contain 89Each string should be unique in STRINGS and should not contain
90any regexps, quoted or not. Optional PAREN specifies how the 90any regexps, quoted or not. Optional PAREN specifies how the
@@ -111,8 +111,14 @@ nil
111 necessary to ensure that a postfix operator appended to it will 111 necessary to ensure that a postfix operator appended to it will
112 apply to the whole expression. 112 apply to the whole expression.
113 113
114The resulting regexp is equivalent to but usually more efficient 114The optional argument NOREORDER, if nil or omitted, allows the
115than that of a simplified version: 115returned regexp to match the strings in any order. If non-nil,
116the match is guaranteed to be performed in the order given, as if
117the strings were made into a regexp by joining them with the
118`\\|' operator.
119
120Up to reordering, the resulting regexp is equivalent to but
121usually more efficient than that of a simplified version:
116 122
117 (defun simplified-regexp-opt (strings &optional paren) 123 (defun simplified-regexp-opt (strings &optional paren)
118 (let ((parens 124 (let ((parens
@@ -133,7 +139,15 @@ than that of a simplified version:
133 (open (cond ((stringp paren) paren) (paren "\\("))) 139 (open (cond ((stringp paren) paren) (paren "\\(")))
134 (sorted-strings (delete-dups 140 (sorted-strings (delete-dups
135 (sort (copy-sequence strings) 'string-lessp))) 141 (sort (copy-sequence strings) 'string-lessp)))
136 (re (regexp-opt-group sorted-strings (or open t) (not open)))) 142 (re
143 ;; If NOREORDER is non-nil and the list contains a prefix
144 ;; of another string, we give up all attempts at optimisation.
145 ;; There is plenty of room for improvement (Bug#34641).
146 (if (and noreorder (regexp-opt--contains-prefix sorted-strings))
147 (concat (or open "\\(?:")
148 (mapconcat #'regexp-quote strings "\\|")
149 "\\)")
150 (regexp-opt-group sorted-strings (or open t) (not open)))))
137 (cond ((eq paren 'words) 151 (cond ((eq paren 'words)
138 (concat "\\<" re "\\>")) 152 (concat "\\<" re "\\>"))
139 ((eq paren 'symbols) 153 ((eq paren 'symbols)
@@ -313,6 +327,22 @@ CHARS should be a list of characters."
313 (concat "[" dash caret "]")) 327 (concat "[" dash caret "]"))
314 (concat "[" bracket charset caret dash "]")))) 328 (concat "[" bracket charset caret dash "]"))))
315 329
330
331(defun regexp-opt--contains-prefix (strings)
332 "Whether STRINGS contains a proper prefix of one of its other elements.
333STRINGS must be a list of sorted strings without duplicates."
334 (let ((s strings))
335 ;; In a lexicographically sorted list, a string always immediately
336 ;; succeeds one of its prefixes.
337 (while (and (cdr s)
338 (not (string-equal
339 (car s)
340 (substring (cadr s) 0 (min (length (car s))
341 (length (cadr s)))))))
342 (setq s (cdr s)))
343 (cdr s)))
344
345
316(provide 'regexp-opt) 346(provide 'regexp-opt)
317 347
318;;; regexp-opt.el ends here 348;;; regexp-opt.el ends here
diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el
index 715cd608c46..ca756efb493 100644
--- a/lisp/emacs-lisp/rx.el
+++ b/lisp/emacs-lisp/rx.el
@@ -393,7 +393,7 @@ FORM is of the form `(and FORM1 ...)'."
393 (rx-group-if 393 (rx-group-if
394 (if (memq nil (mapcar 'stringp (cdr form))) 394 (if (memq nil (mapcar 'stringp (cdr form)))
395 (mapconcat (lambda (x) (rx-form x '|)) (cdr form) "\\|") 395 (mapconcat (lambda (x) (rx-form x '|)) (cdr form) "\\|")
396 (regexp-opt (cdr form))) 396 (regexp-opt (cdr form) nil t))
397 (and (memq rx-parent '(: * t)) rx-parent))) 397 (and (memq rx-parent '(: * t)) rx-parent)))
398 398
399 399
diff --git a/test/lisp/emacs-lisp/rx-tests.el b/test/lisp/emacs-lisp/rx-tests.el
index e14feda347f..fa3d9b0d5ea 100644
--- a/test/lisp/emacs-lisp/rx-tests.el
+++ b/test/lisp/emacs-lisp/rx-tests.el
@@ -92,5 +92,18 @@
92 (*? "e") (+? "f") (\?? "g") (?? "h")))) 92 (*? "e") (+? "f") (\?? "g") (?? "h"))))
93 "a*b+c?d?e*?f+?g??h??"))) 93 "a*b+c?d?e*?f+?g??h??")))
94 94
95(ert-deftest rx-or ()
96 ;; Test or-pattern reordering (Bug#34641).
97 (let ((s "abc"))
98 (should (equal (and (string-match (rx (or "abc" "ab" "a")) s)
99 (match-string 0 s))
100 "abc"))
101 (should (equal (and (string-match (rx (or "ab" "abc" "a")) s)
102 (match-string 0 s))
103 "ab"))
104 (should (equal (and (string-match (rx (or "a" "ab" "abc")) s)
105 (match-string 0 s))
106 "a"))))
107
95(provide 'rx-tests) 108(provide 'rx-tests)
96;; rx-tests.el ends here. 109;; rx-tests.el ends here.