aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMattias EngdegÄrd2020-02-11 20:04:42 +0100
committerMattias EngdegÄrd2020-03-01 10:47:14 +0100
commit49d3cd90bd80a225d5ec26027318ffb4606ff513 (patch)
tree13ba7b7faec7014b626d1c4dfdbfec98df44631f
parent6b48aedb6b3b1de0b41b61b727d14ab8277d2f73 (diff)
downloademacs-49d3cd90bd80a225d5ec26027318ffb4606ff513.tar.gz
emacs-49d3cd90bd80a225d5ec26027318ffb4606ff513.zip
rx: Improve 'or' compositionality (bug#37659)
Perform 'regexp-opt' on nested 'or' forms, and after expansion of user-defined and 'eval' forms. Characters are now turned into strings for wider 'regexp-opt' scope. This preserves the longest-match semantics for string in 'or' forms over composition. * doc/lispref/searching.texi (Rx Constructs): Document. * lisp/emacs-lisp/rx.el (rx--normalise-or-arg) (rx--all-string-or-args): New. (rx--translate-or): Normalise arguments first, and check for strings in subforms. (rx--expand-eval): Extracted from rx--translate-eval. (rx--translate-eval): Call rx--expand-eval. * test/lisp/emacs-lisp/rx-tests.el (rx-or, rx-def-in-or): Add tests. * etc/NEWS: Announce.
-rw-r--r--doc/lispref/searching.texi5
-rw-r--r--etc/NEWS6
-rw-r--r--lisp/emacs-lisp/rx.el76
-rw-r--r--test/lisp/emacs-lisp/rx-tests.el13
4 files changed, 69 insertions, 31 deletions
diff --git a/doc/lispref/searching.texi b/doc/lispref/searching.texi
index a4d5a27203f..1a090ebe101 100644
--- a/doc/lispref/searching.texi
+++ b/doc/lispref/searching.texi
@@ -1086,8 +1086,9 @@ Corresponding string regexp: @samp{@var{A}@var{B}@dots{}}
1086@itemx @code{(| @var{rx}@dots{})} 1086@itemx @code{(| @var{rx}@dots{})}
1087@cindex @code{|} in rx 1087@cindex @code{|} in rx
1088Match exactly one of the @var{rx}s. 1088Match exactly one of the @var{rx}s.
1089If all arguments are string literals, the longest possible match 1089If all arguments are strings, characters, or @code{or} forms
1090will always be used. Otherwise, either the longest match or the 1090so constrained, the longest possible match will always be used.
1091Otherwise, either the longest match or the
1091first (in left-to-right order) will be used. 1092first (in left-to-right order) will be used.
1092Without arguments, the expression will not match anything at all.@* 1093Without arguments, the expression will not match anything at all.@*
1093Corresponding string regexp: @samp{@var{A}\|@var{B}\|@dots{}}. 1094Corresponding string regexp: @samp{@var{A}\|@var{B}\|@dots{}}.
diff --git a/etc/NEWS b/etc/NEWS
index e9dfd266b46..6e2b1fe00e2 100644
--- a/etc/NEWS
+++ b/etc/NEWS
@@ -2325,6 +2325,12 @@ expressions from simpler parts.
2325+++ 2325+++
2326*** 'not' argument can now be a character or single-char string. 2326*** 'not' argument can now be a character or single-char string.
2327 2327
2328+++
2329*** Nested 'or' forms of strings guarantee a longest match.
2330For example, (or (or "IN" "OUT") (or "INPUT" "OUTPUT")) now matches
2331the whole string "INPUT" if present, not just "IN". Previously, this
2332was only guaranteed inside a single 'or' form of string literals.
2333
2328** Frames 2334** Frames
2329 2335
2330+++ 2336+++
diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el
index 1ee5e8294a6..a0b2444346a 100644
--- a/lisp/emacs-lisp/rx.el
+++ b/lisp/emacs-lisp/rx.el
@@ -254,22 +254,39 @@ Left-fold the list L, starting with X, by the binary function F."
254 (setq l (cdr l))) 254 (setq l (cdr l)))
255 x) 255 x)
256 256
257(defun rx--normalise-or-arg (form)
258 "Normalise the `or' argument FORM.
259Characters become strings, user-definitions and `eval' forms are expanded,
260and `or' forms are normalised recursively."
261 (cond ((characterp form)
262 (char-to-string form))
263 ((and (consp form) (memq (car form) '(or |)))
264 (cons (car form) (mapcar #'rx--normalise-or-arg (cdr form))))
265 ((and (consp form) (eq (car form) 'eval))
266 (rx--normalise-or-arg (rx--expand-eval (cdr form))))
267 (t
268 (let ((expanded (rx--expand-def form)))
269 (if expanded
270 (rx--normalise-or-arg expanded)
271 form)))))
272
273(defun rx--all-string-or-args (body)
274 "If BODY only consists of strings or such `or' forms, return all the strings.
275Otherwise throw `rx--nonstring'."
276 (mapcan (lambda (form)
277 (cond ((stringp form) (list form))
278 ((and (consp form) (memq (car form) '(or |)))
279 (rx--all-string-or-args (cdr form)))
280 (t (throw 'rx--nonstring nil))))
281 body))
282
257(defun rx--translate-or (body) 283(defun rx--translate-or (body)
258 "Translate an or-pattern of zero or more rx items. 284 "Translate an or-pattern of zero or more rx items.
259Return (REGEXP . PRECEDENCE)." 285Return (REGEXP . PRECEDENCE)."
260 ;; FIXME: Possible improvements: 286 ;; FIXME: Possible improvements:
261 ;; 287 ;;
262 ;; - Turn single characters to strings: (or ?a ?b) -> (or "a" "b"),
263 ;; so that they can be candidates for regexp-opt.
264 ;;
265 ;; - Translate compile-time strings (`eval' forms), again for regexp-opt.
266 ;;
267 ;; - Flatten sub-patterns first: (or (or A B) (or C D)) -> (or A B C D) 288 ;; - Flatten sub-patterns first: (or (or A B) (or C D)) -> (or A B C D)
268 ;; in order to improve effectiveness of regexp-opt. 289 ;; Then call regexp-opt on runs of string arguments. Example:
269 ;; This would also help composability.
270 ;;
271 ;; - Use associativity to run regexp-opt on contiguous subsets of arguments
272 ;; if not all of them are strings. Example:
273 ;; (or (+ digit) "CHARLIE" "CHAN" (+ blank)) 290 ;; (or (+ digit) "CHARLIE" "CHAN" (+ blank))
274 ;; -> (or (+ digit) (or "CHARLIE" "CHAN") (+ blank)) 291 ;; -> (or (+ digit) (or "CHARLIE" "CHAN") (+ blank))
275 ;; 292 ;;
@@ -279,27 +296,26 @@ Return (REGEXP . PRECEDENCE)."
279 ;; so that (or "@" "%" digit (any "A-Z" space) (syntax word)) 296 ;; so that (or "@" "%" digit (any "A-Z" space) (syntax word))
280 ;; -> (any "@" "%" digit "A-Z" space word) 297 ;; -> (any "@" "%" digit "A-Z" space word)
281 ;; -> "[A-Z@%[:digit:][:space:][:word:]]" 298 ;; -> "[A-Z@%[:digit:][:space:][:word:]]"
282 ;;
283 ;; Problem: If a subpattern is carefully written to be
284 ;; optimizable by regexp-opt, how do we prevent the transforms
285 ;; above from destroying that property?
286 ;; Example: (or "a" (or "abc" "abd" "abe"))
287 (cond 299 (cond
288 ((null body) ; No items: a never-matching regexp. 300 ((null body) ; No items: a never-matching regexp.
289 (rx--empty)) 301 (rx--empty))
290 ((null (cdr body)) ; Single item. 302 ((null (cdr body)) ; Single item.
291 (rx--translate (car body))) 303 (rx--translate (car body)))
292 ((rx--every #'stringp body) ; All strings.
293 (cons (list (regexp-opt body nil))
294 t))
295 ((rx--every #'rx--charset-p body) ; All charsets.
296 (rx--translate-union nil body))
297 (t 304 (t
298 (cons (append (car (rx--translate (car body))) 305 (let* ((args (mapcar #'rx--normalise-or-arg body))
299 (mapcan (lambda (item) 306 (all-strings (catch 'rx--nonstring (rx--all-string-or-args args))))
300 (cons "\\|" (car (rx--translate item)))) 307 (cond
301 (cdr body))) 308 (all-strings ; Only strings.
302 nil)))) 309 (cons (list (regexp-opt all-strings nil))
310 t))
311 ((rx--every #'rx--charset-p args) ; All charsets.
312 (rx--translate-union nil args))
313 (t
314 (cons (append (car (rx--translate (car args)))
315 (mapcan (lambda (item)
316 (cons "\\|" (car (rx--translate item))))
317 (cdr args)))
318 nil)))))))
303 319
304(defun rx--charset-p (form) 320(defun rx--charset-p (form)
305 "Whether FORM looks like a charset, only consisting of character intervals 321 "Whether FORM looks like a charset, only consisting of character intervals
@@ -840,11 +856,15 @@ Return (REGEXP . PRECEDENCE)."
840 (cons (list (list 'regexp-quote arg)) 'seq)) 856 (cons (list (list 'regexp-quote arg)) 'seq))
841 (t (error "rx `literal' form with non-string argument"))))) 857 (t (error "rx `literal' form with non-string argument")))))
842 858
843(defun rx--translate-eval (body) 859(defun rx--expand-eval (body)
844 "Translate the `eval' form. Return (REGEXP . PRECEDENCE)." 860 "Expand `eval' arguments. Return a new rx form."
845 (unless (and body (null (cdr body))) 861 (unless (and body (null (cdr body)))
846 (error "rx `eval' form takes exactly one argument")) 862 (error "rx `eval' form takes exactly one argument"))
847 (rx--translate (eval (car body)))) 863 (eval (car body)))
864
865(defun rx--translate-eval (body)
866 "Translate the `eval' form. Return (REGEXP . PRECEDENCE)."
867 (rx--translate (rx--expand-eval body)))
848 868
849(defvar rx--regexp-atomic-regexp nil) 869(defvar rx--regexp-atomic-regexp nil)
850 870
diff --git a/test/lisp/emacs-lisp/rx-tests.el b/test/lisp/emacs-lisp/rx-tests.el
index 2e34d65a9aa..4888e1d9d1e 100644
--- a/test/lisp/emacs-lisp/rx-tests.el
+++ b/test/lisp/emacs-lisp/rx-tests.el
@@ -42,13 +42,24 @@
42(ert-deftest rx-or () 42(ert-deftest rx-or ()
43 (should (equal (rx (or "ab" (| "c" nonl) "de")) 43 (should (equal (rx (or "ab" (| "c" nonl) "de"))
44 "ab\\|c\\|.\\|de")) 44 "ab\\|c\\|.\\|de"))
45 (should (equal (rx (or "ab" "abc" "a")) 45 (should (equal (rx (or "ab" "abc" ?a))
46 "\\(?:a\\(?:bc?\\)?\\)")) 46 "\\(?:a\\(?:bc?\\)?\\)"))
47 (should (equal (rx (or "ab" (| (or "abcd" "abcde")) (or "a" "abc")))
48 "\\(?:a\\(?:b\\(?:c\\(?:de?\\)?\\)?\\)?\\)"))
49 (should (equal (rx (or "a" (eval (string ?a ?b))))
50 "\\(?:ab?\\)"))
47 (should (equal (rx (| nonl "a") (| "b" blank)) 51 (should (equal (rx (| nonl "a") (| "b" blank))
48 "\\(?:.\\|a\\)\\(?:b\\|[[:blank:]]\\)")) 52 "\\(?:.\\|a\\)\\(?:b\\|[[:blank:]]\\)"))
49 (should (equal (rx (|)) 53 (should (equal (rx (|))
50 "\\`a\\`"))) 54 "\\`a\\`")))
51 55
56(ert-deftest rx-def-in-or ()
57 (rx-let ((a b)
58 (b (or "abc" c))
59 (c ?a))
60 (should (equal (rx (or a (| "ab" "abcde") "abcd"))
61 "\\(?:a\\(?:b\\(?:c\\(?:de?\\)?\\)?\\)?\\)"))))
62
52(ert-deftest rx-char-any () 63(ert-deftest rx-char-any ()
53 "Test character alternatives with `]' and `-' (Bug#25123)." 64 "Test character alternatives with `]' and `-' (Bug#25123)."
54 (should (equal 65 (should (equal