diff options
| author | Mattias EngdegÄrd | 2020-02-11 20:04:42 +0100 |
|---|---|---|
| committer | Mattias EngdegÄrd | 2020-03-01 10:47:14 +0100 |
| commit | 49d3cd90bd80a225d5ec26027318ffb4606ff513 (patch) | |
| tree | 13ba7b7faec7014b626d1c4dfdbfec98df44631f | |
| parent | 6b48aedb6b3b1de0b41b61b727d14ab8277d2f73 (diff) | |
| download | emacs-49d3cd90bd80a225d5ec26027318ffb4606ff513.tar.gz emacs-49d3cd90bd80a225d5ec26027318ffb4606ff513.zip | |
rx: Improve 'or' compositionality (bug#37659)
Perform 'regexp-opt' on nested 'or' forms, and after expansion of
user-defined and 'eval' forms. Characters are now turned into strings
for wider 'regexp-opt' scope. This preserves the longest-match
semantics for string in 'or' forms over composition.
* doc/lispref/searching.texi (Rx Constructs): Document.
* lisp/emacs-lisp/rx.el (rx--normalise-or-arg)
(rx--all-string-or-args): New.
(rx--translate-or): Normalise arguments first, and check for strings
in subforms.
(rx--expand-eval): Extracted from rx--translate-eval.
(rx--translate-eval): Call rx--expand-eval.
* test/lisp/emacs-lisp/rx-tests.el (rx-or, rx-def-in-or): Add tests.
* etc/NEWS: Announce.
| -rw-r--r-- | doc/lispref/searching.texi | 5 | ||||
| -rw-r--r-- | etc/NEWS | 6 | ||||
| -rw-r--r-- | lisp/emacs-lisp/rx.el | 76 | ||||
| -rw-r--r-- | test/lisp/emacs-lisp/rx-tests.el | 13 |
4 files changed, 69 insertions, 31 deletions
diff --git a/doc/lispref/searching.texi b/doc/lispref/searching.texi index a4d5a27203f..1a090ebe101 100644 --- a/doc/lispref/searching.texi +++ b/doc/lispref/searching.texi | |||
| @@ -1086,8 +1086,9 @@ Corresponding string regexp: @samp{@var{A}@var{B}@dots{}} | |||
| 1086 | @itemx @code{(| @var{rx}@dots{})} | 1086 | @itemx @code{(| @var{rx}@dots{})} |
| 1087 | @cindex @code{|} in rx | 1087 | @cindex @code{|} in rx |
| 1088 | Match exactly one of the @var{rx}s. | 1088 | Match exactly one of the @var{rx}s. |
| 1089 | If all arguments are string literals, the longest possible match | 1089 | If all arguments are strings, characters, or @code{or} forms |
| 1090 | will always be used. Otherwise, either the longest match or the | 1090 | so constrained, the longest possible match will always be used. |
| 1091 | Otherwise, either the longest match or the | ||
| 1091 | first (in left-to-right order) will be used. | 1092 | first (in left-to-right order) will be used. |
| 1092 | Without arguments, the expression will not match anything at all.@* | 1093 | Without arguments, the expression will not match anything at all.@* |
| 1093 | Corresponding string regexp: @samp{@var{A}\|@var{B}\|@dots{}}. | 1094 | Corresponding string regexp: @samp{@var{A}\|@var{B}\|@dots{}}. |
| @@ -2325,6 +2325,12 @@ expressions from simpler parts. | |||
| 2325 | +++ | 2325 | +++ |
| 2326 | *** 'not' argument can now be a character or single-char string. | 2326 | *** 'not' argument can now be a character or single-char string. |
| 2327 | 2327 | ||
| 2328 | +++ | ||
| 2329 | *** Nested 'or' forms of strings guarantee a longest match. | ||
| 2330 | For example, (or (or "IN" "OUT") (or "INPUT" "OUTPUT")) now matches | ||
| 2331 | the whole string "INPUT" if present, not just "IN". Previously, this | ||
| 2332 | was only guaranteed inside a single 'or' form of string literals. | ||
| 2333 | |||
| 2328 | ** Frames | 2334 | ** Frames |
| 2329 | 2335 | ||
| 2330 | +++ | 2336 | +++ |
diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el index 1ee5e8294a6..a0b2444346a 100644 --- a/lisp/emacs-lisp/rx.el +++ b/lisp/emacs-lisp/rx.el | |||
| @@ -254,22 +254,39 @@ Left-fold the list L, starting with X, by the binary function F." | |||
| 254 | (setq l (cdr l))) | 254 | (setq l (cdr l))) |
| 255 | x) | 255 | x) |
| 256 | 256 | ||
| 257 | (defun rx--normalise-or-arg (form) | ||
| 258 | "Normalise the `or' argument FORM. | ||
| 259 | Characters become strings, user-definitions and `eval' forms are expanded, | ||
| 260 | and `or' forms are normalised recursively." | ||
| 261 | (cond ((characterp form) | ||
| 262 | (char-to-string form)) | ||
| 263 | ((and (consp form) (memq (car form) '(or |))) | ||
| 264 | (cons (car form) (mapcar #'rx--normalise-or-arg (cdr form)))) | ||
| 265 | ((and (consp form) (eq (car form) 'eval)) | ||
| 266 | (rx--normalise-or-arg (rx--expand-eval (cdr form)))) | ||
| 267 | (t | ||
| 268 | (let ((expanded (rx--expand-def form))) | ||
| 269 | (if expanded | ||
| 270 | (rx--normalise-or-arg expanded) | ||
| 271 | form))))) | ||
| 272 | |||
| 273 | (defun rx--all-string-or-args (body) | ||
| 274 | "If BODY only consists of strings or such `or' forms, return all the strings. | ||
| 275 | Otherwise throw `rx--nonstring'." | ||
| 276 | (mapcan (lambda (form) | ||
| 277 | (cond ((stringp form) (list form)) | ||
| 278 | ((and (consp form) (memq (car form) '(or |))) | ||
| 279 | (rx--all-string-or-args (cdr form))) | ||
| 280 | (t (throw 'rx--nonstring nil)))) | ||
| 281 | body)) | ||
| 282 | |||
| 257 | (defun rx--translate-or (body) | 283 | (defun rx--translate-or (body) |
| 258 | "Translate an or-pattern of zero or more rx items. | 284 | "Translate an or-pattern of zero or more rx items. |
| 259 | Return (REGEXP . PRECEDENCE)." | 285 | Return (REGEXP . PRECEDENCE)." |
| 260 | ;; FIXME: Possible improvements: | 286 | ;; FIXME: Possible improvements: |
| 261 | ;; | 287 | ;; |
| 262 | ;; - Turn single characters to strings: (or ?a ?b) -> (or "a" "b"), | ||
| 263 | ;; so that they can be candidates for regexp-opt. | ||
| 264 | ;; | ||
| 265 | ;; - Translate compile-time strings (`eval' forms), again for regexp-opt. | ||
| 266 | ;; | ||
| 267 | ;; - Flatten sub-patterns first: (or (or A B) (or C D)) -> (or A B C D) | 288 | ;; - Flatten sub-patterns first: (or (or A B) (or C D)) -> (or A B C D) |
| 268 | ;; in order to improve effectiveness of regexp-opt. | 289 | ;; Then call regexp-opt on runs of string arguments. Example: |
| 269 | ;; This would also help composability. | ||
| 270 | ;; | ||
| 271 | ;; - Use associativity to run regexp-opt on contiguous subsets of arguments | ||
| 272 | ;; if not all of them are strings. Example: | ||
| 273 | ;; (or (+ digit) "CHARLIE" "CHAN" (+ blank)) | 290 | ;; (or (+ digit) "CHARLIE" "CHAN" (+ blank)) |
| 274 | ;; -> (or (+ digit) (or "CHARLIE" "CHAN") (+ blank)) | 291 | ;; -> (or (+ digit) (or "CHARLIE" "CHAN") (+ blank)) |
| 275 | ;; | 292 | ;; |
| @@ -279,27 +296,26 @@ Return (REGEXP . PRECEDENCE)." | |||
| 279 | ;; so that (or "@" "%" digit (any "A-Z" space) (syntax word)) | 296 | ;; so that (or "@" "%" digit (any "A-Z" space) (syntax word)) |
| 280 | ;; -> (any "@" "%" digit "A-Z" space word) | 297 | ;; -> (any "@" "%" digit "A-Z" space word) |
| 281 | ;; -> "[A-Z@%[:digit:][:space:][:word:]]" | 298 | ;; -> "[A-Z@%[:digit:][:space:][:word:]]" |
| 282 | ;; | ||
| 283 | ;; Problem: If a subpattern is carefully written to be | ||
| 284 | ;; optimizable by regexp-opt, how do we prevent the transforms | ||
| 285 | ;; above from destroying that property? | ||
| 286 | ;; Example: (or "a" (or "abc" "abd" "abe")) | ||
| 287 | (cond | 299 | (cond |
| 288 | ((null body) ; No items: a never-matching regexp. | 300 | ((null body) ; No items: a never-matching regexp. |
| 289 | (rx--empty)) | 301 | (rx--empty)) |
| 290 | ((null (cdr body)) ; Single item. | 302 | ((null (cdr body)) ; Single item. |
| 291 | (rx--translate (car body))) | 303 | (rx--translate (car body))) |
| 292 | ((rx--every #'stringp body) ; All strings. | ||
| 293 | (cons (list (regexp-opt body nil)) | ||
| 294 | t)) | ||
| 295 | ((rx--every #'rx--charset-p body) ; All charsets. | ||
| 296 | (rx--translate-union nil body)) | ||
| 297 | (t | 304 | (t |
| 298 | (cons (append (car (rx--translate (car body))) | 305 | (let* ((args (mapcar #'rx--normalise-or-arg body)) |
| 299 | (mapcan (lambda (item) | 306 | (all-strings (catch 'rx--nonstring (rx--all-string-or-args args)))) |
| 300 | (cons "\\|" (car (rx--translate item)))) | 307 | (cond |
| 301 | (cdr body))) | 308 | (all-strings ; Only strings. |
| 302 | nil)))) | 309 | (cons (list (regexp-opt all-strings nil)) |
| 310 | t)) | ||
| 311 | ((rx--every #'rx--charset-p args) ; All charsets. | ||
| 312 | (rx--translate-union nil args)) | ||
| 313 | (t | ||
| 314 | (cons (append (car (rx--translate (car args))) | ||
| 315 | (mapcan (lambda (item) | ||
| 316 | (cons "\\|" (car (rx--translate item)))) | ||
| 317 | (cdr args))) | ||
| 318 | nil))))))) | ||
| 303 | 319 | ||
| 304 | (defun rx--charset-p (form) | 320 | (defun rx--charset-p (form) |
| 305 | "Whether FORM looks like a charset, only consisting of character intervals | 321 | "Whether FORM looks like a charset, only consisting of character intervals |
| @@ -840,11 +856,15 @@ Return (REGEXP . PRECEDENCE)." | |||
| 840 | (cons (list (list 'regexp-quote arg)) 'seq)) | 856 | (cons (list (list 'regexp-quote arg)) 'seq)) |
| 841 | (t (error "rx `literal' form with non-string argument"))))) | 857 | (t (error "rx `literal' form with non-string argument"))))) |
| 842 | 858 | ||
| 843 | (defun rx--translate-eval (body) | 859 | (defun rx--expand-eval (body) |
| 844 | "Translate the `eval' form. Return (REGEXP . PRECEDENCE)." | 860 | "Expand `eval' arguments. Return a new rx form." |
| 845 | (unless (and body (null (cdr body))) | 861 | (unless (and body (null (cdr body))) |
| 846 | (error "rx `eval' form takes exactly one argument")) | 862 | (error "rx `eval' form takes exactly one argument")) |
| 847 | (rx--translate (eval (car body)))) | 863 | (eval (car body))) |
| 864 | |||
| 865 | (defun rx--translate-eval (body) | ||
| 866 | "Translate the `eval' form. Return (REGEXP . PRECEDENCE)." | ||
| 867 | (rx--translate (rx--expand-eval body))) | ||
| 848 | 868 | ||
| 849 | (defvar rx--regexp-atomic-regexp nil) | 869 | (defvar rx--regexp-atomic-regexp nil) |
| 850 | 870 | ||
diff --git a/test/lisp/emacs-lisp/rx-tests.el b/test/lisp/emacs-lisp/rx-tests.el index 2e34d65a9aa..4888e1d9d1e 100644 --- a/test/lisp/emacs-lisp/rx-tests.el +++ b/test/lisp/emacs-lisp/rx-tests.el | |||
| @@ -42,13 +42,24 @@ | |||
| 42 | (ert-deftest rx-or () | 42 | (ert-deftest rx-or () |
| 43 | (should (equal (rx (or "ab" (| "c" nonl) "de")) | 43 | (should (equal (rx (or "ab" (| "c" nonl) "de")) |
| 44 | "ab\\|c\\|.\\|de")) | 44 | "ab\\|c\\|.\\|de")) |
| 45 | (should (equal (rx (or "ab" "abc" "a")) | 45 | (should (equal (rx (or "ab" "abc" ?a)) |
| 46 | "\\(?:a\\(?:bc?\\)?\\)")) | 46 | "\\(?:a\\(?:bc?\\)?\\)")) |
| 47 | (should (equal (rx (or "ab" (| (or "abcd" "abcde")) (or "a" "abc"))) | ||
| 48 | "\\(?:a\\(?:b\\(?:c\\(?:de?\\)?\\)?\\)?\\)")) | ||
| 49 | (should (equal (rx (or "a" (eval (string ?a ?b)))) | ||
| 50 | "\\(?:ab?\\)")) | ||
| 47 | (should (equal (rx (| nonl "a") (| "b" blank)) | 51 | (should (equal (rx (| nonl "a") (| "b" blank)) |
| 48 | "\\(?:.\\|a\\)\\(?:b\\|[[:blank:]]\\)")) | 52 | "\\(?:.\\|a\\)\\(?:b\\|[[:blank:]]\\)")) |
| 49 | (should (equal (rx (|)) | 53 | (should (equal (rx (|)) |
| 50 | "\\`a\\`"))) | 54 | "\\`a\\`"))) |
| 51 | 55 | ||
| 56 | (ert-deftest rx-def-in-or () | ||
| 57 | (rx-let ((a b) | ||
| 58 | (b (or "abc" c)) | ||
| 59 | (c ?a)) | ||
| 60 | (should (equal (rx (or a (| "ab" "abcde") "abcd")) | ||
| 61 | "\\(?:a\\(?:b\\(?:c\\(?:de?\\)?\\)?\\)?\\)")))) | ||
| 62 | |||
| 52 | (ert-deftest rx-char-any () | 63 | (ert-deftest rx-char-any () |
| 53 | "Test character alternatives with `]' and `-' (Bug#25123)." | 64 | "Test character alternatives with `]' and `-' (Bug#25123)." |
| 54 | (should (equal | 65 | (should (equal |