diff options
| author | Mattias Engdegård | 2019-12-12 23:04:00 +0100 |
|---|---|---|
| committer | Mattias Engdegård | 2019-12-12 23:47:25 +0100 |
| commit | f16766a0eb2a78b58a4856d31306fc37f913d70e (patch) | |
| tree | d3be560c8aaf4f4d3a59b285e27aab224922bb33 | |
| parent | d7efe98951730842db4fc136e3b631c5ee0d8a53 (diff) | |
| download | emacs-f16766a0eb2a78b58a4856d31306fc37f913d70e.tar.gz emacs-f16766a0eb2a78b58a4856d31306fc37f913d70e.zip | |
Use `or' instead of `union' for charset union in rx
Design change suggested by Stefan Monnier.
* doc/lispref/searching.texi (Rx Constructs):
* etc/NEWS: Document.
* lisp/emacs-lisp/rx.el (rx--translate-or): Detect charset arguments.
(rx--charset-p): New.
(rx--translate-not, rx--charset-intervals, rx--translate-union):
Change from `union' to `or'.
(rx--translate-form, rx--builtin-forms, rx): Remove `union'.
* test/lisp/emacs-lisp/rx-tests.el (rx-union, rx-def-in-union)
(rx-intersection): Rename tests and change `union' to `or' and `|'.
| -rw-r--r-- | doc/lispref/searching.texi | 22 | ||||
| -rw-r--r-- | etc/NEWS | 6 | ||||
| -rw-r--r-- | lisp/emacs-lisp/rx.el | 41 | ||||
| -rw-r--r-- | test/lisp/emacs-lisp/rx-tests.el | 44 |
4 files changed, 65 insertions, 48 deletions
diff --git a/doc/lispref/searching.texi b/doc/lispref/searching.texi index bf18f80f63f..0c6c7cc68b5 100644 --- a/doc/lispref/searching.texi +++ b/doc/lispref/searching.texi | |||
| @@ -1214,20 +1214,19 @@ Corresponding string regexp: @samp{[@dots{}]} | |||
| 1214 | @item @code{(not @var{charspec})} | 1214 | @item @code{(not @var{charspec})} |
| 1215 | @cindex @code{not} in rx | 1215 | @cindex @code{not} in rx |
| 1216 | Match a character not included in @var{charspec}. @var{charspec} can | 1216 | Match a character not included in @var{charspec}. @var{charspec} can |
| 1217 | be an @code{any}, @code{not}, @code{union}, @code{intersection}, | 1217 | be an @code{any}, @code{not}, @code{or}, @code{intersection}, |
| 1218 | @code{syntax} or @code{category} form, or a character class.@* | 1218 | @code{syntax} or @code{category} form, or a character class. |
| 1219 | If @var{charspec} is an @code{or} form, its arguments have the same | ||
| 1220 | restrictions as those of @code{intersection}; see below.@* | ||
| 1219 | Corresponding string regexp: @samp{[^@dots{}]}, @samp{\S@var{code}}, | 1221 | Corresponding string regexp: @samp{[^@dots{}]}, @samp{\S@var{code}}, |
| 1220 | @samp{\C@var{code}} | 1222 | @samp{\C@var{code}} |
| 1221 | 1223 | ||
| 1222 | @item @code{(union @var{charset}@dots{})} | 1224 | @item @code{(intersection @var{charset}@dots{})} |
| 1223 | @itemx @code{(intersection @var{charset}@dots{})} | ||
| 1224 | @cindex @code{union} in rx | ||
| 1225 | @cindex @code{intersection} in rx | 1225 | @cindex @code{intersection} in rx |
| 1226 | Match a character that matches the union or intersection, | 1226 | Match a character included in all of the @var{charset}s. |
| 1227 | respectively, of the @var{charset}s. Each @var{charset} can be an | 1227 | Each @var{charset} can be an @code{any} form without character |
| 1228 | @code{any} form without character classes, or a @code{union}, | 1228 | classes, or an @code{intersection}, @code{or} or @code{not} form whose |
| 1229 | @code{intersection} or @code{not} form whose arguments are also | 1229 | arguments are also @var{charset}s. |
| 1230 | @var{charset}s. | ||
| 1231 | 1230 | ||
| 1232 | @item @code{not-newline}, @code{nonl} | 1231 | @item @code{not-newline}, @code{nonl} |
| 1233 | @cindex @code{not-newline} in rx | 1232 | @cindex @code{not-newline} in rx |
| @@ -1591,7 +1590,8 @@ when they are used, not when they are defined. | |||
| 1591 | User-defined forms are allowed wherever arbitrary @code{rx} | 1590 | User-defined forms are allowed wherever arbitrary @code{rx} |
| 1592 | expressions are expected; for example, in the body of a | 1591 | expressions are expected; for example, in the body of a |
| 1593 | @code{zero-or-one} form, but not inside @code{any} or @code{category} | 1592 | @code{zero-or-one} form, but not inside @code{any} or @code{category} |
| 1594 | forms. They are also allowed inside @code{not} forms. | 1593 | forms. They are also allowed inside @code{not} and |
| 1594 | @code{intersection} forms. | ||
| 1595 | @end itemize | 1595 | @end itemize |
| 1596 | 1596 | ||
| 1597 | @defmac rx-define name [arglist] rx-form | 1597 | @defmac rx-define name [arglist] rx-form |
| @@ -2120,9 +2120,9 @@ These macros add new forms to the rx notation. | |||
| 2120 | Both match any single character; 'anychar' is more descriptive. | 2120 | Both match any single character; 'anychar' is more descriptive. |
| 2121 | 2121 | ||
| 2122 | +++ | 2122 | +++ |
| 2123 | *** New 'union' and 'intersection' forms for character sets. | 2123 | *** New 'intersection' form for character sets. |
| 2124 | These permit composing character-matching expressions from simpler | 2124 | With 'or' and 'not', it can be used to compose character-matching |
| 2125 | parts. | 2125 | expressions from simpler parts. |
| 2126 | 2126 | ||
| 2127 | ** Frames | 2127 | ** Frames |
| 2128 | 2128 | ||
diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el index d4b21c3c9ad..a5cab1db888 100644 --- a/lisp/emacs-lisp/rx.el +++ b/lisp/emacs-lisp/rx.el | |||
| @@ -273,10 +273,8 @@ Return (REGEXP . PRECEDENCE)." | |||
| 273 | ;; (or (+ digit) "CHARLIE" "CHAN" (+ blank)) | 273 | ;; (or (+ digit) "CHARLIE" "CHAN" (+ blank)) |
| 274 | ;; -> (or (+ digit) (or "CHARLIE" "CHAN") (+ blank)) | 274 | ;; -> (or (+ digit) (or "CHARLIE" "CHAN") (+ blank)) |
| 275 | ;; | 275 | ;; |
| 276 | ;; - Fuse patterns into a single character alternative if they fit. | 276 | ;; - Optimise single-character alternatives better: |
| 277 | ;; regexp-opt will do that if all are strings, but we want to do that for: | 277 | ;; * classes: space, alpha, ... |
| 278 | ;; * symbols that expand to classes: space, alpha, ... | ||
| 279 | ;; * character alternatives: (any ...) | ||
| 280 | ;; * (syntax S), for some S (whitespace, word) | 278 | ;; * (syntax S), for some S (whitespace, word) |
| 281 | ;; so that (or "@" "%" digit (any "A-Z" space) (syntax word)) | 279 | ;; so that (or "@" "%" digit (any "A-Z" space) (syntax word)) |
| 282 | ;; -> (any "@" "%" digit "A-Z" space word) | 280 | ;; -> (any "@" "%" digit "A-Z" space word) |
| @@ -294,6 +292,8 @@ Return (REGEXP . PRECEDENCE)." | |||
| 294 | ((rx--every #'stringp body) ; All strings. | 292 | ((rx--every #'stringp body) ; All strings. |
| 295 | (cons (list (regexp-opt body nil t)) | 293 | (cons (list (regexp-opt body nil t)) |
| 296 | t)) | 294 | t)) |
| 295 | ((rx--every #'rx--charset-p body) ; All charsets. | ||
| 296 | (rx--translate-union nil body)) | ||
| 297 | (t | 297 | (t |
| 298 | (cons (append (car (rx--translate (car body))) | 298 | (cons (append (car (rx--translate (car body))) |
| 299 | (mapcan (lambda (item) | 299 | (mapcan (lambda (item) |
| @@ -301,6 +301,19 @@ Return (REGEXP . PRECEDENCE)." | |||
| 301 | (cdr body))) | 301 | (cdr body))) |
| 302 | nil)))) | 302 | nil)))) |
| 303 | 303 | ||
| 304 | (defun rx--charset-p (form) | ||
| 305 | "Return non-nil if FORM looks like a charset. | ||
| 306 | A charset consists only of character intervals and set operations." | ||
| 307 | (or (and (consp form) | ||
| 308 | (or (and (memq (car form) '(any in char)) ; plain symbols: the list is already quoted, so 'in would be (quote in) and never match | ||
| 309 | (rx--every (lambda (x) (not (symbolp x))) (cdr form))) ; reject symbol arguments (character classes such as `digit') | ||
| 310 | (and (memq (car form) '(not or | intersection)) | ||
| 311 | (rx--every #'rx--charset-p (cdr form))))) | ||
| 312 | (and (or (symbolp form) (consp form)) ; otherwise try FORM as a user-defined rx form | ||
| 313 | (let ((expanded (rx--expand-def form))) | ||
| 314 | (and expanded | ||
| 315 | (rx--charset-p expanded)))))) | ||
| 316 | |||
| 304 | (defun rx--string-to-intervals (str) | 317 | (defun rx--string-to-intervals (str) |
| 305 | "Decode STR as intervals: A-Z becomes (?A . ?Z), and the single | 318 | "Decode STR as intervals: A-Z becomes (?A . ?Z), and the single |
| 306 | character X becomes (?X . ?X). Return the intervals in a list." | 319 | character X becomes (?X . ?X). Return the intervals in a list." |
| @@ -477,7 +490,7 @@ If NEGATED, negate the sense." | |||
| 477 | (not negated) (rx--complement-intervals intervals) nil))) | 490 | (not negated) (rx--complement-intervals intervals) nil))) |
| 478 | 491 | ||
| 479 | ;; FIXME: Consider turning `not' into a variadic operator, following SRE: | 492 | ;; FIXME: Consider turning `not' into a variadic operator, following SRE: |
| 480 | ;; (not A B) = (not (union A B)) = (intersection (not A) (not B)), and | 493 | ;; (not A B) = (not (or A B)) = (intersection (not A) (not B)), and |
| 481 | ;; (not) = anychar. | 494 | ;; (not) = anychar. |
| 482 | ;; Maybe allow singleton characters as arguments. | 495 | ;; Maybe allow singleton characters as arguments. |
| 483 | 496 | ||
| @@ -498,7 +511,7 @@ If NEGATED, negate the sense (thus making it positive)." | |||
| 498 | (rx--translate-category (not negated) (cdr arg))) | 511 | (rx--translate-category (not negated) (cdr arg))) |
| 499 | ('not | 512 | ('not |
| 500 | (rx--translate-not (not negated) (cdr arg))) | 513 | (rx--translate-not (not negated) (cdr arg))) |
| 501 | ('union | 514 | ((or 'or '|) |
| 502 | (rx--translate-union (not negated) (cdr arg))) | 515 | (rx--translate-union (not negated) (cdr arg))) |
| 503 | ('intersection | 516 | ('intersection |
| 504 | (rx--translate-intersection (not negated) (cdr arg)))))) | 517 | (rx--translate-intersection (not negated) (cdr arg)))))) |
| @@ -558,7 +571,7 @@ If NEGATED, negate the sense (thus making it positive)." | |||
| 558 | (defun rx--charset-intervals (charset) | 571 | (defun rx--charset-intervals (charset) |
| 559 | "Return a sorted list of non-adjacent disjoint intervals from CHARSET. | 572 | "Return a sorted list of non-adjacent disjoint intervals from CHARSET. |
| 560 | CHARSET is any expression allowed in a character set expression: | 573 | CHARSET is any expression allowed in a character set expression: |
| 561 | either `any' (no classes permitted), or `not', `union' or `intersection' | 574 | either `any' (no classes permitted), or `not', `or' or `intersection' |
| 562 | forms whose arguments are charsets." | 575 | forms whose arguments are charsets." |
| 563 | (pcase charset | 576 | (pcase charset |
| 564 | (`(,(or 'any 'in 'char) . ,body) | 577 | (`(,(or 'any 'in 'char) . ,body) |
| @@ -569,8 +582,8 @@ forms whose arguments are charsets." | |||
| 569 | (cadr parsed))) | 582 | (cadr parsed))) |
| 570 | (car parsed))) | 583 | (car parsed))) |
| 571 | (`(not ,x) (rx--complement-intervals (rx--charset-intervals x))) | 584 | (`(not ,x) (rx--complement-intervals (rx--charset-intervals x))) |
| 572 | (`(union . ,xs) (rx--charset-union xs)) | 585 | (`(,(or 'or '|) . ,body) (rx--charset-union body)) |
| 573 | (`(intersection . ,xs) (rx--charset-intersection xs)) | 586 | (`(intersection . ,body) (rx--charset-intersection body)) |
| 574 | (_ (let ((expanded (rx--expand-def charset))) | 587 | (_ (let ((expanded (rx--expand-def charset))) |
| 575 | (if expanded | 588 | (if expanded |
| 576 | (rx--charset-intervals expanded) | 589 | (rx--charset-intervals expanded) |
| @@ -589,7 +602,7 @@ forms whose arguments are charsets." | |||
| 589 | (mapcar #'rx--charset-intervals charsets))) | 602 | (mapcar #'rx--charset-intervals charsets))) |
| 590 | 603 | ||
| 591 | (defun rx--translate-union (negated body) | 604 | (defun rx--translate-union (negated body) |
| 592 | "Translate a (union ...) construct. Return (REGEXP . PRECEDENCE). | 605 | "Translate an (or ...) construct of charsets. Return (REGEXP . PRECEDENCE). |
| 593 | If NEGATED, negate the sense." | 606 | If NEGATED, negate the sense." |
| 594 | (rx--intervals-to-alt negated (rx--charset-union body))) | 607 | (rx--intervals-to-alt negated (rx--charset-union body))) |
| 595 | 608 | ||
| @@ -976,7 +989,6 @@ can expand to any number of values." | |||
| 976 | ((or 'any 'in 'char) (rx--translate-any nil body)) | 989 | ((or 'any 'in 'char) (rx--translate-any nil body)) |
| 977 | ('not-char (rx--translate-any t body)) | 990 | ('not-char (rx--translate-any t body)) |
| 978 | ('not (rx--translate-not nil body)) | 991 | ('not (rx--translate-not nil body)) |
| 979 | ('union (rx--translate-union nil body)) | ||
| 980 | ('intersection (rx--translate-intersection nil body)) | 992 | ('intersection (rx--translate-intersection nil body)) |
| 981 | 993 | ||
| 982 | ('repeat (rx--translate-repeat body)) | 994 | ('repeat (rx--translate-repeat body)) |
| @@ -1036,7 +1048,7 @@ can expand to any number of values." | |||
| 1036 | (t (error "Unknown rx form `%s'" op))))))) | 1048 | (t (error "Unknown rx form `%s'" op))))))) |
| 1037 | 1049 | ||
| 1038 | (defconst rx--builtin-forms | 1050 | (defconst rx--builtin-forms |
| 1039 | '(seq sequence : and or | any in char not-char not union intersection | 1051 | '(seq sequence : and or | any in char not-char not intersection |
| 1040 | repeat = >= ** | 1052 | repeat = >= ** |
| 1041 | zero-or-more 0+ * | 1053 | zero-or-more 0+ * |
| 1042 | one-or-more 1+ + | 1054 | one-or-more 1+ + |
| @@ -1149,11 +1161,10 @@ CHAR Match a literal character. | |||
| 1149 | character, a string, a range as string \"A-Z\" or cons | 1161 | character, a string, a range as string \"A-Z\" or cons |
| 1150 | (?A . ?Z), or a character class (see below). Alias: in, char. | 1162 | (?A . ?Z), or a character class (see below). Alias: in, char. |
| 1151 | (not CHARSPEC) Match one character not matched by CHARSPEC. CHARSPEC | 1163 | (not CHARSPEC) Match one character not matched by CHARSPEC. CHARSPEC |
| 1152 | can be (any ...), (union ...), (intersection ...), | 1164 | can be (any ...), (or ...), (intersection ...), |
| 1153 | (syntax ...), (category ...), or a character class. | 1165 | (syntax ...), (category ...), or a character class. |
| 1154 | (union CHARSET...) Union of CHARSETs. | ||
| 1155 | (intersection CHARSET...) Intersection of CHARSETs. | 1166 | (intersection CHARSET...) Intersection of CHARSETs. |
| 1156 | CHARSET is (any...), (not...), (union...) or (intersection...). | 1167 | CHARSET is (any...), (not...), (or...) or (intersection...). |
| 1157 | not-newline Match any character except a newline. Alias: nonl. | 1168 | not-newline Match any character except a newline. Alias: nonl. |
| 1158 | anychar Match any character. Alias: anything. | 1169 | anychar Match any character. Alias: anything. |
| 1159 | unmatchable Never match anything at all. | 1170 | unmatchable Never match anything at all. |
diff --git a/test/lisp/emacs-lisp/rx-tests.el b/test/lisp/emacs-lisp/rx-tests.el index 0cd2c9590b7..344f46764c8 100644 --- a/test/lisp/emacs-lisp/rx-tests.el +++ b/test/lisp/emacs-lisp/rx-tests.el | |||
| @@ -274,33 +274,36 @@ | |||
| 274 | (should (equal (rx (not (not ascii)) (not (not (not (any "a-z"))))) | 274 | (should (equal (rx (not (not ascii)) (not (not (not (any "a-z"))))) |
| 275 | "[[:ascii:]][^a-z]"))) | 275 | "[[:ascii:]][^a-z]"))) |
| 276 | 276 | ||
| 277 | (ert-deftest rx-union () | 277 | (ert-deftest rx-charset-or () |
| 278 | (should (equal (rx (union)) | 278 | (should (equal (rx (or)) |
| 279 | "\\`a\\`")) | 279 | "\\`a\\`")) |
| 280 | (should (equal (rx (union (any "ba"))) | 280 | (should (equal (rx (or (any "ba"))) |
| 281 | "[ab]")) | 281 | "[ab]")) |
| 282 | (should (equal (rx (union (any "a-f") (any "c-k" ?y) (any ?r "x-z"))) | 282 | (should (equal (rx (| (any "a-f") (any "c-k" ?y) (any ?r "x-z"))) |
| 283 | "[a-krx-z]")) | 283 | "[a-krx-z]")) |
| 284 | (should (equal (rx (union (not (any "a-m")) (not (any "f-p")))) | 284 | (should (equal (rx (or (not (any "a-m")) (not (any "f-p")))) |
| 285 | "[^f-m]")) | 285 | "[^f-m]")) |
| 286 | (should (equal (rx (union (any "e-m") (not (any "a-z")))) | 286 | (should (equal (rx (| (any "e-m") (not (any "a-z")))) |
| 287 | "[^a-dn-z]")) | 287 | "[^a-dn-z]")) |
| 288 | (should (equal (rx (union (not (any "g-r")) (not (any "t")))) | 288 | (should (equal (rx (or (not (any "g-r")) (not (any "t")))) |
| 289 | "[^z-a]")) | 289 | "[^z-a]")) |
| 290 | (should (equal (rx (not (union (not (any "g-r")) (not (any "t"))))) | 290 | (should (equal (rx (not (or (not (any "g-r")) (not (any "t"))))) |
| 291 | "\\`a\\`")) | 291 | "\\`a\\`")) |
| 292 | (should (equal (rx (union (union (any "a-f") (any "u-z")) | 292 | (should (equal (rx (or (| (any "a-f") (any "u-z")) |
| 293 | (any "g-r"))) | 293 | (any "g-r"))) |
| 294 | "[a-ru-z]")) | 294 | "[a-ru-z]")) |
| 295 | (should (equal (rx (union (intersection (any "c-z") (any "a-g")) | 295 | (should (equal (rx (or (intersection (any "c-z") (any "a-g")) |
| 296 | (not (any "a-k")))) | 296 | (not (any "a-k")))) |
| 297 | "[^abh-k]"))) | 297 | "[^abh-k]"))) |
| 298 | 298 | ||
| 299 | (ert-deftest rx-def-in-union () | 299 | (ert-deftest rx-def-in-charset-or () |
| 300 | (rx-let ((a (any "badc")) | 300 | (rx-let ((a (any "badc")) |
| 301 | (b (union a (any "def")))) | 301 | (b (| a (any "def")))) |
| 302 | (should (equal(rx (union b (any "q"))) | 302 | (should (equal (rx (or b (any "q"))) |
| 303 | "[a-fq]")))) | 303 | "[a-fq]"))) |
| 304 | (rx-let ((diff-| (a b) (not (or (not a) b)))) | ||
| 305 | (should (equal (rx (diff-| (any "a-z") (any "gr"))) | ||
| 306 | "[a-fh-qs-z]")))) | ||
| 304 | 307 | ||
| 305 | (ert-deftest rx-intersection () | 308 | (ert-deftest rx-intersection () |
| 306 | (should (equal (rx (intersection)) | 309 | (should (equal (rx (intersection)) |
| @@ -321,15 +324,18 @@ | |||
| 321 | (should (equal (rx (intersection (any "d-u") | 324 | (should (equal (rx (intersection (any "d-u") |
| 322 | (intersection (any "e-z") (any "a-m")))) | 325 | (intersection (any "e-z") (any "a-m")))) |
| 323 | "[e-m]")) | 326 | "[e-m]")) |
| 324 | (should (equal (rx (intersection (union (any "a-f") (any "f-t")) | 327 | (should (equal (rx (intersection (or (any "a-f") (any "f-t")) |
| 325 | (any "e-w"))) | 328 | (any "e-w"))) |
| 326 | "[e-t]"))) | 329 | "[e-t]"))) |
| 327 | 330 | ||
| 328 | (ert-deftest rx-def-in-intersection () | 331 | (ert-deftest rx-def-in-intersection () |
| 329 | (rx-let ((a (any "a-g")) | 332 | (rx-let ((a (any "a-g")) |
| 330 | (b (intersection a (any "d-j")))) | 333 | (b (intersection a (any "d-j")))) |
| 331 | (should (equal(rx (intersection b (any "e-k"))) | 334 | (should (equal (rx (intersection b (any "e-k"))) |
| 332 | "[e-g]")))) | 335 | "[e-g]"))) |
| 336 | (rx-let ((diff-& (a b) (intersection a (not b)))) | ||
| 337 | (should (equal (rx (diff-& (any "a-z") (any "m-p"))) | ||
| 338 | "[a-lq-z]")))) | ||
| 333 | 339 | ||
| 334 | (ert-deftest rx-group () | 340 | (ert-deftest rx-group () |
| 335 | (should (equal (rx (group nonl) (submatch "x") | 341 | (should (equal (rx (group nonl) (submatch "x") |