aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMattias Engdegård2019-12-12 23:04:00 +0100
committerMattias Engdegård2019-12-12 23:47:25 +0100
commitf16766a0eb2a78b58a4856d31306fc37f913d70e (patch)
treed3be560c8aaf4f4d3a59b285e27aab224922bb33
parentd7efe98951730842db4fc136e3b631c5ee0d8a53 (diff)
downloademacs-f16766a0eb2a78b58a4856d31306fc37f913d70e.tar.gz
emacs-f16766a0eb2a78b58a4856d31306fc37f913d70e.zip
Use `or' instead of `union' for charset union in rx
Design change suggested by Stefan Monnier. * doc/lispref/searching.texi (Rx Constructs): * etc/NEWS: Document. * lisp/emacs-lisp/rx.el (rx--translate-or): Detect charset arguments. (rx--charset-p): New. (rx--translate-not, rx--charset-intervals, rx--translate-union): Change from `union' to `or'. (rx--translate-form, rx--builtin-forms, rx): Remove `union'. * test/lisp/emacs-lisp/rx-tests.el (rx-union, rx-def-in-union) (rx-intersection): Rename tests and change `union' to `or' and `|'.
-rw-r--r--doc/lispref/searching.texi22
-rw-r--r--etc/NEWS6
-rw-r--r--lisp/emacs-lisp/rx.el41
-rw-r--r--test/lisp/emacs-lisp/rx-tests.el44
4 files changed, 65 insertions, 48 deletions
diff --git a/doc/lispref/searching.texi b/doc/lispref/searching.texi
index bf18f80f63f..0c6c7cc68b5 100644
--- a/doc/lispref/searching.texi
+++ b/doc/lispref/searching.texi
@@ -1214,20 +1214,19 @@ Corresponding string regexp: @samp{[@dots{}]}
1214@item @code{(not @var{charspec})} 1214@item @code{(not @var{charspec})}
1215@cindex @code{not} in rx 1215@cindex @code{not} in rx
1216Match a character not included in @var{charspec}. @var{charspec} can 1216Match a character not included in @var{charspec}. @var{charspec} can
1217be an @code{any}, @code{not}, @code{union}, @code{intersection}, 1217be an @code{any}, @code{not}, @code{or}, @code{intersection},
1218@code{syntax} or @code{category} form, or a character class.@* 1218@code{syntax} or @code{category} form, or a character class.
1219If @var{charspec} is an @code{or} form, its arguments have the same
1220restrictions as those of @code{intersection}; see below.@*
1219Corresponding string regexp: @samp{[^@dots{}]}, @samp{\S@var{code}}, 1221Corresponding string regexp: @samp{[^@dots{}]}, @samp{\S@var{code}},
1220@samp{\C@var{code}} 1222@samp{\C@var{code}}
1221 1223
1222@item @code{(union @var{charset}@dots{})} 1224@item @code{(intersection @var{charset}@dots{})}
1223@itemx @code{(intersection @var{charset}@dots{})}
1224@cindex @code{union} in rx
1225@cindex @code{intersection} in rx 1225@cindex @code{intersection} in rx
1226Match a character that matches the union or intersection, 1226Match a character included in all of the @var{charset}s.
1227respectively, of the @var{charset}s. Each @var{charset} can be an 1227Each @var{charset} can be an @code{any} form without character
1228@code{any} form without character classes, or a @code{union}, 1228classes, or an @code{intersection}, @code{or} or @code{not} form whose
1229@code{intersection} or @code{not} form whose arguments are also 1229arguments are also @var{charset}s.
1230@var{charset}s.
1231 1230
1232@item @code{not-newline}, @code{nonl} 1231@item @code{not-newline}, @code{nonl}
1233@cindex @code{not-newline} in rx 1232@cindex @code{not-newline} in rx
@@ -1591,7 +1590,8 @@ when they are used, not when they are defined.
1591User-defined forms are allowed wherever arbitrary @code{rx} 1590User-defined forms are allowed wherever arbitrary @code{rx}
1592expressions are expected; for example, in the body of a 1591expressions are expected; for example, in the body of a
1593@code{zero-or-one} form, but not inside @code{any} or @code{category} 1592@code{zero-or-one} form, but not inside @code{any} or @code{category}
1594forms. They are also allowed inside @code{not} forms. 1593forms. They are also allowed inside @code{not} and
1594@code{intersection} forms.
1595@end itemize 1595@end itemize
1596 1596
1597@defmac rx-define name [arglist] rx-form 1597@defmac rx-define name [arglist] rx-form
diff --git a/etc/NEWS b/etc/NEWS
index 4df123d787b..1e0422c761f 100644
--- a/etc/NEWS
+++ b/etc/NEWS
@@ -2120,9 +2120,9 @@ These macros add new forms to the rx notation.
2120Both match any single character; 'anychar' is more descriptive. 2120Both match any single character; 'anychar' is more descriptive.
2121 2121
2122+++ 2122+++
2123*** New 'union' and 'intersection' forms for character sets. 2123*** New 'intersection' form for character sets.
2124These permit composing character-matching expressions from simpler 2124With 'or' and 'not', it can be used to compose character-matching
2125parts. 2125expressions from simpler parts.
2126 2126
2127** Frames 2127** Frames
2128 2128
diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el
index d4b21c3c9ad..a5cab1db888 100644
--- a/lisp/emacs-lisp/rx.el
+++ b/lisp/emacs-lisp/rx.el
@@ -273,10 +273,8 @@ Return (REGEXP . PRECEDENCE)."
273 ;; (or (+ digit) "CHARLIE" "CHAN" (+ blank)) 273 ;; (or (+ digit) "CHARLIE" "CHAN" (+ blank))
274 ;; -> (or (+ digit) (or "CHARLIE" "CHAN") (+ blank)) 274 ;; -> (or (+ digit) (or "CHARLIE" "CHAN") (+ blank))
275 ;; 275 ;;
276 ;; - Fuse patterns into a single character alternative if they fit. 276 ;; - Optimise single-character alternatives better:
277 ;; regexp-opt will do that if all are strings, but we want to do that for: 277 ;; * classes: space, alpha, ...
278 ;; * symbols that expand to classes: space, alpha, ...
279 ;; * character alternatives: (any ...)
280 ;; * (syntax S), for some S (whitespace, word) 278 ;; * (syntax S), for some S (whitespace, word)
281 ;; so that (or "@" "%" digit (any "A-Z" space) (syntax word)) 279 ;; so that (or "@" "%" digit (any "A-Z" space) (syntax word))
282 ;; -> (any "@" "%" digit "A-Z" space word) 280 ;; -> (any "@" "%" digit "A-Z" space word)
@@ -294,6 +292,8 @@ Return (REGEXP . PRECEDENCE)."
294 ((rx--every #'stringp body) ; All strings. 292 ((rx--every #'stringp body) ; All strings.
295 (cons (list (regexp-opt body nil t)) 293 (cons (list (regexp-opt body nil t))
296 t)) 294 t))
295 ((rx--every #'rx--charset-p body) ; All charsets.
296 (rx--translate-union nil body))
297 (t 297 (t
298 (cons (append (car (rx--translate (car body))) 298 (cons (append (car (rx--translate (car body)))
299 (mapcan (lambda (item) 299 (mapcan (lambda (item)
@@ -301,6 +301,19 @@ Return (REGEXP . PRECEDENCE)."
301 (cdr body))) 301 (cdr body)))
302 nil)))) 302 nil))))
303 303
304(defun rx--charset-p (form)
305 "Whether FORM looks like a charset, only consisting of character intervals
306and set operations."
307 (or (and (consp form)
308 (or (and (memq (car form) '(any in char)) ; plain symbols: the list is already quoted
309 (rx--every (lambda (x) (not (symbolp x))) (cdr form))) ; no class symbols
310 (and (memq (car form) '(not or | intersection))
311 (rx--every #'rx--charset-p (cdr form)))))
312 (and (or (symbolp form) (consp form))
313 (let ((expanded (rx--expand-def form))) ; try a user-defined form
314 (and expanded
315 (rx--charset-p expanded))))))
316
304(defun rx--string-to-intervals (str) 317(defun rx--string-to-intervals (str)
305 "Decode STR as intervals: A-Z becomes (?A . ?Z), and the single 318 "Decode STR as intervals: A-Z becomes (?A . ?Z), and the single
306character X becomes (?X . ?X). Return the intervals in a list." 319character X becomes (?X . ?X). Return the intervals in a list."
@@ -477,7 +490,7 @@ If NEGATED, negate the sense."
477 (not negated) (rx--complement-intervals intervals) nil))) 490 (not negated) (rx--complement-intervals intervals) nil)))
478 491
479;; FIXME: Consider turning `not' into a variadic operator, following SRE: 492;; FIXME: Consider turning `not' into a variadic operator, following SRE:
480;; (not A B) = (not (union A B)) = (intersection (not A) (not B)), and 493;; (not A B) = (not (or A B)) = (intersection (not A) (not B)), and
481;; (not) = anychar. 494;; (not) = anychar.
482;; Maybe allow singleton characters as arguments. 495;; Maybe allow singleton characters as arguments.
483 496
@@ -498,7 +511,7 @@ If NEGATED, negate the sense (thus making it positive)."
498 (rx--translate-category (not negated) (cdr arg))) 511 (rx--translate-category (not negated) (cdr arg)))
499 ('not 512 ('not
500 (rx--translate-not (not negated) (cdr arg))) 513 (rx--translate-not (not negated) (cdr arg)))
501 ('union 514 ((or 'or '|)
502 (rx--translate-union (not negated) (cdr arg))) 515 (rx--translate-union (not negated) (cdr arg)))
503 ('intersection 516 ('intersection
504 (rx--translate-intersection (not negated) (cdr arg)))))) 517 (rx--translate-intersection (not negated) (cdr arg))))))
@@ -558,7 +571,7 @@ If NEGATED, negate the sense (thus making it positive)."
558(defun rx--charset-intervals (charset) 571(defun rx--charset-intervals (charset)
559 "Return a sorted list of non-adjacent disjoint intervals from CHARSET. 572 "Return a sorted list of non-adjacent disjoint intervals from CHARSET.
560CHARSET is any expression allowed in a character set expression: 573CHARSET is any expression allowed in a character set expression:
561either `any' (no classes permitted), or `not', `union' or `intersection' 574either `any' (no classes permitted), or `not', `or' or `intersection'
562forms whose arguments are charsets." 575forms whose arguments are charsets."
563 (pcase charset 576 (pcase charset
564 (`(,(or 'any 'in 'char) . ,body) 577 (`(,(or 'any 'in 'char) . ,body)
@@ -569,8 +582,8 @@ forms whose arguments are charsets."
569 (cadr parsed))) 582 (cadr parsed)))
570 (car parsed))) 583 (car parsed)))
571 (`(not ,x) (rx--complement-intervals (rx--charset-intervals x))) 584 (`(not ,x) (rx--complement-intervals (rx--charset-intervals x)))
572 (`(union . ,xs) (rx--charset-union xs)) 585 (`(,(or 'or '|) . ,body) (rx--charset-union body))
573 (`(intersection . ,xs) (rx--charset-intersection xs)) 586 (`(intersection . ,body) (rx--charset-intersection body))
574 (_ (let ((expanded (rx--expand-def charset))) 587 (_ (let ((expanded (rx--expand-def charset)))
575 (if expanded 588 (if expanded
576 (rx--charset-intervals expanded) 589 (rx--charset-intervals expanded)
@@ -589,7 +602,7 @@ forms whose arguments are charsets."
589 (mapcar #'rx--charset-intervals charsets))) 602 (mapcar #'rx--charset-intervals charsets)))
590 603
591(defun rx--translate-union (negated body) 604(defun rx--translate-union (negated body)
592 "Translate a (union ...) construct. Return (REGEXP . PRECEDENCE). 605 "Translate an (or ...) construct of charsets. Return (REGEXP . PRECEDENCE).
593If NEGATED, negate the sense." 606If NEGATED, negate the sense."
594 (rx--intervals-to-alt negated (rx--charset-union body))) 607 (rx--intervals-to-alt negated (rx--charset-union body)))
595 608
@@ -976,7 +989,6 @@ can expand to any number of values."
976 ((or 'any 'in 'char) (rx--translate-any nil body)) 989 ((or 'any 'in 'char) (rx--translate-any nil body))
977 ('not-char (rx--translate-any t body)) 990 ('not-char (rx--translate-any t body))
978 ('not (rx--translate-not nil body)) 991 ('not (rx--translate-not nil body))
979 ('union (rx--translate-union nil body))
980 ('intersection (rx--translate-intersection nil body)) 992 ('intersection (rx--translate-intersection nil body))
981 993
982 ('repeat (rx--translate-repeat body)) 994 ('repeat (rx--translate-repeat body))
@@ -1036,7 +1048,7 @@ can expand to any number of values."
1036 (t (error "Unknown rx form `%s'" op))))))) 1048 (t (error "Unknown rx form `%s'" op)))))))
1037 1049
1038(defconst rx--builtin-forms 1050(defconst rx--builtin-forms
1039 '(seq sequence : and or | any in char not-char not union intersection 1051 '(seq sequence : and or | any in char not-char not intersection
1040 repeat = >= ** 1052 repeat = >= **
1041 zero-or-more 0+ * 1053 zero-or-more 0+ *
1042 one-or-more 1+ + 1054 one-or-more 1+ +
@@ -1149,11 +1161,10 @@ CHAR Match a literal character.
1149 character, a string, a range as string \"A-Z\" or cons 1161 character, a string, a range as string \"A-Z\" or cons
1150 (?A . ?Z), or a character class (see below). Alias: in, char. 1162 (?A . ?Z), or a character class (see below). Alias: in, char.
1151(not CHARSPEC) Match one character not matched by CHARSPEC. CHARSPEC 1163(not CHARSPEC) Match one character not matched by CHARSPEC. CHARSPEC
1152 can be (any ...), (union ...), (intersection ...), 1164 can be (any ...), (or ...), (intersection ...),
1153 (syntax ...), (category ...), or a character class. 1165 (syntax ...), (category ...), or a character class.
1154(union CHARSET...) Union of CHARSETs.
1155(intersection CHARSET...) Intersection of CHARSETs. 1166(intersection CHARSET...) Intersection of CHARSETs.
1156 CHARSET is (any...), (not...), (union...) or (intersection...). 1167 CHARSET is (any...), (not...), (or...) or (intersection...).
1157not-newline Match any character except a newline. Alias: nonl. 1168not-newline Match any character except a newline. Alias: nonl.
1158anychar Match any character. Alias: anything. 1169anychar Match any character. Alias: anything.
1159unmatchable Never match anything at all. 1170unmatchable Never match anything at all.
diff --git a/test/lisp/emacs-lisp/rx-tests.el b/test/lisp/emacs-lisp/rx-tests.el
index 0cd2c9590b7..344f46764c8 100644
--- a/test/lisp/emacs-lisp/rx-tests.el
+++ b/test/lisp/emacs-lisp/rx-tests.el
@@ -274,33 +274,36 @@
274 (should (equal (rx (not (not ascii)) (not (not (not (any "a-z"))))) 274 (should (equal (rx (not (not ascii)) (not (not (not (any "a-z")))))
275 "[[:ascii:]][^a-z]"))) 275 "[[:ascii:]][^a-z]")))
276 276
277(ert-deftest rx-union () 277(ert-deftest rx-charset-or ()
278 (should (equal (rx (union)) 278 (should (equal (rx (or))
279 "\\`a\\`")) 279 "\\`a\\`"))
280 (should (equal (rx (union (any "ba"))) 280 (should (equal (rx (or (any "ba")))
281 "[ab]")) 281 "[ab]"))
282 (should (equal (rx (union (any "a-f") (any "c-k" ?y) (any ?r "x-z"))) 282 (should (equal (rx (| (any "a-f") (any "c-k" ?y) (any ?r "x-z")))
283 "[a-krx-z]")) 283 "[a-krx-z]"))
284 (should (equal (rx (union (not (any "a-m")) (not (any "f-p")))) 284 (should (equal (rx (or (not (any "a-m")) (not (any "f-p"))))
285 "[^f-m]")) 285 "[^f-m]"))
286 (should (equal (rx (union (any "e-m") (not (any "a-z")))) 286 (should (equal (rx (| (any "e-m") (not (any "a-z"))))
287 "[^a-dn-z]")) 287 "[^a-dn-z]"))
288 (should (equal (rx (union (not (any "g-r")) (not (any "t")))) 288 (should (equal (rx (or (not (any "g-r")) (not (any "t"))))
289 "[^z-a]")) 289 "[^z-a]"))
290 (should (equal (rx (not (union (not (any "g-r")) (not (any "t"))))) 290 (should (equal (rx (not (or (not (any "g-r")) (not (any "t")))))
291 "\\`a\\`")) 291 "\\`a\\`"))
292 (should (equal (rx (union (union (any "a-f") (any "u-z")) 292 (should (equal (rx (or (| (any "a-f") (any "u-z"))
293 (any "g-r"))) 293 (any "g-r")))
294 "[a-ru-z]")) 294 "[a-ru-z]"))
295 (should (equal (rx (union (intersection (any "c-z") (any "a-g")) 295 (should (equal (rx (or (intersection (any "c-z") (any "a-g"))
296 (not (any "a-k")))) 296 (not (any "a-k"))))
297 "[^abh-k]"))) 297 "[^abh-k]")))
298 298
299(ert-deftest rx-def-in-union () 299(ert-deftest rx-def-in-charset-or ()
300 (rx-let ((a (any "badc")) 300 (rx-let ((a (any "badc"))
301 (b (union a (any "def")))) 301 (b (| a (any "def"))))
302 (should (equal(rx (union b (any "q"))) 302 (should (equal (rx (or b (any "q")))
303 "[a-fq]")))) 303 "[a-fq]")))
304 (rx-let ((diff-| (a b) (not (or (not a) b))))
305 (should (equal (rx (diff-| (any "a-z") (any "gr")))
306 "[a-fh-qs-z]"))))
304 307
305(ert-deftest rx-intersection () 308(ert-deftest rx-intersection ()
306 (should (equal (rx (intersection)) 309 (should (equal (rx (intersection))
@@ -321,15 +324,18 @@
321 (should (equal (rx (intersection (any "d-u") 324 (should (equal (rx (intersection (any "d-u")
322 (intersection (any "e-z") (any "a-m")))) 325 (intersection (any "e-z") (any "a-m"))))
323 "[e-m]")) 326 "[e-m]"))
324 (should (equal (rx (intersection (union (any "a-f") (any "f-t")) 327 (should (equal (rx (intersection (or (any "a-f") (any "f-t"))
325 (any "e-w"))) 328 (any "e-w")))
326 "[e-t]"))) 329 "[e-t]")))
327 330
328(ert-deftest rx-def-in-intersection () 331(ert-deftest rx-def-in-intersection ()
329 (rx-let ((a (any "a-g")) 332 (rx-let ((a (any "a-g"))
330 (b (intersection a (any "d-j")))) 333 (b (intersection a (any "d-j"))))
331 (should (equal(rx (intersection b (any "e-k"))) 334 (should (equal (rx (intersection b (any "e-k")))
332 "[e-g]")))) 335 "[e-g]")))
336 (rx-let ((diff-& (a b) (intersection a (not b))))
337 (should (equal (rx (diff-& (any "a-z") (any "m-p")))
338 "[a-lq-z]"))))
333 339
334(ert-deftest rx-group () 340(ert-deftest rx-group ()
335 (should (equal (rx (group nonl) (submatch "x") 341 (should (equal (rx (group nonl) (submatch "x")