aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMattias EngdegÄrd2019-09-25 14:29:50 -0700
committerPaul Eggert2019-09-25 14:29:50 -0700
commit2ed71227c626c6cfdc684948644ccf3d9eaeb15b (patch)
tree2a4043ce8036206c7138b9bf5b149da8c66ec811
parenta773a6474897356cd78aeea092d2c1a51ede23f9 (diff)
downloademacs-2ed71227c626c6cfdc684948644ccf3d9eaeb15b.tar.gz
emacs-2ed71227c626c6cfdc684948644ccf3d9eaeb15b.zip
New rx implementation
* lisp/emacs-lisp/rx.el: * test/lisp/emacs-lisp/rx-tests.el: * doc/lispref/searching.texi (Rx Constructs): Rewrite rx for correctness, clarity, and performance. The new implementation retains full compatibility and has more comprehensive tests. * lisp/emacs-lisp/re-builder.el (reb-rx-font-lock-keywords): Adapt to changes in internal variables in rx.el.
-rw-r--r--doc/lispref/searching.texi4
-rw-r--r--lisp/emacs-lisp/re-builder.el9
-rw-r--r--lisp/emacs-lisp/rx.el1809
-rw-r--r--test/lisp/emacs-lisp/rx-tests.el336
4 files changed, 1091 insertions, 1067 deletions
diff --git a/doc/lispref/searching.texi b/doc/lispref/searching.texi
index 65f56b490fd..2d94e5659de 100644
--- a/doc/lispref/searching.texi
+++ b/doc/lispref/searching.texi
@@ -1044,11 +1044,9 @@ customisation.
1044 1044
1045The various forms in @code{rx} regexps are described below. The 1045The various forms in @code{rx} regexps are described below. The
1046shorthand @var{rx} represents any @code{rx} form, and @var{rx}@dots{} 1046shorthand @var{rx} represents any @code{rx} form, and @var{rx}@dots{}
1047means one or more @code{rx} forms. Where the corresponding string 1047means zero or more @code{rx} forms. Where the corresponding string
1048regexp syntax is given, @var{A}, @var{B}, @dots{} are string regexp 1048regexp syntax is given, @var{A}, @var{B}, @dots{} are string regexp
1049subexpressions. 1049subexpressions.
1050@c With the new implementation of rx, this can be changed from
1051@c 'one or more' to 'zero or more'.
1052 1050
1053@subsubheading Literals 1051@subsubheading Literals
1054 1052
diff --git a/lisp/emacs-lisp/re-builder.el b/lisp/emacs-lisp/re-builder.el
index 961d26a7212..1054f1453bc 100644
--- a/lisp/emacs-lisp/re-builder.el
+++ b/lisp/emacs-lisp/re-builder.el
@@ -816,13 +816,12 @@ If SUBEXP is non-nil mark only the corresponding sub-expressions."
816 816
817(require 'rx) 817(require 'rx)
818(defconst reb-rx-font-lock-keywords 818(defconst reb-rx-font-lock-keywords
819 (let ((constituents (mapcar (lambda (rec) 819 (let ((constituents (mapcar #'symbol-name rx--builtin-forms))
820 (symbol-name (car rec))) 820 (syntax (mapcar (lambda (rec) (symbol-name (car rec)))
821 rx-constituents)) 821 rx--syntax-codes))
822 (syntax (mapcar (lambda (rec) (symbol-name (car rec))) rx-syntax))
823 (categories (mapcar (lambda (rec) 822 (categories (mapcar (lambda (rec)
824 (symbol-name (car rec))) 823 (symbol-name (car rec)))
825 rx-categories))) 824 rx--categories)))
826 `( 825 `(
827 (,(concat "(" (regexp-opt (list "rx-to-string") t) "[[:space:]]") 826 (,(concat "(" (regexp-opt (list "rx-to-string") t) "[[:space:]]")
828 (1 font-lock-function-name-face)) 827 (1 font-lock-function-name-face))
diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el
index 249529e54e3..9b3419e1c88 100644
--- a/lisp/emacs-lisp/rx.el
+++ b/lisp/emacs-lisp/rx.el
@@ -1,11 +1,7 @@
1;;; rx.el --- sexp notation for regular expressions -*- lexical-binding: t -*- 1;;; rx.el --- S-exp notation for regexps --*- lexical-binding: t -*-
2 2
3;; Copyright (C) 2001-2019 Free Software Foundation, Inc. 3;; Copyright (C) 2001-2019 Free Software Foundation, Inc.
4 4
5;; Author: Gerd Moellmann <gerd@gnu.org>
6;; Maintainer: emacs-devel@gnu.org
7;; Keywords: strings, regexps, extensions
8
9;; This file is part of GNU Emacs. 5;; This file is part of GNU Emacs.
10 6
11;; GNU Emacs is free software: you can redistribute it and/or modify 7;; GNU Emacs is free software: you can redistribute it and/or modify
@@ -23,937 +19,824 @@
23 19
24;;; Commentary: 20;;; Commentary:
25 21
26;; This is another implementation of sexp-form regular expressions. 22;; This facility allows writing regexps in a sexp-based language
27;; It was unfortunately written without being aware of the Sregex 23;; instead of strings. Regexps in the `rx' notation are easier to
28;; package coming with Emacs, but as things stand, Rx completely 24;; read, write and maintain; they can be indented and commented in a
29;; covers all regexp features, which Sregex doesn't, doesn't suffer 25;; natural way, and are easily composed by program code.
30;; from the bugs mentioned in the commentary section of Sregex, and 26;; The translation to string regexp is done by a macro and does not
31;; uses a nicer syntax (IMHO, of course :-). 27;; incur any extra processing during run time. Example:
32
33;; This significantly extended version of the original, is almost
34;; compatible with Sregex. The only incompatibility I (fx) know of is
35;; that the `repeat' form can't have multiple regexp args.
36
37;; Now alternative forms are provided for a degree of compatibility
38;; with Olin Shivers' attempted definitive SRE notation. SRE forms
39;; not catered for include: dsm, uncase, w/case, w/nocase, ,@<exp>,
40;; ,<exp>, (word ...), word+, posix-string, and character class forms.
41;; Some forms are inconsistent with SRE, either for historical reasons
42;; or because of the implementation -- simple translation into Emacs
43;; regexp strings. These include: any, word. Also, case-sensitivity
44;; and greediness are controlled by variables external to the regexp,
45;; and you need to feed the forms to the `posix-' functions to get
46;; SRE's POSIX semantics. There are probably more difficulties.
47
48;; Rx translates a sexp notation for regular expressions into the
49;; usual string notation. The translation can be done at compile-time
50;; by using the `rx' macro. The `regexp' and `literal' forms accept
51;; non-constant expressions, in which case `rx' will translate to a
52;; `concat' expression. Translation can be done fully at run time by
53;; calling function `rx-to-string'. See the documentation of `rx' for
54;; a complete description of the sexp notation.
55;;
56;; Some examples of string regexps and their sexp counterparts:
57;;
58;; "^[a-z]*"
59;; (rx line-start (0+ (in "a-z")))
60;;
61;; "\n[^ \t]"
62;; (rx ?\n (not (in " \t")))
63;;
64;; "\\*\\*\\* EOOH \\*\\*\\*\n"
65;; (rx "*** EOOH ***\n")
66;;
67;; "\\<\\(catch\\|finally\\)\\>[^_]"
68;; (rx word-start (submatch (or "catch" "finally")) word-end
69;; (not (in ?_)))
70;;
71;; "[ \t\n]*:\\($\\|[^:]+\\)"
72;; (rx (* (in " \t\n")) ":"
73;; (submatch (or line-end (+ (not (in ?:))))))
74;;
75;; "^content-transfer-encoding:\\(?:\n?[\t ]\\)*quoted-printable\\(?:\n?[\t ]\\)*"
76;; (rx line-start
77;; "content-transfer-encoding:"
78;; (* (? ?\n) (in " \t"))
79;; "quoted-printable"
80;; (* (? ?\n) (in " \t")))
81;;
82;; (concat "^\\(?:" something-else "\\)")
83;; (rx line-start (regexp something-else))
84;;
85;; (regexp-opt '(STRING1 STRING2 ...))
86;; (rx (or STRING1 STRING2 ...)), or in other words, `or' automatically
87;; calls `regexp-opt' as needed.
88;;
89;; "^;;\\s-*\n\\|^\n"
90;; (rx (or (seq line-start ";;" (0+ space) ?\n)
91;; (seq line-start ?\n)))
92;; 28;;
93;; "\\$[I]d: [^ ]+ \\([^ ]+\\) " 29;; (rx bos (or (not (any "^"))
94;; (rx "$Id: " 30;; (seq "^" (or " *" "["))))
95;; (1+ (not (in " ")))
96;; " "
97;; (submatch (1+ (not (in " "))))
98;; " ")
99;; 31;;
100;; "\\\\\\\\\\[\\w+" 32;; => "\\`\\(?:[^^]\\|\\^\\(?: \\*\\|\\[\\)\\)"
101;; (rx "\\\\[" (1+ word))
102;;
103;; etc.
104
105;;; History:
106;; 33;;
34;; The notation is much influenced by and retains some compatibility with
35;; Olin Shivers's SRE, with concessions to Emacs regexp peculiarities,
36;; and the older Emacs package Sregex.
107 37
108;;; Code: 38;;; Code:
109 39
110(require 'cl-lib) 40;; The `rx--translate...' functions below return (REGEXP . PRECEDENCE),
111(require 'cl-extra) 41;; where REGEXP is a list of string expressions that will be
112 42;; concatenated into a regexp, and PRECEDENCE is one of
113;; FIXME: support macros. 43;;
114 44;; t -- can be used as argument to postfix operators (eg. "a")
115(defvar rx-constituents ;Not `const' because some modes extend it. 45;; seq -- can be concatenated in sequence with other seq or higher (eg. "ab")
116 '((and . (rx-and 0 nil)) 46;; lseq -- can be concatenated to the left of rseq or higher (eg. "^a")
117 (seq . and) ; SRE 47;; rseq -- can be concatenated to the right of lseq or higher (eg. "a$")
118 (: . and) ; SRE 48;; nil -- can only be used in alternatives (eg. "a\\|b")
119 (sequence . and) ; sregex 49;;
120 (or . (rx-or 0 nil)) 50;; They form a lattice:
121 (| . or) ; SRE 51;;
122 (not-newline . ".") 52;; t highest precedence
123 (nonl . not-newline) ; SRE 53;; |
124 (anything . (rx-anything 0 nil)) 54;; seq
125 (any . (rx-any 1 nil rx-check-any)) ; inconsistent with SRE 55;; / \
126 (any . ".") ; sregex 56;; lseq rseq
127 (in . any) 57;; \ /
128 (char . any) ; sregex 58;; nil lowest precedence
129 (not-char . (rx-not-char 1 nil rx-check-any)) ; sregex 59
130 (not . (rx-not 1 1 rx-check-not)) 60
131 (repeat . (rx-repeat 2 nil)) 61(defconst rx--char-classes
132 (= . (rx-= 2 nil)) ; SRE 62 '((digit . digit)
133 (>= . (rx->= 2 nil)) ; SRE 63 (numeric . digit)
134 (** . (rx-** 2 nil)) ; SRE 64 (num . digit)
135 (submatch . (rx-submatch 1 nil)) ; SRE 65 (control . cntrl)
136 (group . submatch) ; sregex 66 (cntrl . cntrl)
137 (submatch-n . (rx-submatch-n 2 nil)) 67 (hex-digit . xdigit)
138 (group-n . submatch-n) 68 (hex . xdigit)
139 (zero-or-more . (rx-kleene 1 nil)) 69 (xdigit . xdigit)
140 (one-or-more . (rx-kleene 1 nil)) 70 (blank . blank)
141 (zero-or-one . (rx-kleene 1 nil)) 71 (graphic . graph)
142 (\? . zero-or-one) ; SRE 72 (graph . graph)
143 (\?? . zero-or-one) 73 (printing . print)
144 (* . zero-or-more) ; SRE 74 (print . print)
145 (*? . zero-or-more) 75 (alphanumeric . alnum)
146 (0+ . zero-or-more) 76 (alnum . alnum)
147 (+ . one-or-more) ; SRE 77 (letter . alpha)
148 (+? . one-or-more) 78 (alphabetic . alpha)
149 (1+ . one-or-more) 79 (alpha . alpha)
150 (optional . zero-or-one) 80 (ascii . ascii)
151 (opt . zero-or-one) ; sregex 81 (nonascii . nonascii)
152 (minimal-match . (rx-greedy 1 1)) 82 (lower . lower)
153 (maximal-match . (rx-greedy 1 1)) 83 (lower-case . lower)
154 (backref . (rx-backref 1 1 rx-check-backref)) 84 (punctuation . punct)
155 (line-start . "^") 85 (punct . punct)
156 (bol . line-start) ; SRE 86 (space . space)
157 (line-end . "$") 87 (whitespace . space)
158 (eol . line-end) ; SRE 88 (white . space)
159 (string-start . "\\`") 89 (upper . upper)
160 (bos . string-start) ; SRE 90 (upper-case . upper)
161 (bot . string-start) ; sregex 91 (word . word)
162 (string-end . "\\'") 92 (wordchar . word)
163 (eos . string-end) ; SRE 93 (unibyte . unibyte)
164 (eot . string-end) ; sregex 94 (multibyte . multibyte))
165 (buffer-start . "\\`") 95 "Alist mapping rx symbols to character classes.
166 (buffer-end . "\\'") 96Most of the names are from SRE.")
167 (point . "\\=") 97
168 (word-start . "\\<") 98(defvar rx-constituents nil
169 (bow . word-start) ; SRE 99 "Alist of old-style rx extensions, for compatibility.
170 (word-end . "\\>") 100
171 (eow . word-end) ; SRE 101Each element is (SYMBOL . DEF).
172 (word-boundary . "\\b") 102
173 (not-word-boundary . "\\B") ; sregex 103If DEF is a symbol, then SYMBOL is an alias of DEF.
174 (symbol-start . "\\_<") 104
175 (symbol-end . "\\_>") 105If DEF is a string, then SYMBOL is a plain rx symbol defined as the
176 (syntax . (rx-syntax 1 1)) 106 regexp string DEF.
177 (not-syntax . (rx-not-syntax 1 1)) ; sregex 107
178 (category . (rx-category 1 1 rx-check-category)) 108If DEF is a list on the form (FUN MIN-ARGS MAX-ARGS PRED), then
179 (eval . (rx-eval 1 1)) 109 SYMBOL is an rx form with at least MIN-ARGS and at most
180 (literal . (rx-literal 1 1 stringp)) 110 MAX-ARGS arguments. If MAX-ARGS is nil, then there is no upper limit.
181 (regexp . (rx-regexp 1 1 stringp)) 111 FUN is a function taking the entire rx form as single argument
182 (regex . regexp) ; sregex 112 and returning the translated regexp string.
183 (digit . "[[:digit:]]") 113 If PRED is non-nil, it is a predicate that all actual arguments must
184 (numeric . digit) ; SRE 114 satisfy.")
185 (num . digit) ; SRE 115
186 (control . "[[:cntrl:]]") ; SRE 116;; TODO: Additions to consider:
187 (cntrl . control) ; SRE 117;; - A better name for `anything', like `any-char' or `anychar'.
188 (hex-digit . "[[:xdigit:]]") ; SRE 118;; - A name for (or), maybe `unmatchable'.
189 (hex . hex-digit) ; SRE 119;; - A construct like `or' but without the match order guarantee,
190 (xdigit . hex-digit) ; SRE 120;; maybe `unordered-or'. Useful for composition or generation of
191 (blank . "[[:blank:]]") ; SRE 121;; alternatives; permits more effective use of regexp-opt.
192 (graphic . "[[:graph:]]") ; SRE 122
193 (graph . graphic) ; SRE 123(defun rx--translate-symbol (sym)
194 (printing . "[[:print:]]") ; SRE 124 "Translate an rx symbol. Return (REGEXP . PRECEDENCE)."
195 (print . printing) ; SRE 125 (pcase sym
196 (alphanumeric . "[[:alnum:]]") ; SRE 126 ;; Use `list' instead of a quoted list to wrap the strings here,
197 (alnum . alphanumeric) ; SRE 127 ;; since the return value may be mutated.
198 (letter . "[[:alpha:]]") 128 ((or 'nonl 'not-newline 'any) (cons (list ".") t))
199 (alphabetic . letter) ; SRE 129 ('anything (rx--translate-form '(or nonl "\n")))
200 (alpha . letter) ; SRE 130 ((or 'bol 'line-start) (cons (list "^") 'lseq))
201 (ascii . "[[:ascii:]]") ; SRE 131 ((or 'eol 'line-end) (cons (list "$") 'rseq))
202 (nonascii . "[[:nonascii:]]") 132 ((or 'bos 'string-start 'bot 'buffer-start) (cons (list "\\`") t))
203 (lower . "[[:lower:]]") ; SRE 133 ((or 'eos 'string-end 'eot 'buffer-end) (cons (list "\\'") t))
204 (lower-case . lower) ; SRE 134 ('point (cons (list "\\=") t))
205 (punctuation . "[[:punct:]]") ; SRE 135 ((or 'bow 'word-start) (cons (list "\\<") t))
206 (punct . punctuation) ; SRE 136 ((or 'eow 'word-end) (cons (list "\\>") t))
207 (space . "[[:space:]]") ; SRE 137 ('word-boundary (cons (list "\\b") t))
208 (whitespace . space) ; SRE 138 ('not-word-boundary (cons (list "\\B") t))
209 (white . space) ; SRE 139 ('symbol-start (cons (list "\\_<") t))
210 (upper . "[[:upper:]]") ; SRE 140 ('symbol-end (cons (list "\\_>") t))
211 (upper-case . upper) ; SRE 141 ('not-wordchar (cons (list "\\W") t))
212 (word . "[[:word:]]") ; inconsistent with SRE 142 (_
213 (wordchar . word) ; sregex 143 (cond
214 (not-wordchar . "\\W")) 144 ((let ((class (cdr (assq sym rx--char-classes))))
215 "Alist of sexp form regexp constituents. 145 (and class (cons (list (concat "[[:" (symbol-name class) ":]]")) t))))
216Each element of the alist has the form (SYMBOL . DEFN). 146
217SYMBOL is a valid constituent of sexp regular expressions. 147 ;; For compatibility with old rx.
218If DEFN is a string, SYMBOL is translated into DEFN. 148 ((let ((entry (assq sym rx-constituents)))
219If DEFN is a symbol, use the definition of DEFN, recursively. 149 (and (progn
220Otherwise, DEFN must be a list (FUNCTION MIN-ARGS MAX-ARGS PREDICATE). 150 (while (and entry (not (stringp (cdr entry))))
221FUNCTION is used to produce code for SYMBOL. MIN-ARGS and MAX-ARGS 151 (setq entry
222are the minimum and maximum number of arguments the function-form 152 (if (symbolp (cdr entry))
223sexp constituent SYMBOL may have in sexp regular expressions. 153 ;; Alias for another entry.
224MAX-ARGS nil means no limit. PREDICATE, if specified, means that 154 (assq (cdr entry) rx-constituents)
225all arguments must satisfy PREDICATE.") 155 ;; Wrong type, try further down the list.
226 156 (assq (car entry)
227 157 (cdr (memq entry rx-constituents))))))
228(defconst rx-syntax 158 entry)
229 '((whitespace . ?-) 159 (cons (list (cdr entry)) nil))))
230 (punctuation . ?.) 160 (t (error "Unknown rx symbol `%s'" sym))))))
231 (word . ?w) 161
232 (symbol . ?_) 162(defun rx--enclose (left-str rexp right-str)
233 (open-parenthesis . ?\() 163 "Bracket REXP by LEFT-STR and RIGHT-STR."
234 (close-parenthesis . ?\)) 164 (append (list left-str) rexp (list right-str)))
235 (expression-prefix . ?\') 165
236 (string-quote . ?\") 166(defun rx--bracket (rexp)
237 (paired-delimiter . ?$) 167 (rx--enclose "\\(?:" rexp "\\)"))
238 (escape . ?\\) 168
239 (character-quote . ?/) 169(defun rx--sequence (left right)
240 (comment-start . ?<) 170 "Return the sequence (concatenation) of two translated items,
241 (comment-end . ?>) 171each on the form (REGEXP . PRECEDENCE), returning (REGEXP . PRECEDENCE)."
242 (string-delimiter . ?|) 172 ;; Concatenation rules:
243 (comment-delimiter . ?!)) 173 ;; seq ++ seq -> seq
244 "Alist mapping Rx syntax symbols to syntax characters. 174 ;; lseq ++ seq -> lseq
245Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid 175 ;; seq ++ rseq -> rseq
246symbol in `(syntax SYMBOL)', and CHAR is the syntax character 176 ;; lseq ++ rseq -> nil
247corresponding to SYMBOL, as it would be used with \\s or \\S in 177 (cond ((not (car left)) right)
248regular expressions.") 178 ((not (car right)) left)
249 179 (t
250 180 (let ((l (if (memq (cdr left) '(nil rseq))
251(defconst rx-categories 181 (cons (rx--bracket (car left)) t)
252 '((space-for-indent . ?\s) 182 left))
253 (base . ?.) 183 (r (if (memq (cdr right) '(nil lseq))
254 (consonant . ?0) 184 (cons (rx--bracket (car right)) t)
255 (base-vowel . ?1) 185 right)))
256 (upper-diacritical-mark . ?2) 186 (cons (append (car l) (car r))
257 (lower-diacritical-mark . ?3) 187 (if (eq (cdr l) 'lseq)
258 (tone-mark . ?4) 188 (if (eq (cdr r) 'rseq)
259 (symbol . ?5) 189 nil ; lseq ++ rseq
260 (digit . ?6) 190 'lseq) ; lseq ++ seq
261 (vowel-modifying-diacritical-mark . ?7) 191 (if (eq (cdr r) 'rseq)
262 (vowel-sign . ?8) 192 'rseq ; seq ++ rseq
263 (semivowel-lower . ?9) 193 'seq))))))) ; seq ++ seq
264 (not-at-end-of-line . ?<) 194
265 (not-at-beginning-of-line . ?>) 195(defun rx--translate-seq (body)
266 (alpha-numeric-two-byte . ?A) 196 "Translate a sequence of one or more rx items. Return (REGEXP . PRECEDENCE)."
267 (chinese-two-byte . ?C) 197 (if body
268 (chinse-two-byte . ?C) ;; A typo in Emacs 21.1-24.3. 198 (let* ((items (mapcar #'rx--translate body))
269 (greek-two-byte . ?G) 199 (result (car items)))
270 (japanese-hiragana-two-byte . ?H) 200 (dolist (item (cdr items))
271 (indian-two-byte . ?I) 201 (setq result (rx--sequence result item)))
272 (japanese-katakana-two-byte . ?K) 202 result)
273 (strong-left-to-right . ?L) 203 (cons nil 'seq)))
274 (korean-hangul-two-byte . ?N) 204
275 (strong-right-to-left . ?R) 205(defun rx--empty ()
276 (cyrillic-two-byte . ?Y) 206 "Regexp that never matches anything."
277 (combining-diacritic . ?^) 207 (cons (list regexp-unmatchable) 'seq))
278 (ascii . ?a) 208
279 (arabic . ?b) 209;; `cl-every' replacement to avoid bootstrapping problems.
280 (chinese . ?c) 210(defun rx--every (pred list)
281 (ethiopic . ?e) 211 "Whether PRED is true for every element of LIST."
282 (greek . ?g) 212 (while (and list (funcall pred (car list)))
283 (korean . ?h) 213 (setq list (cdr list)))
284 (indian . ?i) 214 (null list))
285 (japanese . ?j) 215
286 (japanese-katakana . ?k) 216(defun rx--translate-or (body)
287 (latin . ?l) 217 "Translate an or-pattern of one of more rx items.
288 (lao . ?o) 218Return (REGEXP . PRECEDENCE)."
289 (tibetan . ?q) 219 ;; FIXME: Possible improvements:
290 (japanese-roman . ?r) 220 ;;
291 (thai . ?t) 221 ;; - Turn single characters to strings: (or ?a ?b) -> (or "a" "b"),
292 (vietnamese . ?v) 222 ;; so that they can be candidates for regexp-opt.
293 (hebrew . ?w) 223 ;;
294 (cyrillic . ?y) 224 ;; - Translate compile-time strings (`eval' forms), again for regexp-opt.
295 (can-break . ?|)) 225 ;;
296 "Alist mapping symbols to category characters. 226 ;; - Flatten sub-patterns first: (or (or A B) (or C D)) -> (or A B C D)
297Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid 227 ;; in order to improve effectiveness of regexp-opt.
298symbol in `(category SYMBOL)', and CHAR is the category character 228 ;; This would also help composability.
299corresponding to SYMBOL, as it would be used with `\\c' or `\\C' in 229 ;;
300regular expression strings.") 230 ;; - Use associativity to run regexp-opt on contiguous subsets of arguments
301 231 ;; if not all of them are strings. Example:
302 232 ;; (or (+ digit) "CHARLIE" "CHAN" (+ blank))
303(defvar rx-greedy-flag t 233 ;; -> (or (+ digit) (or "CHARLIE" "CHAN") (+ blank))
304 "Non-nil means produce greedy regular expressions for `zero-or-one', 234 ;;
305`zero-or-more', and `one-or-more'. Dynamically bound.") 235 ;; - Fuse patterns into a single character alternative if they fit.
306 236 ;; regexp-opt will do that if all are strings, but we want to do that for:
307(defvar rx--compile-to-lisp nil 237 ;; * symbols that expand to classes: space, alpha, ...
308 "Nil means return a regexp as a string. 238 ;; * character alternatives: (any ...)
309Non-nil means we may return a lisp form which produces a 239 ;; * (syntax S), for some S (whitespace, word)
310string (used for `rx' macro).") 240 ;; so that (or "@" "%" digit (any "A-Z" space) (syntax word))
311 241 ;; -> (any "@" "%" digit "A-Z" space word)
312(defun rx-info (op head) 242 ;; -> "[A-Z@%[:digit:][:space:][:word:]]"
313 "Return parsing/code generation info for OP. 243 ;;
314If OP is the space character ASCII 32, return info for the symbol `?'. 244 ;; Problem: If a subpattern is carefully written to to be
315If OP is the character `?', return info for the symbol `??'. 245 ;; optimisable by regexp-opt, how do we prevent the transforms
316See also `rx-constituents'. 246 ;; above from destroying that property?
317If HEAD is non-nil, then OP is the head of a sexp, otherwise it's 247 ;; Example: (or "a" (or "abc" "abd" "abe"))
318a standalone symbol."
319 (cond ((eq op ? ) (setq op '\?))
320 ((eq op ??) (setq op '\??)))
321 (let (old-op)
322 (while (and (not (null op)) (symbolp op))
323 (setq old-op op)
324 (setq op (cdr (assq op rx-constituents)))
325 (when (if head (stringp op) (consp op))
326 ;; We found something but of the wrong kind. Let's look for an
327 ;; alternate definition for the other case.
328 (let ((new-op
329 (cdr (assq old-op (cdr (memq (assq old-op rx-constituents)
330 rx-constituents))))))
331 (if (and new-op (not (if head (stringp new-op) (consp new-op))))
332 (setq op new-op))))))
333 op)
334
335
336(defun rx-check (form)
337 "Check FORM according to its car's parsing info."
338 (unless (listp form)
339 (error "rx `%s' needs argument(s)" form))
340 (let* ((rx (rx-info (car form) 'head))
341 (nargs (1- (length form)))
342 (min-args (nth 1 rx))
343 (max-args (nth 2 rx))
344 (type-pred (nth 3 rx)))
345 (when (and (not (null min-args))
346 (< nargs min-args))
347 (error "rx form `%s' requires at least %d args"
348 (car form) min-args))
349 (when (and (not (null max-args))
350 (> nargs max-args))
351 (error "rx form `%s' accepts at most %d args"
352 (car form) max-args))
353 (when type-pred
354 (dolist (sub-form (cdr form))
355 (unless (funcall type-pred sub-form)
356 (error "rx form `%s' requires args satisfying `%s'"
357 (car form) type-pred))))))
358
359
360(defun rx-group-if (regexp group)
361 "Put shy groups around REGEXP if seemingly necessary when GROUP
362is non-nil."
363 (cond 248 (cond
364 ;; for some repetition 249 ((null body) ; No items: a never-matching regexp.
365 ((eq group '*) (if (rx-atomic-p regexp) (setq group nil))) 250 (rx--empty))
366 ;; for concatenation 251 ((null (cdr body)) ; Single item.
367 ((eq group ':) 252 (rx--translate (car body)))
368 (if (rx-atomic-p 253 ((rx--every #'stringp body) ; All strings.
369 (if (and (stringp regexp) 254 (cons (list (regexp-opt body nil t))
370 (string-match 255 t))
371 "\\(?:[?*+]\\??\\|\\\\{[0-9]*,?[0-9]*\\\\}\\)\\'" regexp)) 256 (t
372 (substring regexp 0 (match-beginning 0)) 257 (cons (append (car (rx--translate (car body)))
373 regexp)) 258 (mapcan (lambda (item)
374 (setq group nil))) 259 (cons "\\|" (car (rx--translate item))))
375 ;; for OR 260 (cdr body)))
376 ((eq group '|) (setq group nil)) 261 nil))))
377 ;; do anyway 262
378 ((eq group t)) 263(defun rx--string-to-intervals (str)
379 ((rx-atomic-p regexp t) (setq group nil))) 264 "Decode STR as intervals: A-Z becomes (?A . ?Z), and the single
380 (cond ((and group (stringp regexp)) 265character X becomes (?X . ?X). Return the intervals in a list."
381 (concat "\\(?:" regexp "\\)")) 266 ;; We could just do string-to-multibyte on the string and work with
382 (group `("\\(?:" ,@regexp "\\)")) 267 ;; that instead of this `decode-char' workaround.
383 (t regexp)))
384
385
386(defvar rx-parent)
387;; dynamically bound in some functions.
388
389
390(defun rx-and (form)
391 "Parse and produce code from FORM.
392FORM is of the form `(and FORM1 ...)'."
393 (rx-check form)
394 (rx-group-if
395 (rx--subforms (cdr form) ':)
396 (and (memq rx-parent '(* t)) rx-parent)))
397
398
399(defun rx-or (form)
400 "Parse and produce code from FORM, which is `(or FORM1 ...)'."
401 (rx-check form)
402 (rx-group-if
403 (cond
404 ((null (cdr form)) regexp-unmatchable)
405 ((cl-every #'stringp (cdr form))
406 (regexp-opt (cdr form) nil t))
407 (t (rx--subforms (cdr form) '| "\\|")))
408 (and (memq rx-parent '(: * t)) rx-parent)))
409
410
411(defun rx-anything (form)
412 "Match any character."
413 (if (consp form)
414 (error "rx `anything' syntax error: %s" form))
415 (rx-or (list 'or 'not-newline ?\n)))
416
417
418(defun rx-any-delete-from-range (char ranges)
419 "Delete by side effect character CHAR from RANGES.
420Only both edges of each range is checked."
421 (let (m)
422 (cond
423 ((memq char ranges) (setq ranges (delq char ranges)))
424 ((setq m (assq char ranges))
425 (if (eq (1+ char) (cdr m))
426 (setcar (memq m ranges) (1+ char))
427 (setcar m (1+ char))))
428 ((setq m (rassq char ranges))
429 (if (eq (1- char) (car m))
430 (setcar (memq m ranges) (1- char))
431 (setcdr m (1- char)))))
432 ranges))
433
434
435(defun rx-any-condense-range (args)
436 "Condense by side effect ARGS as range for Rx `any'."
437 (let (str
438 l)
439 ;; set STR list of all strings
440 ;; set L list of all ranges
441 (mapc (lambda (e) (cond ((stringp e) (push e str))
442 ((numberp e) (push (cons e e) l))
443 ;; Ranges between ASCII and raw bytes are split,
444 ;; to prevent accidental inclusion of Unicode
445 ;; characters later on.
446 ((and (<= (car e) #x7f)
447 (>= (cdr e) #x3fff80))
448 (push (cons (car e) #x7f) l)
449 (push (cons #x3fff80 (cdr e)) l))
450 (t (push e l))))
451 args)
452 ;; condense overlapped ranges in L
453 (let ((tail (setq l (sort l #'car-less-than-car)))
454 d)
455 (while (setq d (cdr tail))
456 (if (>= (cdar tail) (1- (caar d)))
457 (progn
458 (setcdr (car tail) (max (cdar tail) (cdar d)))
459 (setcdr tail (cdr d)))
460 (setq tail d))))
461 ;; Separate small ranges to single number, and delete dups.
462 (nconc
463 (apply #'nconc
464 (mapcar (lambda (e)
465 (cond
466 ((= (car e) (cdr e)) (list (car e)))
467 ((= (1+ (car e)) (cdr e)) (list (car e) (cdr e)))
468 ((list e))))
469 l))
470 (delete-dups str))))
471
472
473(defun rx-check-any-string (str)
474 "Turn the `any' argument string STR into a list of characters.
475The original order is not preserved. Ranges, \"A-Z\", become pairs, (?A . ?Z)."
476 (let ((decode-char 268 (let ((decode-char
477 ;; Make sure raw bytes are decoded as such, to avoid confusion with
478 ;; U+0080..U+00FF.
479 (if (multibyte-string-p str) 269 (if (multibyte-string-p str)
480 #'identity 270 #'identity
481 (lambda (c) (if (<= #x80 c #xff) 271 #'unibyte-char-to-multibyte))
482 (+ c #x3fff00)
483 c))))
484 (len (length str)) 272 (len (length str))
485 (i 0) 273 (i 0)
486 (ret nil)) 274 (intervals nil))
487 (if (= 0 len)
488 (error "String arg for Rx `any' must not be empty"))
489 (while (< i len) 275 (while (< i len)
490 (cond ((and (< i (- len 2)) 276 (cond ((and (< i (- len 2))
491 (= (aref str (+ i 1)) ?-)) 277 (= (aref str (1+ i)) ?-))
492 ;; Range. 278 ;; Range.
493 (let ((start (funcall decode-char (aref str i))) 279 (let ((start (funcall decode-char (aref str i)))
494 (end (funcall decode-char (aref str (+ i 2))))) 280 (end (funcall decode-char (aref str (+ i 2)))))
495 (cond ((< start end) (push (cons start end) ret)) 281 (cond ((and (<= start #x7f) (>= end #x3fff80))
496 ((= start end) (push start ret)) 282 ;; Ranges between ASCII and raw bytes are split to
283 ;; avoid having them absorb Unicode characters
284 ;; caught in-between.
285 (push (cons start #x7f) intervals)
286 (push (cons #x3fff80 end) intervals))
287 ((<= start end)
288 (push (cons start end) intervals))
497 (t 289 (t
498 (error "Rx character range `%c-%c' is reversed" 290 (error "Invalid rx `any' range: %s"
499 start end))) 291 (substring str i 3))))
500 (setq i (+ i 3)))) 292 (setq i (+ i 3))))
501 (t 293 (t
502 ;; Single character. 294 ;; Single character.
503 (push (funcall decode-char (aref str i)) ret) 295 (let ((char (funcall decode-char (aref str i))))
296 (push (cons char char) intervals))
504 (setq i (+ i 1))))) 297 (setq i (+ i 1)))))
505 ret)) 298 intervals))
506 299
507 300(defun rx--condense-intervals (intervals)
508(defun rx-check-any (arg) 301 "Merge adjacent and overlapping intervals by mutation, preserving the order.
509 "Check arg ARG for Rx `any'." 302INTERVALS is a list of (START . END) with START ≀ END, sorted by START."
510 (cond 303 (let ((tail intervals)
511 ((integerp arg) (list arg)) 304 d)
512 ((symbolp arg) 305 (while (setq d (cdr tail))
513 (let ((translation (condition-case nil 306 (if (>= (cdar tail) (1- (caar d)))
514 (rx-form arg) 307 (progn
515 (error nil)))) 308 (setcdr (car tail) (max (cdar tail) (cdar d)))
516 (if (or (null translation) 309 (setcdr tail (cdr d)))
517 (null (string-match "\\`\\[\\[:[-a-z]+:\\]\\]\\'" translation))) 310 (setq tail d)))
518 (error "Invalid char class `%s' in Rx `any'" arg)) 311 intervals))
519 (list (substring translation 1 -1)))) ; strip outer brackets 312
520 ((and (characterp (car-safe arg)) (characterp (cdr-safe arg))) 313(defun rx--translate-any (negated body)
521 (unless (<= (car arg) (cdr arg)) 314 "Translate an (any ...) construct. Return (REGEXP . PRECEDENCE).
522 (error "Rx character range `%c-%c' is reversed" 315If NEGATED, negate the sense."
523 (car arg) (cdr arg))) 316 (let ((classes nil)
524 (list arg)) 317 (strings nil)
525 ((stringp arg) (rx-check-any-string arg)) 318 (conses nil))
526 ((error 319 ;; Collect strings, conses and characters, and classes in separate bins.
527 "rx `any' requires string, character, char pair or char class args")))) 320 (dolist (arg body)
528 321 (cond ((stringp arg)
529 322 (push arg strings))
530(defun rx-any (form) 323 ((and (consp arg)
531 "Parse and produce code from FORM, which is `(any ARG ...)'. 324 (characterp (car arg))
532ARG is optional." 325 (characterp (cdr arg))
533 (rx-check form) 326 (<= (car arg) (cdr arg)))
534 (let* ((args (rx-any-condense-range 327 ;; Copy the cons, in case we need to modify it.
535 (apply 328 (push (cons (car arg) (cdr arg)) conses))
536 #'nconc 329 ((characterp arg)
537 (mapcar #'rx-check-any (cdr form))))) 330 (push (cons arg arg) conses))
538 m 331 ((and (symbolp arg)
539 s) 332 (let ((class (cdr (assq arg rx--char-classes))))
540 (cond 333 (and class (push class classes)))))
541 ;; single close bracket 334 (t (error "Invalid rx `any' argument: %s" arg))))
542 ;; => "[]...-]" or "[]...--.]" 335 (let ((items
543 ((memq ?\] args) 336 ;; Translate strings and conses into nonoverlapping intervals,
544 ;; set ] at the beginning 337 ;; and add classes as symbols at the end.
545 (setq args (cons ?\] (delq ?\] args))) 338 (append
546 ;; set - at the end 339 (rx--condense-intervals
547 (if (or (memq ?- args) (assq ?- args)) 340 (sort (append conses
548 (setq args (nconc (rx-any-delete-from-range ?- args) 341 (mapcan #'rx--string-to-intervals strings))
549 (list ?-))))) 342 #'car-less-than-car))
550 ;; close bracket starts a range 343 (reverse classes))))
551 ;; => "[]-....-]" or "[]-.--....]" 344
552 ((setq m (assq ?\] args)) 345 ;; Move lone ] and range ]-x to the start.
553 ;; bring it to the beginning 346 (let ((rbrac-l (assq ?\] items)))
554 (setq args (cons m (delq m args))) 347 (when rbrac-l
555 (cond ((memq ?- args) 348 (setq items (cons rbrac-l (delq rbrac-l items)))))
556 ;; to the end 349
557 (setq args (nconc (delq ?- args) (list ?-)))) 350 ;; Split x-] and move the lone ] to the start.
558 ((setq m (assq ?- args)) 351 (let ((rbrac-r (rassq ?\] items)))
559 ;; next to the bracket's range, make the second range 352 (when (and rbrac-r (not (eq (car rbrac-r) ?\])))
560 (setcdr args (cons m (delq m (cdr args))))))) 353 (setcdr rbrac-r ?\\)
561 ;; bracket in the end range 354 (setq items (cons '(?\] . ?\]) items))))
562 ;; => "[]...-]" 355
563 ((setq m (rassq ?\] args)) 356 ;; Split ,-- (which would end up as ,- otherwise).
564 ;; set ] at the beginning 357 (let ((dash-r (rassq ?- items)))
565 (setq args (cons ?\] (rx-any-delete-from-range ?\] args))) 358 (when (eq (car dash-r) ?,)
566 ;; set - at the end 359 (setcdr dash-r ?,)
567 (if (or (memq ?- args) (assq ?- args)) 360 (setq items (nconc items '((?- . ?-))))))
568 (setq args (nconc (rx-any-delete-from-range ?- args) 361
569 (list ?-))))) 362 ;; Remove - (lone or at start of interval)
570 ;; {no close bracket appears} 363 (let ((dash-l (assq ?- items)))
571 ;; 364 (when dash-l
572 ;; bring single bar to the beginning 365 (if (eq (cdr dash-l) ?-)
573 ((memq ?- args) 366 (setq items (delq dash-l items)) ; Remove lone -
574 (setq args (cons ?- (delq ?- args)))) 367 (setcar dash-l ?.)) ; Reduce --x to .-x
575 ;; bar start a range, bring it to the beginning 368 (setq items (nconc items '((?- . ?-))))))
576 ((setq m (assq ?- args)) 369
577 (setq args (cons m (delq m args)))) 370 ;; Deal with leading ^ and range ^-x.
578 ;; 371 (when (and (consp (car items))
579 ;; hat at the beginning? 372 (eq (caar items) ?^)
580 ((or (eq (car args) ?^) (eq (car-safe (car args)) ?^)) 373 (cdr items))
581 (setq args (if (cdr args) 374 ;; Move ^ and ^-x to second place.
582 `(,(cadr args) ,(car args) ,@(cddr args)) 375 (setq items (cons (cadr items)
583 (nconc (rx-any-delete-from-range ?^ args) 376 (cons (car items) (cddr items)))))
584 (list ?^)))))) 377
585 ;; some 1-char?
586 (if (and (null (cdr args)) (numberp (car args))
587 (or (= 1 (length
588 (setq s (regexp-quote (string (car args))))))
589 (and (equal (car args) ?^) ;; unnecessary predicate?
590 (null (eq rx-parent '!)))))
591 s
592 (concat "["
593 (mapconcat
594 (lambda (e) (cond
595 ((numberp e) (string e))
596 ((consp e)
597 (if (and (= (1+ (car e)) (cdr e))
598 ;; rx-any-condense-range should
599 ;; prevent this case from happening.
600 (null (memq (car e) '(?\] ?-)))
601 (null (memq (cdr e) '(?\] ?-))))
602 (string (car e) (cdr e))
603 (string (car e) ?- (cdr e))))
604 (e)))
605 args
606 nil)
607 "]"))))
608
609
610(defun rx-check-not (arg)
611 "Check arg ARG for Rx `not'."
612 (unless (or (and (symbolp arg)
613 (string-match "\\`\\[\\[:[-a-z]+:\\]\\]\\'"
614 (condition-case nil
615 (rx-form arg)
616 (error ""))))
617 (eq arg 'word-boundary)
618 (and (consp arg)
619 (memq (car arg) '(not any in syntax category))))
620 (error "rx `not' syntax error: %s" arg))
621 t)
622
623
624(defun rx-not (form)
625 "Parse and produce code from FORM. FORM is `(not ...)'."
626 (rx-check form)
627 (let ((result (rx-form (cadr form) '!))
628 case-fold-search)
629 (cond ((string-match "\\`\\[\\^" result)
630 (cond
631 ((equal result "[^]") "[^^]")
632 ((and (= (length result) 4) (null (eq rx-parent '!)))
633 (regexp-quote (substring result 2 3)))
634 ((concat "[" (substring result 2)))))
635 ((eq ?\[ (aref result 0))
636 (concat "[^" (substring result 1)))
637 ((string-match "\\`\\\\[scbw]" result)
638 (concat (upcase (substring result 0 2))
639 (substring result 2)))
640 ((string-match "\\`\\\\[SCBW]" result)
641 (concat (downcase (substring result 0 2))
642 (substring result 2)))
643 (t
644 (concat "[^" result "]")))))
645
646
647(defun rx-not-char (form)
648 "Parse and produce code from FORM. FORM is `(not-char ...)'."
649 (rx-check form)
650 (rx-not `(not (in ,@(cdr form)))))
651
652
653(defun rx-not-syntax (form)
654 "Parse and produce code from FORM. FORM is `(not-syntax SYNTAX)'."
655 (rx-check form)
656 (rx-not `(not (syntax ,@(cdr form)))))
657
658
659(defun rx-trans-forms (form &optional skip)
660 "If FORM's length is greater than two, transform it to length two.
661A form (HEAD REST ...) becomes (HEAD (and REST ...)).
662If SKIP is non-nil, allow that number of items after the head, i.e.
663`(= N REST ...)' becomes `(= N (and REST ...))' if SKIP is 1."
664 (unless skip (setq skip 0))
665 (let ((tail (nthcdr (1+ skip) form)))
666 (if (= (length tail) 1)
667 form
668 (let ((form (copy-sequence form)))
669 (setcdr (nthcdr skip form) (list (cons 'and tail)))
670 form))))
671
672
673(defun rx-= (form)
674 "Parse and produce code from FORM `(= N ...)'."
675 (rx-check form)
676 (setq form (rx-trans-forms form 1))
677 (unless (and (integerp (nth 1 form))
678 (> (nth 1 form) 0))
679 (error "rx `=' requires positive integer first arg"))
680 (let ((subform (rx-form (nth 2 form) '*)))
681 (if (stringp subform)
682 (format "%s\\{%d\\}" subform (nth 1 form))
683 `(,@subform ,(format "\\{%d\\}" (nth 1 form))))))
684
685
686(defun rx->= (form)
687 "Parse and produce code from FORM `(>= N ...)'."
688 (rx-check form)
689 (setq form (rx-trans-forms form 1))
690 (unless (and (integerp (nth 1 form))
691 (> (nth 1 form) 0))
692 (error "rx `>=' requires positive integer first arg"))
693 (let ((subform (rx-form (nth 2 form) '*)))
694 (if (stringp subform)
695 (format "%s\\{%d,\\}" subform (nth 1 form))
696 `(,@subform ,(format "\\{%d,\\}" (nth 1 form))))))
697
698
699(defun rx-** (form)
700 "Parse and produce code from FORM `(** N M ...)'."
701 (rx-check form)
702 (rx-form (cons 'repeat (cdr (rx-trans-forms form 2))) '*))
703
704
705(defun rx-repeat (form)
706 "Parse and produce code from FORM.
707FORM is either `(repeat N FORM1)' or `(repeat N M FORMS...)'."
708 (rx-check form)
709 (if (> (length form) 4)
710 (setq form (rx-trans-forms form 2)))
711 (if (null (nth 2 form))
712 (setq form (cons (nth 0 form) (cons (nth 1 form) (nthcdr 3 form)))))
713 (cond ((= (length form) 3)
714 (unless (and (integerp (nth 1 form))
715 (> (nth 1 form) 0))
716 (error "rx `repeat' requires positive integer first arg"))
717 (let ((subform (rx-form (nth 2 form) '*)))
718 (if (stringp subform)
719 (format "%s\\{%d\\}" subform (nth 1 form))
720 `(,@subform ,(format "\\{%d\\}" (nth 1 form))))))
721 ((or (not (integerp (nth 2 form)))
722 (< (nth 2 form) 0)
723 (not (integerp (nth 1 form)))
724 (< (nth 1 form) 0)
725 (< (nth 2 form) (nth 1 form)))
726 (error "rx `repeat' range error"))
727 (t
728 (let ((subform (rx-form (nth 3 form) '*)))
729 (if (stringp subform)
730 (format "%s\\{%d,%d\\}" subform (nth 1 form) (nth 2 form))
731 `(,@subform ,(format "\\{%d,%d\\}" (nth 1 form) (nth 2 form))))))))
732
733
734(defun rx-submatch (form)
735 "Parse and produce code from FORM, which is `(submatch ...)'."
736 (let ((subforms (rx--subforms (cdr form) ':)))
737 (if (stringp subforms)
738 (concat "\\(" subforms "\\)")
739 `("\\(" ,@subforms "\\)"))))
740
741(defun rx-submatch-n (form)
742 "Parse and produce code from FORM, which is `(submatch-n N ...)'."
743 (let ((n (nth 1 form))
744 (subforms (rx--subforms (cddr form) ':)))
745 (unless (and (integerp n) (> n 0))
746 (error "rx `submatch-n' argument must be positive"))
747 (if (stringp subforms)
748 (concat "\\(?" (number-to-string n) ":" subforms "\\)")
749 `("\\(?" ,(number-to-string n) ":" ,@subforms "\\)"))))
750
751(defun rx-backref (form)
752 "Parse and produce code from FORM, which is `(backref N)'."
753 (rx-check form)
754 (format "\\%d" (nth 1 form)))
755
756(defun rx-check-backref (arg)
757 "Check arg ARG for Rx `backref'."
758 (or (and (integerp arg) (>= arg 1) (<= arg 9))
759 (error "rx `backref' requires numeric 1<=arg<=9: %s" arg)))
760
761(defun rx-kleene (form)
762 "Parse and produce code from FORM.
763FORM is `(OP FORM1)', where OP is one of the `zero-or-one',
764`zero-or-more' etc. operators.
765If OP is one of `*', `+', `?', produce a greedy regexp.
766If OP is one of `*?', `+?', `??', produce a non-greedy regexp.
767If OP is anything else, produce a greedy regexp if `rx-greedy-flag'
768is non-nil."
769 (rx-check form)
770 (setq form (rx-trans-forms form))
771 (let ((suffix (cond ((memq (car form) '(* + \? ?\s)) "")
772 ((memq (car form) '(*? +? \?? ??)) "?")
773 (rx-greedy-flag "")
774 (t "?")))
775 (op (cond ((memq (car form) '(* *? 0+ zero-or-more)) "*")
776 ((memq (car form) '(+ +? 1+ one-or-more)) "+")
777 (t "?")))
778 (subform (rx-form (cadr form) '*)))
779 (rx-group-if
780 (if (stringp subform)
781 (concat subform op suffix)
782 `(,@subform ,(concat op suffix)))
783 (and (memq rx-parent '(t *)) rx-parent))))
784
785
786(defun rx-atomic-p (r &optional lax)
787 "Return non-nil if regexp string R is atomic.
788An atomic regexp R is one such that a suffix operator
789appended to R will apply to all of R. For example, \"a\"
790\"[abc]\" and \"\\(ab\\|ab*c\\)\" are atomic and \"ab\",
791\"[ab]c\", and \"ab\\|ab*c\" are not atomic.
792
793This function may return false negatives, but it will not
794return false positives. It is nevertheless useful in
795situations where an efficiency shortcut can be taken only if a
796regexp is atomic. The function can be improved to detect
797more cases of atomic regexps. Presently, this function
798detects the following categories of atomic regexp;
799
800 a group or shy group: \\(...\\)
801 a character class: [...]
802 a single character: a
803
804On the other hand, false negatives will be returned for
805regexps that are atomic but end in operators, such as
806\"a+\". I think these are rare. Probably such cases could
807be detected without much effort. A guarantee of no false
808negatives would require a theoretic specification of the set
809of all atomic regexps."
810 (if (and rx--compile-to-lisp
811 (not (stringp r)))
812 nil ;; Runtime value, we must assume non-atomic.
813 (let ((l (length r)))
814 (cond 378 (cond
815 ((<= l 1)) 379 ;; Empty set: if negated, any char, otherwise match-nothing.
816 ((= l 2) (= (aref r 0) ?\\)) 380 ((null items)
817 ((= l 3) (string-match "\\`\\(?:\\\\[cCsS_]\\|\\[[^^]\\]\\)" r)) 381 (if negated
818 ((null lax) 382 (rx--translate-symbol 'anything)
819 (cond 383 (rx--empty)))
820 ((string-match "\\`\\[\\^?]?\\(?:\\[:[a-z]+:]\\|[^]]\\)*]\\'" r)) 384 ;; Single non-negated character.
821 ((string-match "\\`\\\\(\\(?:[^\\]\\|\\\\[^)]\\)*\\\\)\\'" r)))))))) 385 ((and (null (cdr items))
822 386 (consp (car items))
823 387 (eq (caar items) (cdar items))
824(defun rx-syntax (form) 388 (not negated))
825 "Parse and produce code from FORM, which is `(syntax SYMBOL)'." 389 (cons (list (regexp-quote (char-to-string (caar items))))
826 (rx-check form) 390 t))
827 (let* ((sym (cadr form)) 391 ;; At least one character or class, possibly negated.
828 (syntax (cdr (assq sym rx-syntax)))) 392 (t
393 (cons
394 (list
395 (concat
396 "["
397 (and negated "^")
398 (mapconcat (lambda (item)
399 (cond ((symbolp item)
400 (format "[:%s:]" item))
401 ((eq (car item) (cdr item))
402 (char-to-string (car item)))
403 ((eq (1+ (car item)) (cdr item))
404 (string (car item) (cdr item)))
405 (t
406 (string (car item) ?- (cdr item)))))
407 items nil)
408 "]"))
409 t))))))
410
411(defun rx--translate-not (negated body)
412 "Translate a (not ...) construct. Return (REGEXP . PRECEDENCE).
413If NEGATED, negate the sense (thus making it positive)."
414 (unless (and body (null (cdr body)))
415 (error "rx `not' form takes exactly one argument"))
416 (let ((arg (car body)))
417 (cond
418 ((consp arg)
419 (pcase (car arg)
420 ((or 'any 'in 'char) (rx--translate-any (not negated) (cdr arg)))
421 ('syntax (rx--translate-syntax (not negated) (cdr arg)))
422 ('category (rx--translate-category (not negated) (cdr arg)))
423 ('not (rx--translate-not (not negated) (cdr arg)))
424 (_ (error "Illegal argument to rx `not': %S" arg))))
425 ((eq arg 'word-boundary)
426 (rx--translate-symbol
427 (if negated 'word-boundary 'not-word-boundary)))
428 (t
429 (let ((class (cdr (assq arg rx--char-classes))))
430 (if class
431 (rx--translate-any (not negated) (list class))
432 (error "Illegal argument to rx `not': %s" arg)))))))
433
434(defun rx--atomic-regexp (item)
435 "ITEM is (REGEXP . PRECEDENCE); return a regexp of precedence t."
436 (if (eq (cdr item) t)
437 (car item)
438 (rx--bracket (car item))))
439
440(defun rx--translate-counted-repetition (min-count max-count body)
441 (let ((operand (rx--translate-seq body)))
442 (if (car operand)
443 (cons (append
444 (rx--atomic-regexp operand)
445 (list (concat "\\{"
446 (number-to-string min-count)
447 (cond ((null max-count) ",")
448 ((< min-count max-count)
449 (concat "," (number-to-string max-count))))
450 "\\}")))
451 t)
452 operand)))
453
454(defun rx--check-repeat-arg (name min-args body)
455 (unless (>= (length body) min-args)
456 (error "rx `%s' requires at least %d argument%s"
457 name min-args (if (= min-args 1) "" "s")))
458 ;; There seems to be no reason to disallow zero counts.
459 (unless (natnump (car body))
460 (error "rx `%s' first argument must be nonnegative" name)))
461
462(defun rx--translate-bounded-repetition (name body)
463 (let ((min-count (car body))
464 (max-count (cadr body))
465 (items (cddr body)))
466 (unless (and (natnump min-count)
467 (natnump max-count)
468 (<= min-count max-count))
469 (error "rx `%s' range error" name))
470 (rx--translate-counted-repetition min-count max-count items)))
471
472(defun rx--translate-repeat (body)
473 (rx--check-repeat-arg 'repeat 2 body)
474 (if (= (length body) 2)
475 (rx--translate-counted-repetition (car body) (car body) (cdr body))
476 (rx--translate-bounded-repetition 'repeat body)))
477
478(defun rx--translate-** (body)
479 (rx--check-repeat-arg '** 2 body)
480 (rx--translate-bounded-repetition '** body))
481
482(defun rx--translate->= (body)
483 (rx--check-repeat-arg '>= 1 body)
484 (rx--translate-counted-repetition (car body) nil (cdr body)))
485
486(defun rx--translate-= (body)
487 (rx--check-repeat-arg '= 1 body)
488 (rx--translate-counted-repetition (car body) (car body) (cdr body)))
489
490(defvar rx--greedy t)
491
492(defun rx--translate-rep (op-string greedy body)
493 "Translate a repetition; OP-STRING is one of \"*\", \"+\" or \"?\".
494GREEDY is a boolean. Return (REGEXP . PRECEDENCE)."
495 (let ((operand (rx--translate-seq body)))
496 (if (car operand)
497 (cons (append (rx--atomic-regexp operand)
498 (list (concat op-string (unless greedy "?"))))
499 ;; The result has precedence seq to avoid (? (* "a")) -> "a*?"
500 'seq)
501 operand)))
502
503(defun rx--control-greedy (greedy body)
504 "Translate the sequence BODY with greediness GREEDY.
505Return (REGEXP . PRECEDENCE)."
506 (let ((rx--greedy greedy))
507 (rx--translate-seq body)))
508
509(defun rx--translate-group (body)
510 "Translate the `group' form. Return (REGEXP . PRECEDENCE)."
511 (cons (rx--enclose "\\("
512 (car (rx--translate-seq body))
513 "\\)")
514 t))
515
516(defun rx--translate-group-n (body)
517 "Translate the `group-n' form. Return (REGEXP . PRECEDENCE)."
518 (unless (and (integerp (car body)) (> (car body) 0))
519 (error "rx `group-n' requires a positive number as first argument"))
520 (cons (rx--enclose (concat "\\(?" (number-to-string (car body)) ":")
521 (car (rx--translate-seq (cdr body)))
522 "\\)")
523 t))
524
525(defun rx--translate-backref (body)
526 "Translate the `backref' form. Return (REGEXP . PRECEDENCE)."
527 (unless (and (= (length body) 1) (integerp (car body)) (<= 1 (car body) 9))
528 (error "rx `backref' requires an argument in the range 1..9"))
529 (cons (list "\\" (number-to-string (car body))) t))
530
531(defconst rx--syntax-codes
532 '((whitespace . ?-) ; SPC also accepted
533 (punctuation . ?.)
534 (word . ?w) ; W also accepted
535 (symbol . ?_)
536 (open-parenthesis . ?\()
537 (close-parenthesis . ?\))
538 (expression-prefix . ?\')
539 (string-quote . ?\")
540 (paired-delimiter . ?$)
541 (escape . ?\\)
542 (character-quote . ?/)
543 (comment-start . ?<)
544 (comment-end . ?>)
545 (string-delimiter . ?|)
546 (comment-delimiter . ?!)))
547
548(defun rx--translate-syntax (negated body)
549 "Translate the `syntax' form. Return (REGEXP . PRECEDENCE)."
550 (unless (and body (null (cdr body)))
551 (error "rx `syntax' form takes exactly one argument"))
552 (let* ((sym (car body))
553 (syntax (cdr (assq sym rx--syntax-codes))))
829 (unless syntax 554 (unless syntax
830 ;; Try sregex compatibility.
831 (cond 555 (cond
832 ((characterp sym) (setq syntax sym)) 556 ;; Syntax character directly (sregex compatibility)
557 ((and (characterp sym) (rassq sym rx--syntax-codes))
558 (setq syntax sym))
559 ;; Syntax character as symbol (sregex compatibility)
833 ((symbolp sym) 560 ((symbolp sym)
834 (let ((name (symbol-name sym))) 561 (let ((name (symbol-name sym)))
835 (if (= 1 (length name)) 562 (when (= (length name) 1)
836 (setq syntax (aref name 0)))))) 563 (let ((char (string-to-char name)))
564 (when (rassq char rx--syntax-codes)
565 (setq syntax char)))))))
837 (unless syntax 566 (unless syntax
838 (error "Unknown rx syntax `%s'" sym))) 567 (error "Unknown rx syntax name `%s'" sym)))
839 (format "\\s%c" syntax))) 568 (cons (list (string ?\\ (if negated ?S ?s) syntax))
840 569 t)))
841 570
842(defun rx-check-category (form) 571(defconst rx--categories
843 "Check the argument FORM of a `(category FORM)'." 572 '((space-for-indent . ?\s)
844 (unless (or (integerp form) 573 (base . ?.)
845 (cdr (assq form rx-categories))) 574 (consonant . ?0)
846 (error "Unknown category `%s'" form)) 575 (base-vowel . ?1)
847 t) 576 (upper-diacritical-mark . ?2)
848 577 (lower-diacritical-mark . ?3)
849 578 (tone-mark . ?4)
850(defun rx-category (form) 579 (symbol . ?5)
851 "Parse and produce code from FORM, which is `(category SYMBOL)'." 580 (digit . ?6)
852 (rx-check form) 581 (vowel-modifying-diacritical-mark . ?7)
853 (let ((char (if (integerp (cadr form)) 582 (vowel-sign . ?8)
854 (cadr form) 583 (semivowel-lower . ?9)
855 (cdr (assq (cadr form) rx-categories))))) 584 (not-at-end-of-line . ?<)
856 (format "\\c%c" char))) 585 (not-at-beginning-of-line . ?>)
857 586 (alpha-numeric-two-byte . ?A)
858 587 (chinese-two-byte . ?C)
859(defun rx-eval (form) 588 (chinse-two-byte . ?C) ; A typo in Emacs 21.1-24.3.
860 "Parse and produce code from FORM, which is `(eval FORM)'." 589 (greek-two-byte . ?G)
861 (rx-check form) 590 (japanese-hiragana-two-byte . ?H)
862 (rx-form (eval (cadr form)) rx-parent)) 591 (indian-two-byte . ?I)
863 592 (japanese-katakana-two-byte . ?K)
864 593 (strong-left-to-right . ?L)
865(defun rx-greedy (form) 594 (korean-hangul-two-byte . ?N)
866 "Parse and produce code from FORM. 595 (strong-right-to-left . ?R)
867If FORM is `(minimal-match FORM1)', non-greedy versions of `*', 596 (cyrillic-two-byte . ?Y)
868`+', and `?' operators will be used in FORM1. If FORM is 597 (combining-diacritic . ?^)
869`(maximal-match FORM1)', greedy operators will be used." 598 (ascii . ?a)
870 (rx-check form) 599 (arabic . ?b)
871 (let ((rx-greedy-flag (eq (car form) 'maximal-match))) 600 (chinese . ?c)
872 (rx-form (cadr form) rx-parent))) 601 (ethiopic . ?e)
873 602 (greek . ?g)
874 603 (korean . ?h)
875(defun rx-regexp (form) 604 (indian . ?i)
876 "Parse and produce code from FORM, which is `(regexp STRING)'." 605 (japanese . ?j)
877 (cond ((stringp (cadr form)) 606 (japanese-katakana . ?k)
878 (rx-group-if (cadr form) rx-parent)) 607 (latin . ?l)
879 (rx--compile-to-lisp 608 (lao . ?o)
880 ;; Always group non-string forms, since we can't be sure they 609 (tibetan . ?q)
881 ;; are atomic. 610 (japanese-roman . ?r)
882 (rx-group-if (cdr form) t)) 611 (thai . ?t)
883 (t (rx-check form)))) 612 (vietnamese . ?v)
884 613 (hebrew . ?w)
885(defun rx-literal (form) 614 (cyrillic . ?y)
886 "Parse and produce code from FORM, which is `(literal STRING-EXP)'." 615 (can-break . ?|)))
887 (cond ((stringp (cadr form)) 616
888 ;; This is allowed, but makes little sense, you could just 617(defun rx--translate-category (negated body)
889 ;; use STRING directly. 618 "Translate the `category' form. Return (REGEXP . PRECEDENCE)."
890 (rx-group-if (regexp-quote (cadr form)) rx-parent)) 619 (unless (and body (null (cdr body)))
891 (rx--compile-to-lisp 620 (error "rx `category' form takes exactly one argument"))
892 (rx-group-if `((regexp-quote ,(cadr form))) rx-parent)) 621 (let* ((arg (car body))
893 (t (rx-check form)))) 622 (category
894 623 (cond ((symbolp arg)
895(defun rx-form (form &optional parent) 624 (let ((cat (assq arg rx--categories)))
896 "Parse and produce code for regular expression FORM. 625 (unless cat
897FORM is a regular expression in sexp form. 626 (error "Unknown rx category `%s'" arg))
898PARENT shows which type of expression calls and controls putting of 627 (cdr cat)))
899shy groups around the result and some more in other functions." 628 ((characterp arg) arg)
900 (let ((rx-parent parent)) 629 (t (error "Invalid rx `category' argument `%s'" arg)))))
901 (cond 630 (cons (list (string ?\\ (if negated ?C ?c) category))
902 ((stringp form) 631 t)))
903 (rx-group-if (regexp-quote form) 632
904 (if (and (eq parent '*) (< 1 (length form))) 633(defvar rx--delayed-evaluation nil
905 parent))) 634 "Whether to allow certain forms to be evaluated at runtime.")
906 ((integerp form) 635
907 (regexp-quote (char-to-string form))) 636(defun rx--translate-literal (body)
908 ((symbolp form) 637 "Translate the `literal' form. Return (REGEXP . PRECEDENCE)."
909 (let ((info (rx-info form nil))) 638 (unless (and body (null (cdr body)))
910 (cond ((stringp info) 639 (error "rx `literal' form takes exactly one argument"))
911 info) 640 (let ((arg (car body)))
912 ((null info) 641 (cond ((stringp arg)
913 (error "Unknown rx form `%s'" form)) 642 (cons (list (regexp-quote arg)) (if (= (length arg) 1) t 'seq)))
914 (t 643 (rx--delayed-evaluation
915 (funcall (nth 0 info) form))))) 644 (cons (list (list 'regexp-quote arg)) 'seq))
916 ((consp form) 645 (t (error "rx `literal' form with non-string argument")))))
917 (let ((info (rx-info (car form) 'head))) 646
918 (unless (consp info) 647(defun rx--translate-eval (body)
919 (error "Unknown rx form `%s'" (car form))) 648 "Translate the `eval' form. Return (REGEXP . PRECEDENCE)."
920 (funcall (nth 0 info) form))) 649 (unless (and body (null (cdr body)))
921 (t 650 (error "rx `eval' form takes exactly one argument"))
922 (error "rx syntax error at `%s'" form))))) 651 (rx--translate (eval (car body))))
923 652
924(defun rx--subforms (subforms &optional parent separator) 653(defvar rx--regexp-atomic-regexp nil)
925 "Produce code for regular expressions SUBFORMS. 654
926SUBFORMS is a list of regular expression sexps. 655(defun rx--translate-regexp (body)
927PARENT controls grouping, as in `rx-form'. 656 "Translate the `regexp' form. Return (REGEXP . PRECEDENCE)."
928Insert SEPARATOR between the code from each of SUBFORMS." 657 (unless (and body (null (cdr body)))
929 (if (null (cdr subforms)) 658 (error "rx `regexp' form takes exactly one argument"))
930 ;; Zero or one forms, no need for grouping. 659 (let ((arg (car body)))
931 (and subforms (rx-form (car subforms))) 660 (cond ((stringp arg)
932 (let ((listify (lambda (x) 661 ;; Generate the regexp when needed, since rx isn't
933 (if (listp x) (copy-sequence x) 662 ;; necessarily present in the byte-compilation environment.
934 (list x))))) 663 (unless rx--regexp-atomic-regexp
935 (setq subforms (mapcar (lambda (x) (rx-form x parent)) subforms)) 664 (setq rx--regexp-atomic-regexp
936 (cond ((or (not rx--compile-to-lisp) 665 ;; Match atomic (precedence t) regexps: may give
937 (cl-every #'stringp subforms)) 666 ;; false negatives but no false positives, assuming
938 (mapconcat #'identity subforms separator)) 667 ;; the target string is syntactically correct.
939 (separator 668 (rx-to-string
940 (nconc (funcall listify (car subforms)) 669 '(seq
941 (mapcan (lambda (x) 670 bos
942 (cons separator (funcall listify x))) 671 (or (seq "["
943 (cdr subforms)))) 672 (opt "^")
944 (t (mapcan listify subforms)))))) 673 (opt "]")
674 (* (or (seq "[:" (+ (any "a-z")) ":]")
675 (not (any "]"))))
676 "]")
677 anything
678 (seq "\\"
679 (or anything
680 (seq (any "sScC_") anything)
681 (seq "("
682 (* (or (not (any "\\"))
683 (seq "\\" (not (any ")")))))
684 "\\)"))))
685 eos)
686 t)))
687 (cons (list arg)
688 (if (string-match-p rx--regexp-atomic-regexp arg) t nil)))
689 (rx--delayed-evaluation
690 (cons (list arg) nil))
691 (t (error "rx `regexp' form with non-string argument")))))
692
693(defun rx--translate-compat-form (def form)
694 "Translate a compatibility form from `rx-constituents'.
695DEF is the definition tuple. Return (REGEXP . PRECEDENCE)."
696 (let* ((fn (nth 0 def))
697 (min-args (nth 1 def))
698 (max-args (nth 2 def))
699 (predicate (nth 3 def))
700 (nargs (1- (length form))))
701 (when (< nargs min-args)
702 (error "The `%s' form takes at least %d argument(s)"
703 (car form) min-args))
704 (when (and max-args (> nargs max-args))
705 (error "The `%s' form takes at most %d argument(s)"
706 (car form) max-args))
707 (when (and predicate (not (rx--every predicate (cdr form))))
708 (error "The `%s' form requires arguments satisfying `%s'"
709 (car form) predicate))
710 (let ((regexp (funcall fn form)))
711 (unless (stringp regexp)
712 (error "The `%s' form did not expand to a string" (car form)))
713 (cons (list regexp) nil))))
714
715(defun rx--translate-form (form)
716 "Translate an rx form (list structure). Return (REGEXP . PRECEDENCE)."
717 (let ((body (cdr form)))
718 (pcase (car form)
719 ((or 'seq : 'and 'sequence) (rx--translate-seq body))
720 ((or 'or '|) (rx--translate-or body))
721 ((or 'any 'in 'char) (rx--translate-any nil body))
722 ('not-char (rx--translate-any t body))
723 ('not (rx--translate-not nil body))
724
725 ('repeat (rx--translate-repeat body))
726 ('= (rx--translate-= body))
727 ('>= (rx--translate->= body))
728 ('** (rx--translate-** body))
729
730 ((or 'zero-or-more '0+) (rx--translate-rep "*" rx--greedy body))
731 ((or 'one-or-more '1+) (rx--translate-rep "+" rx--greedy body))
732 ((or 'zero-or-one 'opt 'optional) (rx--translate-rep "?" rx--greedy body))
733
734 ('* (rx--translate-rep "*" t body))
735 ('+ (rx--translate-rep "+" t body))
736 ((or '\? ?\s) (rx--translate-rep "?" t body))
737
738 ('*? (rx--translate-rep "*" nil body))
739 ('+? (rx--translate-rep "+" nil body))
740 ((or '\?? ??) (rx--translate-rep "?" nil body))
741
742 ('minimal-match (rx--control-greedy nil body))
743 ('maximal-match (rx--control-greedy t body))
744
745 ((or 'group 'submatch) (rx--translate-group body))
746 ((or 'group-n 'submatch-n) (rx--translate-group-n body))
747 ('backref (rx--translate-backref body))
748
749 ('syntax (rx--translate-syntax nil body))
750 ('not-syntax (rx--translate-syntax t body))
751 ('category (rx--translate-category nil body))
752
753 ('literal (rx--translate-literal body))
754 ('eval (rx--translate-eval body))
755 ((or 'regexp 'regex) (rx--translate-regexp body))
756
757 (op
758 (unless (symbolp op)
759 (error "Bad rx operator `%S'" op))
760
761 ;; For compatibility with old rx.
762 (let ((entry (assq op rx-constituents)))
763 (if (progn
764 (while (and entry (not (consp (cdr entry))))
765 (setq entry
766 (if (symbolp (cdr entry))
767 ;; Alias for another entry.
768 (assq (cdr entry) rx-constituents)
769 ;; Wrong type, try further down the list.
770 (assq (car entry)
771 (cdr (memq entry rx-constituents))))))
772 entry)
773 (rx--translate-compat-form (cdr entry) form)
774 (error "Unknown rx form `%s'" op)))))))
775
776;; Defined here rather than in re-builder to lower the odds that it
777;; will be kept in sync with changes.
778(defconst rx--builtin-forms
779 '(seq sequence : and or | any in char not-char not
780 repeat = >= **
781 zero-or-more 0+ *
782 one-or-more 1+ +
783 zero-or-one opt optional \?
784 *? +? \??
785 minimal-match maximal-match
786 group submatch group-n submatch-n backref
787 syntax not-syntax category
788 literal eval regexp regex)
789 "List of built-in rx forms. For use in re-builder only.")
790
791(defun rx--translate (item)
792 "Translate the rx-expression ITEM. Return (REGEXP . PRECEDENCE)."
793 (cond
794 ((stringp item)
795 (if (= (length item) 0)
796 (cons nil 'seq)
797 (cons (list (regexp-quote item)) (if (= (length item) 1) t 'seq))))
798 ((characterp item)
799 (cons (list (regexp-quote (char-to-string item))) t))
800 ((symbolp item)
801 (rx--translate-symbol item))
802 ((consp item)
803 (rx--translate-form item))
804 (t (error "Bad rx expression: %S" item))))
945 805
946 806
947;;;###autoload 807;;;###autoload
948(defun rx-to-string (form &optional no-group) 808(defun rx-to-string (form &optional no-group)
949 "Parse and produce code for regular expression FORM. 809 "Translate FORM from `rx' sexp syntax into a string regexp.
950FORM is a regular expression in sexp form. 810The arguments to `literal' and `regexp' forms inside FORM must be
951NO-GROUP non-nil means don't put shy groups around the result. 811constant strings.
952 812If NO-GROUP is non-nil, don't bracket the result in a non-capturing
953In contrast to the `rx' macro, subforms `literal' and `regexp' 813group."
954will not accept non-string arguments, i.e., (literal STRING) 814 (let* ((item (rx--translate form))
955becomes just a more verbose version of STRING." 815 (exprs (if no-group
956 (rx-group-if (rx-form form) (null no-group))) 816 (car item)
817 (rx--atomic-regexp item))))
818 (apply #'concat exprs)))
819
820(defun rx--to-expr (form)
821 "Translate the rx-expression FORM to a Lisp expression yielding a regexp."
822 (let* ((rx--delayed-evaluation t)
823 (elems (car (rx--translate form)))
824 (args nil))
825 ;; Merge adjacent strings.
826 (while elems
827 (let ((strings nil))
828 (while (and elems (stringp (car elems)))
829 (push (car elems) strings)
830 (setq elems (cdr elems)))
831 (let ((s (apply #'concat (nreverse strings))))
832 (unless (zerop (length s))
833 (push s args))))
834 (when elems
835 (push (car elems) args)
836 (setq elems (cdr elems))))
837 (cond ((null args) "") ; 0 args
838 ((cdr args) (cons 'concat (nreverse args))) ; ≄2 args
839 (t (car args))))) ; 1 arg
957 840
958 841
959;;;###autoload 842;;;###autoload
@@ -1054,78 +937,64 @@ Zero-width assertions: these all match the empty string in specific places.
1054 937
1055(literal EXPR) Match the literal string from evaluating EXPR at run time. 938(literal EXPR) Match the literal string from evaluating EXPR at run time.
1056(regexp EXPR) Match the string regexp from evaluating EXPR at run time. 939(regexp EXPR) Match the string regexp from evaluating EXPR at run time.
1057(eval EXPR) Match the rx sexp from evaluating EXPR at compile time." 940(eval EXPR) Match the rx sexp from evaluating EXPR at compile time.
1058 (let* ((rx--compile-to-lisp t) 941
1059 (re (cond ((null regexps) 942\(fn REGEXPS...)"
1060 (error "No regexp")) 943 (rx--to-expr (cons 'seq regexps)))
1061 ((cdr regexps) 944
1062 (rx-to-string `(and ,@regexps) t)) 945
1063 (t 946;; During `rx--pcase-transform', list of defined variables in right-to-left
1064 (rx-to-string (car regexps) t))))) 947;; order.
1065 (if (stringp re) 948(defvar rx--pcase-vars)
1066 re 949
1067 `(concat ,@re)))) 950(defun rx--pcase-transform (rx)
1068 951 "Transform RX, an rx-expression augmented with `let' and named `backref',
952into a plain rx-expression, collecting names into `rx--pcase-vars'."
953 (pcase rx
954 (`(let ,name . ,body)
955 (let* ((index (length (memq name rx--pcase-vars)))
956 (i (if (zerop index)
957 (length (push name rx--pcase-vars))
958 index)))
959 `(group-n ,i ,(rx--pcase-transform (cons 'seq body)))))
960 ((and `(backref ,ref)
961 (guard (symbolp ref)))
962 (let ((index (length (memq ref rx--pcase-vars))))
963 (when (zerop index)
964 (error "rx `backref' variable must be one of: %s"
965 (mapconcat #'symbol-name rx--pcase-vars " ")))
966 `(backref ,index)))
967 ((and `(,head . ,rest)
968 (guard (and (symbolp head)
969 (not (memq head '(literal regexp regex eval))))))
970 (cons head (mapcar #'rx--pcase-transform rest)))
971 (_ rx)))
1069 972
1070(pcase-defmacro rx (&rest regexps) 973(pcase-defmacro rx (&rest regexps)
1071 "Build a `pcase' pattern matching `rx' REGEXPS in sexp form. 974 "A pattern that matches strings against `rx' REGEXPS in sexp form.
1072The REGEXPS are interpreted as in `rx'. The pattern matches any 975REGEXPS are interpreted as in `rx'. The pattern matches any
1073string that is a match for the regular expression so constructed, 976string that is a match for REGEXPS, as if by `string-match'.
1074as if by `string-match'.
1075 977
1076In addition to the usual `rx' constructs, REGEXPS can contain the 978In addition to the usual `rx' syntax, REGEXPS can contain the
1077following constructs: 979following constructs:
1078 980
1079 (let REF SEXP...) creates a new explicitly named reference to 981 (let REF RX...) binds the symbol REF to a submatch that matches
1080 a submatch that matches regular expressions 982 the regular expressions RX. REF is bound in
1081 SEXP, and binds the match to REF. 983 CODE to the string of the submatch or nil, but
1082 (backref REF) creates a backreference to the submatch 984 can also be used in `backref'.
1083 introduced by a previous (let REF ...) 985 (backref REF) matches whatever the submatch REF matched.
1084 construct. REF can be the same symbol 986 REF can be a number, as usual, or a name
1085 in the first argument of the corresponding 987 introduced by a previous (let REF ...)
1086 (let REF ...) construct, or it can be a 988 construct."
1087 submatch number. It matches the referenced 989 (let* ((rx--pcase-vars nil)
1088 submatch. 990 (regexp (rx--to-expr (rx--pcase-transform (cons 'seq regexps)))))
1089
1090The REFs are associated with explicitly named submatches starting
1091from 1. Multiple occurrences of the same REF refer to the same
1092submatch.
1093
1094If a case matches, the match data is modified as usual so you can
1095use it in the case body, but you still have to pass the correct
1096string as argument to `match-string'."
1097 (let* ((vars ())
1098 (rx-constituents
1099 `((let
1100 ,(lambda (form)
1101 (rx-check form)
1102 (let ((var (cadr form)))
1103 (cl-check-type var symbol)
1104 (let ((i (or (cl-position var vars :test #'eq)
1105 (prog1 (length vars)
1106 (setq vars `(,@vars ,var))))))
1107 (rx-form `(submatch-n ,(1+ i) ,@(cddr form))))))
1108 1 nil)
1109 (backref
1110 ,(lambda (form)
1111 (rx-check form)
1112 (rx-backref
1113 `(backref ,(let ((var (cadr form)))
1114 (if (integerp var) var
1115 (1+ (cl-position var vars :test #'eq)))))))
1116 1 1
1117 ,(lambda (var)
1118 (cond ((integerp var) (rx-check-backref var))
1119 ((memq var vars) t)
1120 (t (error "rx `backref' variable must be one of %s: %s"
1121 vars var)))))
1122 ,@rx-constituents))
1123 (regexp (rx-to-string `(seq ,@regexps) :no-group)))
1124 `(and (pred (string-match ,regexp)) 991 `(and (pred (string-match ,regexp))
1125 ,@(cl-loop for i from 1 992 ,@(let ((i 0))
1126 for var in vars 993 (mapcar (lambda (name)
1127 collect `(app (match-string ,i) ,var))))) 994 (setq i (1+ i))
1128 995 `(app (match-string ,i) ,name))
996 (reverse rx--pcase-vars))))))
997
1129(provide 'rx) 998(provide 'rx)
1130 999
1131;;; rx.el ends here 1000;;; rx.el ends here
diff --git a/test/lisp/emacs-lisp/rx-tests.el b/test/lisp/emacs-lisp/rx-tests.el
index 8845ebf46d1..fec046dd991 100644
--- a/test/lisp/emacs-lisp/rx-tests.el
+++ b/test/lisp/emacs-lisp/rx-tests.el
@@ -1,4 +1,4 @@
1;;; rx-tests.el --- test for rx.el functions -*- lexical-binding: t -*- 1;;; rx-tests.el --- tests for rx.el -*- lexical-binding: t -*-
2 2
3;; Copyright (C) 2016-2019 Free Software Foundation, Inc. 3;; Copyright (C) 2016-2019 Free Software Foundation, Inc.
4 4
@@ -17,21 +17,44 @@
17;; You should have received a copy of the GNU General Public License 17;; You should have received a copy of the GNU General Public License
18;; along with GNU Emacs. If not, see <https://www.gnu.org/licenses/>. 18;; along with GNU Emacs. If not, see <https://www.gnu.org/licenses/>.
19 19
20;;; Commentary:
21
22(require 'ert) 20(require 'ert)
23(require 'rx) 21(require 'rx)
24 22
25;;; Code: 23(ert-deftest rx-seq ()
24 (should (equal (rx "a.b" "*" "c")
25 "a\\.b\\*c"))
26 (should (equal (rx (seq "a" (: "b" (and "c" (sequence "d" nonl)
27 "e")
28 "f")
29 "g"))
30 "abcd.efg"))
31 (should (equal (rx "a$" "b")
32 "a\\$b"))
33 (should (equal (rx bol "a" "b" ?c eol)
34 "^abc$"))
35 (should (equal (rx "a" "" "b")
36 "ab"))
37 (should (equal (rx (seq))
38 ""))
39 (should (equal (rx "" (or "ab" nonl) "")
40 "ab\\|.")))
41
42(ert-deftest rx-or ()
43 (should (equal (rx (or "ab" (| "c" nonl) "de"))
44 "ab\\|c\\|.\\|de"))
45 (should (equal (rx (or "ab" "abc" "a"))
46 "\\(?:ab\\|abc\\|a\\)"))
47 (should (equal (rx (| nonl "a") (| "b" blank))
48 "\\(?:.\\|a\\)\\(?:b\\|[[:blank:]]\\)"))
49 (should (equal (rx (|))
50 "\\`a\\`")))
26 51
27(ert-deftest rx-char-any () 52(ert-deftest rx-char-any ()
28 "Test character alternatives with `]' and `-' (Bug#25123)." 53 "Test character alternatives with `]' and `-' (Bug#25123)."
29 (should (string-match 54 (should (equal
30 (rx string-start (1+ (char (?\] . ?\{) (?< . ?\]) (?- . ?:))) 55 (rx string-start (1+ (char (?\] . ?\{) (?< . ?\]) (?- . ?:)))
31 string-end) 56 string-end)
32 (apply #'string (nconc (number-sequence ?\] ?\{) 57 "\\`[.-:<-{-]+\\'")))
33 (number-sequence ?< ?\])
34 (number-sequence ?- ?:))))))
35 58
36(ert-deftest rx-char-any-range-nl () 59(ert-deftest rx-char-any-range-nl ()
37 "Test character alternatives with LF as a range endpoint." 60 "Test character alternatives with LF as a range endpoint."
@@ -40,28 +63,72 @@
40 (should (equal (rx (any "\a-\n")) 63 (should (equal (rx (any "\a-\n"))
41 "[\a-\n]"))) 64 "[\a-\n]")))
42 65
43(ert-deftest rx-char-any-range-bad ()
44 (should-error (rx (any "0-9a-Z")))
45 (should-error (rx (any (?0 . ?9) (?a . ?Z)))))
46
47(ert-deftest rx-char-any-raw-byte () 66(ert-deftest rx-char-any-raw-byte ()
48 "Test raw bytes in character alternatives." 67 "Test raw bytes in character alternatives."
68
69 ;; The multibyteness of the rx return value sometimes depends on whether
70 ;; the test had been byte-compiled or not, so we add explicit conversions.
71
49 ;; Separate raw characters. 72 ;; Separate raw characters.
50 (should (equal (string-match-p (rx (any "\326A\333B")) 73 (should (equal (string-to-multibyte (rx (any "\326A\333B")))
51 "X\326\333") 74 (string-to-multibyte "[AB\326\333]")))
52 1))
53 ;; Range of raw characters, unibyte. 75 ;; Range of raw characters, unibyte.
54 (should (equal (string-match-p (rx (any "\200-\377")) 76 (should (equal (string-to-multibyte (rx (any "\200-\377")))
55 "ĂżA\310B") 77 (string-to-multibyte "[\200-\377]")))
56 2)) 78
57 ;; Range of raw characters, multibyte. 79 ;; Range of raw characters, multibyte.
58 (should (equal (string-match-p (rx (any "Å\211\326-\377\177")) 80 (should (equal (rx (any "Å\211\326-\377\177"))
59 "XY\355\177\327") 81 "[\177Å\211\326-\377]"))
60 2))
61 ;; Split range; \177-\377Ăż should not be optimised to \177-\377. 82 ;; Split range; \177-\377Ăż should not be optimised to \177-\377.
62 (should (equal (string-match-p (rx (any "\177-\377" ?Ăż)) 83 (should (equal (rx (any "\177-\377" ?Ăż))
63 "ĂżA\310B") 84 "[\177Ăż\200-\377]")))
64 0))) 85
86(ert-deftest rx-any ()
87 (should (equal (rx (any ?A (?C . ?D) "F-H" "J-L" "M" "N-P" "Q" "RS"))
88 "[ACDF-HJ-S]"))
89 (should (equal (rx (in "a!f" ?c) (char "q-z" "0-3")
90 (not-char "a-e1-5") (not (in "A-M" ?q)))
91 "[!acf][0-3q-z][^1-5a-e][^A-Mq]"))
92 (should (equal (rx (any "^") (any "]") (any "-")
93 (not (any "^")) (not (any "]")) (not (any "-")))
94 "\\^]-[^^][^]][^-]"))
95 (should (equal (rx (any "]" "^") (any "]" "-") (any "-" "^")
96 (not (any "]" "^")) (not (any "]" "-"))
97 (not (any "-" "^")))
98 "[]^][]-][-^][^]^][^]-][^-^]"))
99 (should (equal (rx (any "]" "^" "-") (not (any "]" "^" "-")))
100 "[]^-][^]^-]"))
101 (should (equal (rx (any "-" ascii) (any "^" ascii) (any "]" ascii))
102 "[[:ascii:]-][[:ascii:]^][][:ascii:]]"))
103 (should (equal (rx (not (any "-" ascii)) (not (any "^" ascii))
104 (not (any "]" ascii)))
105 "[^[:ascii:]-][^[:ascii:]^][^][:ascii:]]"))
106 (should (equal (rx (any "-]" ascii) (any "^]" ascii) (any "-^" ascii))
107 "[][:ascii:]-][]^[:ascii:]][[:ascii:]^-]"))
108 (should (equal (rx (not (any "-]" ascii)) (not (any "^]" ascii))
109 (not (any "-^" ascii)))
110 "[^][:ascii:]-][^]^[:ascii:]][^[:ascii:]^-]"))
111 (should (equal (rx (any "-]^" ascii) (not (any "-]^" ascii)))
112 "[]^[:ascii:]-][^]^[:ascii:]-]"))
113 (should (equal (rx (any "^" lower upper) (not (any "^" lower upper)))
114 "[[:lower:]^[:upper:]][^[:lower:]^[:upper:]]"))
115 (should (equal (rx (any "-" lower upper) (not (any "-" lower upper)))
116 "[[:lower:][:upper:]-][^[:lower:][:upper:]-]"))
117 (should (equal (rx (any "]" lower upper) (not (any "]" lower upper)))
118 "[][:lower:][:upper:]][^][:lower:][:upper:]]"))
119 (should (equal (rx (any "-a" "c-" "f-f" "--/*--"))
120 "[*-/acf]"))
121 (should (equal (rx (any "]-a" ?-) (not (any "]-a" ?-)))
122 "[]-a-][^]-a-]"))
123 (should (equal (rx (any "--]") (not (any "--]"))
124 (any "-" "^-a") (not (any "-" "^-a")))
125 "[].-\\-][^].-\\-][-^-a][^-^-a]"))
126 (should (equal (rx (not (any "!a" "0-8" digit nonascii)))
127 "[^!0-8a[:digit:][:nonascii:]]"))
128 (should (equal (rx (any) (not (any)))
129 "\\`a\\`\\(?:.\\|\n\\)"))
130 (should (equal (rx (any "") (not (any "")))
131 "\\`a\\`\\(?:.\\|\n\\)")))
65 132
66(ert-deftest rx-pcase () 133(ert-deftest rx-pcase ()
67 (should (equal (pcase "a 1 2 3 1 1 b" 134 (should (equal (pcase "a 1 2 3 1 1 b"
@@ -71,7 +138,11 @@
71 (backref u) space 138 (backref u) space
72 (backref 1)) 139 (backref 1))
73 (list u v))) 140 (list u v)))
74 '("1" "3")))) 141 '("1" "3")))
142 (let ((k "blue"))
143 (should (equal (pcase "<blue>"
144 ((rx "<" (literal k) ">") 'ok))
145 'ok))))
75 146
76(ert-deftest rx-kleene () 147(ert-deftest rx-kleene ()
77 "Test greedy and non-greedy repetition operators." 148 "Test greedy and non-greedy repetition operators."
@@ -94,71 +165,158 @@
94 (should (equal (rx (maximal-match 165 (should (equal (rx (maximal-match
95 (seq (* "a") (+ "b") (\? "c") (?\s "d") 166 (seq (* "a") (+ "b") (\? "c") (?\s "d")
96 (*? "e") (+? "f") (\?? "g") (?? "h")))) 167 (*? "e") (+? "f") (\?? "g") (?? "h"))))
97 "a*b+c?d?e*?f+?g??h??"))) 168 "a*b+c?d?e*?f+?g??h??"))
169 (should (equal (rx "a" (*) (+ (*)) (? (*) (+)) "b")
170 "ab")))
98 171
99(ert-deftest rx-or () 172(ert-deftest rx-repeat ()
100 ;; Test or-pattern reordering (Bug#34641). 173 (should (equal (rx (= 3 "a") (>= 51 "b")
101 (let ((s "abc")) 174 (** 2 11 "c") (repeat 6 "d") (repeat 4 8 "e"))
102 (should (equal (and (string-match (rx (or "abc" "ab" "a")) s) 175 "a\\{3\\}b\\{51,\\}c\\{2,11\\}d\\{6\\}e\\{4,8\\}"))
103 (match-string 0 s)) 176 (should (equal (rx (= 0 "k") (>= 0 "l") (** 0 0 "m") (repeat 0 "n")
104 "abc")) 177 (repeat 0 0 "o"))
105 (should (equal (and (string-match (rx (or "ab" "abc" "a")) s) 178 "k\\{0\\}l\\{0,\\}m\\{0\\}n\\{0\\}o\\{0\\}"))
106 (match-string 0 s)) 179 (should (equal (rx (opt (0+ "a")))
107 "ab")) 180 "\\(?:a*\\)?"))
108 (should (equal (and (string-match (rx (or "a" "ab" "abc")) s) 181 (should (equal (rx (opt (= 4 "a")))
109 (match-string 0 s)) 182 "a\\{4\\}?"))
110 "a"))) 183 (should (equal (rx "a" (** 3 7) (= 4) (>= 3) (= 4 (>= 7) (= 2)) "b")
111 ;; Test zero-argument `or'. 184 "ab")))
112 (should (equal (rx (or)) regexp-unmatchable))) 185
186(ert-deftest rx-atoms ()
187 (should (equal (rx anything)
188 ".\\|\n"))
189 (should (equal (rx line-start not-newline nonl any line-end)
190 "^...$"))
191 (should (equal (rx bol string-start string-end buffer-start buffer-end
192 bos eos bot eot eol)
193 "^\\`\\'\\`\\'\\`\\'\\`\\'$"))
194 (should (equal (rx point word-start word-end bow eow symbol-start symbol-end
195 word-boundary not-word-boundary not-wordchar)
196 "\\=\\<\\>\\<\\>\\_<\\_>\\b\\B\\W"))
197 (should (equal (rx digit numeric num control cntrl)
198 "[[:digit:]][[:digit:]][[:digit:]][[:cntrl:]][[:cntrl:]]"))
199 (should (equal (rx hex-digit hex xdigit blank)
200 "[[:xdigit:]][[:xdigit:]][[:xdigit:]][[:blank:]]"))
201 (should (equal (rx graph graphic print printing)
202 "[[:graph:]][[:graph:]][[:print:]][[:print:]]"))
203 (should (equal (rx alphanumeric alnum letter alphabetic alpha)
204 "[[:alnum:]][[:alnum:]][[:alpha:]][[:alpha:]][[:alpha:]]"))
205 (should (equal (rx ascii nonascii lower lower-case)
206 "[[:ascii:]][[:nonascii:]][[:lower:]][[:lower:]]"))
207 (should (equal (rx punctuation punct space whitespace white)
208 "[[:punct:]][[:punct:]][[:space:]][[:space:]][[:space:]]"))
209 (should (equal (rx upper upper-case word wordchar)
210 "[[:upper:]][[:upper:]][[:word:]][[:word:]]"))
211 (should (equal (rx unibyte multibyte)
212 "[[:unibyte:]][[:multibyte:]]")))
213
214(ert-deftest rx-syntax ()
215 (should (equal (rx (syntax whitespace) (syntax punctuation)
216 (syntax word) (syntax symbol)
217 (syntax open-parenthesis) (syntax close-parenthesis))
218 "\\s-\\s.\\sw\\s_\\s(\\s)"))
219 (should (equal (rx (syntax string-quote) (syntax paired-delimiter)
220 (syntax escape) (syntax character-quote)
221 (syntax comment-start) (syntax comment-end)
222 (syntax string-delimiter) (syntax comment-delimiter))
223 "\\s\"\\s$\\s\\\\s/\\s<\\s>\\s|\\s!")))
224
225(ert-deftest rx-category ()
226 (should (equal (rx (category space-for-indent) (category base)
227 (category consonant) (category base-vowel)
228 (category upper-diacritical-mark)
229 (category lower-diacritical-mark)
230 (category tone-mark) (category symbol)
231 (category digit)
232 (category vowel-modifying-diacritical-mark)
233 (category vowel-sign) (category semivowel-lower)
234 (category not-at-end-of-line)
235 (category not-at-beginning-of-line))
236 "\\c \\c.\\c0\\c1\\c2\\c3\\c4\\c5\\c6\\c7\\c8\\c9\\c<\\c>"))
237 (should (equal (rx (category alpha-numeric-two-byte)
238 (category chinese-two-byte) (category greek-two-byte)
239 (category japanese-hiragana-two-byte)
240 (category indian-two-byte)
241 (category japanese-katakana-two-byte)
242 (category strong-left-to-right)
243 (category korean-hangul-two-byte)
244 (category strong-right-to-left)
245 (category cyrillic-two-byte)
246 (category combining-diacritic))
247 "\\cA\\cC\\cG\\cH\\cI\\cK\\cL\\cN\\cR\\cY\\c^"))
248 (should (equal (rx (category ascii) (category arabic) (category chinese)
249 (category ethiopic) (category greek) (category korean)
250 (category indian) (category japanese)
251 (category japanese-katakana) (category latin)
252 (category lao) (category tibetan))
253 "\\ca\\cb\\cc\\ce\\cg\\ch\\ci\\cj\\ck\\cl\\co\\cq"))
254 (should (equal (rx (category japanese-roman) (category thai)
255 (category vietnamese) (category hebrew)
256 (category cyrillic) (category can-break))
257 "\\cr\\ct\\cv\\cw\\cy\\c|"))
258 (should (equal (rx (category ?g) (not (category ?~)))
259 "\\cg\\C~")))
260
261(ert-deftest rx-not ()
262 (should (equal (rx (not word-boundary))
263 "\\B"))
264 (should (equal (rx (not ascii) (not lower-case) (not wordchar))
265 "[^[:ascii:]][^[:lower:]][^[:word:]]"))
266 (should (equal (rx (not (syntax punctuation)) (not (syntax escape)))
267 "\\S.\\S\\"))
268 (should (equal (rx (not (category tone-mark)) (not (category lao)))
269 "\\C4\\Co")))
270
271(ert-deftest rx-group ()
272 (should (equal (rx (group nonl) (submatch "x")
273 (group-n 3 "y") (submatch-n 13 "z") (backref 1))
274 "\\(.\\)\\(x\\)\\(?3:y\\)\\(?13:z\\)\\1"))
275 (should (equal (rx (group) (group-n 2))
276 "\\(\\)\\(?2:\\)")))
277
278(ert-deftest rx-regexp ()
279 (should (equal (rx (regexp "abc") (regex "[de]"))
280 "\\(?:abc\\)[de]"))
281 (let ((x "a*"))
282 (should (equal (rx (regexp x) "b")
283 "\\(?:a*\\)b"))
284 (should (equal (rx "" (regexp x) (eval ""))
285 "a*"))))
286
287(ert-deftest rx-eval ()
288 (should (equal (rx (eval (list 'syntax 'symbol)))
289 "\\s_"))
290 (should (equal (rx "a" (eval (concat)) "b")
291 "ab")))
292
293(ert-deftest rx-literal ()
294 (should (equal (rx (literal (char-to-string 42)) nonl)
295 "\\*."))
296 (let ((x "a+b"))
297 (should (equal (rx (opt (literal (upcase x))))
298 "\\(?:A\\+B\\)?"))))
299
300(ert-deftest rx-to-string ()
301 (should (equal (rx-to-string '(or nonl "\nx"))
302 "\\(?:.\\|\nx\\)"))
303 (should (equal (rx-to-string '(or nonl "\nx") t)
304 ".\\|\nx")))
305
306
307(ert-deftest rx-constituents ()
308 (let ((rx-constituents
309 (append '((beta . gamma)
310 (gamma . "a*b")
311 (delta . ((lambda (form)
312 (regexp-quote (format "<%S>" form)))
313 1 nil symbolp))
314 (epsilon . delta))
315 rx-constituents)))
316 (should (equal (rx-to-string '(seq (+ beta) nonl gamma) t)
317 "\\(?:a*b\\)+.\\(?:a*b\\)"))
318 (should (equal (rx-to-string '(seq (delta a b c) (* (epsilon d e))) t)
319 "\\(?:<(delta a b c)>\\)\\(?:<(epsilon d e)>\\)*"))))
113 320
114(ert-deftest rx-seq ()
115 ;; Test zero-argument `seq'.
116 (should (equal (rx (seq)) "")))
117
118(defmacro rx-tests--match (regexp string &optional match)
119 (macroexp-let2 nil strexp string
120 `(ert-info ((format "Matching %S to %S" ',regexp ,strexp))
121 (should (string-match ,regexp ,strexp))
122 ,@(when match
123 `((should (equal (match-string 0 ,strexp) ,match)))))))
124
125(ert-deftest rx-nonstring-expr ()
126 (let ((bee "b")
127 (vowel "[aeiou]"))
128 (rx-tests--match (rx "a" (literal bee) "c") "abc")
129 (rx-tests--match (rx "a" (regexp bee) "c") "abc")
130 (rx-tests--match (rx "a" (or (regexp bee) "xy") "c") "abc")
131 (rx-tests--match (rx "a" (or "xy" (regexp bee)) "c") "abc")
132 (should-not (string-match (rx (or (regexp bee) "xy")) ""))
133 (rx-tests--match (rx "a" (= 3 (regexp bee)) "c") "abbbc")
134 (rx-tests--match (rx "x" (= 3 (regexp vowel)) "z") "xeoez")
135 (should-not (string-match (rx "x" (= 3 (regexp vowel)) "z") "xe[]z"))
136 (rx-tests--match (rx "x" (= 3 (literal vowel)) "z")
137 "x[aeiou][aeiou][aeiou]z")
138 (rx-tests--match (rx "x" (repeat 1 (regexp vowel)) "z") "xaz")
139 (rx-tests--match (rx "x" (repeat 1 2 (regexp vowel)) "z") "xaz")
140 (rx-tests--match (rx "x" (repeat 1 2 (regexp vowel)) "z") "xauz")
141 (rx-tests--match (rx "x" (>= 1 (regexp vowel)) "z") "xaiiz")
142 (rx-tests--match (rx "x" (** 1 2 (regexp vowel)) "z") "xaiz")
143 (rx-tests--match (rx "x" (group (regexp vowel)) "z") "xaz")
144 (rx-tests--match (rx "x" (group-n 1 (regexp vowel)) "z") "xaz")
145 (rx-tests--match (rx "x" (? (regexp vowel)) "z") "xz")))
146
147(ert-deftest rx-nonstring-expr-non-greedy ()
148 "`rx's greediness can't affect runtime regexp parts."
149 (let ((ad-min "[ad]*?")
150 (ad-max "[ad]*")
151 (ad "[ad]"))
152 (rx-tests--match (rx "c" (regexp ad-min) "a") "cdaaada" "cda")
153 (rx-tests--match (rx "c" (regexp ad-max) "a") "cdaaada" "cdaaada")
154 (rx-tests--match (rx "c" (minimal-match (regexp ad-max)) "a") "cdaaada" "cdaaada")
155 (rx-tests--match (rx "c" (maximal-match (regexp ad-min)) "a") "cdaaada" "cda")
156 (rx-tests--match (rx "c" (minimal-match (0+ (regexp ad))) "a") "cdaaada" "cda")
157 (rx-tests--match (rx "c" (maximal-match (0+ (regexp ad))) "a") "cdaaada" "cdaaada")))
158
159(ert-deftest rx-to-string-lisp-forms ()
160 (rx-tests--match (rx-to-string '(seq "a" (literal "b") "c")) "abc")
161 (rx-tests--match (rx-to-string '(seq "a" (regexp "b") "c")) "abc"))
162 321
163(provide 'rx-tests) 322(provide 'rx-tests)
164;; rx-tests.el ends here.