diff options
| author | Mattias EngdegÄrd | 2019-09-25 14:29:50 -0700 |
|---|---|---|
| committer | Paul Eggert | 2019-09-25 14:29:50 -0700 |
| commit | 2ed71227c626c6cfdc684948644ccf3d9eaeb15b (patch) | |
| tree | 2a4043ce8036206c7138b9bf5b149da8c66ec811 | |
| parent | a773a6474897356cd78aeea092d2c1a51ede23f9 (diff) | |
| download | emacs-2ed71227c626c6cfdc684948644ccf3d9eaeb15b.tar.gz emacs-2ed71227c626c6cfdc684948644ccf3d9eaeb15b.zip | |
New rx implementation
* lisp/emacs-lisp/rx.el:
* test/lisp/emacs-lisp/rx-tests.el:
* doc/lispref/searching.texi (Rx Constructs):
Rewrite rx for correctness, clarity, and performance. The new
implementation retains full compatibility and has more comprehensive
tests.
* lisp/emacs-lisp/re-builder.el (reb-rx-font-lock-keywords):
Adapt to changes in internal variables in rx.el.
| -rw-r--r-- | doc/lispref/searching.texi | 4 | ||||
| -rw-r--r-- | lisp/emacs-lisp/re-builder.el | 9 | ||||
| -rw-r--r-- | lisp/emacs-lisp/rx.el | 1809 | ||||
| -rw-r--r-- | test/lisp/emacs-lisp/rx-tests.el | 336 |
4 files changed, 1091 insertions, 1067 deletions
diff --git a/doc/lispref/searching.texi b/doc/lispref/searching.texi index 65f56b490fd..2d94e5659de 100644 --- a/doc/lispref/searching.texi +++ b/doc/lispref/searching.texi | |||
| @@ -1044,11 +1044,9 @@ customisation. | |||
| 1044 | 1044 | ||
| 1045 | The various forms in @code{rx} regexps are described below. The | 1045 | The various forms in @code{rx} regexps are described below. The |
| 1046 | shorthand @var{rx} represents any @code{rx} form, and @var{rx}@dots{} | 1046 | shorthand @var{rx} represents any @code{rx} form, and @var{rx}@dots{} |
| 1047 | means one or more @code{rx} forms. Where the corresponding string | 1047 | means zero or more @code{rx} forms. Where the corresponding string |
| 1048 | regexp syntax is given, @var{A}, @var{B}, @dots{} are string regexp | 1048 | regexp syntax is given, @var{A}, @var{B}, @dots{} are string regexp |
| 1049 | subexpressions. | 1049 | subexpressions. |
| 1050 | @c With the new implementation of rx, this can be changed from | ||
| 1051 | @c 'one or more' to 'zero or more'. | ||
| 1052 | 1050 | ||
| 1053 | @subsubheading Literals | 1051 | @subsubheading Literals |
| 1054 | 1052 | ||
diff --git a/lisp/emacs-lisp/re-builder.el b/lisp/emacs-lisp/re-builder.el index 961d26a7212..1054f1453bc 100644 --- a/lisp/emacs-lisp/re-builder.el +++ b/lisp/emacs-lisp/re-builder.el | |||
| @@ -816,13 +816,12 @@ If SUBEXP is non-nil mark only the corresponding sub-expressions." | |||
| 816 | 816 | ||
| 817 | (require 'rx) | 817 | (require 'rx) |
| 818 | (defconst reb-rx-font-lock-keywords | 818 | (defconst reb-rx-font-lock-keywords |
| 819 | (let ((constituents (mapcar (lambda (rec) | 819 | (let ((constituents (mapcar #'symbol-name rx--builtin-forms)) |
| 820 | (symbol-name (car rec))) | 820 | (syntax (mapcar (lambda (rec) (symbol-name (car rec))) |
| 821 | rx-constituents)) | 821 | rx--syntax-codes)) |
| 822 | (syntax (mapcar (lambda (rec) (symbol-name (car rec))) rx-syntax)) | ||
| 823 | (categories (mapcar (lambda (rec) | 822 | (categories (mapcar (lambda (rec) |
| 824 | (symbol-name (car rec))) | 823 | (symbol-name (car rec))) |
| 825 | rx-categories))) | 824 | rx--categories))) |
| 826 | `( | 825 | `( |
| 827 | (,(concat "(" (regexp-opt (list "rx-to-string") t) "[[:space:]]") | 826 | (,(concat "(" (regexp-opt (list "rx-to-string") t) "[[:space:]]") |
| 828 | (1 font-lock-function-name-face)) | 827 | (1 font-lock-function-name-face)) |
diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el index 249529e54e3..9b3419e1c88 100644 --- a/lisp/emacs-lisp/rx.el +++ b/lisp/emacs-lisp/rx.el | |||
| @@ -1,11 +1,7 @@ | |||
| 1 | ;;; rx.el --- sexp notation for regular expressions -*- lexical-binding: t -*- | 1 | ;;; rx.el --- S-exp notation for regexps --*- lexical-binding: t -*- |
| 2 | 2 | ||
| 3 | ;; Copyright (C) 2001-2019 Free Software Foundation, Inc. | 3 | ;; Copyright (C) 2001-2019 Free Software Foundation, Inc. |
| 4 | 4 | ||
| 5 | ;; Author: Gerd Moellmann <gerd@gnu.org> | ||
| 6 | ;; Maintainer: emacs-devel@gnu.org | ||
| 7 | ;; Keywords: strings, regexps, extensions | ||
| 8 | |||
| 9 | ;; This file is part of GNU Emacs. | 5 | ;; This file is part of GNU Emacs. |
| 10 | 6 | ||
| 11 | ;; GNU Emacs is free software: you can redistribute it and/or modify | 7 | ;; GNU Emacs is free software: you can redistribute it and/or modify |
| @@ -23,937 +19,824 @@ | |||
| 23 | 19 | ||
| 24 | ;;; Commentary: | 20 | ;;; Commentary: |
| 25 | 21 | ||
| 26 | ;; This is another implementation of sexp-form regular expressions. | 22 | ;; This facility allows writing regexps in a sexp-based language |
| 27 | ;; It was unfortunately written without being aware of the Sregex | 23 | ;; instead of strings. Regexps in the `rx' notation are easier to |
| 28 | ;; package coming with Emacs, but as things stand, Rx completely | 24 | ;; read, write and maintain; they can be indented and commented in a |
| 29 | ;; covers all regexp features, which Sregex doesn't, doesn't suffer | 25 | ;; natural way, and are easily composed by program code. |
| 30 | ;; from the bugs mentioned in the commentary section of Sregex, and | 26 | ;; The translation to string regexp is done by a macro and does not |
| 31 | ;; uses a nicer syntax (IMHO, of course :-). | 27 | ;; incur any extra processing during run time. Example: |
| 32 | |||
| 33 | ;; This significantly extended version of the original, is almost | ||
| 34 | ;; compatible with Sregex. The only incompatibility I (fx) know of is | ||
| 35 | ;; that the `repeat' form can't have multiple regexp args. | ||
| 36 | |||
| 37 | ;; Now alternative forms are provided for a degree of compatibility | ||
| 38 | ;; with Olin Shivers' attempted definitive SRE notation. SRE forms | ||
| 39 | ;; not catered for include: dsm, uncase, w/case, w/nocase, ,@<exp>, | ||
| 40 | ;; ,<exp>, (word ...), word+, posix-string, and character class forms. | ||
| 41 | ;; Some forms are inconsistent with SRE, either for historical reasons | ||
| 42 | ;; or because of the implementation -- simple translation into Emacs | ||
| 43 | ;; regexp strings. These include: any, word. Also, case-sensitivity | ||
| 44 | ;; and greediness are controlled by variables external to the regexp, | ||
| 45 | ;; and you need to feed the forms to the `posix-' functions to get | ||
| 46 | ;; SRE's POSIX semantics. There are probably more difficulties. | ||
| 47 | |||
| 48 | ;; Rx translates a sexp notation for regular expressions into the | ||
| 49 | ;; usual string notation. The translation can be done at compile-time | ||
| 50 | ;; by using the `rx' macro. The `regexp' and `literal' forms accept | ||
| 51 | ;; non-constant expressions, in which case `rx' will translate to a | ||
| 52 | ;; `concat' expression. Translation can be done fully at run time by | ||
| 53 | ;; calling function `rx-to-string'. See the documentation of `rx' for | ||
| 54 | ;; a complete description of the sexp notation. | ||
| 55 | ;; | ||
| 56 | ;; Some examples of string regexps and their sexp counterparts: | ||
| 57 | ;; | ||
| 58 | ;; "^[a-z]*" | ||
| 59 | ;; (rx line-start (0+ (in "a-z"))) | ||
| 60 | ;; | ||
| 61 | ;; "\n[^ \t]" | ||
| 62 | ;; (rx ?\n (not (in " \t"))) | ||
| 63 | ;; | ||
| 64 | ;; "\\*\\*\\* EOOH \\*\\*\\*\n" | ||
| 65 | ;; (rx "*** EOOH ***\n") | ||
| 66 | ;; | ||
| 67 | ;; "\\<\\(catch\\|finally\\)\\>[^_]" | ||
| 68 | ;; (rx word-start (submatch (or "catch" "finally")) word-end | ||
| 69 | ;; (not (in ?_))) | ||
| 70 | ;; | ||
| 71 | ;; "[ \t\n]*:\\($\\|[^:]+\\)" | ||
| 72 | ;; (rx (* (in " \t\n")) ":" | ||
| 73 | ;; (submatch (or line-end (+ (not (in ?:)))))) | ||
| 74 | ;; | ||
| 75 | ;; "^content-transfer-encoding:\\(?:\n?[\t ]\\)*quoted-printable\\(?:\n?[\t ]\\)*" | ||
| 76 | ;; (rx line-start | ||
| 77 | ;; "content-transfer-encoding:" | ||
| 78 | ;; (* (? ?\n) (in " \t")) | ||
| 79 | ;; "quoted-printable" | ||
| 80 | ;; (* (? ?\n) (in " \t"))) | ||
| 81 | ;; | ||
| 82 | ;; (concat "^\\(?:" something-else "\\)") | ||
| 83 | ;; (rx line-start (regexp something-else)) | ||
| 84 | ;; | ||
| 85 | ;; (regexp-opt '(STRING1 STRING2 ...)) | ||
| 86 | ;; (rx (or STRING1 STRING2 ...)), or in other words, `or' automatically | ||
| 87 | ;; calls `regexp-opt' as needed. | ||
| 88 | ;; | ||
| 89 | ;; "^;;\\s-*\n\\|^\n" | ||
| 90 | ;; (rx (or (seq line-start ";;" (0+ space) ?\n) | ||
| 91 | ;; (seq line-start ?\n))) | ||
| 92 | ;; | 28 | ;; |
| 93 | ;; "\\$[I]d: [^ ]+ \\([^ ]+\\) " | 29 | ;; (rx bos (or (not (any "^")) |
| 94 | ;; (rx "$Id: " | 30 | ;; (seq "^" (or " *" "[")))) |
| 95 | ;; (1+ (not (in " "))) | ||
| 96 | ;; " " | ||
| 97 | ;; (submatch (1+ (not (in " ")))) | ||
| 98 | ;; " ") | ||
| 99 | ;; | 31 | ;; |
| 100 | ;; "\\\\\\\\\\[\\w+" | 32 | ;; => "\\`\\(?:[^^]\\|\\^\\(?: \\*\\|\\[\\)\\)" |
| 101 | ;; (rx "\\\\[" (1+ word)) | ||
| 102 | ;; | ||
| 103 | ;; etc. | ||
| 104 | |||
| 105 | ;;; History: | ||
| 106 | ;; | 33 | ;; |
| 34 | ;; The notation is much influenced by and retains some compatibility with | ||
| 35 | ;; Olin Shivers's SRE, with concessions to Emacs regexp peculiarities, | ||
| 36 | ;; and the older Emacs package Sregex. | ||
| 107 | 37 | ||
| 108 | ;;; Code: | 38 | ;;; Code: |
| 109 | 39 | ||
| 110 | (require 'cl-lib) | 40 | ;; The `rx--translate...' functions below return (REGEXP . PRECEDENCE), |
| 111 | (require 'cl-extra) | 41 | ;; where REGEXP is a list of string expressions that will be |
| 112 | 42 | ;; concatenated into a regexp, and PRECEDENCE is one of | |
| 113 | ;; FIXME: support macros. | 43 | ;; |
| 114 | 44 | ;; t -- can be used as argument to postfix operators (eg. "a") | |
| 115 | (defvar rx-constituents ;Not `const' because some modes extend it. | 45 | ;; seq -- can be concatenated in sequence with other seq or higher (eg. "ab") |
| 116 | '((and . (rx-and 0 nil)) | 46 | ;; lseq -- can be concatenated to the left of rseq or higher (eg. "^a") |
| 117 | (seq . and) ; SRE | 47 | ;; rseq -- can be concatenated to the right of lseq or higher (eg. "a$") |
| 118 | (: . and) ; SRE | 48 | ;; nil -- can only be used in alternatives (eg. "a\\|b") |
| 119 | (sequence . and) ; sregex | 49 | ;; |
| 120 | (or . (rx-or 0 nil)) | 50 | ;; They form a lattice: |
| 121 | (| . or) ; SRE | 51 | ;; |
| 122 | (not-newline . ".") | 52 | ;; t highest precedence |
| 123 | (nonl . not-newline) ; SRE | 53 | ;; | |
| 124 | (anything . (rx-anything 0 nil)) | 54 | ;; seq |
| 125 | (any . (rx-any 1 nil rx-check-any)) ; inconsistent with SRE | 55 | ;; / \ |
| 126 | (any . ".") ; sregex | 56 | ;; lseq rseq |
| 127 | (in . any) | 57 | ;; \ / |
| 128 | (char . any) ; sregex | 58 | ;; nil lowest precedence |
| 129 | (not-char . (rx-not-char 1 nil rx-check-any)) ; sregex | 59 | |
| 130 | (not . (rx-not 1 1 rx-check-not)) | 60 | |
| 131 | (repeat . (rx-repeat 2 nil)) | 61 | (defconst rx--char-classes |
| 132 | (= . (rx-= 2 nil)) ; SRE | 62 | '((digit . digit) |
| 133 | (>= . (rx->= 2 nil)) ; SRE | 63 | (numeric . digit) |
| 134 | (** . (rx-** 2 nil)) ; SRE | 64 | (num . digit) |
| 135 | (submatch . (rx-submatch 1 nil)) ; SRE | 65 | (control . cntrl) |
| 136 | (group . submatch) ; sregex | 66 | (cntrl . cntrl) |
| 137 | (submatch-n . (rx-submatch-n 2 nil)) | 67 | (hex-digit . xdigit) |
| 138 | (group-n . submatch-n) | 68 | (hex . xdigit) |
| 139 | (zero-or-more . (rx-kleene 1 nil)) | 69 | (xdigit . xdigit) |
| 140 | (one-or-more . (rx-kleene 1 nil)) | 70 | (blank . blank) |
| 141 | (zero-or-one . (rx-kleene 1 nil)) | 71 | (graphic . graph) |
| 142 | (\? . zero-or-one) ; SRE | 72 | (graph . graph) |
| 143 | (\?? . zero-or-one) | 73 | (printing . print) |
| 144 | (* . zero-or-more) ; SRE | 74 | (print . print) |
| 145 | (*? . zero-or-more) | 75 | (alphanumeric . alnum) |
| 146 | (0+ . zero-or-more) | 76 | (alnum . alnum) |
| 147 | (+ . one-or-more) ; SRE | 77 | (letter . alpha) |
| 148 | (+? . one-or-more) | 78 | (alphabetic . alpha) |
| 149 | (1+ . one-or-more) | 79 | (alpha . alpha) |
| 150 | (optional . zero-or-one) | 80 | (ascii . ascii) |
| 151 | (opt . zero-or-one) ; sregex | 81 | (nonascii . nonascii) |
| 152 | (minimal-match . (rx-greedy 1 1)) | 82 | (lower . lower) |
| 153 | (maximal-match . (rx-greedy 1 1)) | 83 | (lower-case . lower) |
| 154 | (backref . (rx-backref 1 1 rx-check-backref)) | 84 | (punctuation . punct) |
| 155 | (line-start . "^") | 85 | (punct . punct) |
| 156 | (bol . line-start) ; SRE | 86 | (space . space) |
| 157 | (line-end . "$") | 87 | (whitespace . space) |
| 158 | (eol . line-end) ; SRE | 88 | (white . space) |
| 159 | (string-start . "\\`") | 89 | (upper . upper) |
| 160 | (bos . string-start) ; SRE | 90 | (upper-case . upper) |
| 161 | (bot . string-start) ; sregex | 91 | (word . word) |
| 162 | (string-end . "\\'") | 92 | (wordchar . word) |
| 163 | (eos . string-end) ; SRE | 93 | (unibyte . unibyte) |
| 164 | (eot . string-end) ; sregex | 94 | (multibyte . multibyte)) |
| 165 | (buffer-start . "\\`") | 95 | "Alist mapping rx symbols to character classes. |
| 166 | (buffer-end . "\\'") | 96 | Most of the names are from SRE.") |
| 167 | (point . "\\=") | 97 | |
| 168 | (word-start . "\\<") | 98 | (defvar rx-constituents nil |
| 169 | (bow . word-start) ; SRE | 99 | "Alist of old-style rx extensions, for compatibility. |
| 170 | (word-end . "\\>") | 100 | |
| 171 | (eow . word-end) ; SRE | 101 | Each element is (SYMBOL . DEF). |
| 172 | (word-boundary . "\\b") | 102 | |
| 173 | (not-word-boundary . "\\B") ; sregex | 103 | If DEF is a symbol, then SYMBOL is an alias of DEF. |
| 174 | (symbol-start . "\\_<") | 104 | |
| 175 | (symbol-end . "\\_>") | 105 | If DEF is a string, then SYMBOL is a plain rx symbol defined as the |
| 176 | (syntax . (rx-syntax 1 1)) | 106 | regexp string DEF. |
| 177 | (not-syntax . (rx-not-syntax 1 1)) ; sregex | 107 | |
| 178 | (category . (rx-category 1 1 rx-check-category)) | 108 | If DEF is a list on the form (FUN MIN-ARGS MAX-ARGS PRED), then |
| 179 | (eval . (rx-eval 1 1)) | 109 | SYMBOL is an rx form with at least MIN-ARGS and at most |
| 180 | (literal . (rx-literal 1 1 stringp)) | 110 | MAX-ARGS arguments. If MAX-ARGS is nil, then there is no upper limit. |
| 181 | (regexp . (rx-regexp 1 1 stringp)) | 111 | FUN is a function taking the entire rx form as single argument |
| 182 | (regex . regexp) ; sregex | 112 | and returning the translated regexp string. |
| 183 | (digit . "[[:digit:]]") | 113 | If PRED is non-nil, it is a predicate that all actual arguments must |
| 184 | (numeric . digit) ; SRE | 114 | satisfy.") |
| 185 | (num . digit) ; SRE | 115 | |
| 186 | (control . "[[:cntrl:]]") ; SRE | 116 | ;; TODO: Additions to consider: |
| 187 | (cntrl . control) ; SRE | 117 | ;; - A better name for `anything', like `any-char' or `anychar'. |
| 188 | (hex-digit . "[[:xdigit:]]") ; SRE | 118 | ;; - A name for (or), maybe `unmatchable'. |
| 189 | (hex . hex-digit) ; SRE | 119 | ;; - A construct like `or' but without the match order guarantee, |
| 190 | (xdigit . hex-digit) ; SRE | 120 | ;; maybe `unordered-or'. Useful for composition or generation of |
| 191 | (blank . "[[:blank:]]") ; SRE | 121 | ;; alternatives; permits more effective use of regexp-opt. |
| 192 | (graphic . "[[:graph:]]") ; SRE | 122 | |
| 193 | (graph . graphic) ; SRE | 123 | (defun rx--translate-symbol (sym) |
| 194 | (printing . "[[:print:]]") ; SRE | 124 | "Translate an rx symbol. Return (REGEXP . PRECEDENCE)." |
| 195 | (print . printing) ; SRE | 125 | (pcase sym |
| 196 | (alphanumeric . "[[:alnum:]]") ; SRE | 126 | ;; Use `list' instead of a quoted list to wrap the strings here, |
| 197 | (alnum . alphanumeric) ; SRE | 127 | ;; since the return value may be mutated. |
| 198 | (letter . "[[:alpha:]]") | 128 | ((or 'nonl 'not-newline 'any) (cons (list ".") t)) |
| 199 | (alphabetic . letter) ; SRE | 129 | ('anything (rx--translate-form '(or nonl "\n"))) |
| 200 | (alpha . letter) ; SRE | 130 | ((or 'bol 'line-start) (cons (list "^") 'lseq)) |
| 201 | (ascii . "[[:ascii:]]") ; SRE | 131 | ((or 'eol 'line-end) (cons (list "$") 'rseq)) |
| 202 | (nonascii . "[[:nonascii:]]") | 132 | ((or 'bos 'string-start 'bot 'buffer-start) (cons (list "\\`") t)) |
| 203 | (lower . "[[:lower:]]") ; SRE | 133 | ((or 'eos 'string-end 'eot 'buffer-end) (cons (list "\\'") t)) |
| 204 | (lower-case . lower) ; SRE | 134 | ('point (cons (list "\\=") t)) |
| 205 | (punctuation . "[[:punct:]]") ; SRE | 135 | ((or 'bow 'word-start) (cons (list "\\<") t)) |
| 206 | (punct . punctuation) ; SRE | 136 | ((or 'eow 'word-end) (cons (list "\\>") t)) |
| 207 | (space . "[[:space:]]") ; SRE | 137 | ('word-boundary (cons (list "\\b") t)) |
| 208 | (whitespace . space) ; SRE | 138 | ('not-word-boundary (cons (list "\\B") t)) |
| 209 | (white . space) ; SRE | 139 | ('symbol-start (cons (list "\\_<") t)) |
| 210 | (upper . "[[:upper:]]") ; SRE | 140 | ('symbol-end (cons (list "\\_>") t)) |
| 211 | (upper-case . upper) ; SRE | 141 | ('not-wordchar (cons (list "\\W") t)) |
| 212 | (word . "[[:word:]]") ; inconsistent with SRE | 142 | (_ |
| 213 | (wordchar . word) ; sregex | 143 | (cond |
| 214 | (not-wordchar . "\\W")) | 144 | ((let ((class (cdr (assq sym rx--char-classes)))) |
| 215 | "Alist of sexp form regexp constituents. | 145 | (and class (cons (list (concat "[[:" (symbol-name class) ":]]")) t)))) |
| 216 | Each element of the alist has the form (SYMBOL . DEFN). | 146 | |
| 217 | SYMBOL is a valid constituent of sexp regular expressions. | 147 | ;; For compatibility with old rx. |
| 218 | If DEFN is a string, SYMBOL is translated into DEFN. | 148 | ((let ((entry (assq sym rx-constituents))) |
| 219 | If DEFN is a symbol, use the definition of DEFN, recursively. | 149 | (and (progn |
| 220 | Otherwise, DEFN must be a list (FUNCTION MIN-ARGS MAX-ARGS PREDICATE). | 150 | (while (and entry (not (stringp (cdr entry)))) |
| 221 | FUNCTION is used to produce code for SYMBOL. MIN-ARGS and MAX-ARGS | 151 | (setq entry |
| 222 | are the minimum and maximum number of arguments the function-form | 152 | (if (symbolp (cdr entry)) |
| 223 | sexp constituent SYMBOL may have in sexp regular expressions. | 153 | ;; Alias for another entry. |
| 224 | MAX-ARGS nil means no limit. PREDICATE, if specified, means that | 154 | (assq (cdr entry) rx-constituents) |
| 225 | all arguments must satisfy PREDICATE.") | 155 | ;; Wrong type, try further down the list. |
| 226 | 156 | (assq (car entry) | |
| 227 | 157 | (cdr (memq entry rx-constituents)))))) | |
| 228 | (defconst rx-syntax | 158 | entry) |
| 229 | '((whitespace . ?-) | 159 | (cons (list (cdr entry)) nil)))) |
| 230 | (punctuation . ?.) | 160 | (t (error "Unknown rx symbol `%s'" sym)))))) |
| 231 | (word . ?w) | 161 | |
| 232 | (symbol . ?_) | 162 | (defun rx--enclose (left-str rexp right-str) |
| 233 | (open-parenthesis . ?\() | 163 | "Bracket REXP by LEFT-STR and RIGHT-STR." |
| 234 | (close-parenthesis . ?\)) | 164 | (append (list left-str) rexp (list right-str))) |
| 235 | (expression-prefix . ?\') | 165 | |
| 236 | (string-quote . ?\") | 166 | (defun rx--bracket (rexp) |
| 237 | (paired-delimiter . ?$) | 167 | (rx--enclose "\\(?:" rexp "\\)")) |
| 238 | (escape . ?\\) | 168 | |
| 239 | (character-quote . ?/) | 169 | (defun rx--sequence (left right) |
| 240 | (comment-start . ?<) | 170 | "Return the sequence (concatenation) of two translated items, |
| 241 | (comment-end . ?>) | 171 | each on the form (REGEXP . PRECEDENCE), returning (REGEXP . PRECEDENCE)." |
| 242 | (string-delimiter . ?|) | 172 | ;; Concatenation rules: |
| 243 | (comment-delimiter . ?!)) | 173 | ;; seq ++ seq -> seq |
| 244 | "Alist mapping Rx syntax symbols to syntax characters. | 174 | ;; lseq ++ seq -> lseq |
| 245 | Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid | 175 | ;; seq ++ rseq -> rseq |
| 246 | symbol in `(syntax SYMBOL)', and CHAR is the syntax character | 176 | ;; lseq ++ rseq -> nil |
| 247 | corresponding to SYMBOL, as it would be used with \\s or \\S in | 177 | (cond ((not (car left)) right) |
| 248 | regular expressions.") | 178 | ((not (car right)) left) |
| 249 | 179 | (t | |
| 250 | 180 | (let ((l (if (memq (cdr left) '(nil rseq)) | |
| 251 | (defconst rx-categories | 181 | (cons (rx--bracket (car left)) t) |
| 252 | '((space-for-indent . ?\s) | 182 | left)) |
| 253 | (base . ?.) | 183 | (r (if (memq (cdr right) '(nil lseq)) |
| 254 | (consonant . ?0) | 184 | (cons (rx--bracket (car right)) t) |
| 255 | (base-vowel . ?1) | 185 | right))) |
| 256 | (upper-diacritical-mark . ?2) | 186 | (cons (append (car l) (car r)) |
| 257 | (lower-diacritical-mark . ?3) | 187 | (if (eq (cdr l) 'lseq) |
| 258 | (tone-mark . ?4) | 188 | (if (eq (cdr r) 'rseq) |
| 259 | (symbol . ?5) | 189 | nil ; lseq ++ rseq |
| 260 | (digit . ?6) | 190 | 'lseq) ; lseq ++ seq |
| 261 | (vowel-modifying-diacritical-mark . ?7) | 191 | (if (eq (cdr r) 'rseq) |
| 262 | (vowel-sign . ?8) | 192 | 'rseq ; seq ++ rseq |
| 263 | (semivowel-lower . ?9) | 193 | 'seq))))))) ; seq ++ seq |
| 264 | (not-at-end-of-line . ?<) | 194 | |
| 265 | (not-at-beginning-of-line . ?>) | 195 | (defun rx--translate-seq (body) |
| 266 | (alpha-numeric-two-byte . ?A) | 196 | "Translate a sequence of one or more rx items. Return (REGEXP . PRECEDENCE)." |
| 267 | (chinese-two-byte . ?C) | 197 | (if body |
| 268 | (chinse-two-byte . ?C) ;; A typo in Emacs 21.1-24.3. | 198 | (let* ((items (mapcar #'rx--translate body)) |
| 269 | (greek-two-byte . ?G) | 199 | (result (car items))) |
| 270 | (japanese-hiragana-two-byte . ?H) | 200 | (dolist (item (cdr items)) |
| 271 | (indian-two-byte . ?I) | 201 | (setq result (rx--sequence result item))) |
| 272 | (japanese-katakana-two-byte . ?K) | 202 | result) |
| 273 | (strong-left-to-right . ?L) | 203 | (cons nil 'seq))) |
| 274 | (korean-hangul-two-byte . ?N) | 204 | |
| 275 | (strong-right-to-left . ?R) | 205 | (defun rx--empty () |
| 276 | (cyrillic-two-byte . ?Y) | 206 | "Regexp that never matches anything." |
| 277 | (combining-diacritic . ?^) | 207 | (cons (list regexp-unmatchable) 'seq)) |
| 278 | (ascii . ?a) | 208 | |
| 279 | (arabic . ?b) | 209 | ;; `cl-every' replacement to avoid bootstrapping problems. |
| 280 | (chinese . ?c) | 210 | (defun rx--every (pred list) |
| 281 | (ethiopic . ?e) | 211 | "Whether PRED is true for every element of LIST." |
| 282 | (greek . ?g) | 212 | (while (and list (funcall pred (car list))) |
| 283 | (korean . ?h) | 213 | (setq list (cdr list))) |
| 284 | (indian . ?i) | 214 | (null list)) |
| 285 | (japanese . ?j) | 215 | |
| 286 | (japanese-katakana . ?k) | 216 | (defun rx--translate-or (body) |
| 287 | (latin . ?l) | 217 | "Translate an or-pattern of one of more rx items. |
| 288 | (lao . ?o) | 218 | Return (REGEXP . PRECEDENCE)." |
| 289 | (tibetan . ?q) | 219 | ;; FIXME: Possible improvements: |
| 290 | (japanese-roman . ?r) | 220 | ;; |
| 291 | (thai . ?t) | 221 | ;; - Turn single characters to strings: (or ?a ?b) -> (or "a" "b"), |
| 292 | (vietnamese . ?v) | 222 | ;; so that they can be candidates for regexp-opt. |
| 293 | (hebrew . ?w) | 223 | ;; |
| 294 | (cyrillic . ?y) | 224 | ;; - Translate compile-time strings (`eval' forms), again for regexp-opt. |
| 295 | (can-break . ?|)) | 225 | ;; |
| 296 | "Alist mapping symbols to category characters. | 226 | ;; - Flatten sub-patterns first: (or (or A B) (or C D)) -> (or A B C D) |
| 297 | Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid | 227 | ;; in order to improve effectiveness of regexp-opt. |
| 298 | symbol in `(category SYMBOL)', and CHAR is the category character | 228 | ;; This would also help composability. |
| 299 | corresponding to SYMBOL, as it would be used with `\\c' or `\\C' in | 229 | ;; |
| 300 | regular expression strings.") | 230 | ;; - Use associativity to run regexp-opt on contiguous subsets of arguments |
| 301 | 231 | ;; if not all of them are strings. Example: | |
| 302 | 232 | ;; (or (+ digit) "CHARLIE" "CHAN" (+ blank)) | |
| 303 | (defvar rx-greedy-flag t | 233 | ;; -> (or (+ digit) (or "CHARLIE" "CHAN") (+ blank)) |
| 304 | "Non-nil means produce greedy regular expressions for `zero-or-one', | 234 | ;; |
| 305 | `zero-or-more', and `one-or-more'. Dynamically bound.") | 235 | ;; - Fuse patterns into a single character alternative if they fit. |
| 306 | 236 | ;; regexp-opt will do that if all are strings, but we want to do that for: | |
| 307 | (defvar rx--compile-to-lisp nil | 237 | ;; * symbols that expand to classes: space, alpha, ... |
| 308 | "Nil means return a regexp as a string. | 238 | ;; * character alternatives: (any ...) |
| 309 | Non-nil means we may return a lisp form which produces a | 239 | ;; * (syntax S), for some S (whitespace, word) |
| 310 | string (used for `rx' macro).") | 240 | ;; so that (or "@" "%" digit (any "A-Z" space) (syntax word)) |
| 311 | 241 | ;; -> (any "@" "%" digit "A-Z" space word) | |
| 312 | (defun rx-info (op head) | 242 | ;; -> "[A-Z@%[:digit:][:space:][:word:]]" |
| 313 | "Return parsing/code generation info for OP. | 243 | ;; |
| 314 | If OP is the space character ASCII 32, return info for the symbol `?'. | 244 | ;; Problem: If a subpattern is carefully written to to be |
| 315 | If OP is the character `?', return info for the symbol `??'. | 245 | ;; optimisable by regexp-opt, how do we prevent the transforms |
| 316 | See also `rx-constituents'. | 246 | ;; above from destroying that property? |
| 317 | If HEAD is non-nil, then OP is the head of a sexp, otherwise it's | 247 | ;; Example: (or "a" (or "abc" "abd" "abe")) |
| 318 | a standalone symbol." | ||
| 319 | (cond ((eq op ? ) (setq op '\?)) | ||
| 320 | ((eq op ??) (setq op '\??))) | ||
| 321 | (let (old-op) | ||
| 322 | (while (and (not (null op)) (symbolp op)) | ||
| 323 | (setq old-op op) | ||
| 324 | (setq op (cdr (assq op rx-constituents))) | ||
| 325 | (when (if head (stringp op) (consp op)) | ||
| 326 | ;; We found something but of the wrong kind. Let's look for an | ||
| 327 | ;; alternate definition for the other case. | ||
| 328 | (let ((new-op | ||
| 329 | (cdr (assq old-op (cdr (memq (assq old-op rx-constituents) | ||
| 330 | rx-constituents)))))) | ||
| 331 | (if (and new-op (not (if head (stringp new-op) (consp new-op)))) | ||
| 332 | (setq op new-op)))))) | ||
| 333 | op) | ||
| 334 | |||
| 335 | |||
| 336 | (defun rx-check (form) | ||
| 337 | "Check FORM according to its car's parsing info." | ||
| 338 | (unless (listp form) | ||
| 339 | (error "rx `%s' needs argument(s)" form)) | ||
| 340 | (let* ((rx (rx-info (car form) 'head)) | ||
| 341 | (nargs (1- (length form))) | ||
| 342 | (min-args (nth 1 rx)) | ||
| 343 | (max-args (nth 2 rx)) | ||
| 344 | (type-pred (nth 3 rx))) | ||
| 345 | (when (and (not (null min-args)) | ||
| 346 | (< nargs min-args)) | ||
| 347 | (error "rx form `%s' requires at least %d args" | ||
| 348 | (car form) min-args)) | ||
| 349 | (when (and (not (null max-args)) | ||
| 350 | (> nargs max-args)) | ||
| 351 | (error "rx form `%s' accepts at most %d args" | ||
| 352 | (car form) max-args)) | ||
| 353 | (when type-pred | ||
| 354 | (dolist (sub-form (cdr form)) | ||
| 355 | (unless (funcall type-pred sub-form) | ||
| 356 | (error "rx form `%s' requires args satisfying `%s'" | ||
| 357 | (car form) type-pred)))))) | ||
| 358 | |||
| 359 | |||
| 360 | (defun rx-group-if (regexp group) | ||
| 361 | "Put shy groups around REGEXP if seemingly necessary when GROUP | ||
| 362 | is non-nil." | ||
| 363 | (cond | 248 | (cond |
| 364 | ;; for some repetition | 249 | ((null body) ; No items: a never-matching regexp. |
| 365 | ((eq group '*) (if (rx-atomic-p regexp) (setq group nil))) | 250 | (rx--empty)) |
| 366 | ;; for concatenation | 251 | ((null (cdr body)) ; Single item. |
| 367 | ((eq group ':) | 252 | (rx--translate (car body))) |
| 368 | (if (rx-atomic-p | 253 | ((rx--every #'stringp body) ; All strings. |
| 369 | (if (and (stringp regexp) | 254 | (cons (list (regexp-opt body nil t)) |
| 370 | (string-match | 255 | t)) |
| 371 | "\\(?:[?*+]\\??\\|\\\\{[0-9]*,?[0-9]*\\\\}\\)\\'" regexp)) | 256 | (t |
| 372 | (substring regexp 0 (match-beginning 0)) | 257 | (cons (append (car (rx--translate (car body))) |
| 373 | regexp)) | 258 | (mapcan (lambda (item) |
| 374 | (setq group nil))) | 259 | (cons "\\|" (car (rx--translate item)))) |
| 375 | ;; for OR | 260 | (cdr body))) |
| 376 | ((eq group '|) (setq group nil)) | 261 | nil)))) |
| 377 | ;; do anyway | 262 | |
| 378 | ((eq group t)) | 263 | (defun rx--string-to-intervals (str) |
| 379 | ((rx-atomic-p regexp t) (setq group nil))) | 264 | "Decode STR as intervals: A-Z becomes (?A . ?Z), and the single |
| 380 | (cond ((and group (stringp regexp)) | 265 | character X becomes (?X . ?X). Return the intervals in a list." |
| 381 | (concat "\\(?:" regexp "\\)")) | 266 | ;; We could just do string-to-multibyte on the string and work with |
| 382 | (group `("\\(?:" ,@regexp "\\)")) | 267 | ;; that instead of this `decode-char' workaround. |
| 383 | (t regexp))) | ||
| 384 | |||
| 385 | |||
| 386 | (defvar rx-parent) | ||
| 387 | ;; dynamically bound in some functions. | ||
| 388 | |||
| 389 | |||
| 390 | (defun rx-and (form) | ||
| 391 | "Parse and produce code from FORM. | ||
| 392 | FORM is of the form `(and FORM1 ...)'." | ||
| 393 | (rx-check form) | ||
| 394 | (rx-group-if | ||
| 395 | (rx--subforms (cdr form) ':) | ||
| 396 | (and (memq rx-parent '(* t)) rx-parent))) | ||
| 397 | |||
| 398 | |||
| 399 | (defun rx-or (form) | ||
| 400 | "Parse and produce code from FORM, which is `(or FORM1 ...)'." | ||
| 401 | (rx-check form) | ||
| 402 | (rx-group-if | ||
| 403 | (cond | ||
| 404 | ((null (cdr form)) regexp-unmatchable) | ||
| 405 | ((cl-every #'stringp (cdr form)) | ||
| 406 | (regexp-opt (cdr form) nil t)) | ||
| 407 | (t (rx--subforms (cdr form) '| "\\|"))) | ||
| 408 | (and (memq rx-parent '(: * t)) rx-parent))) | ||
| 409 | |||
| 410 | |||
| 411 | (defun rx-anything (form) | ||
| 412 | "Match any character." | ||
| 413 | (if (consp form) | ||
| 414 | (error "rx `anything' syntax error: %s" form)) | ||
| 415 | (rx-or (list 'or 'not-newline ?\n))) | ||
| 416 | |||
| 417 | |||
| 418 | (defun rx-any-delete-from-range (char ranges) | ||
| 419 | "Delete by side effect character CHAR from RANGES. | ||
| 420 | Only both edges of each range is checked." | ||
| 421 | (let (m) | ||
| 422 | (cond | ||
| 423 | ((memq char ranges) (setq ranges (delq char ranges))) | ||
| 424 | ((setq m (assq char ranges)) | ||
| 425 | (if (eq (1+ char) (cdr m)) | ||
| 426 | (setcar (memq m ranges) (1+ char)) | ||
| 427 | (setcar m (1+ char)))) | ||
| 428 | ((setq m (rassq char ranges)) | ||
| 429 | (if (eq (1- char) (car m)) | ||
| 430 | (setcar (memq m ranges) (1- char)) | ||
| 431 | (setcdr m (1- char))))) | ||
| 432 | ranges)) | ||
| 433 | |||
| 434 | |||
| 435 | (defun rx-any-condense-range (args) | ||
| 436 | "Condense by side effect ARGS as range for Rx `any'." | ||
| 437 | (let (str | ||
| 438 | l) | ||
| 439 | ;; set STR list of all strings | ||
| 440 | ;; set L list of all ranges | ||
| 441 | (mapc (lambda (e) (cond ((stringp e) (push e str)) | ||
| 442 | ((numberp e) (push (cons e e) l)) | ||
| 443 | ;; Ranges between ASCII and raw bytes are split, | ||
| 444 | ;; to prevent accidental inclusion of Unicode | ||
| 445 | ;; characters later on. | ||
| 446 | ((and (<= (car e) #x7f) | ||
| 447 | (>= (cdr e) #x3fff80)) | ||
| 448 | (push (cons (car e) #x7f) l) | ||
| 449 | (push (cons #x3fff80 (cdr e)) l)) | ||
| 450 | (t (push e l)))) | ||
| 451 | args) | ||
| 452 | ;; condense overlapped ranges in L | ||
| 453 | (let ((tail (setq l (sort l #'car-less-than-car))) | ||
| 454 | d) | ||
| 455 | (while (setq d (cdr tail)) | ||
| 456 | (if (>= (cdar tail) (1- (caar d))) | ||
| 457 | (progn | ||
| 458 | (setcdr (car tail) (max (cdar tail) (cdar d))) | ||
| 459 | (setcdr tail (cdr d))) | ||
| 460 | (setq tail d)))) | ||
| 461 | ;; Separate small ranges to single number, and delete dups. | ||
| 462 | (nconc | ||
| 463 | (apply #'nconc | ||
| 464 | (mapcar (lambda (e) | ||
| 465 | (cond | ||
| 466 | ((= (car e) (cdr e)) (list (car e))) | ||
| 467 | ((= (1+ (car e)) (cdr e)) (list (car e) (cdr e))) | ||
| 468 | ((list e)))) | ||
| 469 | l)) | ||
| 470 | (delete-dups str)))) | ||
| 471 | |||
| 472 | |||
| 473 | (defun rx-check-any-string (str) | ||
| 474 | "Turn the `any' argument string STR into a list of characters. | ||
| 475 | The original order is not preserved. Ranges, \"A-Z\", become pairs, (?A . ?Z)." | ||
| 476 | (let ((decode-char | 268 | (let ((decode-char |
| 477 | ;; Make sure raw bytes are decoded as such, to avoid confusion with | ||
| 478 | ;; U+0080..U+00FF. | ||
| 479 | (if (multibyte-string-p str) | 269 | (if (multibyte-string-p str) |
| 480 | #'identity | 270 | #'identity |
| 481 | (lambda (c) (if (<= #x80 c #xff) | 271 | #'unibyte-char-to-multibyte)) |
| 482 | (+ c #x3fff00) | ||
| 483 | c)))) | ||
| 484 | (len (length str)) | 272 | (len (length str)) |
| 485 | (i 0) | 273 | (i 0) |
| 486 | (ret nil)) | 274 | (intervals nil)) |
| 487 | (if (= 0 len) | ||
| 488 | (error "String arg for Rx `any' must not be empty")) | ||
| 489 | (while (< i len) | 275 | (while (< i len) |
| 490 | (cond ((and (< i (- len 2)) | 276 | (cond ((and (< i (- len 2)) |
| 491 | (= (aref str (+ i 1)) ?-)) | 277 | (= (aref str (1+ i)) ?-)) |
| 492 | ;; Range. | 278 | ;; Range. |
| 493 | (let ((start (funcall decode-char (aref str i))) | 279 | (let ((start (funcall decode-char (aref str i))) |
| 494 | (end (funcall decode-char (aref str (+ i 2))))) | 280 | (end (funcall decode-char (aref str (+ i 2))))) |
| 495 | (cond ((< start end) (push (cons start end) ret)) | 281 | (cond ((and (<= start #x7f) (>= end #x3fff80)) |
| 496 | ((= start end) (push start ret)) | 282 | ;; Ranges between ASCII and raw bytes are split to |
| 283 | ;; avoid having them absorb Unicode characters | ||
| 284 | ;; caught in-between. | ||
| 285 | (push (cons start #x7f) intervals) | ||
| 286 | (push (cons #x3fff80 end) intervals)) | ||
| 287 | ((<= start end) | ||
| 288 | (push (cons start end) intervals)) | ||
| 497 | (t | 289 | (t |
| 498 | (error "Rx character range `%c-%c' is reversed" | 290 | (error "Invalid rx `any' range: %s" |
| 499 | start end))) | 291 | (substring str i 3)))) |
| 500 | (setq i (+ i 3)))) | 292 | (setq i (+ i 3)))) |
| 501 | (t | 293 | (t |
| 502 | ;; Single character. | 294 | ;; Single character. |
| 503 | (push (funcall decode-char (aref str i)) ret) | 295 | (let ((char (funcall decode-char (aref str i)))) |
| 296 | (push (cons char char) intervals)) | ||
| 504 | (setq i (+ i 1))))) | 297 | (setq i (+ i 1))))) |
| 505 | ret)) | 298 | intervals)) |
| 506 | 299 | ||
| 507 | 300 | (defun rx--condense-intervals (intervals) | |
| 508 | (defun rx-check-any (arg) | 301 | "Merge adjacent and overlapping intervals by mutation, preserving the order. |
| 509 | "Check arg ARG for Rx `any'." | 302 | INTERVALS is a list of (START . END) with START †END, sorted by START." |
| 510 | (cond | 303 | (let ((tail intervals) |
| 511 | ((integerp arg) (list arg)) | 304 | d) |
| 512 | ((symbolp arg) | 305 | (while (setq d (cdr tail)) |
| 513 | (let ((translation (condition-case nil | 306 | (if (>= (cdar tail) (1- (caar d))) |
| 514 | (rx-form arg) | 307 | (progn |
| 515 | (error nil)))) | 308 | (setcdr (car tail) (max (cdar tail) (cdar d))) |
| 516 | (if (or (null translation) | 309 | (setcdr tail (cdr d))) |
| 517 | (null (string-match "\\`\\[\\[:[-a-z]+:\\]\\]\\'" translation))) | 310 | (setq tail d))) |
| 518 | (error "Invalid char class `%s' in Rx `any'" arg)) | 311 | intervals)) |
| 519 | (list (substring translation 1 -1)))) ; strip outer brackets | 312 | |
| 520 | ((and (characterp (car-safe arg)) (characterp (cdr-safe arg))) | 313 | (defun rx--translate-any (negated body) |
| 521 | (unless (<= (car arg) (cdr arg)) | 314 | "Translate an (any ...) construct. Return (REGEXP . PRECEDENCE). |
| 522 | (error "Rx character range `%c-%c' is reversed" | 315 | If NEGATED, negate the sense." |
| 523 | (car arg) (cdr arg))) | 316 | (let ((classes nil) |
| 524 | (list arg)) | 317 | (strings nil) |
| 525 | ((stringp arg) (rx-check-any-string arg)) | 318 | (conses nil)) |
| 526 | ((error | 319 | ;; Collect strings, conses and characters, and classes in separate bins. |
| 527 | "rx `any' requires string, character, char pair or char class args")))) | 320 | (dolist (arg body) |
| 528 | 321 | (cond ((stringp arg) | |
| 529 | 322 | (push arg strings)) | |
| 530 | (defun rx-any (form) | 323 | ((and (consp arg) |
| 531 | "Parse and produce code from FORM, which is `(any ARG ...)'. | 324 | (characterp (car arg)) |
| 532 | ARG is optional." | 325 | (characterp (cdr arg)) |
| 533 | (rx-check form) | 326 | (<= (car arg) (cdr arg))) |
| 534 | (let* ((args (rx-any-condense-range | 327 | ;; Copy the cons, in case we need to modify it. |
| 535 | (apply | 328 | (push (cons (car arg) (cdr arg)) conses)) |
| 536 | #'nconc | 329 | ((characterp arg) |
| 537 | (mapcar #'rx-check-any (cdr form))))) | 330 | (push (cons arg arg) conses)) |
| 538 | m | 331 | ((and (symbolp arg) |
| 539 | s) | 332 | (let ((class (cdr (assq arg rx--char-classes)))) |
| 540 | (cond | 333 | (and class (push class classes))))) |
| 541 | ;; single close bracket | 334 | (t (error "Invalid rx `any' argument: %s" arg)))) |
| 542 | ;; => "[]...-]" or "[]...--.]" | 335 | (let ((items |
| 543 | ((memq ?\] args) | 336 | ;; Translate strings and conses into nonoverlapping intervals, |
| 544 | ;; set ] at the beginning | 337 | ;; and add classes as symbols at the end. |
| 545 | (setq args (cons ?\] (delq ?\] args))) | 338 | (append |
| 546 | ;; set - at the end | 339 | (rx--condense-intervals |
| 547 | (if (or (memq ?- args) (assq ?- args)) | 340 | (sort (append conses |
| 548 | (setq args (nconc (rx-any-delete-from-range ?- args) | 341 | (mapcan #'rx--string-to-intervals strings)) |
| 549 | (list ?-))))) | 342 | #'car-less-than-car)) |
| 550 | ;; close bracket starts a range | 343 | (reverse classes)))) |
| 551 | ;; => "[]-....-]" or "[]-.--....]" | 344 | |
| 552 | ((setq m (assq ?\] args)) | 345 | ;; Move lone ] and range ]-x to the start. |
| 553 | ;; bring it to the beginning | 346 | (let ((rbrac-l (assq ?\] items))) |
| 554 | (setq args (cons m (delq m args))) | 347 | (when rbrac-l |
| 555 | (cond ((memq ?- args) | 348 | (setq items (cons rbrac-l (delq rbrac-l items))))) |
| 556 | ;; to the end | 349 | |
| 557 | (setq args (nconc (delq ?- args) (list ?-)))) | 350 | ;; Split x-] and move the lone ] to the start. |
| 558 | ((setq m (assq ?- args)) | 351 | (let ((rbrac-r (rassq ?\] items))) |
| 559 | ;; next to the bracket's range, make the second range | 352 | (when (and rbrac-r (not (eq (car rbrac-r) ?\]))) |
| 560 | (setcdr args (cons m (delq m (cdr args))))))) | 353 | (setcdr rbrac-r ?\\) |
| 561 | ;; bracket in the end range | 354 | (setq items (cons '(?\] . ?\]) items)))) |
| 562 | ;; => "[]...-]" | 355 | |
| 563 | ((setq m (rassq ?\] args)) | 356 | ;; Split ,-- (which would end up as ,- otherwise). |
| 564 | ;; set ] at the beginning | 357 | (let ((dash-r (rassq ?- items))) |
| 565 | (setq args (cons ?\] (rx-any-delete-from-range ?\] args))) | 358 | (when (eq (car dash-r) ?,) |
| 566 | ;; set - at the end | 359 | (setcdr dash-r ?,) |
| 567 | (if (or (memq ?- args) (assq ?- args)) | 360 | (setq items (nconc items '((?- . ?-)))))) |
| 568 | (setq args (nconc (rx-any-delete-from-range ?- args) | 361 | |
| 569 | (list ?-))))) | 362 | ;; Remove - (lone or at start of interval) |
| 570 | ;; {no close bracket appears} | 363 | (let ((dash-l (assq ?- items))) |
| 571 | ;; | 364 | (when dash-l |
| 572 | ;; bring single bar to the beginning | 365 | (if (eq (cdr dash-l) ?-) |
| 573 | ((memq ?- args) | 366 | (setq items (delq dash-l items)) ; Remove lone - |
| 574 | (setq args (cons ?- (delq ?- args)))) | 367 | (setcar dash-l ?.)) ; Reduce --x to .-x |
| 575 | ;; bar start a range, bring it to the beginning | 368 | (setq items (nconc items '((?- . ?-)))))) |
| 576 | ((setq m (assq ?- args)) | 369 | |
| 577 | (setq args (cons m (delq m args)))) | 370 | ;; Deal with leading ^ and range ^-x. |
| 578 | ;; | 371 | (when (and (consp (car items)) |
| 579 | ;; hat at the beginning? | 372 | (eq (caar items) ?^) |
| 580 | ((or (eq (car args) ?^) (eq (car-safe (car args)) ?^)) | 373 | (cdr items)) |
| 581 | (setq args (if (cdr args) | 374 | ;; Move ^ and ^-x to second place. |
| 582 | `(,(cadr args) ,(car args) ,@(cddr args)) | 375 | (setq items (cons (cadr items) |
| 583 | (nconc (rx-any-delete-from-range ?^ args) | 376 | (cons (car items) (cddr items))))) |
| 584 | (list ?^)))))) | 377 | |
| 585 | ;; some 1-char? | ||
| 586 | (if (and (null (cdr args)) (numberp (car args)) | ||
| 587 | (or (= 1 (length | ||
| 588 | (setq s (regexp-quote (string (car args)))))) | ||
| 589 | (and (equal (car args) ?^) ;; unnecessary predicate? | ||
| 590 | (null (eq rx-parent '!))))) | ||
| 591 | s | ||
| 592 | (concat "[" | ||
| 593 | (mapconcat | ||
| 594 | (lambda (e) (cond | ||
| 595 | ((numberp e) (string e)) | ||
| 596 | ((consp e) | ||
| 597 | (if (and (= (1+ (car e)) (cdr e)) | ||
| 598 | ;; rx-any-condense-range should | ||
| 599 | ;; prevent this case from happening. | ||
| 600 | (null (memq (car e) '(?\] ?-))) | ||
| 601 | (null (memq (cdr e) '(?\] ?-)))) | ||
| 602 | (string (car e) (cdr e)) | ||
| 603 | (string (car e) ?- (cdr e)))) | ||
| 604 | (e))) | ||
| 605 | args | ||
| 606 | nil) | ||
| 607 | "]")))) | ||
| 608 | |||
| 609 | |||
| 610 | (defun rx-check-not (arg) | ||
| 611 | "Check arg ARG for Rx `not'." | ||
| 612 | (unless (or (and (symbolp arg) | ||
| 613 | (string-match "\\`\\[\\[:[-a-z]+:\\]\\]\\'" | ||
| 614 | (condition-case nil | ||
| 615 | (rx-form arg) | ||
| 616 | (error "")))) | ||
| 617 | (eq arg 'word-boundary) | ||
| 618 | (and (consp arg) | ||
| 619 | (memq (car arg) '(not any in syntax category)))) | ||
| 620 | (error "rx `not' syntax error: %s" arg)) | ||
| 621 | t) | ||
| 622 | |||
| 623 | |||
| 624 | (defun rx-not (form) | ||
| 625 | "Parse and produce code from FORM. FORM is `(not ...)'." | ||
| 626 | (rx-check form) | ||
| 627 | (let ((result (rx-form (cadr form) '!)) | ||
| 628 | case-fold-search) | ||
| 629 | (cond ((string-match "\\`\\[\\^" result) | ||
| 630 | (cond | ||
| 631 | ((equal result "[^]") "[^^]") | ||
| 632 | ((and (= (length result) 4) (null (eq rx-parent '!))) | ||
| 633 | (regexp-quote (substring result 2 3))) | ||
| 634 | ((concat "[" (substring result 2))))) | ||
| 635 | ((eq ?\[ (aref result 0)) | ||
| 636 | (concat "[^" (substring result 1))) | ||
| 637 | ((string-match "\\`\\\\[scbw]" result) | ||
| 638 | (concat (upcase (substring result 0 2)) | ||
| 639 | (substring result 2))) | ||
| 640 | ((string-match "\\`\\\\[SCBW]" result) | ||
| 641 | (concat (downcase (substring result 0 2)) | ||
| 642 | (substring result 2))) | ||
| 643 | (t | ||
| 644 | (concat "[^" result "]"))))) | ||
| 645 | |||
| 646 | |||
| 647 | (defun rx-not-char (form) | ||
| 648 | "Parse and produce code from FORM. FORM is `(not-char ...)'." | ||
| 649 | (rx-check form) | ||
| 650 | (rx-not `(not (in ,@(cdr form))))) | ||
| 651 | |||
| 652 | |||
| 653 | (defun rx-not-syntax (form) | ||
| 654 | "Parse and produce code from FORM. FORM is `(not-syntax SYNTAX)'." | ||
| 655 | (rx-check form) | ||
| 656 | (rx-not `(not (syntax ,@(cdr form))))) | ||
| 657 | |||
| 658 | |||
| 659 | (defun rx-trans-forms (form &optional skip) | ||
| 660 | "If FORM's length is greater than two, transform it to length two. | ||
| 661 | A form (HEAD REST ...) becomes (HEAD (and REST ...)). | ||
| 662 | If SKIP is non-nil, allow that number of items after the head, i.e. | ||
| 663 | `(= N REST ...)' becomes `(= N (and REST ...))' if SKIP is 1." | ||
| 664 | (unless skip (setq skip 0)) | ||
| 665 | (let ((tail (nthcdr (1+ skip) form))) | ||
| 666 | (if (= (length tail) 1) | ||
| 667 | form | ||
| 668 | (let ((form (copy-sequence form))) | ||
| 669 | (setcdr (nthcdr skip form) (list (cons 'and tail))) | ||
| 670 | form)))) | ||
| 671 | |||
| 672 | |||
| 673 | (defun rx-= (form) | ||
| 674 | "Parse and produce code from FORM `(= N ...)'." | ||
| 675 | (rx-check form) | ||
| 676 | (setq form (rx-trans-forms form 1)) | ||
| 677 | (unless (and (integerp (nth 1 form)) | ||
| 678 | (> (nth 1 form) 0)) | ||
| 679 | (error "rx `=' requires positive integer first arg")) | ||
| 680 | (let ((subform (rx-form (nth 2 form) '*))) | ||
| 681 | (if (stringp subform) | ||
| 682 | (format "%s\\{%d\\}" subform (nth 1 form)) | ||
| 683 | `(,@subform ,(format "\\{%d\\}" (nth 1 form)))))) | ||
| 684 | |||
| 685 | |||
| 686 | (defun rx->= (form) | ||
| 687 | "Parse and produce code from FORM `(>= N ...)'." | ||
| 688 | (rx-check form) | ||
| 689 | (setq form (rx-trans-forms form 1)) | ||
| 690 | (unless (and (integerp (nth 1 form)) | ||
| 691 | (> (nth 1 form) 0)) | ||
| 692 | (error "rx `>=' requires positive integer first arg")) | ||
| 693 | (let ((subform (rx-form (nth 2 form) '*))) | ||
| 694 | (if (stringp subform) | ||
| 695 | (format "%s\\{%d,\\}" subform (nth 1 form)) | ||
| 696 | `(,@subform ,(format "\\{%d,\\}" (nth 1 form)))))) | ||
| 697 | |||
| 698 | |||
| 699 | (defun rx-** (form) | ||
| 700 | "Parse and produce code from FORM `(** N M ...)'." | ||
| 701 | (rx-check form) | ||
| 702 | (rx-form (cons 'repeat (cdr (rx-trans-forms form 2))) '*)) | ||
| 703 | |||
| 704 | |||
| 705 | (defun rx-repeat (form) | ||
| 706 | "Parse and produce code from FORM. | ||
| 707 | FORM is either `(repeat N FORM1)' or `(repeat N M FORMS...)'." | ||
| 708 | (rx-check form) | ||
| 709 | (if (> (length form) 4) | ||
| 710 | (setq form (rx-trans-forms form 2))) | ||
| 711 | (if (null (nth 2 form)) | ||
| 712 | (setq form (cons (nth 0 form) (cons (nth 1 form) (nthcdr 3 form))))) | ||
| 713 | (cond ((= (length form) 3) | ||
| 714 | (unless (and (integerp (nth 1 form)) | ||
| 715 | (> (nth 1 form) 0)) | ||
| 716 | (error "rx `repeat' requires positive integer first arg")) | ||
| 717 | (let ((subform (rx-form (nth 2 form) '*))) | ||
| 718 | (if (stringp subform) | ||
| 719 | (format "%s\\{%d\\}" subform (nth 1 form)) | ||
| 720 | `(,@subform ,(format "\\{%d\\}" (nth 1 form)))))) | ||
| 721 | ((or (not (integerp (nth 2 form))) | ||
| 722 | (< (nth 2 form) 0) | ||
| 723 | (not (integerp (nth 1 form))) | ||
| 724 | (< (nth 1 form) 0) | ||
| 725 | (< (nth 2 form) (nth 1 form))) | ||
| 726 | (error "rx `repeat' range error")) | ||
| 727 | (t | ||
| 728 | (let ((subform (rx-form (nth 3 form) '*))) | ||
| 729 | (if (stringp subform) | ||
| 730 | (format "%s\\{%d,%d\\}" subform (nth 1 form) (nth 2 form)) | ||
| 731 | `(,@subform ,(format "\\{%d,%d\\}" (nth 1 form) (nth 2 form)))))))) | ||
| 732 | |||
| 733 | |||
| 734 | (defun rx-submatch (form) | ||
| 735 | "Parse and produce code from FORM, which is `(submatch ...)'." | ||
| 736 | (let ((subforms (rx--subforms (cdr form) ':))) | ||
| 737 | (if (stringp subforms) | ||
| 738 | (concat "\\(" subforms "\\)") | ||
| 739 | `("\\(" ,@subforms "\\)")))) | ||
| 740 | |||
| 741 | (defun rx-submatch-n (form) | ||
| 742 | "Parse and produce code from FORM, which is `(submatch-n N ...)'." | ||
| 743 | (let ((n (nth 1 form)) | ||
| 744 | (subforms (rx--subforms (cddr form) ':))) | ||
| 745 | (unless (and (integerp n) (> n 0)) | ||
| 746 | (error "rx `submatch-n' argument must be positive")) | ||
| 747 | (if (stringp subforms) | ||
| 748 | (concat "\\(?" (number-to-string n) ":" subforms "\\)") | ||
| 749 | `("\\(?" ,(number-to-string n) ":" ,@subforms "\\)")))) | ||
| 750 | |||
| 751 | (defun rx-backref (form) | ||
| 752 | "Parse and produce code from FORM, which is `(backref N)'." | ||
| 753 | (rx-check form) | ||
| 754 | (format "\\%d" (nth 1 form))) | ||
| 755 | |||
| 756 | (defun rx-check-backref (arg) | ||
| 757 | "Check arg ARG for Rx `backref'." | ||
| 758 | (or (and (integerp arg) (>= arg 1) (<= arg 9)) | ||
| 759 | (error "rx `backref' requires numeric 1<=arg<=9: %s" arg))) | ||
| 760 | |||
| 761 | (defun rx-kleene (form) | ||
| 762 | "Parse and produce code from FORM. | ||
| 763 | FORM is `(OP FORM1)', where OP is one of the `zero-or-one', | ||
| 764 | `zero-or-more' etc. operators. | ||
| 765 | If OP is one of `*', `+', `?', produce a greedy regexp. | ||
| 766 | If OP is one of `*?', `+?', `??', produce a non-greedy regexp. | ||
| 767 | If OP is anything else, produce a greedy regexp if `rx-greedy-flag' | ||
| 768 | is non-nil." | ||
| 769 | (rx-check form) | ||
| 770 | (setq form (rx-trans-forms form)) | ||
| 771 | (let ((suffix (cond ((memq (car form) '(* + \? ?\s)) "") | ||
| 772 | ((memq (car form) '(*? +? \?? ??)) "?") | ||
| 773 | (rx-greedy-flag "") | ||
| 774 | (t "?"))) | ||
| 775 | (op (cond ((memq (car form) '(* *? 0+ zero-or-more)) "*") | ||
| 776 | ((memq (car form) '(+ +? 1+ one-or-more)) "+") | ||
| 777 | (t "?"))) | ||
| 778 | (subform (rx-form (cadr form) '*))) | ||
| 779 | (rx-group-if | ||
| 780 | (if (stringp subform) | ||
| 781 | (concat subform op suffix) | ||
| 782 | `(,@subform ,(concat op suffix))) | ||
| 783 | (and (memq rx-parent '(t *)) rx-parent)))) | ||
| 784 | |||
| 785 | |||
| 786 | (defun rx-atomic-p (r &optional lax) | ||
| 787 | "Return non-nil if regexp string R is atomic. | ||
| 788 | An atomic regexp R is one such that a suffix operator | ||
| 789 | appended to R will apply to all of R. For example, \"a\" | ||
| 790 | \"[abc]\" and \"\\(ab\\|ab*c\\)\" are atomic and \"ab\", | ||
| 791 | \"[ab]c\", and \"ab\\|ab*c\" are not atomic. | ||
| 792 | |||
| 793 | This function may return false negatives, but it will not | ||
| 794 | return false positives. It is nevertheless useful in | ||
| 795 | situations where an efficiency shortcut can be taken only if a | ||
| 796 | regexp is atomic. The function can be improved to detect | ||
| 797 | more cases of atomic regexps. Presently, this function | ||
| 798 | detects the following categories of atomic regexp; | ||
| 799 | |||
| 800 | a group or shy group: \\(...\\) | ||
| 801 | a character class: [...] | ||
| 802 | a single character: a | ||
| 803 | |||
| 804 | On the other hand, false negatives will be returned for | ||
| 805 | regexps that are atomic but end in operators, such as | ||
| 806 | \"a+\". I think these are rare. Probably such cases could | ||
| 807 | be detected without much effort. A guarantee of no false | ||
| 808 | negatives would require a theoretic specification of the set | ||
| 809 | of all atomic regexps." | ||
| 810 | (if (and rx--compile-to-lisp | ||
| 811 | (not (stringp r))) | ||
| 812 | nil ;; Runtime value, we must assume non-atomic. | ||
| 813 | (let ((l (length r))) | ||
| 814 | (cond | 378 | (cond |
| 815 | ((<= l 1)) | 379 | ;; Empty set: if negated, any char, otherwise match-nothing. |
| 816 | ((= l 2) (= (aref r 0) ?\\)) | 380 | ((null items) |
| 817 | ((= l 3) (string-match "\\`\\(?:\\\\[cCsS_]\\|\\[[^^]\\]\\)" r)) | 381 | (if negated |
| 818 | ((null lax) | 382 | (rx--translate-symbol 'anything) |
| 819 | (cond | 383 | (rx--empty))) |
| 820 | ((string-match "\\`\\[\\^?]?\\(?:\\[:[a-z]+:]\\|[^]]\\)*]\\'" r)) | 384 | ;; Single non-negated character. |
| 821 | ((string-match "\\`\\\\(\\(?:[^\\]\\|\\\\[^)]\\)*\\\\)\\'" r)))))))) | 385 | ((and (null (cdr items)) |
| 822 | 386 | (consp (car items)) | |
| 823 | 387 | (eq (caar items) (cdar items)) | |
| 824 | (defun rx-syntax (form) | 388 | (not negated)) |
| 825 | "Parse and produce code from FORM, which is `(syntax SYMBOL)'." | 389 | (cons (list (regexp-quote (char-to-string (caar items)))) |
| 826 | (rx-check form) | 390 | t)) |
| 827 | (let* ((sym (cadr form)) | 391 | ;; At least one character or class, possibly negated. |
| 828 | (syntax (cdr (assq sym rx-syntax)))) | 392 | (t |
| 393 | (cons | ||
| 394 | (list | ||
| 395 | (concat | ||
| 396 | "[" | ||
| 397 | (and negated "^") | ||
| 398 | (mapconcat (lambda (item) | ||
| 399 | (cond ((symbolp item) | ||
| 400 | (format "[:%s:]" item)) | ||
| 401 | ((eq (car item) (cdr item)) | ||
| 402 | (char-to-string (car item))) | ||
| 403 | ((eq (1+ (car item)) (cdr item)) | ||
| 404 | (string (car item) (cdr item))) | ||
| 405 | (t | ||
| 406 | (string (car item) ?- (cdr item))))) | ||
| 407 | items nil) | ||
| 408 | "]")) | ||
| 409 | t)))))) | ||
| 410 | |||
| 411 | (defun rx--translate-not (negated body) | ||
| 412 | "Translate a (not ...) construct. Return (REGEXP . PRECEDENCE). | ||
| 413 | If NEGATED, negate the sense (thus making it positive)." | ||
| 414 | (unless (and body (null (cdr body))) | ||
| 415 | (error "rx `not' form takes exactly one argument")) | ||
| 416 | (let ((arg (car body))) | ||
| 417 | (cond | ||
| 418 | ((consp arg) | ||
| 419 | (pcase (car arg) | ||
| 420 | ((or 'any 'in 'char) (rx--translate-any (not negated) (cdr arg))) | ||
| 421 | ('syntax (rx--translate-syntax (not negated) (cdr arg))) | ||
| 422 | ('category (rx--translate-category (not negated) (cdr arg))) | ||
| 423 | ('not (rx--translate-not (not negated) (cdr arg))) | ||
| 424 | (_ (error "Illegal argument to rx `not': %S" arg)))) | ||
| 425 | ((eq arg 'word-boundary) | ||
| 426 | (rx--translate-symbol | ||
| 427 | (if negated 'word-boundary 'not-word-boundary))) | ||
| 428 | (t | ||
| 429 | (let ((class (cdr (assq arg rx--char-classes)))) | ||
| 430 | (if class | ||
| 431 | (rx--translate-any (not negated) (list class)) | ||
| 432 | (error "Illegal argument to rx `not': %s" arg))))))) | ||
| 433 | |||
| 434 | (defun rx--atomic-regexp (item) | ||
| 435 | "ITEM is (REGEXP . PRECEDENCE); return a regexp of precedence t." | ||
| 436 | (if (eq (cdr item) t) | ||
| 437 | (car item) | ||
| 438 | (rx--bracket (car item)))) | ||
| 439 | |||
| 440 | (defun rx--translate-counted-repetition (min-count max-count body) | ||
| 441 | (let ((operand (rx--translate-seq body))) | ||
| 442 | (if (car operand) | ||
| 443 | (cons (append | ||
| 444 | (rx--atomic-regexp operand) | ||
| 445 | (list (concat "\\{" | ||
| 446 | (number-to-string min-count) | ||
| 447 | (cond ((null max-count) ",") | ||
| 448 | ((< min-count max-count) | ||
| 449 | (concat "," (number-to-string max-count)))) | ||
| 450 | "\\}"))) | ||
| 451 | t) | ||
| 452 | operand))) | ||
| 453 | |||
| 454 | (defun rx--check-repeat-arg (name min-args body) | ||
| 455 | (unless (>= (length body) min-args) | ||
| 456 | (error "rx `%s' requires at least %d argument%s" | ||
| 457 | name min-args (if (= min-args 1) "" "s"))) | ||
| 458 | ;; There seems to be no reason to disallow zero counts. | ||
| 459 | (unless (natnump (car body)) | ||
| 460 | (error "rx `%s' first argument must be nonnegative" name))) | ||
| 461 | |||
| 462 | (defun rx--translate-bounded-repetition (name body) | ||
| 463 | (let ((min-count (car body)) | ||
| 464 | (max-count (cadr body)) | ||
| 465 | (items (cddr body))) | ||
| 466 | (unless (and (natnump min-count) | ||
| 467 | (natnump max-count) | ||
| 468 | (<= min-count max-count)) | ||
| 469 | (error "rx `%s' range error" name)) | ||
| 470 | (rx--translate-counted-repetition min-count max-count items))) | ||
| 471 | |||
| 472 | (defun rx--translate-repeat (body) | ||
| 473 | (rx--check-repeat-arg 'repeat 2 body) | ||
| 474 | (if (= (length body) 2) | ||
| 475 | (rx--translate-counted-repetition (car body) (car body) (cdr body)) | ||
| 476 | (rx--translate-bounded-repetition 'repeat body))) | ||
| 477 | |||
| 478 | (defun rx--translate-** (body) | ||
| 479 | (rx--check-repeat-arg '** 2 body) | ||
| 480 | (rx--translate-bounded-repetition '** body)) | ||
| 481 | |||
| 482 | (defun rx--translate->= (body) | ||
| 483 | (rx--check-repeat-arg '>= 1 body) | ||
| 484 | (rx--translate-counted-repetition (car body) nil (cdr body))) | ||
| 485 | |||
| 486 | (defun rx--translate-= (body) | ||
| 487 | (rx--check-repeat-arg '= 1 body) | ||
| 488 | (rx--translate-counted-repetition (car body) (car body) (cdr body))) | ||
| 489 | |||
| 490 | (defvar rx--greedy t) | ||
| 491 | |||
| 492 | (defun rx--translate-rep (op-string greedy body) | ||
| 493 | "Translate a repetition; OP-STRING is one of \"*\", \"+\" or \"?\". | ||
| 494 | GREEDY is a boolean. Return (REGEXP . PRECEDENCE)." | ||
| 495 | (let ((operand (rx--translate-seq body))) | ||
| 496 | (if (car operand) | ||
| 497 | (cons (append (rx--atomic-regexp operand) | ||
| 498 | (list (concat op-string (unless greedy "?")))) | ||
| 499 | ;; The result has precedence seq to avoid (? (* "a")) -> "a*?" | ||
| 500 | 'seq) | ||
| 501 | operand))) | ||
| 502 | |||
| 503 | (defun rx--control-greedy (greedy body) | ||
| 504 | "Translate the sequence BODY with greediness GREEDY. | ||
| 505 | Return (REGEXP . PRECEDENCE)." | ||
| 506 | (let ((rx--greedy greedy)) | ||
| 507 | (rx--translate-seq body))) | ||
| 508 | |||
| 509 | (defun rx--translate-group (body) | ||
| 510 | "Translate the `group' form. Return (REGEXP . PRECEDENCE)." | ||
| 511 | (cons (rx--enclose "\\(" | ||
| 512 | (car (rx--translate-seq body)) | ||
| 513 | "\\)") | ||
| 514 | t)) | ||
| 515 | |||
| 516 | (defun rx--translate-group-n (body) | ||
| 517 | "Translate the `group-n' form. Return (REGEXP . PRECEDENCE)." | ||
| 518 | (unless (and (integerp (car body)) (> (car body) 0)) | ||
| 519 | (error "rx `group-n' requires a positive number as first argument")) | ||
| 520 | (cons (rx--enclose (concat "\\(?" (number-to-string (car body)) ":") | ||
| 521 | (car (rx--translate-seq (cdr body))) | ||
| 522 | "\\)") | ||
| 523 | t)) | ||
| 524 | |||
| 525 | (defun rx--translate-backref (body) | ||
| 526 | "Translate the `backref' form. Return (REGEXP . PRECEDENCE)." | ||
| 527 | (unless (and (= (length body) 1) (integerp (car body)) (<= 1 (car body) 9)) | ||
| 528 | (error "rx `backref' requires an argument in the range 1..9")) | ||
| 529 | (cons (list "\\" (number-to-string (car body))) t)) | ||
| 530 | |||
| 531 | (defconst rx--syntax-codes | ||
| 532 | '((whitespace . ?-) ; SPC also accepted | ||
| 533 | (punctuation . ?.) | ||
| 534 | (word . ?w) ; W also accepted | ||
| 535 | (symbol . ?_) | ||
| 536 | (open-parenthesis . ?\() | ||
| 537 | (close-parenthesis . ?\)) | ||
| 538 | (expression-prefix . ?\') | ||
| 539 | (string-quote . ?\") | ||
| 540 | (paired-delimiter . ?$) | ||
| 541 | (escape . ?\\) | ||
| 542 | (character-quote . ?/) | ||
| 543 | (comment-start . ?<) | ||
| 544 | (comment-end . ?>) | ||
| 545 | (string-delimiter . ?|) | ||
| 546 | (comment-delimiter . ?!))) | ||
| 547 | |||
| 548 | (defun rx--translate-syntax (negated body) | ||
| 549 | "Translate the `syntax' form. Return (REGEXP . PRECEDENCE)." | ||
| 550 | (unless (and body (null (cdr body))) | ||
| 551 | (error "rx `syntax' form takes exactly one argument")) | ||
| 552 | (let* ((sym (car body)) | ||
| 553 | (syntax (cdr (assq sym rx--syntax-codes)))) | ||
| 829 | (unless syntax | 554 | (unless syntax |
| 830 | ;; Try sregex compatibility. | ||
| 831 | (cond | 555 | (cond |
| 832 | ((characterp sym) (setq syntax sym)) | 556 | ;; Syntax character directly (sregex compatibility) |
| 557 | ((and (characterp sym) (rassq sym rx--syntax-codes)) | ||
| 558 | (setq syntax sym)) | ||
| 559 | ;; Syntax character as symbol (sregex compatibility) | ||
| 833 | ((symbolp sym) | 560 | ((symbolp sym) |
| 834 | (let ((name (symbol-name sym))) | 561 | (let ((name (symbol-name sym))) |
| 835 | (if (= 1 (length name)) | 562 | (when (= (length name) 1) |
| 836 | (setq syntax (aref name 0)))))) | 563 | (let ((char (string-to-char name))) |
| 564 | (when (rassq char rx--syntax-codes) | ||
| 565 | (setq syntax char))))))) | ||
| 837 | (unless syntax | 566 | (unless syntax |
| 838 | (error "Unknown rx syntax `%s'" sym))) | 567 | (error "Unknown rx syntax name `%s'" sym))) |
| 839 | (format "\\s%c" syntax))) | 568 | (cons (list (string ?\\ (if negated ?S ?s) syntax)) |
| 840 | 569 | t))) | |
| 841 | 570 | ||
| 842 | (defun rx-check-category (form) | 571 | (defconst rx--categories |
| 843 | "Check the argument FORM of a `(category FORM)'." | 572 | '((space-for-indent . ?\s) |
| 844 | (unless (or (integerp form) | 573 | (base . ?.) |
| 845 | (cdr (assq form rx-categories))) | 574 | (consonant . ?0) |
| 846 | (error "Unknown category `%s'" form)) | 575 | (base-vowel . ?1) |
| 847 | t) | 576 | (upper-diacritical-mark . ?2) |
| 848 | 577 | (lower-diacritical-mark . ?3) | |
| 849 | 578 | (tone-mark . ?4) | |
| 850 | (defun rx-category (form) | 579 | (symbol . ?5) |
| 851 | "Parse and produce code from FORM, which is `(category SYMBOL)'." | 580 | (digit . ?6) |
| 852 | (rx-check form) | 581 | (vowel-modifying-diacritical-mark . ?7) |
| 853 | (let ((char (if (integerp (cadr form)) | 582 | (vowel-sign . ?8) |
| 854 | (cadr form) | 583 | (semivowel-lower . ?9) |
| 855 | (cdr (assq (cadr form) rx-categories))))) | 584 | (not-at-end-of-line . ?<) |
| 856 | (format "\\c%c" char))) | 585 | (not-at-beginning-of-line . ?>) |
| 857 | 586 | (alpha-numeric-two-byte . ?A) | |
| 858 | 587 | (chinese-two-byte . ?C) | |
| 859 | (defun rx-eval (form) | 588 | (chinse-two-byte . ?C) ; A typo in Emacs 21.1-24.3. |
| 860 | "Parse and produce code from FORM, which is `(eval FORM)'." | 589 | (greek-two-byte . ?G) |
| 861 | (rx-check form) | 590 | (japanese-hiragana-two-byte . ?H) |
| 862 | (rx-form (eval (cadr form)) rx-parent)) | 591 | (indian-two-byte . ?I) |
| 863 | 592 | (japanese-katakana-two-byte . ?K) | |
| 864 | 593 | (strong-left-to-right . ?L) | |
| 865 | (defun rx-greedy (form) | 594 | (korean-hangul-two-byte . ?N) |
| 866 | "Parse and produce code from FORM. | 595 | (strong-right-to-left . ?R) |
| 867 | If FORM is `(minimal-match FORM1)', non-greedy versions of `*', | 596 | (cyrillic-two-byte . ?Y) |
| 868 | `+', and `?' operators will be used in FORM1. If FORM is | 597 | (combining-diacritic . ?^) |
| 869 | `(maximal-match FORM1)', greedy operators will be used." | 598 | (ascii . ?a) |
| 870 | (rx-check form) | 599 | (arabic . ?b) |
| 871 | (let ((rx-greedy-flag (eq (car form) 'maximal-match))) | 600 | (chinese . ?c) |
| 872 | (rx-form (cadr form) rx-parent))) | 601 | (ethiopic . ?e) |
| 873 | 602 | (greek . ?g) | |
| 874 | 603 | (korean . ?h) | |
| 875 | (defun rx-regexp (form) | 604 | (indian . ?i) |
| 876 | "Parse and produce code from FORM, which is `(regexp STRING)'." | 605 | (japanese . ?j) |
| 877 | (cond ((stringp (cadr form)) | 606 | (japanese-katakana . ?k) |
| 878 | (rx-group-if (cadr form) rx-parent)) | 607 | (latin . ?l) |
| 879 | (rx--compile-to-lisp | 608 | (lao . ?o) |
| 880 | ;; Always group non-string forms, since we can't be sure they | 609 | (tibetan . ?q) |
| 881 | ;; are atomic. | 610 | (japanese-roman . ?r) |
| 882 | (rx-group-if (cdr form) t)) | 611 | (thai . ?t) |
| 883 | (t (rx-check form)))) | 612 | (vietnamese . ?v) |
| 884 | 613 | (hebrew . ?w) | |
| 885 | (defun rx-literal (form) | 614 | (cyrillic . ?y) |
| 886 | "Parse and produce code from FORM, which is `(literal STRING-EXP)'." | 615 | (can-break . ?|))) |
| 887 | (cond ((stringp (cadr form)) | 616 | |
| 888 | ;; This is allowed, but makes little sense, you could just | 617 | (defun rx--translate-category (negated body) |
| 889 | ;; use STRING directly. | 618 | "Translate the `category' form. Return (REGEXP . PRECEDENCE)." |
| 890 | (rx-group-if (regexp-quote (cadr form)) rx-parent)) | 619 | (unless (and body (null (cdr body))) |
| 891 | (rx--compile-to-lisp | 620 | (error "rx `category' form takes exactly one argument")) |
| 892 | (rx-group-if `((regexp-quote ,(cadr form))) rx-parent)) | 621 | (let* ((arg (car body)) |
| 893 | (t (rx-check form)))) | 622 | (category |
| 894 | 623 | (cond ((symbolp arg) | |
| 895 | (defun rx-form (form &optional parent) | 624 | (let ((cat (assq arg rx--categories))) |
| 896 | "Parse and produce code for regular expression FORM. | 625 | (unless cat |
| 897 | FORM is a regular expression in sexp form. | 626 | (error "Unknown rx category `%s'" arg)) |
| 898 | PARENT shows which type of expression calls and controls putting of | 627 | (cdr cat))) |
| 899 | shy groups around the result and some more in other functions." | 628 | ((characterp arg) arg) |
| 900 | (let ((rx-parent parent)) | 629 | (t (error "Invalid rx `category' argument `%s'" arg))))) |
| 901 | (cond | 630 | (cons (list (string ?\\ (if negated ?C ?c) category)) |
| 902 | ((stringp form) | 631 | t))) |
| 903 | (rx-group-if (regexp-quote form) | 632 | |
| 904 | (if (and (eq parent '*) (< 1 (length form))) | 633 | (defvar rx--delayed-evaluation nil |
| 905 | parent))) | 634 | "Whether to allow certain forms to be evaluated at runtime.") |
| 906 | ((integerp form) | 635 | |
| 907 | (regexp-quote (char-to-string form))) | 636 | (defun rx--translate-literal (body) |
| 908 | ((symbolp form) | 637 | "Translate the `literal' form. Return (REGEXP . PRECEDENCE)." |
| 909 | (let ((info (rx-info form nil))) | 638 | (unless (and body (null (cdr body))) |
| 910 | (cond ((stringp info) | 639 | (error "rx `literal' form takes exactly one argument")) |
| 911 | info) | 640 | (let ((arg (car body))) |
| 912 | ((null info) | 641 | (cond ((stringp arg) |
| 913 | (error "Unknown rx form `%s'" form)) | 642 | (cons (list (regexp-quote arg)) (if (= (length arg) 1) t 'seq))) |
| 914 | (t | 643 | (rx--delayed-evaluation |
| 915 | (funcall (nth 0 info) form))))) | 644 | (cons (list (list 'regexp-quote arg)) 'seq)) |
| 916 | ((consp form) | 645 | (t (error "rx `literal' form with non-string argument"))))) |
| 917 | (let ((info (rx-info (car form) 'head))) | 646 | |
| 918 | (unless (consp info) | 647 | (defun rx--translate-eval (body) |
| 919 | (error "Unknown rx form `%s'" (car form))) | 648 | "Translate the `eval' form. Return (REGEXP . PRECEDENCE)." |
| 920 | (funcall (nth 0 info) form))) | 649 | (unless (and body (null (cdr body))) |
| 921 | (t | 650 | (error "rx `eval' form takes exactly one argument")) |
| 922 | (error "rx syntax error at `%s'" form))))) | 651 | (rx--translate (eval (car body)))) |
| 923 | 652 | ||
| 924 | (defun rx--subforms (subforms &optional parent separator) | 653 | (defvar rx--regexp-atomic-regexp nil) |
| 925 | "Produce code for regular expressions SUBFORMS. | 654 | |
| 926 | SUBFORMS is a list of regular expression sexps. | 655 | (defun rx--translate-regexp (body) |
| 927 | PARENT controls grouping, as in `rx-form'. | 656 | "Translate the `regexp' form. Return (REGEXP . PRECEDENCE)." |
| 928 | Insert SEPARATOR between the code from each of SUBFORMS." | 657 | (unless (and body (null (cdr body))) |
| 929 | (if (null (cdr subforms)) | 658 | (error "rx `regexp' form takes exactly one argument")) |
| 930 | ;; Zero or one forms, no need for grouping. | 659 | (let ((arg (car body))) |
| 931 | (and subforms (rx-form (car subforms))) | 660 | (cond ((stringp arg) |
| 932 | (let ((listify (lambda (x) | 661 | ;; Generate the regexp when needed, since rx isn't |
| 933 | (if (listp x) (copy-sequence x) | 662 | ;; necessarily present in the byte-compilation environment. |
| 934 | (list x))))) | 663 | (unless rx--regexp-atomic-regexp |
| 935 | (setq subforms (mapcar (lambda (x) (rx-form x parent)) subforms)) | 664 | (setq rx--regexp-atomic-regexp |
| 936 | (cond ((or (not rx--compile-to-lisp) | 665 | ;; Match atomic (precedence t) regexps: may give |
| 937 | (cl-every #'stringp subforms)) | 666 | ;; false negatives but no false positives, assuming |
| 938 | (mapconcat #'identity subforms separator)) | 667 | ;; the target string is syntactically correct. |
| 939 | (separator | 668 | (rx-to-string |
| 940 | (nconc (funcall listify (car subforms)) | 669 | '(seq |
| 941 | (mapcan (lambda (x) | 670 | bos |
| 942 | (cons separator (funcall listify x))) | 671 | (or (seq "[" |
| 943 | (cdr subforms)))) | 672 | (opt "^") |
| 944 | (t (mapcan listify subforms)))))) | 673 | (opt "]") |
| 674 | (* (or (seq "[:" (+ (any "a-z")) ":]") | ||
| 675 | (not (any "]")))) | ||
| 676 | "]") | ||
| 677 | anything | ||
| 678 | (seq "\\" | ||
| 679 | (or anything | ||
| 680 | (seq (any "sScC_") anything) | ||
| 681 | (seq "(" | ||
| 682 | (* (or (not (any "\\")) | ||
| 683 | (seq "\\" (not (any ")"))))) | ||
| 684 | "\\)")))) | ||
| 685 | eos) | ||
| 686 | t))) | ||
| 687 | (cons (list arg) | ||
| 688 | (if (string-match-p rx--regexp-atomic-regexp arg) t nil))) | ||
| 689 | (rx--delayed-evaluation | ||
| 690 | (cons (list arg) nil)) | ||
| 691 | (t (error "rx `regexp' form with non-string argument"))))) | ||
| 692 | |||
| 693 | (defun rx--translate-compat-form (def form) | ||
| 694 | "Translate a compatibility form from `rx-constituents'. | ||
| 695 | DEF is the definition tuple. Return (REGEXP . PRECEDENCE)." | ||
| 696 | (let* ((fn (nth 0 def)) | ||
| 697 | (min-args (nth 1 def)) | ||
| 698 | (max-args (nth 2 def)) | ||
| 699 | (predicate (nth 3 def)) | ||
| 700 | (nargs (1- (length form)))) | ||
| 701 | (when (< nargs min-args) | ||
| 702 | (error "The `%s' form takes at least %d argument(s)" | ||
| 703 | (car form) min-args)) | ||
| 704 | (when (and max-args (> nargs max-args)) | ||
| 705 | (error "The `%s' form takes at most %d argument(s)" | ||
| 706 | (car form) max-args)) | ||
| 707 | (when (and predicate (not (rx--every predicate (cdr form)))) | ||
| 708 | (error "The `%s' form requires arguments satisfying `%s'" | ||
| 709 | (car form) predicate)) | ||
| 710 | (let ((regexp (funcall fn form))) | ||
| 711 | (unless (stringp regexp) | ||
| 712 | (error "The `%s' form did not expand to a string" (car form))) | ||
| 713 | (cons (list regexp) nil)))) | ||
| 714 | |||
| 715 | (defun rx--translate-form (form) | ||
| 716 | "Translate an rx form (list structure). Return (REGEXP . PRECEDENCE)." | ||
| 717 | (let ((body (cdr form))) | ||
| 718 | (pcase (car form) | ||
| 719 | ((or 'seq : 'and 'sequence) (rx--translate-seq body)) | ||
| 720 | ((or 'or '|) (rx--translate-or body)) | ||
| 721 | ((or 'any 'in 'char) (rx--translate-any nil body)) | ||
| 722 | ('not-char (rx--translate-any t body)) | ||
| 723 | ('not (rx--translate-not nil body)) | ||
| 724 | |||
| 725 | ('repeat (rx--translate-repeat body)) | ||
| 726 | ('= (rx--translate-= body)) | ||
| 727 | ('>= (rx--translate->= body)) | ||
| 728 | ('** (rx--translate-** body)) | ||
| 729 | |||
| 730 | ((or 'zero-or-more '0+) (rx--translate-rep "*" rx--greedy body)) | ||
| 731 | ((or 'one-or-more '1+) (rx--translate-rep "+" rx--greedy body)) | ||
| 732 | ((or 'zero-or-one 'opt 'optional) (rx--translate-rep "?" rx--greedy body)) | ||
| 733 | |||
| 734 | ('* (rx--translate-rep "*" t body)) | ||
| 735 | ('+ (rx--translate-rep "+" t body)) | ||
| 736 | ((or '\? ?\s) (rx--translate-rep "?" t body)) | ||
| 737 | |||
| 738 | ('*? (rx--translate-rep "*" nil body)) | ||
| 739 | ('+? (rx--translate-rep "+" nil body)) | ||
| 740 | ((or '\?? ??) (rx--translate-rep "?" nil body)) | ||
| 741 | |||
| 742 | ('minimal-match (rx--control-greedy nil body)) | ||
| 743 | ('maximal-match (rx--control-greedy t body)) | ||
| 744 | |||
| 745 | ((or 'group 'submatch) (rx--translate-group body)) | ||
| 746 | ((or 'group-n 'submatch-n) (rx--translate-group-n body)) | ||
| 747 | ('backref (rx--translate-backref body)) | ||
| 748 | |||
| 749 | ('syntax (rx--translate-syntax nil body)) | ||
| 750 | ('not-syntax (rx--translate-syntax t body)) | ||
| 751 | ('category (rx--translate-category nil body)) | ||
| 752 | |||
| 753 | ('literal (rx--translate-literal body)) | ||
| 754 | ('eval (rx--translate-eval body)) | ||
| 755 | ((or 'regexp 'regex) (rx--translate-regexp body)) | ||
| 756 | |||
| 757 | (op | ||
| 758 | (unless (symbolp op) | ||
| 759 | (error "Bad rx operator `%S'" op)) | ||
| 760 | |||
| 761 | ;; For compatibility with old rx. | ||
| 762 | (let ((entry (assq op rx-constituents))) | ||
| 763 | (if (progn | ||
| 764 | (while (and entry (not (consp (cdr entry)))) | ||
| 765 | (setq entry | ||
| 766 | (if (symbolp (cdr entry)) | ||
| 767 | ;; Alias for another entry. | ||
| 768 | (assq (cdr entry) rx-constituents) | ||
| 769 | ;; Wrong type, try further down the list. | ||
| 770 | (assq (car entry) | ||
| 771 | (cdr (memq entry rx-constituents)))))) | ||
| 772 | entry) | ||
| 773 | (rx--translate-compat-form (cdr entry) form) | ||
| 774 | (error "Unknown rx form `%s'" op))))))) | ||
| 775 | |||
| 776 | ;; Defined here rather than in re-builder to lower the odds that it | ||
| 777 | ;; will be kept in sync with changes. | ||
| 778 | (defconst rx--builtin-forms | ||
| 779 | '(seq sequence : and or | any in char not-char not | ||
| 780 | repeat = >= ** | ||
| 781 | zero-or-more 0+ * | ||
| 782 | one-or-more 1+ + | ||
| 783 | zero-or-one opt optional \? | ||
| 784 | *? +? \?? | ||
| 785 | minimal-match maximal-match | ||
| 786 | group submatch group-n submatch-n backref | ||
| 787 | syntax not-syntax category | ||
| 788 | literal eval regexp regex) | ||
| 789 | "List of built-in rx forms. For use in re-builder only.") | ||
| 790 | |||
| 791 | (defun rx--translate (item) | ||
| 792 | "Translate the rx-expression ITEM. Return (REGEXP . PRECEDENCE)." | ||
| 793 | (cond | ||
| 794 | ((stringp item) | ||
| 795 | (if (= (length item) 0) | ||
| 796 | (cons nil 'seq) | ||
| 797 | (cons (list (regexp-quote item)) (if (= (length item) 1) t 'seq)))) | ||
| 798 | ((characterp item) | ||
| 799 | (cons (list (regexp-quote (char-to-string item))) t)) | ||
| 800 | ((symbolp item) | ||
| 801 | (rx--translate-symbol item)) | ||
| 802 | ((consp item) | ||
| 803 | (rx--translate-form item)) | ||
| 804 | (t (error "Bad rx expression: %S" item)))) | ||
| 945 | 805 | ||
| 946 | 806 | ||
| 947 | ;;;###autoload | 807 | ;;;###autoload |
| 948 | (defun rx-to-string (form &optional no-group) | 808 | (defun rx-to-string (form &optional no-group) |
| 949 | "Parse and produce code for regular expression FORM. | 809 | "Translate FORM from `rx' sexp syntax into a string regexp. |
| 950 | FORM is a regular expression in sexp form. | 810 | The arguments to `literal' and `regexp' forms inside FORM must be |
| 951 | NO-GROUP non-nil means don't put shy groups around the result. | 811 | constant strings. |
| 952 | 812 | If NO-GROUP is non-nil, don't bracket the result in a non-capturing | |
| 953 | In contrast to the `rx' macro, subforms `literal' and `regexp' | 813 | group." |
| 954 | will not accept non-string arguments, i.e., (literal STRING) | 814 | (let* ((item (rx--translate form)) |
| 955 | becomes just a more verbose version of STRING." | 815 | (exprs (if no-group |
| 956 | (rx-group-if (rx-form form) (null no-group))) | 816 | (car item) |
| 817 | (rx--atomic-regexp item)))) | ||
| 818 | (apply #'concat exprs))) | ||
| 819 | |||
| 820 | (defun rx--to-expr (form) | ||
| 821 | "Translate the rx-expression FORM to a Lisp expression yielding a regexp." | ||
| 822 | (let* ((rx--delayed-evaluation t) | ||
| 823 | (elems (car (rx--translate form))) | ||
| 824 | (args nil)) | ||
| 825 | ;; Merge adjacent strings. | ||
| 826 | (while elems | ||
| 827 | (let ((strings nil)) | ||
| 828 | (while (and elems (stringp (car elems))) | ||
| 829 | (push (car elems) strings) | ||
| 830 | (setq elems (cdr elems))) | ||
| 831 | (let ((s (apply #'concat (nreverse strings)))) | ||
| 832 | (unless (zerop (length s)) | ||
| 833 | (push s args)))) | ||
| 834 | (when elems | ||
| 835 | (push (car elems) args) | ||
| 836 | (setq elems (cdr elems)))) | ||
| 837 | (cond ((null args) "") ; 0 args | ||
| 838 | ((cdr args) (cons 'concat (nreverse args))) ; â„2 args | ||
| 839 | (t (car args))))) ; 1 arg | ||
| 957 | 840 | ||
| 958 | 841 | ||
| 959 | ;;;###autoload | 842 | ;;;###autoload |
| @@ -1054,78 +937,64 @@ Zero-width assertions: these all match the empty string in specific places. | |||
| 1054 | 937 | ||
| 1055 | (literal EXPR) Match the literal string from evaluating EXPR at run time. | 938 | (literal EXPR) Match the literal string from evaluating EXPR at run time. |
| 1056 | (regexp EXPR) Match the string regexp from evaluating EXPR at run time. | 939 | (regexp EXPR) Match the string regexp from evaluating EXPR at run time. |
| 1057 | (eval EXPR) Match the rx sexp from evaluating EXPR at compile time." | 940 | (eval EXPR) Match the rx sexp from evaluating EXPR at compile time. |
| 1058 | (let* ((rx--compile-to-lisp t) | 941 | |
| 1059 | (re (cond ((null regexps) | 942 | \(fn REGEXPS...)" |
| 1060 | (error "No regexp")) | 943 | (rx--to-expr (cons 'seq regexps))) |
| 1061 | ((cdr regexps) | 944 | |
| 1062 | (rx-to-string `(and ,@regexps) t)) | 945 | |
| 1063 | (t | 946 | ;; During `rx--pcase-transform', list of defined variables in right-to-left |
| 1064 | (rx-to-string (car regexps) t))))) | 947 | ;; order. |
| 1065 | (if (stringp re) | 948 | (defvar rx--pcase-vars) |
| 1066 | re | 949 | |
| 1067 | `(concat ,@re)))) | 950 | (defun rx--pcase-transform (rx) |
| 1068 | 951 | "Transform RX, an rx-expression augmented with `let' and named `backref', | |
| 952 | into a plain rx-expression, collecting names into `rx--pcase-vars'." | ||
| 953 | (pcase rx | ||
| 954 | (`(let ,name . ,body) | ||
| 955 | (let* ((index (length (memq name rx--pcase-vars))) | ||
| 956 | (i (if (zerop index) | ||
| 957 | (length (push name rx--pcase-vars)) | ||
| 958 | index))) | ||
| 959 | `(group-n ,i ,(rx--pcase-transform (cons 'seq body))))) | ||
| 960 | ((and `(backref ,ref) | ||
| 961 | (guard (symbolp ref))) | ||
| 962 | (let ((index (length (memq ref rx--pcase-vars)))) | ||
| 963 | (when (zerop index) | ||
| 964 | (error "rx `backref' variable must be one of: %s" | ||
| 965 | (mapconcat #'symbol-name rx--pcase-vars " "))) | ||
| 966 | `(backref ,index))) | ||
| 967 | ((and `(,head . ,rest) | ||
| 968 | (guard (and (symbolp head) | ||
| 969 | (not (memq head '(literal regexp regex eval)))))) | ||
| 970 | (cons head (mapcar #'rx--pcase-transform rest))) | ||
| 971 | (_ rx))) | ||
| 1069 | 972 | ||
| 1070 | (pcase-defmacro rx (&rest regexps) | 973 | (pcase-defmacro rx (&rest regexps) |
| 1071 | "Build a `pcase' pattern matching `rx' REGEXPS in sexp form. | 974 | "A pattern that matches strings against `rx' REGEXPS in sexp form. |
| 1072 | The REGEXPS are interpreted as in `rx'. The pattern matches any | 975 | REGEXPS are interpreted as in `rx'. The pattern matches any |
| 1073 | string that is a match for the regular expression so constructed, | 976 | string that is a match for REGEXPS, as if by `string-match'. |
| 1074 | as if by `string-match'. | ||
| 1075 | 977 | ||
| 1076 | In addition to the usual `rx' constructs, REGEXPS can contain the | 978 | In addition to the usual `rx' syntax, REGEXPS can contain the |
| 1077 | following constructs: | 979 | following constructs: |
| 1078 | 980 | ||
| 1079 | (let REF SEXP...) creates a new explicitly named reference to | 981 | (let REF RX...) binds the symbol REF to a submatch that matches |
| 1080 | a submatch that matches regular expressions | 982 | the regular expressions RX. REF is bound in |
| 1081 | SEXP, and binds the match to REF. | 983 | CODE to the string of the submatch or nil, but |
| 1082 | (backref REF) creates a backreference to the submatch | 984 | can also be used in `backref'. |
| 1083 | introduced by a previous (let REF ...) | 985 | (backref REF) matches whatever the submatch REF matched. |
| 1084 | construct. REF can be the same symbol | 986 | REF can be a number, as usual, or a name |
| 1085 | in the first argument of the corresponding | 987 | introduced by a previous (let REF ...) |
| 1086 | (let REF ...) construct, or it can be a | 988 | construct." |
| 1087 | submatch number. It matches the referenced | 989 | (let* ((rx--pcase-vars nil) |
| 1088 | submatch. | 990 | (regexp (rx--to-expr (rx--pcase-transform (cons 'seq regexps))))) |
| 1089 | |||
| 1090 | The REFs are associated with explicitly named submatches starting | ||
| 1091 | from 1. Multiple occurrences of the same REF refer to the same | ||
| 1092 | submatch. | ||
| 1093 | |||
| 1094 | If a case matches, the match data is modified as usual so you can | ||
| 1095 | use it in the case body, but you still have to pass the correct | ||
| 1096 | string as argument to `match-string'." | ||
| 1097 | (let* ((vars ()) | ||
| 1098 | (rx-constituents | ||
| 1099 | `((let | ||
| 1100 | ,(lambda (form) | ||
| 1101 | (rx-check form) | ||
| 1102 | (let ((var (cadr form))) | ||
| 1103 | (cl-check-type var symbol) | ||
| 1104 | (let ((i (or (cl-position var vars :test #'eq) | ||
| 1105 | (prog1 (length vars) | ||
| 1106 | (setq vars `(,@vars ,var)))))) | ||
| 1107 | (rx-form `(submatch-n ,(1+ i) ,@(cddr form)))))) | ||
| 1108 | 1 nil) | ||
| 1109 | (backref | ||
| 1110 | ,(lambda (form) | ||
| 1111 | (rx-check form) | ||
| 1112 | (rx-backref | ||
| 1113 | `(backref ,(let ((var (cadr form))) | ||
| 1114 | (if (integerp var) var | ||
| 1115 | (1+ (cl-position var vars :test #'eq))))))) | ||
| 1116 | 1 1 | ||
| 1117 | ,(lambda (var) | ||
| 1118 | (cond ((integerp var) (rx-check-backref var)) | ||
| 1119 | ((memq var vars) t) | ||
| 1120 | (t (error "rx `backref' variable must be one of %s: %s" | ||
| 1121 | vars var))))) | ||
| 1122 | ,@rx-constituents)) | ||
| 1123 | (regexp (rx-to-string `(seq ,@regexps) :no-group))) | ||
| 1124 | `(and (pred (string-match ,regexp)) | 991 | `(and (pred (string-match ,regexp)) |
| 1125 | ,@(cl-loop for i from 1 | 992 | ,@(let ((i 0)) |
| 1126 | for var in vars | 993 | (mapcar (lambda (name) |
| 1127 | collect `(app (match-string ,i) ,var))))) | 994 | (setq i (1+ i)) |
| 1128 | 995 | `(app (match-string ,i) ,name)) | |
| 996 | (reverse rx--pcase-vars)))))) | ||
| 997 | |||
| 1129 | (provide 'rx) | 998 | (provide 'rx) |
| 1130 | 999 | ||
| 1131 | ;;; rx.el ends here | 1000 | ;;; rx.el ends here |
diff --git a/test/lisp/emacs-lisp/rx-tests.el b/test/lisp/emacs-lisp/rx-tests.el index 8845ebf46d1..fec046dd991 100644 --- a/test/lisp/emacs-lisp/rx-tests.el +++ b/test/lisp/emacs-lisp/rx-tests.el | |||
| @@ -1,4 +1,4 @@ | |||
| 1 | ;;; rx-tests.el --- test for rx.el functions -*- lexical-binding: t -*- | 1 | ;;; rx-tests.el --- tests for rx.el -*- lexical-binding: t -*- |
| 2 | 2 | ||
| 3 | ;; Copyright (C) 2016-2019 Free Software Foundation, Inc. | 3 | ;; Copyright (C) 2016-2019 Free Software Foundation, Inc. |
| 4 | 4 | ||
| @@ -17,21 +17,44 @@ | |||
| 17 | ;; You should have received a copy of the GNU General Public License | 17 | ;; You should have received a copy of the GNU General Public License |
| 18 | ;; along with GNU Emacs. If not, see <https://www.gnu.org/licenses/>. | 18 | ;; along with GNU Emacs. If not, see <https://www.gnu.org/licenses/>. |
| 19 | 19 | ||
| 20 | ;;; Commentary: | ||
| 21 | |||
| 22 | (require 'ert) | 20 | (require 'ert) |
| 23 | (require 'rx) | 21 | (require 'rx) |
| 24 | 22 | ||
| 25 | ;;; Code: | 23 | (ert-deftest rx-seq () |
| 24 | (should (equal (rx "a.b" "*" "c") | ||
| 25 | "a\\.b\\*c")) | ||
| 26 | (should (equal (rx (seq "a" (: "b" (and "c" (sequence "d" nonl) | ||
| 27 | "e") | ||
| 28 | "f") | ||
| 29 | "g")) | ||
| 30 | "abcd.efg")) | ||
| 31 | (should (equal (rx "a$" "b") | ||
| 32 | "a\\$b")) | ||
| 33 | (should (equal (rx bol "a" "b" ?c eol) | ||
| 34 | "^abc$")) | ||
| 35 | (should (equal (rx "a" "" "b") | ||
| 36 | "ab")) | ||
| 37 | (should (equal (rx (seq)) | ||
| 38 | "")) | ||
| 39 | (should (equal (rx "" (or "ab" nonl) "") | ||
| 40 | "ab\\|."))) | ||
| 41 | |||
| 42 | (ert-deftest rx-or () | ||
| 43 | (should (equal (rx (or "ab" (| "c" nonl) "de")) | ||
| 44 | "ab\\|c\\|.\\|de")) | ||
| 45 | (should (equal (rx (or "ab" "abc" "a")) | ||
| 46 | "\\(?:ab\\|abc\\|a\\)")) | ||
| 47 | (should (equal (rx (| nonl "a") (| "b" blank)) | ||
| 48 | "\\(?:.\\|a\\)\\(?:b\\|[[:blank:]]\\)")) | ||
| 49 | (should (equal (rx (|)) | ||
| 50 | "\\`a\\`"))) | ||
| 26 | 51 | ||
| 27 | (ert-deftest rx-char-any () | 52 | (ert-deftest rx-char-any () |
| 28 | "Test character alternatives with `]' and `-' (Bug#25123)." | 53 | "Test character alternatives with `]' and `-' (Bug#25123)." |
| 29 | (should (string-match | 54 | (should (equal |
| 30 | (rx string-start (1+ (char (?\] . ?\{) (?< . ?\]) (?- . ?:))) | 55 | (rx string-start (1+ (char (?\] . ?\{) (?< . ?\]) (?- . ?:))) |
| 31 | string-end) | 56 | string-end) |
| 32 | (apply #'string (nconc (number-sequence ?\] ?\{) | 57 | "\\`[.-:<-{-]+\\'"))) |
| 33 | (number-sequence ?< ?\]) | ||
| 34 | (number-sequence ?- ?:)))))) | ||
| 35 | 58 | ||
| 36 | (ert-deftest rx-char-any-range-nl () | 59 | (ert-deftest rx-char-any-range-nl () |
| 37 | "Test character alternatives with LF as a range endpoint." | 60 | "Test character alternatives with LF as a range endpoint." |
| @@ -40,28 +63,72 @@ | |||
| 40 | (should (equal (rx (any "\a-\n")) | 63 | (should (equal (rx (any "\a-\n")) |
| 41 | "[\a-\n]"))) | 64 | "[\a-\n]"))) |
| 42 | 65 | ||
| 43 | (ert-deftest rx-char-any-range-bad () | ||
| 44 | (should-error (rx (any "0-9a-Z"))) | ||
| 45 | (should-error (rx (any (?0 . ?9) (?a . ?Z))))) | ||
| 46 | |||
| 47 | (ert-deftest rx-char-any-raw-byte () | 66 | (ert-deftest rx-char-any-raw-byte () |
| 48 | "Test raw bytes in character alternatives." | 67 | "Test raw bytes in character alternatives." |
| 68 | |||
| 69 | ;; The multibyteness of the rx return value sometimes depends on whether | ||
| 70 | ;; the test had been byte-compiled or not, so we add explicit conversions. | ||
| 71 | |||
| 49 | ;; Separate raw characters. | 72 | ;; Separate raw characters. |
| 50 | (should (equal (string-match-p (rx (any "\326A\333B")) | 73 | (should (equal (string-to-multibyte (rx (any "\326A\333B"))) |
| 51 | "X\326\333") | 74 | (string-to-multibyte "[AB\326\333]"))) |
| 52 | 1)) | ||
| 53 | ;; Range of raw characters, unibyte. | 75 | ;; Range of raw characters, unibyte. |
| 54 | (should (equal (string-match-p (rx (any "\200-\377")) | 76 | (should (equal (string-to-multibyte (rx (any "\200-\377"))) |
| 55 | "ĂżA\310B") | 77 | (string-to-multibyte "[\200-\377]"))) |
| 56 | 2)) | 78 | |
| 57 | ;; Range of raw characters, multibyte. | 79 | ;; Range of raw characters, multibyte. |
| 58 | (should (equal (string-match-p (rx (any "Ă \211\326-\377\177")) | 80 | (should (equal (rx (any "Ă \211\326-\377\177")) |
| 59 | "XY\355\177\327") | 81 | "[\177Ă \211\326-\377]")) |
| 60 | 2)) | ||
| 61 | ;; Split range; \177-\377Ăż should not be optimised to \177-\377. | 82 | ;; Split range; \177-\377Ăż should not be optimised to \177-\377. |
| 62 | (should (equal (string-match-p (rx (any "\177-\377" ?Ăż)) | 83 | (should (equal (rx (any "\177-\377" ?Ăż)) |
| 63 | "ĂżA\310B") | 84 | "[\177Ăż\200-\377]"))) |
| 64 | 0))) | 85 | |
| 86 | (ert-deftest rx-any () | ||
| 87 | (should (equal (rx (any ?A (?C . ?D) "F-H" "J-L" "M" "N-P" "Q" "RS")) | ||
| 88 | "[ACDF-HJ-S]")) | ||
| 89 | (should (equal (rx (in "a!f" ?c) (char "q-z" "0-3") | ||
| 90 | (not-char "a-e1-5") (not (in "A-M" ?q))) | ||
| 91 | "[!acf][0-3q-z][^1-5a-e][^A-Mq]")) | ||
| 92 | (should (equal (rx (any "^") (any "]") (any "-") | ||
| 93 | (not (any "^")) (not (any "]")) (not (any "-"))) | ||
| 94 | "\\^]-[^^][^]][^-]")) | ||
| 95 | (should (equal (rx (any "]" "^") (any "]" "-") (any "-" "^") | ||
| 96 | (not (any "]" "^")) (not (any "]" "-")) | ||
| 97 | (not (any "-" "^"))) | ||
| 98 | "[]^][]-][-^][^]^][^]-][^-^]")) | ||
| 99 | (should (equal (rx (any "]" "^" "-") (not (any "]" "^" "-"))) | ||
| 100 | "[]^-][^]^-]")) | ||
| 101 | (should (equal (rx (any "-" ascii) (any "^" ascii) (any "]" ascii)) | ||
| 102 | "[[:ascii:]-][[:ascii:]^][][:ascii:]]")) | ||
| 103 | (should (equal (rx (not (any "-" ascii)) (not (any "^" ascii)) | ||
| 104 | (not (any "]" ascii))) | ||
| 105 | "[^[:ascii:]-][^[:ascii:]^][^][:ascii:]]")) | ||
| 106 | (should (equal (rx (any "-]" ascii) (any "^]" ascii) (any "-^" ascii)) | ||
| 107 | "[][:ascii:]-][]^[:ascii:]][[:ascii:]^-]")) | ||
| 108 | (should (equal (rx (not (any "-]" ascii)) (not (any "^]" ascii)) | ||
| 109 | (not (any "-^" ascii))) | ||
| 110 | "[^][:ascii:]-][^]^[:ascii:]][^[:ascii:]^-]")) | ||
| 111 | (should (equal (rx (any "-]^" ascii) (not (any "-]^" ascii))) | ||
| 112 | "[]^[:ascii:]-][^]^[:ascii:]-]")) | ||
| 113 | (should (equal (rx (any "^" lower upper) (not (any "^" lower upper))) | ||
| 114 | "[[:lower:]^[:upper:]][^[:lower:]^[:upper:]]")) | ||
| 115 | (should (equal (rx (any "-" lower upper) (not (any "-" lower upper))) | ||
| 116 | "[[:lower:][:upper:]-][^[:lower:][:upper:]-]")) | ||
| 117 | (should (equal (rx (any "]" lower upper) (not (any "]" lower upper))) | ||
| 118 | "[][:lower:][:upper:]][^][:lower:][:upper:]]")) | ||
| 119 | (should (equal (rx (any "-a" "c-" "f-f" "--/*--")) | ||
| 120 | "[*-/acf]")) | ||
| 121 | (should (equal (rx (any "]-a" ?-) (not (any "]-a" ?-))) | ||
| 122 | "[]-a-][^]-a-]")) | ||
| 123 | (should (equal (rx (any "--]") (not (any "--]")) | ||
| 124 | (any "-" "^-a") (not (any "-" "^-a"))) | ||
| 125 | "[].-\\-][^].-\\-][-^-a][^-^-a]")) | ||
| 126 | (should (equal (rx (not (any "!a" "0-8" digit nonascii))) | ||
| 127 | "[^!0-8a[:digit:][:nonascii:]]")) | ||
| 128 | (should (equal (rx (any) (not (any))) | ||
| 129 | "\\`a\\`\\(?:.\\|\n\\)")) | ||
| 130 | (should (equal (rx (any "") (not (any ""))) | ||
| 131 | "\\`a\\`\\(?:.\\|\n\\)"))) | ||
| 65 | 132 | ||
| 66 | (ert-deftest rx-pcase () | 133 | (ert-deftest rx-pcase () |
| 67 | (should (equal (pcase "a 1 2 3 1 1 b" | 134 | (should (equal (pcase "a 1 2 3 1 1 b" |
| @@ -71,7 +138,11 @@ | |||
| 71 | (backref u) space | 138 | (backref u) space |
| 72 | (backref 1)) | 139 | (backref 1)) |
| 73 | (list u v))) | 140 | (list u v))) |
| 74 | '("1" "3")))) | 141 | '("1" "3"))) |
| 142 | (let ((k "blue")) | ||
| 143 | (should (equal (pcase "<blue>" | ||
| 144 | ((rx "<" (literal k) ">") 'ok)) | ||
| 145 | 'ok)))) | ||
| 75 | 146 | ||
| 76 | (ert-deftest rx-kleene () | 147 | (ert-deftest rx-kleene () |
| 77 | "Test greedy and non-greedy repetition operators." | 148 | "Test greedy and non-greedy repetition operators." |
| @@ -94,71 +165,158 @@ | |||
| 94 | (should (equal (rx (maximal-match | 165 | (should (equal (rx (maximal-match |
| 95 | (seq (* "a") (+ "b") (\? "c") (?\s "d") | 166 | (seq (* "a") (+ "b") (\? "c") (?\s "d") |
| 96 | (*? "e") (+? "f") (\?? "g") (?? "h")))) | 167 | (*? "e") (+? "f") (\?? "g") (?? "h")))) |
| 97 | "a*b+c?d?e*?f+?g??h??"))) | 168 | "a*b+c?d?e*?f+?g??h??")) |
| 169 | (should (equal (rx "a" (*) (+ (*)) (? (*) (+)) "b") | ||
| 170 | "ab"))) | ||
| 98 | 171 | ||
| 99 | (ert-deftest rx-or () | 172 | (ert-deftest rx-repeat () |
| 100 | ;; Test or-pattern reordering (Bug#34641). | 173 | (should (equal (rx (= 3 "a") (>= 51 "b") |
| 101 | (let ((s "abc")) | 174 | (** 2 11 "c") (repeat 6 "d") (repeat 4 8 "e")) |
| 102 | (should (equal (and (string-match (rx (or "abc" "ab" "a")) s) | 175 | "a\\{3\\}b\\{51,\\}c\\{2,11\\}d\\{6\\}e\\{4,8\\}")) |
| 103 | (match-string 0 s)) | 176 | (should (equal (rx (= 0 "k") (>= 0 "l") (** 0 0 "m") (repeat 0 "n") |
| 104 | "abc")) | 177 | (repeat 0 0 "o")) |
| 105 | (should (equal (and (string-match (rx (or "ab" "abc" "a")) s) | 178 | "k\\{0\\}l\\{0,\\}m\\{0\\}n\\{0\\}o\\{0\\}")) |
| 106 | (match-string 0 s)) | 179 | (should (equal (rx (opt (0+ "a"))) |
| 107 | "ab")) | 180 | "\\(?:a*\\)?")) |
| 108 | (should (equal (and (string-match (rx (or "a" "ab" "abc")) s) | 181 | (should (equal (rx (opt (= 4 "a"))) |
| 109 | (match-string 0 s)) | 182 | "a\\{4\\}?")) |
| 110 | "a"))) | 183 | (should (equal (rx "a" (** 3 7) (= 4) (>= 3) (= 4 (>= 7) (= 2)) "b") |
| 111 | ;; Test zero-argument `or'. | 184 | "ab"))) |
| 112 | (should (equal (rx (or)) regexp-unmatchable))) | 185 | |
| 186 | (ert-deftest rx-atoms () | ||
| 187 | (should (equal (rx anything) | ||
| 188 | ".\\|\n")) | ||
| 189 | (should (equal (rx line-start not-newline nonl any line-end) | ||
| 190 | "^...$")) | ||
| 191 | (should (equal (rx bol string-start string-end buffer-start buffer-end | ||
| 192 | bos eos bot eot eol) | ||
| 193 | "^\\`\\'\\`\\'\\`\\'\\`\\'$")) | ||
| 194 | (should (equal (rx point word-start word-end bow eow symbol-start symbol-end | ||
| 195 | word-boundary not-word-boundary not-wordchar) | ||
| 196 | "\\=\\<\\>\\<\\>\\_<\\_>\\b\\B\\W")) | ||
| 197 | (should (equal (rx digit numeric num control cntrl) | ||
| 198 | "[[:digit:]][[:digit:]][[:digit:]][[:cntrl:]][[:cntrl:]]")) | ||
| 199 | (should (equal (rx hex-digit hex xdigit blank) | ||
| 200 | "[[:xdigit:]][[:xdigit:]][[:xdigit:]][[:blank:]]")) | ||
| 201 | (should (equal (rx graph graphic print printing) | ||
| 202 | "[[:graph:]][[:graph:]][[:print:]][[:print:]]")) | ||
| 203 | (should (equal (rx alphanumeric alnum letter alphabetic alpha) | ||
| 204 | "[[:alnum:]][[:alnum:]][[:alpha:]][[:alpha:]][[:alpha:]]")) | ||
| 205 | (should (equal (rx ascii nonascii lower lower-case) | ||
| 206 | "[[:ascii:]][[:nonascii:]][[:lower:]][[:lower:]]")) | ||
| 207 | (should (equal (rx punctuation punct space whitespace white) | ||
| 208 | "[[:punct:]][[:punct:]][[:space:]][[:space:]][[:space:]]")) | ||
| 209 | (should (equal (rx upper upper-case word wordchar) | ||
| 210 | "[[:upper:]][[:upper:]][[:word:]][[:word:]]")) | ||
| 211 | (should (equal (rx unibyte multibyte) | ||
| 212 | "[[:unibyte:]][[:multibyte:]]"))) | ||
| 213 | |||
| 214 | (ert-deftest rx-syntax () | ||
| 215 | (should (equal (rx (syntax whitespace) (syntax punctuation) | ||
| 216 | (syntax word) (syntax symbol) | ||
| 217 | (syntax open-parenthesis) (syntax close-parenthesis)) | ||
| 218 | "\\s-\\s.\\sw\\s_\\s(\\s)")) | ||
| 219 | (should (equal (rx (syntax string-quote) (syntax paired-delimiter) | ||
| 220 | (syntax escape) (syntax character-quote) | ||
| 221 | (syntax comment-start) (syntax comment-end) | ||
| 222 | (syntax string-delimiter) (syntax comment-delimiter)) | ||
| 223 | "\\s\"\\s$\\s\\\\s/\\s<\\s>\\s|\\s!"))) | ||
| 224 | |||
| 225 | (ert-deftest rx-category () | ||
| 226 | (should (equal (rx (category space-for-indent) (category base) | ||
| 227 | (category consonant) (category base-vowel) | ||
| 228 | (category upper-diacritical-mark) | ||
| 229 | (category lower-diacritical-mark) | ||
| 230 | (category tone-mark) (category symbol) | ||
| 231 | (category digit) | ||
| 232 | (category vowel-modifying-diacritical-mark) | ||
| 233 | (category vowel-sign) (category semivowel-lower) | ||
| 234 | (category not-at-end-of-line) | ||
| 235 | (category not-at-beginning-of-line)) | ||
| 236 | "\\c \\c.\\c0\\c1\\c2\\c3\\c4\\c5\\c6\\c7\\c8\\c9\\c<\\c>")) | ||
| 237 | (should (equal (rx (category alpha-numeric-two-byte) | ||
| 238 | (category chinese-two-byte) (category greek-two-byte) | ||
| 239 | (category japanese-hiragana-two-byte) | ||
| 240 | (category indian-two-byte) | ||
| 241 | (category japanese-katakana-two-byte) | ||
| 242 | (category strong-left-to-right) | ||
| 243 | (category korean-hangul-two-byte) | ||
| 244 | (category strong-right-to-left) | ||
| 245 | (category cyrillic-two-byte) | ||
| 246 | (category combining-diacritic)) | ||
| 247 | "\\cA\\cC\\cG\\cH\\cI\\cK\\cL\\cN\\cR\\cY\\c^")) | ||
| 248 | (should (equal (rx (category ascii) (category arabic) (category chinese) | ||
| 249 | (category ethiopic) (category greek) (category korean) | ||
| 250 | (category indian) (category japanese) | ||
| 251 | (category japanese-katakana) (category latin) | ||
| 252 | (category lao) (category tibetan)) | ||
| 253 | "\\ca\\cb\\cc\\ce\\cg\\ch\\ci\\cj\\ck\\cl\\co\\cq")) | ||
| 254 | (should (equal (rx (category japanese-roman) (category thai) | ||
| 255 | (category vietnamese) (category hebrew) | ||
| 256 | (category cyrillic) (category can-break)) | ||
| 257 | "\\cr\\ct\\cv\\cw\\cy\\c|")) | ||
| 258 | (should (equal (rx (category ?g) (not (category ?~))) | ||
| 259 | "\\cg\\C~"))) | ||
| 260 | |||
| 261 | (ert-deftest rx-not () | ||
| 262 | (should (equal (rx (not word-boundary)) | ||
| 263 | "\\B")) | ||
| 264 | (should (equal (rx (not ascii) (not lower-case) (not wordchar)) | ||
| 265 | "[^[:ascii:]][^[:lower:]][^[:word:]]")) | ||
| 266 | (should (equal (rx (not (syntax punctuation)) (not (syntax escape))) | ||
| 267 | "\\S.\\S\\")) | ||
| 268 | (should (equal (rx (not (category tone-mark)) (not (category lao))) | ||
| 269 | "\\C4\\Co"))) | ||
| 270 | |||
| 271 | (ert-deftest rx-group () | ||
| 272 | (should (equal (rx (group nonl) (submatch "x") | ||
| 273 | (group-n 3 "y") (submatch-n 13 "z") (backref 1)) | ||
| 274 | "\\(.\\)\\(x\\)\\(?3:y\\)\\(?13:z\\)\\1")) | ||
| 275 | (should (equal (rx (group) (group-n 2)) | ||
| 276 | "\\(\\)\\(?2:\\)"))) | ||
| 277 | |||
| 278 | (ert-deftest rx-regexp () | ||
| 279 | (should (equal (rx (regexp "abc") (regex "[de]")) | ||
| 280 | "\\(?:abc\\)[de]")) | ||
| 281 | (let ((x "a*")) | ||
| 282 | (should (equal (rx (regexp x) "b") | ||
| 283 | "\\(?:a*\\)b")) | ||
| 284 | (should (equal (rx "" (regexp x) (eval "")) | ||
| 285 | "a*")))) | ||
| 286 | |||
| 287 | (ert-deftest rx-eval () | ||
| 288 | (should (equal (rx (eval (list 'syntax 'symbol))) | ||
| 289 | "\\s_")) | ||
| 290 | (should (equal (rx "a" (eval (concat)) "b") | ||
| 291 | "ab"))) | ||
| 292 | |||
| 293 | (ert-deftest rx-literal () | ||
| 294 | (should (equal (rx (literal (char-to-string 42)) nonl) | ||
| 295 | "\\*.")) | ||
| 296 | (let ((x "a+b")) | ||
| 297 | (should (equal (rx (opt (literal (upcase x)))) | ||
| 298 | "\\(?:A\\+B\\)?")))) | ||
| 299 | |||
| 300 | (ert-deftest rx-to-string () | ||
| 301 | (should (equal (rx-to-string '(or nonl "\nx")) | ||
| 302 | "\\(?:.\\|\nx\\)")) | ||
| 303 | (should (equal (rx-to-string '(or nonl "\nx") t) | ||
| 304 | ".\\|\nx"))) | ||
| 305 | |||
| 306 | |||
| 307 | (ert-deftest rx-constituents () | ||
| 308 | (let ((rx-constituents | ||
| 309 | (append '((beta . gamma) | ||
| 310 | (gamma . "a*b") | ||
| 311 | (delta . ((lambda (form) | ||
| 312 | (regexp-quote (format "<%S>" form))) | ||
| 313 | 1 nil symbolp)) | ||
| 314 | (epsilon . delta)) | ||
| 315 | rx-constituents))) | ||
| 316 | (should (equal (rx-to-string '(seq (+ beta) nonl gamma) t) | ||
| 317 | "\\(?:a*b\\)+.\\(?:a*b\\)")) | ||
| 318 | (should (equal (rx-to-string '(seq (delta a b c) (* (epsilon d e))) t) | ||
| 319 | "\\(?:<(delta a b c)>\\)\\(?:<(epsilon d e)>\\)*")))) | ||
| 113 | 320 | ||
| 114 | (ert-deftest rx-seq () | ||
| 115 | ;; Test zero-argument `seq'. | ||
| 116 | (should (equal (rx (seq)) ""))) | ||
| 117 | |||
| 118 | (defmacro rx-tests--match (regexp string &optional match) | ||
| 119 | (macroexp-let2 nil strexp string | ||
| 120 | `(ert-info ((format "Matching %S to %S" ',regexp ,strexp)) | ||
| 121 | (should (string-match ,regexp ,strexp)) | ||
| 122 | ,@(when match | ||
| 123 | `((should (equal (match-string 0 ,strexp) ,match))))))) | ||
| 124 | |||
| 125 | (ert-deftest rx-nonstring-expr () | ||
| 126 | (let ((bee "b") | ||
| 127 | (vowel "[aeiou]")) | ||
| 128 | (rx-tests--match (rx "a" (literal bee) "c") "abc") | ||
| 129 | (rx-tests--match (rx "a" (regexp bee) "c") "abc") | ||
| 130 | (rx-tests--match (rx "a" (or (regexp bee) "xy") "c") "abc") | ||
| 131 | (rx-tests--match (rx "a" (or "xy" (regexp bee)) "c") "abc") | ||
| 132 | (should-not (string-match (rx (or (regexp bee) "xy")) "")) | ||
| 133 | (rx-tests--match (rx "a" (= 3 (regexp bee)) "c") "abbbc") | ||
| 134 | (rx-tests--match (rx "x" (= 3 (regexp vowel)) "z") "xeoez") | ||
| 135 | (should-not (string-match (rx "x" (= 3 (regexp vowel)) "z") "xe[]z")) | ||
| 136 | (rx-tests--match (rx "x" (= 3 (literal vowel)) "z") | ||
| 137 | "x[aeiou][aeiou][aeiou]z") | ||
| 138 | (rx-tests--match (rx "x" (repeat 1 (regexp vowel)) "z") "xaz") | ||
| 139 | (rx-tests--match (rx "x" (repeat 1 2 (regexp vowel)) "z") "xaz") | ||
| 140 | (rx-tests--match (rx "x" (repeat 1 2 (regexp vowel)) "z") "xauz") | ||
| 141 | (rx-tests--match (rx "x" (>= 1 (regexp vowel)) "z") "xaiiz") | ||
| 142 | (rx-tests--match (rx "x" (** 1 2 (regexp vowel)) "z") "xaiz") | ||
| 143 | (rx-tests--match (rx "x" (group (regexp vowel)) "z") "xaz") | ||
| 144 | (rx-tests--match (rx "x" (group-n 1 (regexp vowel)) "z") "xaz") | ||
| 145 | (rx-tests--match (rx "x" (? (regexp vowel)) "z") "xz"))) | ||
| 146 | |||
| 147 | (ert-deftest rx-nonstring-expr-non-greedy () | ||
| 148 | "`rx's greediness can't affect runtime regexp parts." | ||
| 149 | (let ((ad-min "[ad]*?") | ||
| 150 | (ad-max "[ad]*") | ||
| 151 | (ad "[ad]")) | ||
| 152 | (rx-tests--match (rx "c" (regexp ad-min) "a") "cdaaada" "cda") | ||
| 153 | (rx-tests--match (rx "c" (regexp ad-max) "a") "cdaaada" "cdaaada") | ||
| 154 | (rx-tests--match (rx "c" (minimal-match (regexp ad-max)) "a") "cdaaada" "cdaaada") | ||
| 155 | (rx-tests--match (rx "c" (maximal-match (regexp ad-min)) "a") "cdaaada" "cda") | ||
| 156 | (rx-tests--match (rx "c" (minimal-match (0+ (regexp ad))) "a") "cdaaada" "cda") | ||
| 157 | (rx-tests--match (rx "c" (maximal-match (0+ (regexp ad))) "a") "cdaaada" "cdaaada"))) | ||
| 158 | |||
| 159 | (ert-deftest rx-to-string-lisp-forms () | ||
| 160 | (rx-tests--match (rx-to-string '(seq "a" (literal "b") "c")) "abc") | ||
| 161 | (rx-tests--match (rx-to-string '(seq "a" (regexp "b") "c")) "abc")) | ||
| 162 | 321 | ||
| 163 | (provide 'rx-tests) | 322 | (provide 'rx-tests) |
| 164 | ;; rx-tests.el ends here. | ||