diff options
| author | Stefan Monnier | 2004-04-23 21:23:29 +0000 |
|---|---|---|
| committer | Stefan Monnier | 2004-04-23 21:23:29 +0000 |
| commit | ccfbe679888d8c3431b9946ff2e42d4e4c1c0816 (patch) | |
| tree | a4b613537ad83dac6ec7ee397b913aaf243ebd8b | |
| parent | 4b284383bb198e1c5ea311431bdc46947b4a46ef (diff) | |
| download | emacs-ccfbe679888d8c3431b9946ff2e42d4e4c1c0816.tar.gz emacs-ccfbe679888d8c3431b9946ff2e42d4e4c1c0816.zip | |
Doc fixes.
(rx-constituents): Add/extend many forms.
(rx-check): Check form is a list.
(bracket): Defvar.
(rx-check-any, rx-any, rx-check-not): Modify.
(rx-not): Simplify.
(rx-trans-forms, rx-=, rx->=, rx-**, rx-not-char, rx-not-syntax): New.
(rx-kleene): Use rx-trans-forms.
(rx-quote-for-set): Delete.
(rx): Allow multiple args.
| -rw-r--r-- | lisp/emacs-lisp/rx.el | 427 |
1 files changed, 295 insertions, 132 deletions
diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el index 6656cf5ed3c..042d711ee3d 100644 --- a/lisp/emacs-lisp/rx.el +++ b/lisp/emacs-lisp/rx.el | |||
| @@ -32,6 +32,22 @@ | |||
| 32 | ;; from the bugs mentioned in the commentary section of Sregex, and | 32 | ;; from the bugs mentioned in the commentary section of Sregex, and |
| 33 | ;; uses a nicer syntax (IMHO, of course :-). | 33 | ;; uses a nicer syntax (IMHO, of course :-). |
| 34 | 34 | ||
| 35 | ;; This significantly extended version of the original is almost | ||
| 36 | ;; compatible with Sregex. The only incompatibility I (fx) know of is | ||
| 37 | ;; that the `repeat' form can't have multiple regexp args. | ||
| 38 | |||
| 39 | ;; Now alternative forms are provided for a degree of compatibility | ||
| 40 | ;; with Shivers' attempted definitive SRE notation | ||
| 41 | ;; <URL:http://www.ai.mit.edu/~/shivers/sre.txt>. SRE forms not | ||
| 42 | ;; catered for include: dsm, uncase, w/case, w/nocase, ,@<exp>, | ||
| 43 | ;; ,<exp>, (word ...), word+, posix-string, and character class forms. | ||
| 44 | ;; Some forms are inconsistent with SRE, either for historical reasons | ||
| 45 | ;; or because of the implementation -- simple translation into Emacs | ||
| 46 | ;; regexp strings. These include: any, word. Also, case-sensitivity | ||
| 47 | ;; and greediness are controlled by variables external to the regexp, | ||
| 48 | ;; and you need to feed the forms to the `posix-' functions to get | ||
| 49 | ;; SRE's POSIX semantics. There are probably more difficulties. | ||
| 50 | |||
| 35 | ;; Rx translates a sexp notation for regular expressions into the | 51 | ;; Rx translates a sexp notation for regular expressions into the |
| 36 | ;; usual string notation. The translation can be done at compile-time | 52 | ;; usual string notation. The translation can be done at compile-time |
| 37 | ;; by using the `rx' macro. It can be done at run-time by calling | 53 | ;; by using the `rx' macro. It can be done at run-time by calling |
| @@ -94,62 +110,103 @@ | |||
| 94 | 110 | ||
| 95 | ;;; Code: | 111 | ;;; Code: |
| 96 | 112 | ||
| 97 | |||
| 98 | (defconst rx-constituents | 113 | (defconst rx-constituents |
| 99 | '((and . (rx-and 1 nil)) | 114 | '((and . (rx-and 1 nil)) |
| 115 | (seq . and) ; SRE | ||
| 116 | (: . and) ; SRE | ||
| 117 | (sequence . and) ; sregex | ||
| 100 | (or . (rx-or 1 nil)) | 118 | (or . (rx-or 1 nil)) |
| 119 | (| . or) ; SRE | ||
| 101 | (not-newline . ".") | 120 | (not-newline . ".") |
| 121 | (nonl . not-newline) ; SRE | ||
| 102 | (anything . ".\\|\n") | 122 | (anything . ".\\|\n") |
| 103 | (any . (rx-any 1 1 rx-check-any)) | 123 | (any . (rx-any 1 nil rx-check-any)) ; inconsistent with SRE |
| 104 | (in . any) | 124 | (in . any) |
| 125 | (char . any) ; sregex | ||
| 126 | (not-char . (rx-not-char 1 nil rx-check-any)) ; sregex | ||
| 105 | (not . (rx-not 1 1 rx-check-not)) | 127 | (not . (rx-not 1 1 rx-check-not)) |
| 128 | ;; Partially consistent with sregex, whose `repeat' is like our | ||
| 129 | ;; `**'. (`repeat' with optional max arg and multiple sexp forms | ||
| 130 | ;; is ambiguous.) | ||
| 106 | (repeat . (rx-repeat 2 3)) | 131 | (repeat . (rx-repeat 2 3)) |
| 107 | (submatch . (rx-submatch 1 nil)) | 132 | (= . (rx-= 2 nil)) ; SRE |
| 133 | (>= . (rx->= 2 nil)) ; SRE | ||
| 134 | (** . (rx-** 2 nil)) ; SRE | ||
| 135 | (submatch . (rx-submatch 1 nil)) ; SRE | ||
| 108 | (group . submatch) | 136 | (group . submatch) |
| 109 | (zero-or-more . (rx-kleene 1 1)) | 137 | (zero-or-more . (rx-kleene 1 nil)) |
| 110 | (one-or-more . (rx-kleene 1 1)) | 138 | (one-or-more . (rx-kleene 1 nil)) |
| 111 | (zero-or-one . (rx-kleene 1 1)) | 139 | (zero-or-one . (rx-kleene 1 nil)) |
| 112 | (\? . zero-or-one) | 140 | (\? . zero-or-one) ; SRE |
| 113 | (\?? . zero-or-one) | 141 | (\?? . zero-or-one) |
| 114 | (* . zero-or-more) | 142 | (* . zero-or-more) ; SRE |
| 115 | (*? . zero-or-more) | 143 | (*? . zero-or-more) |
| 116 | (0+ . zero-or-more) | 144 | (0+ . zero-or-more) |
| 117 | (+ . one-or-more) | 145 | (+ . one-or-more) ; SRE |
| 118 | (+? . one-or-more) | 146 | (+? . one-or-more) |
| 119 | (1+ . one-or-more) | 147 | (1+ . one-or-more) |
| 120 | (optional . zero-or-one) | 148 | (optional . zero-or-one) |
| 149 | (opt . zero-or-one) ; sregex | ||
| 121 | (minimal-match . (rx-greedy 1 1)) | 150 | (minimal-match . (rx-greedy 1 1)) |
| 122 | (maximal-match . (rx-greedy 1 1)) | 151 | (maximal-match . (rx-greedy 1 1)) |
| 123 | (backref . (rx-backref 1 1 rx-check-backref)) | 152 | (backref . (rx-backref 1 1 rx-check-backref)) |
| 124 | (line-start . "^") | 153 | (line-start . "^") |
| 154 | (bol . line-start) ; SRE | ||
| 125 | (line-end . "$") | 155 | (line-end . "$") |
| 156 | (eol . line-end) ; SRE | ||
| 126 | (string-start . "\\`") | 157 | (string-start . "\\`") |
| 158 | (bos . string-start) ; SRE | ||
| 159 | (bot . string-start) ; sregex | ||
| 127 | (string-end . "\\'") | 160 | (string-end . "\\'") |
| 161 | (eos . string-end) ; SRE | ||
| 162 | (eot . string-end) ; sregex | ||
| 128 | (buffer-start . "\\`") | 163 | (buffer-start . "\\`") |
| 129 | (buffer-end . "\\'") | 164 | (buffer-end . "\\'") |
| 130 | (point . "\\=") | 165 | (point . "\\=") |
| 131 | (word-start . "\\<") | 166 | (word-start . "\\<") |
| 167 | (bow . word-start) ; SRE | ||
| 132 | (word-end . "\\>") | 168 | (word-end . "\\>") |
| 169 | (eow . word-end) ; SRE | ||
| 133 | (word-boundary . "\\b") | 170 | (word-boundary . "\\b") |
| 171 | (not-word-boundary . "\\B") ; sregex | ||
| 134 | (syntax . (rx-syntax 1 1)) | 172 | (syntax . (rx-syntax 1 1)) |
| 173 | (not-syntax . (rx-not-syntax 1 1)) ; sregex | ||
| 135 | (category . (rx-category 1 1 rx-check-category)) | 174 | (category . (rx-category 1 1 rx-check-category)) |
| 136 | (eval . (rx-eval 1 1)) | 175 | (eval . (rx-eval 1 1)) |
| 137 | (regexp . (rx-regexp 1 1 stringp)) | 176 | (regexp . (rx-regexp 1 1 stringp)) |
| 138 | (digit . "[[:digit:]]") | 177 | (digit . "[[:digit:]]") |
| 139 | (control . "[[:cntrl:]]") | 178 | (numeric . digit) ; SRE |
| 140 | (hex-digit . "[[:xdigit:]]") | 179 | (num . digit) ; SRE |
| 141 | (blank . "[[:blank:]]") | 180 | (control . "[[:cntrl:]]") ; SRE |
| 142 | (graphic . "[[:graph:]]") | 181 | (cntrl . control) ; SRE |
| 143 | (printing . "[[:print:]]") | 182 | (hex-digit . "[[:xdigit:]]") ; SRE |
| 144 | (alphanumeric . "[[:alnum:]]") | 183 | (hex . hex-digit) ; SRE |
| 184 | (xdigit . hex-digit) ; SRE | ||
| 185 | (blank . "[[:blank:]]") ; SRE | ||
| 186 | (graphic . "[[:graph:]]") ; SRE | ||
| 187 | (graph . graphic) ; SRE | ||
| 188 | (printing . "[[:print:]]") ; SRE | ||
| 189 | (print . printing) ; SRE | ||
| 190 | (alphanumeric . "[[:alnum:]]") ; SRE | ||
| 191 | (alnum . alphanumeric) ; SRE | ||
| 145 | (letter . "[[:alpha:]]") | 192 | (letter . "[[:alpha:]]") |
| 146 | (ascii . "[[:ascii:]]") | 193 | (alphabetic . letter) ; SRE |
| 194 | (alpha . letter) ; SRE | ||
| 195 | (ascii . "[[:ascii:]]") ; SRE | ||
| 147 | (nonascii . "[[:nonascii:]]") | 196 | (nonascii . "[[:nonascii:]]") |
| 148 | (lower . "[[:lower:]]") | 197 | (lower . "[[:lower:]]") ; SRE |
| 149 | (punctuation . "[[:punct:]]") | 198 | (lower-case . lower) ; SRE |
| 150 | (space . "[[:space:]]") | 199 | (punctuation . "[[:punct:]]") ; SRE |
| 151 | (upper . "[[:upper:]]") | 200 | (punct . punctuation) ; SRE |
| 152 | (word . "[[:word:]]")) | 201 | (space . "[[:space:]]") ; SRE |
| 202 | (whitespace . space) ; SRE | ||
| 203 | (white . space) ; SRE | ||
| 204 | (upper . "[[:upper:]]") ; SRE | ||
| 205 | (upper-case . upper) ; SRE | ||
| 206 | (word . "[[:word:]]") ; inconsistent with SRE | ||
| 207 | (wordchar . word) ; sregex | ||
| 208 | (not-wordchar . "[^[:word:]]") ; sregex (use \\W?) | ||
| 209 | ) | ||
| 153 | "Alist of sexp form regexp constituents. | 210 | "Alist of sexp form regexp constituents. |
| 154 | Each element of the alist has the form (SYMBOL . DEFN). | 211 | Each element of the alist has the form (SYMBOL . DEFN). |
| 155 | SYMBOL is a valid constituent of sexp regular expressions. | 212 | SYMBOL is a valid constituent of sexp regular expressions. |
| @@ -178,7 +235,23 @@ all arguments must satisfy PREDICATE.") | |||
| 178 | (comment-start . ?<) | 235 | (comment-start . ?<) |
| 179 | (comment-end . ?>) | 236 | (comment-end . ?>) |
| 180 | (string-delimiter . ?|) | 237 | (string-delimiter . ?|) |
| 181 | (comment-delimiter . ?!)) | 238 | (comment-delimiter . ?!) |
| 239 | ;; sregex compatibility | ||
| 240 | (- . ?-) | ||
| 241 | (\. . ?.) | ||
| 242 | (w . ?w) | ||
| 243 | (_ . ?_) | ||
| 244 | (\( . ?\() | ||
| 245 | (\) . ?\)) | ||
| 246 | (\' . ?\') | ||
| 247 | (\" . ?\") | ||
| 248 | (\$ . ?$) | ||
| 249 | (\\ . ?\\) | ||
| 250 | (/ . ?/) | ||
| 251 | (< . ?<) | ||
| 252 | (> . ?>) | ||
| 253 | (| . ?|) | ||
| 254 | (! . ?!)) | ||
| 182 | "Alist mapping Rx syntax symbols to syntax characters. | 255 | "Alist mapping Rx syntax symbols to syntax characters. |
| 183 | Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid | 256 | Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid |
| 184 | symbol in `(syntax SYMBOL)', and CHAR is the syntax character | 257 | symbol in `(syntax SYMBOL)', and CHAR is the syntax character |
| @@ -252,6 +325,8 @@ See also `rx-constituents'." | |||
| 252 | 325 | ||
| 253 | (defun rx-check (form) | 326 | (defun rx-check (form) |
| 254 | "Check FORM according to its car's parsing info." | 327 | "Check FORM according to its car's parsing info." |
| 328 | (unless (listp form) | ||
| 329 | (error "rx `%s' needs argument(s)" form)) | ||
| 255 | (let* ((rx (rx-info (car form))) | 330 | (let* ((rx (rx-info (car form))) |
| 256 | (nargs (1- (length form))) | 331 | (nargs (1- (length form))) |
| 257 | (min-args (nth 1 rx)) | 332 | (min-args (nth 1 rx)) |
| @@ -297,53 +372,61 @@ FORM is of the form `(and FORM1 ...)'." | |||
| 297 | "\\)"))) | 372 | "\\)"))) |
| 298 | 373 | ||
| 299 | 374 | ||
| 300 | (defun rx-quote-for-set (string) | 375 | (defvar bracket) ; dynamically bound in `rx-any' |
| 301 | "Transform STRING for use in a character set. | ||
| 302 | If STRING contains a `]', move it to the front. | ||
| 303 | If STRING starts with a '^', move it to the end." | ||
| 304 | (when (string-match "\\`\\(\\(?:.\\|\n\\)+\\)\\]\\(\\(?:.\\|\n\\)\\)*\\'" | ||
| 305 | string) | ||
| 306 | (setq string (concat "]" (match-string 1 string) | ||
| 307 | (match-string 2 string)))) | ||
| 308 | (when (string-match "\\`^\\(\\(?:.\\|\n\\)+\\)\\'" string) | ||
| 309 | (setq string (concat (substring string 1) "^"))) | ||
| 310 | string) | ||
| 311 | |||
| 312 | 376 | ||
| 313 | (defun rx-check-any (arg) | 377 | (defun rx-check-any (arg) |
| 314 | "Check arg ARG for Rx `any'." | 378 | "Check arg ARG for Rx `any'." |
| 315 | (cond ((integerp arg) t) | 379 | (if (integerp arg) |
| 316 | ((and (stringp arg) (zerop (length arg))) | 380 | (setq arg (string arg))) |
| 317 | (error "String arg for rx `any' must not be empty")) | 381 | (when (stringp arg) |
| 318 | ((stringp arg) t) | 382 | (if (zerop (length arg)) |
| 319 | (t | 383 | (error "String arg for Rx `any' must not be empty")) |
| 320 | (error "rx `any' requires string or character arg")))) | 384 | ;; Quote ^ at start; don't bother to check whether this is first arg. |
| 321 | 385 | (if (eq ?^ (aref arg 0)) | |
| 386 | (setq arg (concat "\\" arg))) | ||
| 387 | ;; Remove ] and set flag for adding it to start of overall result. | ||
| 388 | (when (string-match "]" arg) | ||
| 389 | (setq arg (replace-regexp-in-string "]" "" arg) | ||
| 390 | bracket "]"))) | ||
| 391 | (when (symbolp arg) | ||
| 392 | (let ((translation (condition-case nil | ||
| 393 | (rx-to-string arg 'no-group) | ||
| 394 | (error nil)))) | ||
| 395 | (unless translation (error "Invalid char class `%s' in Rx `any'" arg)) | ||
| 396 | (setq arg (substring translation 1 -1)))) ; strip outer brackets | ||
| 397 | ;; sregex compatibility | ||
| 398 | (when (and (integerp (car-safe arg)) | ||
| 399 | (integerp (cdr-safe arg))) | ||
| 400 | (setq arg (string (car arg) ?- (cdr arg)))) | ||
| 401 | (unless (stringp arg) | ||
| 402 | (error "rx `any' requires string, character, char pair or char class args")) | ||
| 403 | arg) | ||
| 322 | 404 | ||
| 323 | (defun rx-any (form) | 405 | (defun rx-any (form) |
| 324 | "Parse and produce code from FORM, which is `(any STRING)'. | 406 | "Parse and produce code from FORM, which is `(any ARG ...)'. |
| 325 | STRING is optional. If it is omitted, build a regexp that | 407 | ARG is optional." |
| 326 | matches anything." | ||
| 327 | (rx-check form) | 408 | (rx-check form) |
| 328 | (let ((arg (cadr form))) | 409 | (let* (bracket |
| 329 | (cond ((integerp arg) | 410 | (args (mapcar #'rx-check-any (cdr form)))) ; side-effects `bracket' |
| 330 | (char-to-string arg)) | 411 | ;; If there was a ?- in the form, move it to the front to avoid |
| 331 | ((= (length arg) 1) | 412 | ;; accidental range. |
| 332 | arg) | 413 | (if (member "-" args) |
| 333 | (t | 414 | (setq args (cons "-" (delete "-" args)))) |
| 334 | (concat "[" (rx-quote-for-set (cadr form)) "]"))))) | 415 | (apply #'concat "[" bracket (append args '("]"))))) |
| 335 | 416 | ||
| 336 | 417 | ||
| 337 | (defun rx-check-not (arg) | 418 | (defun rx-check-not (arg) |
| 338 | "Check arg ARG for Rx `not'." | 419 | "Check arg ARG for Rx `not'." |
| 339 | (unless (or (memq form | 420 | (unless (or (and (symbolp arg) |
| 340 | '(digit control hex-digit blank graphic printing | 421 | (string-match "\\`\\[\\[:[-a-z]:]]\\'" |
| 341 | alphanumeric letter ascii nonascii lower | 422 | (condition-case nil |
| 342 | punctuation space upper word)) | 423 | (rx-to-string arg 'no-group) |
| 343 | (and (consp form) | 424 | (error "")))) |
| 344 | (memq (car form) '(not any in syntax category:)))) | 425 | (eq arg 'word-boundary) |
| 345 | (error "rx `not' syntax error: %s" form)) | 426 | (and (consp arg) |
| 346 | t) | 427 | (memq (car arg) '(not any in syntax category)))) |
| 428 | (error "rx `not' syntax error: %s" arg)) | ||
| 429 | t) | ||
| 347 | 430 | ||
| 348 | 431 | ||
| 349 | (defun rx-not (form) | 432 | (defun rx-not (form) |
| @@ -355,24 +438,67 @@ matches anything." | |||
| 355 | (if (= (length result) 4) | 438 | (if (= (length result) 4) |
| 356 | (substring result 2 3) | 439 | (substring result 2 3) |
| 357 | (concat "[" (substring result 2)))) | 440 | (concat "[" (substring result 2)))) |
| 358 | ((string-match "\\`\\[" result) | 441 | ((eq ?\[ (aref result 0)) |
| 359 | (concat "[^" (substring result 1))) | 442 | (concat "[^" (substring result 1))) |
| 360 | ((string-match "\\`\\\\s." result) | 443 | ((string-match "\\`\\\\[scb]" result) |
| 361 | (concat "\\S" (substring result 2))) | 444 | (concat (capitalize (substring result 0 2)) (substring result 2))) |
| 362 | ((string-match "\\`\\\\S." result) | ||
| 363 | (concat "\\s" (substring result 2))) | ||
| 364 | ((string-match "\\`\\\\c." result) | ||
| 365 | (concat "\\C" (substring result 2))) | ||
| 366 | ((string-match "\\`\\\\C." result) | ||
| 367 | (concat "\\c" (substring result 2))) | ||
| 368 | ((string-match "\\`\\\\B" result) | ||
| 369 | (concat "\\b" (substring result 2))) | ||
| 370 | ((string-match "\\`\\\\b" result) | ||
| 371 | (concat "\\B" (substring result 2))) | ||
| 372 | (t | 445 | (t |
| 373 | (concat "[^" result "]"))))) | 446 | (concat "[^" result "]"))))) |
| 374 | 447 | ||
| 375 | 448 | ||
| 449 | (defun rx-not-char (form) | ||
| 450 | "Parse and produce code from FORM. FORM is `(not-char ...)'." | ||
| 451 | (rx-check form) | ||
| 452 | (rx-not `(not (in ,@(cdr form))))) | ||
| 453 | |||
| 454 | |||
| 455 | (defun rx-not-syntax (form) | ||
| 456 | "Parse and produce code from FORM. FORM is `(not-syntax SYNTAX)'." | ||
| 457 | (rx-check form) | ||
| 458 | (rx-not `(not (syntax ,@(cdr form))))) | ||
| 459 | |||
| 460 | |||
| 461 | (defun rx-trans-forms (form &optional skip) | ||
| 462 | "If FORM's length is greater than two, transform it to length two. | ||
| 463 | A form (HEAD REST ...) becomes (HEAD (and REST ...)). | ||
| 464 | If SKIP is non-nil, allow that number of items after the head, i.e. | ||
| 465 | `(= N REST ...)' becomes `(= N (and REST ...))' if SKIP is 1." | ||
| 466 | (unless skip (setq skip 0)) | ||
| 467 | (let ((tail (nthcdr (1+ skip) form))) | ||
| 468 | (if (= (length tail) 1) | ||
| 469 | form | ||
| 470 | (let ((form (copy-sequence form))) | ||
| 471 | (setcdr (nthcdr skip form) (list (cons 'and tail))) | ||
| 472 | form)))) | ||
| 473 | |||
| 474 | |||
| 475 | (defun rx-= (form) | ||
| 476 | "Parse and produce code from FORM `(= N ...)'." | ||
| 477 | (rx-check form) | ||
| 478 | (setq form (rx-trans-forms form 1)) | ||
| 479 | (unless (and (integerp (nth 1 form)) | ||
| 480 | (> (nth 1 form) 0)) | ||
| 481 | (error "rx `=' requires positive integer first arg")) | ||
| 482 | (format "%s\\{%d\\}" (rx-to-string (nth 2 form)) (nth 1 form))) | ||
| 483 | |||
| 484 | |||
| 485 | (defun rx->= (form) | ||
| 486 | "Parse and produce code from FORM `(>= N ...)'." | ||
| 487 | (rx-check form) | ||
| 488 | (setq form (rx-trans-forms form 1)) | ||
| 489 | (unless (and (integerp (nth 1 form)) | ||
| 490 | (> (nth 1 form) 0)) | ||
| 491 | (error "rx `>=' requires positive integer first arg")) | ||
| 492 | (format "%s\\{%d,\\}" (rx-to-string (nth 2 form)) (nth 1 form))) | ||
| 493 | |||
| 494 | |||
| 495 | (defun rx-** (form) | ||
| 496 | "Parse and produce code from FORM `(** N M ...)'." | ||
| 497 | (rx-check form) | ||
| 498 | (setq form (cons 'repeat (cdr (rx-trans-forms form 2)))) | ||
| 499 | (rx-to-string form)) | ||
| 500 | |||
| 501 | |||
| 376 | (defun rx-repeat (form) | 502 | (defun rx-repeat (form) |
| 377 | "Parse and produce code from FORM. | 503 | "Parse and produce code from FORM. |
| 378 | FORM is either `(repeat N FORM1)' or `(repeat N M FORM1)'." | 504 | FORM is either `(repeat N FORM1)' or `(repeat N M FORM1)'." |
| @@ -419,6 +545,7 @@ If OP is one of `*?', `+?', `??', produce a non-greedy regexp. | |||
| 419 | If OP is anything else, produce a greedy regexp if `rx-greedy-flag' | 545 | If OP is anything else, produce a greedy regexp if `rx-greedy-flag' |
| 420 | is non-nil." | 546 | is non-nil." |
| 421 | (rx-check form) | 547 | (rx-check form) |
| 548 | (setq form (rx-trans-forms form)) | ||
| 422 | (let ((suffix (cond ((memq (car form) '(* + ? )) "") | 549 | (let ((suffix (cond ((memq (car form) '(* + ? )) "") |
| 423 | ((memq (car form) '(*? +? ??)) "?") | 550 | ((memq (car form) '(*? +? ??)) "?") |
| 424 | (rx-greedy-flag "") | 551 | (rx-greedy-flag "") |
| @@ -483,7 +610,7 @@ of all atomic regexps." | |||
| 483 | 610 | ||
| 484 | 611 | ||
| 485 | (defun rx-category (form) | 612 | (defun rx-category (form) |
| 486 | "Parse and produce code from FORM, which is `(category SYMBOL ...)'." | 613 | "Parse and produce code from FORM, which is `(category SYMBOL)'." |
| 487 | (rx-check form) | 614 | (rx-check form) |
| 488 | (let ((char (if (integerp (cadr form)) | 615 | (let ((char (if (integerp (cadr form)) |
| 489 | (cadr form) | 616 | (cadr form) |
| @@ -543,8 +670,9 @@ NO-GROUP non-nil means don't put shy groups around the result." | |||
| 543 | 670 | ||
| 544 | 671 | ||
| 545 | ;;;###autoload | 672 | ;;;###autoload |
| 546 | (defmacro rx (regexp) | 673 | (defmacro rx (&rest regexps) |
| 547 | "Translate a regular expression REGEXP in sexp form to a regexp string. | 674 | "Translate regular expressions REGEXPS in sexp form to a regexp string. |
| 675 | REGEXPS is a non-empty sequence of forms of the sort listed below. | ||
| 548 | See also `rx-to-string' for how to do such a translation at run-time. | 676 | See also `rx-to-string' for how to do such a translation at run-time. |
| 549 | 677 | ||
| 550 | The following are valid subforms of regular expressions in sexp | 678 | The following are valid subforms of regular expressions in sexp |
| @@ -556,53 +684,58 @@ STRING | |||
| 556 | CHAR | 684 | CHAR |
| 557 | matches character CHAR literally. | 685 | matches character CHAR literally. |
| 558 | 686 | ||
| 559 | `not-newline' | 687 | `not-newline', `nonl' |
| 560 | matches any character except a newline. | 688 | matches any character except a newline. |
| 561 | . | 689 | . |
| 562 | `anything' | 690 | `anything' |
| 563 | matches any character | 691 | matches any character |
| 564 | 692 | ||
| 565 | `(any SET)' | 693 | `(any SET ...)' |
| 566 | matches any character in SET. SET may be a character or string. | 694 | `(in SET ...)' |
| 695 | `(char SET ...)' | ||
| 696 | matches any character in SET .... SET may be a character or string. | ||
| 567 | Ranges of characters can be specified as `A-Z' in strings. | 697 | Ranges of characters can be specified as `A-Z' in strings. |
| 698 | Ranges may also be specified as conses like `(?A . ?Z)'. | ||
| 568 | 699 | ||
| 569 | '(in SET)' | 700 | SET may also be the name of a character class: `digit', |
| 570 | like `any'. | 701 | `control', `hex-digit', `blank', `graph', `print', `alnum', |
| 702 | `alpha', `ascii', `nonascii', `lower', `punct', `space', `upper', | ||
| 703 | `word', or one of their synonyms. | ||
| 571 | 704 | ||
| 572 | `(not (any SET))' | 705 | `(not (any SET ...))' |
| 573 | matches any character not in SET | 706 | matches any character not in SET ... |
| 574 | 707 | ||
| 575 | `line-start' | 708 | `line-start', `bol' |
| 576 | matches the empty string, but only at the beginning of a line | 709 | matches the empty string, but only at the beginning of a line |
| 577 | in the text being matched | 710 | in the text being matched |
| 578 | 711 | ||
| 579 | `line-end' | 712 | `line-end', `eol' |
| 580 | is similar to `line-start' but matches only at the end of a line | 713 | is similar to `line-start' but matches only at the end of a line |
| 581 | 714 | ||
| 582 | `string-start' | 715 | `string-start', `bos', `bot' |
| 583 | matches the empty string, but only at the beginning of the | 716 | matches the empty string, but only at the beginning of the |
| 584 | string being matched against. | 717 | string being matched against. |
| 585 | 718 | ||
| 586 | `string-end' | 719 | `string-end', `eos', `eot' |
| 587 | matches the empty string, but only at the end of the | 720 | matches the empty string, but only at the end of the |
| 588 | string being matched against. | 721 | string being matched against. |
| 589 | 722 | ||
| 590 | `buffer-start' | 723 | `buffer-start' |
| 591 | matches the empty string, but only at the beginning of the | 724 | matches the empty string, but only at the beginning of the |
| 592 | buffer being matched against. | 725 | buffer being matched against. Actually equivalent to `string-start'. |
| 593 | 726 | ||
| 594 | `buffer-end' | 727 | `buffer-end' |
| 595 | matches the empty string, but only at the end of the | 728 | matches the empty string, but only at the end of the |
| 596 | buffer being matched against. | 729 | buffer being matched against. Actually equivalent to `string-end'. |
| 597 | 730 | ||
| 598 | `point' | 731 | `point' |
| 599 | matches the empty string, but only at point. | 732 | matches the empty string, but only at point. |
| 600 | 733 | ||
| 601 | `word-start' | 734 | `word-start', `bow' |
| 602 | matches the empty string, but only at the beginning of a | 735 | matches the empty string, but only at the beginning of a |
| 603 | word. | 736 | word. |
| 604 | 737 | ||
| 605 | `word-end' | 738 | `word-end', `eow' |
| 606 | matches the empty string, but only at the end of a word. | 739 | matches the empty string, but only at the end of a word. |
| 607 | 740 | ||
| 608 | `word-boundary' | 741 | `word-boundary' |
| @@ -610,34 +743,35 @@ CHAR | |||
| 610 | word. | 743 | word. |
| 611 | 744 | ||
| 612 | `(not word-boundary)' | 745 | `(not word-boundary)' |
| 746 | `not-word-boundary' | ||
| 613 | matches the empty string, but not at the beginning or end of a | 747 | matches the empty string, but not at the beginning or end of a |
| 614 | word. | 748 | word. |
| 615 | 749 | ||
| 616 | `digit' | 750 | `digit', `numeric', `num' |
| 617 | matches 0 through 9. | 751 | matches 0 through 9. |
| 618 | 752 | ||
| 619 | `control' | 753 | `control', `cntrl' |
| 620 | matches ASCII control characters. | 754 | matches ASCII control characters. |
| 621 | 755 | ||
| 622 | `hex-digit' | 756 | `hex-digit', `hex', `xdigit' |
| 623 | matches 0 through 9, a through f and A through F. | 757 | matches 0 through 9, a through f and A through F. |
| 624 | 758 | ||
| 625 | `blank' | 759 | `blank' |
| 626 | matches space and tab only. | 760 | matches space and tab only. |
| 627 | 761 | ||
| 628 | `graphic' | 762 | `graphic', `graph' |
| 629 | matches graphic characters--everything except ASCII control chars, | 763 | matches graphic characters--everything except ASCII control chars, |
| 630 | space, and DEL. | 764 | space, and DEL. |
| 631 | 765 | ||
| 632 | `printing' | 766 | `printing', `print' |
| 633 | matches printing characters--everything except ASCII control chars | 767 | matches printing characters--everything except ASCII control chars |
| 634 | and DEL. | 768 | and DEL. |
| 635 | 769 | ||
| 636 | `alphanumeric' | 770 | `alphanumeric', `alnum' |
| 637 | matches letters and digits. (But at present, for multibyte characters, | 771 | matches letters and digits. (But at present, for multibyte characters, |
| 638 | it matches anything that has word syntax.) | 772 | it matches anything that has word syntax.) |
| 639 | 773 | ||
| 640 | `letter' | 774 | `letter', `alphabetic', `alpha' |
| 641 | matches letters. (But at present, for multibyte characters, | 775 | matches letters. (But at present, for multibyte characters, |
| 642 | it matches anything that has word syntax.) | 776 | it matches anything that has word syntax.) |
| 643 | 777 | ||
| @@ -647,25 +781,29 @@ CHAR | |||
| 647 | `nonascii' | 781 | `nonascii' |
| 648 | matches non-ASCII (multibyte) characters. | 782 | matches non-ASCII (multibyte) characters. |
| 649 | 783 | ||
| 650 | `lower' | 784 | `lower', `lower-case' |
| 651 | matches anything lower-case. | 785 | matches anything lower-case. |
| 652 | 786 | ||
| 653 | `upper' | 787 | `upper', `upper-case' |
| 654 | matches anything upper-case. | 788 | matches anything upper-case. |
| 655 | 789 | ||
| 656 | `punctuation' | 790 | `punctuation', `punct' |
| 657 | matches punctuation. (But at present, for multibyte characters, | 791 | matches punctuation. (But at present, for multibyte characters, |
| 658 | it matches anything that has non-word syntax.) | 792 | it matches anything that has non-word syntax.) |
| 659 | 793 | ||
| 660 | `space' | 794 | `space', `whitespace', `white' |
| 661 | matches anything that has whitespace syntax. | 795 | matches anything that has whitespace syntax. |
| 662 | 796 | ||
| 663 | `word' | 797 | `word', `wordchar' |
| 664 | matches anything that has word syntax. | 798 | matches anything that has word syntax. |
| 665 | 799 | ||
| 800 | `not-wordchar' | ||
| 801 | matches anything that has non-word syntax. | ||
| 802 | |||
| 666 | `(syntax SYNTAX)' | 803 | `(syntax SYNTAX)' |
| 667 | matches a character with syntax SYNTAX. SYNTAX must be one | 804 | matches a character with syntax SYNTAX. SYNTAX must be one |
| 668 | of the following symbols. | 805 | of the following symbols, or a symbol corresponding to the syntax |
| 806 | character, e.g. `\\.' for `\\s.'. | ||
| 669 | 807 | ||
| 670 | `whitespace' (\\s- in string notation) | 808 | `whitespace' (\\s- in string notation) |
| 671 | `punctuation' (\\s.) | 809 | `punctuation' (\\s.) |
| @@ -684,7 +822,7 @@ CHAR | |||
| 684 | `comment-delimiter' (\\s!) | 822 | `comment-delimiter' (\\s!) |
| 685 | 823 | ||
| 686 | `(not (syntax SYNTAX))' | 824 | `(not (syntax SYNTAX))' |
| 687 | matches a character that has not syntax SYNTAX. | 825 | matches a character that doesn't have syntax SYNTAX. |
| 688 | 826 | ||
| 689 | `(category CATEGORY)' | 827 | `(category CATEGORY)' |
| 690 | matches a character with category CATEGORY. CATEGORY must be | 828 | matches a character with category CATEGORY. CATEGORY must be |
| @@ -710,7 +848,7 @@ CHAR | |||
| 710 | `japanese-katakana-two-byte' (\\cK) | 848 | `japanese-katakana-two-byte' (\\cK) |
| 711 | `korean-hangul-two-byte' (\\cN) | 849 | `korean-hangul-two-byte' (\\cN) |
| 712 | `cyrillic-two-byte' (\\cY) | 850 | `cyrillic-two-byte' (\\cY) |
| 713 | `combining-diacritic' (\\c^) | 851 | `combining-diacritic' (\\c^) |
| 714 | `ascii' (\\ca) | 852 | `ascii' (\\ca) |
| 715 | `arabic' (\\cb) | 853 | `arabic' (\\cb) |
| 716 | `chinese' (\\cc) | 854 | `chinese' (\\cc) |
| @@ -731,12 +869,16 @@ CHAR | |||
| 731 | `can-break' (\\c|) | 869 | `can-break' (\\c|) |
| 732 | 870 | ||
| 733 | `(not (category CATEGORY))' | 871 | `(not (category CATEGORY))' |
| 734 | matches a character that has not category CATEGORY. | 872 | matches a character that doesn't have category CATEGORY. |
| 735 | 873 | ||
| 736 | `(and SEXP1 SEXP2 ...)' | 874 | `(and SEXP1 SEXP2 ...)' |
| 875 | `(: SEXP1 SEXP2 ...)' | ||
| 876 | `(seq SEXP1 SEXP2 ...)' | ||
| 877 | `(sequence SEXP1 SEXP2 ...)' | ||
| 737 | matches what SEXP1 matches, followed by what SEXP2 matches, etc. | 878 | matches what SEXP1 matches, followed by what SEXP2 matches, etc. |
| 738 | 879 | ||
| 739 | `(submatch SEXP1 SEXP2 ...)' | 880 | `(submatch SEXP1 SEXP2 ...)' |
| 881 | `(group SEXP1 SEXP2 ...)' | ||
| 740 | like `and', but makes the match accessible with `match-end', | 882 | like `and', but makes the match accessible with `match-end', |
| 741 | `match-beginning', and `match-string'. | 883 | `match-beginning', and `match-string'. |
| 742 | 884 | ||
| @@ -744,6 +886,7 @@ CHAR | |||
| 744 | another name for `submatch'. | 886 | another name for `submatch'. |
| 745 | 887 | ||
| 746 | `(or SEXP1 SEXP2 ...)' | 888 | `(or SEXP1 SEXP2 ...)' |
| 889 | `(| SEXP1 SEXP2 ...)' | ||
| 747 | matches anything that matches SEXP1 or SEXP2, etc. If all | 890 | matches anything that matches SEXP1 or SEXP2, etc. If all |
| 748 | args are strings, use `regexp-opt' to optimize the resulting | 891 | args are strings, use `regexp-opt' to optimize the resulting |
| 749 | regular expression. | 892 | regular expression. |
| @@ -757,47 +900,55 @@ CHAR | |||
| 757 | `(maximal-match SEXP)' | 900 | `(maximal-match SEXP)' |
| 758 | produce a greedy regexp for SEXP. This is the default. | 901 | produce a greedy regexp for SEXP. This is the default. |
| 759 | 902 | ||
| 760 | `(zero-or-more SEXP)' | 903 | Below, `SEXP ...' represents a sequence of regexp forms, treated as if |
| 761 | matches zero or more occurrences of what SEXP matches. | 904 | enclosed in `(and ...)'. |
| 762 | |||
| 763 | `(0+ SEXP)' | ||
| 764 | like `zero-or-more'. | ||
| 765 | 905 | ||
| 766 | `(* SEXP)' | 906 | `(zero-or-more SEXP ...)' |
| 767 | like `zero-or-more', but always produces a greedy regexp. | 907 | `(0+ SEXP ...)' |
| 908 | matches zero or more occurrences of what SEXP ... matches. | ||
| 768 | 909 | ||
| 769 | `(*? SEXP)' | 910 | `(* SEXP ...)' |
| 770 | like `zero-or-more', but always produces a non-greedy regexp. | 911 | like `zero-or-more', but always produces a greedy regexp, independent |
| 912 | of `rx-greedy-flag'. | ||
| 771 | 913 | ||
| 772 | `(one-or-more SEXP)' | 914 | `(*? SEXP ...)' |
| 773 | matches one or more occurrences of A. | 915 | like `zero-or-more', but always produces a non-greedy regexp, |
| 916 | independent of `rx-greedy-flag'. | ||
| 774 | 917 | ||
| 775 | `(1+ SEXP)' | 918 | `(one-or-more SEXP ...)' |
| 776 | like `one-or-more'. | 919 | `(1+ SEXP ...)' |
| 920 | matches one or more occurrences of SEXP ... | ||
| 777 | 921 | ||
| 778 | `(+ SEXP)' | 922 | `(+ SEXP ...)' |
| 779 | like `one-or-more', but always produces a greedy regexp. | 923 | like `one-or-more', but always produces a greedy regexp. |
| 780 | 924 | ||
| 781 | `(+? SEXP)' | 925 | `(+? SEXP ...)' |
| 782 | like `one-or-more', but always produces a non-greedy regexp. | 926 | like `one-or-more', but always produces a non-greedy regexp. |
| 783 | 927 | ||
| 784 | `(zero-or-one SEXP)' | 928 | `(zero-or-one SEXP ...)' |
| 929 | `(optional SEXP ...)' | ||
| 930 | `(opt SEXP ...)' | ||
| 785 | matches zero or one occurrences of what SEXP matches. | 931 | matches zero or one occurrences of what SEXP matches. |
| 786 | 932 | ||
| 787 | `(optional SEXP)' | 933 | `(? SEXP ...)' |
| 788 | like `zero-or-one'. | ||
| 789 | |||
| 790 | `(? SEXP)' | ||
| 791 | like `zero-or-one', but always produces a greedy regexp. | 934 | like `zero-or-one', but always produces a greedy regexp. |
| 792 | 935 | ||
| 793 | `(?? SEXP)' | 936 | `(?? SEXP ...)' |
| 794 | like `zero-or-one', but always produces a non-greedy regexp. | 937 | like `zero-or-one', but always produces a non-greedy regexp. |
| 795 | 938 | ||
| 796 | `(repeat N SEXP)' | 939 | `(repeat N SEXP)' |
| 797 | matches N occurrences of what SEXP matches. | 940 | `(= N SEXP ...)' |
| 941 | matches N occurrences. | ||
| 942 | |||
| 943 | `(>= N SEXP ...)' | ||
| 944 | matches N or more occurrences. | ||
| 798 | 945 | ||
| 799 | `(repeat N M SEXP)' | 946 | `(repeat N M SEXP)' |
| 800 | matches N to M occurrences of what SEXP matches. | 947 | `(** N M SEXP ...)' |
| 948 | matches N to M occurrences. | ||
| 949 | |||
| 950 | `(backref N)' | ||
| 951 | matches what was matched previously by submatch N. | ||
| 801 | 952 | ||
| 802 | `(backref N)' | 953 | `(backref N)' |
| 803 | matches what was matched previously by submatch N. | 954 | matches what was matched previously by submatch N. |
| @@ -811,9 +962,21 @@ CHAR | |||
| 811 | 962 | ||
| 812 | `(regexp REGEXP)' | 963 | `(regexp REGEXP)' |
| 813 | include REGEXP in string notation in the result." | 964 | include REGEXP in string notation in the result." |
| 814 | 965 | (cond ((null regexps) | |
| 815 | (rx-to-string regexp)) | 966 | (error "No regexp")) |
| 816 | 967 | ((cdr regexps) | |
| 968 | (rx-to-string `(and ,@regexps) t)) | ||
| 969 | (t | ||
| 970 | (rx-to-string (car regexps) t)))) | ||
| 971 | |||
| 972 | ;; ;; sregex.el replacement | ||
| 973 | |||
| 974 | ;; ;;;###autoload (provide 'sregex) | ||
| 975 | ;; ;;;###autoload (autoload 'sregex "rx") | ||
| 976 | ;; (defalias 'sregex 'rx-to-string) | ||
| 977 | ;; ;;;###autoload (autoload 'sregexq "rx" nil nil 'macro) | ||
| 978 | ;; (defalias 'sregexq 'rx) | ||
| 979 | |||
| 817 | (provide 'rx) | 980 | (provide 'rx) |
| 818 | 981 | ||
| 819 | ;;; arch-tag: 12d01a63-0008-42bb-ab8c-1c7d63be370b | 982 | ;;; arch-tag: 12d01a63-0008-42bb-ab8c-1c7d63be370b |