diff options
| author | Gerd Moellmann | 2001-10-01 07:31:59 +0000 |
|---|---|---|
| committer | Gerd Moellmann | 2001-10-01 07:31:59 +0000 |
| commit | 12c645037675823286ff13ad4c1b18478e73346d (patch) | |
| tree | a23409defd7d37d17a8e6605840eba93beebd800 | |
| parent | 78f2fcaf91aee5237cfabf97e22011c8ce8cbc6e (diff) | |
| download | emacs-12c645037675823286ff13ad4c1b18478e73346d.tar.gz emacs-12c645037675823286ff13ad4c1b18478e73346d.zip | |
*** empty log message ***
| -rw-r--r-- | lib-src/ChangeLog | 23 | ||||
| -rw-r--r-- | lisp/ChangeLog | 6 | ||||
| -rw-r--r-- | lisp/emacs-lisp/rx.el | 753 |
3 files changed, 782 insertions, 0 deletions
diff --git a/lib-src/ChangeLog b/lib-src/ChangeLog index f1636d4571f..bb90d3c0c39 100644 --- a/lib-src/ChangeLog +++ b/lib-src/ChangeLog | |||
| @@ -1,3 +1,26 @@ | |||
| 1 | 2001-10-01 Alexander Zhuckov <zuav@int.spb.ru> | ||
| 2 | |||
| 3 | * ebrowse.c (struct alias): Add two new struct members: NAMESP and | ||
| 4 | ALIASEE to help work with namespace aliases. | ||
| 5 | (struct sym): Remove struct member NAMESP_ALIASES. | ||
| 6 | (namespace_alias_table): New variable. | ||
| 7 | (make_namespace): Add parameter CONTEXT. | ||
| 8 | (check_namespace): New function. | ||
| 9 | (find_namespace): Add parameter CONTEXT. | ||
| 10 | (check_namespace_alias): New function. | ||
| 11 | (register_namespace_alias): Change type of parameter | ||
| 12 | OLD_NAME. Search for already defined alias in | ||
| 13 | NAMESPACE_ALIAS_TABLE. | ||
| 14 | (check_namespace): New function. | ||
| 15 | (enter_namespace): Call find_namespace with CONTEXT parameter. | ||
| 16 | (match_qualified_namespace_alias): New function. | ||
| 17 | (parse_qualified_ident_or_type): Fixed typo in comment. While | ||
| 18 | parsing qualified ident or type update namespace context and | ||
| 19 | restore it on exit. | ||
| 20 | (parse_qualified_param_ident_or_type): Fixed typo in comment. | ||
| 21 | (globals): Changed handling of namespace aliases. | ||
| 22 | (version): Added year 2001. | ||
| 23 | |||
| 1 | 2001-09-15 Eli Zaretskii <eliz@is.elta.co.il> | 24 | 2001-09-15 Eli Zaretskii <eliz@is.elta.co.il> |
| 2 | 25 | ||
| 3 | * etags.c (analyse_regex): If regex_arg is NULL, return | 26 | * etags.c (analyse_regex): If regex_arg is NULL, return |
diff --git a/lisp/ChangeLog b/lisp/ChangeLog index 1897235b6d1..32d58d60c9a 100644 --- a/lisp/ChangeLog +++ b/lisp/ChangeLog | |||
| @@ -1,3 +1,9 @@ | |||
| 1 | 2001-10-01 Gerd Moellmann <gerd@gnu.org> | ||
| 2 | |||
| 3 | * emacs-lisp/edebug.el (rx): Add def-edebug-spec. | ||
| 4 | |||
| 5 | * emacs-lisp/rx.el: New file. | ||
| 6 | |||
| 1 | 2001-10-01 Eli Zaretskii <eliz@is.elta.co.il> | 7 | 2001-10-01 Eli Zaretskii <eliz@is.elta.co.il> |
| 2 | 8 | ||
| 3 | * help.el (help-for-help): Doc fix. From Pavel@Janik.cz (Pavel | 9 | * help.el (help-for-help): Doc fix. From Pavel@Janik.cz (Pavel |
diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el new file mode 100644 index 00000000000..92eea6d00a3 --- /dev/null +++ b/lisp/emacs-lisp/rx.el | |||
| @@ -0,0 +1,753 @@ | |||
| 1 | ;;; rx.el --- sexp notation for regular expressions | ||
| 2 | |||
| 3 | ;; Copyright (C) 2001 Free Software Foundation, Inc. | ||
| 4 | |||
| 5 | ;; Author: Gerd Moellmann <gerd@gnu.org> | ||
| 6 | ;; Maintainer: FSF | ||
| 7 | ;; Keywords: strings, regexps, extensions | ||
| 8 | |||
| 9 | ;; This file is part of GNU Emacs. | ||
| 10 | |||
| 11 | ;; GNU Emacs is free software; you can redistribute it and/or modify | ||
| 12 | ;; it under the terms of the GNU General Public License as published by | ||
| 13 | ;; the Free Software Foundation; either version 2, or (at your option) | ||
| 14 | ;; any later version. | ||
| 15 | |||
| 16 | ;; GNU Emacs is distributed in the hope that it will be useful, | ||
| 17 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 18 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 19 | ;; GNU General Public License for more details. | ||
| 20 | |||
| 21 | ;; You should have received a copy of the GNU General Public License | ||
| 22 | ;; along with GNU Emacs; see the file COPYING. If not, write to the | ||
| 23 | ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
| 24 | ;; Boston, MA 02111-1307, USA. | ||
| 25 | |||
| 26 | ;;; Commentary: | ||
| 27 | |||
| 28 | ;; This is another implementation of sexp-form regular expressions. | ||
| 29 | ;; It was unfortunately written without being aware of the Sregex | ||
| 30 | ;; package coming with Emacs, but as things stand, Rx completely | ||
| 31 | ;; covers all regexp features, which Sregex doesn't, doesn't suffer | ||
| 32 | ;; from the bugs mentioned in the commentary section of Sregex, and | ||
| 33 | ;; uses a nicer syntax (IMHO, of course :-). | ||
| 34 | |||
| 35 | ;; Rx translates a sexp notation for regular expressions into the | ||
| 36 | ;; usual string notation. The translation can be done at compile-time | ||
| 37 | ;; by using the `rx' macro. It can be done at run-time by calling | ||
| 38 | ;; function `rx-to-string'. See the documentation of `rx' for a | ||
| 39 | ;; complete description of the sexp notation. | ||
| 40 | ;; | ||
| 41 | ;; Some examples of string regexps and their sexp counterparts: | ||
| 42 | ;; | ||
| 43 | ;; "^[a-z]*" | ||
| 44 | ;; (rx (and line-start (0+ (in "a-z")))) | ||
| 45 | ;; | ||
| 46 | ;; "\n[^ \t]" | ||
| 47 | ;; (rx (and "\n" (not blank))), or | ||
| 48 | ;; (rx (and "\n" (not (any " \t")))) | ||
| 49 | ;; | ||
| 50 | ;; "\\*\\*\\* EOOH \\*\\*\\*\n" | ||
| 51 | ;; (rx "*** EOOH ***\n") | ||
| 52 | ;; | ||
| 53 | ;; "\\<\\(catch\\|finally\\)\\>[^_]" | ||
| 54 | ;; (rx (and word-start (submatch (or "catch" "finally")) word-end | ||
| 55 | ;; (not (any ?_)))) | ||
| 56 | ;; | ||
| 57 | ;; "[ \t\n]*:\\([^:]+\\|$\\)" | ||
| 58 | ;; (rx (and (zero-or-more (in " \t\n")) ":" | ||
| 59 | ;; (submatch (or line-end (one-or-more (not (any ?:))))))) | ||
| 60 | ;; | ||
| 61 | ;; "^content-transfer-encoding:\\(\n?[\t ]\\)*quoted-printable\\(\n?[\t ]\\)*" | ||
| 62 | ;; (rx (and line-start | ||
| 63 | ;; "content-transfer-encoding:" | ||
| 64 | ;; (* (and (? ?\n) blank)) | ||
| 65 | ;; "quoted-printable" | ||
| 66 | ;; (* (and (? ?\n) blank)))) | ||
| 67 | ;; | ||
| 68 | ;; (concat "^\\(?:" something-else "\\)") | ||
| 69 | ;; (rx (and line-start (eval something-else))), statically or | ||
| 70 | ;; (rx-to-string `(and line-start ,something-else)), dynamically. | ||
| 71 | ;; | ||
| 72 | ;; (regexp-opt '(STRING1 STRING2 ...)) | ||
| 73 | ;; (rx (or STRING1 STRING2 ...)), or in other words, `or' automatically | ||
| 74 | ;; calls `regexp-opt' as needed. | ||
| 75 | ;; | ||
| 76 | ;; "^;;\\s-*\n\\|^\n" | ||
| 77 | ;; (rx (or (and line-start ";;" (0+ space) ?\n) | ||
| 78 | ;; (and line-start ?\n))) | ||
| 79 | ;; | ||
| 80 | ;; "\\$[I]d: [^ ]+ \\([^ ]+\\) " | ||
| 81 | ;; (rx (and "$Id: " | ||
| 82 | ;; (1+ (not (in " "))) | ||
| 83 | ;; " " | ||
| 84 | ;; (submatch (1+ (not (in " ")))) | ||
| 85 | ;; " ")) | ||
| 86 | ;; | ||
| 87 | ;; "\\\\\\\\\\[\\w+" | ||
| 88 | ;; (rx (and ?\\ ?\\ ?\[ (1+ word))) | ||
| 89 | ;; | ||
| 90 | ;; etc. | ||
| 91 | |||
| 92 | ;;; History: | ||
| 93 | ;; | ||
| 94 | |||
| 95 | ;;; Code: | ||
| 96 | |||
| 97 | |||
| 98 | (defconst rx-constituents | ||
| 99 | '((and . (rx-and 1 nil)) | ||
| 100 | (or . (rx-or 1 nil)) | ||
| 101 | (not-newline . ".") | ||
| 102 | (anything . ".\\|\n") | ||
| 103 | (any . (rx-any 1 1 rx-check-any)) | ||
| 104 | (in . any) | ||
| 105 | (not . (rx-not 1 1 rx-check-not)) | ||
| 106 | (repeat . (rx-repeat 2 3)) | ||
| 107 | (submatch . (rx-submatch 1 nil)) | ||
| 108 | (group . submatch) | ||
| 109 | (zero-or-more . (rx-kleene 1 1)) | ||
| 110 | (one-or-more . (rx-kleene 1 1)) | ||
| 111 | (zero-or-one . (rx-kleene 1 1)) | ||
| 112 | (\? . zero-or-one) | ||
| 113 | (\?? . zero-or-one) | ||
| 114 | (* . zero-or-more) | ||
| 115 | (*? . zero-or-more) | ||
| 116 | (0+ . zero-or-more) | ||
| 117 | (+ . one-or-more) | ||
| 118 | (+? . one-or-more) | ||
| 119 | (1+ . one-or-more) | ||
| 120 | (optional . zero-or-one) | ||
| 121 | (minimal-match . (rx-greedy 1 1)) | ||
| 122 | (maximal-match . (rx-greedy 1 1)) | ||
| 123 | (line-start . "^") | ||
| 124 | (line-end . "$") | ||
| 125 | (string-start . "\\`") | ||
| 126 | (string-end . "\\'") | ||
| 127 | (buffer-start . "\\`") | ||
| 128 | (buffer-end . "\\'") | ||
| 129 | (point . "\\=") | ||
| 130 | (word-start . "\\<") | ||
| 131 | (word-end . "\\>") | ||
| 132 | (word-boundary . "\\b") | ||
| 133 | (syntax . (rx-syntax 1 1)) | ||
| 134 | (category . (rx-category 1 1 rx-check-category)) | ||
| 135 | (eval . (rx-eval 1 1)) | ||
| 136 | (regexp . (rx-regexp 1 1 stringp)) | ||
| 137 | (digit . "[[:digit:]]") | ||
| 138 | (control . "[[:cntrl:]]") | ||
| 139 | (hex-digit . "[[:xdigit:]]") | ||
| 140 | (blank . "[[:blank:]]") | ||
| 141 | (graphic . "[[:graph:]]") | ||
| 142 | (printing . "[[:print:]]") | ||
| 143 | (alphanumeric . "[[:alnum:]]") | ||
| 144 | (letter . "[[:alpha:]]") | ||
| 145 | (ascii . "[[:ascii:]]") | ||
| 146 | (nonascii . "[[:nonascii:]]") | ||
| 147 | (lower . "[[:lower:]]") | ||
| 148 | (punctuation . "[[:punct:]]") | ||
| 149 | (space . "[[:space:]]") | ||
| 150 | (upper . "[[:upper:]]") | ||
| 151 | (word . "[[:word:]]")) | ||
| 152 | "Alist of sexp form regexp constituents. | ||
| 153 | Each element of the alist has the form (SYMBOL . DEFN). | ||
| 154 | SYMBOL is a valid constituent of sexp regular expressions. | ||
| 155 | If DEFN is a string, SYMBOL is translated into DEFN. | ||
| 156 | If DEFN is a symbol, use the definition of DEFN, recursively. | ||
| 157 | Otherwise, DEFN must be a list (FUNCTION MIN-ARGS MAX-ARGS PREDICATE). | ||
| 158 | FUNCTION is called with the whole sexp form whose car is SYMBOL and | ||
| 159 | returns the regexp string for it. MIN-ARGS and MAX-ARGS are the | ||
| 160 | minimum and maximum number of arguments that form may have; nil for | ||
| 161 | either means no limit. PREDICATE, if specified, is called on each | ||
| 162 | argument and must return non-nil for all of them.") | ||
| 163 | |||
| 164 | |||
| 165 | (defconst rx-syntax | ||
| 166 | '((whitespace . ?-) | ||
| 167 | (punctuation . ?.) | ||
| 168 | (word . ?w) | ||
| 169 | (symbol . ?_) | ||
| 170 | (open-parenthesis . ?\() | ||
| 171 | (close-parenthesis . ?\)) | ||
| 172 | (expression-prefix . ?\') | ||
| 173 | (string-quote . ?\") | ||
| 174 | (paired-delimiter . ?$) | ||
| 175 | (escape . ?\\) | ||
| 176 | (character-quote . ?/) | ||
| 177 | (comment-start . ?<) | ||
| 178 | (comment-end . ?>)) | ||
| 179 | "Alist mapping Rx syntax symbols to syntax characters. | ||
| 180 | Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid | ||
| 181 | symbol in `(syntax SYMBOL)', and CHAR is the syntax character | ||
| 182 | corresponding to SYMBOL, as it would be used with `\\s' or `\\S' in | ||
| 183 | regular expression strings. Function `rx-syntax' looks SYMBOL up here.") | ||
| 184 | |||
| 185 | |||
| 186 | (defconst rx-categories | ||
| 187 | '((consonant . ?0) | ||
| 188 | (base-vowel . ?1) | ||
| 189 | (upper-diacritical-mark . ?2) | ||
| 190 | (lower-diacritical-mark . ?3) | ||
| 191 | (tone-mark . ?4) | ||
| 192 | (symbol . ?5) | ||
| 193 | (digit . ?6) | ||
| 194 | (vowel-modifying-diacritical-mark . ?7) | ||
| 195 | (vowel-sign . ?8) | ||
| 196 | (semivowel-lower . ?9) | ||
| 197 | (not-at-end-of-line . ?<) | ||
| 198 | (not-at-beginning-of-line . ?>) | ||
| 199 | (alpha-numeric-two-byte . ?A) | ||
| 200 | (chinse-two-byte . ?C) ; NOTE(review): key is spelled `chinse', presumably meant `chinese' | ||
| 201 | (greek-two-byte . ?G) | ||
| 202 | (japanese-hiragana-two-byte . ?H) | ||
| 203 | (indian-two-byte . ?I) | ||
| 204 | (japanese-katakana-two-byte . ?K) | ||
| 205 | (korean-hangul-two-byte . ?N) | ||
| 206 | (cyrillic-two-byte . ?Y) | ||
| 207 | (ascii . ?a) | ||
| 208 | (arabic . ?b) | ||
| 209 | (chinese . ?c) | ||
| 210 | (ethiopic . ?e) | ||
| 211 | (greek . ?g) | ||
| 212 | (korean . ?h) | ||
| 213 | (indian . ?i) | ||
| 214 | (japanese . ?j) | ||
| 215 | (japanese-katakana . ?k) | ||
| 216 | (latin . ?l) | ||
| 217 | (lao . ?o) | ||
| 218 | (tibetan . ?q) | ||
| 219 | (japanese-roman . ?r) | ||
| 220 | (thai . ?t) | ||
| 221 | (vietnamese . ?v) | ||
| 222 | (hebrew . ?w) | ||
| 223 | (cyrillic . ?y) | ||
| 224 | (can-break . ?|)) | ||
| 225 | "Alist mapping Rx category symbols to category characters. | ||
| 226 | Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid | ||
| 227 | symbol in `(category SYMBOL)', and CHAR is the category character | ||
| 228 | corresponding to SYMBOL, as it would be used with `\\c' or `\\C' in | ||
| 229 | regular expression strings.") | ||
| 230 | |||
| 231 | |||
| 232 | (defvar rx-greedy-flag t | ||
| 233 | "Non-nil means produce greedy regexps for `zero-or-more', `one-or-more', | ||
| 234 | and `zero-or-one'. `rx-greedy' binds this dynamically around its sub-form.") | ||
| 235 | |||
| 236 | |||
| 237 | (defun rx-info (op) | ||
| 238 | "Return parsing/code generation info for OP. | ||
| 239 | If OP is the space character ASCII 32, return info for the symbol `?'. | ||
| 240 | If OP is the character `?', return info for the symbol `??'. | ||
| 241 | Follow symbol aliases in `rx-constituents'; return nil for an unknown symbol." | ||
| 242 | (cond ((eq op ? ) (setq op '\?)) | ||
| 243 | ((eq op ??) (setq op '\??))) | ||
| 244 | (while (and (not (null op)) (symbolp op)) ; resolve alias chains such as `in' -> `any' | ||
| 245 | (setq op (cdr (assq op rx-constituents)))) | ||
| 246 | op) | ||
| 247 | |||
| 248 | |||
| 249 | (defun rx-check (form) | ||
| 250 | "Check FORM according to its car's parsing info; signal an error if it is invalid." | ||
| 251 | (let* ((rx (rx-info (car form))) | ||
| 252 | (nargs (1- (length form))) ; number of arguments, not counting the operator | ||
| 253 | (min-args (nth 1 rx)) | ||
| 254 | (max-args (nth 2 rx)) | ||
| 255 | (type-pred (nth 3 rx))) | ||
| 256 | (when (and (not (null min-args)) | ||
| 257 | (< nargs min-args)) | ||
| 258 | (error "Rx form `%s' requires at least %d args" | ||
| 259 | (car form) min-args)) | ||
| 260 | (when (and (not (null max-args)) ; nil MAX-ARGS means no upper limit | ||
| 261 | (> nargs max-args)) | ||
| 262 | (error "Rx form `%s' accepts at most %d args" | ||
| 263 | (car form) max-args)) | ||
| 264 | (when (not (null type-pred)) | ||
| 265 | (dolist (sub-form (cdr form)) ; every argument must satisfy the predicate | ||
| 266 | (unless (funcall type-pred sub-form) | ||
| 267 | (error "Rx form `%s' requires args satisfying `%s'" | ||
| 268 | (car form) type-pred)))))) | ||
| 269 | |||
| 270 | |||
| 271 | (defun rx-and (form) | ||
| 272 | "Return the concatenated regexp strings of the sub-forms of FORM. | ||
| 273 | FORM is of the form `(and FORM1 ...)'." | ||
| 274 | (rx-check form) | ||
| 275 | (mapconcat #'rx-to-string (cdr form) nil)) | ||
| 276 | |||
| 277 | |||
| 278 | (defun rx-or (form) | ||
| 279 | "Parse and produce code from FORM, which is `(or FORM1 ...)'." | ||
| 280 | (rx-check form) | ||
| 281 | (let ((all-args-strings t)) | ||
| 282 | (dolist (arg (cdr form)) | ||
| 283 | (unless (stringp arg) | ||
| 284 | (setq all-args-strings nil))) | ||
| 285 | (if all-args-strings | ||
| 286 | (regexp-opt (cdr form)) ; only literal strings: let `regexp-opt' build the alternation | ||
| 287 | (mapconcat #'rx-to-string (cdr form) "\\|")))) ; otherwise join the translated sub-forms with \| | ||
| 288 | |||
| 289 | |||
| 290 | (defun rx-quote-for-set (string) | ||
| 291 | "Transform STRING for use in a character set and return the result. | ||
| 292 | If STRING contains a `]' after its first character, move the last one to the front. | ||
| 293 | If STRING starts with a `^', move it to the end." | ||
| 294 | (when (string-match "\\`\\(\\(?:.\\|\n\\)+\\)\\]\\(\\(?:.\\|\n\\)*\\)\\'" | ||
| 295 | string) | ||
| 296 | (setq string (concat "]" (match-string 1 string) ; group 2 captures the whole tail after `]' | ||
| 297 | (match-string 2 string)))) | ||
| 298 | (when (string-match "\\`^\\(\\(?:.\\|\n\\)+\\)\\'" string) | ||
| 299 | (setq string (concat (substring string 1) "^"))) | ||
| 300 | string) | ||
| 301 | |||
| 302 | |||
| 303 | (defun rx-check-any (arg) | ||
| 304 | "Check arg ARG for Rx `any'; return t for a character or non-empty string, else signal an error." | ||
| 305 | (cond ((integerp arg) t) | ||
| 306 | ((and (stringp arg) (zerop (length arg))) | ||
| 307 | (error "String arg for Rx `any' must not be empty")) | ||
| 308 | ((stringp arg) t) | ||
| 309 | (t | ||
| 310 | (error "Rx `any' requires string or character arg")))) | ||
| 311 | |||
| 312 | |||
| 313 | (defun rx-any (form) | ||
| 314 | "Parse and produce code from FORM, which is `(any SET)'. | ||
| 315 | SET is a character or a non-empty string. A single character is | ||
| 316 | returned as is; a longer string becomes a bracket expression." | ||
| 317 | (rx-check form) | ||
| 318 | (let ((arg (cadr form))) | ||
| 319 | (cond ((integerp arg) | ||
| 320 | (char-to-string arg)) ; NOTE(review): not passed through `regexp-quote' -- confirm intended | ||
| 321 | ((= (length arg) 1) | ||
| 322 | arg) ; NOTE(review): one-char string is likewise returned unquoted | ||
| 323 | (t | ||
| 324 | (concat "[" (rx-quote-for-set (cadr form)) "]"))))) | ||
| 325 | |||
| 326 | |||
| 327 | (defun rx-check-not (form) | ||
| 328 | "Check FORM, the argument of a `(not ...)' form; return t or signal an error." | ||
| 329 | (unless (or (memq form ; symbols whose translation `rx-not' knows how to negate | ||
| 330 | '(digit control hex-digit blank graphic printing | ||
| 331 | alphanumeric letter ascii nonascii lower | ||
| 332 | punctuation space upper word word-boundary)) | ||
| 333 | (and (consp form) | ||
| 334 | (memq (car form) '(not any in syntax category)))) | ||
| 335 | (error "Rx `not' syntax error: %s" form)) | ||
| 336 | t) | ||
| 337 | |||
| 338 | |||
| 339 | (defun rx-not (form) | ||
| 340 | "Parse and produce code from FORM, which is `(not ...)'; return the negated regexp." | ||
| 341 | (rx-check form) | ||
| 342 | (let ((result (rx-to-string (cadr form) 'no-group))) | ||
| 343 | (cond ((string-match "\\`\\[^" result) ; already a negated set: remove the negation | ||
| 344 | (if (= (length result) 4) ; "[^X]" becomes the single character X | ||
| 345 | (substring result 2 3) | ||
| 346 | (concat "[" (substring result 2)))) | ||
| 347 | ((string-match "\\`\\[" result) ; bracket expression: negate it | ||
| 348 | (concat "[^" (substring result 1))) | ||
| 349 | ((string-match "\\`\\\\s." result) ; \s becomes \S | ||
| 350 | (concat "\\S" (substring result 2))) | ||
| 351 | ((string-match "\\`\\\\S." result) ; \S becomes \s | ||
| 352 | (concat "\\s" (substring result 2))) | ||
| 353 | ((string-match "\\`\\\\c." result) ; \c becomes \C | ||
| 354 | (concat "\\C" (substring result 2))) | ||
| 355 | ((string-match "\\`\\\\C." result) ; \C becomes \c | ||
| 356 | (concat "\\c" (substring result 2))) | ||
| 357 | ((string-match "\\`\\\\B" result) ; \B becomes \b | ||
| 358 | (concat "\\b" (substring result 2))) | ||
| 359 | ((string-match "\\`\\\\b" result) ; \b becomes \B | ||
| 360 | (concat "\\B" (substring result 2))) | ||
| 361 | (t ; anything else is used as the contents of a negated set | ||
| 362 | (concat "[^" result "]"))))) | ||
| 363 | |||
| 364 | |||
| 365 | (defun rx-repeat (form) | ||
| 366 | "Parse and produce code from FORM, wrapping FORM1 in a shy group. | ||
| 367 | FORM is either `(repeat N FORM1)' or `(repeat N M FORM1)'; signal an error for a bad N or M." | ||
| 368 | (rx-check form) | ||
| 369 | (cond ((= (length form) 3) | ||
| 370 | (unless (and (integerp (nth 1 form)) | ||
| 371 | (> (nth 1 form) 0)) | ||
| 372 | (error "Rx `repeat' requires positive integer first arg")) | ||
| 373 | (format "\\(?:%s\\)\\{%d\\}" (rx-to-string (nth 2 form) 'no-group) (nth 1 form))) ; group so the count applies to all of FORM1 | ||
| 374 | ((or (not (integerp (nth 2 form))) | ||
| 375 | (< (nth 2 form) 0) | ||
| 376 | (not (integerp (nth 1 form))) | ||
| 377 | (< (nth 1 form) 0) | ||
| 378 | (< (nth 2 form) (nth 1 form))) | ||
| 379 | (error "Rx `repeat' range error")) | ||
| 380 | (t | ||
| 381 | (format "\\(?:%s\\)\\{%d,%d\\}" (rx-to-string (nth 3 form) 'no-group) | ||
| 382 | (nth 1 form) (nth 2 form))))) | ||
| 383 | |||
| 384 | |||
| 385 | (defun rx-submatch (form) | ||
| 386 | "Parse and produce code from FORM, which is `(submatch ...)'; the sub-forms are concatenated inside a capturing group." | ||
| 387 | (concat "\\(" (mapconcat #'rx-to-string (cdr form) nil) "\\)")) | ||
| 388 | |||
| 389 | |||
| 390 | (defun rx-kleene (form) | ||
| 391 | "Parse and produce code from FORM. | ||
| 392 | FORM is `(OP FORM1)', where OP is one of the `zero-or-one', | ||
| 393 | `zero-or-more' etc. operators. | ||
| 394 | If OP is one of `*', `+', `?', produce a greedy regexp. | ||
| 395 | If OP is one of `*?', `+?', `??', produce a non-greedy regexp. | ||
| 396 | If OP is anything else, produce a greedy regexp if `rx-greedy-flag' | ||
| 397 | is non-nil, else a non-greedy one. FORM1 is wrapped in a shy group." | ||
| 398 | (rx-check form) | ||
| 399 | (let ((suffix (cond ((memq (car form) '(* + ? )) "") ; `? ' reads as the space character, i.e. operator `?' | ||
| 400 | ((memq (car form) '(*? +? ??)) "?") ; `??' reads as the character `?', i.e. operator `??' | ||
| 401 | (rx-greedy-flag "") | ||
| 402 | (t "?"))) | ||
| 403 | (op (cond ((memq (car form) '(* *? 0+ zero-or-more)) "*") | ||
| 404 | ((memq (car form) '(+ +? 1+ one-or-more)) "+") | ||
| 405 | (t "?")))) | ||
| 406 | (format "\\(?:%s\\)%s%s" (rx-to-string (cadr form) 'no-group) | ||
| 407 | op suffix))) | ||
| 408 | |||
| 409 | |||
| 410 | (defun rx-syntax (form) | ||
| 411 | "Parse and produce code from FORM, which is `(syntax SYMBOL)'." | ||
| 412 | (rx-check form) | ||
| 413 | (let ((syntax (assq (cadr form) rx-syntax))) ; look SYMBOL up in the alist `rx-syntax' | ||
| 414 | (unless syntax | ||
| 415 | (error "Unknown rx syntax `%s'" (cadr form))) | ||
| 416 | (format "\\s%c" (cdr syntax)))) ; e.g. (syntax word) yields \sw | ||
| 417 | |||
| 418 | |||
| 419 | (defun rx-check-category (form) | ||
| 420 | "Check the argument FORM of a `(category FORM)'; return t for an integer or a symbol in `rx-categories', else signal an error." | ||
| 421 | (unless (or (integerp form) | ||
| 422 | (cdr (assq form rx-categories))) | ||
| 423 | (error "Unknown category `%s'" form)) | ||
| 424 | t) | ||
| 425 | |||
| 426 | |||
| 427 | (defun rx-category (form) | ||
| 428 | "Parse and produce code from FORM, which is `(category CATEGORY)'; CATEGORY is a character or a symbol in `rx-categories'." | ||
| 429 | (rx-check form) | ||
| 430 | (let ((char (if (integerp (cadr form)) | ||
| 431 | (cadr form) ; a character is used directly as the category character | ||
| 432 | (cdr (assq (cadr form) rx-categories))))) | ||
| 433 | (format "\\c%c" char))) | ||
| 434 | |||
| 435 | |||
| 436 | (defun rx-eval (form) | ||
| 437 | "Parse and produce code from FORM, which is `(eval EXPR)'; EXPR is evaluated and its value translated as a sexp regexp." | ||
| 438 | (rx-check form) | ||
| 439 | (rx-to-string (eval (cadr form)))) ; NOTE(review): evaluates arbitrary Lisp taken from FORM -- only safe for trusted forms | ||
| 440 | |||
| 441 | |||
| 442 | (defun rx-greedy (form) | ||
| 443 | "Parse and produce code from FORM. If FORM is `(minimal-match | ||
| 444 | FORM1)', `zero-or-more', `one-or-more', `zero-or-one' and their aliases | ||
| 445 | in FORM1 produce non-greedy regexps. If FORM is `(maximal-match FORM1)', | ||
| 446 | they produce greedy ones. Operators `*', `*?' etc. are not affected." | ||
| 447 | (rx-check form) | ||
| 448 | (let ((rx-greedy-flag (eq (car form) 'maximal-match))) ; dynamic binding, read by `rx-kleene' | ||
| 449 | (rx-to-string (cadr form)))) | ||
| 450 | |||
| 451 | |||
| 452 | (defun rx-regexp (form) | ||
| 453 | "Parse and produce code from FORM, which is `(regexp STRING)'; STRING is a regexp in string notation." | ||
| 454 | (rx-check form) | ||
| 455 | (concat "\\(?:" (cadr form) "\\)")) ; STRING is inserted verbatim (not quoted) inside a shy group | ||
| 456 | |||
| 457 | |||
| 458 | ;;;###autoload | ||
| 459 | (defun rx-to-string (form &optional no-group) | ||
| 460 | "Return the regexp string for regular expression FORM. | ||
| 461 | FORM is a regular expression in sexp form; signal an error if it is invalid. | ||
| 462 | NO-GROUP non-nil means don't put a shy group around the result of a list form." | ||
| 463 | (cond ((stringp form) ; literal string: quote regexp special characters | ||
| 464 | (regexp-quote form)) | ||
| 465 | ((integerp form) ; character: treat as a one-character literal | ||
| 466 | (regexp-quote (char-to-string form))) | ||
| 467 | ((symbolp form) | ||
| 468 | (let ((info (rx-info form))) | ||
| 469 | (cond ((stringp info) ; e.g. `digit' translates to "[[:digit:]]" | ||
| 470 | info) | ||
| 471 | ((null info) | ||
| 472 | (error "Unknown Rx form `%s'" form)) | ||
| 473 | (t | ||
| 474 | (funcall (nth 0 info) form))))) | ||
| 475 | ((consp form) | ||
| 476 | (let ((info (rx-info (car form)))) | ||
| 477 | (unless (consp info) | ||
| 478 | (error "Unknown Rx form `%s'" (car form))) | ||
| 479 | (let ((result (funcall (nth 0 info) form))) ; the handler gets the whole form | ||
| 480 | (if (or no-group (string-match "\\`\\\\[(]" result)) ; result already starts with \( | ||
| 481 | result | ||
| 482 | (concat "\\(?:" result "\\)"))))) | ||
| 483 | (t | ||
| 484 | (error "Rx syntax error at `%s'" form)))) | ||
| 485 | |||
| 486 | |||
| 487 | ;;;###autoload | ||
| 488 | (defmacro rx (regexp) | ||
| 489 | "Translate a regular expression REGEXP in sexp form to a regexp string. | ||
| 490 | See also `rx-to-string' for how to do such a translation at run-time. | ||
| 491 | |||
| 492 | The following are valid subforms of regular expressions in sexp | ||
| 493 | notation. | ||
| 494 | |||
| 495 | STRING | ||
| 496 | matches string STRING literally. | ||
| 497 | |||
| 498 | CHAR | ||
| 499 | matches character CHAR literally. | ||
| 500 | |||
| 501 | `not-newline' | ||
| 502 | matches any character except a newline. | ||
| 503 | |||
| 504 | `anything' | ||
| 505 | matches any character. | ||
| 506 | |||
| 507 | `(any SET)' | ||
| 508 | matches any character in SET. SET may be a character or string. | ||
| 509 | Ranges of characters can be specified as `A-Z' in strings. | ||
| 510 | |||
| 511 | `(in SET)' | ||
| 512 | like `any'. | ||
| 513 | |||
| 514 | `(not (any SET))' | ||
| 515 | matches any character not in SET. | ||
| 516 | |||
| 517 | `line-start' | ||
| 518 | matches the empty string, but only at the beginning of a line | ||
| 519 | in the text being matched. | ||
| 520 | |||
| 521 | `line-end' | ||
| 522 | is similar to `line-start' but matches only at the end of a line. | ||
| 523 | |||
| 524 | `string-start' | ||
| 525 | matches the empty string, but only at the beginning of the | ||
| 526 | string being matched against. | ||
| 527 | |||
| 528 | `string-end' | ||
| 529 | matches the empty string, but only at the end of the | ||
| 530 | string being matched against. | ||
| 531 | |||
| 532 | `buffer-start' | ||
| 533 | matches the empty string, but only at the beginning of the | ||
| 534 | buffer being matched against. | ||
| 535 | |||
| 536 | `buffer-end' | ||
| 537 | matches the empty string, but only at the end of the | ||
| 538 | buffer being matched against. | ||
| 539 | |||
| 540 | `point' | ||
| 541 | matches the empty string, but only at point. | ||
| 542 | |||
| 543 | `word-start' | ||
| 544 | matches the empty string, but only at the beginning of a | ||
| 545 | word. | ||
| 546 | |||
| 547 | `word-end' | ||
| 548 | matches the empty string, but only at the end of a word. | ||
| 549 | |||
| 550 | `word-boundary' | ||
| 551 | matches the empty string, but only at the beginning or end of a | ||
| 552 | word. | ||
| 553 | |||
| 554 | `(not word-boundary)' | ||
| 555 | matches the empty string, but not at the beginning or end of a | ||
| 556 | word. | ||
| 557 | |||
| 558 | `digit' | ||
| 559 | matches 0 through 9. | ||
| 560 | |||
| 561 | `control' | ||
| 562 | matches ASCII control characters. | ||
| 563 | |||
| 564 | `hex-digit' | ||
| 565 | matches 0 through 9, a through f and A through F. | ||
| 566 | |||
| 567 | `blank' | ||
| 568 | matches space and tab only. | ||
| 569 | |||
| 570 | `graphic' | ||
| 571 | matches graphic characters--everything except ASCII control chars, | ||
| 572 | space, and DEL. | ||
| 573 | |||
| 574 | `printing' | ||
| 575 | matches printing characters--everything except ASCII control chars | ||
| 576 | and DEL. | ||
| 577 | |||
| 578 | `alphanumeric' | ||
| 579 | matches letters and digits. (But at present, for multibyte characters, | ||
| 580 | it matches anything that has word syntax.) | ||
| 581 | |||
| 582 | `letter' | ||
| 583 | matches letters. (But at present, for multibyte characters, | ||
| 584 | it matches anything that has word syntax.) | ||
| 585 | |||
| 586 | `ascii' | ||
| 587 | matches ASCII (unibyte) characters. | ||
| 588 | |||
| 589 | `nonascii' | ||
| 590 | matches non-ASCII (multibyte) characters. | ||
| 591 | |||
| 592 | `lower' | ||
| 593 | matches anything lower-case. | ||
| 594 | |||
| 595 | `upper' | ||
| 596 | matches anything upper-case. | ||
| 597 | |||
| 598 | `punctuation' | ||
| 599 | matches punctuation. (But at present, for multibyte characters, | ||
| 600 | it matches anything that has non-word syntax.) | ||
| 601 | |||
| 602 | `space' | ||
| 603 | matches anything that has whitespace syntax. | ||
| 604 | |||
| 605 | `word' | ||
| 606 | matches anything that has word syntax. | ||
| 607 | |||
| 608 | `(syntax SYNTAX)' | ||
| 609 | matches a character with syntax SYNTAX. SYNTAX must be one | ||
| 610 | of the following symbols. | ||
| 611 | |||
| 612 | `whitespace' (\\s- in string notation) | ||
| 613 | `punctuation' (\\s.) | ||
| 614 | `word' (\\sw) | ||
| 615 | `symbol' (\\s_) | ||
| 616 | `open-parenthesis' (\\s() | ||
| 617 | `close-parenthesis' (\\s)) | ||
| 618 | `expression-prefix' (\\s') | ||
| 619 | `string-quote' (\\s\") | ||
| 620 | `paired-delimiter' (\\s$) | ||
| 621 | `escape' (\\s\\) | ||
| 622 | `character-quote' (\\s/) | ||
| 623 | `comment-start' (\\s<) | ||
| 624 | `comment-end' (\\s>) | ||
| 625 | |||
| 626 | `(not (syntax SYNTAX))' | ||
| 627 | matches a character that does not have syntax SYNTAX. | ||
| 628 | |||
| 629 | `(category CATEGORY)' | ||
| 630 | matches a character with category CATEGORY. CATEGORY must be | ||
| 631 | either a character to use for \\c, or one of the following symbols. | ||
| 632 | |||
| 633 | `consonant' (\\c0 in string notation) | ||
| 634 | `base-vowel' (\\c1) | ||
| 635 | `upper-diacritical-mark' (\\c2) | ||
| 636 | `lower-diacritical-mark' (\\c3) | ||
| 637 | `tone-mark' (\\c4) | ||
| 638 | `symbol' (\\c5) | ||
| 639 | `digit' (\\c6) | ||
| 640 | `vowel-modifying-diacritical-mark' (\\c7) | ||
| 641 | `vowel-sign' (\\c8) | ||
| 642 | `semivowel-lower' (\\c9) | ||
| 643 | `not-at-end-of-line' (\\c<) | ||
| 644 | `not-at-beginning-of-line' (\\c>) | ||
| 645 | `alpha-numeric-two-byte' (\\cA) | ||
| 646 | `chinse-two-byte' (\\cC) | ||
| 647 | `greek-two-byte' (\\cG) | ||
| 648 | `japanese-hiragana-two-byte' (\\cH) | ||
| 649 | `indian-two-byte' (\\cI) | ||
| 650 | `japanese-katakana-two-byte' (\\cK) | ||
| 651 | `korean-hangul-two-byte' (\\cN) | ||
| 652 | `cyrillic-two-byte' (\\cY) | ||
| 653 | `ascii' (\\ca) | ||
| 654 | `arabic' (\\cb) | ||
| 655 | `chinese' (\\cc) | ||
| 656 | `ethiopic' (\\ce) | ||
| 657 | `greek' (\\cg) | ||
| 658 | `korean' (\\ch) | ||
| 659 | `indian' (\\ci) | ||
| 660 | `japanese' (\\cj) | ||
| 661 | `japanese-katakana' (\\ck) | ||
| 662 | `latin' (\\cl) | ||
| 663 | `lao' (\\co) | ||
| 664 | `tibetan' (\\cq) | ||
| 665 | `japanese-roman' (\\cr) | ||
| 666 | `thai' (\\ct) | ||
| 667 | `vietnamese' (\\cv) | ||
| 668 | `hebrew' (\\cw) | ||
| 669 | `cyrillic' (\\cy) | ||
| 670 | `can-break' (\\c|) | ||
| 671 | |||
| 672 | `(not (category CATEGORY))' | ||
| 673 | matches a character that does not have category CATEGORY. | ||
| 674 | |||
| 675 | `(and SEXP1 SEXP2 ...)' | ||
| 676 | matches what SEXP1 matches, followed by what SEXP2 matches, etc. | ||
| 677 | |||
| 678 | `(submatch SEXP1 SEXP2 ...)' | ||
| 679 | like `and', but makes the match accessible with `match-end', | ||
| 680 | `match-beginning', and `match-string'. | ||
| 681 | |||
| 682 | `(group SEXP1 SEXP2 ...)' | ||
| 683 | another name for `submatch'. | ||
| 684 | |||
| 685 | `(or SEXP1 SEXP2 ...)' | ||
| 686 | matches anything that matches SEXP1 or SEXP2, etc. If all | ||
| 687 | args are strings, use `regexp-opt' to optimize the resulting | ||
| 688 | regular expression. | ||
| 689 | |||
| 690 | `(minimal-match SEXP)' | ||
| 691 | produce a non-greedy regexp for SEXP. Normally, regexps matching | ||
| 692 | zero or more occurrences of something are \"greedy\" in that they | ||
| 693 | match as much as they can, as long as the overall regexp can | ||
| 694 | still match. A non-greedy regexp matches as little as possible. | ||
| 695 | |||
| 696 | `(maximal-match SEXP)' | ||
| 697 | produce a greedy regexp for SEXP. This is the default. | ||
| 698 | |||
| 699 | `(zero-or-more SEXP)' | ||
| 700 | matches zero or more occurrences of what SEXP matches. | ||
| 701 | |||
| 702 | `(0+ SEXP)' | ||
| 703 | like `zero-or-more'. | ||
| 704 | |||
| 705 | `(* SEXP)' | ||
| 706 | like `zero-or-more', but always produces a greedy regexp. | ||
| 707 | |||
| 708 | `(*? SEXP)' | ||
| 709 | like `zero-or-more', but always produces a non-greedy regexp. | ||
| 710 | |||
| 711 | `(one-or-more SEXP)' | ||
| 712 | matches one or more occurrences of what SEXP matches. | ||
| 713 | |||
| 714 | `(1+ SEXP)' | ||
| 715 | like `one-or-more'. | ||
| 716 | |||
| 717 | `(+ SEXP)' | ||
| 718 | like `one-or-more', but always produces a greedy regexp. | ||
| 719 | |||
| 720 | `(+? SEXP)' | ||
| 721 | like `one-or-more', but always produces a non-greedy regexp. | ||
| 722 | |||
| 723 | `(zero-or-one SEXP)' | ||
| 724 | matches zero or one occurrences of what SEXP matches. | ||
| 725 | |||
| 726 | `(optional SEXP)' | ||
| 727 | like `zero-or-one'. | ||
| 728 | |||
| 729 | `(? SEXP)' | ||
| 730 | like `zero-or-one', but always produces a greedy regexp. | ||
| 731 | |||
| 732 | `(?? SEXP)' | ||
| 733 | like `zero-or-one', but always produces a non-greedy regexp. | ||
| 734 | |||
| 735 | `(repeat N SEXP)' | ||
| 736 | matches N occurrences of what SEXP matches. | ||
| 737 | |||
| 738 | `(repeat N M SEXP)' | ||
| 739 | matches N to M occurrences of what SEXP matches. | ||
| 740 | |||
| 741 | `(eval FORM)' | ||
| 742 | evaluate FORM and insert result. If result is a string, | ||
| 743 | `regexp-quote' it. | ||
| 744 | |||
| 745 | `(regexp REGEXP)' | ||
| 746 | include REGEXP in string notation in the result." | ||
| 747 | |||
| 748 | `(rx-to-string ',regexp)) | ||
| 749 | |||
| 750 | |||
| 751 | (provide 'rx) | ||
| 752 | |||
| 753 | ;;; rx.el ends here | ||