diff options
| author | Simon Marshall | 1997-06-06 07:10:24 +0000 |
|---|---|---|
| committer | Simon Marshall | 1997-06-06 07:10:24 +0000 |
| commit | 25544ce1bd4992b990365f8a09070f963aefb67d (patch) | |
| tree | 60ee4e61f351617d86afedcfa0003da85151c279 | |
| parent | 14aa11f4dae9557d582f4f66b233ae84fdf925e1 (diff) | |
| download | emacs-25544ce1bd4992b990365f8a09070f963aefb67d.tar.gz emacs-25544ce1bd4992b990365f8a09070f963aefb67d.zip | |
emit charsets after strings so that the final regexp finds the longest match.
| -rw-r--r-- | lisp/emacs-lisp/regexp-opt.el | 82 |
1 files changed, 45 insertions, 37 deletions
diff --git a/lisp/emacs-lisp/regexp-opt.el b/lisp/emacs-lisp/regexp-opt.el index adf8e9e8da7..140fd8005dd 100644 --- a/lisp/emacs-lisp/regexp-opt.el +++ b/lisp/emacs-lisp/regexp-opt.el | |||
| @@ -4,7 +4,7 @@ | |||
| 4 | 4 | ||
| 5 | ;; Author: Simon Marshall <simon@gnu.ai.mit.edu> | 5 | ;; Author: Simon Marshall <simon@gnu.ai.mit.edu> |
| 6 | ;; Keywords: strings, regexps | 6 | ;; Keywords: strings, regexps |
| 7 | ;; Version: 1.04.01 | 7 | ;; Version: 1.05 |
| 8 | 8 | ||
| 9 | ;; This file is part of GNU Emacs. | 9 | ;; This file is part of GNU Emacs. |
| 10 | 10 | ||
| @@ -27,10 +27,14 @@ | |||
| 27 | 27 | ||
| 28 | ;; The "opt" in "regexp-opt" stands for "optim\\(al\\|i\\(se\\|ze\\)\\)". | 28 | ;; The "opt" in "regexp-opt" stands for "optim\\(al\\|i\\(se\\|ze\\)\\)". |
| 29 | ;; | 29 | ;; |
| 30 | ;; This package generates a regexp from a given list of strings (that matches | 30 | ;; This package generates a regexp from a given list of strings (which matches |
| 31 | ;; one of those strings) that is equivalent to but more efficient than: | 31 | ;; one of those strings) so that the regexp generated by: |
| 32 | ;; | 32 | ;; |
| 33 | ;; (mapconcat 'identity (mapcar 'regexp-quote strings) "\\|") | 33 | ;; (regexp-opt strings) |
| 34 | ;; | ||
| 35 | ;; is equivalent to, but more efficient than, the regexp generated by: | ||
| 36 | ;; | ||
| 37 | ;; (mapconcat 'regexp-quote strings "\\|") | ||
| 34 | ;; | 38 | ;; |
| 35 | ;; For example: | 39 | ;; For example: |
| 36 | ;; | 40 | ;; |
| @@ -42,10 +46,9 @@ | |||
| 42 | ;; (concat "(" (regexp-opt strings t) "\\>")) | 46 | ;; (concat "(" (regexp-opt strings t) "\\>")) |
| 43 | ;; => "(\\(c\\(atch\\|ond\\(ition-case\\)?\\)\\|if\\|let\\*?\\|prog[12n]\\|save-\\(current-buffer\\|excursion\\|match-data\\|restriction\\|window-excursion\\)\\|throw\\|un\\(less\\|wind-protect\\)\\|wh\\(en\\|ile\\)\\)\\>" | 47 | ;; => "(\\(c\\(atch\\|ond\\(ition-case\\)?\\)\\|if\\|let\\*?\\|prog[12n]\\|save-\\(current-buffer\\|excursion\\|match-data\\|restriction\\|window-excursion\\)\\|throw\\|un\\(less\\|wind-protect\\)\\|wh\\(en\\|ile\\)\\)\\>" |
| 44 | ;; | 48 | ;; |
| 45 | ;; Searching using the above example `regexp-opt' regexp is significantly | 49 | ;; Searching using the above example `regexp-opt' regexp takes approximately |
| 46 | ;; faster than searching using the equivalent `mapconcat' regexp, taking | 50 | ;; two-thirds of the time taken using the equivalent `mapconcat' regexp. |
| 47 | ;; approximately two-thirds of the time. | 51 | |
| 48 | ;; | ||
| 49 | ;; Since this package was written to produce efficient regexps, not regexps | 52 | ;; Since this package was written to produce efficient regexps, not regexps |
| 50 | ;; efficiently, it is probably not a good idea to in-line too many calls in | 53 | ;; efficiently, it is probably not a good idea to in-line too many calls in |
| 51 | ;; your code, unless you use the following trick with `eval-when-compile': | 54 | ;; your code, unless you use the following trick with `eval-when-compile': |
| @@ -62,7 +65,15 @@ | |||
| 62 | ;; (defvar definition-regexp | 65 | ;; (defvar definition-regexp |
| 63 | ;; "^(\\(def\\(alias\\|const\\|macro\\|subst\\|un\\|var\\)\\)\\>") | 66 | ;; "^(\\(def\\(alias\\|const\\|macro\\|subst\\|un\\|var\\)\\)\\>") |
| 64 | ;; | 67 | ;; |
| 65 | ;; Originally written for font-lock.el, from an idea from Stig's hl319.el. | 68 | ;; Note that if you use this trick for all instances of `regexp-opt' and |
| 69 | ;; `regexp-opt-depth' in your code, regexp-opt.el would only have to be loaded | ||
| 70 | ;; at compile time. But note also that using this trick means that should | ||
| 71 | ;; regexp-opt.el be changed, perhaps to fix a bug or to add a feature to | ||
| 72 | ;; improve the efficiency of `regexp-opt' regexps, you would have to recompile | ||
| 73 | ;; your code for such changes to have effect in your code. | ||
| 74 | |||
| 75 | ;; Originally written for font-lock.el, from an idea from Stig's hl319.el, with | ||
| 76 | ;; thanks for ideas also to Michael Ernst, Bob Glickstein and Dan Nicolaescu. | ||
| 66 | ;; Please don't tell me that it doesn't produce optimal regexps; I know that | 77 | ;; Please don't tell me that it doesn't produce optimal regexps; I know that |
| 67 | ;; already. For example, the above explanation for the meaning of "opt" would | 78 | ;; already. For example, the above explanation for the meaning of "opt" would |
| 68 | ;; be more efficient as "optim\\(al\\|i[sz]e\\)", but this requires complex | 79 | ;; be more efficient as "optim\\(al\\|i[sz]e\\)", but this requires complex |
| @@ -73,15 +84,16 @@ | |||
| 73 | ;;;###autoload | 84 | ;;;###autoload |
| 74 | (defun regexp-opt (strings &optional paren) | 85 | (defun regexp-opt (strings &optional paren) |
| 75 | "Return a regexp to match a string in STRINGS. | 86 | "Return a regexp to match a string in STRINGS. |
| 87 | Each string should be unique in STRINGS and should not contain any regexps. | ||
| 76 | If optional PAREN non-nil, ensure that the returned regexp is enclosed by at | 88 | If optional PAREN non-nil, ensure that the returned regexp is enclosed by at |
| 77 | least one regexp grouping construct. | 89 | least one regexp grouping construct. |
| 78 | Each string in STRINGS should be unique and should not contain any regexps. | ||
| 79 | The returned regexp is typically more efficient than the equivalent regexp: | 90 | The returned regexp is typically more efficient than the equivalent regexp: |
| 80 | 91 | ||
| 81 | (mapconcat 'identity (mapcar 'regexp-quote STRINGS) \"\\\\|\") | 92 | (let ((open-paren (if PAREN \"\\\\(\" \"\")) (close-paren (if PAREN \"\\\\)\" \"\"))) |
| 93 | (concat open-paren (mapconcat 'regexp-quote STRINGS \"\\\\|\") close-paren)) | ||
| 82 | 94 | ||
| 83 | but typically contains regexp grouping constructs. Use `regexp-opt-depth' to | 95 | but typically contains more regexp grouping constructs. |
| 84 | count them." | 96 | Use `regexp-opt-depth' to count them." |
| 85 | (save-match-data | 97 | (save-match-data |
| 86 | ;; Recurse on the sorted list. | 98 | ;; Recurse on the sorted list. |
| 87 | (let ((max-lisp-eval-depth (* 1024 1024)) | 99 | (let ((max-lisp-eval-depth (* 1024 1024)) |
| @@ -165,12 +177,12 @@ in REGEXP." | |||
| 165 | close-group))) | 177 | close-group))) |
| 166 | ;; | 178 | ;; |
| 167 | ;; If there are several one-character strings, remove them and recurse | 179 | ;; If there are several one-character strings, remove them and recurse |
| 168 | ;; on the rest. | 180 | ;; on the rest (first so the final regexp finds the longest match). |
| 169 | ((> (length letters) 1) | 181 | ((> (length letters) 1) |
| 170 | (let ((rest (let ((completion-regexp-list '("^..+$"))) | 182 | (let ((rest (let ((completion-regexp-list '("^..+$"))) |
| 171 | (all-completions "" (mapcar 'list strings))))) | 183 | (all-completions "" (mapcar 'list strings))))) |
| 172 | (concat open-group | 184 | (concat open-group |
| 173 | (regexp-opt-charset letters) "\\|" (regexp-opt-group rest) | 185 | (regexp-opt-group rest) "\\|" (regexp-opt-charset letters) |
| 174 | close-group))) | 186 | close-group))) |
| 175 | ;; | 187 | ;; |
| 176 | ;; Otherwise, divide the list into those that start with a particular | 188 | ;; Otherwise, divide the list into those that start with a particular |
| @@ -196,30 +208,26 @@ in REGEXP." | |||
| 196 | (bracket "") (dash "") (caret "")) | 208 | (bracket "") (dash "") (caret "")) |
| 197 | ;; | 209 | ;; |
| 198 | ;; Make a character map but extract character set meta characters. | 210 | ;; Make a character map but extract character set meta characters. |
| 199 | (let (char) | 211 | (dolist (char (mapcar 'string-to-char chars)) |
| 200 | (while chars | 212 | (case char |
| 201 | (setq char (string-to-char (pop chars))) | 213 | (?\] |
| 202 | (cond ((eq char ?\]) | 214 | (setq bracket "]")) |
| 203 | (setq bracket "]")) | 215 | (?^ |
| 204 | ((eq char ?^) | 216 | (setq caret "^")) |
| 205 | (setq caret "^")) | 217 | (?- |
| 206 | ((eq char ?-) | 218 | (setq dash "-")) |
| 207 | (setq dash "-")) | 219 | (otherwise |
| 208 | (t | 220 | (aset charmap char t)))) |
| 209 | (aset charmap char t))))) | ||
| 210 | ;; | 221 | ;; |
| 211 | ;; Make a character set from the map using ranges where applicable. | 222 | ;; Make a character set from the map using ranges where applicable. |
| 212 | (let ((elt 0) start) | 223 | (dotimes (elt charwidth) |
| 213 | (while (< elt charwidth) | 224 | (when (aref charmap elt) |
| 214 | (when (aref charmap elt) | 225 | (let ((start elt)) |
| 215 | (setq start (1+ elt)) | 226 | (while (and (< elt charwidth) (aref charmap elt)) |
| 216 | (while (and (< start charwidth) (aref charmap start)) | 227 | (incf elt)) |
| 217 | (incf start)) | 228 | (if (> (- elt start) 3) |
| 218 | (if (< (- start elt) 4) | 229 | (setq charset (format "%s%c-%c" charset start (1- elt))) |
| 219 | (setq charset (format "%s%c" charset elt)) | 230 | (setq charset (format "%s%c" charset (setq elt start))))))) |
| 220 | (setq charset (format "%s%c-%c" charset elt (1- start)) | ||
| 221 | elt start))) | ||
| 222 | (incf elt))) | ||
| 223 | ;; | 231 | ;; |
| 224 | ;; Make sure a caret is not first and a dash is first or last. | 232 | ;; Make sure a caret is not first and a dash is first or last. |
| 225 | (if (and (string-equal charset "") (string-equal bracket "")) | 233 | (if (and (string-equal charset "") (string-equal bracket "")) |