diff options
| author | Artur Malabarba | 2015-11-28 12:15:17 +0000 |
|---|---|---|
| committer | Artur Malabarba | 2015-11-28 16:21:54 +0000 |
| commit | 5c5997002b0b0aded744d5828158243fd546b3ec (patch) | |
| tree | 796917c909ee36f2ae5dc64cda6c0c78cb9c0155 | |
| parent | 35c7dc06ed1bd27fc69b2a79e88a8cfb2a6b3dae (diff) | |
| download | emacs-5c5997002b0b0aded744d5828158243fd546b3ec.tar.gz emacs-5c5997002b0b0aded744d5828158243fd546b3ec.zip | |
* lisp/character-fold.el: Add support for multi-char matches
(character-fold-table): Now has an extra-slot. This is a second
char-table that holds multi-character matches. See docstring for
details.
(character-fold-to-regexp): Can build branching regexps when a
character's entry the extra slot of `character-fold-table' matches the
characters that succeed it.
| -rw-r--r-- | lisp/character-fold.el | 173 | ||||
| -rw-r--r-- | test/automated/character-fold-tests.el | 45 |
2 files changed, 149 insertions, 69 deletions
diff --git a/lisp/character-fold.el b/lisp/character-fold.el index 749d1135ce5..0086345cccb 100644 --- a/lisp/character-fold.el +++ b/lisp/character-fold.el | |||
| @@ -22,11 +22,15 @@ | |||
| 22 | 22 | ||
| 23 | ;;; Code: | 23 | ;;; Code: |
| 24 | 24 | ||
| 25 | (eval-and-compile (put 'character-fold-table 'char-table-extra-slots 1)) | ||
| 25 | 26 | ||
| 26 | (defconst character-fold-table | 27 | (defconst character-fold-table |
| 27 | (eval-when-compile | 28 | (eval-when-compile |
| 28 | (let ((equiv (make-char-table 'character-fold-table)) | 29 | (let ((equiv (make-char-table 'character-fold-table)) |
| 30 | (equiv-multi (make-char-table 'character-fold-table)) | ||
| 29 | (table (unicode-property-table-internal 'decomposition))) | 31 | (table (unicode-property-table-internal 'decomposition))) |
| 32 | (set-char-table-extra-slot equiv 0 equiv-multi) | ||
| 33 | |||
| 30 | ;; Ensure the table is populated. | 34 | ;; Ensure the table is populated. |
| 31 | (let ((func (char-table-extra-slot table 1))) | 35 | (let ((func (char-table-extra-slot table 1))) |
| 32 | (map-char-table (lambda (char v) | 36 | (map-char-table (lambda (char v) |
| @@ -36,63 +40,61 @@ | |||
| 36 | 40 | ||
| 37 | ;; Compile a list of all complex characters that each simple | 41 | ;; Compile a list of all complex characters that each simple |
| 38 | ;; character should match. | 42 | ;; character should match. |
| 43 | ;; In summary this loop does 3 things: | ||
| 44 | ;; - A complex character might be allowed to match its decomp. | ||
| 45 | ;; - The decomp is allowed to match the complex character. | ||
| 46 | ;; - A single char of the decomp might be allowed to match the | ||
| 47 | ;; character. | ||
| 48 | ;; Some examples in the comments below. | ||
| 39 | (map-char-table | 49 | (map-char-table |
| 40 | (lambda (char decomp) | 50 | (lambda (char decomp) |
| 41 | (when (consp decomp) | 51 | (when (consp decomp) |
| 42 | (if (symbolp (car decomp)) | ||
| 43 | ;; Discard a possible formatting tag. | ||
| 44 | (setq decomp (cdr decomp)) | ||
| 45 | ;; If there's no formatting tag, ensure that char matches | ||
| 46 | ;; its decomp exactly. This is because we want 'ä' to | ||
| 47 | ;; match 'ä', but we don't want '¹' to match '1'. | ||
| 48 | (aset equiv char | ||
| 49 | (cons (apply #'string decomp) | ||
| 50 | (aref equiv char)))) | ||
| 51 | ;; Finally, figure out whether char has a simpler | ||
| 52 | ;; equivalent (char-aux). If so, ensure that char-aux | ||
| 53 | ;; matches char and maybe its decomp too. | ||
| 54 | |||
| 55 | ;; Skip trivial cases like ?a decomposing to (?a). | 52 | ;; Skip trivial cases like ?a decomposing to (?a). |
| 56 | (unless (or (and (eq char (car decomp)) | 53 | (unless (and (not (cdr decomp)) |
| 57 | (not (cdr decomp)))) | 54 | (eq char (car decomp))) |
| 58 | (let ((dec-aux decomp) | 55 | (if (symbolp (car decomp)) |
| 59 | (fold-decomp t) | 56 | ;; Discard a possible formatting tag. |
| 60 | char-aux found) | 57 | (setq decomp (cdr decomp)) |
| 61 | (while (and dec-aux (not found)) | 58 | ;; If there's no formatting tag, ensure that char matches |
| 62 | (setq char-aux (pop dec-aux)) | 59 | ;; its decomp exactly. This is because we want 'ä' to |
| 63 | ;; Is char-aux a number or letter, per unicode standard? | 60 | ;; match 'ä', but we don't want '¹' to match '1'. |
| 64 | (setq found (memq (get-char-code-property char-aux 'general-category) | 61 | (aset equiv char |
| 65 | '(Lu Ll Lt Lm Lo Nd Nl No)))) | 62 | (cons (apply #'string decomp) |
| 66 | (if found | 63 | (aref equiv char)))) |
| 67 | ;; Check if the decomp has more than one letter, | 64 | |
| 68 | ;; because then we don't want the first letter to | 65 | ;; Allow the entire decomp to match char. If decomp has |
| 69 | ;; match the decomposition. This is because we | 66 | ;; multiple characters, this is done by adding an entry |
| 70 | ;; want 'f' to match 'ff' but not 'ff'. | 67 | ;; to the alist of the first character in decomp. This |
| 71 | (dolist (char-aux dec-aux) | 68 | ;; allows 'ff' to match 'ff', 'ä' to match 'ä', and '1' to |
| 72 | (when (and fold-decomp | 69 | ;; match '¹'. |
| 73 | (memq (get-char-code-property char-aux 'general-category) | 70 | (let ((make-decomp-match-char |
| 74 | '(Lu Ll Lt Lm Lo Nd Nl No))) | 71 | (lambda (decomp char) |
| 75 | (setq fold-decomp nil))) | 72 | (if (cdr decomp) |
| 76 | ;; If there's no number or letter on the | 73 | (aset equiv-multi (car decomp) |
| 77 | ;; decomp, take the first character in it. | 74 | (cons (cons (apply #'string (cdr decomp)) |
| 78 | (setq found (car-safe decomp))) | 75 | (regexp-quote (string char))) |
| 79 | ;; Finally, we only fold multi-char decomp if at | 76 | (aref equiv-multi (car decomp)))) |
| 80 | ;; least one of the chars is non-spacing (combining). | 77 | (aset equiv (car decomp) |
| 81 | (when fold-decomp | 78 | (cons (char-to-string char) |
| 82 | (setq fold-decomp nil) | 79 | (aref equiv (car decomp)))))))) |
| 83 | (dolist (char-aux decomp) | 80 | (funcall make-decomp-match-char decomp char) |
| 84 | (when (and (not fold-decomp) | 81 | ;; Do it again, without the non-spacing characters. |
| 85 | (> (get-char-code-property char-aux 'canonical-combining-class) 0)) | 82 | ;; This allows 'a' to match 'ä'. |
| 86 | (setq fold-decomp t)))) | 83 | (let ((simpler-decomp nil) |
| 87 | ;; Add char to the list of characters that char-aux can | 84 | (found-one nil)) |
| 88 | ;; represent. Also possibly add its decomp, so we can | 85 | (dolist (c decomp) |
| 89 | ;; match multi-char representations like (format "a%c" 769) | 86 | (if (> (get-char-code-property c 'canonical-combining-class) 0) |
| 90 | (when (and found (not (eq char char-aux))) | 87 | (setq found-one t) |
| 91 | (let ((chars (cons (char-to-string char) (aref equiv char-aux)))) | 88 | (push c simpler-decomp))) |
| 92 | (aset equiv char-aux | 89 | (when (and simpler-decomp found-one) |
| 93 | (if fold-decomp | 90 | (funcall make-decomp-match-char simpler-decomp char) |
| 94 | (cons (apply #'string decomp) chars) | 91 | ;; Finally, if the decomp only had one spacing |
| 95 | chars)))))))) | 92 | ;; character, we allow this character to match the |
| 93 | ;; decomp. This is to let 'a' match 'ä'. | ||
| 94 | (unless (cdr simpler-decomp) | ||
| 95 | (aset equiv (car simpler-decomp) | ||
| 96 | (cons (apply #'string decomp) | ||
| 97 | (aref equiv (car simpler-decomp))))))))))) | ||
| 96 | table) | 98 | table) |
| 97 | 99 | ||
| 98 | ;; Add some manual entries. | 100 | ;; Add some manual entries. |
| @@ -112,7 +114,27 @@ | |||
| 112 | (aset equiv char re)))) | 114 | (aset equiv char re)))) |
| 113 | equiv) | 115 | equiv) |
| 114 | equiv)) | 116 | equiv)) |
| 115 | "Used for folding characters of the same group during search.") | 117 | "Used for folding characters of the same group during search. |
| 118 | This is a char-table with the `character-fold-table' subtype. | ||
| 119 | |||
| 120 | Let us refer to the character in question by char-x. | ||
| 121 | Each entry is either nil (meaning char-x only matches literally) | ||
| 122 | or a regexp. This regexp should match anything that char-x can | ||
| 123 | match by itself \(including char-x). For instance, the default | ||
| 124 | regexp for the ?+ character is \"[+⁺₊﬩﹢+]\". | ||
| 125 | |||
| 126 | This table also has one extra slot which is also a char-table. | ||
| 127 | Each entry in the extra slot is an alist used for multi-character | ||
| 128 | matching (which may be nil). The elements of the alist should | ||
| 129 | have the form (SUFFIX . OTHER-REGEXP). If the characters after | ||
| 130 | char-x are equal to SUFFIX, then this combination of char-x + | ||
| 131 | SUFFIX is allowed to match OTHER-REGEXP. This is in addition to | ||
| 132 | char-x being allowed to match REGEXP. | ||
| 133 | For instance, the default alist for ?f includes: | ||
| 134 | \((\"fl\" . \"ffl\") (\"fi\" . \"ffi\") | ||
| 135 | (\"i\" . \"fi\") (\"f\" . \"ff\")) | ||
| 136 | |||
| 137 | Exceptionally for the space character (32), ALIST is ignored.") | ||
| 116 | 138 | ||
| 117 | (defun character-fold--make-space-string (n) | 139 | (defun character-fold--make-space-string (n) |
| 118 | "Return a string that matches N spaces." | 140 | "Return a string that matches N spaces." |
| @@ -122,13 +144,17 @@ | |||
| 122 | (make-list n (or (aref character-fold-table ?\s) " "))))) | 144 | (make-list n (or (aref character-fold-table ?\s) " "))))) |
| 123 | 145 | ||
| 124 | ;;;###autoload | 146 | ;;;###autoload |
| 125 | (defun character-fold-to-regexp (string &optional _lax) | 147 | (defun character-fold-to-regexp (string &optional _lax from) |
| 126 | "Return a regexp matching anything that character-folds into STRING. | 148 | "Return a regexp matching anything that character-folds into STRING. |
| 127 | Any character in STRING that has an entry in | 149 | Any character in STRING that has an entry in |
| 128 | `character-fold-table' is replaced with that entry (which is a | 150 | `character-fold-table' is replaced with that entry (which is a |
| 129 | regexp) and other characters are `regexp-quote'd." | 151 | regexp) and other characters are `regexp-quote'd. |
| 152 | |||
| 153 | FROM is for internal use. It specifies an index in the STRING | ||
| 154 | from which to start." | ||
| 130 | (let ((spaces 0) | 155 | (let ((spaces 0) |
| 131 | (i 0) | 156 | (multi-char-table (char-table-extra-slot character-fold-table 0)) |
| 157 | (i (or from 0)) | ||
| 132 | (end (length string)) | 158 | (end (length string)) |
| 133 | (out nil)) | 159 | (out nil)) |
| 134 | ;; When the user types a space, we want to match the table entry | 160 | ;; When the user types a space, we want to match the table entry |
| @@ -145,9 +171,36 @@ regexp) and other characters are `regexp-quote'd." | |||
| 145 | (c (when (> spaces 0) | 171 | (c (when (> spaces 0) |
| 146 | (push (character-fold--make-space-string spaces) out) | 172 | (push (character-fold--make-space-string spaces) out) |
| 147 | (setq spaces 0)) | 173 | (setq spaces 0)) |
| 148 | (push (or (aref character-fold-table c) | 174 | (let ((regexp (or (aref character-fold-table c) |
| 149 | (regexp-quote (string c))) | 175 | (regexp-quote (string c)))) |
| 150 | out))) | 176 | ;; Long string. The regexp would probably be too long. |
| 177 | (alist (unless (> end 60) | ||
| 178 | (aref multi-char-table c)))) | ||
| 179 | (push (let ((alist-out '("\\)"))) | ||
| 180 | (pcase-dolist (`(,suffix . ,out-regexp) alist) | ||
| 181 | (let ((len-suf (length suffix))) | ||
| 182 | (when (eq (compare-strings suffix 0 nil | ||
| 183 | string (1+ i) (+ i 1 len-suf) | ||
| 184 | nil) | ||
| 185 | t) | ||
| 186 | ;; FIXME: If N suffixes match, we "branch" | ||
| 187 | ;; out into N+1 executions for the rest of | ||
| 188 | ;; the string. This involves redundant | ||
| 189 | ;; work and makes a huge regexp. | ||
| 190 | (push (concat "\\|" out-regexp | ||
| 191 | (character-fold-to-regexp | ||
| 192 | string nil (+ i 1 len-suf))) | ||
| 193 | alist-out)))) | ||
| 194 | ;; If no suffixes matched, just go on. | ||
| 195 | (if (not (cdr alist-out)) | ||
| 196 | regexp | ||
| 197 | ;; Otherwise, add a branch for the | ||
| 198 | ;; no-suffix case, and stop the loop here. | ||
| 199 | (prog1 (apply #'concat "\\(?:" regexp | ||
| 200 | (character-fold-to-regexp string nil (1+ i)) | ||
| 201 | alist-out) | ||
| 202 | (setq i end)))) | ||
| 203 | out)))) | ||
| 151 | (setq i (1+ i))) | 204 | (setq i (1+ i))) |
| 152 | (when (> spaces 0) | 205 | (when (> spaces 0) |
| 153 | (push (character-fold--make-space-string spaces) out)) | 206 | (push (character-fold--make-space-string spaces) out)) |
diff --git a/test/automated/character-fold-tests.el b/test/automated/character-fold-tests.el index 40f0aecf449..aa2ee96a7a4 100644 --- a/test/automated/character-fold-tests.el +++ b/test/automated/character-fold-tests.el | |||
| @@ -43,7 +43,7 @@ | |||
| 43 | (character-fold--test-search-with-contents w w)))) | 43 | (character-fold--test-search-with-contents w w)))) |
| 44 | 44 | ||
| 45 | (ert-deftest character-fold--test-lax-whitespace () | 45 | (ert-deftest character-fold--test-lax-whitespace () |
| 46 | (dotimes (n 100) | 46 | (dotimes (n 50) |
| 47 | (let ((w1 (character-fold--random-word n)) | 47 | (let ((w1 (character-fold--random-word n)) |
| 48 | (w2 (character-fold--random-word n)) | 48 | (w2 (character-fold--random-word n)) |
| 49 | (search-spaces-regexp "\\s-+")) | 49 | (search-spaces-regexp "\\s-+")) |
| @@ -52,17 +52,44 @@ | |||
| 52 | (concat w1 " " w2)) | 52 | (concat w1 " " w2)) |
| 53 | (character-fold--test-search-with-contents | 53 | (character-fold--test-search-with-contents |
| 54 | (concat w1 "\s\n\s\t\f\t\n\r\t" w2) | 54 | (concat w1 "\s\n\s\t\f\t\n\r\t" w2) |
| 55 | (concat w1 (make-string 90 ?\s) w2))))) | 55 | (concat w1 (make-string 10 ?\s) w2))))) |
| 56 | |||
| 57 | (defun character-fold--test-match-exactly (string &rest strings-to-match) | ||
| 58 | (let ((re (concat "\\`" (character-fold-to-regexp string) "\\'"))) | ||
| 59 | (dolist (it strings-to-match) | ||
| 60 | (should (string-match re it))))) | ||
| 61 | |||
| 62 | (ert-deftest character-fold--test-some-defaults () | ||
| 63 | (dolist (it '(("ffl" . "ffl") ("ffi" . "ffi") | ||
| 64 | ("fi" . "fi") ("ff" . "ff") | ||
| 65 | ("ä" . "ä"))) | ||
| 66 | (character-fold--test-search-with-contents (cdr it) (car it)) | ||
| 67 | (let ((multi (char-table-extra-slot character-fold-table 0)) | ||
| 68 | (character-fold-table (make-char-table 'character-fold-table))) | ||
| 69 | (set-char-table-extra-slot character-fold-table 0 multi) | ||
| 70 | (character-fold--test-match-exactly (car it) (cdr it))))) | ||
| 56 | 71 | ||
| 57 | (ert-deftest character-fold--test-fold-to-regexp () | 72 | (ert-deftest character-fold--test-fold-to-regexp () |
| 58 | (let ((character-fold-table (make-char-table 'character-fold-table))) | 73 | (let ((character-fold-table (make-char-table 'character-fold-table)) |
| 59 | (aset character-fold-table ?a "abc") | 74 | (multi (make-char-table 'character-fold-table))) |
| 60 | (aset character-fold-table ?1 "123") | 75 | (set-char-table-extra-slot character-fold-table 0 multi) |
| 76 | (aset character-fold-table ?a "xx") | ||
| 77 | (aset character-fold-table ?1 "44") | ||
| 61 | (aset character-fold-table ?\s "-!-") | 78 | (aset character-fold-table ?\s "-!-") |
| 62 | (should (equal (character-fold-to-regexp "a1a1") | 79 | (character-fold--test-match-exactly "a1a1" "xx44xx44") |
| 63 | "abc123abc123")) | 80 | (character-fold--test-match-exactly "a1 a 1" "xx44-!--!-xx-!-44") |
| 64 | (should (equal (character-fold-to-regexp "a1 a 1") | 81 | (aset multi ?a '(("1" . "99") |
| 65 | "abc123\\(?: \\|-!--!-\\)abc\\(?: \\|-!-\\)123")))) | 82 | ("2" . "88") |
| 83 | ("12" . "77"))) | ||
| 84 | (character-fold--test-match-exactly "a" "xx") | ||
| 85 | (character-fold--test-match-exactly "a1" "xx44" "99") | ||
| 86 | (character-fold--test-match-exactly "a12" "77" "xx442" "992") | ||
| 87 | (character-fold--test-match-exactly "a2" "88") | ||
| 88 | (aset multi ?1 '(("2" . "yy"))) | ||
| 89 | (character-fold--test-match-exactly "a1" "xx44" "99") | ||
| 90 | (character-fold--test-match-exactly "a12" "77" "xx442" "992") | ||
| 91 | (character-fold--test-match-exactly "a12" "xxyy"))) | ||
| 92 | |||
| 66 | 93 | ||
| 67 | (provide 'character-fold-tests) | 94 | (provide 'character-fold-tests) |
| 68 | ;;; character-fold-tests.el ends here | 95 | ;;; character-fold-tests.el ends here |