aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorArtur Malabarba2015-11-28 12:15:17 +0000
committerArtur Malabarba2015-11-28 16:21:54 +0000
commit5c5997002b0b0aded744d5828158243fd546b3ec (patch)
tree796917c909ee36f2ae5dc64cda6c0c78cb9c0155
parent35c7dc06ed1bd27fc69b2a79e88a8cfb2a6b3dae (diff)
downloademacs-5c5997002b0b0aded744d5828158243fd546b3ec.tar.gz
emacs-5c5997002b0b0aded744d5828158243fd546b3ec.zip
* lisp/character-fold.el: Add support for multi-char matches
(character-fold-table): Now has an extra-slot. This is a second char-table that holds multi-character matches. See docstring for details. (character-fold-to-regexp): Can build branching regexps when a character's entry the extra slot of `character-fold-table' matches the characters that succeed it.
-rw-r--r--lisp/character-fold.el173
-rw-r--r--test/automated/character-fold-tests.el45
2 files changed, 149 insertions, 69 deletions
diff --git a/lisp/character-fold.el b/lisp/character-fold.el
index 749d1135ce5..0086345cccb 100644
--- a/lisp/character-fold.el
+++ b/lisp/character-fold.el
@@ -22,11 +22,15 @@
22 22
23;;; Code: 23;;; Code:
24 24
25(eval-and-compile (put 'character-fold-table 'char-table-extra-slots 1))
25 26
26(defconst character-fold-table 27(defconst character-fold-table
27 (eval-when-compile 28 (eval-when-compile
28 (let ((equiv (make-char-table 'character-fold-table)) 29 (let ((equiv (make-char-table 'character-fold-table))
30 (equiv-multi (make-char-table 'character-fold-table))
29 (table (unicode-property-table-internal 'decomposition))) 31 (table (unicode-property-table-internal 'decomposition)))
32 (set-char-table-extra-slot equiv 0 equiv-multi)
33
30 ;; Ensure the table is populated. 34 ;; Ensure the table is populated.
31 (let ((func (char-table-extra-slot table 1))) 35 (let ((func (char-table-extra-slot table 1)))
32 (map-char-table (lambda (char v) 36 (map-char-table (lambda (char v)
@@ -36,63 +40,61 @@
36 40
37 ;; Compile a list of all complex characters that each simple 41 ;; Compile a list of all complex characters that each simple
38 ;; character should match. 42 ;; character should match.
43 ;; In summary this loop does 3 things:
44 ;; - A complex character might be allowed to match its decomp.
45 ;; - The decomp is allowed to match the complex character.
46 ;; - A single char of the decomp might be allowed to match the
47 ;; character.
48 ;; Some examples in the comments below.
39 (map-char-table 49 (map-char-table
40 (lambda (char decomp) 50 (lambda (char decomp)
41 (when (consp decomp) 51 (when (consp decomp)
42 (if (symbolp (car decomp))
43 ;; Discard a possible formatting tag.
44 (setq decomp (cdr decomp))
45 ;; If there's no formatting tag, ensure that char matches
46 ;; its decomp exactly. This is because we want 'ä' to
47 ;; match 'ä', but we don't want '¹' to match '1'.
48 (aset equiv char
49 (cons (apply #'string decomp)
50 (aref equiv char))))
51 ;; Finally, figure out whether char has a simpler
52 ;; equivalent (char-aux). If so, ensure that char-aux
53 ;; matches char and maybe its decomp too.
54
55 ;; Skip trivial cases like ?a decomposing to (?a). 52 ;; Skip trivial cases like ?a decomposing to (?a).
56 (unless (or (and (eq char (car decomp)) 53 (unless (and (not (cdr decomp))
57 (not (cdr decomp)))) 54 (eq char (car decomp)))
58 (let ((dec-aux decomp) 55 (if (symbolp (car decomp))
59 (fold-decomp t) 56 ;; Discard a possible formatting tag.
60 char-aux found) 57 (setq decomp (cdr decomp))
61 (while (and dec-aux (not found)) 58 ;; If there's no formatting tag, ensure that char matches
62 (setq char-aux (pop dec-aux)) 59 ;; its decomp exactly. This is because we want 'ä' to
63 ;; Is char-aux a number or letter, per unicode standard? 60 ;; match 'ä', but we don't want '¹' to match '1'.
64 (setq found (memq (get-char-code-property char-aux 'general-category) 61 (aset equiv char
65 '(Lu Ll Lt Lm Lo Nd Nl No)))) 62 (cons (apply #'string decomp)
66 (if found 63 (aref equiv char))))
67 ;; Check if the decomp has more than one letter, 64
68 ;; because then we don't want the first letter to 65 ;; Allow the entire decomp to match char. If decomp has
69 ;; match the decomposition. This is because we 66 ;; multiple characters, this is done by adding an entry
70 ;; want 'f' to match 'ff' but not 'ff'. 67 ;; to the alist of the first character in decomp. This
71 (dolist (char-aux dec-aux) 68 ;; allows 'ff' to match 'ff', 'ä' to match 'ä', and '1' to
72 (when (and fold-decomp 69 ;; match '¹'.
73 (memq (get-char-code-property char-aux 'general-category) 70 (let ((make-decomp-match-char
74 '(Lu Ll Lt Lm Lo Nd Nl No))) 71 (lambda (decomp char)
75 (setq fold-decomp nil))) 72 (if (cdr decomp)
76 ;; If there's no number or letter on the 73 (aset equiv-multi (car decomp)
77 ;; decomp, take the first character in it. 74 (cons (cons (apply #'string (cdr decomp))
78 (setq found (car-safe decomp))) 75 (regexp-quote (string char)))
79 ;; Finally, we only fold multi-char decomp if at 76 (aref equiv-multi (car decomp))))
80 ;; least one of the chars is non-spacing (combining). 77 (aset equiv (car decomp)
81 (when fold-decomp 78 (cons (char-to-string char)
82 (setq fold-decomp nil) 79 (aref equiv (car decomp))))))))
83 (dolist (char-aux decomp) 80 (funcall make-decomp-match-char decomp char)
84 (when (and (not fold-decomp) 81 ;; Do it again, without the non-spacing characters.
85 (> (get-char-code-property char-aux 'canonical-combining-class) 0)) 82 ;; This allows 'a' to match 'ä'.
86 (setq fold-decomp t)))) 83 (let ((simpler-decomp nil)
87 ;; Add char to the list of characters that char-aux can 84 (found-one nil))
88 ;; represent. Also possibly add its decomp, so we can 85 (dolist (c decomp)
89 ;; match multi-char representations like (format "a%c" 769) 86 (if (> (get-char-code-property c 'canonical-combining-class) 0)
90 (when (and found (not (eq char char-aux))) 87 (setq found-one t)
91 (let ((chars (cons (char-to-string char) (aref equiv char-aux)))) 88 (push c simpler-decomp)))
92 (aset equiv char-aux 89 (when (and simpler-decomp found-one)
93 (if fold-decomp 90 (funcall make-decomp-match-char simpler-decomp char)
94 (cons (apply #'string decomp) chars) 91 ;; Finally, if the decomp only had one spacing
95 chars)))))))) 92 ;; character, we allow this character to match the
93 ;; decomp. This is to let 'a' match 'ä'.
94 (unless (cdr simpler-decomp)
95 (aset equiv (car simpler-decomp)
96 (cons (apply #'string decomp)
97 (aref equiv (car simpler-decomp)))))))))))
96 table) 98 table)
97 99
98 ;; Add some manual entries. 100 ;; Add some manual entries.
@@ -112,7 +114,27 @@
112 (aset equiv char re)))) 114 (aset equiv char re))))
113 equiv) 115 equiv)
114 equiv)) 116 equiv))
115 "Used for folding characters of the same group during search.") 117 "Used for folding characters of the same group during search.
118This is a char-table with the `character-fold-table' subtype.
119
120Let us refer to the character in question by char-x.
121Each entry is either nil (meaning char-x only matches literally)
122or a regexp. This regexp should match anything that char-x can
123match by itself \(including char-x). For instance, the default
124regexp for the ?+ character is \"[+⁺₊﬩﹢+]\".
125
126This table also has one extra slot which is also a char-table.
127Each entry in the extra slot is an alist used for multi-character
128matching (which may be nil). The elements of the alist should
129have the form (SUFFIX . OTHER-REGEXP). If the characters after
130char-x are equal to SUFFIX, then this combination of char-x +
131SUFFIX is allowed to match OTHER-REGEXP. This is in addition to
132char-x being allowed to match REGEXP.
133For instance, the default alist for ?f includes:
134 \((\"fl\" . \"ffl\") (\"fi\" . \"ffi\")
135 (\"i\" . \"fi\") (\"f\" . \"ff\"))
136
137Exceptionally for the space character (32), ALIST is ignored.")
116 138
117(defun character-fold--make-space-string (n) 139(defun character-fold--make-space-string (n)
118 "Return a string that matches N spaces." 140 "Return a string that matches N spaces."
@@ -122,13 +144,17 @@
122 (make-list n (or (aref character-fold-table ?\s) " "))))) 144 (make-list n (or (aref character-fold-table ?\s) " ")))))
123 145
124;;;###autoload 146;;;###autoload
125(defun character-fold-to-regexp (string &optional _lax) 147(defun character-fold-to-regexp (string &optional _lax from)
126 "Return a regexp matching anything that character-folds into STRING. 148 "Return a regexp matching anything that character-folds into STRING.
127Any character in STRING that has an entry in 149Any character in STRING that has an entry in
128`character-fold-table' is replaced with that entry (which is a 150`character-fold-table' is replaced with that entry (which is a
129regexp) and other characters are `regexp-quote'd." 151regexp) and other characters are `regexp-quote'd.
152
153FROM is for internal use. It specifies an index in the STRING
154from which to start."
130 (let ((spaces 0) 155 (let ((spaces 0)
131 (i 0) 156 (multi-char-table (char-table-extra-slot character-fold-table 0))
157 (i (or from 0))
132 (end (length string)) 158 (end (length string))
133 (out nil)) 159 (out nil))
134 ;; When the user types a space, we want to match the table entry 160 ;; When the user types a space, we want to match the table entry
@@ -145,9 +171,36 @@ regexp) and other characters are `regexp-quote'd."
145 (c (when (> spaces 0) 171 (c (when (> spaces 0)
146 (push (character-fold--make-space-string spaces) out) 172 (push (character-fold--make-space-string spaces) out)
147 (setq spaces 0)) 173 (setq spaces 0))
148 (push (or (aref character-fold-table c) 174 (let ((regexp (or (aref character-fold-table c)
149 (regexp-quote (string c))) 175 (regexp-quote (string c))))
150 out))) 176 ;; Long string. The regexp would probably be too long.
177 (alist (unless (> end 60)
178 (aref multi-char-table c))))
179 (push (let ((alist-out '("\\)")))
180 (pcase-dolist (`(,suffix . ,out-regexp) alist)
181 (let ((len-suf (length suffix)))
182 (when (eq (compare-strings suffix 0 nil
183 string (1+ i) (+ i 1 len-suf)
184 nil)
185 t)
186 ;; FIXME: If N suffixes match, we "branch"
187 ;; out into N+1 executions for the rest of
188 ;; the string. This involves redundant
189 ;; work and makes a huge regexp.
190 (push (concat "\\|" out-regexp
191 (character-fold-to-regexp
192 string nil (+ i 1 len-suf)))
193 alist-out))))
194 ;; If no suffixes matched, just go on.
195 (if (not (cdr alist-out))
196 regexp
197 ;; Otherwise, add a branch for the
198 ;; no-suffix case, and stop the loop here.
199 (prog1 (apply #'concat "\\(?:" regexp
200 (character-fold-to-regexp string nil (1+ i))
201 alist-out)
202 (setq i end))))
203 out))))
151 (setq i (1+ i))) 204 (setq i (1+ i)))
152 (when (> spaces 0) 205 (when (> spaces 0)
153 (push (character-fold--make-space-string spaces) out)) 206 (push (character-fold--make-space-string spaces) out))
diff --git a/test/automated/character-fold-tests.el b/test/automated/character-fold-tests.el
index 40f0aecf449..aa2ee96a7a4 100644
--- a/test/automated/character-fold-tests.el
+++ b/test/automated/character-fold-tests.el
@@ -43,7 +43,7 @@
43 (character-fold--test-search-with-contents w w)))) 43 (character-fold--test-search-with-contents w w))))
44 44
45(ert-deftest character-fold--test-lax-whitespace () 45(ert-deftest character-fold--test-lax-whitespace ()
46 (dotimes (n 100) 46 (dotimes (n 50)
47 (let ((w1 (character-fold--random-word n)) 47 (let ((w1 (character-fold--random-word n))
48 (w2 (character-fold--random-word n)) 48 (w2 (character-fold--random-word n))
49 (search-spaces-regexp "\\s-+")) 49 (search-spaces-regexp "\\s-+"))
@@ -52,17 +52,44 @@
52 (concat w1 " " w2)) 52 (concat w1 " " w2))
53 (character-fold--test-search-with-contents 53 (character-fold--test-search-with-contents
54 (concat w1 "\s\n\s\t\f\t\n\r\t" w2) 54 (concat w1 "\s\n\s\t\f\t\n\r\t" w2)
55 (concat w1 (make-string 90 ?\s) w2))))) 55 (concat w1 (make-string 10 ?\s) w2)))))
56
57(defun character-fold--test-match-exactly (string &rest strings-to-match)
58 (let ((re (concat "\\`" (character-fold-to-regexp string) "\\'")))
59 (dolist (it strings-to-match)
60 (should (string-match re it)))))
61
62(ert-deftest character-fold--test-some-defaults ()
63 (dolist (it '(("ffl" . "ffl") ("ffi" . "ffi")
64 ("fi" . "fi") ("ff" . "ff")
65 ("ä" . "ä")))
66 (character-fold--test-search-with-contents (cdr it) (car it))
67 (let ((multi (char-table-extra-slot character-fold-table 0))
68 (character-fold-table (make-char-table 'character-fold-table)))
69 (set-char-table-extra-slot character-fold-table 0 multi)
70 (character-fold--test-match-exactly (car it) (cdr it)))))
56 71
57(ert-deftest character-fold--test-fold-to-regexp () 72(ert-deftest character-fold--test-fold-to-regexp ()
58 (let ((character-fold-table (make-char-table 'character-fold-table))) 73 (let ((character-fold-table (make-char-table 'character-fold-table))
59 (aset character-fold-table ?a "abc") 74 (multi (make-char-table 'character-fold-table)))
60 (aset character-fold-table ?1 "123") 75 (set-char-table-extra-slot character-fold-table 0 multi)
76 (aset character-fold-table ?a "xx")
77 (aset character-fold-table ?1 "44")
61 (aset character-fold-table ?\s "-!-") 78 (aset character-fold-table ?\s "-!-")
62 (should (equal (character-fold-to-regexp "a1a1") 79 (character-fold--test-match-exactly "a1a1" "xx44xx44")
63 "abc123abc123")) 80 (character-fold--test-match-exactly "a1 a 1" "xx44-!--!-xx-!-44")
64 (should (equal (character-fold-to-regexp "a1 a 1") 81 (aset multi ?a '(("1" . "99")
65 "abc123\\(?: \\|-!--!-\\)abc\\(?: \\|-!-\\)123")))) 82 ("2" . "88")
83 ("12" . "77")))
84 (character-fold--test-match-exactly "a" "xx")
85 (character-fold--test-match-exactly "a1" "xx44" "99")
86 (character-fold--test-match-exactly "a12" "77" "xx442" "992")
87 (character-fold--test-match-exactly "a2" "88")
88 (aset multi ?1 '(("2" . "yy")))
89 (character-fold--test-match-exactly "a1" "xx44" "99")
90 (character-fold--test-match-exactly "a12" "77" "xx442" "992")
91 (character-fold--test-match-exactly "a12" "xxyy")))
92
66 93
67(provide 'character-fold-tests) 94(provide 'character-fold-tests)
68;;; character-fold-tests.el ends here 95;;; character-fold-tests.el ends here