aboutsummaryrefslogtreecommitdiffstats
path: root/admin
diff options
context:
space:
mode:
authorKenichi Handa2011-07-07 07:43:48 +0900
committerKenichi Handa2011-07-07 07:43:48 +0900
commitc805dec0b5fa81b5c9f2b724e2ec12a17d723aca (patch)
treec29a8490c976fdf4dbf64ef1b13a57f7d1110cc1 /admin
parent5c62d133468c354b47a1643092add8292e084765 (diff)
downloademacs-c805dec0b5fa81b5c9f2b724e2ec12a17d723aca.tar.gz
emacs-c805dec0b5fa81b5c9f2b724e2ec12a17d723aca.zip
Add C interface for Unicode character property table.
Diffstat (limited to 'admin')
-rw-r--r--admin/ChangeLog35
-rw-r--r--admin/unidata/Makefile.in5
-rw-r--r--admin/unidata/unidata-gen.el437
3 files changed, 223 insertions, 254 deletions
diff --git a/admin/ChangeLog b/admin/ChangeLog
index 7aaeb1d5ee2..3632a0992a6 100644
--- a/admin/ChangeLog
+++ b/admin/ChangeLog
@@ -1,3 +1,38 @@
12011-07-06 Kenichi Handa <handa@m17n.org>
2
3 * unidata/unidata-gen.el (unidata-dir): New variable.
4 (unidata-setup-list): Expand unidata-text-file in unidata-dir.
5 (unidata-prop-alist): INDEX element may be a function. New
6 optional element VAL-LIST (for general-category and bidi-class).
7 New entry `mirroring'.
8 (unidata-prop-default, unidata-prop-val-list): New subst.
9 (unidata-get-character, unidata-put-character): Delete them.
10 (unidata-gen-table-character): New arg IGNORE. Adjusted for the
11 above changes.
12 (unidata-get-symbol, unidata-get-integer, unidata-get-numeric)
13 (unidata-put-symbol, unidata-put-integer, unidata-put-numeric):
14 Delete them.
15 (unidata-encode-val): Assume that the first element of VAL-LIST is
16 a cons (nil . 0).
17 (unidata-gen-table): Change argument DEFAULT-VALUE to VAL-LIST.
18 Always store the encoded value.
19 (unidata-gen-table-symbol): New args DEFAULT-VALUE and VAL-LIST.
20 Set the 1st and the 2nd extra slots to index numbers for C
21 functions.
22 (unidata-gen-table-integer): Likewise.
23 (unidata-gen-table-numeric): Likewise.
24 (unidata-gen-table-name): New arg IGNORE.
25 (unidata-gen-table-decomposition): Likewise.
26 (unidata-describe-general-category): Add the case nil to the
27 description alist.
28 (unidata-gen-mirroring-list): New funciton.
29 (unidata-gen-files): New arg DATA-DIR. Adjusted for the change of
30 unidata-prop-alist. Handle the case of storing multiple
31 char-tables in a file.
32
33 * unidata/Makefile.in (${DSTDIR}/charprop.el): New arg to
34 unidata-gen-files.
35
12011-05-21 Glenn Morris <rgm@gnu.org> 362011-05-21 Glenn Morris <rgm@gnu.org>
2 37
3 * bzrmerge.el (bzrmerge-resolve): Suppress prompts about file-locals. 38 * bzrmerge.el (bzrmerge-resolve): Suppress prompts about file-locals.
diff --git a/admin/unidata/Makefile.in b/admin/unidata/Makefile.in
index 04f2f1d4380..e1fe247631f 100644
--- a/admin/unidata/Makefile.in
+++ b/admin/unidata/Makefile.in
@@ -33,9 +33,10 @@ unidata.txt: UnicodeData.txt
33 33
34${DSTDIR}/charprop.el: unidata-gen.elc unidata.txt 34${DSTDIR}/charprop.el: unidata-gen.elc unidata.txt
35 ELC=`/bin/pwd`/unidata-gen.elc; \ 35 ELC=`/bin/pwd`/unidata-gen.elc; \
36 DATA=`/bin/pwd`/unidata.txt; \ 36 DATADIR=`/bin/pwd`; \
37 DATA=unidata.txt; \
37 cd ${DSTDIR}; \ 38 cd ${DSTDIR}; \
38 ${RUNEMACS} -batch --load $${ELC} -f unidata-gen-files $${DATA} 39 ${RUNEMACS} -batch --load $${ELC} -f unidata-gen-files $${DATADIR} $${DATA}
39 40
40../../src/biditype.h: UnicodeData.txt 41../../src/biditype.h: UnicodeData.txt
41 gawk -F";" -f biditype.awk $< > $@ 42 gawk -F";" -f biditype.awk $< > $@
diff --git a/admin/unidata/unidata-gen.el b/admin/unidata/unidata-gen.el
index 9f898668526..ab1dcd134ac 100644
--- a/admin/unidata/unidata-gen.el
+++ b/admin/unidata/unidata-gen.el
@@ -33,24 +33,25 @@
33;; 33;;
34;; charprop.el 34;; charprop.el
35;; It contains a series of forms of this format: 35;; It contains a series of forms of this format:
36;; (char-code-property-register PROP FILE) 36;; (define-char-code-property PROP FILE)
37;; where PROP is a symbol representing a character property 37;; where PROP is a symbol representing a character property
38;; (name, generic-category, etc), and FILE is a name of one of 38;; (name, general-category, etc), and FILE is a name of one of
39;; the following files. 39;; the following files.
40;; 40;;
41;; uni-name.el, uni-category.el, uni-combining.el, uni-bidi.el, 41;; uni-name.el, uni-category.el, uni-combining.el, uni-bidi.el,
42;; uni-decomposition.el, uni-decimal.el, uni-digit.el, uni-numeric.el, 42;; uni-decomposition.el, uni-decimal.el, uni-digit.el, uni-numeric.el,
43;; uni-mirrored.el, uni-old-name.el, uni-comment.el, uni-uppercase.el, 43;; uni-mirrored.el, uni-old-name.el, uni-comment.el, uni-uppercase.el,
44;; uni-lowercase.el, uni-titlecase.el 44;; uni-lowercase.el, uni-titlecase.el
45;; They each contain a single form of this format: 45;; They contain one or more forms of this format:
46;; (char-code-property-register PROP CHAR-TABLE) 46;; (define-char-code-property PROP CHAR-TABLE)
47;; where PROP is the same as above, and CHAR-TABLE is a 47;; where PROP is the same as above, and CHAR-TABLE is a
48;; char-table containing property values in a compressed format. 48;; char-table containing property values in a compressed format.
49;; 49;;
50;; When they are installed in .../lisp/international/, the file 50;; When they are installed in .../lisp/international/, the file
51;; "charprop.el" is preloaded in loadup.el. The other files are 51;; "charprop.el" is preloaded in loadup.el. The other files are
52;; automatically loaded when the functions `get-char-code-property' 52;; automatically loaded when the Lisp functions
53;; and `put-char-code-property' are called. 53;; `get-char-code-property' and `put-char-code-property', and C
54;; function uniprop_table are called.
54;; 55;;
55;; FORMAT OF A CHAR TABLE 56;; FORMAT OF A CHAR TABLE
56;; 57;;
@@ -62,17 +63,22 @@
62;; data in a char-table as below. 63;; data in a char-table as below.
63;; 64;;
64;; If succeeding 128*N characters have the same property value, we 65;; If succeeding 128*N characters have the same property value, we
65;; store that value for them. Otherwise, compress values for 66;; store that value (or the encoded one) for them. Otherwise,
66;; succeeding 128 characters into a single string and store it as a 67;; compress values (or the encoded ones) for succeeding 128
67;; value for those characters. The way of compression depends on a 68;; characters into a single string and store it for those
68;; property. See the section "SIMPLE TABLE", "RUN-LENGTH TABLE", 69;; characters. The way of compression depends on a property. See
69;; and "WORD-LIST TABLE". 70;; the section "SIMPLE TABLE", "RUN-LENGTH TABLE", and "WORD-LIST
70 71;; TABLE".
71;; The char table has four extra slots: 72
73;; The char table has five extra slots:
72;; 1st: property symbol 74;; 1st: property symbol
73;; 2nd: function to call to get a property value 75;; 2nd: function to call to get a property value,
74;; 3nd: function to call to put a property value 76;; or an index number of C function to decode the value,
75;; 4th: function to call to get a description of a property value 77;; or nil if the value can be directly got from the table.
78;; 3nd: function to call to put a property value,
79;; or an index number of C function to encode the value,
80;; or nil if the value can be directly stored in the table.
81;; 4th: function to call to get a description of a property value, or nil
76;; 5th: data referred by the above functions 82;; 5th: data referred by the above functions
77 83
78;; List of elements of this form: 84;; List of elements of this form:
@@ -82,6 +88,11 @@
82 88
83(defvar unidata-list nil) 89(defvar unidata-list nil)
84 90
91;; Name of the directory containing files of Unicode Character
92;; Database.
93
94(defvar unidata-dir nil)
95
85(defun unidata-setup-list (unidata-text-file) 96(defun unidata-setup-list (unidata-text-file)
86 (let* ((table (list nil)) 97 (let* ((table (list nil))
87 (tail table) 98 (tail table)
@@ -90,6 +101,7 @@
90 ("^<.*Surrogate" . nil) 101 ("^<.*Surrogate" . nil)
91 ("^<.*Private Use" . PRIVATE\ USE))) 102 ("^<.*Private Use" . PRIVATE\ USE)))
92 val char name) 103 val char name)
104 (setq unidata-text-file (expand-file-name unidata-text-file unidata-dir))
93 (or (file-readable-p unidata-text-file) 105 (or (file-readable-p unidata-text-file)
94 (error "File not readable: %s" unidata-text-file)) 106 (error "File not readable: %s" unidata-text-file))
95 (with-temp-buffer 107 (with-temp-buffer
@@ -134,12 +146,17 @@
134 (setq unidata-list (cdr table)))) 146 (setq unidata-list (cdr table))))
135 147
136;; Alist of this form: 148;; Alist of this form:
137;; (PROP INDEX GENERATOR FILENAME) 149;; (PROP INDEX GENERATOR FILENAME DOCSTRING DESCRIBER VAL-LIST)
138;; PROP: character property 150;; PROP: character property
139;; INDEX: index to each element of unidata-list for PROP 151;; INDEX: index to each element of unidata-list for PROP.
152;; It may be a function that generates an alist of character codes
153;; vs. the corresponding property values.
140;; GENERATOR: function to generate a char-table 154;; GENERATOR: function to generate a char-table
141;; FILENAME: filename to store the char-table 155;; FILENAME: filename to store the char-table
156;; DOCSTRING: docstring for the property
142;; DESCRIBER: function to call to get a description string of property value 157;; DESCRIBER: function to call to get a description string of property value
158;; DEFAULT: the default value of the property
159;; VAL-LIST: list of specially ordered property values
143 160
144(defconst unidata-prop-alist 161(defconst unidata-prop-alist
145 '((name 162 '((name
@@ -152,7 +169,12 @@ Property value is a string.")
152Property value is one of the following symbols: 169Property value is one of the following symbols:
153 Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po, 170 Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
154 Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn" 171 Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn"
155 unidata-describe-general-category) 172 unidata-describe-general-category
173 nil
174 ;; The order of elements must be in sync with unicode_category_t
175 ;; in src/character.h.
176 (Lu Ll Lt Lm Lo Mn Mc Me Nd Nl No Pc Pd Ps Pe Pi Pf Po
177 Sm Sc Sk So Zs Zl Zp Cc Cf Cs Co Cn))
156 (canonical-combining-class 178 (canonical-combining-class
157 3 unidata-gen-table-integer "uni-combining.el" 179 3 unidata-gen-table-integer "uni-combining.el"
158 "Unicode canonical combining class. 180 "Unicode canonical combining class.
@@ -164,7 +186,11 @@ Property value is an integer."
164Property value is one of the following symbols: 186Property value is one of the following symbols:
165 L, LRE, LRO, R, AL, RLE, RLO, PDF, EN, ES, ET, 187 L, LRE, LRO, R, AL, RLE, RLO, PDF, EN, ES, ET,
166 AN, CS, NSM, BN, B, S, WS, ON" 188 AN, CS, NSM, BN, B, S, WS, ON"
167 unidata-describe-bidi-class) 189 unidata-describe-bidi-class
190 L
191 ;; The order of elements must be in sync with bidi_type_t in
192 ;; src/dispextern.h.
193 (L R EN AN BN B AL LRE LRO RLE RLO PDF ES ET CS NSM S WS ON))
168 (decomposition 194 (decomposition
169 5 unidata-gen-table-decomposition "uni-decomposition.el" 195 5 unidata-gen-table-decomposition "uni-decomposition.el"
170 "Unicode decomposition mapping. 196 "Unicode decomposition mapping.
@@ -188,7 +214,7 @@ Property value is an integer or a floating point.")
188 (mirrored 214 (mirrored
189 9 unidata-gen-table-symbol "uni-mirrored.el" 215 9 unidata-gen-table-symbol "uni-mirrored.el"
190 "Unicode bidi mirrored flag. 216 "Unicode bidi mirrored flag.
191Property value is a symbol `Y' or `N'.") 217Property value is a symbol `Y' or `N'. See also the property `mirroring'.")
192 (old-name 218 (old-name
193 10 unidata-gen-table-name "uni-old-name.el" 219 10 unidata-gen-table-name "uni-old-name.el"
194 "Unicode old names as published in Unicode 1.0. 220 "Unicode old names as published in Unicode 1.0.
@@ -211,7 +237,12 @@ Property value is a character."
211 14 unidata-gen-table-character "uni-titlecase.el" 237 14 unidata-gen-table-character "uni-titlecase.el"
212 "Unicode simple titlecase mapping. 238 "Unicode simple titlecase mapping.
213Property value is a character." 239Property value is a character."
214 string))) 240 string)
241 (mirroring
242 unidata-gen-mirroring-list unidata-gen-table-character "uni-mirrored.el"
243 "Unicode bidi-mirroring characters.
244Property value is a character that has the corresponding mirroring image,
245or nil for non-mirrored character.")))
215 246
216;; Functions to access the above data. 247;; Functions to access the above data.
217(defsubst unidata-prop-index (prop) (nth 1 (assq prop unidata-prop-alist))) 248(defsubst unidata-prop-index (prop) (nth 1 (assq prop unidata-prop-alist)))
@@ -219,6 +250,8 @@ Property value is a character."
219(defsubst unidata-prop-file (prop) (nth 3 (assq prop unidata-prop-alist))) 250(defsubst unidata-prop-file (prop) (nth 3 (assq prop unidata-prop-alist)))
220(defsubst unidata-prop-docstring (prop) (nth 4 (assq prop unidata-prop-alist))) 251(defsubst unidata-prop-docstring (prop) (nth 4 (assq prop unidata-prop-alist)))
221(defsubst unidata-prop-describer (prop) (nth 5 (assq prop unidata-prop-alist))) 252(defsubst unidata-prop-describer (prop) (nth 5 (assq prop unidata-prop-alist)))
253(defsubst unidata-prop-default (prop) (nth 6 (assq prop unidata-prop-alist)))
254(defsubst unidata-prop-val-list (prop) (nth 7 (assq prop unidata-prop-alist)))
222 255
223 256
224;; SIMPLE TABLE 257;; SIMPLE TABLE
@@ -227,52 +260,34 @@ Property value is a character."
227;; values of succeeding character codes are usually different, we use 260;; values of succeeding character codes are usually different, we use
228;; a char-table described here to store such values. 261;; a char-table described here to store such values.
229;; 262;;
230;; If succeeding 128 characters has no property, a char-table has the 263;; A char-table divides character code space (#x0..#x3FFFFF) into
231;; symbol t for them. Otherwise a char-table has a string of the 264;; #x8000 blocks (each block contains 128 characters).
232;; following format for them. 265
266;; If all characters of a block have no property, a char-table has the
267;; symbol nil for that block. Otherwise a char-table has a string of
268;; the following format for it.
233;; 269;;
234;; The first character of the string is FIRST-INDEX. 270;; The first character of the string is ?\001.
235;; The Nth (N > 0) character of the string is a property value of the 271;; The second character of the string is FIRST-INDEX.
236;; character (BLOCK-HEAD + FIRST-INDEX + N - 1), where BLOCK-HEAD is 272;; The Nth (N > 1) character of the string is a property value of the
237;; the first of the characters in the block. 273;; character (BLOCK-HEAD + FIRST-INDEX + N - 2), where BLOCK-HEAD is
274;; the first character of the block.
238;; 275;;
239;; The 4th extra slot of a char-table is nil. 276;; This kind of char-table has these extra slots:
240 277;; 1st: the property symbol
241(defun unidata-get-character (char val table) 278;; 2nd: nil
242 (cond 279;; 3rd: 0 (corresponding to uniprop_encode_character in chartab.c)
243 ((characterp val) 280;; 4th to 5th: nil
244 val)
245 281
246 ((stringp val) 282(defun unidata-gen-table-character (prop &rest ignore)
247 (let* ((len (length val))
248 (block-head (lsh (lsh char -7) 7))
249 (vec (make-vector 128 nil))
250 (first-index (aref val 0)))
251 (dotimes (i (1- len))
252 (let ((elt (aref val (1+ i))))
253 (if (> elt 0)
254 (aset vec (+ first-index i) elt))))
255 (dotimes (i 128)
256 (aset table (+ block-head i) (aref vec i)))
257 (aref vec (- char block-head))))))
258
259(defun unidata-put-character (char val table)
260 (or (characterp val)
261 (not val)
262 (error "Not a character nor nil: %S" val))
263 (let ((current-val (aref table char)))
264 (unless (eq current-val val)
265 (if (stringp current-val)
266 (funcall (char-table-extra-slot table 1) char current-val table))
267 (aset table char val))))
268
269(defun unidata-gen-table-character (prop)
270 (let ((table (make-char-table 'char-code-property-table)) 283 (let ((table (make-char-table 'char-code-property-table))
271 (prop-idx (unidata-prop-index prop)) 284 (prop-idx (unidata-prop-index prop))
272 (vec (make-vector 128 0)) 285 (vec (make-vector 128 0))
273 (tail unidata-list) 286 (tail unidata-list)
274 elt range val idx slot) 287 elt range val idx slot)
275 (set-char-table-range table (cons 0 (max-char)) t) 288 (if (functionp prop-idx)
289 (setq tail (funcall prop-idx)
290 prop-idx 1))
276 (while tail 291 (while tail
277 (setq elt (car tail) tail (cdr tail)) 292 (setq elt (car tail) tail (cdr tail))
278 (setq range (car elt) 293 (setq range (car elt)
@@ -301,7 +316,7 @@ Property value is a character."
301 (setq first-index last-index))) 316 (setq first-index last-index)))
302 (setq tail (cdr tail))) 317 (setq tail (cdr tail)))
303 (when first-index 318 (when first-index
304 (let ((str (string first-index)) 319 (let ((str (string 1 first-index))
305 c) 320 c)
306 (while (<= first-index last-index) 321 (while (<= first-index last-index)
307 (setq str (format "%s%c" str (or (aref vec first-index) 0)) 322 (setq str (format "%s%c" str (or (aref vec first-index) 0))
@@ -309,184 +324,78 @@ Property value is a character."
309 (set-char-table-range table (cons start limit) str)))))) 324 (set-char-table-range table (cons start limit) str))))))
310 325
311 (set-char-table-extra-slot table 0 prop) 326 (set-char-table-extra-slot table 0 prop)
312 (byte-compile 'unidata-get-character) 327 (set-char-table-extra-slot table 2 0)
313 (byte-compile 'unidata-put-character)
314 (set-char-table-extra-slot table 1 (symbol-function 'unidata-get-character))
315 (set-char-table-extra-slot table 2 (symbol-function 'unidata-put-character))
316
317 table)) 328 table))
318 329
319 330
320 331
321;; RUN-LENGTH TABLE 332;; RUN-LENGTH TABLE
322;; 333;;
323;; If the type of character property value is symbol, integer, 334;; If many characters of successive character codes have the same
324;; boolean, or character, we use a char-table described here to store 335;; property value, we use a char-table described here to store the
325;; the values. 336;; values.
326;; 337;;
327;; The 4th extra slot is a vector of property values (VAL-TABLE), and 338;; At first, instead of a value itself, we store an index number to
328;; values for succeeding 128 characters are encoded into this 339;; the VAL-TABLE (5th extra slot) in the table. We call that index
329;; character sequence: 340;; number as VAL-CODE here after.
341;;
342;; A char-table divides character code space (#x0..#x3FFFFF) into
343;; #x8000 blocks (each block contains 128 characters).
344;;
345;; If all characters of a block have the same value, a char-table has
346;; VAL-CODE for that block. Otherwise a char-table has a string of
347;; the following format for that block.
348;;
349;; The first character of the string is ?\002.
350;; The following characters has this form:
330;; ( VAL-CODE RUN-LENGTH ? ) + 351;; ( VAL-CODE RUN-LENGTH ? ) +
331;; where: 352;; where:
332;; VAL-CODE (0..127): 353;; VAL-CODE (0..127): index into VAL-TABLE.
333;; (VAL-CODE - 1) is an index into VAL-TABLE.
334;; The value 0 means no-value.
335;; RUN-LENGTH (130..255): 354;; RUN-LENGTH (130..255):
336;; (RUN-LENGTH - 128) specifies how many characters have the same 355;; (RUN-LENGTH - 128) specifies how many characters have the same
337;; value. If omitted, it means 1. 356;; value. If omitted, it means 1.
338 357;;
339 358;; This kind of char-table has these extra slots:
340;; Return a symbol-type character property value of CHAR. VAL is the 359;; 1st: the property symbol
341;; current value of (aref TABLE CHAR). 360;; 2nd: 0 (corresponding to uniprop_decode_value in chartab.c)
342 361;; 3rd: 1..3 (corresponding to uniprop_encode_xxx in chartab.c)
343(defun unidata-get-symbol (char val table) 362;; 4th: function or nil
344 (let ((val-table (char-table-extra-slot table 4))) 363;; 5th: VAL-TABLE
345 (cond ((symbolp val)
346 val)
347 ((stringp val)
348 (let ((first-char (lsh (lsh char -7) 7))
349 (str val)
350 (len (length val))
351 (idx 0)
352 this-val count)
353 (set-char-table-range table (cons first-char (+ first-char 127))
354 nil)
355 (while (< idx len)
356 (setq val (aref str idx) idx (1+ idx)
357 count (if (< idx len) (aref str idx) 1))
358 (setq val (and (> val 0) (aref val-table (1- val)))
359 count (if (< count 128)
360 1
361 (prog1 (- count 128) (setq idx (1+ idx)))))
362 (dotimes (i count)
363 (if val
364 (aset table first-char val))
365 (if (= first-char char)
366 (setq this-val val))
367 (setq first-char (1+ first-char))))
368 this-val))
369 ((> val 0)
370 (aref val-table (1- val))))))
371
372;; Return a integer-type character property value of CHAR. VAL is the
373;; current value of (aref TABLE CHAR).
374
375(defun unidata-get-integer (char val table)
376 (let ((val-table (char-table-extra-slot table 4)))
377 (cond ((integerp val)
378 val)
379 ((stringp val)
380 (let ((first-char (lsh (lsh char -7) 7))
381 (str val)
382 (len (length val))
383 (idx 0)
384 this-val count)
385 (while (< idx len)
386 (setq val (aref str idx) idx (1+ idx)
387 count (if (< idx len) (aref str idx) 1))
388 (setq val (and (> val 0) (aref val-table (1- val)))
389 count (if (< count 128)
390 1
391 (prog1 (- count 128) (setq idx (1+ idx)))))
392 (dotimes (i count)
393 (aset table first-char val)
394 (if (= first-char char)
395 (setq this-val val))
396 (setq first-char (1+ first-char))))
397 this-val)))))
398
399;; Return a numeric-type (integer or float) character property value
400;; of CHAR. VAL is the current value of (aref TABLE CHAR).
401
402(defun unidata-get-numeric (char val table)
403 (cond
404 ((numberp val)
405 val)
406 ((stringp val)
407 (let ((val-table (char-table-extra-slot table 4))
408 (first-char (lsh (lsh char -7) 7))
409 (str val)
410 (len (length val))
411 (idx 0)
412 this-val count)
413 (while (< idx len)
414 (setq val (aref str idx) idx (1+ idx)
415 count (if (< idx len) (aref str idx) 1))
416 (setq val (and (> val 0) (aref val-table (1- val)))
417 count (if (< count 128)
418 1
419 (prog1 (- count 128) (setq idx (1+ idx)))))
420 (dotimes (i count)
421 (aset table first-char val)
422 (if (= first-char char)
423 (setq this-val val))
424 (setq first-char (1+ first-char))))
425 this-val))))
426
427;; Store VAL (symbol) as a character property value of CHAR in TABLE.
428
429(defun unidata-put-symbol (char val table)
430 (or (symbolp val)
431 (error "Not a symbol: %S" val))
432 (let ((current-val (aref table char)))
433 (unless (eq current-val val)
434 (if (stringp current-val)
435 (funcall (char-table-extra-slot table 1) char current-val table))
436 (aset table char val))))
437
438;; Store VAL (integer) as a character property value of CHAR in TABLE.
439
440(defun unidata-put-integer (char val table)
441 (or (integerp val)
442 (not val)
443 (error "Not an integer nor nil: %S" val))
444 (let ((current-val (aref table char)))
445 (unless (eq current-val val)
446 (if (stringp current-val)
447 (funcall (char-table-extra-slot table 1) char current-val table))
448 (aset table char val))))
449
450;; Store VAL (integer or float) as a character property value of CHAR
451;; in TABLE.
452
453(defun unidata-put-numeric (char val table)
454 (or (numberp val)
455 (not val)
456 (error "Not a number nor nil: %S" val))
457 (let ((current-val (aref table char)))
458 (unless (equal current-val val)
459 (if (stringp current-val)
460 (funcall (char-table-extra-slot table 1) char current-val table))
461 (aset table char val))))
462 364
463;; Encode the character property value VAL into an integer value by 365;; Encode the character property value VAL into an integer value by
464;; VAL-LIST. By side effect, VAL-LIST is modified. 366;; VAL-LIST. By side effect, VAL-LIST is modified.
465;; VAL-LIST has this form: 367;; VAL-LIST has this form:
466;; (t (VAL1 . VAL-CODE1) (VAL2 . VAL-CODE2) ...) 368;; ((nil . 0) (VAL1 . 1) (VAL2 . 2) ...)
467;; If VAL is one of VALn, just return VAL-CODEn. Otherwise, 369;; If VAL is one of VALn, just return n.
468;; VAL-LIST is modified to this: 370;; Otherwise, VAL-LIST is modified to this:
469;; (t (VAL . (1+ VAL-CODE1)) (VAL1 . VAL-CODE1) (VAL2 . VAL-CODE2) ...) 371;; ((nil . 0) (VAL1 . 1) (VAL2 . 2) ... (VAL . n+1))
470 372
471(defun unidata-encode-val (val-list val) 373(defun unidata-encode-val (val-list val)
472 (let ((slot (assoc val val-list)) 374 (let ((slot (assoc val val-list))
473 val-code) 375 val-code)
474 (if slot 376 (if slot
475 (cdr slot) 377 (cdr slot)
476 (setq val-code (if (cdr val-list) (1+ (cdr (nth 1 val-list))) 1)) 378 (setq val-code (length val-list))
477 (setcdr val-list (cons (cons val val-code) (cdr val-list))) 379 (nconc val-list (list (cons val val-code)))
478 val-code))) 380 val-code)))
479 381
480;; Generate a char-table for the character property PROP. 382;; Generate a char-table for the character property PROP.
481 383
482(defun unidata-gen-table (prop val-func default-value) 384(defun unidata-gen-table (prop val-func default-value val-list)
483 (let ((table (make-char-table 'char-code-property-table)) 385 (let ((table (make-char-table 'char-code-property-table))
484 (prop-idx (unidata-prop-index prop)) 386 (prop-idx (unidata-prop-index prop))
485 (val-list (list t))
486 (vec (make-vector 128 0)) 387 (vec (make-vector 128 0))
487 tail elt range val val-code idx slot 388 tail elt range val val-code idx slot
488 prev-range-data) 389 prev-range-data)
489 (set-char-table-range table (cons 0 (max-char)) default-value) 390 (setq val-list (cons nil (copy-sequence val-list)))
391 (setq tail val-list val-code 0)
392 ;; Convert (nil A B ...) to ((nil . 0) (A . 1) (B . 2) ...)
393 (while tail
394 (setcar tail (cons (car tail) val-code))
395 (setq tail (cdr tail) val-code (1+ val-code)))
396 (setq default-value (unidata-encode-val val-list default-value))
397 (set-char-table-range table t default-value)
398 (set-char-table-range table nil default-value)
490 (setq tail unidata-list) 399 (setq tail unidata-list)
491 (while tail 400 (while tail
492 (setq elt (car tail) tail (cdr tail)) 401 (setq elt (car tail) tail (cdr tail))
@@ -495,7 +404,7 @@ Property value is a character."
495 (setq val-code (if val (unidata-encode-val val-list val))) 404 (setq val-code (if val (unidata-encode-val val-list val)))
496 (if (consp range) 405 (if (consp range)
497 (when val-code 406 (when val-code
498 (set-char-table-range table range val) 407 (set-char-table-range table range val-code)
499 (let ((from (car range)) (to (cdr range))) 408 (let ((from (car range)) (to (cdr range)))
500 ;; If RANGE doesn't end at the char-table boundary (each 409 ;; If RANGE doesn't end at the char-table boundary (each
501 ;; 128 characters), we may have to carry over the data 410 ;; 128 characters), we may have to carry over the data
@@ -534,7 +443,7 @@ Property value is a character."
534 (if val-code 443 (if val-code
535 (aset vec (- range start) val-code)) 444 (aset vec (- range start) val-code))
536 (setq tail (cdr tail))) 445 (setq tail (cdr tail)))
537 (setq str "" val-code -1 count 0) 446 (setq str "\002" val-code -1 count 0)
538 (mapc #'(lambda (x) 447 (mapc #'(lambda (x)
539 (if (= val-code x) 448 (if (= val-code x)
540 (setq count (1+ count)) 449 (setq count (1+ count))
@@ -549,7 +458,7 @@ Property value is a character."
549 vec) 458 vec)
550 (if (= count 128) 459 (if (= count 128)
551 (if val 460 (if val
552 (set-char-table-range table (cons start limit) val)) 461 (set-char-table-range table (cons start limit) val-code))
553 (if (= val-code 0) 462 (if (= val-code 0)
554 (set-char-table-range table (cons start limit) str) 463 (set-char-table-range table (cons start limit) str)
555 (if (> count 2) 464 (if (> count 2)
@@ -559,34 +468,29 @@ Property value is a character."
559 (setq str (concat str (string val-code))))) 468 (setq str (concat str (string val-code)))))
560 (set-char-table-range table (cons start limit) str)))))) 469 (set-char-table-range table (cons start limit) str))))))
561 470
562 (setq val-list (nreverse (cdr val-list)))
563 (set-char-table-extra-slot table 0 prop) 471 (set-char-table-extra-slot table 0 prop)
564 (set-char-table-extra-slot table 4 (vconcat (mapcar 'car val-list))) 472 (set-char-table-extra-slot table 4 (vconcat (mapcar 'car val-list)))
565 table)) 473 table))
566 474
567(defun unidata-gen-table-symbol (prop) 475(defun unidata-gen-table-symbol (prop default-value val-list)
568 (let ((table (unidata-gen-table prop 476 (let ((table (unidata-gen-table prop
569 #'(lambda (x) (and (> (length x) 0) 477 #'(lambda (x) (and (> (length x) 0)
570 (intern x))) 478 (intern x)))
571 0))) 479 default-value val-list)))
572 (byte-compile 'unidata-get-symbol) 480 (set-char-table-extra-slot table 1 0)
573 (byte-compile 'unidata-put-symbol) 481 (set-char-table-extra-slot table 2 1)
574 (set-char-table-extra-slot table 1 (symbol-function 'unidata-get-symbol))
575 (set-char-table-extra-slot table 2 (symbol-function 'unidata-put-symbol))
576 table)) 482 table))
577 483
578(defun unidata-gen-table-integer (prop) 484(defun unidata-gen-table-integer (prop default-value val-list)
579 (let ((table (unidata-gen-table prop 485 (let ((table (unidata-gen-table prop
580 #'(lambda (x) (and (> (length x) 0) 486 #'(lambda (x) (and (> (length x) 0)
581 (string-to-number x))) 487 (string-to-number x)))
582 t))) 488 default-value val-list)))
583 (byte-compile 'unidata-get-integer) 489 (set-char-table-extra-slot table 1 0)
584 (byte-compile 'unidata-put-integer) 490 (set-char-table-extra-slot table 2 1)
585 (set-char-table-extra-slot table 1 (symbol-function 'unidata-get-integer))
586 (set-char-table-extra-slot table 2 (symbol-function 'unidata-put-integer))
587 table)) 491 table))
588 492
589(defun unidata-gen-table-numeric (prop) 493(defun unidata-gen-table-numeric (prop default-value val-list)
590 (let ((table (unidata-gen-table prop 494 (let ((table (unidata-gen-table prop
591 #'(lambda (x) 495 #'(lambda (x)
592 (if (string-match "/" x) 496 (if (string-match "/" x)
@@ -595,11 +499,9 @@ Property value is a character."
595 (substring x (match-end 0)))) 499 (substring x (match-end 0))))
596 (if (> (length x) 0) 500 (if (> (length x) 0)
597 (string-to-number x)))) 501 (string-to-number x))))
598 t))) 502 default-value val-list)))
599 (byte-compile 'unidata-get-numeric) 503 (set-char-table-extra-slot table 1 0)
600 (byte-compile 'unidata-put-numeric) 504 (set-char-table-extra-slot table 2 2)
601 (set-char-table-extra-slot table 1 (symbol-function 'unidata-get-numeric))
602 (set-char-table-extra-slot table 2 (symbol-function 'unidata-put-numeric))
603 table)) 505 table))
604 506
605 507
@@ -892,7 +794,6 @@ Property value is a character."
892 word-table 794 word-table
893 block-list block-word-table block-end 795 block-list block-word-table block-end
894 tail elt range val idx slot) 796 tail elt range val idx slot)
895 (set-char-table-range table (cons 0 (max-char)) 0)
896 (setq tail unidata-list) 797 (setq tail unidata-list)
897 (setq block-end -1) 798 (setq block-end -1)
898 (while tail 799 (while tail
@@ -1025,7 +926,7 @@ Property value is a character."
1025 idx (1+ i))))) 926 idx (1+ i)))))
1026 (nreverse (cons (intern (substring str idx)) l)))))) 927 (nreverse (cons (intern (substring str idx)) l))))))
1027 928
1028(defun unidata-gen-table-name (prop) 929(defun unidata-gen-table-name (prop &rest ignore)
1029 (let* ((table (unidata-gen-table-word-list prop 'unidata-split-name)) 930 (let* ((table (unidata-gen-table-word-list prop 'unidata-split-name))
1030 (word-tables (char-table-extra-slot table 4))) 931 (word-tables (char-table-extra-slot table 4)))
1031 (byte-compile 'unidata-get-name) 932 (byte-compile 'unidata-get-name)
@@ -1064,7 +965,7 @@ Property value is a character."
1064 (nreverse l))))) 965 (nreverse l)))))
1065 966
1066 967
1067(defun unidata-gen-table-decomposition (prop) 968(defun unidata-gen-table-decomposition (prop &rest ignore)
1068 (let* ((table (unidata-gen-table-word-list prop 'unidata-split-decomposition)) 969 (let* ((table (unidata-gen-table-word-list prop 'unidata-split-decomposition))
1069 (word-tables (char-table-extra-slot table 4))) 970 (word-tables (char-table-extra-slot table 4)))
1070 (byte-compile 'unidata-get-decomposition) 971 (byte-compile 'unidata-get-decomposition)
@@ -1080,7 +981,8 @@ Property value is a character."
1080 981
1081(defun unidata-describe-general-category (val) 982(defun unidata-describe-general-category (val)
1082 (cdr (assq val 983 (cdr (assq val
1083 '((Lu . "Letter, Uppercase") 984 '((nil . "Uknown")
985 (Lu . "Letter, Uppercase")
1084 (Ll . "Letter, Lowercase") 986 (Ll . "Letter, Lowercase")
1085 (Lt . "Letter, Titlecase") 987 (Lt . "Letter, Titlecase")
1086 (Lm . "Letter, Modifier") 988 (Lm . "Letter, Modifier")
@@ -1171,6 +1073,19 @@ Property value is a character."
1171 (string ?')))) 1073 (string ?'))))
1172 val " ")) 1074 val " "))
1173 1075
1076(defun unidata-gen-mirroring-list ()
1077 (let ((head (list nil))
1078 tail)
1079 (with-temp-buffer
1080 (insert-file-contents (expand-file-name "BidiMirroring.txt" unidata-dir))
1081 (goto-char (point-min))
1082 (setq tail head)
1083 (while (re-search-forward "^\\([0-9A-F]+\\);\\s +\\([0-9A-F]+\\)" nil t)
1084 (let ((char (string-to-number (match-string 1) 16))
1085 (mirror (match-string 2)))
1086 (setq tail (setcdr tail (list (list char mirror)))))))
1087 (cdr head)))
1088
1174;; Verify if we can retrieve correct values from the generated 1089;; Verify if we can retrieve correct values from the generated
1175;; char-tables. 1090;; char-tables.
1176 1091
@@ -1212,13 +1127,21 @@ Property value is a character."
1212;; The entry function. It generates files described in the header 1127;; The entry function. It generates files described in the header
1213;; comment of this file. 1128;; comment of this file.
1214 1129
1215(defun unidata-gen-files (&optional unidata-text-file) 1130(defun unidata-gen-files (&optional data-dir unidata-text-file)
1216 (or unidata-text-file 1131 (or data-dir
1217 (setq unidata-text-file (car command-line-args-left) 1132 (setq data-dir (car command-line-args-left)
1133 command-line-args-left (cdr command-line-args-left)
1134 unidata-text-file (car command-line-args-left)
1218 command-line-args-left (cdr command-line-args-left))) 1135 command-line-args-left (cdr command-line-args-left)))
1219 (unidata-setup-list unidata-text-file)
1220 (let ((coding-system-for-write 'utf-8-unix) 1136 (let ((coding-system-for-write 'utf-8-unix)
1221 (charprop-file "charprop.el")) 1137 (charprop-file "charprop.el")
1138 (unidata-dir data-dir))
1139 (dolist (elt unidata-prop-alist)
1140 (let* ((prop (car elt))
1141 (file (unidata-prop-file prop)))
1142 (if (file-exists-p file)
1143 (delete-file file))))
1144 (unidata-setup-list unidata-text-file)
1222 (with-temp-file charprop-file 1145 (with-temp-file charprop-file
1223 (insert ";; Automatically generated by unidata-gen.el.\n") 1146 (insert ";; Automatically generated by unidata-gen.el.\n")
1224 (dolist (elt unidata-prop-alist) 1147 (dolist (elt unidata-prop-alist)
@@ -1227,31 +1150,41 @@ Property value is a character."
1227 (file (unidata-prop-file prop)) 1150 (file (unidata-prop-file prop))
1228 (docstring (unidata-prop-docstring prop)) 1151 (docstring (unidata-prop-docstring prop))
1229 (describer (unidata-prop-describer prop)) 1152 (describer (unidata-prop-describer prop))
1153 (default-value (unidata-prop-default prop))
1154 (val-list (unidata-prop-val-list prop))
1230 table) 1155 table)
1231 ;; Filename in this comment line is extracted by sed in 1156 ;; Filename in this comment line is extracted by sed in
1232 ;; Makefile. 1157 ;; Makefile.
1233 (insert (format ";; FILE: %s\n" file)) 1158 (insert (format ";; FILE: %s\n" file))
1234 (insert (format "(define-char-code-property '%S %S\n %S)\n" 1159 (insert (format "(define-char-code-property '%S %S\n %S)\n"
1235 prop file docstring)) 1160 prop file docstring))
1236 (with-temp-file file 1161 (with-temp-buffer
1237 (message "Generating %s..." file) 1162 (message "Generating %s..." file)
1238 (setq table (funcall generator prop)) 1163 (when (file-exists-p file)
1164 (insert-file-contents file)
1165 (goto-char (point-max))
1166 (search-backward ";; Local Variables:"))
1167 (setq table (funcall generator prop default-value val-list))
1239 (when describer 1168 (when describer
1240 (unless (subrp (symbol-function describer)) 1169 (unless (subrp (symbol-function describer))
1241 (byte-compile describer) 1170 (byte-compile describer)
1242 (setq describer (symbol-function describer))) 1171 (setq describer (symbol-function describer)))
1243 (set-char-table-extra-slot table 3 describer)) 1172 (set-char-table-extra-slot table 3 describer))
1244 (insert ";; Copyright (C) 1991-2009 Unicode, Inc. 1173 (if (bobp)
1245;; This file was generated from the Unicode data file at 1174 (insert ";; Copyright (C) 1991-2009 Unicode, Inc.
1246;; http://www.unicode.org/Public/UNIDATA/UnicodeData.txt. 1175;; This file was generated from the Unicode data files at
1247;; See lisp/international/README for the copyright and permission notice.\n" 1176;; http://www.unicode.org/Public/UNIDATA/.
1248 (format "(define-char-code-property '%S %S %S)\n" 1177;; See lisp/international/README for the copyright and permission notice.\n"))
1249 prop table docstring) 1178 (insert (format "(define-char-code-property '%S %S %S)\n"
1250 ";; Local Variables:\n" 1179 prop table docstring))
1251 ";; coding: utf-8\n" 1180 (if (eobp)
1252 ";; no-byte-compile: t\n" 1181 (insert ";; Local Variables:\n"
1253 ";; End:\n\n" 1182 ";; coding: utf-8\n"
1254 (format ";; %s ends here\n" file))))) 1183 ";; no-byte-compile: t\n"
1184 ";; End:\n\n"
1185 (format ";; %s ends here\n" file)))
1186 (write-file file)
1187 (message "Generating %s...done" file))))
1255 (message "Writing %s..." charprop-file) 1188 (message "Writing %s..." charprop-file)
1256 (insert ";; Local Variables:\n" 1189 (insert ";; Local Variables:\n"
1257 ";; coding: utf-8\n" 1190 ";; coding: utf-8\n"