aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Dave Love 2002-12-15 16:46:00 +0000
committer Dave Love 2002-12-15 16:46:00 +0000
commit ccdd5c61872c10cce32e500bc15aa3efe05c7d75 (patch)
tree 9671ae37571353edd71be29942405581649a9357
parent 241094692158692abf6c958873d98e4738ad72ef (diff)
download emacs-ccdd5c61872c10cce32e500bc15aa3efe05c7d75.tar.gz
emacs-ccdd5c61872c10cce32e500bc15aa3efe05c7d75.zip
(ucs-mule-cjk-to-unicode)
(utf-subst-table-for-encode, ucs-unicode-to-mule-cjk) (utf-subst-table-for-decode): Specify :size, :rehash-size. (utf-translate-cjk): :set rewritten to load subst-... files. Add :set-after. (ccl-decode-mule-utf-8): Consider CJK translation for r3<#x3400.
-rw-r--r-- lisp/international/utf-8.el 84
1 files changed, 58 insertions, 26 deletions
diff --git a/lisp/international/utf-8.el b/lisp/international/utf-8.el
index 9bbb9d2c756..018691b1168 100644
--- a/lisp/international/utf-8.el
+++ b/lisp/international/utf-8.el
@@ -47,7 +47,7 @@
47;; idempotent -- to represent the bytes to fix that needs a new charset. 47;; idempotent -- to represent the bytes to fix that needs a new charset.
48;; 48;;
49;; Characters from other character sets can be encoded with mule-utf-8 49;; Characters from other character sets can be encoded with mule-utf-8
50;; by populating the translation-table 50;; by populating the translation table
51;; `utf-translation-table-for-encode' and registering the translation 51;; `utf-translation-table-for-encode' and registering the translation
52;; with `register-char-codings'. Hash tables 52;; with `register-char-codings'. Hash tables
53;; `utf-subst-table-for-decode' and `utf-subst-table-for-encode' are 53;; `utf-subst-table-for-decode' and `utf-subst-table-for-encode' are
@@ -95,23 +95,25 @@ translation-table named `utf-translation-table-for-encode'")
95(define-translation-table 'utf-translation-table-for-decode) 95(define-translation-table 'utf-translation-table-for-decode)
96 96
97 97
98(defvar ucs-mule-cjk-to-unicode (make-hash-table :test 'eq) 98(defvar ucs-mule-cjk-to-unicode (make-hash-table :test 'eq :size 43000
99 :rehash-size 1000)
99 "Hash table mapping Emacs CJK character sets to Unicode code points. 100 "Hash table mapping Emacs CJK character sets to Unicode code points.
100 101
101If `utf-translate-cjk' is non-nil, this table populates the 102If `utf-translate-cjk' is non-nil, this table populates the
102translation-hash-table named `utf-subst-table-for-encode'.") 103translation-hash-table named `utf-subst-table-for-encode'.")
103 104
104(define-translation-hash-table 'utf-subst-table-for-encode 105(define-translation-hash-table 'utf-subst-table-for-encode
105 (make-hash-table :test 'eq)) 106 (make-hash-table :test 'eq :size 43000 :rehash-size 1000))
106 107
107(defvar ucs-unicode-to-mule-cjk (make-hash-table :test 'eq) 108(defvar ucs-unicode-to-mule-cjk (make-hash-table :test 'eq :size 43000
109 :rehash-size 1000)
108 "Hash table mapping Unicode code points to Emacs CJK character sets. 110 "Hash table mapping Unicode code points to Emacs CJK character sets.
109 111
110If `utf-translate-cjk' is non-nil, this table populates the 112If `utf-translate-cjk' is non-nil, this table populates the
111translation-hash-table named `utf-subst-table-for-decode'.") 113translation-hash-table named `utf-subst-table-for-decode'.")
112 114
113(define-translation-hash-table 'utf-subst-table-for-decode 115(define-translation-hash-table 'utf-subst-table-for-decode
114 (make-hash-table :test 'eq)) 116 (make-hash-table :test 'eq :size 21500 :rehash-size 200))
115 117
116(mapc 118(mapc
117 (lambda (pair) 119 (lambda (pair)
@@ -205,19 +207,46 @@ Setting this variable outside customize has no effect."
205 207
206(defcustom utf-translate-cjk nil 208(defcustom utf-translate-cjk nil
207 "Whether the UTF based coding systems should decode/encode CJK characters. 209 "Whether the UTF based coding systems should decode/encode CJK characters.
208 210Enabling this loads tables which allow the coding systems mule-utf-8,
209Enabling this loads tables which enable the coding systems: 211mule-utf-16-le and mule-utf-16-be to encode characters in the charsets
210 mule-utf-8, mule-utf-16-le, mule-utf-16-be 212`korean-ksc5601', `chinese-gb2312', `chinese-big5-1',
211to encode characters in the charsets `korean-ksc5601', `chinese-gb2312' and 213`chinese-big5-2', `japanese-jisx0208' and `japanese-jisx0212', and to
212`japanese-jisx0208', and to decode the corresponding unicodes into 214decode the corresponding unicodes into such characters.
213such characters. This works by loading the library `utf-8-subst'; see 215
214its commentary. The tables are fairly large (about 33000 entries), so this 216Where the charsets overlap, the one preferred for decoding is chosen
215option is not the default." 217according to the language environment in effect when this option is
216 :link '(emacs-commentary-link "utf-8-subst") 218turned on: ksc5601 for Korean, gb2312 for Chinese-GB, big5 for
219Chinese-Big5 and jisx for other environments.
220
221The tables are large (over 40000 entries), so this option is not the
222default. Also, installing them may be rather slow."
217 :set (lambda (s v) 223 :set (lambda (s v)
218 (if v 224 (if v
219 (progn 225 (progn
220 (require 'utf-8-subst) 226 ;; Load the files explicitly, to avoid having to keep
227 ;; around the large tables they contain (as well as the
228 ;; ones which get built).
229 (cond
230 ((string= "Korean" current-language-environment)
231 (load "subst-jis")
232 (load "subst-big5")
233 (load "subst-gb2312")
234 (load "subst-ksc"))
235 ((string= "Chinese-BIG5" current-language-environment)
236 (load "subst-jis")
237 (load "subst-ksc")
238 (load "subst-gb2312")
239 (load "subst-big5"))
240 ((string= "Chinese-GB" current-language-environment)
241 (load "subst-jis")
242 (load "subst-ksc")
243 (load "subst-big5")
244 (load "subst-gb2312"))
245 (t
246 (load "subst-ksc")
247 (load "subst-gb2312")
248 (load "subst-big5")
249 (load "subst-jis"))) ; jis covers as much as big5, gb2312
221 (let ((table (make-char-table 'translation-table))) 250 (let ((table (make-char-table 'translation-table)))
222 (maphash (lambda (k v) 251 (maphash (lambda (k v)
223 (aset table k t)) 252 (aset table k t))
@@ -244,6 +273,7 @@ option is not the default."
244 (set-default s v)) 273 (set-default s v))
245 :version "21.4" 274 :version "21.4"
246 :type 'boolean 275 :type 'boolean
276 :set-after '(current-language-environment)
247 :group 'mule) 277 :group 'mule)
248 278
249(define-ccl-program ccl-decode-mule-utf-8 279(define-ccl-program ccl-decode-mule-utf-8
@@ -378,18 +408,20 @@ option is not the default."
378 (write-multibyte-character r0 r1)) 408 (write-multibyte-character r0 r1))
379 409
380 ;; mule-unicode-2500-33ff 410 ;; mule-unicode-2500-33ff
381 ;; Fixme: Perhaps allow translation via
382 ;; utf-subst-table-for-decode for #x2e80 up, so
383 ;; that we use consistent charsets for all of
384 ;; CJK. Would need corresponding change to
385 ;; encoding tables.
386 (if (r3 < #x3400) 411 (if (r3 < #x3400)
387 ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) 412 ((r4 = r3) ; don't zap r3
388 (r3 -= #x2500) 413 (lookup-integer utf-subst-table-for-decode r4 r5)
389 (r3 //= 96) 414 (if r7
390 (r1 = (r7 + 32)) 415 ;; got a translation
391 (r1 += ((r3 + 32) << 7)) 416 ((write-multibyte-character r4 r5)
392 (write-multibyte-character r0 r1)) 417 ;; Zapped through register starvation.
418 (r5 = ,(charset-id 'eight-bit-control)))
419 ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
420 (r3 -= #x2500)
421 (r3 //= 96)
422 (r1 = (r7 + 32))
423 (r1 += ((r3 + 32) << 7))
424 (write-multibyte-character r0 r1))))
393 425
394 ;; U+3400 .. U+D7FF 426 ;; U+3400 .. U+D7FF
395 ;; Try to convert to CJK chars, else keep 427 ;; Try to convert to CJK chars, else keep