diff options
| author | Dave Love | 2002-12-15 16:46:00 +0000 |
|---|---|---|
| committer | Dave Love | 2002-12-15 16:46:00 +0000 |
| commit | ccdd5c61872c10cce32e500bc15aa3efe05c7d75 (patch) | |
| tree | 9671ae37571353edd71be29942405581649a9357 | |
| parent | 241094692158692abf6c958873d98e4738ad72ef (diff) | |
| download | emacs-ccdd5c61872c10cce32e500bc15aa3efe05c7d75.tar.gz emacs-ccdd5c61872c10cce32e500bc15aa3efe05c7d75.zip | |
(ucs-mule-cjk-to-unicode)
(utf-subst-table-for-encode, ucs-unicode-to-mule-cjk)
(utf-subst-table-for-decode): Specify :size, :rehash-size.
(utf-translate-cjk): :set rewritten to load subst-... files. Add
:set-after.
(ccl-decode-mule-utf-8): Consider CJK translation for r3<#x3400.
| -rw-r--r-- | lisp/international/utf-8.el | 84 |
1 file changed, 58 insertions, 26 deletions
diff --git a/lisp/international/utf-8.el b/lisp/international/utf-8.el index 9bbb9d2c756..018691b1168 100644 --- a/lisp/international/utf-8.el +++ b/lisp/international/utf-8.el | |||
| @@ -47,7 +47,7 @@ | |||
| 47 | ;; idempotent -- to represent the bytes to fix that needs a new charset. | 47 | ;; idempotent -- to represent the bytes to fix that needs a new charset. |
| 48 | ;; | 48 | ;; |
| 49 | ;; Characters from other character sets can be encoded with mule-utf-8 | 49 | ;; Characters from other character sets can be encoded with mule-utf-8 |
| 50 | ;; by populating the translation-table | 50 | ;; by populating the translation table |
| 51 | ;; `utf-translation-table-for-encode' and registering the translation | 51 | ;; `utf-translation-table-for-encode' and registering the translation |
| 52 | ;; with `register-char-codings'. Hash tables | 52 | ;; with `register-char-codings'. Hash tables |
| 53 | ;; `utf-subst-table-for-decode' and `utf-subst-table-for-encode' are | 53 | ;; `utf-subst-table-for-decode' and `utf-subst-table-for-encode' are |
| @@ -95,23 +95,25 @@ translation-table named `utf-translation-table-for-encode'") | |||
| 95 | (define-translation-table 'utf-translation-table-for-decode) | 95 | (define-translation-table 'utf-translation-table-for-decode) |
| 96 | 96 | ||
| 97 | 97 | ||
| 98 | (defvar ucs-mule-cjk-to-unicode (make-hash-table :test 'eq) | 98 | (defvar ucs-mule-cjk-to-unicode (make-hash-table :test 'eq :size 43000 |
| 99 | :rehash-size 1000) | ||
| 99 | "Hash table mapping Emacs CJK character sets to Unicode code points. | 100 | "Hash table mapping Emacs CJK character sets to Unicode code points. |
| 100 | 101 | ||
| 101 | If `utf-translate-cjk' is non-nil, this table populates the | 102 | If `utf-translate-cjk' is non-nil, this table populates the |
| 102 | translation-hash-table named `utf-subst-table-for-encode'.") | 103 | translation-hash-table named `utf-subst-table-for-encode'.") |
| 103 | 104 | ||
| 104 | (define-translation-hash-table 'utf-subst-table-for-encode | 105 | (define-translation-hash-table 'utf-subst-table-for-encode |
| 105 | (make-hash-table :test 'eq)) | 106 | (make-hash-table :test 'eq :size 43000 :rehash-size 1000)) |
| 106 | 107 | ||
| 107 | (defvar ucs-unicode-to-mule-cjk (make-hash-table :test 'eq) | 108 | (defvar ucs-unicode-to-mule-cjk (make-hash-table :test 'eq :size 43000 |
| 109 | :rehash-size 1000) | ||
| 108 | "Hash table mapping Unicode code points to Emacs CJK character sets. | 110 | "Hash table mapping Unicode code points to Emacs CJK character sets. |
| 109 | 111 | ||
| 110 | If `utf-translate-cjk' is non-nil, this table populates the | 112 | If `utf-translate-cjk' is non-nil, this table populates the |
| 111 | translation-hash-table named `utf-subst-table-for-decode'.") | 113 | translation-hash-table named `utf-subst-table-for-decode'.") |
| 112 | 114 | ||
| 113 | (define-translation-hash-table 'utf-subst-table-for-decode | 115 | (define-translation-hash-table 'utf-subst-table-for-decode |
| 114 | (make-hash-table :test 'eq)) | 116 | (make-hash-table :test 'eq :size 21500 :rehash-size 200)) |
| 115 | 117 | ||
| 116 | (mapc | 118 | (mapc |
| 117 | (lambda (pair) | 119 | (lambda (pair) |
| @@ -205,19 +207,46 @@ Setting this variable outside customize has no effect." | |||
| 205 | 207 | ||
| 206 | (defcustom utf-translate-cjk nil | 208 | (defcustom utf-translate-cjk nil |
| 207 | "Whether the UTF based coding systems should decode/encode CJK characters. | 209 | "Whether the UTF based coding systems should decode/encode CJK characters. |
| 208 | 210 | Enabling this loads tables which allow the coding systems mule-utf-8, | |
| 209 | Enabling this loads tables which enable the coding systems: | 211 | mule-utf-16-le and mule-utf-16-be to encode characters in the charsets |
| 210 | mule-utf-8, mule-utf-16-le, mule-utf-16-be | 212 | `korean-ksc5601', `chinese-gb2312', `chinese-big5-1', |
| 211 | to encode characters in the charsets `korean-ksc5601', `chinese-gb2312' and | 213 | `chinese-big5-2', `japanese-jisx0208' and `japanese-jisx0212', and to |
| 212 | `japanese-jisx0208', and to decode the corresponding unicodes into | 214 | decode the corresponding unicodes into such characters. |
| 213 | such characters. This works by loading the library `utf-8-subst'; see | 215 | |
| 214 | its commentary. The tables are fairly large (about 33000 entries), so this | 216 | Where the charsets overlap, the one preferred for decoding is chosen |
| 215 | option is not the default." | 217 | according to the language environment in effect when this option is |
| 216 | :link '(emacs-commentary-link "utf-8-subst") | 218 | turned on: ksc5601 for Korean, gb2312 for Chinese-GB, big5 for |
| 219 | Chinese-Big5 and jisx for other environments. | ||
| 220 | |||
| 221 | The tables are large (over 40000 entries), so this option is not the | ||
| 222 | default. Also, installing them may be rather slow." | ||
| 217 | :set (lambda (s v) | 223 | :set (lambda (s v) |
| 218 | (if v | 224 | (if v |
| 219 | (progn | 225 | (progn |
| 220 | (require 'utf-8-subst) | 226 | ;; Load the files explicitly, to avoid having to keep |
| 227 | ;; around the large tables they contain (as well as the | ||
| 228 | ;; ones which get built). | ||
| 229 | (cond | ||
| 230 | ((string= "Korean" current-language-environment) | ||
| 231 | (load "subst-jis") | ||
| 232 | (load "subst-big5") | ||
| 233 | (load "subst-gb2312") | ||
| 234 | (load "subst-ksc")) | ||
| 235 | ((string= "Chinese-BIG5" current-language-environment) | ||
| 236 | (load "subst-jis") | ||
| 237 | (load "subst-ksc") | ||
| 238 | (load "subst-gb2312") | ||
| 239 | (load "subst-big5")) | ||
| 240 | ((string= "Chinese-GB" current-language-environment) | ||
| 241 | (load "subst-jis") | ||
| 242 | (load "subst-ksc") | ||
| 243 | (load "subst-big5") | ||
| 244 | (load "subst-gb2312")) | ||
| 245 | (t | ||
| 246 | (load "subst-ksc") | ||
| 247 | (load "subst-gb2312") | ||
| 248 | (load "subst-big5") | ||
| 249 | (load "subst-jis"))) ; jis covers as much as big5, gb2312 | ||
| 221 | (let ((table (make-char-table 'translation-table))) | 250 | (let ((table (make-char-table 'translation-table))) |
| 222 | (maphash (lambda (k v) | 251 | (maphash (lambda (k v) |
| 223 | (aset table k t)) | 252 | (aset table k t)) |
| @@ -244,6 +273,7 @@ option is not the default." | |||
| 244 | (set-default s v)) | 273 | (set-default s v)) |
| 245 | :version "21.4" | 274 | :version "21.4" |
| 246 | :type 'boolean | 275 | :type 'boolean |
| 276 | :set-after '(current-language-environment) | ||
| 247 | :group 'mule) | 277 | :group 'mule) |
| 248 | 278 | ||
| 249 | (define-ccl-program ccl-decode-mule-utf-8 | 279 | (define-ccl-program ccl-decode-mule-utf-8 |
| @@ -378,18 +408,20 @@ option is not the default." | |||
| 378 | (write-multibyte-character r0 r1)) | 408 | (write-multibyte-character r0 r1)) |
| 379 | 409 | ||
| 380 | ;; mule-unicode-2500-33ff | 410 | ;; mule-unicode-2500-33ff |
| 381 | ;; Fixme: Perhaps allow translation via | ||
| 382 | ;; utf-subst-table-for-decode for #x2e80 up, so | ||
| 383 | ;; that we use consistent charsets for all of | ||
| 384 | ;; CJK. Would need corresponding change to | ||
| 385 | ;; encoding tables. | ||
| 386 | (if (r3 < #x3400) | 411 | (if (r3 < #x3400) |
| 387 | ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) | 412 | ((r4 = r3) ; don't zap r3 |
| 388 | (r3 -= #x2500) | 413 | (lookup-integer utf-subst-table-for-decode r4 r5) |
| 389 | (r3 //= 96) | 414 | (if r7 |
| 390 | (r1 = (r7 + 32)) | 415 | ;; got a translation |
| 391 | (r1 += ((r3 + 32) << 7)) | 416 | ((write-multibyte-character r4 r5) |
| 392 | (write-multibyte-character r0 r1)) | 417 | ;; Zapped through register starvation. |
| 418 | (r5 = ,(charset-id 'eight-bit-control))) | ||
| 419 | ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) | ||
| 420 | (r3 -= #x2500) | ||
| 421 | (r3 //= 96) | ||
| 422 | (r1 = (r7 + 32)) | ||
| 423 | (r1 += ((r3 + 32) << 7)) | ||
| 424 | (write-multibyte-character r0 r1)))) | ||
| 393 | 425 | ||
| 394 | ;; U+3400 .. U+D7FF | 426 | ;; U+3400 .. U+D7FF |
| 395 | ;; Try to convert to CJK chars, else keep | 427 | ;; Try to convert to CJK chars, else keep |