diff options
| author | Werner LEMBERG | 2001-12-18 17:55:16 +0000 |
|---|---|---|
| committer | Werner LEMBERG | 2001-12-18 17:55:16 +0000 |
| commit | 64b4e1f11b58808211ca20ca24539b6d76455510 (patch) | |
| tree | 6365f452a278bb079629020da1b444920225ba8d | |
| parent | 5ef350634d8259cb27ff797c4b9ce864bdd9bd35 (diff) | |
| download | emacs-64b4e1f11b58808211ca20ca24539b6d76455510.tar.gz emacs-64b4e1f11b58808211ca20ca24539b6d76455510.zip | |
Implementing euc-tw encoding.
| -rw-r--r-- | lisp/ChangeLog | 24 | ||||
| -rw-r--r-- | lisp/language/china-util.el | 271 |
2 files changed, 293 insertions, 2 deletions
diff --git a/lisp/ChangeLog b/lisp/ChangeLog index b63d39b5c5a..4a490c35642 100644 --- a/lisp/ChangeLog +++ b/lisp/ChangeLog | |||
| @@ -1,3 +1,27 @@ | |||
| 1 | 2001-12-18 Werner Lemberg <wl@gnu.org> | ||
| 2 | |||
| 3 | * language/chinese.el, language/cyrillic.el, | ||
| 4 | language/czech.el, language/european.el, language/greek.el, | ||
| 5 | language/hebrew.el, language/indian.el, language/japanese.el, | ||
| 6 | language/korean.el, language/lao.el, language/slovak.el, | ||
| 7 | language/thai.el, language/tibetan.el, language/vietnamese.el: | ||
| 8 | Improve documentation strings of coding systems and language infos. | ||
| 9 | |||
| 10 | 2001-12-18 Werner LEMBERG <wl@gnu.org> | ||
| 11 | |||
| 12 | Add support for EUC-TW decoding/encoding. | ||
| 13 | |||
| 14 | * language/china-util.el (big5-to-flat-code, flat-code-to-big5, | ||
| 15 | euc-to-flat-code, flat-code-to-euc, expand-euc-big5-alist): | ||
| 16 | New auxiliary functions to build `big5-to-cns'. | ||
| 17 | (big5-to-cns): New translation alist. | ||
| 18 | |||
| 19 | * language/chinese.el: Added new coding system `euc-tw' and its | ||
| 20 | alias `euc-taiwan'. | ||
| 21 | Updated language `Chinese-CNS' to include euc-tw encoding also. | ||
| 22 | (ccl-decode-euc-tw, ccl-encode-euc-tw): New functions for handling | ||
| 23 | euc-tw. | ||
| 24 | |||
| 1 | 2001-12-18 Dave Love <fx@gnu.org> | 25 | 2001-12-18 Dave Love <fx@gnu.org> |
| 2 | 26 | ||
| 3 | * loadup.el: Add language/utf-8-lang, language/georgian. | 27 | * loadup.el: Add language/utf-8-lang, language/georgian. |
diff --git a/lisp/language/china-util.el b/lisp/language/china-util.el index 8a0a83d834a..2be6ebcdff4 100644 --- a/lisp/language/china-util.el +++ b/lisp/language/china-util.el | |||
| @@ -1,7 +1,8 @@ | |||
| 1 | ;;; china-util.el --- utilities for Chinese | 1 | ;;; china-util.el --- utilities for Chinese -*- coding: iso-2022-7bit -*- |
| 2 | 2 | ||
| 3 | ;; Copyright (C) 1995 Electrotechnical Laboratory, JAPAN. | 3 | ;; Copyright (C) 1995 Electrotechnical Laboratory, JAPAN. |
| 4 | ;; Licensed to the Free Software Foundation. | 4 | ;; Licensed to the Free Software Foundation. |
| 5 | ;; Copyright (C) 1995, 2001 Free Software Foundation, Inc. | ||
| 5 | 6 | ||
| 6 | ;; Keywords: mule, multilingual, Chinese | 7 | ;; Keywords: mule, multilingual, Chinese |
| 7 | 8 | ||
| @@ -26,7 +27,7 @@ | |||
| 26 | 27 | ||
| 27 | ;;; Code: | 28 | ;;; Code: |
| 28 | 29 | ||
| 29 | ;; Hz/ZW encoding stuffs | 30 | ;; Hz/ZW/EUC-TW encoding stuff |
| 30 | 31 | ||
| 31 | ;; HZ is an encoding method for Chinese character set GB2312 used | 32 | ;; HZ is an encoding method for Chinese character set GB2312 used |
| 32 | ;; widely in Internet. It is very similar to 7-bit environment of | 33 | ;; widely in Internet. It is very similar to 7-bit environment of |
| @@ -38,6 +39,13 @@ | |||
| 38 | ;; encodes Chinese characters line by line by starting each line with | 39 | ;; encodes Chinese characters line by line by starting each line with |
| 39 | ;; the sequence "zW". Like HZ, it uses only 7-bit codes. | 40 | ;; the sequence "zW". Like HZ, it uses only 7-bit codes. |
| 40 | 41 | ||
| 42 | ;; EUC-TW is similar to EUC-KR or EUC-JP. Its main character set is | ||
| 43 | ;; plane 1 of CNS 11643; characters of planes 2 to 7 are accessed with | ||
| 44 | ;; a single shift escape followed by three bytes: the first gives the | ||
| 45 | ;; plane, the second and third the character code. Note that characters | ||
| 46 | ;; of plane 1 are (redundantly) accessible with a single shift escape | ||
| 47 | ;; also. | ||
| 48 | |||
| 41 | ;; ISO-2022 escape sequence to designate GB2312. | 49 | ;; ISO-2022 escape sequence to designate GB2312. |
| 42 | (defvar iso2022-gb-designation "\e$A") | 50 | (defvar iso2022-gb-designation "\e$A") |
| 43 | ;; HZ escape sequence to designate GB2312. | 51 | ;; HZ escape sequence to designate GB2312. |
| @@ -156,6 +164,265 @@ Return the length of resulting text." | |||
| 156 | (interactive) | 164 | (interactive) |
| 157 | (encode-hz-region (point-min) (point-max))) | 165 | (encode-hz-region (point-min) (point-max))) |
| 158 | 166 | ||
| 167 | ;; The following sets up a translation table (big5-to-cns) from Big 5 | ||
| 168 | ;; to CNS encoding, using some auxiliary functions to make the code | ||
| 169 | ;; more readable. | ||
| 170 | |||
| 171 | ;; Many kudos to Himi! The code used here has been adapted from his | ||
| 172 | ;; mule-ucs package. | ||
| 173 | |||
| 174 | (defun big5-to-flat-code (num) | ||
| 175 | "Convert NUM in Big 5 encoding to a `flat code'. | ||
| 176 | 0xA140 will be mapped to position 0, 0xA141 to position 1, etc. | ||
| 177 | There are no gaps in the flat code." | ||
| 178 | |||
| 179 | (let ((hi (/ num 256)) | ||
| 180 | (lo (% num 256))) | ||
| 181 | (+ (* 157 (- hi #xa1)) | ||
| 182 | (- lo (if (>= lo #xa1) 98 64))))) | ||
| 183 | |||
| 184 | (defun flat-code-to-big5 (num) | ||
| 185 | "Convert NUM from a `flat code' to Big 5 encoding. | ||
| 186 | This is the inverse function of `big5-to-flat-code'." | ||
| 187 | |||
| 188 | (let ((hi (/ num 157)) | ||
| 189 | (lo (% num 157))) | ||
| 190 | (+ (* 256 (+ hi #xa1)) | ||
| 191 | (+ lo (if (< lo 63) 64 98))))) | ||
| 192 | |||
| 193 | (defun euc-to-flat-code (num) | ||
| 194 | "Convert NUM in EUC encoding (in GL representation) to a `flat code'. | ||
| 195 | 0x2121 will be mapped to position 0, 0x2122 to position 1, etc. | ||
| 196 | There are no gaps in the flat code." | ||
| 197 | |||
| 198 | (let ((hi (/ num 256)) | ||
| 199 | (lo (% num 256))) | ||
| 200 | (+ (* 94 (- hi #x21)) | ||
| 201 | (- lo #x21)))) | ||
| 202 | |||
| 203 | (defun flat-code-to-euc (num) | ||
| 204 | "Convert NUM from a `flat code' to EUC encoding (in GL representation). | ||
| 205 | The inverse function of `euc-to-flat-code'. The high and low bytes are | ||
| 206 | returned in a list." | ||
| 207 | |||
| 208 | (let ((hi (/ num 94)) | ||
| 209 | (lo (% num 94))) | ||
| 210 | (list (+ hi #x21) (+ lo #x21)))) | ||
| 211 | |||
| 212 | (defun expand-euc-big5-alist (alist) | ||
| 213 | "Create a translation table and fill it with data given in ALIST. | ||
| 214 | Elements of ALIST can be either given as | ||
| 215 | |||
| 216 | ((euc-charset . startchar) . (big5-range-begin . big5-range-end)) | ||
| 217 | |||
| 218 | or as | ||
| 219 | |||
| 220 | (euc-character . big5-charcode) | ||
| 221 | |||
| 222 | The former maps a range of glyphs in an EUC charset (where STARTCHAR | ||
| 223 | is in GL representation) to a certain range of Big 5 encoded | ||
| 224 | characters, the latter maps a single glyph. Glyphs which can't be | ||
| 225 | mapped will be represented with the byte 0xFF. | ||
| 226 | |||
| 227 | The return value is the filled translation table." | ||
| 228 | |||
| 229 | (let (chartable | ||
| 230 | elem | ||
| 231 | result | ||
| 232 | char | ||
| 233 | big5 | ||
| 234 | i | ||
| 235 | end | ||
| 236 | codepoint | ||
| 237 | charset) | ||
| 238 | (setq chartable (make-char-table 'translation-table #xFF)) | ||
| 239 | (while alist | ||
| 240 | (setq elem (car alist) | ||
| 241 | char (car elem) | ||
| 242 | big5 (cdr elem) | ||
| 243 | alist (cdr alist)) | ||
| 244 | (cond ((and (consp char) | ||
| 245 | (consp big5)) | ||
| 246 | (setq i (big5-to-flat-code (car big5)) | ||
| 247 | end (big5-to-flat-code (cdr big5)) | ||
| 248 | codepoint (euc-to-flat-code (cdr char)) | ||
| 249 | charset (car char)) | ||
| 250 | (while (>= end i) | ||
| 251 | (aset chartable | ||
| 252 | (decode-big5-char (flat-code-to-big5 i)) | ||
| 253 | (apply (function make-char) | ||
| 254 | charset | ||
| 255 | (flat-code-to-euc codepoint))) | ||
| 256 | (setq i (1+ i) | ||
| 257 | codepoint (1+ codepoint))) | ||
| 258 | ) | ||
| 259 | ((and (char-valid-p char) | ||
| 260 | (numberp big5)) | ||
| 261 | (setq i (decode-big5-char big5)) | ||
| 262 | (aset chartable i char) | ||
| 263 | ) | ||
| 264 | (t | ||
| 265 | (error "Unknown slot type: %S" elem) | ||
| 266 | ) | ||
| 267 | ) | ||
| 268 | ) | ||
| 269 | ;; the return value | ||
| 270 | chartable | ||
| 271 | ) | ||
| 272 | ) | ||
| 273 | |||
| 274 | ;; All non-CNS encodings are commented out. | ||
| 275 | |||
| 276 | (define-translation-table 'big5-to-cns | ||
| 277 | (expand-euc-big5-alist | ||
| 278 | '( | ||
| 279 | ;; Symbols | ||
| 280 | ((chinese-cns11643-1 . #x2121) . (#xA140 . #xA1F5)) | ||
| 281 | (?$(G"X(B . #xA1F6) | ||
| 282 | (?$(G"W(B . #xA1F7) | ||
| 283 | ((chinese-cns11643-1 . #x2259) . (#xA1F8 . #xA2AE)) | ||
| 284 | ((chinese-cns11643-1 . #x2421) . (#xA2AF . #xA3BF)) | ||
| 285 | ;; Control codes (vendor dependent) | ||
| 286 | ((chinese-cns11643-1 . #x4221) . (#xA3C0 . #xA3E0)) | ||
| 287 | ;; Level 1 Ideographs | ||
| 288 | ((chinese-cns11643-1 . #x4421) . (#xA440 . #xACFD)) | ||
| 289 | (?$(GWS(B . #xACFE) | ||
| 290 | ((chinese-cns11643-1 . #x5323) . (#xAD40 . #xAFCF)) | ||
| 291 | ((chinese-cns11643-1 . #x5754) . (#xAFD0 . #xBBC7)) | ||
| 292 | ((chinese-cns11643-1 . #x6B51) . (#xBBC8 . #xBE51)) | ||
| 293 | (?$(GkP(B . #xBE52) | ||
| 294 | ((chinese-cns11643-1 . #x6F5C) . (#xBE53 . #xC1AA)) | ||
| 295 | ((chinese-cns11643-1 . #x7536) . (#xC1AB . #xC2CA)) | ||
| 296 | (?$(Gu5(B . #xC2CB) | ||
| 297 | ((chinese-cns11643-1 . #x7737) . (#xC2CC . #xC360)) | ||
| 298 | ((chinese-cns11643-1 . #x782E) . (#xC361 . #xC3B8)) | ||
| 299 | (?$(Gxe(B . #xC3B9) | ||
| 300 | (?$(Gxd(B . #xC3BA) | ||
| 301 | ((chinese-cns11643-1 . #x7866) . (#xC3BB . #xC455)) | ||
| 302 | (?$(Gx-(B . #xC456) | ||
| 303 | ((chinese-cns11643-1 . #x7962) . (#xC457 . #xC67E)) | ||
| 304 | ;; Symbols | ||
| 305 | ((chinese-cns11643-1 . #x2621) . (#xC6A1 . #xC6BE)) | ||
| 306 | ;; Radicals | ||
| 307 | (?$(G'#(B . #xC6BF) | ||
| 308 | (?$(G'$(B . #xC6C0) | ||
| 309 | (?$(G'&(B . #xC6C1) | ||
| 310 | (?$(G'((B . #xC6C2) | ||
| 311 | (?$(G'-(B . #xC6C3) | ||
| 312 | (?$(G'.(B . #xC6C4) | ||
| 313 | (?$(G'/(B . #xC6C5) | ||
| 314 | (?$(G'4(B . #xC6C6) | ||
| 315 | (?$(G'7(B . #xC6C7) | ||
| 316 | (?$(G':(B . #xC6C8) | ||
| 317 | (?$(G'<(B . #xC6C9) | ||
| 318 | (?$(G'B(B . #xC6CA) | ||
| 319 | (?$(G'G(B . #xC6CB) | ||
| 320 | (?$(G'N(B . #xC6CC) | ||
| 321 | (?$(G'S(B . #xC6CD) | ||
| 322 | (?$(G'T(B . #xC6CE) | ||
| 323 | (?$(G'U(B . #xC6CF) | ||
| 324 | (?$(G'Y(B . #xC6D0) | ||
| 325 | (?$(G'Z(B . #xC6D1) | ||
| 326 | (?$(G'a(B . #xC6D2) | ||
| 327 | (?$(G'f(B . #xC6D3) | ||
| 328 | (?$(G()(B . #xC6D4) | ||
| 329 | (?$(G(*(B . #xC6D5) | ||
| 330 | (?$(G(c(B . #xC6D6) | ||
| 331 | (?$(G(l(B . #xC6D7) | ||
| 332 | ;; Diacritical Marks | ||
| 333 | ; ((japanese-jisx0208 . #x212F) . (#xC6D8 . #xC6D9)) | ||
| 334 | ;; Japanese Kana Supplement | ||
| 335 | ; ((japanese-jisx0208 . #x2133) . (#xC6DA . #xC6E3)) | ||
| 336 | ;; Japanese Hiragana | ||
| 337 | ; ((japanese-jisx0208 . #x2421) . (#xC6E7 . #xC77A)) | ||
| 338 | ;; Japanese Katakana | ||
| 339 | ; ((japanese-jisx0208 . #x2521) . (#xC77B . #xC7F2)) | ||
| 340 | ;; Cyrillic Characters | ||
| 341 | ; ((japanese-jisx0208 . #x2721) . (#xC7F3 . #xC854)) | ||
| 342 | ; ((japanese-jisx0208 . #x2751) . (#xC855 . #xC875)) | ||
| 343 | ;; Special Chinese Characters | ||
| 344 | (?$(J!#(B . #xC879) | ||
| 345 | (?$(J!$(B . #xC87B) | ||
| 346 | (?$(J!*(B . #xC87D) | ||
| 347 | (?$(J!R(B . #xC8A2) | ||
| 348 | |||
| 349 | ;; JIS X 0208 NOT SIGN (cf. U+00AC) | ||
| 350 | ; (?$B"L(B . #xC8CD) | ||
| 351 | ;; JIS X 0212 BROKEN BAR (cf. U+00A6) | ||
| 352 | ; (?$(D"C(B . #xC8CE) | ||
| 353 | |||
| 354 | ;; GB 2312 characters | ||
| 355 | ; (?$A!d(B . #xC8CF) | ||
| 356 | ; (?$A!e(B . #xC8D0) | ||
| 357 | ;;;;; C8D1 - Japanese `($B3t(B)' | ||
| 358 | ; (?$A!m(B . #xC8D2) | ||
| 359 | ;;;;; C8D3 - Tel. | ||
| 360 | |||
| 361 | ;; Level 2 Ideographs | ||
| 362 | ((chinese-cns11643-2 . #x2121) . (#xC940 . #xC949)) | ||
| 363 | (?$(GDB(B . #xC94A);; a duplicate of #xA461 | ||
| 364 | ((chinese-cns11643-2 . #x212B) . (#xC94B . #xC96B)) | ||
| 365 | ((chinese-cns11643-2 . #x214D) . (#xC96C . #xC9BD)) | ||
| 366 | (?$(H!L(B . #xC9BE) | ||
| 367 | ((chinese-cns11643-2 . #x217D) . (#xC9BF . #xC9EC)) | ||
| 368 | ((chinese-cns11643-2 . #x224E) . (#xC9ED . #xCAF6)) | ||
| 369 | (?$(H"M(B . #xCAF7) | ||
| 370 | ((chinese-cns11643-2 . #x2439) . (#xCAF8 . #xD6CB)) | ||
| 371 | (?$(H>c(B . #xD6CC) | ||
| 372 | ((chinese-cns11643-2 . #x3770) . (#xD6CD . #xD779)) | ||
| 373 | (?$(H?j(B . #xD77A) | ||
| 374 | ((chinese-cns11643-2 . #x387E) . (#xD77B . #xDADE)) | ||
| 375 | (?$(H7o(B . #xDADF) | ||
| 376 | ((chinese-cns11643-2 . #x3E64) . (#xDAE0 . #xDBA6)) | ||
| 377 | ((chinese-cns11643-2 . #x3F6B) . (#xDBA7 . #xDDFB)) | ||
| 378 | (?$(HAv(B . #xDDFC);; a duplicate of #xDCD1 | ||
| 379 | ((chinese-cns11643-2 . #x4424) . (#xDDFD . #xE8A2)) | ||
| 380 | ((chinese-cns11643-2 . #x554C) . (#xE8A3 . #xE975)) | ||
| 381 | ((chinese-cns11643-2 . #x5723) . (#xE976 . #xEB5A)) | ||
| 382 | ((chinese-cns11643-2 . #x5A29) . (#xEB5B . #xEBF0)) | ||
| 383 | (?$(HUK(B . #xEBF1) | ||
| 384 | ((chinese-cns11643-2 . #x5B3F) . (#xEBF2 . #xECDD)) | ||
| 385 | (?$(HW"(B . #xECDE) | ||
| 386 | ((chinese-cns11643-2 . #x5C6A) . (#xECDF . #xEDA9)) | ||
| 387 | ((chinese-cns11643-2 . #x5D75) . (#xEDAA . #xEEEA)) | ||
| 388 | (?$(Hd/(B . #xEEEB) | ||
| 389 | ((chinese-cns11643-2 . #x6039) . (#xEEEC . #xF055)) | ||
| 390 | (?$(H]t(B . #xF056) | ||
| 391 | ((chinese-cns11643-2 . #x6243) . (#xF057 . #xF0CA)) | ||
| 392 | (?$(HZ((B . #xF0CB) | ||
| 393 | ((chinese-cns11643-2 . #x6337) . (#xF0CC . #xF162)) | ||
| 394 | ((chinese-cns11643-2 . #x6430) . (#xF163 . #xF16A)) | ||
| 395 | (?$(Hga(B . #xF16B) | ||
| 396 | ((chinese-cns11643-2 . #x6438) . (#xF16C . #xF267)) | ||
| 397 | (?$(Hi4(B . #xF268) | ||
| 398 | ((chinese-cns11643-2 . #x6573) . (#xF269 . #xF2C2)) | ||
| 399 | ((chinese-cns11643-2 . #x664E) . (#xF2C3 . #xF374)) | ||
| 400 | ((chinese-cns11643-2 . #x6762) . (#xF375 . #xF465)) | ||
| 401 | ((chinese-cns11643-2 . #x6935) . (#xF466 . #xF4B4)) | ||
| 402 | (?$(HfM(B . #xF4B5) | ||
| 403 | ((chinese-cns11643-2 . #x6962) . (#xF4B6 . #xF4FC)) | ||
| 404 | ((chinese-cns11643-2 . #x6A4C) . (#xF4FD . #xF662)) | ||
| 405 | (?$(HjK(B . #xF663) | ||
| 406 | ((chinese-cns11643-2 . #x6C52) . (#xF664 . #xF976)) | ||
| 407 | ((chinese-cns11643-2 . #x7167) . (#xF977 . #xF9C3)) | ||
| 408 | (?$(Hqf(B . #xF9C4) | ||
| 409 | (?$(Hr4(B . #xF9C5) | ||
| 410 | (?$(Hr@(B . #xF9C6) | ||
| 411 | ((chinese-cns11643-2 . #x7235) . (#xF9C7 . #xF9D1)) | ||
| 412 | ((chinese-cns11643-2 . #x7241) . (#xF9D2 . #xF9D5)) | ||
| 413 | |||
| 414 | ;; Additional Ideographs | ||
| 415 | (?$(IC7(B . #xF9D6) | ||
| 416 | (?$(IOP(B . #xF9D7) | ||
| 417 | (?$(IDN(B . #xF9D8) | ||
| 418 | (?$(IPJ(B . #xF9D9) | ||
| 419 | (?$(I,](B . #xF9DA) | ||
| 420 | (?$(I=~(B . #xF9DB) | ||
| 421 | (?$(IK\(B . #xF9DC) | ||
| 422 | ) | ||
| 423 | ) | ||
| 424 | ) | ||
| 425 | |||
| 159 | ;; | 426 | ;; |
| 160 | (provide 'china-util) | 427 | (provide 'china-util) |
| 161 | 428 | ||