diff options
| author | Dave Love | 2001-12-07 14:26:02 +0000 |
|---|---|---|
| committer | Dave Love | 2001-12-07 14:26:02 +0000 |
| commit | aa2e3f49f337b3a14e208d1f2ddb34912730c535 (patch) | |
| tree | 1269fbc291ca9363b5431b844d0c3be76db7f44a | |
| parent | 6964db142f8436c6af64cb2ac64a3961d02a8162 (diff) | |
| download | emacs-aa2e3f49f337b3a14e208d1f2ddb34912730c535.tar.gz emacs-aa2e3f49f337b3a14e208d1f2ddb34912730c535.zip | |
(ucs-mule-to-mule-unicode): New
translation table.
(ccl-encode-mule-utf-8): Use it.
(utf-8-untranslated-to-ucs, utf-8-help-echo, utf-8-compose)
(utf-8-post-read-conversion, utf-8-pre-write-conversion): New
function.
(utf-8-subst-table): New variable.
(utf-8-compose-scripts): New option.
(mule-utf-8): Update safe-charsets, pre-write and post-read
conversion.
| -rw-r--r-- | lisp/international/utf-8.el | 195 |
1 files changed, 171 insertions, 24 deletions
diff --git a/lisp/international/utf-8.el b/lisp/international/utf-8.el index 85f60409567..f04fc45e7e9 100644 --- a/lisp/international/utf-8.el +++ b/lisp/international/utf-8.el | |||
| @@ -1,4 +1,4 @@ | |||
| 1 | ;;; utf-8.el --- limited UTF-8 decoding/encoding support | 1 | ;;; utf-8.el --- limited UTF-8 decoding/encoding support -*- coding: iso-2022-7bit-*- |
| 2 | 2 | ||
| 3 | ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN. | 3 | ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN. |
| 4 | ;; Licensed to the Free Software Foundation. | 4 | ;; Licensed to the Free Software Foundation. |
| @@ -26,8 +26,8 @@ | |||
| 26 | 26 | ||
| 27 | ;;; Commentary: | 27 | ;;; Commentary: |
| 28 | 28 | ||
| 29 | ;; The coding-system `mule-utf-8' supports encoding/decoding of the | 29 | ;; The coding-system `mule-utf-8' basically supports encoding/decoding |
| 30 | ;; following character sets to and from UTF-8: | 30 | ;; of the following character sets to and from UTF-8: |
| 31 | ;; | 31 | ;; |
| 32 | ;; ascii | 32 | ;; ascii |
| 33 | ;; eight-bit-control | 33 | ;; eight-bit-control |
| @@ -36,15 +36,14 @@ | |||
| 36 | ;; mule-unicode-2500-33ff | 36 | ;; mule-unicode-2500-33ff |
| 37 | ;; mule-unicode-e000-ffff | 37 | ;; mule-unicode-e000-ffff |
| 38 | ;; | 38 | ;; |
| 39 | ;; Characters of other character sets cannot be encoded with | ||
| 40 | ;; mule-utf-8. Note that the mule-unicode charsets currently lack | ||
| 41 | ;; case and syntax information, so things like `downcase' will only | ||
| 42 | ;; work for characters from ASCII and Latin-1. | ||
| 43 | ;; | ||
| 44 | ;; On decoding, Unicode characters that do not fit into the above | 39 | ;; On decoding, Unicode characters that do not fit into the above |
| 45 | ;; character sets are handled as `eight-bit-control' or | 40 | ;; character sets are handled as `eight-bit-control' or |
| 46 | ;; `eight-bit-graphic' characters to retain the information about the | 41 | ;; `eight-bit-graphic' characters to retain the information about the |
| 47 | ;; original byte sequence. | 42 | ;; original byte sequence. |
| 43 | ;; | ||
| 44 | ;; Characters from other character sets can be encoded with | ||
| 45 | ;; mule-utf-8 by populating the table `ucs-mule-to-mule-unicode' and | ||
| 46 | ;; registering the translation with `register-char-codings'. | ||
| 48 | 47 | ||
| 49 | ;; UTF-8 is defined in RFC 2279. A sketch of the encoding is: | 48 | ;; UTF-8 is defined in RFC 2279. A sketch of the encoding is: |
| 50 | 49 | ||
| @@ -57,6 +56,11 @@ | |||
| 57 | 56 | ||
| 58 | ;;; Code: | 57 | ;;; Code: |
| 59 | 58 | ||
| 59 | (defvar ucs-mule-to-mule-unicode (make-translation-table) | ||
| 60 | "Translation table for encoding to `mule-utf-8'.") | ||
| 61 | ;; Could have been done by ucs-tables loaded before. | ||
| 62 | (unless (get 'ucs-mule-to-mule-unicode 'translation-table) | ||
| 63 | (define-translation-table 'ucs-mule-to-mule-unicode ucs-mule-to-mule-unicode)) | ||
| 60 | (define-ccl-program ccl-decode-mule-utf-8 | 64 | (define-ccl-program ccl-decode-mule-utf-8 |
| 61 | ;; | 65 | ;; |
| 62 | ;; charset | bytes in utf-8 | bytes in emacs | 66 | ;; charset | bytes in utf-8 | bytes in emacs |
| @@ -64,6 +68,7 @@ | |||
| 64 | ;; ascii | 1 | 1 | 68 | ;; ascii | 1 | 1 |
| 65 | ;; -----------------------+----------------+--------------- | 69 | ;; -----------------------+----------------+--------------- |
| 66 | ;; eight-bit-control | 2 | 2 | 70 | ;; eight-bit-control | 2 | 2 |
| 71 | ;; eight-bit-graphic | 2 | 1 | ||
| 67 | ;; latin-iso8859-1 | 2 | 2 | 72 | ;; latin-iso8859-1 | 2 | 2 |
| 68 | ;; -----------------------+----------------+--------------- | 73 | ;; -----------------------+----------------+--------------- |
| 69 | ;; mule-unicode-0100-24ff | 2 | 4 | 74 | ;; mule-unicode-0100-24ff | 2 | 4 |
| @@ -228,7 +233,8 @@ characters.") | |||
| 228 | (loop | 233 | (loop |
| 229 | (if (r5 < 0) | 234 | (if (r5 < 0) |
| 230 | ((r1 = -1) | 235 | ((r1 = -1) |
| 231 | (read-multibyte-character r0 r1)) | 236 | (read-multibyte-character r0 r1) |
| 237 | (translate-character ucs-mule-to-mule-unicode r0 r1)) | ||
| 232 | (;; We have already done read-multibyte-character. | 238 | (;; We have already done read-multibyte-character. |
| 233 | (r0 = r5) | 239 | (r0 = r5) |
| 234 | (r1 = r6) | 240 | (r1 = r6) |
| @@ -340,26 +346,126 @@ Only characters from the charsets ascii, eight-bit-control, | |||
| 340 | eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized. | 346 | eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized. |
| 341 | Others are encoded as U+FFFD.") | 347 | Others are encoded as U+FFFD.") |
| 342 | 348 | ||
| 349 | ;; Dummy definition so that the CCL can be checked correctly; the | ||
| 350 | ;; actual data are loaded on demand. | ||
| 351 | (unless (boundp 'ucs-mule-8859-to-mule-unicode) ; don't zap it | ||
| 352 | (define-translation-table 'ucs-mule-8859-to-mule-unicode)) | ||
| 353 | |||
| 354 | (defsubst utf-8-untranslated-to-ucs () | ||
| 355 | (let ((b1 (char-after)) | ||
| 356 | (b2 (char-after (1+ (point)))) | ||
| 357 | (b3 (char-after (+ 2 (point)))) | ||
| 358 | (b4 (char-after (+ 4 (point))))) | ||
| 359 | (if (and b1 b2 b3) | ||
| 360 | (cond ((< b1 ?\xf0) | ||
| 361 | (setq b2 (lsh (logand b2 ?\x3f) 6)) | ||
| 362 | (setq b3 (logand b3 ?\x3f)) | ||
| 363 | (logior b3 (logior b2 (lsh (logand b1 ?\x0f) 12)))) | ||
| 364 | (b4 | ||
| 365 | (setq b2 (lsh (logand b2 ?\x3f) 12)) | ||
| 366 | (setq b3 (lsh (logand b3 ?\x3f) 6)) | ||
| 367 | (setq b4 (logand b4 ?\x3f)) | ||
| 368 | (logior b4 (logior b3 (logior b2 (lsh (logand b1 ?\x07) | ||
| 369 | 18))))))))) | ||
| 370 | |||
| 371 | (defun utf-8-help-echo (window object position) | ||
| 372 | (format "Untranslated Unicode U+%04X" | ||
| 373 | (get-char-property position 'untranslated-utf-8 object))) | ||
| 374 | |||
| 375 | (defvar utf-8-subst-table nil | ||
| 376 | "If non-nil, a hash table mapping `untranslatable utf-8' to Emacs characters.") | ||
| 377 | |||
| 378 | ;; We compose the untranslatable sequences into a single character. | ||
| 379 | ;; This is infelicitous for editing, because there's currently no | ||
| 380 | ;; mechanism for treating compositions as atomic, but is OK for | ||
| 381 | ;; display. We try to compose an appropriate character from a hash | ||
| 382 | ;; table of CJK characters to display correctly. Otherwise we use | ||
| 383 | ;; U+FFFD. What we really should have is hash table lookup from CCL | ||
| 384 | ;; so that we could do this properly. This function GCs too much. | ||
| 385 | (defsubst utf-8-compose () | ||
| 386 | "Put a suitable composition on an untranslatable sequence. | ||
| 387 | Return the sequence's length." | ||
| 388 | (let* ((u (utf-8-untranslated-to-ucs)) | ||
| 389 | (l (and u (if (>= u ?\x10000) | ||
| 390 | 4 | ||
| 391 | 3))) | ||
| 392 | (subst (and utf-8-subst-table (gethash u utf-8-subst-table)))) | ||
| 393 | (when u | ||
| 394 | (put-text-property (point) (min (point-max) (+ l (point))) | ||
| 395 | 'untranslated-utf-8 u) | ||
| 396 | (unless subst | ||
| 397 | (put-text-property (point) (min (point-max) (+ l (point))) | ||
| 398 | 'help-echo 'utf-8-help-echo) | ||
| 399 | (setq subst ?$,3u=(B)) | ||
| 400 | (compose-region (point) (+ l (point)) subst) | ||
| 401 | l))) | ||
| 402 | |||
| 403 | (defcustom utf-8-compose-scripts nil | ||
| 404 | "*Non-nil means compose various scipts on decoding utf-8 text." | ||
| 405 | :group 'mule | ||
| 406 | :type 'boolean) ; omitted in Emacs 21.1 | ||
| 407 | |||
| 408 | (defun utf-8-post-read-conversion (length) | ||
| 409 | "Compose untranslated utf-8 sequences into single characters. | ||
| 410 | Also compose particular scripts if `utf-8-compose-scripts' is non-nil." | ||
| 411 | (save-excursion | ||
| 412 | ;; Can't do eval-when-compile to insert a multibyte constant | ||
| 413 | ;; version of the string in the loop, since it's always loaded as | ||
| 414 | ;; unibyte from a byte-compiled file. | ||
| 415 | (let ((range (string-as-multibyte "^\341-\377"))) | ||
| 416 | (while (and (skip-chars-forward | ||
| 417 | range) | ||
| 418 | (not (eobp))) | ||
| 419 | (forward-char (utf-8-compose))))) | ||
| 420 | ;; Fixme: Takahashi-san implies it may not work this easily -- needs | ||
| 421 | ;; checking with him. | ||
| 422 | (when (and utf-8-compose-scripts (> length 1)) | ||
| 423 | ;; These currently have definitions which cover the relevant | ||
| 424 | ;; Unicodes. We could avoid loading thai-util &c by checking | ||
| 425 | ;; whether the region contains any characters with the appropriate | ||
| 426 | ;; categories. There aren't yet Unicode-based rules for Tibetan. | ||
| 427 | (save-excursion (setq length (diacritic-post-read-conversion length))) | ||
| 428 | (save-excursion (setq length (thai-post-read-conversion length))) | ||
| 429 | (save-excursion (setq length (lao-post-read-conversion length))) | ||
| 430 | (save-excursion (setq length (devanagari-post-read-conversion length)))) | ||
| 431 | length) | ||
| 432 | |||
| 433 | (defun utf-8-pre-write-conversion (beg end) | ||
| 434 | "Semi-dummy pre-write function effectively to autoload ucs-tables." | ||
| 435 | ;; Ensure translation table is loaded. | ||
| 436 | (require 'ucs-tables) | ||
| 437 | ;; Don't do this again. | ||
| 438 | (coding-system-put 'mule-utf-8 'pre-write-conversion nil) | ||
| 439 | nil) | ||
| 440 | |||
| 343 | (make-coding-system | 441 | (make-coding-system |
| 344 | 'mule-utf-8 4 ?u | 442 | 'mule-utf-8 4 ?u |
| 345 | "UTF-8 encoding for Emacs-supported Unicode characters. | 443 | "UTF-8 encoding for Emacs-supported Unicode characters. |
| 346 | The supported Emacs character sets are: | 444 | The supported Emacs character sets are the following, plus others |
| 347 | ascii | 445 | which may be included in the translation table |
| 348 | eight-bit-control | 446 | `ucs-mule-to-mule-unicode': |
| 349 | eight-bit-graphic | 447 | ascii |
| 350 | latin-iso8859-1 | 448 | eight-bit-control |
| 351 | mule-unicode-0100-24ff | 449 | eight-bit-graphic |
| 352 | mule-unicode-2500-33ff | 450 | latin-iso8859-1 |
| 353 | mule-unicode-e000-ffff | 451 | latin-iso8859-2 |
| 452 | latin-iso8859-3 | ||
| 453 | latin-iso8859-4 | ||
| 454 | cyrillic-iso8859-5 | ||
| 455 | greek-iso8859-7 | ||
| 456 | hebrew-iso8859-8 | ||
| 457 | latin-iso8859-9 | ||
| 458 | latin-iso8859-14 | ||
| 459 | latin-iso8859-15 | ||
| 460 | mule-unicode-0100-24ff | ||
| 461 | mule-unicode-2500-33ff | ||
| 462 | mule-unicode-e000-ffff | ||
| 354 | 463 | ||
| 355 | Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF | 464 | Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF |
| 356 | are decoded into sequences of eight-bit-control and eight-bit-graphic | 465 | are decoded into sequences of eight-bit-control and eight-bit-graphic |
| 357 | characters to preserve their byte sequences. Emacs characters out of | 466 | characters to preserve their byte sequences and composed to display as |
| 358 | these ranges are encoded into U+FFFD. | 467 | a single character. Emacs characters that can't be encoded to these |
| 359 | 468 | ranges are encoded as U+FFFD." | |
| 360 | Note that, currently, characters in the mule-unicode charsets have no | ||
| 361 | syntax and case information. Thus, for instance, upper- and | ||
| 362 | lower-casing commands won't work with them." | ||
| 363 | 469 | ||
| 364 | '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) | 470 | '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) |
| 365 | '((safe-charsets | 471 | '((safe-charsets |
| @@ -367,13 +473,54 @@ lower-casing commands won't work with them." | |||
| 367 | eight-bit-control | 473 | eight-bit-control |
| 368 | eight-bit-graphic | 474 | eight-bit-graphic |
| 369 | latin-iso8859-1 | 475 | latin-iso8859-1 |
| 476 | latin-iso8859-15 | ||
| 477 | latin-iso8859-14 | ||
| 478 | latin-iso8859-9 | ||
| 479 | hebrew-iso8859-8 | ||
| 480 | greek-iso8859-7 | ||
| 481 | cyrillic-iso8859-5 | ||
| 482 | latin-iso8859-4 | ||
| 483 | latin-iso8859-3 | ||
| 484 | latin-iso8859-2 | ||
| 485 | vietnamese-viscii-lower | ||
| 486 | vietnamese-viscii-upper | ||
| 487 | thai-tis620 | ||
| 488 | ipa | ||
| 489 | ethiopic | ||
| 490 | indian-is13194 | ||
| 491 | katakana-jisx0201 | ||
| 492 | chinese-sisheng | ||
| 493 | lao | ||
| 370 | mule-unicode-0100-24ff | 494 | mule-unicode-0100-24ff |
| 371 | mule-unicode-2500-33ff | 495 | mule-unicode-2500-33ff |
| 372 | mule-unicode-e000-ffff) | 496 | mule-unicode-e000-ffff) |
| 373 | (mime-charset . utf-8) | 497 | (mime-charset . utf-8) |
| 374 | (coding-category . coding-category-utf-8) | 498 | (coding-category . coding-category-utf-8) |
| 375 | (valid-codes (0 . 255)))) | 499 | (valid-codes (0 . 255)) |
| 500 | (pre-write-conversion . utf-8-pre-write-conversion) | ||
| 501 | (post-read-conversion . utf-8-post-read-conversion))) | ||
| 376 | 502 | ||
| 377 | (define-coding-system-alias 'utf-8 'mule-utf-8) | 503 | (define-coding-system-alias 'utf-8 'mule-utf-8) |
| 378 | 504 | ||
| 505 | ;; I think this needs special private charsets defined for the | ||
| 506 | ;; untranslated sequences, if it's going to work well. | ||
| 507 | |||
| 508 | ;;; (defun utf-8-compose-function (pos to pattern &optional string) | ||
| 509 | ;;; (let* ((prop (get-char-property pos 'composition string)) | ||
| 510 | ;;; (l (and prop (- (cadr prop) (car prop))))) | ||
| 511 | ;;; (cond ((and l (> l (- to pos))) | ||
| 512 | ;;; (delete-region pos to)) | ||
| 513 | ;;; ((and (> (char-after pos) 224) | ||
| 514 | ;;; (< (char-after pos) 256) | ||
| 515 | ;;; (save-restriction | ||
| 516 | ;;; (narrow-to-region pos to) | ||
| 517 | ;;; (utf-8-compose))) | ||
| 518 | ;;; t)))) | ||
| 519 | |||
| 520 | ;;; (dotimes (i 96) | ||
| 521 | ;;; (aset composition-function-table | ||
| 522 | ;;; (+ 128 i) | ||
| 523 | ;;; `((,(string-as-multibyte "[\200-\237\240-\377]") | ||
| 524 | ;;; . utf-8-compose-function)))) | ||
| 525 | |||
| 379 | ;;; utf-8.el ends here | 526 | ;;; utf-8.el ends here |