diff options
| author | Dave Love | 2002-07-17 15:04:25 +0000 |
|---|---|---|
| committer | Dave Love | 2002-07-17 15:04:25 +0000 |
| commit | 9ca2ac2dbd525c58754d1cb4d8db98cc6e65f505 (patch) | |
| tree | d58351e721a9e0160ba884fc0c5a3be858708c84 | |
| parent | f9bd23fdb89db3c6abbae24e332a300cbf1cf5ac (diff) | |
| download | emacs-9ca2ac2dbd525c58754d1cb4d8db98cc6e65f505.tar.gz emacs-9ca2ac2dbd525c58754d1cb4d8db98cc6e65f505.zip | |
(utf-8-subst-table)
(utf-8-subst-rev-table, utf-8-translation-table-for-decode)
(utf-8-fragment-on-decoding, ccl-untranslated-to-ucs)
(utf-8-ccl-regs, utf-8-translate-cjk): New.
(ccl-encode-mule-utf-8): Use utf-8-subst-rev-table.
(ccl-decode-mule-utf-8, ccl-untranslated-to-ucs)
(utf-8-untranslated-to-ucs, utf-8-compose): Rewritten.
(mule-utf-8): Remove pre-write-conversion.
(utf-8-post-read-conversion): Comment out.
| -rw-r--r-- | lisp/international/utf-8.el | 591 |
1 files changed, 402 insertions, 189 deletions
diff --git a/lisp/international/utf-8.el b/lisp/international/utf-8.el index 068a7bbeaa1..b3f6390322e 100644 --- a/lisp/international/utf-8.el +++ b/lisp/international/utf-8.el | |||
| @@ -1,10 +1,11 @@ | |||
| 1 | ;;; utf-8.el --- limited UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*- | 1 | ;;; utf-8.el --- UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*- |
| 2 | 2 | ||
| 3 | ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN. | 3 | ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN. |
| 4 | ;; Licensed to the Free Software Foundation. | 4 | ;; Licensed to the Free Software Foundation. |
| 5 | ;; Copyright (C) 2001 Free Software Foundation, Inc. | 5 | ;; Copyright (C) 2001, 2002 Free Software Foundation, Inc. |
| 6 | 6 | ||
| 7 | ;; Author: TAKAHASHI Naoto <ntakahas@m17n.org> | 7 | ;; Author: TAKAHASHI Naoto <ntakahas@m17n.org> |
| 8 | ;; Maintainer: FSF | ||
| 8 | ;; Keywords: multilingual, Unicode, UTF-8, i18n | 9 | ;; Keywords: multilingual, Unicode, UTF-8, i18n |
| 9 | 10 | ||
| 10 | ;; This file is part of GNU Emacs. | 11 | ;; This file is part of GNU Emacs. |
| @@ -39,11 +40,18 @@ | |||
| 39 | ;; On decoding, Unicode characters that do not fit into the above | 40 | ;; On decoding, Unicode characters that do not fit into the above |
| 40 | ;; character sets are handled as `eight-bit-control' or | 41 | ;; character sets are handled as `eight-bit-control' or |
| 41 | ;; `eight-bit-graphic' characters to retain the information about the | 42 | ;; `eight-bit-graphic' characters to retain the information about the |
| 42 | ;; original byte sequence. | 43 | ;; original byte sequence and text properties record the corresponding |
| 44 | ;; unicode. | ||
| 45 | ;; | ||
| 46 | ;; Fixme: note that reading and writing invalid utf-8 may not be | ||
| 47 | ;; idempotent -- to represent the bytes to fix that needs a new charset. | ||
| 43 | ;; | 48 | ;; |
| 44 | ;; Characters from other character sets can be encoded with | 49 | ;; Characters from other character sets can be encoded with |
| 45 | ;; mule-utf-8 by populating the table `ucs-mule-to-mule-unicode' and | 50 | ;; mule-utf-8 by populating the table `ucs-mule-to-mule-unicode' and |
| 46 | ;; registering the translation with `register-char-codings'. | 51 | ;; registering the translation with `register-char-codings'. Hash |
| 52 | ;; tables `utf-8-subst-table' and `utf-8-subst-rev-table' are used to | ||
| 53 | ;; support encoding and decoding of about a quarter of the CJK space | ||
| 54 | ;; between U+3400 and U+DFFF. | ||
| 47 | 55 | ||
| 48 | ;; UTF-8 is defined in RFC 2279. A sketch of the encoding is: | 56 | ;; UTF-8 is defined in RFC 2279. A sketch of the encoding is: |
| 49 | 57 | ||
| @@ -60,7 +68,111 @@ | |||
| 60 | "Translation table for encoding to `mule-utf-8'.") | 68 | "Translation table for encoding to `mule-utf-8'.") |
| 61 | ;; Could have been done by ucs-tables loaded before. | 69 | ;; Could have been done by ucs-tables loaded before. |
| 62 | (unless (get 'ucs-mule-to-mule-unicode 'translation-table) | 70 | (unless (get 'ucs-mule-to-mule-unicode 'translation-table) |
| 63 | (define-translation-table 'ucs-mule-to-mule-unicode ucs-mule-to-mule-unicode)) | 71 | (define-translation-table 'ucs-mule-to-mule-unicode |
| 72 | ucs-mule-to-mule-unicode)) | ||
| 73 | |||
| 74 | (defvar utf-8-subst-table (make-hash-table :test 'eq)) | ||
| 75 | (defvar utf-8-subst-rev-table (make-hash-table :test 'eq)) | ||
| 76 | (define-translation-hash-table 'utf-8-subst-table utf-8-subst-table) | ||
| 77 | (define-translation-hash-table 'utf-8-subst-rev-table utf-8-subst-rev-table) | ||
| 78 | |||
| 79 | (defvar utf-8-translation-table-for-decode (make-translation-table) | ||
| 80 | "Translation table applied after decoding utf-8 to mule-unicode. | ||
| 81 | This is only actually applied to characters which would normally be | ||
| 82 | decoded into mule-unicode-0100-24ff.") | ||
| 83 | (define-translation-table 'utf-8-translation-table-for-decode | ||
| 84 | utf-8-translation-table-for-decode) | ||
| 85 | |||
| 86 | ;; Map Cyrillic and Greek to iso-8859 charsets, which take half the | ||
| 87 | ;; space of mule-unicode. For Latin scripts this isn't very | ||
| 88 | ;; important. Hebrew and Arabic might go here too when there's proper | ||
| 89 | ;; support for them. | ||
| 90 | (mapc | ||
| 91 | (lambda (pair) | ||
| 92 | (aset utf-8-translation-table-for-decode (car pair) (cdr pair))) | ||
| 93 | '((?$,1&d(B . ?,F4(B) (?$,1&e(B . ?,F5(B) (?$,1&f(B . ?,F6(B) (?$,1&h(B . ?,F8(B) (?$,1&i(B . ?,F9(B) | ||
| 94 | (?$,1&j(B . ?,F:(B) (?$,1&l(B . ?,F<(B) (?$,1&n(B . ?,F>(B) (?$,1&o(B . ?,F?(B) (?$,1&p(B . ?,F@(B) | ||
| 95 | (?$,1&q(B . ?,FA(B) (?$,1&r(B . ?,FB(B) (?$,1&s(B . ?,FC(B) (?$,1&t(B . ?,FD(B) (?$,1&u(B . ?,FE(B) | ||
| 96 | (?$,1&v(B . ?,FF(B) (?$,1&w(B . ?,FG(B) (?$,1&x(B . ?,FH(B) (?$,1&y(B . ?,FI(B) (?$,1&z(B . ?,FJ(B) | ||
| 97 | (?$,1&{(B . ?,FK(B) (?$,1&|(B . ?,FL(B) (?$,1&}(B . ?,FM(B) (?$,1&~(B . ?,FN(B) (?$,1&(B . ?,FO(B) | ||
| 98 | (?$,1' (B . ?,FP(B) (?$,1'!(B . ?,FQ(B) (?$,1'#(B . ?,FS(B) (?$,1'$(B . ?,FT(B) (?$,1'%(B . ?,FU(B) | ||
| 99 | (?$,1'&(B . ?,FV(B) (?$,1''(B . ?,FW(B) (?$,1'((B . ?,FX(B) (?$,1')(B . ?,FY(B) (?$,1'*(B . ?,FZ(B) | ||
| 100 | (?$,1'+(B . ?,F[(B) (?$,1',(B . ?,F\(B) (?$,1'-(B . ?,F](B) (?$,1'.(B . ?,F^(B) (?$,1'/(B . ?,F_(B) | ||
| 101 | (?$,1'0(B . ?,F`(B) (?$,1'1(B . ?,Fa(B) (?$,1'2(B . ?,Fb(B) (?$,1'3(B . ?,Fc(B) (?$,1'4(B . ?,Fd(B) | ||
| 102 | (?$,1'5(B . ?,Fe(B) (?$,1'6(B . ?,Ff(B) (?$,1'7(B . ?,Fg(B) (?$,1'8(B . ?,Fh(B) (?$,1'9(B . ?,Fi(B) | ||
| 103 | (?$,1':(B . ?,Fj(B) (?$,1';(B . ?,Fk(B) (?$,1'<(B . ?,Fl(B) (?$,1'=(B . ?,Fm(B) (?$,1'>(B . ?,Fn(B) | ||
| 104 | (?$,1'?(B . ?,Fo(B) (?$,1'@(B . ?,Fp(B) (?$,1'A(B . ?,Fq(B) (?$,1'B(B . ?,Fr(B) (?$,1'C(B . ?,Fs(B) | ||
| 105 | (?$,1'D(B . ?,Ft(B) (?$,1'E(B . ?,Fu(B) (?$,1'F(B . ?,Fv(B) (?$,1'G(B . ?,Fw(B) (?$,1'H(B . ?,Fx(B) | ||
| 106 | (?$,1'I(B . ?,Fy(B) (?$,1'J(B . ?,Fz(B) (?$,1'K(B . ?,F{(B) (?$,1'L(B . ?,F|(B) (?$,1'M(B . ?,F}(B) | ||
| 107 | (?$,1'N(B . ?,F~(B) | ||
| 108 | |||
| 109 | (?$,1(!(B . ?,L!(B) (?$,1("(B . ?,L"(B) (?$,1(#(B . ?,L#(B) (?$,1($(B . ?,L$(B) | ||
| 110 | (?$,1(%(B . ?,L%(B) (?$,1(&(B . ?,L&(B) (?$,1('(B . ?,L'(B) (?$,1(((B . ?,L((B) (?$,1()(B . ?,L)(B) | ||
| 111 | (?$,1(*(B . ?,L*(B) (?$,1(+(B . ?,L+(B) (?$,1(,(B . ?,L,(B) (?$,1(.(B . ?,L.(B) (?$,1(/(B . ?,L/(B) | ||
| 112 | (?$,1(0(B . ?,L0(B) (?$,1(1(B . ?,L1(B) (?$,1(2(B . ?,L2(B) (?$,1(3(B . ?,L3(B) (?$,1(4(B . ?,L4(B) | ||
| 113 | (?$,1(5(B . ?,L5(B) (?$,1(6(B . ?,L6(B) (?$,1(7(B . ?,L7(B) (?$,1(8(B . ?,L8(B) (?$,1(9(B . ?,L9(B) | ||
| 114 | (?$,1(:(B . ?,L:(B) (?$,1(;(B . ?,L;(B) (?$,1(<(B . ?,L<(B) (?$,1(=(B . ?,L=(B) (?$,1(>(B . ?,L>(B) | ||
| 115 | (?$,1(?(B . ?,L?(B) (?$,1(@(B . ?,L@(B) (?$,1(A(B . ?,LA(B) (?$,1(B(B . ?,LB(B) (?$,1(C(B . ?,LC(B) | ||
| 116 | (?$,1(D(B . ?,LD(B) (?$,1(E(B . ?,LE(B) (?$,1(F(B . ?,LF(B) (?$,1(G(B . ?,LG(B) (?$,1(H(B . ?,LH(B) | ||
| 117 | (?$,1(I(B . ?,LI(B) (?$,1(J(B . ?,LJ(B) (?$,1(K(B . ?,LK(B) (?$,1(L(B . ?,LL(B) (?$,1(M(B . ?,LM(B) | ||
| 118 | (?$,1(N(B . ?,LN(B) (?$,1(O(B . ?,LO(B) (?$,1(P(B . ?,LP(B) (?$,1(Q(B . ?,LQ(B) (?$,1(R(B . ?,LR(B) | ||
| 119 | (?$,1(S(B . ?,LS(B) (?$,1(T(B . ?,LT(B) (?$,1(U(B . ?,LU(B) (?$,1(V(B . ?,LV(B) (?$,1(W(B . ?,LW(B) | ||
| 120 | (?$,1(X(B . ?,LX(B) (?$,1(Y(B . ?,LY(B) (?$,1(Z(B . ?,LZ(B) (?$,1([(B . ?,L[(B) (?$,1(\(B . ?,L\(B) | ||
| 121 | (?$,1(](B . ?,L](B) (?$,1(^(B . ?,L^(B) (?$,1(_(B . ?,L_(B) (?$,1(`(B . ?,L`(B) (?$,1(a(B . ?,La(B) | ||
| 122 | (?$,1(b(B . ?,Lb(B) (?$,1(c(B . ?,Lc(B) (?$,1(d(B . ?,Ld(B) (?$,1(e(B . ?,Le(B) (?$,1(f(B . ?,Lf(B) | ||
| 123 | (?$,1(g(B . ?,Lg(B) (?$,1(h(B . ?,Lh(B) (?$,1(i(B . ?,Li(B) (?$,1(j(B . ?,Lj(B) (?$,1(k(B . ?,Lk(B) | ||
| 124 | (?$,1(l(B . ?,Ll(B) (?$,1(m(B . ?,Lm(B) (?$,1(n(B . ?,Ln(B) (?$,1(o(B . ?,Lo(B) (?$,1(q(B . ?,Lq(B) | ||
| 125 | (?$,1(r(B . ?,Lr(B) (?$,1(s(B . ?,Ls(B) (?$,1(t(B . ?,Lt(B) (?$,1(u(B . ?,Lu(B) (?$,1(v(B . ?,Lv(B) | ||
| 126 | (?$,1(w(B . ?,Lw(B) (?$,1(x(B . ?,Lx(B) (?$,1(y(B . ?,Ly(B) (?$,1(z(B . ?,Lz(B) (?$,1({(B . ?,L{(B) | ||
| 127 | (?$,1(|(B . ?,L|(B) (?$,1(~(B . ?,L~(B) (?$,1((B . ?,L(B))) | ||
| 128 | |||
| 129 | (defcustom utf-8-fragment-on-decoding nil | ||
| 130 | "Whether or not to decode some scripts in UTF-8 text into 8-bit characters. | ||
| 131 | Setting this means that the relevant Cyrillic and Greek characters are | ||
| 132 | decoded into the iso8859 charsets rather than into | ||
| 133 | mule-unicode-0100-24ff. The 8-bit characters take half as much space | ||
| 134 | in the buffer, but using them may affect how the buffer can be re-encoded | ||
| 135 | and may require a different input method to search for them, for instance. | ||
| 136 | See `unify-8859-on-decoding-mode' and `unify-8859-on-encoding-mode' | ||
| 137 | for mechanisms to make this largely transparent." | ||
| 138 | :set (lambda (s v) | ||
| 139 | (if v | ||
| 140 | (define-translation-table 'utf-8-translation-table-for-decode | ||
| 141 | utf-8-translation-table-for-decode) | ||
| 142 | (define-translation-table 'utf-8-translation-table-for-decode)) | ||
| 143 | (set-default s v)) | ||
| 144 | :version "21.4" | ||
| 145 | :type 'boolean | ||
| 146 | :group 'mule) | ||
| 147 | |||
| 148 | (defcustom utf-8-translate-cjk nil | ||
| 149 | "Whether the `mule-utf-8' coding system should encode many CJK characters. | ||
| 150 | |||
| 151 | Enabling this loads tables which enable the coding system to encode | ||
| 152 | characters in the charsets `korean-ksc5601', `chinese-gb2312' and | ||
| 153 | `japanese-jisx0208', and to decode the corresponding unicodes into | ||
| 154 | such characters. This works by loading the library `utf-8-subst'; see | ||
| 155 | its commentary. The tables are fairly large (about 33000 entries), so this | ||
| 156 | option is not the default." | ||
| 157 | :link '(emacs-commentary-link "utf-8-subst") | ||
| 158 | :set (lambda (s v) | ||
| 159 | (when v | ||
| 160 | (require 'utf-8-subst) | ||
| 161 | (let ((table (make-char-table 'translation-table))) | ||
| 162 | (coding-system-put 'mule-utf-8 'safe-charsets | ||
| 163 | (append (coding-system-get 'mule-utf-8 | ||
| 164 | 'safe-charsets) | ||
| 165 | '(korean-ksc5601 chinese-gb2312 | ||
| 166 | japanese-jisx0208))) | ||
| 167 | (maphash (lambda (k v) | ||
| 168 | (aset table k v)) | ||
| 169 | utf-8-subst-rev-table) | ||
| 170 | (register-char-codings 'mule-utf-8 table))) | ||
| 171 | (set-default s v)) | ||
| 172 | :version "21.4" | ||
| 173 | :type 'boolean | ||
| 174 | :group 'mule) | ||
| 175 | |||
| 64 | (define-ccl-program ccl-decode-mule-utf-8 | 176 | (define-ccl-program ccl-decode-mule-utf-8 |
| 65 | ;; | 177 | ;; |
| 66 | ;; charset | bytes in utf-8 | bytes in emacs | 178 | ;; charset | bytes in utf-8 | bytes in emacs |
| @@ -90,66 +202,16 @@ | |||
| 90 | ;; 1byte encoding, i.e., ascii | 202 | ;; 1byte encoding, i.e., ascii |
| 91 | (if (r0 < #x80) | 203 | (if (r0 < #x80) |
| 92 | (write r0) | 204 | (write r0) |
| 93 | 205 | (if (r0 < #xc0) ; continuation byte (invalid here) | |
| 94 | ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx | 206 | (if (r0 < #xa0) |
| 95 | (if (r0 < #xe0) | 207 | (write-multibyte-character r5 r0) |
| 96 | ((read r1) | 208 | (write-multibyte-character r6 r0)) |
| 97 | 209 | ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx | |
| 98 | (if ((r1 & #b11000000) != #b10000000) | 210 | (if (r0 < #xe0) |
| 99 | ;; Invalid 2-byte sequence | 211 | ((read r1) |
| 100 | ((if (r0 < #xa0) | 212 | |
| 101 | (write-multibyte-character r5 r0) | 213 | (if ((r1 & #b11000000) != #b10000000) |
| 102 | (write-multibyte-character r6 r0)) | 214 | ;; Invalid 2-byte sequence |
| 103 | (if (r1 < #x80) | ||
| 104 | (write r1) | ||
| 105 | (if (r1 < #xa0) | ||
| 106 | (write-multibyte-character r5 r1) | ||
| 107 | (write-multibyte-character r6 r1)))) | ||
| 108 | |||
| 109 | ((r0 &= #x1f) | ||
| 110 | (r0 <<= 6) | ||
| 111 | (r1 &= #x3f) | ||
| 112 | (r1 += r0) | ||
| 113 | ;; Now r1 holds scalar value | ||
| 114 | |||
| 115 | ;; eight-bit-control | ||
| 116 | (if (r1 < 160) | ||
| 117 | ((write-multibyte-character r5 r1)) | ||
| 118 | |||
| 119 | ;; latin-iso8859-1 | ||
| 120 | (if (r1 < 256) | ||
| 121 | ((r0 = ,(charset-id 'latin-iso8859-1)) | ||
| 122 | (r1 -= 128) | ||
| 123 | (write-multibyte-character r0 r1)) | ||
| 124 | |||
| 125 | ;; mule-unicode-0100-24ff (< 0800) | ||
| 126 | ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | ||
| 127 | (r1 -= #x0100) | ||
| 128 | (r2 = (((r1 / 96) + 32) << 7)) | ||
| 129 | (r1 %= 96) | ||
| 130 | (r1 += (r2 + 32)) | ||
| 131 | (write-multibyte-character r0 r1))))))) | ||
| 132 | |||
| 133 | ;; 3byte encoding | ||
| 134 | ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx | ||
| 135 | (if (r0 < #xf0) | ||
| 136 | ((read r1 r2) | ||
| 137 | |||
| 138 | ;; This is set to 1 if the encoding is invalid. | ||
| 139 | (r4 = 0) | ||
| 140 | |||
| 141 | (r3 = (r1 & #b11000000)) | ||
| 142 | (r3 |= ((r2 >> 2) & #b00110000)) | ||
| 143 | (if (r3 != #b10100000) | ||
| 144 | (r4 = 1) | ||
| 145 | ((r3 = ((r0 & #x0f) << 12)) | ||
| 146 | (r3 += ((r1 & #x3f) << 6)) | ||
| 147 | (r3 += (r2 & #x3f)) | ||
| 148 | (if (r3 < #x0800) | ||
| 149 | (r4 = 1)))) | ||
| 150 | |||
| 151 | (if (r4 != 0) | ||
| 152 | ;; Invalid 3-byte sequence | ||
| 153 | ((if (r0 < #xa0) | 215 | ((if (r0 < #xa0) |
| 154 | (write-multibyte-character r5 r0) | 216 | (write-multibyte-character r5 r0) |
| 155 | (write-multibyte-character r6 r0)) | 217 | (write-multibyte-character r6 r0)) |
| @@ -157,75 +219,195 @@ | |||
| 157 | (write r1) | 219 | (write r1) |
| 158 | (if (r1 < #xa0) | 220 | (if (r1 < #xa0) |
| 159 | (write-multibyte-character r5 r1) | 221 | (write-multibyte-character r5 r1) |
| 160 | (write-multibyte-character r6 r1))) | 222 | (write-multibyte-character r6 r1)))) |
| 161 | (if (r2 < #x80) | 223 | |
| 162 | (write r2) | 224 | ((r3 = r0) ; save in case of overlong sequence |
| 163 | (if (r2 < #xa0) | 225 | (r2 = r1) |
| 164 | (write-multibyte-character r5 r2) | 226 | (r0 &= #x1f) |
| 165 | (write-multibyte-character r6 r2)))) | 227 | (r0 <<= 6) |
| 228 | (r2 = r1) ; save in case of overlong sequence | ||
| 229 | (r1 &= #x3f) | ||
| 230 | (r1 += r0) | ||
| 231 | ;; Now r1 holds scalar value | ||
| 232 | |||
| 233 | (if (r1 < 128) ; `overlong sequence' | ||
| 234 | ((if (r3 < #xa0) | ||
| 235 | (write-multibyte-character r5 r3) | ||
| 236 | (write-multibyte-character r6 r3)) | ||
| 237 | (if (r2 < #x80) | ||
| 238 | (write r2) | ||
| 239 | (if (r2 < #xa0) | ||
| 240 | (write-multibyte-character r5 r2) | ||
| 241 | (write-multibyte-character r6 r2)))) | ||
| 242 | |||
| 243 | ;; eight-bit-control | ||
| 244 | (if (r1 < 160) | ||
| 245 | ((write-multibyte-character r5 r1)) | ||
| 246 | |||
| 247 | ;; latin-iso8859-1 | ||
| 248 | (if (r1 < 256) | ||
| 249 | ((r0 = ,(charset-id 'latin-iso8859-1)) | ||
| 250 | (r1 -= 128) | ||
| 251 | (write-multibyte-character r0 r1)) | ||
| 252 | |||
| 253 | ;; mule-unicode-0100-24ff (< 0800) | ||
| 254 | ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | ||
| 255 | (r1 -= #x0100) | ||
| 256 | (r2 = (((r1 / 96) + 32) << 7)) | ||
| 257 | (r1 %= 96) | ||
| 258 | (r1 += (r2 + 32)) | ||
| 259 | (translate-character | ||
| 260 | utf-8-translation-table-for-decode r0 r1) | ||
| 261 | (write-multibyte-character r0 r1)))))))) | ||
| 262 | |||
| 263 | ;; 3byte encoding | ||
| 264 | ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx | ||
| 265 | (if (r0 < #xf0) | ||
| 266 | ((read r1 r2) | ||
| 267 | |||
| 268 | ;; This is set to 1 if the encoding is invalid. | ||
| 269 | (r4 = 0) | ||
| 270 | |||
| 271 | (r3 = (r1 & #b11000000)) | ||
| 272 | (r3 |= ((r2 >> 2) & #b00110000)) | ||
| 273 | (if (r3 != #b10100000) | ||
| 274 | (r4 = 1) | ||
| 275 | ((r3 = ((r0 & #x0f) << 12)) | ||
| 276 | (r3 += ((r1 & #x3f) << 6)) | ||
| 277 | (r3 += (r2 & #x3f)) | ||
| 278 | (if (r3 < #x0800) | ||
| 279 | (r4 = 1)))) | ||
| 280 | |||
| 281 | (if (r4 != 0) | ||
| 282 | ;; Invalid 3-byte sequence | ||
| 283 | ((if (r0 < #xa0) | ||
| 284 | (write-multibyte-character r5 r0) | ||
| 285 | (write-multibyte-character r6 r0)) | ||
| 286 | (if (r1 < #x80) | ||
| 287 | (write r1) | ||
| 288 | (if (r1 < #xa0) | ||
| 289 | (write-multibyte-character r5 r1) | ||
| 290 | (write-multibyte-character r6 r1))) | ||
| 291 | (if (r2 < #x80) | ||
| 292 | (write r2) | ||
| 293 | (if (r2 < #xa0) | ||
| 294 | (write-multibyte-character r5 r2) | ||
| 295 | (write-multibyte-character r6 r2)))) | ||
| 166 | 296 | ||
| 167 | ;; mule-unicode-0100-24ff (>= 0800) | 297 | ;; mule-unicode-0100-24ff (>= 0800) |
| 168 | ((if (r3 < #x2500) | 298 | ((if (r3 < #x2500) |
| 169 | ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | 299 | ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) |
| 170 | (r3 -= #x0100) | 300 | (r3 -= #x0100) |
| 171 | (r3 //= 96) | ||
| 172 | (r1 = (r7 + 32)) | ||
| 173 | (r1 += ((r3 + 32) << 7)) | ||
| 174 | (write-multibyte-character r0 r1)) | ||
| 175 | |||
| 176 | ;; mule-unicode-2500-33ff | ||
| 177 | (if (r3 < #x3400) | ||
| 178 | ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) | ||
| 179 | (r3 -= #x2500) | ||
| 180 | (r3 //= 96) | 301 | (r3 //= 96) |
| 181 | (r1 = (r7 + 32)) | 302 | (r1 = (r7 + 32)) |
| 182 | (r1 += ((r3 + 32) << 7)) | 303 | (r1 += ((r3 + 32) << 7)) |
| 304 | (translate-character | ||
| 305 | utf-8-translation-table-for-decode r0 r1) | ||
| 183 | (write-multibyte-character r0 r1)) | 306 | (write-multibyte-character r0 r1)) |
| 184 | 307 | ||
| 185 | ;; U+3400 .. U+DFFF | 308 | ;; mule-unicode-2500-33ff |
| 186 | ;; keep those bytes as eight-bit-{control|graphic} | 309 | ;; Fixme: Perhaps allow translation via |
| 187 | (if (r3 < #xe000) | 310 | ;; utf-8-subst-table for #x2e80 up, so that we use |
| 188 | ( ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic | 311 | ;; consistent charsets for all of CJK. Would need |
| 189 | (r3 = r6) | 312 | ;; corresponding change to encoding tables. |
| 190 | (write-multibyte-character r3 r0) | 313 | (if (r3 < #x3400) |
| 191 | (if (r1 < #xa0) | 314 | ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) |
| 192 | (r3 = r5)) | 315 | (r3 -= #x2500) |
| 193 | (write-multibyte-character r3 r1) | 316 | (r3 //= 96) |
| 194 | (if (r2 < #xa0) | 317 | (r1 = (r7 + 32)) |
| 195 | (r3 = r5) | 318 | (r1 += ((r3 + 32) << 7)) |
| 196 | (r3 = r6)) | 319 | (write-multibyte-character r0 r1)) |
| 197 | (write-multibyte-character r3 r2)) | 320 | |
| 321 | ;; U+3400 .. U+D7FF | ||
| 322 | ;; Try to convert to CJK chars, else keep | ||
| 323 | ;; them as eight-bit-{control|graphic}. | ||
| 324 | (if (r3 < #xd800) | ||
| 325 | ((r4 = r3) ; don't zap r3 | ||
| 326 | (lookup-integer utf-8-subst-table r4 r5) | ||
| 327 | (if r7 | ||
| 328 | ;; got a translation | ||
| 329 | ((write-multibyte-character r4 r5) | ||
| 330 | ;; Zapped through register starvation. | ||
| 331 | (r5 = ,(charset-id 'eight-bit-control))) | ||
| 332 | ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic | ||
| 333 | ((r3 = r6) | ||
| 334 | (write-multibyte-character r3 r0) | ||
| 335 | (if (r1 < #xa0) | ||
| 336 | (r3 = r5)) | ||
| 337 | (write-multibyte-character r3 r1) | ||
| 338 | (if (r2 < #xa0) | ||
| 339 | (r3 = r5) | ||
| 340 | (r3 = r6)) | ||
| 341 | (write-multibyte-character r3 r2)))) | ||
| 342 | |||
| 343 | ;; Surrogates, U+D800 .. U+DFFF | ||
| 344 | ;; Fixme: process them properly. | ||
| 345 | (if (r3 < #xe000) | ||
| 346 | ((r3 = r6) | ||
| 347 | (write-multibyte-character r3 r0) ; eight-bit-graphic | ||
| 348 | (if (r1 < #xa0) | ||
| 349 | (r3 = r5)) | ||
| 350 | (write-multibyte-character r3 r1) | ||
| 351 | (if (r2 < #xa0) | ||
| 352 | (r3 = r5) | ||
| 353 | (r3 = r6)) | ||
| 354 | (write-multibyte-character r3 r2)) | ||
| 198 | 355 | ||
| 199 | ;; mule-unicode-e000-ffff | 356 | ;; mule-unicode-e000-ffff |
| 200 | ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) | 357 | ;; Fixme: fffe and ffff are invalid. |
| 201 | (r3 -= #xe000) | 358 | ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) |
| 202 | (r3 //= 96) | 359 | (r3 -= #xe000) |
| 203 | (r1 = (r7 + 32)) | 360 | (r3 //= 96) |
| 204 | (r1 += ((r3 + 32) << 7)) | 361 | (r1 = (r7 + 32)) |
| 205 | (write-multibyte-character r0 r1)))))))) | 362 | (r1 += ((r3 + 32) << 7)) |
| 206 | 363 | (write-multibyte-character r0 r1))))))))) | |
| 207 | ;; 4byte encoding | 364 | |
| 208 | ;; keep those bytes as eight-bit-{control|graphic} | 365 | (if (r0 < #xfe) |
| 209 | ((read r1 r2 r3) | 366 | ;; 4byte encoding |
| 210 | ;; r0 > #xf0, thus eight-bit-graphic | 367 | ;; keep those bytes as eight-bit-{control|graphic} |
| 211 | (write-multibyte-character r6 r0) | 368 | ;; Fixme: allow lookup in utf-8-subst-table. |
| 212 | (if (r1 < #xa0) | 369 | ((read r1 r2 r3) |
| 213 | (write-multibyte-character r5 r1) | 370 | ;; r0 > #xf0, thus eight-bit-graphic |
| 214 | (write-multibyte-character r6 r1)) | 371 | (write-multibyte-character r6 r0) |
| 215 | (if (r2 < #xa0) | 372 | (if (r1 < #xa0) |
| 216 | (write-multibyte-character r5 r2) | 373 | (if (r1 < #x80) ; invalid byte |
| 217 | (write-multibyte-character r6 r2)) | 374 | (write r1) |
| 218 | (if (r3 < #xa0) | 375 | (write-multibyte-character r5 r1)) |
| 219 | (write-multibyte-character r5 r3) | 376 | (write-multibyte-character r6 r1)) |
| 220 | (write-multibyte-character r6 r3)))))) | 377 | (if (r2 < #xa0) |
| 221 | 378 | (if (r2 < #x80) ; invalid byte | |
| 379 | (write r2) | ||
| 380 | (write-multibyte-character r5 r2)) | ||
| 381 | (write-multibyte-character r6 r2)) | ||
| 382 | (if (r3 < #xa0) | ||
| 383 | (if (r3 < #x80) ; invalid byte | ||
| 384 | (write r3) | ||
| 385 | (write-multibyte-character r5 r3)) | ||
| 386 | (write-multibyte-character r6 r3)) | ||
| 387 | (if (r0 >= #xf8) ; 5- or 6-byte encoding | ||
| 388 | ((read r1) | ||
| 389 | (if (r1 < #xa0) | ||
| 390 | (if (r1 < #x80) ; invalid byte | ||
| 391 | (write r1) | ||
| 392 | (write-multibyte-character r5 r1)) | ||
| 393 | (write-multibyte-character r6 r1)) | ||
| 394 | (if (r0 >= #xfc) ; 6-byte | ||
| 395 | ((read r1) | ||
| 396 | (if (r1 < #xa0) | ||
| 397 | (if (r1 < #x80) ; invalid byte | ||
| 398 | (write r1) | ||
| 399 | (write-multibyte-character r5 r1)) | ||
| 400 | (write-multibyte-character r6 r1))))))) | ||
| 401 | ;; else invalid byte >= #xfe | ||
| 402 | (write-multibyte-character r6 r0)))))) | ||
| 222 | (repeat)))) | 403 | (repeat)))) |
| 223 | 404 | ||
| 224 | "CCL program to decode UTF-8. | 405 | "CCL program to decode UTF-8. |
| 225 | Basic decoding is done into the charsets ascii, latin-iso8859-1 and | 406 | Basic decoding is done into the charsets ascii, latin-iso8859-1 and |
| 226 | mule-unicode-*. Encodings of un-representable Unicode characters are | 407 | mule-unicode-*, but see also `utf-8-translation-table-for-decode' and |
| 227 | decoded asis into eight-bit-control and eight-bit-graphic | 408 | `utf-8-subst-table'. |
| 228 | characters.") | 409 | Encodings of un-representable Unicode characters are decoded asis into |
| 410 | eight-bit-control and eight-bit-graphic characters.") | ||
| 229 | 411 | ||
| 230 | (define-ccl-program ccl-encode-mule-utf-8 | 412 | (define-ccl-program ccl-encode-mule-utf-8 |
| 231 | `(1 | 413 | `(1 |
| @@ -288,7 +470,7 @@ characters.") | |||
| 288 | (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) | 470 | (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) |
| 289 | ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | 471 | ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) |
| 290 | (r1 &= #x7f) | 472 | (r1 &= #x7f) |
| 291 | (r1 += (r0 + 57312)) ; 57312 == -160 + #xe000 | 473 | (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000 |
| 292 | (r0 = (((r1 & #xf000) >> 12) | #xe0)) | 474 | (r0 = (((r1 & #xf000) >> 12) | #xe0)) |
| 293 | (r2 = ((r1 & #x3f) | #x80)) | 475 | (r2 = ((r1 & #x3f) | #x80)) |
| 294 | (r1 &= #x0fc0) | 476 | (r1 &= #x0fc0) |
| @@ -329,11 +511,19 @@ characters.") | |||
| 329 | ((write #xc2) | 511 | ((write #xc2) |
| 330 | (write r1))))))) | 512 | (write r1))))))) |
| 331 | 513 | ||
| 332 | ;; Unsupported character. | 514 | ((lookup-character utf-8-subst-rev-table r0 r1) |
| 333 | ;; Output U+FFFD, which is `ef bf bd' in UTF-8. | 515 | (if r7 ; lookup succeeded |
| 334 | ((write #xef) | 516 | ((r1 = (((r0 & #xf000) >> 12) | #xe0)) |
| 335 | (write #xbf) | 517 | (r2 = ((r0 & #x3f) | #x80)) |
| 336 | (write #xbd))))))))) | 518 | (r0 &= #x0fc0) |
| 519 | (r0 >>= 6) | ||
| 520 | (r0 |= #x80) | ||
| 521 | (write r1 r0 r2)) | ||
| 522 | ;; Unsupported character. | ||
| 523 | ;; Output U+FFFD, which is `ef bf bd' in UTF-8. | ||
| 524 | ((write #xef) | ||
| 525 | (write #xbf) | ||
| 526 | (write #xbd))))))))))) | ||
| 337 | (repeat))) | 527 | (repeat))) |
| 338 | (if (r1 >= #xa0) | 528 | (if (r1 >= #xa0) |
| 339 | (write r1) | 529 | (write r1) |
| @@ -341,69 +531,89 @@ characters.") | |||
| 341 | ((write #xc2) | 531 | ((write #xc2) |
| 342 | (write r1))))) | 532 | (write r1))))) |
| 343 | 533 | ||
| 344 | "CCL program to encode into UTF-8. | 534 | "CCL program to encode into UTF-8.") |
| 345 | Only characters from the charsets ascii, eight-bit-control, | ||
| 346 | eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized. | ||
| 347 | Others are encoded as U+FFFD.") | ||
| 348 | 535 | ||
| 349 | ;; Dummy definition so that the CCL can be checked correctly; the | 536 | ;; Dummy definition so that the CCL can be checked correctly; the |
| 350 | ;; actual data are loaded on demand. | 537 | ;; actual data are loaded on demand. |
| 351 | (unless (boundp 'ucs-mule-8859-to-mule-unicode) ; don't zap it | 538 | (unless (boundp 'ucs-mule-8859-to-mule-unicode) ; don't zap it |
| 352 | (define-translation-table 'ucs-mule-8859-to-mule-unicode)) | 539 | (define-translation-table 'ucs-mule-8859-to-mule-unicode)) |
| 353 | 540 | ||
| 541 | (define-ccl-program ccl-untranslated-to-ucs | ||
| 542 | `(0 | ||
| 543 | (if (r0 < #xf0) ; 3-byte encoding, as above | ||
| 544 | ((r4 = 0) | ||
| 545 | (r3 = (r1 & #b11000000)) | ||
| 546 | (r3 |= ((r2 >> 2) & #b00110000)) | ||
| 547 | (if (r3 != #b10100000) | ||
| 548 | (r4 = 1) | ||
| 549 | ((r3 = ((r0 & #x0f) << 12)) | ||
| 550 | (r3 += ((r1 & #x3f) << 6)) | ||
| 551 | (r3 += (r2 & #x3f)) | ||
| 552 | (if (r3 < #x0800) | ||
| 553 | (r4 = 1)))) | ||
| 554 | (if (r4 != 0) | ||
| 555 | (r0 = 0) | ||
| 556 | (r0 = r3))) | ||
| 557 | (if (r0 < #xf8) ; 4-byte (Mule-UCS recipe) | ||
| 558 | ((r4 = (r1 >> 6)) | ||
| 559 | (if (r4 != #b10) | ||
| 560 | (r0 = 0) | ||
| 561 | ((r4 = (r2 >> 6)) | ||
| 562 | (if (r4 != #b10) | ||
| 563 | (r0 = 0) | ||
| 564 | ((r4 = (r3 >> 6)) | ||
| 565 | (if (r4 != #b10) | ||
| 566 | (r0 = 0) | ||
| 567 | ((r1 = ((r1 & #x3F) << 12)) | ||
| 568 | (r2 = ((r2 & #x3F) << 6)) | ||
| 569 | (r3 &= #x3F) | ||
| 570 | (r0 = (((((r0 & #x07) << 18) | r1) | r2) | r3))))))))) | ||
| 571 | (r0 = 0)))) | ||
| 572 | "Decode 3- or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0. | ||
| 573 | r0 == 0 for invalid sequence.") | ||
| 574 | |||
| 575 | (defvar utf-8-ccl-regs (make-vector 8 0)) | ||
| 576 | |||
| 354 | (defsubst utf-8-untranslated-to-ucs () | 577 | (defsubst utf-8-untranslated-to-ucs () |
| 355 | (let ((b1 (char-after)) | 578 | "Return the UCS code for an untranslated sequence of raw bytes t point. |
| 356 | (b2 (char-after (1+ (point)))) | 579 | Only for 3- or 4-byte sequences." |
| 357 | (b3 (char-after (+ 2 (point)))) | 580 | (aset utf-8-ccl-regs 0 (or (char-after) 0)) |
| 358 | (b4 (char-after (+ 4 (point))))) | 581 | (aset utf-8-ccl-regs 1 (or (char-after (1+ (point))) 0)) |
| 359 | (if (and b1 b2 b3) | 582 | (aset utf-8-ccl-regs 2 (or (char-after (+ 2 (point))) 0)) |
| 360 | (cond ((< b1 ?\xf0) | 583 | (aset utf-8-ccl-regs 3 (or (char-after (+ 3 (point))) 0)) |
| 361 | (setq b2 (lsh (logand b2 ?\x3f) 6)) | 584 | (ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs) |
| 362 | (setq b3 (logand b3 ?\x3f)) | 585 | (aref utf-8-ccl-regs 0)) |
| 363 | (logior b3 (logior b2 (lsh (logand b1 ?\x0f) 12)))) | ||
| 364 | (b4 | ||
| 365 | (setq b2 (lsh (logand b2 ?\x3f) 12)) | ||
| 366 | (setq b3 (lsh (logand b3 ?\x3f) 6)) | ||
| 367 | (setq b4 (logand b4 ?\x3f)) | ||
| 368 | (logior b4 (logior b3 (logior b2 (lsh (logand b1 ?\x07) | ||
| 369 | 18))))))))) | ||
| 370 | 586 | ||
| 371 | (defun utf-8-help-echo (window object position) | 587 | (defun utf-8-help-echo (window object position) |
| 372 | (format "Untranslated Unicode U+%04X" | 588 | (format "Untranslated Unicode U+%04X" |
| 373 | (get-char-property position 'untranslated-utf-8 object))) | 589 | (get-char-property position 'untranslated-utf-8 object))) |
| 374 | 590 | ||
| 375 | (defvar utf-8-subst-table nil | ||
| 376 | "If non-nil, a hash table mapping `untranslatable utf-8' to Emacs characters.") | ||
| 377 | |||
| 378 | ;; We compose the untranslatable sequences into a single character. | 591 | ;; We compose the untranslatable sequences into a single character. |
| 379 | ;; This is infelicitous for editing, because there's currently no | 592 | ;; This is infelicitous for editing, because there's currently no |
| 380 | ;; mechanism for treating compositions as atomic, but is OK for | 593 | ;; mechanism for treating compositions as atomic, but is OK for |
| 381 | ;; display. We try to compose an appropriate character from a hash | 594 | ;; display. They are composed to U+FFFD with help-echo which |
| 382 | ;; table of CJK characters to display correctly. Otherwise we use | 595 | ;; indicates the unicodes they represent. This function GCs too much. |
| 383 | ;; U+FFFD. What we really should have is hash table lookup from CCL | ||
| 384 | ;; so that we could do this properly. This function GCs too much. | ||
| 385 | (defsubst utf-8-compose () | 596 | (defsubst utf-8-compose () |
| 386 | "Put a suitable composition on an untranslatable sequence. | 597 | "Put a suitable composition on an untranslatable sequence. |
| 387 | Return the sequence's length." | 598 | Return the sequence's length." |
| 388 | (let* ((u (utf-8-untranslated-to-ucs)) | 599 | (let* ((u (utf-8-untranslated-to-ucs)) |
| 389 | (l (and u (if (>= u ?\x10000) | 600 | (l (unless (zerop u) |
| 601 | (if (>= u #x10000) | ||
| 390 | 4 | 602 | 4 |
| 391 | 3))) | 603 | 3)))) |
| 392 | (subst (and utf-8-subst-table (gethash u utf-8-subst-table)))) | 604 | (when l |
| 393 | (when u | ||
| 394 | (put-text-property (point) (min (point-max) (+ l (point))) | 605 | (put-text-property (point) (min (point-max) (+ l (point))) |
| 395 | 'untranslated-utf-8 u) | 606 | 'untranslated-utf-8 u) |
| 396 | (unless subst | 607 | (put-text-property (point) (min (point-max) (+ l (point))) |
| 397 | (put-text-property (point) (min (point-max) (+ l (point))) | 608 | 'help-echo 'utf-8-help-echo) |
| 398 | 'help-echo 'utf-8-help-echo) | 609 | (compose-region (point) (+ l (point)) ?$,3u=(B) |
| 399 | (setq subst ?$,3u=(B)) | ||
| 400 | (compose-region (point) (+ l (point)) subst) | ||
| 401 | l))) | 610 | l))) |
| 402 | 611 | ||
| 403 | (defcustom utf-8-compose-scripts nil | 612 | (defcustom utf-8-compose-scripts nil |
| 404 | "*Non-nil means compose various scipts on decoding utf-8 text." | 613 | "*Non-nil means compose various scripts on decoding utf-8 text." |
| 405 | :group 'mule | 614 | :group 'mule |
| 406 | :type 'boolean) ; omitted in Emacs 21.1 | 615 | :version "21.4" |
| 616 | :type 'boolean) | ||
| 407 | 617 | ||
| 408 | (defun utf-8-post-read-conversion (length) | 618 | (defun utf-8-post-read-conversion (length) |
| 409 | "Compose untranslated utf-8 sequences into single characters. | 619 | "Compose untranslated utf-8 sequences into single characters. |
| @@ -412,38 +622,39 @@ Also compose particular scripts if `utf-8-compose-scripts' is non-nil." | |||
| 412 | ;; Can't do eval-when-compile to insert a multibyte constant | 622 | ;; Can't do eval-when-compile to insert a multibyte constant |
| 413 | ;; version of the string in the loop, since it's always loaded as | 623 | ;; version of the string in the loop, since it's always loaded as |
| 414 | ;; unibyte from a byte-compiled file. | 624 | ;; unibyte from a byte-compiled file. |
| 415 | (let ((range (string-as-multibyte "^\341-\377"))) | 625 | (let ((range (string-as-multibyte "^\xe1-\xf7"))) |
| 416 | (while (and (skip-chars-forward | 626 | (while (and (skip-chars-forward range) |
| 417 | range) | ||
| 418 | (not (eobp))) | 627 | (not (eobp))) |
| 419 | (forward-char (utf-8-compose))))) | 628 | (forward-char (utf-8-compose))))) |
| 420 | ;; Fixme: Takahashi-san implies it may not work this easily -- needs | 629 | ;; Fixme: Takahashi-san implies it may not work this easily. I |
| 421 | ;; checking with him. | 630 | ;; asked why but didn't get a reply. -- fx |
| 422 | (when (and utf-8-compose-scripts (> length 1)) | 631 | (when (and utf-8-compose-scripts (> length 1)) |
| 423 | ;; These currently have definitions which cover the relevant | 632 | ;; These currently have definitions which cover the relevant |
| 424 | ;; Unicodes. We could avoid loading thai-util &c by checking | 633 | ;; unicodes. We could avoid loading thai-util &c by checking |
| 425 | ;; whether the region contains any characters with the appropriate | 634 | ;; whether the region contains any characters with the appropriate |
| 426 | ;; categories. There aren't yet Unicode-based rules for Tibetan. | 635 | ;; categories. There aren't yet Unicode-based rules for Tibetan. |
| 427 | (save-excursion (setq length (diacritic-post-read-conversion length))) | 636 | (save-excursion (setq length (diacritic-post-read-conversion length))) |
| 428 | (save-excursion (setq length (thai-post-read-conversion length))) | 637 | (save-excursion (setq length (thai-post-read-conversion length))) |
| 429 | (save-excursion (setq length (lao-post-read-conversion length))) | 638 | (save-excursion (setq length (lao-post-read-conversion length))) |
| 430 | (save-excursion (setq length (devanagari-post-read-conversion length)))) | 639 | (save-excursion |
| 640 | (setq length (in-is13194-devanagari-post-read-conversion length)))) | ||
| 431 | length) | 641 | length) |
| 432 | 642 | ||
| 433 | (defun utf-8-pre-write-conversion (beg end) | 643 | ;; ucs-tables is preloaded |
| 434 | "Semi-dummy pre-write function effectively to autoload ucs-tables." | 644 | ;; (defun utf-8-pre-write-conversion (beg end) |
| 435 | ;; Ensure translation table is loaded. | 645 | ;; "Semi-dummy pre-write function effectively to autoload ucs-tables." |
| 436 | (require 'ucs-tables) | 646 | ;; ;; Ensure translation table is loaded. |
| 437 | ;; Don't do this again. | 647 | ;; (require 'ucs-tables) |
| 438 | (coding-system-put 'mule-utf-8 'pre-write-conversion nil) | 648 | ;; ;; Don't do this again. |
| 439 | nil) | 649 | ;; (coding-system-put 'mule-utf-8 'pre-write-conversion nil) |
| 650 | ;; nil) | ||
| 440 | 651 | ||
| 441 | (make-coding-system | 652 | (make-coding-system |
| 442 | 'mule-utf-8 4 ?u | 653 | 'mule-utf-8 4 ?u |
| 443 | "UTF-8 encoding for Emacs-supported Unicode characters. | 654 | "UTF-8 encoding for Emacs-supported Unicode characters. |
| 444 | The supported Emacs character sets are the following, plus others | 655 | The supported Emacs character sets are the following, plus any other |
| 445 | which may be included in the translation table | 656 | characters included in the tables `ucs-mule-to-mule-unicode' and |
| 446 | `ucs-mule-to-mule-unicode': | 657 | `utf-8-subst-rev-table': |
| 447 | ascii | 658 | ascii |
| 448 | eight-bit-control | 659 | eight-bit-control |
| 449 | eight-bit-graphic | 660 | eight-bit-graphic |
| @@ -462,10 +673,12 @@ which may be included in the translation table | |||
| 462 | mule-unicode-e000-ffff | 673 | mule-unicode-e000-ffff |
| 463 | 674 | ||
| 464 | Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF | 675 | Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF |
| 465 | are decoded into sequences of eight-bit-control and eight-bit-graphic | 676 | may be decoded into korean-ksc5601, chinese-gb2312, japanese-jisx0208 |
| 466 | characters to preserve their byte sequences and composed to display as | 677 | \(see user option `utf-8-translate-cjk'); otherwise, sequences of |
| 467 | a single character. Emacs characters that can't be encoded to these | 678 | eight-bit-control and eight-bit-graphic characters are used to |
| 468 | ranges are encoded as U+FFFD." | 679 | preserve their byte sequences, and these are composed to display as a |
| 680 | single character. Emacs characters that otherwise can't be encoded | ||
| 681 | are encoded as U+FFFD." | ||
| 469 | 682 | ||
| 470 | '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) | 683 | '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) |
| 471 | '((safe-charsets | 684 | '((safe-charsets |
| @@ -497,7 +710,7 @@ ranges are encoded as U+FFFD." | |||
| 497 | (mime-charset . utf-8) | 710 | (mime-charset . utf-8) |
| 498 | (coding-category . coding-category-utf-8) | 711 | (coding-category . coding-category-utf-8) |
| 499 | (valid-codes (0 . 255)) | 712 | (valid-codes (0 . 255)) |
| 500 | (pre-write-conversion . utf-8-pre-write-conversion) | 713 | ;; (pre-write-conversion . utf-8-pre-write-conversion) |
| 501 | (post-read-conversion . utf-8-post-read-conversion))) | 714 | (post-read-conversion . utf-8-post-read-conversion))) |
| 502 | 715 | ||
| 503 | (define-coding-system-alias 'utf-8 'mule-utf-8) | 716 | (define-coding-system-alias 'utf-8 'mule-utf-8) |