aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDave Love2002-07-17 15:04:25 +0000
committerDave Love2002-07-17 15:04:25 +0000
commit9ca2ac2dbd525c58754d1cb4d8db98cc6e65f505 (patch)
treed58351e721a9e0160ba884fc0c5a3be858708c84
parentf9bd23fdb89db3c6abbae24e332a300cbf1cf5ac (diff)
downloademacs-9ca2ac2dbd525c58754d1cb4d8db98cc6e65f505.tar.gz
emacs-9ca2ac2dbd525c58754d1cb4d8db98cc6e65f505.zip
(utf-8-subst-table)
(utf-8-subst-rev-table, utf-8-translation-table-for-decode) (utf-8-fragment-on-decoding, ccl-untranslated-to-ucs) (utf-8-ccl-regs, utf-8-translate-cjk): New. (ccl-encode-mule-utf-8): Use utf-8-subst-rev-table. (ccl-decode-mule-utf-8, ccl-untranslated-to-ucs) (utf-8-untranslated-to-ucs, utf-8-compose): Rewritten. (mule-utf-8): Remove pre-write-conversion. (utf-8-post-read-conversion): Comment out.
-rw-r--r--lisp/international/utf-8.el591
1 files changed, 402 insertions, 189 deletions
diff --git a/lisp/international/utf-8.el b/lisp/international/utf-8.el
index 068a7bbeaa1..b3f6390322e 100644
--- a/lisp/international/utf-8.el
+++ b/lisp/international/utf-8.el
@@ -1,10 +1,11 @@
1;;; utf-8.el --- limited UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*- 1;;; utf-8.el --- UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*-
2 2
3;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN. 3;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
4;; Licensed to the Free Software Foundation. 4;; Licensed to the Free Software Foundation.
5;; Copyright (C) 2001 Free Software Foundation, Inc. 5;; Copyright (C) 2001, 2002 Free Software Foundation, Inc.
6 6
7;; Author: TAKAHASHI Naoto <ntakahas@m17n.org> 7;; Author: TAKAHASHI Naoto <ntakahas@m17n.org>
8;; Maintainer: FSF
8;; Keywords: multilingual, Unicode, UTF-8, i18n 9;; Keywords: multilingual, Unicode, UTF-8, i18n
9 10
10;; This file is part of GNU Emacs. 11;; This file is part of GNU Emacs.
@@ -39,11 +40,18 @@
39;; On decoding, Unicode characters that do not fit into the above 40;; On decoding, Unicode characters that do not fit into the above
40;; character sets are handled as `eight-bit-control' or 41;; character sets are handled as `eight-bit-control' or
41;; `eight-bit-graphic' characters to retain the information about the 42;; `eight-bit-graphic' characters to retain the information about the
42;; original byte sequence. 43;; original byte sequence and text properties record the corresponding
44;; unicode.
45;;
46;; Fixme: note that reading and writing invalid utf-8 may not be
47;; idempotent -- to represent the bytes to fix that needs a new charset.
43;; 48;;
44;; Characters from other character sets can be encoded with 49;; Characters from other character sets can be encoded with
45;; mule-utf-8 by populating the table `ucs-mule-to-mule-unicode' and 50;; mule-utf-8 by populating the table `ucs-mule-to-mule-unicode' and
46;; registering the translation with `register-char-codings'. 51;; registering the translation with `register-char-codings'. Hash
52;; tables `utf-8-subst-table' and `utf-8-subst-rev-table' are used to
53;; support encoding and decoding of about a quarter of the CJK space
54;; between U+3400 and U+DFFF.
47 55
48;; UTF-8 is defined in RFC 2279. A sketch of the encoding is: 56;; UTF-8 is defined in RFC 2279. A sketch of the encoding is:
49 57
@@ -60,7 +68,111 @@
60 "Translation table for encoding to `mule-utf-8'.") 68 "Translation table for encoding to `mule-utf-8'.")
61;; Could have been done by ucs-tables loaded before. 69;; Could have been done by ucs-tables loaded before.
62(unless (get 'ucs-mule-to-mule-unicode 'translation-table) 70(unless (get 'ucs-mule-to-mule-unicode 'translation-table)
63 (define-translation-table 'ucs-mule-to-mule-unicode ucs-mule-to-mule-unicode)) 71 (define-translation-table 'ucs-mule-to-mule-unicode
72 ucs-mule-to-mule-unicode))
73
74(defvar utf-8-subst-table (make-hash-table :test 'eq))
75(defvar utf-8-subst-rev-table (make-hash-table :test 'eq))
76(define-translation-hash-table 'utf-8-subst-table utf-8-subst-table)
77(define-translation-hash-table 'utf-8-subst-rev-table utf-8-subst-rev-table)
78
79(defvar utf-8-translation-table-for-decode (make-translation-table)
80 "Translation table applied after decoding utf-8 to mule-unicode.
81This is only actually applied to characters which would normally be
82decoded into mule-unicode-0100-24ff.")
83(define-translation-table 'utf-8-translation-table-for-decode
84 utf-8-translation-table-for-decode)
85
86;; Map Cyrillic and Greek to iso-8859 charsets, which take half the
87;; space of mule-unicode. For Latin scripts this isn't very
88;; important. Hebrew and Arabic might go here too when there's proper
89;; support for them.
90(mapc
91 (lambda (pair)
92 (aset utf-8-translation-table-for-decode (car pair) (cdr pair)))
93 '((?$,1&d(B . ?,F4(B) (?$,1&e(B . ?,F5(B) (?$,1&f(B . ?,F6(B) (?$,1&h(B . ?,F8(B) (?$,1&i(B . ?,F9(B)
94 (?$,1&j(B . ?,F:(B) (?$,1&l(B . ?,F<(B) (?$,1&n(B . ?,F>(B) (?$,1&o(B . ?,F?(B) (?$,1&p(B . ?,F@(B)
95 (?$,1&q(B . ?,FA(B) (?$,1&r(B . ?,FB(B) (?$,1&s(B . ?,FC(B) (?$,1&t(B . ?,FD(B) (?$,1&u(B . ?,FE(B)
96 (?$,1&v(B . ?,FF(B) (?$,1&w(B . ?,FG(B) (?$,1&x(B . ?,FH(B) (?$,1&y(B . ?,FI(B) (?$,1&z(B . ?,FJ(B)
97 (?$,1&{(B . ?,FK(B) (?$,1&|(B . ?,FL(B) (?$,1&}(B . ?,FM(B) (?$,1&~(B . ?,FN(B) (?$,1&(B . ?,FO(B)
98 (?$,1' (B . ?,FP(B) (?$,1'!(B . ?,FQ(B) (?$,1'#(B . ?,FS(B) (?$,1'$(B . ?,FT(B) (?$,1'%(B . ?,FU(B)
99 (?$,1'&(B . ?,FV(B) (?$,1''(B . ?,FW(B) (?$,1'((B . ?,FX(B) (?$,1')(B . ?,FY(B) (?$,1'*(B . ?,FZ(B)
100 (?$,1'+(B . ?,F[(B) (?$,1',(B . ?,F\(B) (?$,1'-(B . ?,F](B) (?$,1'.(B . ?,F^(B) (?$,1'/(B . ?,F_(B)
101 (?$,1'0(B . ?,F`(B) (?$,1'1(B . ?,Fa(B) (?$,1'2(B . ?,Fb(B) (?$,1'3(B . ?,Fc(B) (?$,1'4(B . ?,Fd(B)
102 (?$,1'5(B . ?,Fe(B) (?$,1'6(B . ?,Ff(B) (?$,1'7(B . ?,Fg(B) (?$,1'8(B . ?,Fh(B) (?$,1'9(B . ?,Fi(B)
103 (?$,1':(B . ?,Fj(B) (?$,1';(B . ?,Fk(B) (?$,1'<(B . ?,Fl(B) (?$,1'=(B . ?,Fm(B) (?$,1'>(B . ?,Fn(B)
104 (?$,1'?(B . ?,Fo(B) (?$,1'@(B . ?,Fp(B) (?$,1'A(B . ?,Fq(B) (?$,1'B(B . ?,Fr(B) (?$,1'C(B . ?,Fs(B)
105 (?$,1'D(B . ?,Ft(B) (?$,1'E(B . ?,Fu(B) (?$,1'F(B . ?,Fv(B) (?$,1'G(B . ?,Fw(B) (?$,1'H(B . ?,Fx(B)
106 (?$,1'I(B . ?,Fy(B) (?$,1'J(B . ?,Fz(B) (?$,1'K(B . ?,F{(B) (?$,1'L(B . ?,F|(B) (?$,1'M(B . ?,F}(B)
107 (?$,1'N(B . ?,F~(B)
108
109 (?$,1(!(B . ?,L!(B) (?$,1("(B . ?,L"(B) (?$,1(#(B . ?,L#(B) (?$,1($(B . ?,L$(B)
110 (?$,1(%(B . ?,L%(B) (?$,1(&(B . ?,L&(B) (?$,1('(B . ?,L'(B) (?$,1(((B . ?,L((B) (?$,1()(B . ?,L)(B)
111 (?$,1(*(B . ?,L*(B) (?$,1(+(B . ?,L+(B) (?$,1(,(B . ?,L,(B) (?$,1(.(B . ?,L.(B) (?$,1(/(B . ?,L/(B)
112 (?$,1(0(B . ?,L0(B) (?$,1(1(B . ?,L1(B) (?$,1(2(B . ?,L2(B) (?$,1(3(B . ?,L3(B) (?$,1(4(B . ?,L4(B)
113 (?$,1(5(B . ?,L5(B) (?$,1(6(B . ?,L6(B) (?$,1(7(B . ?,L7(B) (?$,1(8(B . ?,L8(B) (?$,1(9(B . ?,L9(B)
114 (?$,1(:(B . ?,L:(B) (?$,1(;(B . ?,L;(B) (?$,1(<(B . ?,L<(B) (?$,1(=(B . ?,L=(B) (?$,1(>(B . ?,L>(B)
115 (?$,1(?(B . ?,L?(B) (?$,1(@(B . ?,L@(B) (?$,1(A(B . ?,LA(B) (?$,1(B(B . ?,LB(B) (?$,1(C(B . ?,LC(B)
116 (?$,1(D(B . ?,LD(B) (?$,1(E(B . ?,LE(B) (?$,1(F(B . ?,LF(B) (?$,1(G(B . ?,LG(B) (?$,1(H(B . ?,LH(B)
117 (?$,1(I(B . ?,LI(B) (?$,1(J(B . ?,LJ(B) (?$,1(K(B . ?,LK(B) (?$,1(L(B . ?,LL(B) (?$,1(M(B . ?,LM(B)
118 (?$,1(N(B . ?,LN(B) (?$,1(O(B . ?,LO(B) (?$,1(P(B . ?,LP(B) (?$,1(Q(B . ?,LQ(B) (?$,1(R(B . ?,LR(B)
119 (?$,1(S(B . ?,LS(B) (?$,1(T(B . ?,LT(B) (?$,1(U(B . ?,LU(B) (?$,1(V(B . ?,LV(B) (?$,1(W(B . ?,LW(B)
120 (?$,1(X(B . ?,LX(B) (?$,1(Y(B . ?,LY(B) (?$,1(Z(B . ?,LZ(B) (?$,1([(B . ?,L[(B) (?$,1(\(B . ?,L\(B)
121 (?$,1(](B . ?,L](B) (?$,1(^(B . ?,L^(B) (?$,1(_(B . ?,L_(B) (?$,1(`(B . ?,L`(B) (?$,1(a(B . ?,La(B)
122 (?$,1(b(B . ?,Lb(B) (?$,1(c(B . ?,Lc(B) (?$,1(d(B . ?,Ld(B) (?$,1(e(B . ?,Le(B) (?$,1(f(B . ?,Lf(B)
123 (?$,1(g(B . ?,Lg(B) (?$,1(h(B . ?,Lh(B) (?$,1(i(B . ?,Li(B) (?$,1(j(B . ?,Lj(B) (?$,1(k(B . ?,Lk(B)
124 (?$,1(l(B . ?,Ll(B) (?$,1(m(B . ?,Lm(B) (?$,1(n(B . ?,Ln(B) (?$,1(o(B . ?,Lo(B) (?$,1(q(B . ?,Lq(B)
125 (?$,1(r(B . ?,Lr(B) (?$,1(s(B . ?,Ls(B) (?$,1(t(B . ?,Lt(B) (?$,1(u(B . ?,Lu(B) (?$,1(v(B . ?,Lv(B)
126 (?$,1(w(B . ?,Lw(B) (?$,1(x(B . ?,Lx(B) (?$,1(y(B . ?,Ly(B) (?$,1(z(B . ?,Lz(B) (?$,1({(B . ?,L{(B)
127 (?$,1(|(B . ?,L|(B) (?$,1(~(B . ?,L~(B) (?$,1((B . ?,L(B)))
128
129(defcustom utf-8-fragment-on-decoding nil
130 "Whether or not to decode some scripts in UTF-8 text into 8-bit characters.
131Setting this means that the relevant Cyrillic and Greek characters are
132decoded into the iso8859 charsets rather than into
133mule-unicode-0100-24ff. The 8-bit characters take half as much space
134in the buffer, but using them may affect how the buffer can be re-encoded
135and may require a different input method to search for them, for instance.
136See `unify-8859-on-decoding-mode' and `unify-8859-on-encoding-mode'
137for mechanisms to make this largely transparent."
138 :set (lambda (s v)
139 (if v
140 (define-translation-table 'utf-8-translation-table-for-decode
141 utf-8-translation-table-for-decode)
142 (define-translation-table 'utf-8-translation-table-for-decode))
143 (set-default s v))
144 :version "21.4"
145 :type 'boolean
146 :group 'mule)
147
148(defcustom utf-8-translate-cjk nil
149 "Whether the `mule-utf-8' coding system should encode many CJK characters.
150
151Enabling this loads tables which enable the coding system to encode
152characters in the charsets `korean-ksc5601', `chinese-gb2312' and
153`japanese-jisx0208', and to decode the corresponding unicodes into
154such characters. This works by loading the library `utf-8-subst'; see
155its commentary. The tables are fairly large (about 33000 entries), so this
156option is not the default."
157 :link '(emacs-commentary-link "utf-8-subst")
158 :set (lambda (s v)
159 (when v
160 (require 'utf-8-subst)
161 (let ((table (make-char-table 'translation-table)))
162 (coding-system-put 'mule-utf-8 'safe-charsets
163 (append (coding-system-get 'mule-utf-8
164 'safe-charsets)
165 '(korean-ksc5601 chinese-gb2312
166 japanese-jisx0208)))
167 (maphash (lambda (k v)
168 (aset table k v))
169 utf-8-subst-rev-table)
170 (register-char-codings 'mule-utf-8 table)))
171 (set-default s v))
172 :version "21.4"
173 :type 'boolean
174 :group 'mule)
175
64(define-ccl-program ccl-decode-mule-utf-8 176(define-ccl-program ccl-decode-mule-utf-8
65 ;; 177 ;;
66 ;; charset | bytes in utf-8 | bytes in emacs 178 ;; charset | bytes in utf-8 | bytes in emacs
@@ -90,66 +202,16 @@
90 ;; 1byte encoding, i.e., ascii 202 ;; 1byte encoding, i.e., ascii
91 (if (r0 < #x80) 203 (if (r0 < #x80)
92 (write r0) 204 (write r0)
93 205 (if (r0 < #xc0) ; continuation byte (invalid here)
94 ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx 206 (if (r0 < #xa0)
95 (if (r0 < #xe0) 207 (write-multibyte-character r5 r0)
96 ((read r1) 208 (write-multibyte-character r6 r0))
97 209 ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
98 (if ((r1 & #b11000000) != #b10000000) 210 (if (r0 < #xe0)
99 ;; Invalid 2-byte sequence 211 ((read r1)
100 ((if (r0 < #xa0) 212
101 (write-multibyte-character r5 r0) 213 (if ((r1 & #b11000000) != #b10000000)
102 (write-multibyte-character r6 r0)) 214 ;; Invalid 2-byte sequence
103 (if (r1 < #x80)
104 (write r1)
105 (if (r1 < #xa0)
106 (write-multibyte-character r5 r1)
107 (write-multibyte-character r6 r1))))
108
109 ((r0 &= #x1f)
110 (r0 <<= 6)
111 (r1 &= #x3f)
112 (r1 += r0)
113 ;; Now r1 holds scalar value
114
115 ;; eight-bit-control
116 (if (r1 < 160)
117 ((write-multibyte-character r5 r1))
118
119 ;; latin-iso8859-1
120 (if (r1 < 256)
121 ((r0 = ,(charset-id 'latin-iso8859-1))
122 (r1 -= 128)
123 (write-multibyte-character r0 r1))
124
125 ;; mule-unicode-0100-24ff (< 0800)
126 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
127 (r1 -= #x0100)
128 (r2 = (((r1 / 96) + 32) << 7))
129 (r1 %= 96)
130 (r1 += (r2 + 32))
131 (write-multibyte-character r0 r1)))))))
132
133 ;; 3byte encoding
134 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
135 (if (r0 < #xf0)
136 ((read r1 r2)
137
138 ;; This is set to 1 if the encoding is invalid.
139 (r4 = 0)
140
141 (r3 = (r1 & #b11000000))
142 (r3 |= ((r2 >> 2) & #b00110000))
143 (if (r3 != #b10100000)
144 (r4 = 1)
145 ((r3 = ((r0 & #x0f) << 12))
146 (r3 += ((r1 & #x3f) << 6))
147 (r3 += (r2 & #x3f))
148 (if (r3 < #x0800)
149 (r4 = 1))))
150
151 (if (r4 != 0)
152 ;; Invalid 3-byte sequence
153 ((if (r0 < #xa0) 215 ((if (r0 < #xa0)
154 (write-multibyte-character r5 r0) 216 (write-multibyte-character r5 r0)
155 (write-multibyte-character r6 r0)) 217 (write-multibyte-character r6 r0))
@@ -157,75 +219,195 @@
157 (write r1) 219 (write r1)
158 (if (r1 < #xa0) 220 (if (r1 < #xa0)
159 (write-multibyte-character r5 r1) 221 (write-multibyte-character r5 r1)
160 (write-multibyte-character r6 r1))) 222 (write-multibyte-character r6 r1))))
161 (if (r2 < #x80) 223
162 (write r2) 224 ((r3 = r0) ; save in case of overlong sequence
163 (if (r2 < #xa0) 225 (r2 = r1)
164 (write-multibyte-character r5 r2) 226 (r0 &= #x1f)
165 (write-multibyte-character r6 r2)))) 227 (r0 <<= 6)
228 (r2 = r1) ; save in case of overlong sequence
229 (r1 &= #x3f)
230 (r1 += r0)
231 ;; Now r1 holds scalar value
232
233 (if (r1 < 128) ; `overlong sequence'
234 ((if (r3 < #xa0)
235 (write-multibyte-character r5 r3)
236 (write-multibyte-character r6 r3))
237 (if (r2 < #x80)
238 (write r2)
239 (if (r2 < #xa0)
240 (write-multibyte-character r5 r2)
241 (write-multibyte-character r6 r2))))
242
243 ;; eight-bit-control
244 (if (r1 < 160)
245 ((write-multibyte-character r5 r1))
246
247 ;; latin-iso8859-1
248 (if (r1 < 256)
249 ((r0 = ,(charset-id 'latin-iso8859-1))
250 (r1 -= 128)
251 (write-multibyte-character r0 r1))
252
253 ;; mule-unicode-0100-24ff (< 0800)
254 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
255 (r1 -= #x0100)
256 (r2 = (((r1 / 96) + 32) << 7))
257 (r1 %= 96)
258 (r1 += (r2 + 32))
259 (translate-character
260 utf-8-translation-table-for-decode r0 r1)
261 (write-multibyte-character r0 r1))))))))
262
263 ;; 3byte encoding
264 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
265 (if (r0 < #xf0)
266 ((read r1 r2)
267
268 ;; This is set to 1 if the encoding is invalid.
269 (r4 = 0)
270
271 (r3 = (r1 & #b11000000))
272 (r3 |= ((r2 >> 2) & #b00110000))
273 (if (r3 != #b10100000)
274 (r4 = 1)
275 ((r3 = ((r0 & #x0f) << 12))
276 (r3 += ((r1 & #x3f) << 6))
277 (r3 += (r2 & #x3f))
278 (if (r3 < #x0800)
279 (r4 = 1))))
280
281 (if (r4 != 0)
282 ;; Invalid 3-byte sequence
283 ((if (r0 < #xa0)
284 (write-multibyte-character r5 r0)
285 (write-multibyte-character r6 r0))
286 (if (r1 < #x80)
287 (write r1)
288 (if (r1 < #xa0)
289 (write-multibyte-character r5 r1)
290 (write-multibyte-character r6 r1)))
291 (if (r2 < #x80)
292 (write r2)
293 (if (r2 < #xa0)
294 (write-multibyte-character r5 r2)
295 (write-multibyte-character r6 r2))))
166 296
167 ;; mule-unicode-0100-24ff (>= 0800) 297 ;; mule-unicode-0100-24ff (>= 0800)
168 ((if (r3 < #x2500) 298 ((if (r3 < #x2500)
169 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) 299 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
170 (r3 -= #x0100) 300 (r3 -= #x0100)
171 (r3 //= 96)
172 (r1 = (r7 + 32))
173 (r1 += ((r3 + 32) << 7))
174 (write-multibyte-character r0 r1))
175
176 ;; mule-unicode-2500-33ff
177 (if (r3 < #x3400)
178 ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
179 (r3 -= #x2500)
180 (r3 //= 96) 301 (r3 //= 96)
181 (r1 = (r7 + 32)) 302 (r1 = (r7 + 32))
182 (r1 += ((r3 + 32) << 7)) 303 (r1 += ((r3 + 32) << 7))
304 (translate-character
305 utf-8-translation-table-for-decode r0 r1)
183 (write-multibyte-character r0 r1)) 306 (write-multibyte-character r0 r1))
184 307
185 ;; U+3400 .. U+DFFF 308 ;; mule-unicode-2500-33ff
186 ;; keep those bytes as eight-bit-{control|graphic} 309 ;; Fixme: Perhaps allow translation via
187 (if (r3 < #xe000) 310 ;; utf-8-subst-table for #x2e80 up, so that we use
188 ( ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic 311 ;; consistent charsets for all of CJK. Would need
189 (r3 = r6) 312 ;; corresponding change to encoding tables.
190 (write-multibyte-character r3 r0) 313 (if (r3 < #x3400)
191 (if (r1 < #xa0) 314 ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
192 (r3 = r5)) 315 (r3 -= #x2500)
193 (write-multibyte-character r3 r1) 316 (r3 //= 96)
194 (if (r2 < #xa0) 317 (r1 = (r7 + 32))
195 (r3 = r5) 318 (r1 += ((r3 + 32) << 7))
196 (r3 = r6)) 319 (write-multibyte-character r0 r1))
197 (write-multibyte-character r3 r2)) 320
321 ;; U+3400 .. U+D7FF
322 ;; Try to convert to CJK chars, else keep
323 ;; them as eight-bit-{control|graphic}.
324 (if (r3 < #xd800)
325 ((r4 = r3) ; don't zap r3
326 (lookup-integer utf-8-subst-table r4 r5)
327 (if r7
328 ;; got a translation
329 ((write-multibyte-character r4 r5)
330 ;; Zapped through register starvation.
331 (r5 = ,(charset-id 'eight-bit-control)))
332 ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic
333 ((r3 = r6)
334 (write-multibyte-character r3 r0)
335 (if (r1 < #xa0)
336 (r3 = r5))
337 (write-multibyte-character r3 r1)
338 (if (r2 < #xa0)
339 (r3 = r5)
340 (r3 = r6))
341 (write-multibyte-character r3 r2))))
342
343 ;; Surrogates, U+D800 .. U+DFFF
344 ;; Fixme: process them properly.
345 (if (r3 < #xe000)
346 ((r3 = r6)
347 (write-multibyte-character r3 r0) ; eight-bit-graphic
348 (if (r1 < #xa0)
349 (r3 = r5))
350 (write-multibyte-character r3 r1)
351 (if (r2 < #xa0)
352 (r3 = r5)
353 (r3 = r6))
354 (write-multibyte-character r3 r2))
198 355
199 ;; mule-unicode-e000-ffff 356 ;; mule-unicode-e000-ffff
200 ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) 357 ;; Fixme: fffe and ffff are invalid.
201 (r3 -= #xe000) 358 ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
202 (r3 //= 96) 359 (r3 -= #xe000)
203 (r1 = (r7 + 32)) 360 (r3 //= 96)
204 (r1 += ((r3 + 32) << 7)) 361 (r1 = (r7 + 32))
205 (write-multibyte-character r0 r1)))))))) 362 (r1 += ((r3 + 32) << 7))
206 363 (write-multibyte-character r0 r1)))))))))
207 ;; 4byte encoding 364
208 ;; keep those bytes as eight-bit-{control|graphic} 365 (if (r0 < #xfe)
209 ((read r1 r2 r3) 366 ;; 4byte encoding
210 ;; r0 > #xf0, thus eight-bit-graphic 367 ;; keep those bytes as eight-bit-{control|graphic}
211 (write-multibyte-character r6 r0) 368 ;; Fixme: allow lookup in utf-8-subst-table.
212 (if (r1 < #xa0) 369 ((read r1 r2 r3)
213 (write-multibyte-character r5 r1) 370 ;; r0 > #xf0, thus eight-bit-graphic
214 (write-multibyte-character r6 r1)) 371 (write-multibyte-character r6 r0)
215 (if (r2 < #xa0) 372 (if (r1 < #xa0)
216 (write-multibyte-character r5 r2) 373 (if (r1 < #x80) ; invalid byte
217 (write-multibyte-character r6 r2)) 374 (write r1)
218 (if (r3 < #xa0) 375 (write-multibyte-character r5 r1))
219 (write-multibyte-character r5 r3) 376 (write-multibyte-character r6 r1))
220 (write-multibyte-character r6 r3)))))) 377 (if (r2 < #xa0)
221 378 (if (r2 < #x80) ; invalid byte
379 (write r2)
380 (write-multibyte-character r5 r2))
381 (write-multibyte-character r6 r2))
382 (if (r3 < #xa0)
383 (if (r3 < #x80) ; invalid byte
384 (write r3)
385 (write-multibyte-character r5 r3))
386 (write-multibyte-character r6 r3))
387 (if (r0 >= #xf8) ; 5- or 6-byte encoding
388 ((read r1)
389 (if (r1 < #xa0)
390 (if (r1 < #x80) ; invalid byte
391 (write r1)
392 (write-multibyte-character r5 r1))
393 (write-multibyte-character r6 r1))
394 (if (r0 >= #xfc) ; 6-byte
395 ((read r1)
396 (if (r1 < #xa0)
397 (if (r1 < #x80) ; invalid byte
398 (write r1)
399 (write-multibyte-character r5 r1))
400 (write-multibyte-character r6 r1)))))))
401 ;; else invalid byte >= #xfe
402 (write-multibyte-character r6 r0))))))
222 (repeat)))) 403 (repeat))))
223 404
224 "CCL program to decode UTF-8. 405 "CCL program to decode UTF-8.
225Basic decoding is done into the charsets ascii, latin-iso8859-1 and 406Basic decoding is done into the charsets ascii, latin-iso8859-1 and
226mule-unicode-*. Encodings of un-representable Unicode characters are 407mule-unicode-*, but see also `utf-8-translation-table-for-decode' and
227decoded asis into eight-bit-control and eight-bit-graphic 408`utf-8-subst-table'.
228characters.") 409Encodings of un-representable Unicode characters are decoded asis into
410eight-bit-control and eight-bit-graphic characters.")
229 411
230(define-ccl-program ccl-encode-mule-utf-8 412(define-ccl-program ccl-encode-mule-utf-8
231 `(1 413 `(1
@@ -288,7 +470,7 @@ characters.")
288 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) 470 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
289 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) 471 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
290 (r1 &= #x7f) 472 (r1 &= #x7f)
291 (r1 += (r0 + 57312)) ; 57312 == -160 + #xe000 473 (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000
292 (r0 = (((r1 & #xf000) >> 12) | #xe0)) 474 (r0 = (((r1 & #xf000) >> 12) | #xe0))
293 (r2 = ((r1 & #x3f) | #x80)) 475 (r2 = ((r1 & #x3f) | #x80))
294 (r1 &= #x0fc0) 476 (r1 &= #x0fc0)
@@ -329,11 +511,19 @@ characters.")
329 ((write #xc2) 511 ((write #xc2)
330 (write r1))))))) 512 (write r1)))))))
331 513
332 ;; Unsupported character. 514 ((lookup-character utf-8-subst-rev-table r0 r1)
333 ;; Output U+FFFD, which is `ef bf bd' in UTF-8. 515 (if r7 ; lookup succeeded
334 ((write #xef) 516 ((r1 = (((r0 & #xf000) >> 12) | #xe0))
335 (write #xbf) 517 (r2 = ((r0 & #x3f) | #x80))
336 (write #xbd))))))))) 518 (r0 &= #x0fc0)
519 (r0 >>= 6)
520 (r0 |= #x80)
521 (write r1 r0 r2))
522 ;; Unsupported character.
523 ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
524 ((write #xef)
525 (write #xbf)
526 (write #xbd)))))))))))
337 (repeat))) 527 (repeat)))
338 (if (r1 >= #xa0) 528 (if (r1 >= #xa0)
339 (write r1) 529 (write r1)
@@ -341,69 +531,89 @@ characters.")
341 ((write #xc2) 531 ((write #xc2)
342 (write r1))))) 532 (write r1)))))
343 533
344 "CCL program to encode into UTF-8. 534 "CCL program to encode into UTF-8.")
345Only characters from the charsets ascii, eight-bit-control,
346eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized.
347Others are encoded as U+FFFD.")
348 535
349;; Dummy definition so that the CCL can be checked correctly; the 536;; Dummy definition so that the CCL can be checked correctly; the
350;; actual data are loaded on demand. 537;; actual data are loaded on demand.
351(unless (boundp 'ucs-mule-8859-to-mule-unicode) ; don't zap it 538(unless (boundp 'ucs-mule-8859-to-mule-unicode) ; don't zap it
352 (define-translation-table 'ucs-mule-8859-to-mule-unicode)) 539 (define-translation-table 'ucs-mule-8859-to-mule-unicode))
353 540
541(define-ccl-program ccl-untranslated-to-ucs
542 `(0
543 (if (r0 < #xf0) ; 3-byte encoding, as above
544 ((r4 = 0)
545 (r3 = (r1 & #b11000000))
546 (r3 |= ((r2 >> 2) & #b00110000))
547 (if (r3 != #b10100000)
548 (r4 = 1)
549 ((r3 = ((r0 & #x0f) << 12))
550 (r3 += ((r1 & #x3f) << 6))
551 (r3 += (r2 & #x3f))
552 (if (r3 < #x0800)
553 (r4 = 1))))
554 (if (r4 != 0)
555 (r0 = 0)
556 (r0 = r3)))
557 (if (r0 < #xf8) ; 4-byte (Mule-UCS recipe)
558 ((r4 = (r1 >> 6))
559 (if (r4 != #b10)
560 (r0 = 0)
561 ((r4 = (r2 >> 6))
562 (if (r4 != #b10)
563 (r0 = 0)
564 ((r4 = (r3 >> 6))
565 (if (r4 != #b10)
566 (r0 = 0)
567 ((r1 = ((r1 & #x3F) << 12))
568 (r2 = ((r2 & #x3F) << 6))
569 (r3 &= #x3F)
570 (r0 = (((((r0 & #x07) << 18) | r1) | r2) | r3)))))))))
571 (r0 = 0))))
572 "Decode 3- or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0.
573r0 == 0 for invalid sequence.")
574
575(defvar utf-8-ccl-regs (make-vector 8 0))
576
354(defsubst utf-8-untranslated-to-ucs () 577(defsubst utf-8-untranslated-to-ucs ()
355 (let ((b1 (char-after)) 578 "Return the UCS code for an untranslated sequence of raw bytes t point.
356 (b2 (char-after (1+ (point)))) 579Only for 3- or 4-byte sequences."
357 (b3 (char-after (+ 2 (point)))) 580 (aset utf-8-ccl-regs 0 (or (char-after) 0))
358 (b4 (char-after (+ 4 (point))))) 581 (aset utf-8-ccl-regs 1 (or (char-after (1+ (point))) 0))
359 (if (and b1 b2 b3) 582 (aset utf-8-ccl-regs 2 (or (char-after (+ 2 (point))) 0))
360 (cond ((< b1 ?\xf0) 583 (aset utf-8-ccl-regs 3 (or (char-after (+ 3 (point))) 0))
361 (setq b2 (lsh (logand b2 ?\x3f) 6)) 584 (ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs)
362 (setq b3 (logand b3 ?\x3f)) 585 (aref utf-8-ccl-regs 0))
363 (logior b3 (logior b2 (lsh (logand b1 ?\x0f) 12))))
364 (b4
365 (setq b2 (lsh (logand b2 ?\x3f) 12))
366 (setq b3 (lsh (logand b3 ?\x3f) 6))
367 (setq b4 (logand b4 ?\x3f))
368 (logior b4 (logior b3 (logior b2 (lsh (logand b1 ?\x07)
369 18)))))))))
370 586
371(defun utf-8-help-echo (window object position) 587(defun utf-8-help-echo (window object position)
372 (format "Untranslated Unicode U+%04X" 588 (format "Untranslated Unicode U+%04X"
373 (get-char-property position 'untranslated-utf-8 object))) 589 (get-char-property position 'untranslated-utf-8 object)))
374 590
375(defvar utf-8-subst-table nil
376 "If non-nil, a hash table mapping `untranslatable utf-8' to Emacs characters.")
377
378;; We compose the untranslatable sequences into a single character. 591;; We compose the untranslatable sequences into a single character.
379;; This is infelicitous for editing, because there's currently no 592;; This is infelicitous for editing, because there's currently no
380;; mechanism for treating compositions as atomic, but is OK for 593;; mechanism for treating compositions as atomic, but is OK for
381;; display. We try to compose an appropriate character from a hash 594;; display. They are composed to U+FFFD with help-echo which
382;; table of CJK characters to display correctly. Otherwise we use 595;; indicates the unicodes they represent. This function GCs too much.
383;; U+FFFD. What we really should have is hash table lookup from CCL
384;; so that we could do this properly. This function GCs too much.
385(defsubst utf-8-compose () 596(defsubst utf-8-compose ()
386 "Put a suitable composition on an untranslatable sequence. 597 "Put a suitable composition on an untranslatable sequence.
387Return the sequence's length." 598Return the sequence's length."
388 (let* ((u (utf-8-untranslated-to-ucs)) 599 (let* ((u (utf-8-untranslated-to-ucs))
389 (l (and u (if (>= u ?\x10000) 600 (l (unless (zerop u)
601 (if (>= u #x10000)
390 4 602 4
391 3))) 603 3))))
392 (subst (and utf-8-subst-table (gethash u utf-8-subst-table)))) 604 (when l
393 (when u
394 (put-text-property (point) (min (point-max) (+ l (point))) 605 (put-text-property (point) (min (point-max) (+ l (point)))
395 'untranslated-utf-8 u) 606 'untranslated-utf-8 u)
396 (unless subst 607 (put-text-property (point) (min (point-max) (+ l (point)))
397 (put-text-property (point) (min (point-max) (+ l (point))) 608 'help-echo 'utf-8-help-echo)
398 'help-echo 'utf-8-help-echo) 609 (compose-region (point) (+ l (point)) ?$,3u=(B)
399 (setq subst ?$,3u=(B))
400 (compose-region (point) (+ l (point)) subst)
401 l))) 610 l)))
402 611
403(defcustom utf-8-compose-scripts nil 612(defcustom utf-8-compose-scripts nil
404 "*Non-nil means compose various scipts on decoding utf-8 text." 613 "*Non-nil means compose various scripts on decoding utf-8 text."
405 :group 'mule 614 :group 'mule
406 :type 'boolean) ; omitted in Emacs 21.1 615 :version "21.4"
616 :type 'boolean)
407 617
408(defun utf-8-post-read-conversion (length) 618(defun utf-8-post-read-conversion (length)
409 "Compose untranslated utf-8 sequences into single characters. 619 "Compose untranslated utf-8 sequences into single characters.
@@ -412,38 +622,39 @@ Also compose particular scripts if `utf-8-compose-scripts' is non-nil."
412 ;; Can't do eval-when-compile to insert a multibyte constant 622 ;; Can't do eval-when-compile to insert a multibyte constant
413 ;; version of the string in the loop, since it's always loaded as 623 ;; version of the string in the loop, since it's always loaded as
414 ;; unibyte from a byte-compiled file. 624 ;; unibyte from a byte-compiled file.
415 (let ((range (string-as-multibyte "^\341-\377"))) 625 (let ((range (string-as-multibyte "^\xe1-\xf7")))
416 (while (and (skip-chars-forward 626 (while (and (skip-chars-forward range)
417 range)
418 (not (eobp))) 627 (not (eobp)))
419 (forward-char (utf-8-compose))))) 628 (forward-char (utf-8-compose)))))
420 ;; Fixme: Takahashi-san implies it may not work this easily -- needs 629 ;; Fixme: Takahashi-san implies it may not work this easily. I
421 ;; checking with him. 630 ;; asked why but didn't get a reply. -- fx
422 (when (and utf-8-compose-scripts (> length 1)) 631 (when (and utf-8-compose-scripts (> length 1))
423 ;; These currently have definitions which cover the relevant 632 ;; These currently have definitions which cover the relevant
424 ;; Unicodes. We could avoid loading thai-util &c by checking 633 ;; unicodes. We could avoid loading thai-util &c by checking
425 ;; whether the region contains any characters with the appropriate 634 ;; whether the region contains any characters with the appropriate
426 ;; categories. There aren't yet Unicode-based rules for Tibetan. 635 ;; categories. There aren't yet Unicode-based rules for Tibetan.
427 (save-excursion (setq length (diacritic-post-read-conversion length))) 636 (save-excursion (setq length (diacritic-post-read-conversion length)))
428 (save-excursion (setq length (thai-post-read-conversion length))) 637 (save-excursion (setq length (thai-post-read-conversion length)))
429 (save-excursion (setq length (lao-post-read-conversion length))) 638 (save-excursion (setq length (lao-post-read-conversion length)))
430 (save-excursion (setq length (devanagari-post-read-conversion length)))) 639 (save-excursion
640 (setq length (in-is13194-devanagari-post-read-conversion length))))
431 length) 641 length)
432 642
433(defun utf-8-pre-write-conversion (beg end) 643;; ucs-tables is preloaded
434 "Semi-dummy pre-write function effectively to autoload ucs-tables." 644;; (defun utf-8-pre-write-conversion (beg end)
435 ;; Ensure translation table is loaded. 645;; "Semi-dummy pre-write function effectively to autoload ucs-tables."
436 (require 'ucs-tables) 646;; ;; Ensure translation table is loaded.
437 ;; Don't do this again. 647;; (require 'ucs-tables)
438 (coding-system-put 'mule-utf-8 'pre-write-conversion nil) 648;; ;; Don't do this again.
439 nil) 649;; (coding-system-put 'mule-utf-8 'pre-write-conversion nil)
650;; nil)
440 651
441(make-coding-system 652(make-coding-system
442 'mule-utf-8 4 ?u 653 'mule-utf-8 4 ?u
443 "UTF-8 encoding for Emacs-supported Unicode characters. 654 "UTF-8 encoding for Emacs-supported Unicode characters.
444The supported Emacs character sets are the following, plus others 655The supported Emacs character sets are the following, plus any other
445which may be included in the translation table 656characters included in the tables `ucs-mule-to-mule-unicode' and
446`ucs-mule-to-mule-unicode': 657`utf-8-subst-rev-table':
447 ascii 658 ascii
448 eight-bit-control 659 eight-bit-control
449 eight-bit-graphic 660 eight-bit-graphic
@@ -462,10 +673,12 @@ which may be included in the translation table
462 mule-unicode-e000-ffff 673 mule-unicode-e000-ffff
463 674
464Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF 675Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF
465are decoded into sequences of eight-bit-control and eight-bit-graphic 676may be decoded into korean-ksc5601, chinese-gb2312, japanese-jisx0208
466characters to preserve their byte sequences and composed to display as 677\(see user option `utf-8-translate-cjk'); otherwise, sequences of
467a single character. Emacs characters that can't be encoded to these 678eight-bit-control and eight-bit-graphic characters are used to
468ranges are encoded as U+FFFD." 679preserve their byte sequences, and these are composed to display as a
680single character. Emacs characters that otherwise can't be encoded
681are encoded as U+FFFD."
469 682
470 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) 683 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
471 '((safe-charsets 684 '((safe-charsets
@@ -497,7 +710,7 @@ ranges are encoded as U+FFFD."
497 (mime-charset . utf-8) 710 (mime-charset . utf-8)
498 (coding-category . coding-category-utf-8) 711 (coding-category . coding-category-utf-8)
499 (valid-codes (0 . 255)) 712 (valid-codes (0 . 255))
500 (pre-write-conversion . utf-8-pre-write-conversion) 713;; (pre-write-conversion . utf-8-pre-write-conversion)
501 (post-read-conversion . utf-8-post-read-conversion))) 714 (post-read-conversion . utf-8-post-read-conversion)))
502 715
503(define-coding-system-alias 'utf-8 'mule-utf-8) 716(define-coding-system-alias 'utf-8 'mule-utf-8)