aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKenichi Handa2004-06-12 02:10:37 +0000
committerKenichi Handa2004-06-12 02:10:37 +0000
commitc71c26e9295573509ceaa468ea973299a8b311e1 (patch)
treebea8ba45657f2c84350359f97f0eeeec84d7d38d
parent7805cdbd4f39ef8904b68f687be8715306910769 (diff)
downloademacs-c71c26e9295573509ceaa468ea973299a8b311e1.tar.gz
emacs-c71c26e9295573509ceaa468ea973299a8b311e1.zip
(utf-translate-cjk-charsets): New
variable. (utf-translate-cjk-unicode-range): New variable. (utf-translate-cjk-load-tables): New function. (utf-lookup-subst-table-for-decode): New function. (utf-lookup-subst-table-for-encode): New function. (utf-translate-cjk-mode): Init-value changed to t. Don't load tables here. Update safe-charsets of utf-* coding systems. (ccl-mule-utf-untrans): New CCL. (ccl-decode-mule-utf-8): Call ccl-mule-utf-untrans. Use `repeat' at end of each branch. (ccl-mule-utf-8-encode-untrans): New CCL. (ccl-encode-mule-utf-8): Call ccl-mule-utf-8-encode-untrans. (ccl-untranslated-to-ucs): Handle 2-byte encoding. Set r1 to the length of encoding. Don't return r0. (utf-8-compose): New arg hash-table. Handle 2-byte encoding. (utf-8-post-read-conversion): Narrow to region properly. If utf-translate-cjk-mode is on, load tables if necessary. Call utf-8-compose with hash-table arg if necessary. Call XXX-compose-region instead of XXX-post-read-convesion. (utf-8-pre-write-conversion): New function. (mule-utf-8): Include CJK charsets in safe-charsets if utf-translate-cjk-mode is on. Add pre-write-conversion.
-rw-r--r--lisp/international/utf-8.el1086
1 files changed, 613 insertions, 473 deletions
diff --git a/lisp/international/utf-8.el b/lisp/international/utf-8.el
index d4dd7b6c882..e324d0c0270 100644
--- a/lisp/international/utf-8.el
+++ b/lisp/international/utf-8.el
@@ -190,9 +190,102 @@ Setting this variable outside customize has no effect."
190 :type 'boolean 190 :type 'boolean
191 :group 'mule) 191 :group 'mule)
192 192
193
194(defconst utf-translate-cjk-charsets '(chinese-gb2312
195 chinese-big5-1 chinese-big5-2
196 japanese-jisx0208 japanese-jisx0212
197 korean-ksc5601)
198 "List of charsets supported by `utf-translate-cjk-mode'.")
199
200(defconst utf-translate-cjk-unicode-range
201 '((#x2e80 . #xd7a3)
202 (#xff00 . #xffef))
203 "List of Unicode code ranges supported by `utf-translate-cjk-mode'.")
204
205;; Return non-nil if CODE-POINT is in `utf-translate-cjk-unicode-range'.
206(defsubst utf-translate-cjk-substitutable-p (code-point)
207 (let ((tail utf-translate-cjk-unicode-range)
208 elt)
209 (while tail
210 (setq elt (car tail) tail (cdr tail))
211 (if (and (>= code-point (car elt)) (<= code-point (cdr elt)))
212 (setq tail nil)
213 (setq elt nil)))
214 elt))
215
216(defvar utf-translate-cjk-lang-env nil
217 "Language environment in which tables for `utf-translate-cjk-mode' is loaded.
218The value nil means that the tables are not yet loaded.")
219
220(defun utf-translate-cjk-load-tables ()
221 "Load tables for `utf-translate-cjk-mode'."
222 ;; Fixme: Allow the use of the CJK charsets to be
223 ;; customized by reordering and possible omission.
224 (let ((redefined (< (hash-table-size ucs-mule-cjk-to-unicode) 43000)))
225 (if redefined
226 ;; Redefine them with realistic initial sizes and a
227 ;; smallish rehash size to avoid wasting significant
228 ;; space after they're built.
229 (setq ucs-mule-cjk-to-unicode
230 (make-hash-table :test 'eq :size 43000 :rehash-size 1000)
231 ucs-unicode-to-mule-cjk
232 (make-hash-table :test 'eq :size 21500 :rehash-size 1000)))
233
234 ;; Load the files explicitly, to avoid having to keep
235 ;; around the large tables they contain (as well as the
236 ;; ones which get built).
237 (cond ((string= "Korean" current-language-environment)
238 (load "subst-jis")
239 (load "subst-big5")
240 (load "subst-gb2312")
241 (load "subst-ksc"))
242 ((string= "Chinese-BIG5" current-language-environment)
243 (load "subst-jis")
244 (load "subst-ksc")
245 (load "subst-gb2312")
246 (load "subst-big5"))
247 ((string= "Chinese-GB" current-language-environment)
248 (load "subst-jis")
249 (load "subst-ksc")
250 (load "subst-big5")
251 (load "subst-gb2312"))
252 (t
253 (load "subst-ksc")
254 (load "subst-gb2312")
255 (load "subst-big5")
256 (load "subst-jis"))) ; jis covers as much as big5, gb2312
257
258 (when redefined
259 (define-translation-hash-table 'utf-subst-table-for-decode
260 ucs-unicode-to-mule-cjk)
261 (define-translation-hash-table 'utf-subst-table-for-encode
262 ucs-mule-cjk-to-unicode)
263 (set-char-table-extra-slot (get 'utf-translation-table-for-encode
264 'translation-table)
265 1 ucs-mule-cjk-to-unicode))
266
267 (setq utf-translate-cjk-lang-env current-language-environment)))
268
269(defun utf-lookup-subst-table-for-decode (code-point)
270 (if (and utf-translate-cjk-mode
271 (not utf-translate-cjk-lang-env)
272 (utf-translate-cjk-substitutable-p code-point))
273 (utf-translate-cjk-load-tables))
274 (gethash code-point
275 (get 'utf-subst-table-for-decode 'translation-hash-table)))
276
277
278(defun utf-lookup-subst-table-for-encode (char)
279 (if (and utf-translate-cjk-mode
280 (not utf-translate-cjk-lang-env)
281 (memq (char-charset char) utf-translate-cjk-charsets))
282 (utf-translate-cjk-load-tables))
283 (gethash char
284 (get 'utf-subst-table-for-encode 'translation-hash-table)))
285
193(define-minor-mode utf-translate-cjk-mode 286(define-minor-mode utf-translate-cjk-mode
194 "Whether the UTF based coding systems should decode/encode CJK characters. 287 "Whether the UTF based coding systems should decode/encode CJK characters.
195Enabling this loads tables which allow the coding systems mule-utf-8, 288Enabling this allows the coding systems mule-utf-8,
196mule-utf-16le and mule-utf-16be to encode characters in the charsets 289mule-utf-16le and mule-utf-16be to encode characters in the charsets
197`korean-ksc5601', `chinese-gb2312', `chinese-big5-1', 290`korean-ksc5601', `chinese-gb2312', `chinese-big5-1',
198`chinese-big5-2', `japanese-jisx0208' and `japanese-jisx0212', and to 291`chinese-big5-2', `japanese-jisx0208' and `japanese-jisx0212', and to
@@ -203,49 +296,16 @@ according to the language environment in effect when this option is
203turned on: ksc5601 for Korean, gb2312 for Chinese-GB, big5 for 296turned on: ksc5601 for Korean, gb2312 for Chinese-GB, big5 for
204Chinese-Big5 and jisx for other environments. 297Chinese-Big5 and jisx for other environments.
205 298
206The tables are large (over 40000 entries), so this option is not the 299This option is on by default. If you are not interested in CJK
207default. Also, installing them may be rather slow." 300characters and want to avoid some overhead on encoding/decoding
208 :init-value nil 301by the above coding systems, you can customize this option to nil."
302 :init-value t
209 :version "21.4" 303 :version "21.4"
210 :type 'boolean 304 :type 'boolean
211 :set-after '(current-language-environment)
212 :group 'mule 305 :group 'mule
213 :global t 306 :global t
214 (if utf-translate-cjk-mode 307 (if utf-translate-cjk-mode
215 ;; Fixme: Allow the use of the CJK charsets to be
216 ;; customized by reordering and possible omission.
217 (progn 308 (progn
218 ;; Redefine them with realistic initial sizes and a
219 ;; smallish rehash size to avoid wasting significant
220 ;; space after they're built.
221 (setq ucs-mule-cjk-to-unicode
222 (make-hash-table :test 'eq :size 43000 :rehash-size 1000)
223 ucs-unicode-to-mule-cjk
224 (make-hash-table :test 'eq :size 21500 :rehash-size 1000))
225 ;; Load the files explicitly, to avoid having to keep
226 ;; around the large tables they contain (as well as the
227 ;; ones which get built).
228 (cond
229 ((string= "Korean" current-language-environment)
230 (load "subst-jis")
231 (load "subst-big5")
232 (load "subst-gb2312")
233 (load "subst-ksc"))
234 ((string= "Chinese-BIG5" current-language-environment)
235 (load "subst-jis")
236 (load "subst-ksc")
237 (load "subst-gb2312")
238 (load "subst-big5"))
239 ((string= "Chinese-GB" current-language-environment)
240 (load "subst-jis")
241 (load "subst-ksc")
242 (load "subst-big5")
243 (load "subst-gb2312"))
244 (t
245 (load "subst-ksc")
246 (load "subst-gb2312")
247 (load "subst-big5")
248 (load "subst-jis"))) ; jis covers as much as big5, gb2312
249 (define-translation-hash-table 'utf-subst-table-for-decode 309 (define-translation-hash-table 'utf-subst-table-for-decode
250 ucs-unicode-to-mule-cjk) 310 ucs-unicode-to-mule-cjk)
251 (define-translation-hash-table 'utf-subst-table-for-encode 311 (define-translation-hash-table 'utf-subst-table-for-encode
@@ -259,7 +319,58 @@ default. Also, installing them may be rather slow."
259 (make-hash-table :test 'eq)) 319 (make-hash-table :test 'eq))
260 (set-char-table-extra-slot (get 'utf-translation-table-for-encode 320 (set-char-table-extra-slot (get 'utf-translation-table-for-encode
261 'translation-table) 321 'translation-table)
262 1 nil))) 322 1 nil))
323
324 ;; Update safe-chars of mule-utf-* coding systems.
325 (dolist (elt (coding-system-list t))
326 (if (string-match "^mule-utf" (symbol-name elt))
327 (let ((safe-charsets (coding-system-get elt 'safe-charsets))
328 (safe-chars (coding-system-get elt 'safe-chars))
329 (need-update nil))
330 (dolist (charset utf-translate-cjk-charsets)
331 (unless (eq utf-translate-cjk-mode (memq charset safe-charsets))
332 (setq safe-charsets
333 (if utf-translate-cjk-mode
334 (cons charset safe-charsets)
335 (delq charset safe-charsets))
336 need-update t)
337 (aset safe-chars (make-char charset) utf-translate-cjk-mode)))
338 (when need-update
339 (coding-system-put elt 'safe-charsets safe-charsets)
340 (define-coding-system-internal elt))))))
341
342(define-ccl-program ccl-mule-utf-untrans
343 ;; R0 is an untranslatable Unicode code-point (U+3500..U+DFFF or
344 ;; U+10000..U+10FFFF) or an invaid byte (#x00..#xFF). Write
345 ;; eight-bit-control/graphic sequence (2 to 4 chars) representing
346 ;; UTF-8 sequence of r0. Registers r4, r5, r6 are modified.
347 ;;
348 ;; This is a subrountine because we assume that this is called very
349 ;; rarely (so we don't have to worry about the overhead of the
350 ;; call).
351 `(0
352 ((r5 = ,(charset-id 'eight-bit-control))
353 (r6 = ,(charset-id 'eight-bit-graphic))
354 (if (r0 < #x100)
355 ((r4 = ((r0 >> 6) | #xC0))
356 (write-multibyte-character r6 r4))
357 ((if (r0 < #x10000)
358 ((r4 = ((r0 >> 12) | #xE0))
359 (write-multibyte-character r6 r4))
360 ((r4 = ((r0 >> 18) | #xF0))
361 (write-multibyte-character r6 r4)
362 (r4 = (((r0 >> 12) & #x3F) | #x80))
363 (if (r4 < #xA0)
364 (write-multibyte-character r5 r4)
365 (write-multibyte-character r6 r4))))
366 (r4 = (((r0 >> 6) & #x3F) | #x80))
367 (if (r4 < #xA0)
368 (write-multibyte-character r5 r4)
369 (write-multibyte-character r6 r4))))
370 (r4 = ((r0 & #x3F) | #x80))
371 (if (r4 < #xA0)
372 (write-multibyte-character r5 r4)
373 (write-multibyte-character r6 r4)))))
263 374
264(define-ccl-program ccl-decode-mule-utf-8 375(define-ccl-program ccl-decode-mule-utf-8
265 ;; 376 ;;
@@ -278,260 +389,206 @@ default. Also, installing them may be rather slow."
278 ;; (>= 8000) | | 389 ;; (>= 8000) | |
279 ;; mule-unicode-2500-33ff | 3 | 4 390 ;; mule-unicode-2500-33ff | 3 | 4
280 ;; mule-unicode-e000-ffff | 3 | 4 391 ;; mule-unicode-e000-ffff | 3 | 4
392 ;; -----------------------+----------------+---------------
393 ;; invalid byte | 1 | 2
281 ;; 394 ;;
282 ;; Thus magnification factor is two. 395 ;; Thus magnification factor is two.
283 ;; 396 ;;
284 `(2 397 `(2
285 ((r5 = ,(charset-id 'eight-bit-control)) 398 ((r0 = -1)
286 (r6 = ,(charset-id 'eight-bit-graphic))
287 (loop 399 (loop
288 (r0 = -1) 400 (if (r0 < 0)
289 (read r0) 401 (read r0))
290
291 ;; 1byte encoding, i.e., ascii
292 (if (r0 < #x80) 402 (if (r0 < #x80)
293 ((write r0)) 403 ;; 1-byte encoding, i.e., ascii
294 (if (r0 < #xc0) ; continuation byte (invalid here) 404 ((write r0)
295 ((if (r0 < #xa0) 405 (r0 = -1)
296 (write-multibyte-character r5 r0) 406 (repeat)))
297 (write-multibyte-character r6 r0))) 407 (if (r0 < #xc0) ; continuation byte (invalid here)
298 ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx 408 ((call ccl-mule-utf-untrans)
299 (if (r0 < #xe0) 409 (r0 = -1)
300 ((r1 = -1) 410 (repeat)))
301 (read r1) 411
302 412 ;; Read the 2nd byte.
303 (if ((r1 & #b11000000) != #b10000000) 413 (r1 = -1)
304 ;; Invalid 2-byte sequence 414 (read r1)
305 ((if (r0 < #xa0) 415 (if ((r1 & #b11000000) != #b10000000) ; Invalid 2nd byte
306 (write-multibyte-character r5 r0) 416 ((call ccl-mule-utf-untrans)
307 (write-multibyte-character r6 r0)) 417 ;; Handle it in the next loop.
308 (if (r1 < #x80) 418 (r0 = r1)
309 (write r1) 419 (repeat)))
310 (if (r1 < #xa0) 420
311 (write-multibyte-character r5 r1) 421 (if (r0 < #xe0)
312 (write-multibyte-character r6 r1)))) 422 ;; 2-byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
313 423 ((r2 = ((r0 & #x1F) << 6))
314 ((r3 = r0) ; save in case of overlong sequence 424 (r2 |= (r1 & #x3F))
315 (r2 = r1) 425 ;; Now r2 holds scalar value
316 (r0 &= #x1f) 426
317 (r0 <<= 6) 427 (if (r2 < 128) ; `overlong sequence'
318 (r1 &= #x3f) 428 ((call ccl-mule-utf-untrans)
319 (r1 += r0) 429 (r0 = r1)
320 ;; Now r1 holds scalar value 430 (call ccl-mule-utf-untrans)
321 431 (r0 = -1)
322 (if (r1 < 128) ; `overlong sequence' 432 (repeat)))
323 ((if (r3 < #xa0) 433
324 (write-multibyte-character r5 r3) 434 (r1 = r2)
325 (write-multibyte-character r6 r3)) 435 (if (r1 < 160)
326 (if (r2 < #x80) 436 ;; eight-bit-control
327 (write r2) 437 (r0 = ,(charset-id 'eight-bit-control))
328 (if (r2 < #xa0) 438 (if (r1 < 256)
329 (write-multibyte-character r5 r2) 439 ;; latin-iso8859-1
330 (write-multibyte-character r6 r2)))) 440 ((r0 = ,(charset-id 'latin-iso8859-1))
331 441 (r1 -= 128))
332 ;; eight-bit-control 442 ;; mule-unicode-0100-24ff (< 0800)
333 (if (r1 < 160) 443 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
334 ((write-multibyte-character r5 r1)) 444 (r1 -= #x0100)
335 445 (r2 = (((r1 / 96) + 32) << 7))
336 ;; latin-iso8859-1 446 (r1 %= 96)
337 (if (r1 < 256) 447 (r1 += (r2 + 32))
338 ((r0 = ,(charset-id 'latin-iso8859-1)) 448 (translate-character
339 (r1 -= 128) 449 utf-translation-table-for-decode r0 r1))))
340 (write-multibyte-character r0 r1)) 450 (write-multibyte-character r0 r1)
341 451 (r0 = -1)
342 ;; mule-unicode-0100-24ff (< 0800) 452 (repeat)))
343 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) 453
344 (r1 -= #x0100) 454 ;; Read the 3rd bytes.
345 (r2 = (((r1 / 96) + 32) << 7)) 455 (r2 = -1)
346 (r1 %= 96) 456 (read r2)
347 (r1 += (r2 + 32)) 457 (if ((r2 & #b11000000) != #b10000000) ; Invalid 3rd byte
348 (translate-character 458 ((call ccl-mule-utf-untrans)
349 utf-translation-table-for-decode r0 r1) 459 (r0 = r1)
350 (write-multibyte-character r0 r1)))))))) 460 (call ccl-mule-utf-untrans)
351 461 ;; Handle it in the next loop.
352 ;; 3byte encoding 462 (r0 = r2)
353 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx 463 (repeat)))
354 (if (r0 < #xf0) 464
355 ((r1 = -1) 465 (if (r0 < #xF0)
356 (r2 = -1) 466 ;; 3byte encoding
357 (read r1 r2) 467 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
358 468 ((r3 = ((r0 & #xF) << 12))
359 ;; This is set to 1 if the encoding is invalid. 469 (r3 |= ((r1 & #x3F) << 6))
360 (r4 = 0) 470 (r3 |= (r2 & #x3F))
361 471
362 (r3 = (r1 & #b11000000)) 472 (if (r3 < #x800) ; `overlong sequence'
363 (r3 |= ((r2 >> 2) & #b00110000)) 473 ((call ccl-mule-utf-untrans)
364 (if (r3 != #b10100000) 474 (r0 = r1)
365 (r4 = 1) 475 (call ccl-mule-utf-untrans)
366 ((r3 = ((r0 & #x0f) << 12)) 476 (r0 = r2)
367 (r3 += ((r1 & #x3f) << 6)) 477 (call ccl-mule-utf-untrans)
368 (r3 += (r2 & #x3f)) 478 (r0 = -1)
369 (if (r3 < #x0800) 479 (repeat)))
370 (r4 = 1)))) 480
371 481 (if (r3 < #x2500)
372 (if (r4 != 0) 482 ;; mule-unicode-0100-24ff (>= 0800)
373 ;; Invalid 3-byte sequence 483 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
374 ((if (r0 < #xa0) 484 (r3 -= #x0100)
375 (write-multibyte-character r5 r0) 485 (r3 //= 96)
376 (write-multibyte-character r6 r0)) 486 (r1 = (r7 + 32))
377 (if (r1 < #x80) 487 (r1 += ((r3 + 32) << 7))
378 (write r1) 488 (translate-character
379 (if (r1 < #xa0) 489 utf-translation-table-for-decode r0 r1)
380 (write-multibyte-character r5 r1) 490 (write-multibyte-character r0 r1)
381 (write-multibyte-character r6 r1))) 491 (r0 = -1)
382 (if (r2 < #x80) 492 (repeat)))
383 (write r2) 493
384 (if (r2 < #xa0) 494 (if (r3 < #x3400)
385 (write-multibyte-character r5 r2) 495 ;; mule-unicode-2500-33ff
386 (write-multibyte-character r6 r2)))) 496 ((r0 = r3) ; don't zap r3
387 497 (lookup-integer utf-subst-table-for-decode r0 r1)
388 ;; mule-unicode-0100-24ff (>= 0800) 498 (if (r7 == 0)
389 ((if (r3 < #x2500) 499 ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
390 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) 500 (r3 -= #x2500)
391 (r3 -= #x0100) 501 (r3 //= 96)
392 (r3 //= 96) 502 (r1 = (r7 + 32))
393 (r1 = (r7 + 32)) 503 (r1 += ((r3 + 32) << 7))))
394 (r1 += ((r3 + 32) << 7)) 504 (write-multibyte-character r0 r1)
395 (translate-character 505 (r0 = -1)
396 utf-translation-table-for-decode r0 r1) 506 (repeat)))
397 (write-multibyte-character r0 r1)) 507
398 508 (if (r3 < #xE000)
399 ;; mule-unicode-2500-33ff 509 ;; Try to convert to CJK chars, else
400 (if (r3 < #x3400) 510 ;; keep them as eight-bit-{control|graphic}.
401 ((r4 = r3) ; don't zap r3 511 ((r0 = r3)
402 (lookup-integer utf-subst-table-for-decode r4 r5) 512 (lookup-integer utf-subst-table-for-decode r3 r1)
403 (if r7 513 (if r7
404 ;; got a translation 514 ;; got a translation
405 ((write-multibyte-character r4 r5) 515 (write-multibyte-character r3 r1)
406 ;; Zapped through register starvation. 516 (call ccl-mule-utf-untrans))
407 (r5 = ,(charset-id 'eight-bit-control))) 517 (r0 = -1)
408 ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) 518 (repeat)))
409 (r3 -= #x2500) 519
410 (r3 //= 96) 520 ;; mule-unicode-e000-ffff
411 (r1 = (r7 + 32)) 521 ;; Fixme: fffe and ffff are invalid.
412 (r1 += ((r3 + 32) << 7)) 522 (r0 = r3) ; don't zap r3
413 (write-multibyte-character r0 r1)))) 523 (lookup-integer utf-subst-table-for-decode r0 r1)
414 524 (if (r7 == 0)
415 ;; U+3400 .. U+D7FF 525 ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
416 ;; Try to convert to CJK chars, else keep 526 (r3 -= #xe000)
417 ;; them as eight-bit-{control|graphic}. 527 (r3 //= 96)
418 (if (r3 < #xd800) 528 (r1 = (r7 + 32))
419 ((r4 = r3) ; don't zap r3 529 (r1 += ((r3 + 32) << 7))))
420 (lookup-integer utf-subst-table-for-decode r4 r5) 530 (write-multibyte-character r0 r1)
421 (if r7 531 (r0 = -1)
422 ;; got a translation 532 (repeat)))
423 ((write-multibyte-character r4 r5) 533
424 ;; Zapped through register starvation. 534 ;; Read the 4th bytes.
425 (r5 = ,(charset-id 'eight-bit-control))) 535 (r3 = -1)
426 ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic 536 (read r3)
427 ((r3 = r6) 537 (if ((r3 & #b11000000) != #b10000000) ; Invalid 4th byte
428 (write-multibyte-character r3 r0) 538 ((call ccl-mule-utf-untrans)
429 (if (r1 < #xa0) 539 (r0 = r1)
430 (r3 = r5)) 540 (call ccl-mule-utf-untrans)
431 (write-multibyte-character r3 r1) 541 ;; Handle it in the next loop.
432 (if (r2 < #xa0) 542 (r0 = r3)
433 (r3 = r5) 543 (repeat)))
434 (r3 = r6)) 544
435 (write-multibyte-character r3 r2)))) 545 (if (r3 < #xF8)
436 546 ;; 4-byte encoding:
437 ;; Surrogates, U+D800 .. U+DFFF 547 ;; wwwzzzzzzyyyyyyxxxxxx = 11110www 10zzzzzz 10yyyyyy 10xxxxxx
438 (if (r3 < #xe000) 548 ;; keep those bytes as eight-bit-{control|graphic}
439 ((r3 = r6) 549 ;; Fixme: allow lookup in utf-subst-table-for-decode.
440 (write-multibyte-character r3 r0) ; eight-bit-graphic 550 ((r4 = ((r0 & #x7) << 18))
441 (if (r1 < #xa0) 551 (r4 |= ((r1 & #x3F) << 12))
442 (r3 = r5)) 552 (r4 |= ((r2 & #x3F) << 6))
443 (write-multibyte-character r3 r1) 553 (r4 |= (r3 & #x3F))
444 (if (r2 < #xa0) 554
445 (r3 = r5) 555 (if (r4 < #x10000) ; `overlong sequence'
446 (r3 = r6)) 556 ((call ccl-mule-utf-untrans)
447 (write-multibyte-character r3 r2)) 557 (r0 = r1)
448 558 (call ccl-mule-utf-untrans)
449 ;; mule-unicode-e000-ffff 559 (r0 = r2)
450 ;; Fixme: fffe and ffff are invalid. 560 (call ccl-mule-utf-untrans)
451 ((r4 = r3) ; don't zap r3 561 (r0 = r3)
452 (lookup-integer utf-subst-table-for-decode r4 r5) 562 (call ccl-mule-utf-untrans))
453 (if r7 563 ((r0 = r4)
454 ;; got a translation 564 (call ccl-mule-utf-untrans)))
455 ((write-multibyte-character r4 r5) 565 (r0 = -1)
456 ;; Zapped through register starvation. 566 (repeat)))
457 (r5 = ,(charset-id 'eight-bit-control))) 567
458 ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) 568 ;; Unsupported sequence.
459 (r3 -= #xe000) 569 (call ccl-mule-utf-untrans)
460 (r3 //= 96) 570 (r0 = r1)
461 (r1 = (r7 + 32)) 571 (call ccl-mule-utf-untrans)
462 (r1 += ((r3 + 32) << 7)) 572 (r0 = r2)
463 (write-multibyte-character r0 r1))))))))))) 573 (call ccl-mule-utf-untrans)
464 574 (r0 = r3)
465 (if (r0 < #xfe) 575 (call ccl-mule-utf-untrans)
466 ;; 4byte encoding 576 (r0 = -1)
467 ;; keep those bytes as eight-bit-{control|graphic}
468 ;; Fixme: allow lookup in utf-subst-table-for-decode.
469 ((r1 = -1)
470 (r2 = -1)
471 (r3 = -1)
472 (read r1 r2 r3)
473 ;; r0 > #xf0, thus eight-bit-graphic
474 (write-multibyte-character r6 r0)
475 (if (r1 < #xa0)
476 (if (r1 < #x80) ; invalid byte
477 (write r1)
478 (write-multibyte-character r5 r1))
479 (write-multibyte-character r6 r1))
480 (if (r2 < #xa0)
481 (if (r2 < #x80) ; invalid byte
482 (write r2)
483 (write-multibyte-character r5 r2))
484 (write-multibyte-character r6 r2))
485 (if (r3 < #xa0)
486 (if (r3 < #x80) ; invalid byte
487 (write r3)
488 (write-multibyte-character r5 r3))
489 (write-multibyte-character r6 r3))
490 (if (r0 >= #xf8) ; 5- or 6-byte encoding
491 ((r0 = -1)
492 (read r0)
493 (if (r0 < #xa0)
494 (if (r0 < #x80) ; invalid byte
495 (write r0)
496 (write-multibyte-character r5 r0))
497 (write-multibyte-character r6 r0))
498 (if (r0 >= #xfc) ; 6-byte
499 ((r0 = -1)
500 (read r0)
501 (if (r0 < #xa0)
502 (if (r0 < #x80) ; invalid byte
503 (write r0)
504 (write-multibyte-character r5 r0))
505 (write-multibyte-character r6 r0)))))))
506 ;; else invalid byte >= #xfe
507 (write-multibyte-character r6 r0))))))
508 (repeat))) 577 (repeat)))
509 578
510 ;; At EOF... 579 ;; At EOF...
511 (if (r0 >= 0) 580 (if (r0 >= 0)
512 ((if (r0 < #x80) 581 ;; r0 >= #x80
513 (write r0) 582 ((call ccl-mule-utf-untrans)
514 (if (r0 < #xa0)
515 (write-multibyte-character r5 r0)
516 ((write-multibyte-character r6 r0))))
517 (if (r1 >= 0) 583 (if (r1 >= 0)
518 ((if (r1 < #x80) 584 ((r0 = r1)
519 (write r1) 585 (call ccl-mule-utf-untrans)
520 (if (r1 < #xa0)
521 (write-multibyte-character r5 r1)
522 ((write-multibyte-character r6 r1))))
523 (if (r2 >= 0) 586 (if (r2 >= 0)
524 ((if (r2 < #x80) 587 ((r0 = r2)
525 (write r2) 588 (call ccl-mule-utf-untrans)
526 (if (r2 < #xa0)
527 (write-multibyte-character r5 r2)
528 ((write-multibyte-character r6 r2))))
529 (if (r3 >= 0) 589 (if (r3 >= 0)
530 (if (r3 < #x80) 590 ((r0 = r3)
531 (write r3) 591 (call ccl-mule-utf-untrans))))))))))
532 (if (r3 < #xa0)
533 (write-multibyte-character r5 r3)
534 ((write-multibyte-character r6 r3))))))))))))
535 592
536 "CCL program to decode UTF-8. 593 "CCL program to decode UTF-8.
537Basic decoding is done into the charsets ascii, latin-iso8859-1 and 594Basic decoding is done into the charsets ascii, latin-iso8859-1 and
@@ -540,164 +597,206 @@ mule-unicode-*, but see also `utf-fragmentation-table' and
540Encodings of un-representable Unicode characters are decoded asis into 597Encodings of un-representable Unicode characters are decoded asis into
541eight-bit-control and eight-bit-graphic characters.") 598eight-bit-control and eight-bit-graphic characters.")
542 599
600(define-ccl-program ccl-mule-utf-8-encode-untrans
601 ;; UTF-8 decoder generates an UTF-8 sequence represented by a
602 ;; sequence eight-bit-control/graphic chars for an untranslatable
603 ;; character and an invalid byte.
604 ;;
605 ;; This CCL parses that sequence (the first byte is already in r1),
606 ;; writes out the original bytes of that sequence, and sets r5 to
607 ;; -1.
608 ;;
609 ;; If the eight-bit-control/graphic sequence is shorter than what r1
610 ;; suggests, it sets r5 and r6 to the last character read that
611 ;; should be handled by the next loop of a caller.
612 ;;
613 ;; Note: For UTF-8 validation, we only check if a character is
614 ;; eight-bit-control/graphic or not. It may result in incorrect
615 ;; handling of random binary data, but such a data can't be encoded
616 ;; by UTF-8 anyway. At least, UTF-8 decoders doesn't generate such
617 ;; a sequence even if a source contains invalid byte-sequence.
618 `(0
619 (;; Read the 2nd byte.
620 (read-multibyte-character r5 r6)
621 (r0 = (r5 != ,(charset-id 'eight-bit-control)))
622 (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0)
623 ((write r1) ; invalid UTF-8
624 (r1 = -1)
625 (end)))
626
627 (if (r1 <= #xC3)
628 ;; 2-byte sequence for an originally invalid byte.
629 ((r6 &= #x3F)
630 (r6 |= ((r1 & #x1F) << 6))
631 (write r6)
632 (r5 = -1)
633 (end)))
634
635 (write r1 r6)
636 (r2 = r1)
637 (r1 = -1)
638 ;; Read the 3rd byte.
639 (read-multibyte-character r5 r6)
640 (r0 = (r5 != ,(charset-id 'eight-bit-control)))
641 (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0)
642 (end)) ; invalid UTF-8
643 (write r6)
644 (if (r2 < #xF0)
645 ;; 3-byte sequence for an untranslated character.
646 ((r5 = -1)
647 (end)))
648 ;; Read the 4th byte.
649 (read-multibyte-character r5 r6)
650 (r0 = (r5 != ,(charset-id 'eight-bit-control)))
651 (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0)
652 (end)) ; invalid UTF-8
653 ;; 4-byte sequence for an untranslated character.
654 (write r6)
655 (r5 = -1)
656 (end))
657
658 ;; At EOF...
659 ((r5 = -1)
660 (if (r1 >= 0)
661 (write r1)))))
662
543(define-ccl-program ccl-encode-mule-utf-8 663(define-ccl-program ccl-encode-mule-utf-8
544 `(1 664 `(1
545 ((r5 = -1) 665 ((r5 = -1)
546 (loop 666 (loop
547 (if (r5 < 0) 667 (if (r5 < 0)
548 ((r1 = -1) 668 (read-multibyte-character r0 r1)
549 (read-multibyte-character r0 r1) 669 ;; Pre-read character is in r5 (charset-ID) and r6 (code-point).
550 (translate-character utf-translation-table-for-encode r0 r1)) 670 ((r0 = r5)
551 (;; We have already done read-multibyte-character.
552 (r0 = r5)
553 (r1 = r6) 671 (r1 = r6)
554 (r5 = -1))) 672 (r5 = -1)))
673 (translate-character utf-translation-table-for-encode r0 r1)
555 674
556 (if (r0 == ,(charset-id 'ascii)) 675 (if (r0 == ,(charset-id 'ascii))
557 (write r1) 676 (write-repeat r1))
558 677
559 (if (r0 == ,(charset-id 'latin-iso8859-1)) 678 (if (r0 == ,(charset-id 'latin-iso8859-1))
560 ;; r1 scalar utf-8 679 ;; r1 scalar utf-8
561 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx 680 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
562 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000 681 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000
563 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111 682 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111
564 ((r0 = (((r1 & #x40) >> 6) | #xc2)) 683 ((r0 = (((r1 & #x40) >> 6) | #xc2))
565 (r1 &= #x3f) 684 (r1 &= #x3f)
685 (r1 |= #x80)
686 (write r0)
687 (write-repeat r1)))
688
689 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
690 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
691 ;; #x3f80 == (0011 1111 1000 0000)b
692 (r1 &= #x7f)
693 (r1 += (r0 + 224)) ; 240 == -32 + #x0100
694 ;; now r1 holds scalar value
695 (if (r1 < #x0800)
696 ;; 2byte encoding
697 ((write ((r1 >> 6) | #xC0))
698 (r1 &= #x3F)
699 (r1 |= #x80)
700 (write-repeat r1))
701 ;; 3byte encoding
702 ((write ((r1 >> 12) | #xE0))
703 (write (((r1 & #x0FC0) >> 6) | #x80))
704 (r1 &= #x3F)
705 (r1 |= #x80)
706 (write-repeat r1)))))
707
708 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
709 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
710 (r1 &= #x7f)
711 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500
712 ;; now r1 holds scalar value
713 (write ((r1 >> 12) | #xE0))
714 (write (((r1 & #x0FC0) >> 6) | #x80))
715 (r1 &= #x3F)
716 (r1 |= #x80)
717 (write-repeat r1)))
718
719 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
720 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
721 (r1 &= #x7f)
722 (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000
723 ;; now r1 holds scalar value
724 (write ((r1 >> 12) | #xE0))
725 (write (((r1 & #x0FC0) >> 6) | #x80))
726 (r1 &= #x3F)
727 (r1 |= #x80)
728 (write-repeat r1)))
729
730 (if (r0 == ,(charset-id 'eight-bit-control))
731 ;; r1 scalar utf-8
732 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
733 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000
734 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111
735 ((write #xC2)
736 (write-repeat r1)))
737
738 (if (r0 == ,(charset-id 'eight-bit-graphic))
739 ;; r1 scalar utf-8
740 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
741 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000
742 ;; ff 0000 0000 1111 1111 1101 1111 1011 1111
743 ((r0 = (r1 >= #xC0))
744 (r0 &= (r1 <= #xC3))
745 (r4 = (r1 >= #xE1))
746 (r4 &= (r1 <= #xF7))
747 (r0 |= r4)
748 (if r0
749 ((call ccl-mule-utf-8-encode-untrans)
750 (repeat))
751 (write-repeat r1))))
752
753 (lookup-character utf-subst-table-for-encode r0 r1)
754 (if r7 ; lookup succeeded
755 (if (r0 < #x800)
756 ;; 2byte encoding
757 ((write ((r0 >> 6) | #xC0))
758 (r1 &= #x3F)
759 (r1 |= #x80)
760 (write-repeat r1))
761 ;; 3byte encoding
762 ((write ((r0 >> 12) | #xE0))
763 (write (((r0 & #x0FC0) >> 6) | #x80))
764 (r1 &= #x3F)
566 (r1 |= #x80) 765 (r1 |= #x80)
567 (write r0 r1)) 766 (write-repeat r1))))
568
569 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
570 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
571 ;; #x3f80 == (0011 1111 1000 0000)b
572 (r1 &= #x7f)
573 (r1 += (r0 + 224)) ; 240 == -32 + #x0100
574 ;; now r1 holds scalar value
575 (if (r1 < #x0800)
576 ;; 2byte encoding
577 ((r0 = (((r1 & #x07c0) >> 6) | #xc0))
578 ;; #x07c0 == (0000 0111 1100 0000)b
579 (r1 &= #x3f)
580 (r1 |= #x80)
581 (write r0 r1))
582 ;; 3byte encoding
583 ((r0 = (((r1 & #xf000) >> 12) | #xe0))
584 (r2 = ((r1 & #x3f) | #x80))
585 (r1 &= #x0fc0)
586 (r1 >>= 6)
587 (r1 |= #x80)
588 (write r0 r1 r2))))
589
590 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
591 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
592 (r1 &= #x7f)
593 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500
594 (r0 = (((r1 & #xf000) >> 12) | #xe0))
595 (r2 = ((r1 & #x3f) | #x80))
596 (r1 &= #x0fc0)
597 (r1 >>= 6)
598 (r1 |= #x80)
599 (write r0 r1 r2))
600
601 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
602 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
603 (r1 &= #x7f)
604 (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000
605 (r0 = (((r1 & #xf000) >> 12) | #xe0))
606 (r2 = ((r1 & #x3f) | #x80))
607 (r1 &= #x0fc0)
608 (r1 >>= 6)
609 (r1 |= #x80)
610 (write r0 r1 r2))
611
612 (if (r0 == ,(charset-id 'eight-bit-control))
613 ;; r1 scalar utf-8
614 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
615 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000
616 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111
617 ((write #xc2)
618 (write r1))
619
620 (if (r0 == ,(charset-id 'eight-bit-graphic))
621 ;; r1 scalar utf-8
622 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
623 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000
624 ;; ff 0000 0000 1111 1111 1101 1111 1011 1111
625 ((write r1)
626 (r1 = -1)
627 (read-multibyte-character r0 r1)
628 (if (r0 != ,(charset-id 'eight-bit-graphic))
629 (if (r0 != ,(charset-id 'eight-bit-control))
630 ((r5 = r0)
631 (r6 = r1))))
632 (if (r5 < 0)
633 ((read-multibyte-character r0 r2)
634 (if (r0 != ,(charset-id 'eight-bit-graphic))
635 (if (r0 != ,(charset-id 'eight-bit-control))
636 ((r5 = r0)
637 (r6 = r2))))
638 (if (r5 < 0)
639 (write r1 r2)
640 (if (r1 < #xa0)
641 (write r1)
642 ((write #xc2)
643 (write r1)))))))
644
645 ((lookup-character utf-subst-table-for-encode r0 r1)
646 (if r7 ; lookup succeeded
647 ((r1 = (((r0 & #xf000) >> 12) | #xe0))
648 (r2 = ((r0 & #x3f) | #x80))
649 (r0 &= #x0fc0)
650 (r0 >>= 6)
651 (r0 |= #x80)
652 (write r1 r0 r2))
653 ;; Unsupported character.
654 ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
655 ((write #xef)
656 (write #xbf)
657 (write #xbd)))))))))))
658 (repeat)))
659 (if (r1 >= #xa0)
660 (write r1)
661 (if (r1 >= #x80)
662 ((write #xc2)
663 (write r1)))))
664 767
768 ;; Unsupported character.
769 ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
770 (write #xef)
771 (write #xbf)
772 (write-repeat #xbd))))
665 "CCL program to encode into UTF-8.") 773 "CCL program to encode into UTF-8.")
666 774
667 775
668(define-ccl-program ccl-untranslated-to-ucs 776(define-ccl-program ccl-untranslated-to-ucs
669 `(0 777 `(0
670 (if (r0 < #xf0) ; 3-byte encoding, as above 778 (if (r1 == 0)
671 ((r4 = 0) 779 nil
672 (r3 = (r1 & #b11000000)) 780 (if (r0 <= #xC3) ; 2-byte encoding
673 (r3 |= ((r2 >> 2) & #b00110000)) 781 ((r0 = ((r0 & #x3) << 6))
674 (if (r3 != #b10100000) 782 (r0 |= (r1 & #x3F))
675 (r4 = 1) 783 (r1 = 2))
676 ((r3 = ((r0 & #x0f) << 12)) 784 (if (r2 == 0)
677 (r3 += ((r1 & #x3f) << 6)) 785 (r1 = 0)
678 (r3 += (r2 & #x3f)) 786 (if (r0 < #xF0) ; 3-byte encoding, as above
679 (if (r3 < #x0800) 787 ((r0 = ((r0 & #xF) << 12))
680 (r4 = 1)))) 788 (r0 |= ((r1 & #x3F) << 6))
681 (if (r4 != 0) 789 (r0 |= (r1 & #x3F))
682 (r0 = 0) 790 (r1 = 3))
683 (r0 = r3))) 791 (if (r3 == 0)
684 (if (r0 < #xf8) ; 4-byte (Mule-UCS recipe) 792 (r1 = 0)
685 ((r4 = (r1 >> 6)) 793 ((r0 = ((r0 & #x7) << 18))
686 (if (r4 != #b10) 794 (r0 |= ((r1 & #x3F) << 12))
687 (r0 = 0) 795 (r0 |= ((r2 & #x3F) << 6))
688 ((r4 = (r2 >> 6)) 796 (r0 |= (r3 & #x3F))
689 (if (r4 != #b10) 797 (r1 = 4))))))))
690 (r0 = 0) 798 "Decode 2-, 3-, or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0.
691 ((r4 = (r3 >> 6)) 799Set r1 to the byte length. r0 == 0 for invalid sequence.")
692 (if (r4 != #b10)
693 (r0 = 0)
694 ((r1 = ((r1 & #x3F) << 12))
695 (r2 = ((r2 & #x3F) << 6))
696 (r3 &= #x3F)
697 (r0 = (((((r0 & #x07) << 18) | r1) | r2) | r3)))))))))
698 (r0 = 0))))
699 "Decode 3- or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0.
700r0 == 0 for invalid sequence.")
701 800
702(defvar utf-8-ccl-regs (make-vector 8 0)) 801(defvar utf-8-ccl-regs (make-vector 8 0))
703 802
@@ -708,33 +807,47 @@ Only for 3- or 4-byte sequences."
708 (aset utf-8-ccl-regs 1 (or (char-after (1+ (point))) 0)) 807 (aset utf-8-ccl-regs 1 (or (char-after (1+ (point))) 0))
709 (aset utf-8-ccl-regs 2 (or (char-after (+ 2 (point))) 0)) 808 (aset utf-8-ccl-regs 2 (or (char-after (+ 2 (point))) 0))
710 (aset utf-8-ccl-regs 3 (or (char-after (+ 3 (point))) 0)) 809 (aset utf-8-ccl-regs 3 (or (char-after (+ 3 (point))) 0))
711 (ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs) 810 (ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs))
712 (aref utf-8-ccl-regs 0))
713 811
714(defun utf-8-help-echo (window object position) 812(defun utf-8-help-echo (window object position)
715 (format "Untranslated Unicode U+%04X" 813 (format "Untranslated Unicode U+%04X"
716 (get-char-property position 'untranslated-utf-8 object))) 814 (get-char-property position 'untranslated-utf-8 object)))
717 815
718;; We compose the untranslatable sequences into a single character. 816;; We compose the untranslatable sequences into a single character,
817;; and move point to the next character.
719;; This is infelicitous for editing, because there's currently no 818;; This is infelicitous for editing, because there's currently no
720;; mechanism for treating compositions as atomic, but is OK for 819;; mechanism for treating compositions as atomic, but is OK for
721;; display. They are composed to U+FFFD with help-echo which 820;; display. They are composed to U+FFFD with help-echo which
722;; indicates the unicodes they represent. This function GCs too much. 821;; indicates the unicodes they represent. This function GCs too much.
723(defsubst utf-8-compose () 822
724 "Put a suitable composition on an untranslatable sequence. 823;; If utf-translate-cjk-mode is non-nil, this function is called with
725Return the sequence's length." 824;; HASH-TABLE which translates CJK characters into some of CJK
726 (let* ((u (utf-8-untranslated-to-ucs)) 825;; charsets.
727 (l (unless (zerop u) 826
728 (if (>= u #x10000) 827(defsubst utf-8-compose (hash-table)
729 4 828 "Put a suitable composition on an untranslatable sequence at point.
730 3)))) 829If HASH-TABLE is non-nil, try to translate CJK characters by it at first.
731 (when l 830Move point to the end of the sequence."
732 (put-text-property (point) (min (point-max) (+ l (point))) 831 (utf-8-untranslated-to-ucs)
733 'untranslated-utf-8 u) 832 (let ((l (aref utf-8-ccl-regs 1))
734 (put-text-property (point) (min (point-max) (+ l (point))) 833 ch)
735 'help-echo 'utf-8-help-echo) 834 (if (> l 0)
736 (compose-region (point) (+ l (point)) ?$,3u=(B) 835 (if (and hash-table
737 l))) 836 (setq ch (gethash (aref utf-8-ccl-regs 0) hash-table)))
837 (progn
838 (insert ch)
839 (delete-region (point) (min (point-max) (+ l (point)))))
840 (setq ch (aref utf-8-ccl-regs 0))
841 (put-text-property (point) (min (point-max) (+ l (point)))
842 'untranslated-utf-8 ch)
843 (put-text-property (point) (min (point-max) (+ l (point)))
844 'help-echo 'utf-8-help-echo)
845 (if (= l 2)
846 (put-text-property (point) (min (point-max) (+ l (point)))
847 'display (format "\\%03o" ch))
848 (compose-region (point) (+ l (point)) ?$,3u=(B))
849 (forward-char l))
850 (forward-char 1))))
738 851
739(defcustom utf-8-compose-scripts nil 852(defcustom utf-8-compose-scripts nil
740 "*Non-nil means compose various scripts on decoding utf-8 text." 853 "*Non-nil means compose various scripts on decoding utf-8 text."
@@ -744,38 +857,63 @@ Return the sequence's length."
744 857
745(defun utf-8-post-read-conversion (length) 858(defun utf-8-post-read-conversion (length)
746 "Compose untranslated utf-8 sequences into single characters. 859 "Compose untranslated utf-8 sequences into single characters.
860If `utf-translate-cjk-mode' is non-nil, tries to translate CJK characters.
747Also compose particular scripts if `utf-8-compose-scripts' is non-nil." 861Also compose particular scripts if `utf-8-compose-scripts' is non-nil."
748 (save-excursion 862 (save-excursion
749 ;; Can't do eval-when-compile to insert a multibyte constant 863 (save-restriction
750 ;; version of the string in the loop, since it's always loaded as 864 (narrow-to-region (point) (+ (point) length))
751 ;; unibyte from a byte-compiled file. 865 ;; Can't do eval-when-compile to insert a multibyte constant
752 (let ((range (string-as-multibyte "^\xe1-\xf7"))) 866 ;; version of the string in the loop, since it's always loaded as
753 (while (and (skip-chars-forward range) 867 ;; unibyte from a byte-compiled file.
754 (not (eobp))) 868 (let ((range (string-as-multibyte "^\xc0-\xc3\xe1-\xf7"))
755 (forward-char (utf-8-compose))))) 869 hash-table ch)
756 ;; Fixme: Takahashi-san implies it may not work this easily. I 870 (when utf-translate-cjk-mode
757 ;; asked why but didn't get a reply. -- fx 871 (if (not utf-translate-cjk-lang-env)
758 (when (and utf-8-compose-scripts (> length 1)) 872 ;; Check these characters:
759 ;; These currently have definitions which cover the relevant 873 ;; "U+2e80-U+33ff", "U+ff00-U+ffef"
760 ;; unicodes. We could avoid loading thai-util &c by checking 874 ;; We may have to translate them to CJK charsets.
761 ;; whether the region contains any characters with the appropriate 875 (let ((range2 "$,29@(B-$,2G$,3r`(B-$,3u/(B"))
762 ;; categories. There aren't yet Unicode-based rules for Tibetan. 876 (skip-chars-forward (concat range range2))
763 (save-excursion (setq length (diacritic-post-read-conversion length))) 877 (unless (eobp)
764 (save-excursion (setq length (thai-post-read-conversion length))) 878 (utf-translate-cjk-load-tables)
765 (save-excursion (setq length (lao-post-read-conversion length))) 879 (setq range (concat range range2)))
766 (save-excursion (setq length (devanagari-post-read-conversion length))) 880 (setq hash-table (get 'utf-subst-table-for-decode
767 (save-excursion (setq length (malayalam-post-read-conversion length))) 881 'translation-hash-table)))))
768 (save-excursion (setq length (tamil-post-read-conversion length)))) 882 (while (and (skip-chars-forward range)
769 length) 883 (not (eobp)))
770 884 (setq ch (following-char))
771;; ucs-tables is preloaded 885 (if (< ch 256)
772;; (defun utf-8-pre-write-conversion (beg end) 886 (utf-8-compose hash-table)
773;; "Semi-dummy pre-write function effectively to autoload ucs-tables." 887 (if (and hash-table
774;; ;; Ensure translation-table is loaded. 888 (setq ch (gethash (encode-char ch 'ucs) hash-table)))
775;; (require 'ucs-tables) 889 (progn
776;; ;; Don't do this again. 890 (insert ch)
777;; (coding-system-put 'mule-utf-8 'pre-write-conversion nil) 891 (delete-char 1))
778;; nil) 892 (forward-char 1)))))
893
894 (when (and utf-8-compose-scripts (> length 1))
895 ;; These currently have definitions which cover the relevant
896 ;; unicodes. We could avoid loading thai-util &c by checking
897 ;; whether the region contains any characters with the appropriate
898 ;; categories. There aren't yet Unicode-based rules for Tibetan.
899 (diacritic-compose-region (point-max) (point-min))
900 (thai-compose-region (point-max) (point-min))
901 (lao-compose-region (point-max) (point-min))
902 (devanagari-compose-region (point-max) (point-min))
903 (malayalam-compose-region (point-max) (point-min))
904 (tamil-compose-region (point-max) (point-min)))
905 (- (point-max) (point-min)))))
906
907(defun utf-8-pre-write-conversion (beg end)
908 "Prepare for `utf-translate-cjk-mode' to encode text between BEG and END.
909This is used as a post-read-conversion of utf-8 coding system."
910 (if (and utf-translate-cjk-mode
911 (not utf-translate-cjk-lang-env)
912 (save-excursion
913 (goto-char beg)
914 (re-search-forward "\\cc\\|\\cj\\|\\ch" end t)))
915 (utf-translate-cjk-load-tables))
916 nil)
779 917
780(make-coding-system 918(make-coding-system
781 'mule-utf-8 4 ?u 919 'mule-utf-8 4 ?u
@@ -797,18 +935,20 @@ any of the character sets listed above are encoded into the UTF-8 byte
797sequence representing U+FFFD (REPLACEMENT CHARACTER)." 935sequence representing U+FFFD (REPLACEMENT CHARACTER)."
798 936
799 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) 937 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
800 '((safe-charsets 938 `((safe-charsets
801 ascii 939 ascii
802 eight-bit-control 940 eight-bit-control
803 eight-bit-graphic 941 eight-bit-graphic
804 latin-iso8859-1 942 latin-iso8859-1
805 mule-unicode-0100-24ff 943 mule-unicode-0100-24ff
806 mule-unicode-2500-33ff 944 mule-unicode-2500-33ff
807 mule-unicode-e000-ffff) 945 mule-unicode-e000-ffff
946 ,@(if utf-translate-cjk-mode
947 utf-translate-cjk-charsets))
808 (mime-charset . utf-8) 948 (mime-charset . utf-8)
809 (coding-category . coding-category-utf-8) 949 (coding-category . coding-category-utf-8)
810 (valid-codes (0 . 255)) 950 (valid-codes (0 . 255))
811;; (pre-write-conversion . utf-8-pre-write-conversion) 951 (pre-write-conversion . utf-8-pre-write-conversion)
812 (post-read-conversion . utf-8-post-read-conversion) 952 (post-read-conversion . utf-8-post-read-conversion)
813 (translation-table-for-encode . utf-translation-table-for-encode) 953 (translation-table-for-encode . utf-translation-table-for-encode)
814 (dependency unify-8859-on-encoding-mode 954 (dependency unify-8859-on-encoding-mode