diff options
| author | Kenichi Handa | 2004-06-12 02:10:37 +0000 |
|---|---|---|
| committer | Kenichi Handa | 2004-06-12 02:10:37 +0000 |
| commit | c71c26e9295573509ceaa468ea973299a8b311e1 (patch) | |
| tree | bea8ba45657f2c84350359f97f0eeeec84d7d38d | |
| parent | 7805cdbd4f39ef8904b68f687be8715306910769 (diff) | |
| download | emacs-c71c26e9295573509ceaa468ea973299a8b311e1.tar.gz emacs-c71c26e9295573509ceaa468ea973299a8b311e1.zip | |
(utf-translate-cjk-charsets): New
variable.
(utf-translate-cjk-unicode-range): New variable.
(utf-translate-cjk-load-tables): New function.
(utf-lookup-subst-table-for-decode): New function.
(utf-lookup-subst-table-for-encode): New function.
(utf-translate-cjk-mode): Init-value changed to t. Don't load
tables here. Update safe-charsets of utf-* coding systems.
(ccl-mule-utf-untrans): New CCL.
(ccl-decode-mule-utf-8): Call ccl-mule-utf-untrans. Use `repeat'
at end of each branch.
(ccl-mule-utf-8-encode-untrans): New CCL.
(ccl-encode-mule-utf-8): Call ccl-mule-utf-8-encode-untrans.
(ccl-untranslated-to-ucs): Handle 2-byte encoding. Set r1 to the
length of encoding. Don't return r0.
(utf-8-compose): New arg hash-table. Handle 2-byte encoding.
(utf-8-post-read-conversion): Narrow to region properly. If
utf-translate-cjk-mode is on, load tables if necessary. Call
utf-8-compose with hash-table arg if necessary. Call
XXX-compose-region instead of XXX-post-read-convesion.
(utf-8-pre-write-conversion): New function.
(mule-utf-8): Include CJK charsets in safe-charsets if
utf-translate-cjk-mode is on. Add pre-write-conversion.
| -rw-r--r-- | lisp/international/utf-8.el | 1086 |
1 files changed, 613 insertions, 473 deletions
diff --git a/lisp/international/utf-8.el b/lisp/international/utf-8.el index d4dd7b6c882..e324d0c0270 100644 --- a/lisp/international/utf-8.el +++ b/lisp/international/utf-8.el | |||
| @@ -190,9 +190,102 @@ Setting this variable outside customize has no effect." | |||
| 190 | :type 'boolean | 190 | :type 'boolean |
| 191 | :group 'mule) | 191 | :group 'mule) |
| 192 | 192 | ||
| 193 | |||
| 194 | (defconst utf-translate-cjk-charsets '(chinese-gb2312 | ||
| 195 | chinese-big5-1 chinese-big5-2 | ||
| 196 | japanese-jisx0208 japanese-jisx0212 | ||
| 197 | korean-ksc5601) | ||
| 198 | "List of charsets supported by `utf-translate-cjk-mode'.") | ||
| 199 | |||
| 200 | (defconst utf-translate-cjk-unicode-range | ||
| 201 | '((#x2e80 . #xd7a3) | ||
| 202 | (#xff00 . #xffef)) | ||
| 203 | "List of Unicode code ranges supported by `utf-translate-cjk-mode'.") | ||
| 204 | |||
| 205 | ;; Return non-nil if CODE-POINT is in `utf-translate-cjk-unicode-range'. | ||
| 206 | (defsubst utf-translate-cjk-substitutable-p (code-point) | ||
| 207 | (let ((tail utf-translate-cjk-unicode-range) | ||
| 208 | elt) | ||
| 209 | (while tail | ||
| 210 | (setq elt (car tail) tail (cdr tail)) | ||
| 211 | (if (and (>= code-point (car elt)) (<= code-point (cdr elt))) | ||
| 212 | (setq tail nil) | ||
| 213 | (setq elt nil))) | ||
| 214 | elt)) | ||
| 215 | |||
| 216 | (defvar utf-translate-cjk-lang-env nil | ||
| 217 | "Language environment in which tables for `utf-translate-cjk-mode' is loaded. | ||
| 218 | The value nil means that the tables are not yet loaded.") | ||
| 219 | |||
| 220 | (defun utf-translate-cjk-load-tables () | ||
| 221 | "Load tables for `utf-translate-cjk-mode'." | ||
| 222 | ;; Fixme: Allow the use of the CJK charsets to be | ||
| 223 | ;; customized by reordering and possible omission. | ||
| 224 | (let ((redefined (< (hash-table-size ucs-mule-cjk-to-unicode) 43000))) | ||
| 225 | (if redefined | ||
| 226 | ;; Redefine them with realistic initial sizes and a | ||
| 227 | ;; smallish rehash size to avoid wasting significant | ||
| 228 | ;; space after they're built. | ||
| 229 | (setq ucs-mule-cjk-to-unicode | ||
| 230 | (make-hash-table :test 'eq :size 43000 :rehash-size 1000) | ||
| 231 | ucs-unicode-to-mule-cjk | ||
| 232 | (make-hash-table :test 'eq :size 21500 :rehash-size 1000))) | ||
| 233 | |||
| 234 | ;; Load the files explicitly, to avoid having to keep | ||
| 235 | ;; around the large tables they contain (as well as the | ||
| 236 | ;; ones which get built). | ||
| 237 | (cond ((string= "Korean" current-language-environment) | ||
| 238 | (load "subst-jis") | ||
| 239 | (load "subst-big5") | ||
| 240 | (load "subst-gb2312") | ||
| 241 | (load "subst-ksc")) | ||
| 242 | ((string= "Chinese-BIG5" current-language-environment) | ||
| 243 | (load "subst-jis") | ||
| 244 | (load "subst-ksc") | ||
| 245 | (load "subst-gb2312") | ||
| 246 | (load "subst-big5")) | ||
| 247 | ((string= "Chinese-GB" current-language-environment) | ||
| 248 | (load "subst-jis") | ||
| 249 | (load "subst-ksc") | ||
| 250 | (load "subst-big5") | ||
| 251 | (load "subst-gb2312")) | ||
| 252 | (t | ||
| 253 | (load "subst-ksc") | ||
| 254 | (load "subst-gb2312") | ||
| 255 | (load "subst-big5") | ||
| 256 | (load "subst-jis"))) ; jis covers as much as big5, gb2312 | ||
| 257 | |||
| 258 | (when redefined | ||
| 259 | (define-translation-hash-table 'utf-subst-table-for-decode | ||
| 260 | ucs-unicode-to-mule-cjk) | ||
| 261 | (define-translation-hash-table 'utf-subst-table-for-encode | ||
| 262 | ucs-mule-cjk-to-unicode) | ||
| 263 | (set-char-table-extra-slot (get 'utf-translation-table-for-encode | ||
| 264 | 'translation-table) | ||
| 265 | 1 ucs-mule-cjk-to-unicode)) | ||
| 266 | |||
| 267 | (setq utf-translate-cjk-lang-env current-language-environment))) | ||
| 268 | |||
| 269 | (defun utf-lookup-subst-table-for-decode (code-point) | ||
| 270 | (if (and utf-translate-cjk-mode | ||
| 271 | (not utf-translate-cjk-lang-env) | ||
| 272 | (utf-translate-cjk-substitutable-p code-point)) | ||
| 273 | (utf-translate-cjk-load-tables)) | ||
| 274 | (gethash code-point | ||
| 275 | (get 'utf-subst-table-for-decode 'translation-hash-table))) | ||
| 276 | |||
| 277 | |||
| 278 | (defun utf-lookup-subst-table-for-encode (char) | ||
| 279 | (if (and utf-translate-cjk-mode | ||
| 280 | (not utf-translate-cjk-lang-env) | ||
| 281 | (memq (char-charset char) utf-translate-cjk-charsets)) | ||
| 282 | (utf-translate-cjk-load-tables)) | ||
| 283 | (gethash char | ||
| 284 | (get 'utf-subst-table-for-encode 'translation-hash-table))) | ||
| 285 | |||
| 193 | (define-minor-mode utf-translate-cjk-mode | 286 | (define-minor-mode utf-translate-cjk-mode |
| 194 | "Whether the UTF based coding systems should decode/encode CJK characters. | 287 | "Whether the UTF based coding systems should decode/encode CJK characters. |
| 195 | Enabling this loads tables which allow the coding systems mule-utf-8, | 288 | Enabling this allows the coding systems mule-utf-8, |
| 196 | mule-utf-16le and mule-utf-16be to encode characters in the charsets | 289 | mule-utf-16le and mule-utf-16be to encode characters in the charsets |
| 197 | `korean-ksc5601', `chinese-gb2312', `chinese-big5-1', | 290 | `korean-ksc5601', `chinese-gb2312', `chinese-big5-1', |
| 198 | `chinese-big5-2', `japanese-jisx0208' and `japanese-jisx0212', and to | 291 | `chinese-big5-2', `japanese-jisx0208' and `japanese-jisx0212', and to |
| @@ -203,49 +296,16 @@ according to the language environment in effect when this option is | |||
| 203 | turned on: ksc5601 for Korean, gb2312 for Chinese-GB, big5 for | 296 | turned on: ksc5601 for Korean, gb2312 for Chinese-GB, big5 for |
| 204 | Chinese-Big5 and jisx for other environments. | 297 | Chinese-Big5 and jisx for other environments. |
| 205 | 298 | ||
| 206 | The tables are large (over 40000 entries), so this option is not the | 299 | This option is on by default. If you are not interested in CJK |
| 207 | default. Also, installing them may be rather slow." | 300 | characters and want to avoid some overhead on encoding/decoding |
| 208 | :init-value nil | 301 | by the above coding systems, you can customize this option to nil." |
| 302 | :init-value t | ||
| 209 | :version "21.4" | 303 | :version "21.4" |
| 210 | :type 'boolean | 304 | :type 'boolean |
| 211 | :set-after '(current-language-environment) | ||
| 212 | :group 'mule | 305 | :group 'mule |
| 213 | :global t | 306 | :global t |
| 214 | (if utf-translate-cjk-mode | 307 | (if utf-translate-cjk-mode |
| 215 | ;; Fixme: Allow the use of the CJK charsets to be | ||
| 216 | ;; customized by reordering and possible omission. | ||
| 217 | (progn | 308 | (progn |
| 218 | ;; Redefine them with realistic initial sizes and a | ||
| 219 | ;; smallish rehash size to avoid wasting significant | ||
| 220 | ;; space after they're built. | ||
| 221 | (setq ucs-mule-cjk-to-unicode | ||
| 222 | (make-hash-table :test 'eq :size 43000 :rehash-size 1000) | ||
| 223 | ucs-unicode-to-mule-cjk | ||
| 224 | (make-hash-table :test 'eq :size 21500 :rehash-size 1000)) | ||
| 225 | ;; Load the files explicitly, to avoid having to keep | ||
| 226 | ;; around the large tables they contain (as well as the | ||
| 227 | ;; ones which get built). | ||
| 228 | (cond | ||
| 229 | ((string= "Korean" current-language-environment) | ||
| 230 | (load "subst-jis") | ||
| 231 | (load "subst-big5") | ||
| 232 | (load "subst-gb2312") | ||
| 233 | (load "subst-ksc")) | ||
| 234 | ((string= "Chinese-BIG5" current-language-environment) | ||
| 235 | (load "subst-jis") | ||
| 236 | (load "subst-ksc") | ||
| 237 | (load "subst-gb2312") | ||
| 238 | (load "subst-big5")) | ||
| 239 | ((string= "Chinese-GB" current-language-environment) | ||
| 240 | (load "subst-jis") | ||
| 241 | (load "subst-ksc") | ||
| 242 | (load "subst-big5") | ||
| 243 | (load "subst-gb2312")) | ||
| 244 | (t | ||
| 245 | (load "subst-ksc") | ||
| 246 | (load "subst-gb2312") | ||
| 247 | (load "subst-big5") | ||
| 248 | (load "subst-jis"))) ; jis covers as much as big5, gb2312 | ||
| 249 | (define-translation-hash-table 'utf-subst-table-for-decode | 309 | (define-translation-hash-table 'utf-subst-table-for-decode |
| 250 | ucs-unicode-to-mule-cjk) | 310 | ucs-unicode-to-mule-cjk) |
| 251 | (define-translation-hash-table 'utf-subst-table-for-encode | 311 | (define-translation-hash-table 'utf-subst-table-for-encode |
| @@ -259,7 +319,58 @@ default. Also, installing them may be rather slow." | |||
| 259 | (make-hash-table :test 'eq)) | 319 | (make-hash-table :test 'eq)) |
| 260 | (set-char-table-extra-slot (get 'utf-translation-table-for-encode | 320 | (set-char-table-extra-slot (get 'utf-translation-table-for-encode |
| 261 | 'translation-table) | 321 | 'translation-table) |
| 262 | 1 nil))) | 322 | 1 nil)) |
| 323 | |||
| 324 | ;; Update safe-chars of mule-utf-* coding systems. | ||
| 325 | (dolist (elt (coding-system-list t)) | ||
| 326 | (if (string-match "^mule-utf" (symbol-name elt)) | ||
| 327 | (let ((safe-charsets (coding-system-get elt 'safe-charsets)) | ||
| 328 | (safe-chars (coding-system-get elt 'safe-chars)) | ||
| 329 | (need-update nil)) | ||
| 330 | (dolist (charset utf-translate-cjk-charsets) | ||
| 331 | (unless (eq utf-translate-cjk-mode (memq charset safe-charsets)) | ||
| 332 | (setq safe-charsets | ||
| 333 | (if utf-translate-cjk-mode | ||
| 334 | (cons charset safe-charsets) | ||
| 335 | (delq charset safe-charsets)) | ||
| 336 | need-update t) | ||
| 337 | (aset safe-chars (make-char charset) utf-translate-cjk-mode))) | ||
| 338 | (when need-update | ||
| 339 | (coding-system-put elt 'safe-charsets safe-charsets) | ||
| 340 | (define-coding-system-internal elt)))))) | ||
| 341 | |||
| 342 | (define-ccl-program ccl-mule-utf-untrans | ||
| 343 | ;; R0 is an untranslatable Unicode code-point (U+3500..U+DFFF or | ||
| 344 | ;; U+10000..U+10FFFF) or an invaid byte (#x00..#xFF). Write | ||
| 345 | ;; eight-bit-control/graphic sequence (2 to 4 chars) representing | ||
| 346 | ;; UTF-8 sequence of r0. Registers r4, r5, r6 are modified. | ||
| 347 | ;; | ||
| 348 | ;; This is a subrountine because we assume that this is called very | ||
| 349 | ;; rarely (so we don't have to worry about the overhead of the | ||
| 350 | ;; call). | ||
| 351 | `(0 | ||
| 352 | ((r5 = ,(charset-id 'eight-bit-control)) | ||
| 353 | (r6 = ,(charset-id 'eight-bit-graphic)) | ||
| 354 | (if (r0 < #x100) | ||
| 355 | ((r4 = ((r0 >> 6) | #xC0)) | ||
| 356 | (write-multibyte-character r6 r4)) | ||
| 357 | ((if (r0 < #x10000) | ||
| 358 | ((r4 = ((r0 >> 12) | #xE0)) | ||
| 359 | (write-multibyte-character r6 r4)) | ||
| 360 | ((r4 = ((r0 >> 18) | #xF0)) | ||
| 361 | (write-multibyte-character r6 r4) | ||
| 362 | (r4 = (((r0 >> 12) & #x3F) | #x80)) | ||
| 363 | (if (r4 < #xA0) | ||
| 364 | (write-multibyte-character r5 r4) | ||
| 365 | (write-multibyte-character r6 r4)))) | ||
| 366 | (r4 = (((r0 >> 6) & #x3F) | #x80)) | ||
| 367 | (if (r4 < #xA0) | ||
| 368 | (write-multibyte-character r5 r4) | ||
| 369 | (write-multibyte-character r6 r4)))) | ||
| 370 | (r4 = ((r0 & #x3F) | #x80)) | ||
| 371 | (if (r4 < #xA0) | ||
| 372 | (write-multibyte-character r5 r4) | ||
| 373 | (write-multibyte-character r6 r4))))) | ||
| 263 | 374 | ||
| 264 | (define-ccl-program ccl-decode-mule-utf-8 | 375 | (define-ccl-program ccl-decode-mule-utf-8 |
| 265 | ;; | 376 | ;; |
| @@ -278,260 +389,206 @@ default. Also, installing them may be rather slow." | |||
| 278 | ;; (>= 8000) | | | 389 | ;; (>= 8000) | | |
| 279 | ;; mule-unicode-2500-33ff | 3 | 4 | 390 | ;; mule-unicode-2500-33ff | 3 | 4 |
| 280 | ;; mule-unicode-e000-ffff | 3 | 4 | 391 | ;; mule-unicode-e000-ffff | 3 | 4 |
| 392 | ;; -----------------------+----------------+--------------- | ||
| 393 | ;; invalid byte | 1 | 2 | ||
| 281 | ;; | 394 | ;; |
| 282 | ;; Thus magnification factor is two. | 395 | ;; Thus magnification factor is two. |
| 283 | ;; | 396 | ;; |
| 284 | `(2 | 397 | `(2 |
| 285 | ((r5 = ,(charset-id 'eight-bit-control)) | 398 | ((r0 = -1) |
| 286 | (r6 = ,(charset-id 'eight-bit-graphic)) | ||
| 287 | (loop | 399 | (loop |
| 288 | (r0 = -1) | 400 | (if (r0 < 0) |
| 289 | (read r0) | 401 | (read r0)) |
| 290 | |||
| 291 | ;; 1byte encoding, i.e., ascii | ||
| 292 | (if (r0 < #x80) | 402 | (if (r0 < #x80) |
| 293 | ((write r0)) | 403 | ;; 1-byte encoding, i.e., ascii |
| 294 | (if (r0 < #xc0) ; continuation byte (invalid here) | 404 | ((write r0) |
| 295 | ((if (r0 < #xa0) | 405 | (r0 = -1) |
| 296 | (write-multibyte-character r5 r0) | 406 | (repeat))) |
| 297 | (write-multibyte-character r6 r0))) | 407 | (if (r0 < #xc0) ; continuation byte (invalid here) |
| 298 | ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx | 408 | ((call ccl-mule-utf-untrans) |
| 299 | (if (r0 < #xe0) | 409 | (r0 = -1) |
| 300 | ((r1 = -1) | 410 | (repeat))) |
| 301 | (read r1) | 411 | |
| 302 | 412 | ;; Read the 2nd byte. | |
| 303 | (if ((r1 & #b11000000) != #b10000000) | 413 | (r1 = -1) |
| 304 | ;; Invalid 2-byte sequence | 414 | (read r1) |
| 305 | ((if (r0 < #xa0) | 415 | (if ((r1 & #b11000000) != #b10000000) ; Invalid 2nd byte |
| 306 | (write-multibyte-character r5 r0) | 416 | ((call ccl-mule-utf-untrans) |
| 307 | (write-multibyte-character r6 r0)) | 417 | ;; Handle it in the next loop. |
| 308 | (if (r1 < #x80) | 418 | (r0 = r1) |
| 309 | (write r1) | 419 | (repeat))) |
| 310 | (if (r1 < #xa0) | 420 | |
| 311 | (write-multibyte-character r5 r1) | 421 | (if (r0 < #xe0) |
| 312 | (write-multibyte-character r6 r1)))) | 422 | ;; 2-byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx |
| 313 | 423 | ((r2 = ((r0 & #x1F) << 6)) | |
| 314 | ((r3 = r0) ; save in case of overlong sequence | 424 | (r2 |= (r1 & #x3F)) |
| 315 | (r2 = r1) | 425 | ;; Now r2 holds scalar value |
| 316 | (r0 &= #x1f) | 426 | |
| 317 | (r0 <<= 6) | 427 | (if (r2 < 128) ; `overlong sequence' |
| 318 | (r1 &= #x3f) | 428 | ((call ccl-mule-utf-untrans) |
| 319 | (r1 += r0) | 429 | (r0 = r1) |
| 320 | ;; Now r1 holds scalar value | 430 | (call ccl-mule-utf-untrans) |
| 321 | 431 | (r0 = -1) | |
| 322 | (if (r1 < 128) ; `overlong sequence' | 432 | (repeat))) |
| 323 | ((if (r3 < #xa0) | 433 | |
| 324 | (write-multibyte-character r5 r3) | 434 | (r1 = r2) |
| 325 | (write-multibyte-character r6 r3)) | 435 | (if (r1 < 160) |
| 326 | (if (r2 < #x80) | 436 | ;; eight-bit-control |
| 327 | (write r2) | 437 | (r0 = ,(charset-id 'eight-bit-control)) |
| 328 | (if (r2 < #xa0) | 438 | (if (r1 < 256) |
| 329 | (write-multibyte-character r5 r2) | 439 | ;; latin-iso8859-1 |
| 330 | (write-multibyte-character r6 r2)))) | 440 | ((r0 = ,(charset-id 'latin-iso8859-1)) |
| 331 | 441 | (r1 -= 128)) | |
| 332 | ;; eight-bit-control | 442 | ;; mule-unicode-0100-24ff (< 0800) |
| 333 | (if (r1 < 160) | 443 | ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) |
| 334 | ((write-multibyte-character r5 r1)) | 444 | (r1 -= #x0100) |
| 335 | 445 | (r2 = (((r1 / 96) + 32) << 7)) | |
| 336 | ;; latin-iso8859-1 | 446 | (r1 %= 96) |
| 337 | (if (r1 < 256) | 447 | (r1 += (r2 + 32)) |
| 338 | ((r0 = ,(charset-id 'latin-iso8859-1)) | 448 | (translate-character |
| 339 | (r1 -= 128) | 449 | utf-translation-table-for-decode r0 r1)))) |
| 340 | (write-multibyte-character r0 r1)) | 450 | (write-multibyte-character r0 r1) |
| 341 | 451 | (r0 = -1) | |
| 342 | ;; mule-unicode-0100-24ff (< 0800) | 452 | (repeat))) |
| 343 | ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | 453 | |
| 344 | (r1 -= #x0100) | 454 | ;; Read the 3rd bytes. |
| 345 | (r2 = (((r1 / 96) + 32) << 7)) | 455 | (r2 = -1) |
| 346 | (r1 %= 96) | 456 | (read r2) |
| 347 | (r1 += (r2 + 32)) | 457 | (if ((r2 & #b11000000) != #b10000000) ; Invalid 3rd byte |
| 348 | (translate-character | 458 | ((call ccl-mule-utf-untrans) |
| 349 | utf-translation-table-for-decode r0 r1) | 459 | (r0 = r1) |
| 350 | (write-multibyte-character r0 r1)))))))) | 460 | (call ccl-mule-utf-untrans) |
| 351 | 461 | ;; Handle it in the next loop. | |
| 352 | ;; 3byte encoding | 462 | (r0 = r2) |
| 353 | ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx | 463 | (repeat))) |
| 354 | (if (r0 < #xf0) | 464 | |
| 355 | ((r1 = -1) | 465 | (if (r0 < #xF0) |
| 356 | (r2 = -1) | 466 | ;; 3byte encoding |
| 357 | (read r1 r2) | 467 | ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx |
| 358 | 468 | ((r3 = ((r0 & #xF) << 12)) | |
| 359 | ;; This is set to 1 if the encoding is invalid. | 469 | (r3 |= ((r1 & #x3F) << 6)) |
| 360 | (r4 = 0) | 470 | (r3 |= (r2 & #x3F)) |
| 361 | 471 | ||
| 362 | (r3 = (r1 & #b11000000)) | 472 | (if (r3 < #x800) ; `overlong sequence' |
| 363 | (r3 |= ((r2 >> 2) & #b00110000)) | 473 | ((call ccl-mule-utf-untrans) |
| 364 | (if (r3 != #b10100000) | 474 | (r0 = r1) |
| 365 | (r4 = 1) | 475 | (call ccl-mule-utf-untrans) |
| 366 | ((r3 = ((r0 & #x0f) << 12)) | 476 | (r0 = r2) |
| 367 | (r3 += ((r1 & #x3f) << 6)) | 477 | (call ccl-mule-utf-untrans) |
| 368 | (r3 += (r2 & #x3f)) | 478 | (r0 = -1) |
| 369 | (if (r3 < #x0800) | 479 | (repeat))) |
| 370 | (r4 = 1)))) | 480 | |
| 371 | 481 | (if (r3 < #x2500) | |
| 372 | (if (r4 != 0) | 482 | ;; mule-unicode-0100-24ff (>= 0800) |
| 373 | ;; Invalid 3-byte sequence | 483 | ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) |
| 374 | ((if (r0 < #xa0) | 484 | (r3 -= #x0100) |
| 375 | (write-multibyte-character r5 r0) | 485 | (r3 //= 96) |
| 376 | (write-multibyte-character r6 r0)) | 486 | (r1 = (r7 + 32)) |
| 377 | (if (r1 < #x80) | 487 | (r1 += ((r3 + 32) << 7)) |
| 378 | (write r1) | 488 | (translate-character |
| 379 | (if (r1 < #xa0) | 489 | utf-translation-table-for-decode r0 r1) |
| 380 | (write-multibyte-character r5 r1) | 490 | (write-multibyte-character r0 r1) |
| 381 | (write-multibyte-character r6 r1))) | 491 | (r0 = -1) |
| 382 | (if (r2 < #x80) | 492 | (repeat))) |
| 383 | (write r2) | 493 | |
| 384 | (if (r2 < #xa0) | 494 | (if (r3 < #x3400) |
| 385 | (write-multibyte-character r5 r2) | 495 | ;; mule-unicode-2500-33ff |
| 386 | (write-multibyte-character r6 r2)))) | 496 | ((r0 = r3) ; don't zap r3 |
| 387 | 497 | (lookup-integer utf-subst-table-for-decode r0 r1) | |
| 388 | ;; mule-unicode-0100-24ff (>= 0800) | 498 | (if (r7 == 0) |
| 389 | ((if (r3 < #x2500) | 499 | ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) |
| 390 | ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | 500 | (r3 -= #x2500) |
| 391 | (r3 -= #x0100) | 501 | (r3 //= 96) |
| 392 | (r3 //= 96) | 502 | (r1 = (r7 + 32)) |
| 393 | (r1 = (r7 + 32)) | 503 | (r1 += ((r3 + 32) << 7)))) |
| 394 | (r1 += ((r3 + 32) << 7)) | 504 | (write-multibyte-character r0 r1) |
| 395 | (translate-character | 505 | (r0 = -1) |
| 396 | utf-translation-table-for-decode r0 r1) | 506 | (repeat))) |
| 397 | (write-multibyte-character r0 r1)) | 507 | |
| 398 | 508 | (if (r3 < #xE000) | |
| 399 | ;; mule-unicode-2500-33ff | 509 | ;; Try to convert to CJK chars, else |
| 400 | (if (r3 < #x3400) | 510 | ;; keep them as eight-bit-{control|graphic}. |
| 401 | ((r4 = r3) ; don't zap r3 | 511 | ((r0 = r3) |
| 402 | (lookup-integer utf-subst-table-for-decode r4 r5) | 512 | (lookup-integer utf-subst-table-for-decode r3 r1) |
| 403 | (if r7 | 513 | (if r7 |
| 404 | ;; got a translation | 514 | ;; got a translation |
| 405 | ((write-multibyte-character r4 r5) | 515 | (write-multibyte-character r3 r1) |
| 406 | ;; Zapped through register starvation. | 516 | (call ccl-mule-utf-untrans)) |
| 407 | (r5 = ,(charset-id 'eight-bit-control))) | 517 | (r0 = -1) |
| 408 | ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) | 518 | (repeat))) |
| 409 | (r3 -= #x2500) | 519 | |
| 410 | (r3 //= 96) | 520 | ;; mule-unicode-e000-ffff |
| 411 | (r1 = (r7 + 32)) | 521 | ;; Fixme: fffe and ffff are invalid. |
| 412 | (r1 += ((r3 + 32) << 7)) | 522 | (r0 = r3) ; don't zap r3 |
| 413 | (write-multibyte-character r0 r1)))) | 523 | (lookup-integer utf-subst-table-for-decode r0 r1) |
| 414 | 524 | (if (r7 == 0) | |
| 415 | ;; U+3400 .. U+D7FF | 525 | ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) |
| 416 | ;; Try to convert to CJK chars, else keep | 526 | (r3 -= #xe000) |
| 417 | ;; them as eight-bit-{control|graphic}. | 527 | (r3 //= 96) |
| 418 | (if (r3 < #xd800) | 528 | (r1 = (r7 + 32)) |
| 419 | ((r4 = r3) ; don't zap r3 | 529 | (r1 += ((r3 + 32) << 7)))) |
| 420 | (lookup-integer utf-subst-table-for-decode r4 r5) | 530 | (write-multibyte-character r0 r1) |
| 421 | (if r7 | 531 | (r0 = -1) |
| 422 | ;; got a translation | 532 | (repeat))) |
| 423 | ((write-multibyte-character r4 r5) | 533 | |
| 424 | ;; Zapped through register starvation. | 534 | ;; Read the 4th bytes. |
| 425 | (r5 = ,(charset-id 'eight-bit-control))) | 535 | (r3 = -1) |
| 426 | ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic | 536 | (read r3) |
| 427 | ((r3 = r6) | 537 | (if ((r3 & #b11000000) != #b10000000) ; Invalid 4th byte |
| 428 | (write-multibyte-character r3 r0) | 538 | ((call ccl-mule-utf-untrans) |
| 429 | (if (r1 < #xa0) | 539 | (r0 = r1) |
| 430 | (r3 = r5)) | 540 | (call ccl-mule-utf-untrans) |
| 431 | (write-multibyte-character r3 r1) | 541 | ;; Handle it in the next loop. |
| 432 | (if (r2 < #xa0) | 542 | (r0 = r3) |
| 433 | (r3 = r5) | 543 | (repeat))) |
| 434 | (r3 = r6)) | 544 | |
| 435 | (write-multibyte-character r3 r2)))) | 545 | (if (r3 < #xF8) |
| 436 | 546 | ;; 4-byte encoding: | |
| 437 | ;; Surrogates, U+D800 .. U+DFFF | 547 | ;; wwwzzzzzzyyyyyyxxxxxx = 11110www 10zzzzzz 10yyyyyy 10xxxxxx |
| 438 | (if (r3 < #xe000) | 548 | ;; keep those bytes as eight-bit-{control|graphic} |
| 439 | ((r3 = r6) | 549 | ;; Fixme: allow lookup in utf-subst-table-for-decode. |
| 440 | (write-multibyte-character r3 r0) ; eight-bit-graphic | 550 | ((r4 = ((r0 & #x7) << 18)) |
| 441 | (if (r1 < #xa0) | 551 | (r4 |= ((r1 & #x3F) << 12)) |
| 442 | (r3 = r5)) | 552 | (r4 |= ((r2 & #x3F) << 6)) |
| 443 | (write-multibyte-character r3 r1) | 553 | (r4 |= (r3 & #x3F)) |
| 444 | (if (r2 < #xa0) | 554 | |
| 445 | (r3 = r5) | 555 | (if (r4 < #x10000) ; `overlong sequence' |
| 446 | (r3 = r6)) | 556 | ((call ccl-mule-utf-untrans) |
| 447 | (write-multibyte-character r3 r2)) | 557 | (r0 = r1) |
| 448 | 558 | (call ccl-mule-utf-untrans) | |
| 449 | ;; mule-unicode-e000-ffff | 559 | (r0 = r2) |
| 450 | ;; Fixme: fffe and ffff are invalid. | 560 | (call ccl-mule-utf-untrans) |
| 451 | ((r4 = r3) ; don't zap r3 | 561 | (r0 = r3) |
| 452 | (lookup-integer utf-subst-table-for-decode r4 r5) | 562 | (call ccl-mule-utf-untrans)) |
| 453 | (if r7 | 563 | ((r0 = r4) |
| 454 | ;; got a translation | 564 | (call ccl-mule-utf-untrans))) |
| 455 | ((write-multibyte-character r4 r5) | 565 | (r0 = -1) |
| 456 | ;; Zapped through register starvation. | 566 | (repeat))) |
| 457 | (r5 = ,(charset-id 'eight-bit-control))) | 567 | |
| 458 | ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) | 568 | ;; Unsupported sequence. |
| 459 | (r3 -= #xe000) | 569 | (call ccl-mule-utf-untrans) |
| 460 | (r3 //= 96) | 570 | (r0 = r1) |
| 461 | (r1 = (r7 + 32)) | 571 | (call ccl-mule-utf-untrans) |
| 462 | (r1 += ((r3 + 32) << 7)) | 572 | (r0 = r2) |
| 463 | (write-multibyte-character r0 r1))))))))))) | 573 | (call ccl-mule-utf-untrans) |
| 464 | 574 | (r0 = r3) | |
| 465 | (if (r0 < #xfe) | 575 | (call ccl-mule-utf-untrans) |
| 466 | ;; 4byte encoding | 576 | (r0 = -1) |
| 467 | ;; keep those bytes as eight-bit-{control|graphic} | ||
| 468 | ;; Fixme: allow lookup in utf-subst-table-for-decode. | ||
| 469 | ((r1 = -1) | ||
| 470 | (r2 = -1) | ||
| 471 | (r3 = -1) | ||
| 472 | (read r1 r2 r3) | ||
| 473 | ;; r0 > #xf0, thus eight-bit-graphic | ||
| 474 | (write-multibyte-character r6 r0) | ||
| 475 | (if (r1 < #xa0) | ||
| 476 | (if (r1 < #x80) ; invalid byte | ||
| 477 | (write r1) | ||
| 478 | (write-multibyte-character r5 r1)) | ||
| 479 | (write-multibyte-character r6 r1)) | ||
| 480 | (if (r2 < #xa0) | ||
| 481 | (if (r2 < #x80) ; invalid byte | ||
| 482 | (write r2) | ||
| 483 | (write-multibyte-character r5 r2)) | ||
| 484 | (write-multibyte-character r6 r2)) | ||
| 485 | (if (r3 < #xa0) | ||
| 486 | (if (r3 < #x80) ; invalid byte | ||
| 487 | (write r3) | ||
| 488 | (write-multibyte-character r5 r3)) | ||
| 489 | (write-multibyte-character r6 r3)) | ||
| 490 | (if (r0 >= #xf8) ; 5- or 6-byte encoding | ||
| 491 | ((r0 = -1) | ||
| 492 | (read r0) | ||
| 493 | (if (r0 < #xa0) | ||
| 494 | (if (r0 < #x80) ; invalid byte | ||
| 495 | (write r0) | ||
| 496 | (write-multibyte-character r5 r0)) | ||
| 497 | (write-multibyte-character r6 r0)) | ||
| 498 | (if (r0 >= #xfc) ; 6-byte | ||
| 499 | ((r0 = -1) | ||
| 500 | (read r0) | ||
| 501 | (if (r0 < #xa0) | ||
| 502 | (if (r0 < #x80) ; invalid byte | ||
| 503 | (write r0) | ||
| 504 | (write-multibyte-character r5 r0)) | ||
| 505 | (write-multibyte-character r6 r0))))))) | ||
| 506 | ;; else invalid byte >= #xfe | ||
| 507 | (write-multibyte-character r6 r0)))))) | ||
| 508 | (repeat))) | 577 | (repeat))) |
| 509 | 578 | ||
| 510 | ;; At EOF... | 579 | ;; At EOF... |
| 511 | (if (r0 >= 0) | 580 | (if (r0 >= 0) |
| 512 | ((if (r0 < #x80) | 581 | ;; r0 >= #x80 |
| 513 | (write r0) | 582 | ((call ccl-mule-utf-untrans) |
| 514 | (if (r0 < #xa0) | ||
| 515 | (write-multibyte-character r5 r0) | ||
| 516 | ((write-multibyte-character r6 r0)))) | ||
| 517 | (if (r1 >= 0) | 583 | (if (r1 >= 0) |
| 518 | ((if (r1 < #x80) | 584 | ((r0 = r1) |
| 519 | (write r1) | 585 | (call ccl-mule-utf-untrans) |
| 520 | (if (r1 < #xa0) | ||
| 521 | (write-multibyte-character r5 r1) | ||
| 522 | ((write-multibyte-character r6 r1)))) | ||
| 523 | (if (r2 >= 0) | 586 | (if (r2 >= 0) |
| 524 | ((if (r2 < #x80) | 587 | ((r0 = r2) |
| 525 | (write r2) | 588 | (call ccl-mule-utf-untrans) |
| 526 | (if (r2 < #xa0) | ||
| 527 | (write-multibyte-character r5 r2) | ||
| 528 | ((write-multibyte-character r6 r2)))) | ||
| 529 | (if (r3 >= 0) | 589 | (if (r3 >= 0) |
| 530 | (if (r3 < #x80) | 590 | ((r0 = r3) |
| 531 | (write r3) | 591 | (call ccl-mule-utf-untrans)))))))))) |
| 532 | (if (r3 < #xa0) | ||
| 533 | (write-multibyte-character r5 r3) | ||
| 534 | ((write-multibyte-character r6 r3)))))))))))) | ||
| 535 | 592 | ||
| 536 | "CCL program to decode UTF-8. | 593 | "CCL program to decode UTF-8. |
| 537 | Basic decoding is done into the charsets ascii, latin-iso8859-1 and | 594 | Basic decoding is done into the charsets ascii, latin-iso8859-1 and |
| @@ -540,164 +597,206 @@ mule-unicode-*, but see also `utf-fragmentation-table' and | |||
| 540 | Encodings of un-representable Unicode characters are decoded asis into | 597 | Encodings of un-representable Unicode characters are decoded asis into |
| 541 | eight-bit-control and eight-bit-graphic characters.") | 598 | eight-bit-control and eight-bit-graphic characters.") |
| 542 | 599 | ||
| 600 | (define-ccl-program ccl-mule-utf-8-encode-untrans | ||
| 601 | ;; UTF-8 decoder generates an UTF-8 sequence represented by a | ||
| 602 | ;; sequence eight-bit-control/graphic chars for an untranslatable | ||
| 603 | ;; character and an invalid byte. | ||
| 604 | ;; | ||
| 605 | ;; This CCL parses that sequence (the first byte is already in r1), | ||
| 606 | ;; writes out the original bytes of that sequence, and sets r5 to | ||
| 607 | ;; -1. | ||
| 608 | ;; | ||
| 609 | ;; If the eight-bit-control/graphic sequence is shorter than what r1 | ||
| 610 | ;; suggests, it sets r5 and r6 to the last character read that | ||
| 611 | ;; should be handled by the next loop of a caller. | ||
| 612 | ;; | ||
| 613 | ;; Note: For UTF-8 validation, we only check if a character is | ||
| 614 | ;; eight-bit-control/graphic or not. It may result in incorrect | ||
| 615 | ;; handling of random binary data, but such a data can't be encoded | ||
| 616 | ;; by UTF-8 anyway. At least, UTF-8 decoders doesn't generate such | ||
| 617 | ;; a sequence even if a source contains invalid byte-sequence. | ||
| 618 | `(0 | ||
| 619 | (;; Read the 2nd byte. | ||
| 620 | (read-multibyte-character r5 r6) | ||
| 621 | (r0 = (r5 != ,(charset-id 'eight-bit-control))) | ||
| 622 | (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) | ||
| 623 | ((write r1) ; invalid UTF-8 | ||
| 624 | (r1 = -1) | ||
| 625 | (end))) | ||
| 626 | |||
| 627 | (if (r1 <= #xC3) | ||
| 628 | ;; 2-byte sequence for an originally invalid byte. | ||
| 629 | ((r6 &= #x3F) | ||
| 630 | (r6 |= ((r1 & #x1F) << 6)) | ||
| 631 | (write r6) | ||
| 632 | (r5 = -1) | ||
| 633 | (end))) | ||
| 634 | |||
| 635 | (write r1 r6) | ||
| 636 | (r2 = r1) | ||
| 637 | (r1 = -1) | ||
| 638 | ;; Read the 3rd byte. | ||
| 639 | (read-multibyte-character r5 r6) | ||
| 640 | (r0 = (r5 != ,(charset-id 'eight-bit-control))) | ||
| 641 | (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) | ||
| 642 | (end)) ; invalid UTF-8 | ||
| 643 | (write r6) | ||
| 644 | (if (r2 < #xF0) | ||
| 645 | ;; 3-byte sequence for an untranslated character. | ||
| 646 | ((r5 = -1) | ||
| 647 | (end))) | ||
| 648 | ;; Read the 4th byte. | ||
| 649 | (read-multibyte-character r5 r6) | ||
| 650 | (r0 = (r5 != ,(charset-id 'eight-bit-control))) | ||
| 651 | (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) | ||
| 652 | (end)) ; invalid UTF-8 | ||
| 653 | ;; 4-byte sequence for an untranslated character. | ||
| 654 | (write r6) | ||
| 655 | (r5 = -1) | ||
| 656 | (end)) | ||
| 657 | |||
| 658 | ;; At EOF... | ||
| 659 | ((r5 = -1) | ||
| 660 | (if (r1 >= 0) | ||
| 661 | (write r1))))) | ||
| 662 | |||
| 543 | (define-ccl-program ccl-encode-mule-utf-8 | 663 | (define-ccl-program ccl-encode-mule-utf-8 |
| 544 | `(1 | 664 | `(1 |
| 545 | ((r5 = -1) | 665 | ((r5 = -1) |
| 546 | (loop | 666 | (loop |
| 547 | (if (r5 < 0) | 667 | (if (r5 < 0) |
| 548 | ((r1 = -1) | 668 | (read-multibyte-character r0 r1) |
| 549 | (read-multibyte-character r0 r1) | 669 | ;; Pre-read character is in r5 (charset-ID) and r6 (code-point). |
| 550 | (translate-character utf-translation-table-for-encode r0 r1)) | 670 | ((r0 = r5) |
| 551 | (;; We have already done read-multibyte-character. | ||
| 552 | (r0 = r5) | ||
| 553 | (r1 = r6) | 671 | (r1 = r6) |
| 554 | (r5 = -1))) | 672 | (r5 = -1))) |
| 673 | (translate-character utf-translation-table-for-encode r0 r1) | ||
| 555 | 674 | ||
| 556 | (if (r0 == ,(charset-id 'ascii)) | 675 | (if (r0 == ,(charset-id 'ascii)) |
| 557 | (write r1) | 676 | (write-repeat r1)) |
| 558 | 677 | ||
| 559 | (if (r0 == ,(charset-id 'latin-iso8859-1)) | 678 | (if (r0 == ,(charset-id 'latin-iso8859-1)) |
| 560 | ;; r1 scalar utf-8 | 679 | ;; r1 scalar utf-8 |
| 561 | ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx | 680 | ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx |
| 562 | ;; 20 0000 0000 1010 0000 1100 0010 1010 0000 | 681 | ;; 20 0000 0000 1010 0000 1100 0010 1010 0000 |
| 563 | ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111 | 682 | ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111 |
| 564 | ((r0 = (((r1 & #x40) >> 6) | #xc2)) | 683 | ((r0 = (((r1 & #x40) >> 6) | #xc2)) |
| 565 | (r1 &= #x3f) | 684 | (r1 &= #x3f) |
| 685 | (r1 |= #x80) | ||
| 686 | (write r0) | ||
| 687 | (write-repeat r1))) | ||
| 688 | |||
| 689 | (if (r0 == ,(charset-id 'mule-unicode-0100-24ff)) | ||
| 690 | ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | ||
| 691 | ;; #x3f80 == (0011 1111 1000 0000)b | ||
| 692 | (r1 &= #x7f) | ||
| 693 | (r1 += (r0 + 224)) ; 240 == -32 + #x0100 | ||
| 694 | ;; now r1 holds scalar value | ||
| 695 | (if (r1 < #x0800) | ||
| 696 | ;; 2byte encoding | ||
| 697 | ((write ((r1 >> 6) | #xC0)) | ||
| 698 | (r1 &= #x3F) | ||
| 699 | (r1 |= #x80) | ||
| 700 | (write-repeat r1)) | ||
| 701 | ;; 3byte encoding | ||
| 702 | ((write ((r1 >> 12) | #xE0)) | ||
| 703 | (write (((r1 & #x0FC0) >> 6) | #x80)) | ||
| 704 | (r1 &= #x3F) | ||
| 705 | (r1 |= #x80) | ||
| 706 | (write-repeat r1))))) | ||
| 707 | |||
| 708 | (if (r0 == ,(charset-id 'mule-unicode-2500-33ff)) | ||
| 709 | ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | ||
| 710 | (r1 &= #x7f) | ||
| 711 | (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500 | ||
| 712 | ;; now r1 holds scalar value | ||
| 713 | (write ((r1 >> 12) | #xE0)) | ||
| 714 | (write (((r1 & #x0FC0) >> 6) | #x80)) | ||
| 715 | (r1 &= #x3F) | ||
| 716 | (r1 |= #x80) | ||
| 717 | (write-repeat r1))) | ||
| 718 | |||
| 719 | (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) | ||
| 720 | ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | ||
| 721 | (r1 &= #x7f) | ||
| 722 | (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000 | ||
| 723 | ;; now r1 holds scalar value | ||
| 724 | (write ((r1 >> 12) | #xE0)) | ||
| 725 | (write (((r1 & #x0FC0) >> 6) | #x80)) | ||
| 726 | (r1 &= #x3F) | ||
| 727 | (r1 |= #x80) | ||
| 728 | (write-repeat r1))) | ||
| 729 | |||
| 730 | (if (r0 == ,(charset-id 'eight-bit-control)) | ||
| 731 | ;; r1 scalar utf-8 | ||
| 732 | ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx | ||
| 733 | ;; 80 0000 0000 1000 0000 1100 0010 1000 0000 | ||
| 734 | ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111 | ||
| 735 | ((write #xC2) | ||
| 736 | (write-repeat r1))) | ||
| 737 | |||
| 738 | (if (r0 == ,(charset-id 'eight-bit-graphic)) | ||
| 739 | ;; r1 scalar utf-8 | ||
| 740 | ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx | ||
| 741 | ;; a0 0000 0000 1010 0000 1100 0010 1010 0000 | ||
| 742 | ;; ff 0000 0000 1111 1111 1101 1111 1011 1111 | ||
| 743 | ((r0 = (r1 >= #xC0)) | ||
| 744 | (r0 &= (r1 <= #xC3)) | ||
| 745 | (r4 = (r1 >= #xE1)) | ||
| 746 | (r4 &= (r1 <= #xF7)) | ||
| 747 | (r0 |= r4) | ||
| 748 | (if r0 | ||
| 749 | ((call ccl-mule-utf-8-encode-untrans) | ||
| 750 | (repeat)) | ||
| 751 | (write-repeat r1)))) | ||
| 752 | |||
| 753 | (lookup-character utf-subst-table-for-encode r0 r1) | ||
| 754 | (if r7 ; lookup succeeded | ||
| 755 | (if (r0 < #x800) | ||
| 756 | ;; 2byte encoding | ||
| 757 | ((write ((r0 >> 6) | #xC0)) | ||
| 758 | (r1 &= #x3F) | ||
| 759 | (r1 |= #x80) | ||
| 760 | (write-repeat r1)) | ||
| 761 | ;; 3byte encoding | ||
| 762 | ((write ((r0 >> 12) | #xE0)) | ||
| 763 | (write (((r0 & #x0FC0) >> 6) | #x80)) | ||
| 764 | (r1 &= #x3F) | ||
| 566 | (r1 |= #x80) | 765 | (r1 |= #x80) |
| 567 | (write r0 r1)) | 766 | (write-repeat r1)))) |
| 568 | |||
| 569 | (if (r0 == ,(charset-id 'mule-unicode-0100-24ff)) | ||
| 570 | ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | ||
| 571 | ;; #x3f80 == (0011 1111 1000 0000)b | ||
| 572 | (r1 &= #x7f) | ||
| 573 | (r1 += (r0 + 224)) ; 240 == -32 + #x0100 | ||
| 574 | ;; now r1 holds scalar value | ||
| 575 | (if (r1 < #x0800) | ||
| 576 | ;; 2byte encoding | ||
| 577 | ((r0 = (((r1 & #x07c0) >> 6) | #xc0)) | ||
| 578 | ;; #x07c0 == (0000 0111 1100 0000)b | ||
| 579 | (r1 &= #x3f) | ||
| 580 | (r1 |= #x80) | ||
| 581 | (write r0 r1)) | ||
| 582 | ;; 3byte encoding | ||
| 583 | ((r0 = (((r1 & #xf000) >> 12) | #xe0)) | ||
| 584 | (r2 = ((r1 & #x3f) | #x80)) | ||
| 585 | (r1 &= #x0fc0) | ||
| 586 | (r1 >>= 6) | ||
| 587 | (r1 |= #x80) | ||
| 588 | (write r0 r1 r2)))) | ||
| 589 | |||
| 590 | (if (r0 == ,(charset-id 'mule-unicode-2500-33ff)) | ||
| 591 | ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | ||
| 592 | (r1 &= #x7f) | ||
| 593 | (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500 | ||
| 594 | (r0 = (((r1 & #xf000) >> 12) | #xe0)) | ||
| 595 | (r2 = ((r1 & #x3f) | #x80)) | ||
| 596 | (r1 &= #x0fc0) | ||
| 597 | (r1 >>= 6) | ||
| 598 | (r1 |= #x80) | ||
| 599 | (write r0 r1 r2)) | ||
| 600 | |||
| 601 | (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) | ||
| 602 | ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) | ||
| 603 | (r1 &= #x7f) | ||
| 604 | (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000 | ||
| 605 | (r0 = (((r1 & #xf000) >> 12) | #xe0)) | ||
| 606 | (r2 = ((r1 & #x3f) | #x80)) | ||
| 607 | (r1 &= #x0fc0) | ||
| 608 | (r1 >>= 6) | ||
| 609 | (r1 |= #x80) | ||
| 610 | (write r0 r1 r2)) | ||
| 611 | |||
| 612 | (if (r0 == ,(charset-id 'eight-bit-control)) | ||
| 613 | ;; r1 scalar utf-8 | ||
| 614 | ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx | ||
| 615 | ;; 80 0000 0000 1000 0000 1100 0010 1000 0000 | ||
| 616 | ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111 | ||
| 617 | ((write #xc2) | ||
| 618 | (write r1)) | ||
| 619 | |||
| 620 | (if (r0 == ,(charset-id 'eight-bit-graphic)) | ||
| 621 | ;; r1 scalar utf-8 | ||
| 622 | ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx | ||
| 623 | ;; a0 0000 0000 1010 0000 1100 0010 1010 0000 | ||
| 624 | ;; ff 0000 0000 1111 1111 1101 1111 1011 1111 | ||
| 625 | ((write r1) | ||
| 626 | (r1 = -1) | ||
| 627 | (read-multibyte-character r0 r1) | ||
| 628 | (if (r0 != ,(charset-id 'eight-bit-graphic)) | ||
| 629 | (if (r0 != ,(charset-id 'eight-bit-control)) | ||
| 630 | ((r5 = r0) | ||
| 631 | (r6 = r1)))) | ||
| 632 | (if (r5 < 0) | ||
| 633 | ((read-multibyte-character r0 r2) | ||
| 634 | (if (r0 != ,(charset-id 'eight-bit-graphic)) | ||
| 635 | (if (r0 != ,(charset-id 'eight-bit-control)) | ||
| 636 | ((r5 = r0) | ||
| 637 | (r6 = r2)))) | ||
| 638 | (if (r5 < 0) | ||
| 639 | (write r1 r2) | ||
| 640 | (if (r1 < #xa0) | ||
| 641 | (write r1) | ||
| 642 | ((write #xc2) | ||
| 643 | (write r1))))))) | ||
| 644 | |||
| 645 | ((lookup-character utf-subst-table-for-encode r0 r1) | ||
| 646 | (if r7 ; lookup succeeded | ||
| 647 | ((r1 = (((r0 & #xf000) >> 12) | #xe0)) | ||
| 648 | (r2 = ((r0 & #x3f) | #x80)) | ||
| 649 | (r0 &= #x0fc0) | ||
| 650 | (r0 >>= 6) | ||
| 651 | (r0 |= #x80) | ||
| 652 | (write r1 r0 r2)) | ||
| 653 | ;; Unsupported character. | ||
| 654 | ;; Output U+FFFD, which is `ef bf bd' in UTF-8. | ||
| 655 | ((write #xef) | ||
| 656 | (write #xbf) | ||
| 657 | (write #xbd))))))))))) | ||
| 658 | (repeat))) | ||
| 659 | (if (r1 >= #xa0) | ||
| 660 | (write r1) | ||
| 661 | (if (r1 >= #x80) | ||
| 662 | ((write #xc2) | ||
| 663 | (write r1))))) | ||
| 664 | 767 | ||
| 768 | ;; Unsupported character. | ||
| 769 | ;; Output U+FFFD, which is `ef bf bd' in UTF-8. | ||
| 770 | (write #xef) | ||
| 771 | (write #xbf) | ||
| 772 | (write-repeat #xbd)))) | ||
| 665 | "CCL program to encode into UTF-8.") | 773 | "CCL program to encode into UTF-8.") |
| 666 | 774 | ||
| 667 | 775 | ||
| 668 | (define-ccl-program ccl-untranslated-to-ucs | 776 | (define-ccl-program ccl-untranslated-to-ucs |
| 669 | `(0 | 777 | `(0 |
| 670 | (if (r0 < #xf0) ; 3-byte encoding, as above | 778 | (if (r1 == 0) |
| 671 | ((r4 = 0) | 779 | nil |
| 672 | (r3 = (r1 & #b11000000)) | 780 | (if (r0 <= #xC3) ; 2-byte encoding |
| 673 | (r3 |= ((r2 >> 2) & #b00110000)) | 781 | ((r0 = ((r0 & #x3) << 6)) |
| 674 | (if (r3 != #b10100000) | 782 | (r0 |= (r1 & #x3F)) |
| 675 | (r4 = 1) | 783 | (r1 = 2)) |
| 676 | ((r3 = ((r0 & #x0f) << 12)) | 784 | (if (r2 == 0) |
| 677 | (r3 += ((r1 & #x3f) << 6)) | 785 | (r1 = 0) |
| 678 | (r3 += (r2 & #x3f)) | 786 | (if (r0 < #xF0) ; 3-byte encoding, as above |
| 679 | (if (r3 < #x0800) | 787 | ((r0 = ((r0 & #xF) << 12)) |
| 680 | (r4 = 1)))) | 788 | (r0 |= ((r1 & #x3F) << 6)) |
| 681 | (if (r4 != 0) | 789 | (r0 |= (r1 & #x3F)) |
| 682 | (r0 = 0) | 790 | (r1 = 3)) |
| 683 | (r0 = r3))) | 791 | (if (r3 == 0) |
| 684 | (if (r0 < #xf8) ; 4-byte (Mule-UCS recipe) | 792 | (r1 = 0) |
| 685 | ((r4 = (r1 >> 6)) | 793 | ((r0 = ((r0 & #x7) << 18)) |
| 686 | (if (r4 != #b10) | 794 | (r0 |= ((r1 & #x3F) << 12)) |
| 687 | (r0 = 0) | 795 | (r0 |= ((r2 & #x3F) << 6)) |
| 688 | ((r4 = (r2 >> 6)) | 796 | (r0 |= (r3 & #x3F)) |
| 689 | (if (r4 != #b10) | 797 | (r1 = 4)))))))) |
| 690 | (r0 = 0) | 798 | "Decode 2-, 3-, or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0. |
| 691 | ((r4 = (r3 >> 6)) | 799 | Set r1 to the byte length. r0 == 0 for invalid sequence.") |
| 692 | (if (r4 != #b10) | ||
| 693 | (r0 = 0) | ||
| 694 | ((r1 = ((r1 & #x3F) << 12)) | ||
| 695 | (r2 = ((r2 & #x3F) << 6)) | ||
| 696 | (r3 &= #x3F) | ||
| 697 | (r0 = (((((r0 & #x07) << 18) | r1) | r2) | r3))))))))) | ||
| 698 | (r0 = 0)))) | ||
| 699 | "Decode 3- or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0. | ||
| 700 | r0 == 0 for invalid sequence.") | ||
| 701 | 800 | ||
| 702 | (defvar utf-8-ccl-regs (make-vector 8 0)) | 801 | (defvar utf-8-ccl-regs (make-vector 8 0)) |
| 703 | 802 | ||
| @@ -708,33 +807,47 @@ Only for 3- or 4-byte sequences." | |||
| 708 | (aset utf-8-ccl-regs 1 (or (char-after (1+ (point))) 0)) | 807 | (aset utf-8-ccl-regs 1 (or (char-after (1+ (point))) 0)) |
| 709 | (aset utf-8-ccl-regs 2 (or (char-after (+ 2 (point))) 0)) | 808 | (aset utf-8-ccl-regs 2 (or (char-after (+ 2 (point))) 0)) |
| 710 | (aset utf-8-ccl-regs 3 (or (char-after (+ 3 (point))) 0)) | 809 | (aset utf-8-ccl-regs 3 (or (char-after (+ 3 (point))) 0)) |
| 711 | (ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs) | 810 | (ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs)) |
| 712 | (aref utf-8-ccl-regs 0)) | ||
| 713 | 811 | ||
| 714 | (defun utf-8-help-echo (window object position) | 812 | (defun utf-8-help-echo (window object position) |
| 715 | (format "Untranslated Unicode U+%04X" | 813 | (format "Untranslated Unicode U+%04X" |
| 716 | (get-char-property position 'untranslated-utf-8 object))) | 814 | (get-char-property position 'untranslated-utf-8 object))) |
| 717 | 815 | ||
| 718 | ;; We compose the untranslatable sequences into a single character. | 816 | ;; We compose the untranslatable sequences into a single character, |
| 817 | ;; and move point to the next character. | ||
| 719 | ;; This is infelicitous for editing, because there's currently no | 818 | ;; This is infelicitous for editing, because there's currently no |
| 720 | ;; mechanism for treating compositions as atomic, but is OK for | 819 | ;; mechanism for treating compositions as atomic, but is OK for |
| 721 | ;; display. They are composed to U+FFFD with help-echo which | 820 | ;; display. They are composed to U+FFFD with help-echo which |
| 722 | ;; indicates the unicodes they represent. This function GCs too much. | 821 | ;; indicates the unicodes they represent. This function GCs too much. |
| 723 | (defsubst utf-8-compose () | 822 | |
| 724 | "Put a suitable composition on an untranslatable sequence. | 823 | ;; If utf-translate-cjk-mode is non-nil, this function is called with |
| 725 | Return the sequence's length." | 824 | ;; HASH-TABLE which translates CJK characters into some of CJK |
| 726 | (let* ((u (utf-8-untranslated-to-ucs)) | 825 | ;; charsets. |
| 727 | (l (unless (zerop u) | 826 | |
| 728 | (if (>= u #x10000) | 827 | (defsubst utf-8-compose (hash-table) |
| 729 | 4 | 828 | "Put a suitable composition on an untranslatable sequence at point. |
| 730 | 3)))) | 829 | If HASH-TABLE is non-nil, try to translate CJK characters by it at first. |
| 731 | (when l | 830 | Move point to the end of the sequence." |
| 732 | (put-text-property (point) (min (point-max) (+ l (point))) | 831 | (utf-8-untranslated-to-ucs) |
| 733 | 'untranslated-utf-8 u) | 832 | (let ((l (aref utf-8-ccl-regs 1)) |
| 734 | (put-text-property (point) (min (point-max) (+ l (point))) | 833 | ch) |
| 735 | 'help-echo 'utf-8-help-echo) | 834 | (if (> l 0) |
| 736 | (compose-region (point) (+ l (point)) ?$,3u=(B) | 835 | (if (and hash-table |
| 737 | l))) | 836 | (setq ch (gethash (aref utf-8-ccl-regs 0) hash-table))) |
| 837 | (progn | ||
| 838 | (insert ch) | ||
| 839 | (delete-region (point) (min (point-max) (+ l (point))))) | ||
| 840 | (setq ch (aref utf-8-ccl-regs 0)) | ||
| 841 | (put-text-property (point) (min (point-max) (+ l (point))) | ||
| 842 | 'untranslated-utf-8 ch) | ||
| 843 | (put-text-property (point) (min (point-max) (+ l (point))) | ||
| 844 | 'help-echo 'utf-8-help-echo) | ||
| 845 | (if (= l 2) | ||
| 846 | (put-text-property (point) (min (point-max) (+ l (point))) | ||
| 847 | 'display (format "\\%03o" ch)) | ||
| 848 | (compose-region (point) (+ l (point)) ?$,3u=(B)) | ||
| 849 | (forward-char l)) | ||
| 850 | (forward-char 1)))) | ||
| 738 | 851 | ||
| 739 | (defcustom utf-8-compose-scripts nil | 852 | (defcustom utf-8-compose-scripts nil |
| 740 | "*Non-nil means compose various scripts on decoding utf-8 text." | 853 | "*Non-nil means compose various scripts on decoding utf-8 text." |
| @@ -744,38 +857,63 @@ Return the sequence's length." | |||
| 744 | 857 | ||
| 745 | (defun utf-8-post-read-conversion (length) | 858 | (defun utf-8-post-read-conversion (length) |
| 746 | "Compose untranslated utf-8 sequences into single characters. | 859 | "Compose untranslated utf-8 sequences into single characters. |
| 860 | If `utf-translate-cjk-mode' is non-nil, tries to translate CJK characters. | ||
| 747 | Also compose particular scripts if `utf-8-compose-scripts' is non-nil." | 861 | Also compose particular scripts if `utf-8-compose-scripts' is non-nil." |
| 748 | (save-excursion | 862 | (save-excursion |
| 749 | ;; Can't do eval-when-compile to insert a multibyte constant | 863 | (save-restriction |
| 750 | ;; version of the string in the loop, since it's always loaded as | 864 | (narrow-to-region (point) (+ (point) length)) |
| 751 | ;; unibyte from a byte-compiled file. | 865 | ;; Can't do eval-when-compile to insert a multibyte constant |
| 752 | (let ((range (string-as-multibyte "^\xe1-\xf7"))) | 866 | ;; version of the string in the loop, since it's always loaded as |
| 753 | (while (and (skip-chars-forward range) | 867 | ;; unibyte from a byte-compiled file. |
| 754 | (not (eobp))) | 868 | (let ((range (string-as-multibyte "^\xc0-\xc3\xe1-\xf7")) |
| 755 | (forward-char (utf-8-compose))))) | 869 | hash-table ch) |
| 756 | ;; Fixme: Takahashi-san implies it may not work this easily. I | 870 | (when utf-translate-cjk-mode |
| 757 | ;; asked why but didn't get a reply. -- fx | 871 | (if (not utf-translate-cjk-lang-env) |
| 758 | (when (and utf-8-compose-scripts (> length 1)) | 872 | ;; Check these characters: |
| 759 | ;; These currently have definitions which cover the relevant | 873 | ;; "U+2e80-U+33ff", "U+ff00-U+ffef" |
| 760 | ;; unicodes. We could avoid loading thai-util &c by checking | 874 | ;; We may have to translate them to CJK charsets. |
| 761 | ;; whether the region contains any characters with the appropriate | 875 | (let ((range2 "$,29@(B-$,2G$,3r`(B-$,3u/(B")) |
| 762 | ;; categories. There aren't yet Unicode-based rules for Tibetan. | 876 | (skip-chars-forward (concat range range2)) |
| 763 | (save-excursion (setq length (diacritic-post-read-conversion length))) | 877 | (unless (eobp) |
| 764 | (save-excursion (setq length (thai-post-read-conversion length))) | 878 | (utf-translate-cjk-load-tables) |
| 765 | (save-excursion (setq length (lao-post-read-conversion length))) | 879 | (setq range (concat range range2))) |
| 766 | (save-excursion (setq length (devanagari-post-read-conversion length))) | 880 | (setq hash-table (get 'utf-subst-table-for-decode |
| 767 | (save-excursion (setq length (malayalam-post-read-conversion length))) | 881 | 'translation-hash-table))))) |
| 768 | (save-excursion (setq length (tamil-post-read-conversion length)))) | 882 | (while (and (skip-chars-forward range) |
| 769 | length) | 883 | (not (eobp))) |
| 770 | 884 | (setq ch (following-char)) | |
| 771 | ;; ucs-tables is preloaded | 885 | (if (< ch 256) |
| 772 | ;; (defun utf-8-pre-write-conversion (beg end) | 886 | (utf-8-compose hash-table) |
| 773 | ;; "Semi-dummy pre-write function effectively to autoload ucs-tables." | 887 | (if (and hash-table |
| 774 | ;; ;; Ensure translation-table is loaded. | 888 | (setq ch (gethash (encode-char ch 'ucs) hash-table))) |
| 775 | ;; (require 'ucs-tables) | 889 | (progn |
| 776 | ;; ;; Don't do this again. | 890 | (insert ch) |
| 777 | ;; (coding-system-put 'mule-utf-8 'pre-write-conversion nil) | 891 | (delete-char 1)) |
| 778 | ;; nil) | 892 | (forward-char 1))))) |
| 893 | |||
| 894 | (when (and utf-8-compose-scripts (> length 1)) | ||
| 895 | ;; These currently have definitions which cover the relevant | ||
| 896 | ;; unicodes. We could avoid loading thai-util &c by checking | ||
| 897 | ;; whether the region contains any characters with the appropriate | ||
| 898 | ;; categories. There aren't yet Unicode-based rules for Tibetan. | ||
| 899 | (diacritic-compose-region (point-max) (point-min)) | ||
| 900 | (thai-compose-region (point-max) (point-min)) | ||
| 901 | (lao-compose-region (point-max) (point-min)) | ||
| 902 | (devanagari-compose-region (point-max) (point-min)) | ||
| 903 | (malayalam-compose-region (point-max) (point-min)) | ||
| 904 | (tamil-compose-region (point-max) (point-min))) | ||
| 905 | (- (point-max) (point-min))))) | ||
| 906 | |||
| 907 | (defun utf-8-pre-write-conversion (beg end) | ||
| 908 | "Prepare for `utf-translate-cjk-mode' to encode text between BEG and END. | ||
| 909 | This is used as a post-read-conversion of utf-8 coding system." | ||
| 910 | (if (and utf-translate-cjk-mode | ||
| 911 | (not utf-translate-cjk-lang-env) | ||
| 912 | (save-excursion | ||
| 913 | (goto-char beg) | ||
| 914 | (re-search-forward "\\cc\\|\\cj\\|\\ch" end t))) | ||
| 915 | (utf-translate-cjk-load-tables)) | ||
| 916 | nil) | ||
| 779 | 917 | ||
| 780 | (make-coding-system | 918 | (make-coding-system |
| 781 | 'mule-utf-8 4 ?u | 919 | 'mule-utf-8 4 ?u |
| @@ -797,18 +935,20 @@ any of the character sets listed above are encoded into the UTF-8 byte | |||
| 797 | sequence representing U+FFFD (REPLACEMENT CHARACTER)." | 935 | sequence representing U+FFFD (REPLACEMENT CHARACTER)." |
| 798 | 936 | ||
| 799 | '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) | 937 | '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) |
| 800 | '((safe-charsets | 938 | `((safe-charsets |
| 801 | ascii | 939 | ascii |
| 802 | eight-bit-control | 940 | eight-bit-control |
| 803 | eight-bit-graphic | 941 | eight-bit-graphic |
| 804 | latin-iso8859-1 | 942 | latin-iso8859-1 |
| 805 | mule-unicode-0100-24ff | 943 | mule-unicode-0100-24ff |
| 806 | mule-unicode-2500-33ff | 944 | mule-unicode-2500-33ff |
| 807 | mule-unicode-e000-ffff) | 945 | mule-unicode-e000-ffff |
| 946 | ,@(if utf-translate-cjk-mode | ||
| 947 | utf-translate-cjk-charsets)) | ||
| 808 | (mime-charset . utf-8) | 948 | (mime-charset . utf-8) |
| 809 | (coding-category . coding-category-utf-8) | 949 | (coding-category . coding-category-utf-8) |
| 810 | (valid-codes (0 . 255)) | 950 | (valid-codes (0 . 255)) |
| 811 | ;; (pre-write-conversion . utf-8-pre-write-conversion) | 951 | (pre-write-conversion . utf-8-pre-write-conversion) |
| 812 | (post-read-conversion . utf-8-post-read-conversion) | 952 | (post-read-conversion . utf-8-post-read-conversion) |
| 813 | (translation-table-for-encode . utf-translation-table-for-encode) | 953 | (translation-table-for-encode . utf-translation-table-for-encode) |
| 814 | (dependency unify-8859-on-encoding-mode | 954 | (dependency unify-8859-on-encoding-mode |