diff options
| author | Kenichi Handa | 2002-09-30 06:37:00 +0000 |
|---|---|---|
| committer | Kenichi Handa | 2002-09-30 06:37:00 +0000 |
| commit | 278ce9363dfd266930e2aa16b2e3c112ea08baf0 (patch) | |
| tree | a278593be620a90484f4b1611b8d769686675ba2 | |
| parent | 121223a948cdf1d09bc00d46b056ba9c3a311917 (diff) | |
| download | emacs-278ce9363dfd266930e2aa16b2e3c112ea08baf0.tar.gz emacs-278ce9363dfd266930e2aa16b2e3c112ea08baf0.zip | |
(utf-16-decode-ucs): Look up
utf-subst-table-for-decode. Fix the case where the lookup
succeeds.
(ccl-decode-mule-utf-16-le): Translate characters by
utf-translation-table-for-decode.
(ccl-decode-mule-utf-16-be): Likewise.
(ccl-encode-mule-utf-16-le): Look up utf-subst-table-for-encode
first. Translate characters by
utf-translation-table-for-encode.
(ccl-encode-mule-utf-16-be): Likewise.
(mule-utf-16-le, mule-utf-16-be): Add `dependency' property.
| -rw-r--r-- | lisp/international/utf-16.el | 159 |
1 files changed, 85 insertions, 74 deletions
diff --git a/lisp/international/utf-16.el b/lisp/international/utf-16.el
index 7786795f4a5..6ccac419f57 100644
--- a/lisp/international/utf-16.el
+++ b/lisp/international/utf-16.el
| @@ -68,48 +68,49 @@ | |||
| 68 | ;; Needed in macro expansion, so can't be let-bound. Zapped after use. | 68 | ;; Needed in macro expansion, so can't be let-bound. Zapped after use. |
| 69 | (eval-and-compile | 69 | (eval-and-compile |
| 70 | (defconst utf-16-decode-ucs | 70 | (defconst utf-16-decode-ucs |
| 71 | ;; We have the unicode in r1. Output is character codes in r0, r1, | 71 | ;; We have the unicode in r1. Output is charset ID in r0, code point |
| 72 | ;; and r2 if appropriate. | 72 | ;; in r1. |
| 73 | `((lookup-integer utf-8-subst-table r0 r3) | 73 | `((lookup-integer utf-subst-table-for-decode r1 r3) |
| 74 | (if r7 (r1 = r3)) ; got a translation | 74 | (if r7 ; got a translation |
| 75 | (if (r1 < 128) | 75 | ((r0 = r1) (r1 = r3)) |
| 76 | (r0 = ,(charset-id 'ascii)) | 76 | (if (r1 < 128) |
| 77 | (if (r1 < 160) | 77 | (r0 = ,(charset-id 'ascii)) |
| 78 | (r0 = ,(charset-id 'eight-bit-control)) | 78 | (if (r1 < 160) |
| 79 | (if (r1 < 256) | 79 | (r0 = ,(charset-id 'eight-bit-control)) |
| 80 | ((r0 = ,(charset-id 'latin-iso8859-1)) | 80 | (if (r1 < 256) |
| 81 | (r1 -= 128)) | 81 | ((r0 = ,(charset-id 'latin-iso8859-1)) |
| 82 | (if (r1 < #x2500) | 82 | (r1 -= 128)) |
| 83 | ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | 83 | (if (r1 < #x2500) |
| 84 | (r1 -= #x100) | 84 | ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) |
| 85 | (r2 = (((r1 / 96) + 32) << 7)) | 85 | (r1 -= #x100) |
| 86 | (r1 %= 96) | 86 | (r2 = (((r1 / 96) + 32) << 7)) |
| 87 | (r1 += (r2 + 32))) | 87 | (r1 %= 96) |
| 88 | (if (r1 < #x3400) | 88 | (r1 += (r2 + 32))) |
| 89 | ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) | 89 | (if (r1 < #x3400) |
| 90 | (r1 -= #x2500) | 90 | ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) |
| 91 | (r2 = (((r1 / 96) + 32) << 7)) | 91 | (r1 -= #x2500) |
| 92 | (r1 %= 96) | 92 | (r2 = (((r1 / 96) + 32) << 7)) |
| 93 | (r1 += (r2 + 32))) | 93 | (r1 %= 96) |
| 94 | (if (r1 < #xd800) ; 2 untranslated bytes | 94 | (r1 += (r2 + 32))) |
| 95 | ;; ;; Assume this is rare, so don't worry about the | 95 | (if (r1 < #xd800) ; 2 untranslated bytes |
| 96 | ;; ;; overhead of the call. | 96 | ;; ;; Assume this is rare, so don't worry about the |
| 97 | ;; (call mule-utf-16-untrans) | 97 | ;; ;; overhead of the call. |
| 98 | ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) | 98 | ;; (call mule-utf-16-untrans) |
| 99 | (r1 = 15037)) ; U+fffd | 99 | ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) |
| 100 | (if (r1 < #xe000) ; surrogate | 100 | (r1 = 15037)) ; U+fffd |
| 101 | ;; ((call mule-utf-16-untrans) | 101 | (if (r1 < #xe000) ; surrogate |
| 102 | ;; (write-multibyte-character r0 r1) | 102 | ;; ((call mule-utf-16-untrans) |
| 103 | ;; (read r3 r4) | 103 | ;; (write-multibyte-character r0 r1) |
| 104 | ;; (call mule-utf-16-untrans)) | 104 | ;; (read r3 r4) |
| 105 | ((read r3 r4) | 105 | ;; (call mule-utf-16-untrans)) |
| 106 | (r0 = ,(charset-id 'mule-unicode-e000-ffff)) | 106 | ((read r3 r4) |
| 107 | (r1 = 15037)) | 107 | (r0 = ,(charset-id 'mule-unicode-e000-ffff)) |
| 108 | ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) | 108 | (r1 = 15037)) |
| 109 | (r1 -= #xe000) | 109 | ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) |
| 110 | (r2 = (((r1 / 96) + 32) << 7)) | 110 | (r1 -= #xe000) |
| 111 | (r1 %= 96) | 111 | (r2 = (((r1 / 96) + 32) << 7)) |
| 112 | (r1 += (r2 + 32))))))))))))) | 112 | (r1 %= 96) |
| 113 | (r1 += (r2 + 32)))))))))))))) | ||
| 113 | 114 | ||
| 114 | (define-ccl-program ccl-decode-mule-utf-16-le | 115 | (define-ccl-program ccl-decode-mule-utf-16-le |
| 115 | `(2 ; 2 bytes -> 1 to 4 bytes | 116 | `(2 ; 2 bytes -> 1 to 4 bytes |
| @@ -118,14 +119,14 @@ | |||
| 118 | (read r3 r4) | 119 | (read r3 r4) |
| 119 | (r1 = (r4 <8 r3)) | 120 | (r1 = (r4 <8 r3)) |
| 120 | ,utf-16-decode-ucs | 121 | ,utf-16-decode-ucs |
| 121 | (translate-character utf-8-translation-table-for-decode r0 r1) | 122 | (translate-character utf-translation-table-for-decode r0 r1) |
| 122 | (write-multibyte-character r0 r1) | 123 | (write-multibyte-character r0 r1) |
| 123 | (repeat)))) | 124 | (repeat)))) |
| 124 | "Decode little endian UTF-16 (ignoring signature bytes). | 125 | "Decode little endian UTF-16 (ignoring signature bytes). |
| 125 | Basic decoding is done into the charsets ascii, latin-iso8859-1 and | 126 | Basic decoding is done into the charsets ascii, latin-iso8859-1 and |
| 126 | mule-unicode-*. Un-representable Unicode characters are | 127 | mule-unicode-*. Un-representable Unicode characters are decoded as |
| 127 | decoded as U+fffd. The result is run through translation table | 128 | U+fffd. The result is run through the translation-table named |
| 128 | `utf-8-translation-table-for-decode' if that is defined.") | 129 | `utf-translation-table-for-decode'.") |
| 129 | 130 | ||
| 130 | (define-ccl-program ccl-decode-mule-utf-16-be | 131 | (define-ccl-program ccl-decode-mule-utf-16-be |
| 131 | `(2 ; 2 bytes -> 1 to 4 bytes | 132 | `(2 ; 2 bytes -> 1 to 4 bytes |
| @@ -134,14 +135,14 @@ decoded as U+fffd. The result is run through translation table | |||
| 134 | (read r3 r4) | 135 | (read r3 r4) |
| 135 | (r1 = (r3 <8 r4)) | 136 | (r1 = (r3 <8 r4)) |
| 136 | ,utf-16-decode-ucs | 137 | ,utf-16-decode-ucs |
| 137 | (translate-character utf-8-translation-table-for-decode r0 r1) | 138 | (translate-character utf-translation-table-for-decode r0 r1) |
| 138 | (write-multibyte-character r0 r1) | 139 | (write-multibyte-character r0 r1) |
| 139 | (repeat)))) | 140 | (repeat)))) |
| 140 | "Decode big endian UTF-16 (ignoring signature bytes). | 141 | "Decode big endian UTF-16 (ignoring signature bytes). |
| 141 | Basic decoding is done into the charsets ascii, latin-iso8859-1 and | 142 | Basic decoding is done into the charsets ascii, latin-iso8859-1 and |
| 142 | mule-unicode-*. Un-representable Unicode characters are | 143 | mule-unicode-*. Un-representable Unicode characters are |
| 143 | decoded as U+fffd. The result is run through translation table | 144 | decoded as U+fffd. The result is run through the translation-table of |
| 144 | `utf-8-non-latin-8859-table'.") | 145 | name `utf-translation-table-for-decode'.") |
| 145 | 146 | ||
| 146 | (makunbound 'utf-16-decode-ucs) ; done with it | 147 | (makunbound 'utf-16-decode-ucs) ; done with it |
| 147 | 148 | ||
| @@ -176,15 +177,18 @@ decoded as U+fffd. The result is run through translation table | |||
| 176 | (write #xfe) | 177 | (write #xfe) |
| 177 | (loop | 178 | (loop |
| 178 | (read-multibyte-character r0 r1) | 179 | (read-multibyte-character r0 r1) |
| 179 | (translate-character ucs-mule-to-mule-unicode r0 r1) | 180 | (lookup-character utf-subst-table-for-encode r0 r1) |
| 180 | ,utf-16-decode-to-ucs | 181 | (if (r7 == 0) |
| 182 | ((translate-character utf-translation-table-for-encode r0 r1) | ||
| 183 | ,utf-16-decode-to-ucs)) | ||
| 181 | (write (r0 & 255)) | 184 | (write (r0 & 255)) |
| 182 | (write (r0 >> 8)) | 185 | (write (r0 >> 8)) |
| 183 | (repeat)))) | 186 | (repeat)))) |
| 184 | "Encode to little endian UTF-16 with signature. | 187 | "Encode to little endian UTF-16 with signature. |
| 185 | Characters from the charsets ascii, eight-bit-control, | 188 | Characters from the charsets ascii, eight-bit-control, |
| 186 | eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded | 189 | eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded |
| 187 | after translation through the table `ucs-mule-to-mule-unicode'. | 190 | after translation through the translation-table of name |
| 191 | `utf-translation-table-for-encode'. | ||
| 188 | Others are encoded as U+FFFD.") | 192 | Others are encoded as U+FFFD.") |
| 189 | 193 | ||
| 190 | (define-ccl-program ccl-encode-mule-utf-16-be | 194 | (define-ccl-program ccl-encode-mule-utf-16-be |
| @@ -193,15 +197,18 @@ Others are encoded as U+FFFD.") | |||
| 193 | (write #xff) | 197 | (write #xff) |
| 194 | (loop | 198 | (loop |
| 195 | (read-multibyte-character r0 r1) | 199 | (read-multibyte-character r0 r1) |
| 196 | (translate-character ucs-mule-to-mule-unicode r0 r1) | 200 | (lookup-character utf-subst-table-for-encode r0 r1) |
| 197 | ,utf-16-decode-to-ucs | 201 | (if (r7 == 0) |
| 202 | ((translate-character utf-translation-table-for-encode r0 r1) | ||
| 203 | ,utf-16-decode-to-ucs)) | ||
| 198 | (write (r0 >> 8)) | 204 | (write (r0 >> 8)) |
| 199 | (write (r0 & 255)) | 205 | (write (r0 & 255)) |
| 200 | (repeat)))) | 206 | (repeat)))) |
| 201 | "Encode to big endian UTF-16 with signature. | 207 | "Encode to big endian UTF-16 with signature. |
| 202 | Characters from the charsets ascii, eight-bit-control, | 208 | Characters from the charsets ascii, eight-bit-control, |
| 203 | eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded | 209 | eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded |
| 204 | after translation through the table `ucs-mule-to-mule-unicode'. | 210 | after translation through the translation-table named |
| 211 | `utf-translation-table-for-encode'. | ||
| 205 | Others are encoded as U+FFFD.") | 212 | Others are encoded as U+FFFD.") |
| 206 | 213 | ||
| 207 | (makunbound 'utf-16-decode-to-ucs) | 214 | (makunbound 'utf-16-decode-to-ucs) |
| @@ -210,20 +217,19 @@ Others are encoded as U+FFFD.") | |||
| 210 | 217 | ||
| 211 | Assumes and ignores the leading two-byte signature. | 218 | Assumes and ignores the leading two-byte signature. |
| 212 | 219 | ||
| 213 | The supported Emacs character sets are the following, plus others | 220 | It supports Unicode characters of these ranges: |
| 214 | which may be included in the translation table | 221 | U+0000..U+33FF, U+E000..U+FFFF. |
| 215 | `ucs-mule-to-mule-unicode': | 222 | They correspond to these Emacs character sets: |
| 216 | ascii | 223 | ascii, latin-iso8859-1, mule-unicode-0100-24ff, |
| 217 | eight-bit-control | 224 | mule-unicode-2500-33ff, mule-unicode-e000-ffff |
| 218 | latin-iso8859-1 | 225 | |
| 219 | mule-unicode-0100-24ff | 226 | On decoding (e.g. reading a file), Unicode characters not in the above |
| 220 | mule-unicode-2500-33ff | 227 | ranges are decoded as U+FFFD, effectively corrupting the data |
| 221 | mule-unicode-e000-ffff | 228 | if they are re-encoded. |
| 222 | 229 | ||
| 223 | Note that Unicode characters out of the ranges U+0000-U+33FF and | 230 | On encoding (e.g. writing a file), Emacs characters not belonging to |
| 224 | U+E200-U+FFFF are decoded as U+FFFD, effectively corrupting the data | 231 | any of the character sets listed above are encoded into the byte |
| 225 | if they are re-encoded. Emacs characters without Unicode conversions | 232 | sequence representing U+FFFD (REPLACEMENT CHARACTER).")) |
| 226 | are encoded as U+FFFD.")) | ||
| 227 | (make-coding-system | 233 | (make-coding-system |
| 228 | 'mule-utf-16-le 4 | 234 | 'mule-utf-16-le 4 |
| 229 | ?u ; Mule-UCS uses ?U, but code-pages uses that for koi8-u. | 235 | ?u ; Mule-UCS uses ?U, but code-pages uses that for koi8-u. |
| @@ -242,7 +248,11 @@ are encoded as U+FFFD.")) | |||
| 242 | (mime-charset . utf-16le) | 248 | (mime-charset . utf-16le) |
| 243 | (coding-category . coding-category-utf-16-le) | 249 | (coding-category . coding-category-utf-16-le) |
| 244 | (valid-codes (0 . 255)) | 250 | (valid-codes (0 . 255)) |
| 245 | (pre-write-conversion . utf-16-le-pre-write-conversion))) | 251 | (pre-write-conversion . utf-16-le-pre-write-conversion) |
| 252 | (dependency unify-8859-on-encoding-mode | ||
| 253 | unify-8859-on-decoding-mode | ||
| 254 | utf-fragment-on-decoding | ||
| 255 | utf-translate-cjk))) | ||
| 246 | 256 | ||
| 247 | (make-coding-system | 257 | (make-coding-system |
| 248 | 'mule-utf-16-be 4 ?u | 258 | 'mule-utf-16-be 4 ?u |
| @@ -261,10 +271,11 @@ are encoded as U+FFFD.")) | |||
| 261 | (mime-charset . utf-16be) | 271 | (mime-charset . utf-16be) |
| 262 | (coding-category . coding-category-utf-16-be) | 272 | (coding-category . coding-category-utf-16-be) |
| 263 | (valid-codes (0 . 255)) | 273 | (valid-codes (0 . 255)) |
| 264 | (pre-write-conversion . utf-16-be-pre-write-conversion))) | 274 | (pre-write-conversion . utf-16-be-pre-write-conversion) |
| 265 | 275 | (dependency unify-8859-on-encoding-mode | |
| 266 | (register-char-codings 'mule-utf-16-le ucs-mule-to-mule-unicode) | 276 | unify-8859-on-decoding-mode |
| 267 | (register-char-codings 'mule-utf-16-be ucs-mule-to-mule-unicode)) | 277 | utf-fragment-on-decoding |
| 278 | utf-translate-cjk)))) | ||
| 268 | 279 | ||
| 269 | (define-coding-system-alias 'utf-16-le 'mule-utf-16-le) | 280 | (define-coding-system-alias 'utf-16-le 'mule-utf-16-le) |
| 270 | (define-coding-system-alias 'utf-16-be 'mule-utf-16-be) | 281 | (define-coding-system-alias 'utf-16-be 'mule-utf-16-be) |