diff options
| author | Kenichi Handa | 2004-06-12 02:18:36 +0000 |
|---|---|---|
| committer | Kenichi Handa | 2004-06-12 02:18:36 +0000 |
| commit | 95d2d43392131c1040185be72cf956841615b313 (patch) | |
| tree | d639ba368f57f196c1f54e8c5b414c795a6670c9 | |
| parent | c71c26e9295573509ceaa468ea973299a8b311e1 (diff) | |
| download | emacs-95d2d43392131c1040185be72cf956841615b313.tar.gz emacs-95d2d43392131c1040185be72cf956841615b313.zip | |
(utf-16-decode-ucs): Handle a surrogate
pair correctly. Call ccl-mule-utf-untrans for untranslable chars.
(utf-16le-decode-loop): Set r5 to -1 before loop.
(utf-16be-decode-loop): Likewise.
(ccl-decode-mule-utf-16le): Add EOF processing block.
(ccl-decode-mule-utf-16be): Likewise.
(ccl-decode-mule-utf-16le-with-signature): Likewise.
(ccl-decode-mule-utf-16be-with-signature): Likewise.
(ccl-decode-mule-utf-16): Likewise. Set r5 to -1 initially.
(ccl-mule-utf-16-encode-untrans): New CCL.
(utf-16-decode-to-ucs): Handle pre-read character.
(utf-16le-encode-loop): Handle surrogate pair.
(utf-16be-encode-loop): Likewise.
(ccl-encode-mule-utf-16le-with-signature): Adjusted for the change
of utf-16le-encode-loop.
(ccl-encode-mule-utf-16be-with-signature): Adjusted for the change
of utf-16be-encode-loop.
(mule-utf-16-post-read-conversion): Call
utf-8-post-read-conversion at first.
(mule-utf-16[{le|be}], mule-utf-16{le|be}-with-signature): Include
CJK charsets in safe-charsets if utf-translate-cjk-mode is on.
Add post-read-conversion and pre-write-conversion.
| -rw-r--r-- | lisp/international/utf-16.el | 519 |
1 files changed, 313 insertions, 206 deletions
diff --git a/lisp/international/utf-16.el b/lisp/international/utf-16.el index d924512b634..477cccc2bf9 100644 --- a/lisp/international/utf-16.el +++ b/lisp/international/utf-16.el | |||
| @@ -48,99 +48,110 @@ | |||
| 48 | ;; things below, sometimes with commonality abstracted into a let | 48 | ;; things below, sometimes with commonality abstracted into a let |
| 49 | ;; binding for maintenance convenience. | 49 | ;; binding for maintenance convenience. |
| 50 | 50 | ||
| 51 | ;; We'd need new charsets distinct from ascii and eight-bit-control to | ||
| 52 | ;; deal with untranslated sequences, since we can't otherwise | ||
| 53 | ;; distinguish the bytes, as we can with utf-8. | ||
| 54 | |||
| 55 | ;; ;; Do a multibyte write for bytes in r3 and r4. | ||
| 56 | ;; ;; Intended for untranslatable utf-16 sequences. | ||
| 57 | ;; (define-ccl-program ccl-mule-utf-16-untrans | ||
| 58 | ;; `(0 | ||
| 59 | ;; (if (r3 < 128) | ||
| 60 | ;; (r0 = ,(charset-id 'ascii)) | ||
| 61 | ;; (if (r3 < 160) | ||
| 62 | ;; (r0 = ,(charset-id 'eight-bit-control)) | ||
| 63 | ;; (r0 = ,(charset-id 'eight-bit-graphic)))) | ||
| 64 | ;; (if (r4 < 128) | ||
| 65 | ;; (r0 = ,(charset-id 'ascii)) | ||
| 66 | ;; (if (r4 < 160) | ||
| 67 | ;; (r0 = ,(charset-id 'eight-bit-control)) | ||
| 68 | ;; (r0 = ,(charset-id 'eight-bit-graphic)))) | ||
| 69 | ;; (r1 = r4))) | ||
| 70 | ;; "Do a multibyte write for bytes in r3 and r4. | ||
| 71 | ;; First swap them if we're big endian, indicated by r5==0. | ||
| 72 | ;; Intended for untranslatable utf-16 sequences.") | ||
| 73 | |||
| 74 | ;; Needed in macro expansion, so can't be let-bound. Zapped after use. | 51 | ;; Needed in macro expansion, so can't be let-bound. Zapped after use. |
| 75 | (eval-and-compile | 52 | (eval-and-compile |
| 76 | (defconst utf-16-decode-ucs | 53 | (defconst utf-16-decode-ucs |
| 77 | ;; We have the unicode in r1. Output is charset ID in r0, code | 54 | ;; If r5 is negative, r1 is a Unicode chacter code. Otherise, r5 is |
| 78 | ;; point in r1. | 55 | ;; the first of a surrogate pair and r1 is the second of the pair. |
| 79 | `((lookup-integer utf-subst-table-for-decode r1 r3) | 56 | ;; Output is charset ID in r0, code point in r1. R0 may be set to |
| 80 | (if r7 ; got a translation | 57 | ;; -1 in which case a caller should not write out r1. |
| 81 | ((r0 = r1) (r1 = r3)) | 58 | `((if (r5 >= 0) |
| 82 | (if (r1 < 128) | 59 | ((r0 = (r1 < #xDC00)) |
| 83 | (r0 = ,(charset-id 'ascii)) | 60 | (if ((r1 >= #xE000) | r0) |
| 84 | (if (r1 < 160) | 61 | ;; Invalid second code of surrogate pair. |
| 85 | (r0 = ,(charset-id 'eight-bit-control)) | 62 | ((r0 = r5) |
| 86 | (if (r1 < 256) | 63 | (call ccl-mule-utf-untrans)) |
| 87 | ((r0 = ,(charset-id 'latin-iso8859-1)) | 64 | ((r1 -= #xDC00) |
| 88 | (r1 -= 128)) | 65 | (r1 += (((r5 - #xD800) << 10) + #x10000)))) |
| 89 | (if (r1 < #x2500) | 66 | (r5 = -1))) |
| 90 | ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | 67 | (if (r1 < 128) |
| 91 | (r1 -= #x100) | 68 | (r0 = ,(charset-id 'ascii)) |
| 92 | (r2 = (((r1 / 96) + 32) << 7)) | 69 | ((lookup-integer utf-subst-table-for-decode r1 r3) |
| 93 | (r1 %= 96) | 70 | (if r7 ; got a translation |
| 94 | (r1 += (r2 + 32))) | 71 | ((r0 = r1) (r1 = r3)) |
| 95 | (if (r1 < #x3400) | 72 | (if (r1 < 160) |
| 96 | ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) | 73 | (r0 = ,(charset-id 'eight-bit-control)) |
| 97 | (r1 -= #x2500) | 74 | (if (r1 < 256) |
| 98 | (r2 = (((r1 / 96) + 32) << 7)) | 75 | ((r0 = ,(charset-id 'latin-iso8859-1)) |
| 99 | (r1 %= 96) | 76 | (r1 -= 128)) |
| 100 | (r1 += (r2 + 32))) | 77 | (if (r1 < #x2500) |
| 101 | (if (r1 < #xd800) ; 2 untranslated bytes | 78 | ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) |
| 102 | ;; ;; Assume this is rare, so don't worry about the | 79 | (r1 -= #x100) |
| 103 | ;; ;; overhead of the call. | 80 | (r2 = (((r1 / 96) + 32) << 7)) |
| 104 | ;; (call mule-utf-16-untrans) | 81 | (r1 %= 96) |
| 105 | ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) | 82 | (r1 += (r2 + 32))) |
| 106 | (r1 = 15037)) ; U+fffd | 83 | (if (r1 < #x3400) |
| 107 | (if (r1 < #xe000) ; surrogate | 84 | ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) |
| 108 | ;; ((call mule-utf-16-untrans) | 85 | (r1 -= #x2500) |
| 109 | ;; (write-multibyte-character r0 r1) | 86 | (r2 = (((r1 / 96) + 32) << 7)) |
| 110 | ;; (read r3 r4) | 87 | (r1 %= 96) |
| 111 | ;; (call mule-utf-16-untrans)) | 88 | (r1 += (r2 + 32))) |
| 112 | ((read r3 r4) | 89 | (if (r1 < #xD800) |
| 113 | (r0 = ,(charset-id 'mule-unicode-e000-ffff)) | 90 | ;; We can't have this character. |
| 114 | (r1 = 15037)) | 91 | ((r0 = r1) |
| 115 | ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) | 92 | (call ccl-mule-utf-untrans) |
| 116 | (r1 -= #xe000) | 93 | (r5 = -1) |
| 117 | (r2 = (((r1 / 96) + 32) << 7)) | 94 | (r0 = -1)) |
| 118 | (r1 %= 96) | 95 | (if (r1 < #xDC00) |
| 119 | (r1 += (r2 + 32))))))))))))) | 96 | ;; The first code of a surrogate pair. |
| 97 | ((r5 = r1) | ||
| 98 | (r0 = -1)) | ||
| 99 | (if (r1 < #xE000) | ||
| 100 | ;; The second code of a surrogate pair, invalid. | ||
| 101 | ((r0 = r1) | ||
| 102 | (call ccl-mule-utf-untrans) | ||
| 103 | (r5 = -1) | ||
| 104 | (r0 = -1)) | ||
| 105 | (if (r1 < #x10000) | ||
| 106 | ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) | ||
| 107 | (r1 -= #xE000) | ||
| 108 | (r2 = (((r1 / 96) + 32) << 7)) | ||
| 109 | (r1 %= 96) | ||
| 110 | (r1 += (r2 + 32))) | ||
| 111 | ;; We can't have this character. | ||
| 112 | ((r0 = r1) | ||
| 113 | (call ccl-mule-utf-untrans) | ||
| 114 | (r5 = -1) | ||
| 115 | (r0 = -1))))))))))))))) | ||
| 120 | 116 | ||
| 121 | (defconst utf-16le-decode-loop | 117 | (defconst utf-16le-decode-loop |
| 122 | `(loop | 118 | `((r5 = -1) |
| 123 | (read r3 r4) | 119 | (loop |
| 124 | (r1 = (r4 <8 r3)) | 120 | (r3 = -1) |
| 125 | ,utf-16-decode-ucs | 121 | (read r3 r4) |
| 126 | (translate-character utf-translation-table-for-decode r0 r1) | 122 | (r1 = (r4 <8 r3)) |
| 127 | (write-multibyte-character r0 r1) | 123 | ,@utf-16-decode-ucs |
| 128 | (repeat))) | 124 | (if (r0 >= 0) |
| 125 | ((translate-character utf-translation-table-for-decode r0 r1) | ||
| 126 | (write-multibyte-character r0 r1))) | ||
| 127 | (repeat)))) | ||
| 129 | 128 | ||
| 130 | (defconst utf-16be-decode-loop | 129 | (defconst utf-16be-decode-loop |
| 131 | `(loop | 130 | `((r5 = -1) |
| 132 | (read r3 r4) | 131 | (loop |
| 133 | (r1 = (r3 <8 r4)) | 132 | (r3 = -1) |
| 134 | ,@utf-16-decode-ucs | 133 | (read r3 r4) |
| 135 | (translate-character utf-translation-table-for-decode r0 r1) | 134 | (r1 = (r3 <8 r4)) |
| 136 | (write-multibyte-character r0 r1) | 135 | ,@utf-16-decode-ucs |
| 137 | (repeat))) | 136 | (if (r0 >= 0) |
| 137 | ((translate-character utf-translation-table-for-decode r0 r1) | ||
| 138 | (write-multibyte-character r0 r1))) | ||
| 139 | (repeat)))) | ||
| 138 | 140 | ||
| 139 | ) | 141 | ) |
| 140 | 142 | ||
| 141 | (define-ccl-program ccl-decode-mule-utf-16le | 143 | (define-ccl-program ccl-decode-mule-utf-16le |
| 142 | `(2 ; 2 bytes -> 1 to 4 bytes | 144 | `(2 ; 2 bytes -> 1 to 4 bytes |
| 143 | ,utf-16le-decode-loop) | 145 | ,utf-16le-decode-loop |
| 146 | ((if (r5 >= 0) | ||
| 147 | ((r0 = r5) | ||
| 148 | (call ccl-mule-utf-untrans))) | ||
| 149 | (if (r3 < 0) | ||
| 150 | nil | ||
| 151 | ((if (r3 < #xA0) | ||
| 152 | (r0 = ,(charset-id 'eight-bit-control)) | ||
| 153 | (r0 = ,(charset-id 'eight-bit-graphic))) | ||
| 154 | (write-multibyte-character r0 r3))))) | ||
| 144 | "Decode UTF-16LE (little endian without signature bytes). | 155 | "Decode UTF-16LE (little endian without signature bytes). |
| 145 | Basic decoding is done into the charsets ascii, latin-iso8859-1 and | 156 | Basic decoding is done into the charsets ascii, latin-iso8859-1 and |
| 146 | mule-unicode-*. Un-representable Unicode characters are decoded as | 157 | mule-unicode-*. Un-representable Unicode characters are decoded as |
| @@ -149,7 +160,13 @@ U+fffd. The result is run through the translation-table named | |||
| 149 | 160 | ||
| 150 | (define-ccl-program ccl-decode-mule-utf-16be | 161 | (define-ccl-program ccl-decode-mule-utf-16be |
| 151 | `(2 ; 2 bytes -> 1 to 4 bytes | 162 | `(2 ; 2 bytes -> 1 to 4 bytes |
| 152 | ,utf-16be-decode-loop) | 163 | ,utf-16be-decode-loop |
| 164 | ((if (r5 >= 0) | ||
| 165 | ((r0 = r5) | ||
| 166 | (call ccl-mule-utf-untrans))) | ||
| 167 | (if (r3 >= 0) | ||
| 168 | ((r0 = r3) | ||
| 169 | (call ccl-mule-utf-untrans))))) | ||
| 153 | "Decode UTF-16BE (big endian without signature bytes). | 170 | "Decode UTF-16BE (big endian without signature bytes). |
| 154 | Basic decoding is done into the charsets ascii, latin-iso8859-1 and | 171 | Basic decoding is done into the charsets ascii, latin-iso8859-1 and |
| 155 | mule-unicode-*. Un-representable Unicode characters are | 172 | mule-unicode-*. Un-representable Unicode characters are |
| @@ -158,91 +175,218 @@ name `utf-translation-table-for-decode'.") | |||
| 158 | 175 | ||
| 159 | (define-ccl-program ccl-decode-mule-utf-16le-with-signature | 176 | (define-ccl-program ccl-decode-mule-utf-16le-with-signature |
| 160 | `(2 | 177 | `(2 |
| 161 | ((read r3 r4) | 178 | ((r3 = -1) |
| 162 | ,utf-16le-decode-loop)) | 179 | (read r3 r4) |
| 180 | ,@utf-16le-decode-loop) | ||
| 181 | (if (r3 >= 0) | ||
| 182 | ((r0 = r3) | ||
| 183 | (call ccl-mule-utf-untrans)))) | ||
| 163 | "Like ccl-decode-utf-16le but skip the first 2-byte BOM.") | 184 | "Like ccl-decode-utf-16le but skip the first 2-byte BOM.") |
| 164 | 185 | ||
| 165 | (define-ccl-program ccl-decode-mule-utf-16be-with-signature | 186 | (define-ccl-program ccl-decode-mule-utf-16be-with-signature |
| 166 | `(2 | 187 | `(2 |
| 167 | ((read r3 r4) | 188 | ((r3 = -1) |
| 168 | ,utf-16be-decode-loop)) | 189 | (read r3 r4) |
| 190 | ,@utf-16be-decode-loop) | ||
| 191 | (if (r3 >= 0) | ||
| 192 | ((r0 = r3) | ||
| 193 | (call ccl-mule-utf-untrans)))) | ||
| 169 | "Like ccl-decode-utf-16be but skip the first 2-byte BOM.") | 194 | "Like ccl-decode-utf-16be but skip the first 2-byte BOM.") |
| 170 | 195 | ||
| 171 | (define-ccl-program ccl-decode-mule-utf-16 | 196 | (define-ccl-program ccl-decode-mule-utf-16 |
| 172 | `(2 | 197 | `(2 |
| 173 | ((read r3 r4) | 198 | ((r3 = -1) |
| 199 | (read r3 r4) | ||
| 174 | (r1 = (r3 <8 r4)) | 200 | (r1 = (r3 <8 r4)) |
| 201 | (r5 = -1) | ||
| 175 | (if (r1 == #xFFFE) | 202 | (if (r1 == #xFFFE) |
| 176 | ;; R1 is a BOM for little endian. We keep this character as | 203 | ;; R1 is a BOM for little endian. We keep this character as |
| 177 | ;; is temporarily. It is removed by post-read-conversion | 204 | ;; is temporarily. It is removed by post-read-conversion |
| 178 | ;; function. | 205 | ;; function. |
| 179 | (,@utf-16-decode-ucs | 206 | (,@utf-16-decode-ucs |
| 180 | (write-multibyte-character r0 r1) | 207 | (write-multibyte-character r0 r1) |
| 181 | ,utf-16le-decode-loop) | 208 | ,@utf-16le-decode-loop) |
| 182 | ((if (r1 == #xFEFF) | 209 | ((if (r1 == #xFEFF) |
| 183 | ;; R1 is a BOM for big endian, but we can't keep that | 210 | ;; R1 is a BOM for big endian, but we can't keep that |
| 184 | ;; character in the output because it can't be | 211 | ;; character in the output because it can't be |
| 185 | ;; distinguished with the normal U+FEFF. So, we keep | 212 | ;; distinguished with the normal U+FEFF. So, we keep |
| 186 | ;; #xFFFF instead. | 213 | ;; #xFFFF instead. |
| 187 | ((r1 = #xFFFF) | 214 | ((r1 = #xFFFF) |
| 188 | ,@utf-16-decode-ucs) | 215 | ,@utf-16-decode-ucs |
| 189 | ;; R1 a normal Unicode character. | 216 | (write-multibyte-character r0 r1)) |
| 217 | ;; R1 is a normal Unicode character. | ||
| 190 | (,@utf-16-decode-ucs | 218 | (,@utf-16-decode-ucs |
| 191 | (translate-character utf-translation-table-for-decode r0 r1))) | 219 | (if (r0 >= 0) |
| 192 | (write-multibyte-character r0 r1) | 220 | ((translate-character utf-translation-table-for-decode r0 r1) |
| 193 | ,utf-16be-decode-loop)))) | 221 | (write-multibyte-character r0 r1))))) |
| 222 | ,@utf-16be-decode-loop))) | ||
| 223 | (if (r3 >= 0) | ||
| 224 | ((r0 = r3) | ||
| 225 | (call ccl-mule-utf-untrans)))) | ||
| 194 | "Like ccl-decode-utf-16be/le but check the first BOM.") | 226 | "Like ccl-decode-utf-16be/le but check the first BOM.") |
| 195 | 227 | ||
| 196 | (makunbound 'utf-16-decode-ucs) ; done with it | 228 | (makunbound 'utf-16-decode-ucs) ; done with it |
| 197 | (makunbound 'utf-16le-decode-loop) | 229 | (makunbound 'utf-16le-decode-loop) |
| 198 | (makunbound 'utf-16be-decode-loop) | 230 | (makunbound 'utf-16be-decode-loop) |
| 199 | 231 | ||
| 232 | ;; UTF-16 decoder generates an UTF-8 sequence represented by a | ||
| 233 | ;; sequence eight-bit-control/graphic chars for an invalid byte (the | ||
| 234 | ;; last byte of an odd length source) and an untranslatable character | ||
| 235 | ;; (including an invalid surrogate-pair code-point). | ||
| 236 | ;; | ||
| 237 | ;; This CCL parses that sequence (the first byte is already in r1), | ||
| 238 | ;; and if the sequence represents an untranslatable character, it sets | ||
| 239 | ;; r1 to the original invalid code or untranslated Unicode character | ||
| 240 | ;; code, sets r2 to -1 (to prevent r2 and r3 are written), set2 r5 to | ||
| 241 | ;; -1 (to tell the caller that there's no pre-read character). | ||
| 242 | ;; | ||
| 243 | ;; If the sequence represents an invalid byte, it sets r1 to -1, r2 to | ||
| 244 | ;; the byte, sets r3 and r5 to -1. | ||
| 245 | ;; | ||
| 246 | ;; Otherwise, don't change r1, set r2 and r3 to already read | ||
| 247 | ;; eight-bit-control/graphic characters (if any), set r5 and r6 to the | ||
| 248 | ;; last character that invalidates the UTF-8 form. | ||
| 249 | ;; | ||
| 250 | ;; Note: For UTF-8 validation, we only check if a character is | ||
| 251 | ;; eight-bit-control/graphic or not. It may result in incorrect | ||
| 252 | ;; handling of random binary data, but such a data can't be encoded by | ||
| 253 | ;; UTF-16 anyway. At least, UTF-16 decoder doesn't generate such a | ||
| 254 | ;; sequence even if a source contains invalid byte-sequence. | ||
| 255 | |||
| 256 | (define-ccl-program ccl-mule-utf-16-encode-untrans | ||
| 257 | `(0 | ||
| 258 | ((r2 = -1) | ||
| 259 | ;; Read the 2nd byte. | ||
| 260 | (read-multibyte-character r5 r6) | ||
| 261 | (r0 = (r5 != ,(charset-id 'eight-bit-control))) | ||
| 262 | (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) | ||
| 263 | ((r2 = r1) | ||
| 264 | (r3 = -1) | ||
| 265 | (r1 = -1) | ||
| 266 | (end))) ; invalid UTF-8 | ||
| 267 | |||
| 268 | (r3 = -1) | ||
| 269 | (r2 = r6) | ||
| 270 | (if (r1 <= #xE0) | ||
| 271 | ;; 2-byte UTF-8, i.e. originally an invalid byte. | ||
| 272 | ((r2 &= #x3F) | ||
| 273 | (r2 |= ((r1 & #x1F) << 6)) | ||
| 274 | (r1 = -1) | ||
| 275 | (r5 = -1) | ||
| 276 | (end))) | ||
| 277 | |||
| 278 | ;; Read the 3rd byte. | ||
| 279 | (read-multibyte-character r5 r6) | ||
| 280 | (r0 = (r5 != ,(charset-id 'eight-bit-control))) | ||
| 281 | (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) | ||
| 282 | ((end))) ; invalid UTF-8 | ||
| 283 | |||
| 284 | (if (r1 < #xF0) ; valid 3-byte UTF-8 | ||
| 285 | ((r1 = ((r1 & #x0F) << 12)) | ||
| 286 | (r1 |= ((r2 & #x3F) << 6)) | ||
| 287 | (r1 |= (r6 & #x3F)) | ||
| 288 | (r2 = -1) | ||
| 289 | (r5 = -1) | ||
| 290 | (end))) | ||
| 291 | |||
| 292 | (r3 = r6) | ||
| 293 | ;; Read the 4th byte. | ||
| 294 | (read-multibyte-character r5 r6) | ||
| 295 | (r0 = (r5 != ,(charset-id 'eight-bit-control))) | ||
| 296 | (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) | ||
| 297 | (end)) ; livalid UTF-8 | ||
| 298 | |||
| 299 | ;; valid 4-byte UTF-8 | ||
| 300 | (r1 = ((r1 & #x07) << 18)) | ||
| 301 | (r1 |= ((r2 & #x3F) << 12)) | ||
| 302 | (r1 |= ((r3 & #x3F) << 6)) | ||
| 303 | (r1 |= (r6 & #x3F)) | ||
| 304 | (r2 = -1) | ||
| 305 | (r5 = -1) | ||
| 306 | (end)) | ||
| 307 | |||
| 308 | (if (r1 >= 0) | ||
| 309 | ((write r1) | ||
| 310 | (if (r2 >= 0) | ||
| 311 | ((write r2) | ||
| 312 | (if (r3 >= 0) | ||
| 313 | (write r3)))))))) | ||
| 314 | |||
| 200 | (eval-and-compile | 315 | (eval-and-compile |
| 201 | (defconst utf-16-decode-to-ucs | 316 | (defconst utf-16-decode-to-ucs |
| 202 | ;; CCL which, given the result of a multibyte read in r0 and r1, | 317 | ;; Read a character and set r1 to the corresponding Unicode code. |
| 203 | ;; sets r0 to the character's Unicode if the charset is one of the | 318 | ;; If r5 is not negative, it means that we have already read a |
| 204 | ;; basic utf-8 coding system ones. Otherwise set to U+fffd. | 319 | ;; character into r5 and r6. |
| 205 | `(if (r0 == ,(charset-id 'ascii)) | 320 | ;; If an invalid eight-bit-control/graphic sequence is found, r2 and |
| 206 | (r0 = r1) | 321 | ;; r3 may contain a byte to written out, r5 and r6 may contain a |
| 207 | (if (r0 == ,(charset-id 'latin-iso8859-1)) | 322 | ;; pre-read character. Usually they are set to -1. |
| 208 | (r0 = (r1 + 128)) | 323 | `((if (r5 < 0) |
| 209 | (if (r0 == ,(charset-id 'eight-bit-control)) | 324 | (read-multibyte-character r0 r1) |
| 210 | (r0 = r1) | 325 | ((r0 = r5) |
| 211 | (if (r0 == ,(charset-id 'eight-bit-graphic)) | 326 | (r1 = r6) |
| 212 | (r0 = r1) | 327 | (r5 = -1))) |
| 213 | ((r2 = (r1 & #x7f)) | 328 | (lookup-character utf-subst-table-for-encode r0 r1) |
| 214 | (r1 >>= 7) | 329 | (r2 = -1) |
| 215 | (r3 = ((r1 - 32) * 96)) | 330 | (if (r7 > 0) |
| 216 | (r3 += (r2 - 32)) | 331 | (r1 = r0) |
| 217 | (if (r0 == ,(charset-id 'mule-unicode-0100-24ff)) | 332 | ((translate-character utf-translation-table-for-encode r0 r1) |
| 218 | (r0 = (r3 + #x100)) | 333 | (if (r0 == ,(charset-id 'ascii)) |
| 219 | (if (r0 == ,(charset-id 'mule-unicode-2500-33ff)) | 334 | nil |
| 220 | (r0 = (r3 + #x2500)) | 335 | (if (r0 == ,(charset-id 'latin-iso8859-1)) |
| 221 | (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) | 336 | (r1 += 128) |
| 222 | (r0 = (r3 + #xe000)) | 337 | (if (r0 == ,(charset-id 'eight-bit-control)) |
| 223 | (r0 = #xfffd)))))))))) | 338 | nil |
| 339 | (if (r0 == ,(charset-id 'eight-bit-graphic)) | ||
| 340 | (call ccl-mule-utf-16-encode-untrans) | ||
| 341 | ((r2 = ((r1 & #x7f) - 32)) | ||
| 342 | (r3 = ((((r1 >> 7) - 32) * 96) + r2)) | ||
| 343 | (r2 = -1) | ||
| 344 | (r5 = -1) | ||
| 345 | (if (r0 == ,(charset-id 'mule-unicode-0100-24ff)) | ||
| 346 | (r1 = (r3 + #x100)) | ||
| 347 | (if (r0 == ,(charset-id 'mule-unicode-2500-33ff)) | ||
| 348 | (r1 = (r3 + #x2500)) | ||
| 349 | (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) | ||
| 350 | (r1 = (r3 + #xe000)) | ||
| 351 | (r1 = #xfffd))))))))))))) | ||
| 224 | 352 | ||
| 225 | (defconst utf-16le-encode-loop | 353 | (defconst utf-16le-encode-loop |
| 226 | `(loop | 354 | `((r5 = -1) |
| 227 | (read-multibyte-character r0 r1) | 355 | (loop |
| 228 | (lookup-character utf-subst-table-for-encode r0 r1) | 356 | ,@utf-16-decode-to-ucs |
| 229 | (if (r7 == 0) | 357 | (if (r1 >= #x10000) |
| 230 | ((translate-character utf-translation-table-for-encode r0 r1) | 358 | ((r1 -= #x10000) |
| 231 | ,utf-16-decode-to-ucs)) | 359 | (r0 = ((r1 >> 10) + #xD800)) |
| 232 | (write (r0 & 255)) | 360 | (write (r0 & 255)) |
| 233 | (write (r0 >> 8)) | 361 | (write (r0 >> 8)) |
| 234 | (repeat))) | 362 | (r1 = ((r1 & #x3FF) + #xDC00)))) |
| 363 | (if (r1 >= 0) | ||
| 364 | ((write (r1 & 255)) | ||
| 365 | (write (r1 >> 8)))) | ||
| 366 | (if (r2 >= 0) | ||
| 367 | ((write r2) | ||
| 368 | (if (r3 >= 0) | ||
| 369 | (write r3)))) | ||
| 370 | (repeat)))) | ||
| 235 | 371 | ||
| 236 | (defconst utf-16be-encode-loop | 372 | (defconst utf-16be-encode-loop |
| 237 | `(loop | 373 | `((r5 = -1) |
| 238 | (read-multibyte-character r0 r1) | 374 | (loop |
| 239 | (lookup-character utf-subst-table-for-encode r0 r1) | 375 | ,@utf-16-decode-to-ucs |
| 240 | (if (r7 == 0) | 376 | (if (r1 >= #x10000) |
| 241 | ((translate-character utf-translation-table-for-encode r0 r1) | 377 | ((r1 -= #x10000) |
| 242 | ,utf-16-decode-to-ucs)) | 378 | (r0 = ((r1 >> 10) + #xD800)) |
| 243 | (write (r0 >> 8)) | 379 | (write (r0 >> 8)) |
| 244 | (write (r0 & 255)) | 380 | (write (r0 & 255)) |
| 245 | (repeat))) | 381 | (r1 = ((r1 & #x3FF) + #xDC00)))) |
| 382 | (if (r1 >= 0) | ||
| 383 | ((write (r1 >> 8)) | ||
| 384 | (write (r1 & 255)))) | ||
| 385 | (if (r2 >= 0) | ||
| 386 | ((write r2) | ||
| 387 | (if (r3 >= 0) | ||
| 388 | (write r3)))) | ||
| 389 | (repeat)))) | ||
| 246 | ) | 390 | ) |
| 247 | 391 | ||
| 248 | 392 | ||
| @@ -270,7 +414,7 @@ Others are encoded as U+FFFD.") | |||
| 270 | `(1 | 414 | `(1 |
| 271 | ((write #xFF) | 415 | ((write #xFF) |
| 272 | (write #xFE) | 416 | (write #xFE) |
| 273 | ,utf-16le-encode-loop)) | 417 | ,@utf-16le-encode-loop)) |
| 274 | "Encode to UTF-16 (little endian with signature). | 418 | "Encode to UTF-16 (little endian with signature). |
| 275 | Characters from the charsets ascii, eight-bit-control, | 419 | Characters from the charsets ascii, eight-bit-control, |
| 276 | eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded | 420 | eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded |
| @@ -282,7 +426,7 @@ Others are encoded as U+FFFD.") | |||
| 282 | `(1 | 426 | `(1 |
| 283 | ((write #xFE) | 427 | ((write #xFE) |
| 284 | (write #xFF) | 428 | (write #xFF) |
| 285 | ,utf-16be-encode-loop)) | 429 | ,@utf-16be-encode-loop)) |
| 286 | "Encode to UTF-16 (big endian with signature). | 430 | "Encode to UTF-16 (big endian with signature). |
| 287 | Characters from the charsets ascii, eight-bit-control, | 431 | Characters from the charsets ascii, eight-bit-control, |
| 288 | eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded | 432 | eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded |
| @@ -296,6 +440,7 @@ Others are encoded as U+FFFD.") | |||
| 296 | 440 | ||
| 297 | (defun mule-utf-16-post-read-conversion (length) | 441 | (defun mule-utf-16-post-read-conversion (length) |
| 298 | (when (> length 0) | 442 | (when (> length 0) |
| 443 | (setq length (utf-8-post-read-conversion length)) | ||
| 299 | (let ((char (following-char))) | 444 | (let ((char (following-char))) |
| 300 | (cond ((= char (decode-char 'ucs #xFFFE)) | 445 | (cond ((= char (decode-char 'ucs #xFFFE)) |
| 301 | (delete-char 1) | 446 | (delete-char 1) |
| @@ -329,29 +474,34 @@ if they are re-encoded. | |||
| 329 | 474 | ||
| 330 | On encoding (e.g. writing a file), Emacs characters not belonging to | 475 | On encoding (e.g. writing a file), Emacs characters not belonging to |
| 331 | any of the character sets listed above are encoded into the byte | 476 | any of the character sets listed above are encoded into the byte |
| 332 | sequence representing U+FFFD (REPLACEMENT CHARACTER).")) | 477 | sequence representing U+FFFD (REPLACEMENT CHARACTER).") |
| 478 | (props `((safe-charsets | ||
| 479 | ascii | ||
| 480 | eight-bit-control | ||
| 481 | eight-bit-graphic | ||
| 482 | latin-iso8859-1 | ||
| 483 | mule-unicode-0100-24ff | ||
| 484 | mule-unicode-2500-33ff | ||
| 485 | mule-unicode-e000-ffff | ||
| 486 | ,@(if utf-translate-cjk-mode | ||
| 487 | utf-translate-cjk-charsets)) | ||
| 488 | (valid-codes (0 . 255)) | ||
| 489 | (mime-text-unsuitable . t) | ||
| 490 | (pre-write-conversion . utf-8-pre-write-conversion) | ||
| 491 | (dependency unify-8859-on-encoding-mode | ||
| 492 | unify-8859-on-decoding-mode | ||
| 493 | utf-fragment-on-decoding | ||
| 494 | utf-translate-cjk-mode)))) | ||
| 333 | (make-coding-system | 495 | (make-coding-system |
| 334 | 'mule-utf-16le 4 | 496 | 'mule-utf-16le 4 |
| 335 | ?u ; Mule-UCS uses ?U, but code-pages uses that for koi8-u. | 497 | ?u ; Mule-UCS uses ?U, but code-pages uses that for koi8-u. |
| 336 | (concat | 498 | (concat |
| 337 | "UTF-16LE encoding for Emacs-supported Unicode characters." | 499 | "UTF-16LE encoding for Emacs-supported Unicode characters." |
| 338 | doc) | 500 | doc) |
| 339 | |||
| 340 | '(ccl-decode-mule-utf-16le . ccl-encode-mule-utf-16le) | 501 | '(ccl-decode-mule-utf-16le . ccl-encode-mule-utf-16le) |
| 341 | '((safe-charsets | 502 | `(,@props |
| 342 | ascii | 503 | (post-read-conversion . utf-8-post-read-conversion) |
| 343 | eight-bit-control | 504 | (mime-charset . utf-16le))) |
| 344 | latin-iso8859-1 | ||
| 345 | mule-unicode-0100-24ff | ||
| 346 | mule-unicode-2500-33ff | ||
| 347 | mule-unicode-e000-ffff) | ||
| 348 | (mime-charset . utf-16le) | ||
| 349 | (mime-text-unsuitable . t) | ||
| 350 | (valid-codes (0 . 255)) | ||
| 351 | (dependency unify-8859-on-encoding-mode | ||
| 352 | unify-8859-on-decoding-mode | ||
| 353 | utf-fragment-on-decoding | ||
| 354 | utf-translate-cjk-mode))) | ||
| 355 | 505 | ||
| 356 | (make-coding-system | 506 | (make-coding-system |
| 357 | 'mule-utf-16be 4 ?u | 507 | 'mule-utf-16be 4 ?u |
| @@ -360,19 +510,9 @@ sequence representing U+FFFD (REPLACEMENT CHARACTER).")) | |||
| 360 | doc) | 510 | doc) |
| 361 | 511 | ||
| 362 | '(ccl-decode-mule-utf-16be . ccl-encode-mule-utf-16be) | 512 | '(ccl-decode-mule-utf-16be . ccl-encode-mule-utf-16be) |
| 363 | '((safe-charsets | 513 | `(,@props |
| 364 | ascii | 514 | (post-read-conversion . utf-8-post-read-conversion) |
| 365 | eight-bit-control | 515 | (mime-charset . utf-16be))) |
| 366 | latin-iso8859-1 | ||
| 367 | mule-unicode-0100-24ff | ||
| 368 | mule-unicode-2500-33ff | ||
| 369 | mule-unicode-e000-ffff) | ||
| 370 | (mime-charset . utf-16be) | ||
| 371 | (valid-codes (0 . 255)) | ||
| 372 | (dependency unify-8859-on-encoding-mode | ||
| 373 | unify-8859-on-decoding-mode | ||
| 374 | utf-fragment-on-decoding | ||
| 375 | utf-translate-cjk-mode))) | ||
| 376 | 516 | ||
| 377 | (make-coding-system | 517 | (make-coding-system |
| 378 | 'mule-utf-16le-with-signature 4 ?u | 518 | 'mule-utf-16le-with-signature 4 ?u |
| @@ -382,21 +522,10 @@ sequence representing U+FFFD (REPLACEMENT CHARACTER).")) | |||
| 382 | 522 | ||
| 383 | '(ccl-decode-mule-utf-16le-with-signature | 523 | '(ccl-decode-mule-utf-16le-with-signature |
| 384 | . ccl-encode-mule-utf-16le-with-signature) | 524 | . ccl-encode-mule-utf-16le-with-signature) |
| 385 | '((safe-charsets | 525 | `(,@props |
| 386 | ascii | 526 | (post-read-conversion . utf-8-post-read-conversion) |
| 387 | eight-bit-control | ||
| 388 | latin-iso8859-1 | ||
| 389 | mule-unicode-0100-24ff | ||
| 390 | mule-unicode-2500-33ff | ||
| 391 | mule-unicode-e000-ffff) | ||
| 392 | (coding-category . coding-category-utf-16-le) | 527 | (coding-category . coding-category-utf-16-le) |
| 393 | (mime-charset . utf-16) | 528 | (mime-charset . utf-16))) |
| 394 | (mime-text-unsuitable . t) | ||
| 395 | (valid-codes (0 . 255)) | ||
| 396 | (dependency unify-8859-on-encoding-mode | ||
| 397 | unify-8859-on-decoding-mode | ||
| 398 | utf-fragment-on-decoding | ||
| 399 | utf-translate-cjk-mode))) | ||
| 400 | 529 | ||
| 401 | (make-coding-system | 530 | (make-coding-system |
| 402 | 'mule-utf-16be-with-signature 4 ?u | 531 | 'mule-utf-16be-with-signature 4 ?u |
| @@ -406,20 +535,10 @@ sequence representing U+FFFD (REPLACEMENT CHARACTER).")) | |||
| 406 | 535 | ||
| 407 | '(ccl-decode-mule-utf-16be-with-signature | 536 | '(ccl-decode-mule-utf-16be-with-signature |
| 408 | . ccl-encode-mule-utf-16be-with-signature) | 537 | . ccl-encode-mule-utf-16be-with-signature) |
| 409 | '((safe-charsets | 538 | `(,@props |
| 410 | ascii | 539 | (post-read-conversion . utf-8-post-read-conversion) |
| 411 | eight-bit-control | ||
| 412 | latin-iso8859-1 | ||
| 413 | mule-unicode-0100-24ff | ||
| 414 | mule-unicode-2500-33ff | ||
| 415 | mule-unicode-e000-ffff) | ||
| 416 | (coding-category . coding-category-utf-16-be) | 540 | (coding-category . coding-category-utf-16-be) |
| 417 | (mime-charset . utf-16) | 541 | (mime-charset . utf-16))) |
| 418 | (valid-codes (0 . 255)) | ||
| 419 | (dependency unify-8859-on-encoding-mode | ||
| 420 | unify-8859-on-decoding-mode | ||
| 421 | utf-fragment-on-decoding | ||
| 422 | utf-translate-cjk-mode))) | ||
| 423 | 542 | ||
| 424 | (make-coding-system | 543 | (make-coding-system |
| 425 | 'mule-utf-16 4 ?u | 544 | 'mule-utf-16 4 ?u |
| @@ -428,22 +547,10 @@ sequence representing U+FFFD (REPLACEMENT CHARACTER).")) | |||
| 428 | doc) | 547 | doc) |
| 429 | 548 | ||
| 430 | '(ccl-decode-mule-utf-16 . ccl-encode-mule-utf-16be-with-signature) | 549 | '(ccl-decode-mule-utf-16 . ccl-encode-mule-utf-16be-with-signature) |
| 431 | '((safe-charsets | 550 | `(,@props |
| 432 | ascii | 551 | (post-read-conversion . mule-utf-16-post-read-conversion) |
| 433 | eight-bit-control | ||
| 434 | latin-iso8859-1 | ||
| 435 | mule-unicode-0100-24ff | ||
| 436 | mule-unicode-2500-33ff | ||
| 437 | mule-unicode-e000-ffff) | ||
| 438 | (coding-category . coding-category-utf-16-be) | 552 | (coding-category . coding-category-utf-16-be) |
| 439 | (mime-charset . utf-16) | 553 | (mime-charset . utf-16))) |
| 440 | (mime-text-unsuitable . t) | ||
| 441 | (valid-codes (0 . 255)) | ||
| 442 | (dependency unify-8859-on-encoding-mode | ||
| 443 | unify-8859-on-decoding-mode | ||
| 444 | utf-fragment-on-decoding | ||
| 445 | utf-translate-cjk-mode) | ||
| 446 | (post-read-conversion . mule-utf-16-post-read-conversion))) | ||
| 447 | ) | 554 | ) |
| 448 | 555 | ||
| 449 | (define-coding-system-alias 'utf-16le 'mule-utf-16le) | 556 | (define-coding-system-alias 'utf-16le 'mule-utf-16le) |