aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKenichi Handa2004-06-12 02:18:36 +0000
committerKenichi Handa2004-06-12 02:18:36 +0000
commit95d2d43392131c1040185be72cf956841615b313 (patch)
treed639ba368f57f196c1f54e8c5b414c795a6670c9
parentc71c26e9295573509ceaa468ea973299a8b311e1 (diff)
downloademacs-95d2d43392131c1040185be72cf956841615b313.tar.gz
emacs-95d2d43392131c1040185be72cf956841615b313.zip
(utf-16-decode-ucs): Handle a surrogate
pair correctly. Call ccl-mule-utf-untrans for untranslable chars. (utf-16le-decode-loop): Set r5 to -1 before loop. (utf-16be-decode-loop): Likewise. (ccl-decode-mule-utf-16le): Add EOF processing block. (ccl-decode-mule-utf-16be): Likewise. (ccl-decode-mule-utf-16le-with-signature): Likewise. (ccl-decode-mule-utf-16be-with-signature): Likewise. (ccl-decode-mule-utf-16): Likewise. Set r5 to -1 initially. (ccl-mule-utf-16-encode-untrans): New CCL. (utf-16-decode-to-ucs): Handle pre-read character. (utf-16le-encode-loop): Handle surrogate pair. (utf-16be-encode-loop): Likewise. (ccl-encode-mule-utf-16le-with-signature): Adjusted for the change of utf-16le-encode-loop. (ccl-encode-mule-utf-16be-with-signature): Adjusted for the change of utf-16be-encode-loop. (mule-utf-16-post-read-conversion): Call utf-8-post-read-conversion at first. (mule-utf-16[{le|be}], mule-utf-16{le|be}-with-signature): Include CJK charsets in safe-charsets if utf-translate-cjk-mode is on. Add post-read-conversion and pre-write-conversion.
-rw-r--r--lisp/international/utf-16.el519
1 files changed, 313 insertions, 206 deletions
diff --git a/lisp/international/utf-16.el b/lisp/international/utf-16.el
index d924512b634..477cccc2bf9 100644
--- a/lisp/international/utf-16.el
+++ b/lisp/international/utf-16.el
@@ -48,99 +48,110 @@
48;; things below, sometimes with commonality abstracted into a let 48;; things below, sometimes with commonality abstracted into a let
49;; binding for maintenance convenience. 49;; binding for maintenance convenience.
50 50
51;; We'd need new charsets distinct from ascii and eight-bit-control to
52;; deal with untranslated sequences, since we can't otherwise
53;; distinguish the bytes, as we can with utf-8.
54
55;; ;; Do a multibyte write for bytes in r3 and r4.
56;; ;; Intended for untranslatable utf-16 sequences.
57;; (define-ccl-program ccl-mule-utf-16-untrans
58;; `(0
59;; (if (r3 < 128)
60;; (r0 = ,(charset-id 'ascii))
61;; (if (r3 < 160)
62;; (r0 = ,(charset-id 'eight-bit-control))
63;; (r0 = ,(charset-id 'eight-bit-graphic))))
64;; (if (r4 < 128)
65;; (r0 = ,(charset-id 'ascii))
66;; (if (r4 < 160)
67;; (r0 = ,(charset-id 'eight-bit-control))
68;; (r0 = ,(charset-id 'eight-bit-graphic))))
69;; (r1 = r4)))
70;; "Do a multibyte write for bytes in r3 and r4.
71;; First swap them if we're big endian, indicated by r5==0.
72;; Intended for untranslatable utf-16 sequences.")
73
74;; Needed in macro expansion, so can't be let-bound. Zapped after use. 51;; Needed in macro expansion, so can't be let-bound. Zapped after use.
75(eval-and-compile 52(eval-and-compile
76(defconst utf-16-decode-ucs 53(defconst utf-16-decode-ucs
77 ;; We have the unicode in r1. Output is charset ID in r0, code 54 ;; If r5 is negative, r1 is a Unicode chacter code. Otherise, r5 is
78 ;; point in r1. 55 ;; the first of a surrogate pair and r1 is the second of the pair.
79 `((lookup-integer utf-subst-table-for-decode r1 r3) 56 ;; Output is charset ID in r0, code point in r1. R0 may be set to
80 (if r7 ; got a translation 57 ;; -1 in which case a caller should not write out r1.
81 ((r0 = r1) (r1 = r3)) 58 `((if (r5 >= 0)
82 (if (r1 < 128) 59 ((r0 = (r1 < #xDC00))
83 (r0 = ,(charset-id 'ascii)) 60 (if ((r1 >= #xE000) | r0)
84 (if (r1 < 160) 61 ;; Invalid second code of surrogate pair.
85 (r0 = ,(charset-id 'eight-bit-control)) 62 ((r0 = r5)
86 (if (r1 < 256) 63 (call ccl-mule-utf-untrans))
87 ((r0 = ,(charset-id 'latin-iso8859-1)) 64 ((r1 -= #xDC00)
88 (r1 -= 128)) 65 (r1 += (((r5 - #xD800) << 10) + #x10000))))
89 (if (r1 < #x2500) 66 (r5 = -1)))
90 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) 67 (if (r1 < 128)
91 (r1 -= #x100) 68 (r0 = ,(charset-id 'ascii))
92 (r2 = (((r1 / 96) + 32) << 7)) 69 ((lookup-integer utf-subst-table-for-decode r1 r3)
93 (r1 %= 96) 70 (if r7 ; got a translation
94 (r1 += (r2 + 32))) 71 ((r0 = r1) (r1 = r3))
95 (if (r1 < #x3400) 72 (if (r1 < 160)
96 ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) 73 (r0 = ,(charset-id 'eight-bit-control))
97 (r1 -= #x2500) 74 (if (r1 < 256)
98 (r2 = (((r1 / 96) + 32) << 7)) 75 ((r0 = ,(charset-id 'latin-iso8859-1))
99 (r1 %= 96) 76 (r1 -= 128))
100 (r1 += (r2 + 32))) 77 (if (r1 < #x2500)
101 (if (r1 < #xd800) ; 2 untranslated bytes 78 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
102 ;; ;; Assume this is rare, so don't worry about the 79 (r1 -= #x100)
103 ;; ;; overhead of the call. 80 (r2 = (((r1 / 96) + 32) << 7))
104 ;; (call mule-utf-16-untrans) 81 (r1 %= 96)
105 ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) 82 (r1 += (r2 + 32)))
106 (r1 = 15037)) ; U+fffd 83 (if (r1 < #x3400)
107 (if (r1 < #xe000) ; surrogate 84 ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
108 ;; ((call mule-utf-16-untrans) 85 (r1 -= #x2500)
109 ;; (write-multibyte-character r0 r1) 86 (r2 = (((r1 / 96) + 32) << 7))
110 ;; (read r3 r4) 87 (r1 %= 96)
111 ;; (call mule-utf-16-untrans)) 88 (r1 += (r2 + 32)))
112 ((read r3 r4) 89 (if (r1 < #xD800)
113 (r0 = ,(charset-id 'mule-unicode-e000-ffff)) 90 ;; We can't have this character.
114 (r1 = 15037)) 91 ((r0 = r1)
115 ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) 92 (call ccl-mule-utf-untrans)
116 (r1 -= #xe000) 93 (r5 = -1)
117 (r2 = (((r1 / 96) + 32) << 7)) 94 (r0 = -1))
118 (r1 %= 96) 95 (if (r1 < #xDC00)
119 (r1 += (r2 + 32))))))))))))) 96 ;; The first code of a surrogate pair.
97 ((r5 = r1)
98 (r0 = -1))
99 (if (r1 < #xE000)
100 ;; The second code of a surrogate pair, invalid.
101 ((r0 = r1)
102 (call ccl-mule-utf-untrans)
103 (r5 = -1)
104 (r0 = -1))
105 (if (r1 < #x10000)
106 ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
107 (r1 -= #xE000)
108 (r2 = (((r1 / 96) + 32) << 7))
109 (r1 %= 96)
110 (r1 += (r2 + 32)))
111 ;; We can't have this character.
112 ((r0 = r1)
113 (call ccl-mule-utf-untrans)
114 (r5 = -1)
115 (r0 = -1)))))))))))))))
120 116
121(defconst utf-16le-decode-loop 117(defconst utf-16le-decode-loop
122 `(loop 118 `((r5 = -1)
123 (read r3 r4) 119 (loop
124 (r1 = (r4 <8 r3)) 120 (r3 = -1)
125 ,utf-16-decode-ucs 121 (read r3 r4)
126 (translate-character utf-translation-table-for-decode r0 r1) 122 (r1 = (r4 <8 r3))
127 (write-multibyte-character r0 r1) 123 ,@utf-16-decode-ucs
128 (repeat))) 124 (if (r0 >= 0)
125 ((translate-character utf-translation-table-for-decode r0 r1)
126 (write-multibyte-character r0 r1)))
127 (repeat))))
129 128
130(defconst utf-16be-decode-loop 129(defconst utf-16be-decode-loop
131 `(loop 130 `((r5 = -1)
132 (read r3 r4) 131 (loop
133 (r1 = (r3 <8 r4)) 132 (r3 = -1)
134 ,@utf-16-decode-ucs 133 (read r3 r4)
135 (translate-character utf-translation-table-for-decode r0 r1) 134 (r1 = (r3 <8 r4))
136 (write-multibyte-character r0 r1) 135 ,@utf-16-decode-ucs
137 (repeat))) 136 (if (r0 >= 0)
137 ((translate-character utf-translation-table-for-decode r0 r1)
138 (write-multibyte-character r0 r1)))
139 (repeat))))
138 140
139) 141)
140 142
141(define-ccl-program ccl-decode-mule-utf-16le 143(define-ccl-program ccl-decode-mule-utf-16le
142 `(2 ; 2 bytes -> 1 to 4 bytes 144 `(2 ; 2 bytes -> 1 to 4 bytes
143 ,utf-16le-decode-loop) 145 ,utf-16le-decode-loop
146 ((if (r5 >= 0)
147 ((r0 = r5)
148 (call ccl-mule-utf-untrans)))
149 (if (r3 < 0)
150 nil
151 ((if (r3 < #xA0)
152 (r0 = ,(charset-id 'eight-bit-control))
153 (r0 = ,(charset-id 'eight-bit-graphic)))
154 (write-multibyte-character r0 r3)))))
144 "Decode UTF-16LE (little endian without signature bytes). 155 "Decode UTF-16LE (little endian without signature bytes).
145Basic decoding is done into the charsets ascii, latin-iso8859-1 and 156Basic decoding is done into the charsets ascii, latin-iso8859-1 and
146mule-unicode-*. Un-representable Unicode characters are decoded as 157mule-unicode-*. Un-representable Unicode characters are decoded as
@@ -149,7 +160,13 @@ U+fffd. The result is run through the translation-table named
149 160
150(define-ccl-program ccl-decode-mule-utf-16be 161(define-ccl-program ccl-decode-mule-utf-16be
151 `(2 ; 2 bytes -> 1 to 4 bytes 162 `(2 ; 2 bytes -> 1 to 4 bytes
152 ,utf-16be-decode-loop) 163 ,utf-16be-decode-loop
164 ((if (r5 >= 0)
165 ((r0 = r5)
166 (call ccl-mule-utf-untrans)))
167 (if (r3 >= 0)
168 ((r0 = r3)
169 (call ccl-mule-utf-untrans)))))
153 "Decode UTF-16BE (big endian without signature bytes). 170 "Decode UTF-16BE (big endian without signature bytes).
154Basic decoding is done into the charsets ascii, latin-iso8859-1 and 171Basic decoding is done into the charsets ascii, latin-iso8859-1 and
155mule-unicode-*. Un-representable Unicode characters are 172mule-unicode-*. Un-representable Unicode characters are
@@ -158,91 +175,218 @@ name `utf-translation-table-for-decode'.")
158 175
159(define-ccl-program ccl-decode-mule-utf-16le-with-signature 176(define-ccl-program ccl-decode-mule-utf-16le-with-signature
160 `(2 177 `(2
161 ((read r3 r4) 178 ((r3 = -1)
162 ,utf-16le-decode-loop)) 179 (read r3 r4)
180 ,@utf-16le-decode-loop)
181 (if (r3 >= 0)
182 ((r0 = r3)
183 (call ccl-mule-utf-untrans))))
163 "Like ccl-decode-utf-16le but skip the first 2-byte BOM.") 184 "Like ccl-decode-utf-16le but skip the first 2-byte BOM.")
164 185
165(define-ccl-program ccl-decode-mule-utf-16be-with-signature 186(define-ccl-program ccl-decode-mule-utf-16be-with-signature
166 `(2 187 `(2
167 ((read r3 r4) 188 ((r3 = -1)
168 ,utf-16be-decode-loop)) 189 (read r3 r4)
190 ,@utf-16be-decode-loop)
191 (if (r3 >= 0)
192 ((r0 = r3)
193 (call ccl-mule-utf-untrans))))
169 "Like ccl-decode-utf-16be but skip the first 2-byte BOM.") 194 "Like ccl-decode-utf-16be but skip the first 2-byte BOM.")
170 195
171(define-ccl-program ccl-decode-mule-utf-16 196(define-ccl-program ccl-decode-mule-utf-16
172 `(2 197 `(2
173 ((read r3 r4) 198 ((r3 = -1)
199 (read r3 r4)
174 (r1 = (r3 <8 r4)) 200 (r1 = (r3 <8 r4))
201 (r5 = -1)
175 (if (r1 == #xFFFE) 202 (if (r1 == #xFFFE)
176 ;; R1 is a BOM for little endian. We keep this character as 203 ;; R1 is a BOM for little endian. We keep this character as
177 ;; is temporarily. It is removed by post-read-conversion 204 ;; is temporarily. It is removed by post-read-conversion
178 ;; function. 205 ;; function.
179 (,@utf-16-decode-ucs 206 (,@utf-16-decode-ucs
180 (write-multibyte-character r0 r1) 207 (write-multibyte-character r0 r1)
181 ,utf-16le-decode-loop) 208 ,@utf-16le-decode-loop)
182 ((if (r1 == #xFEFF) 209 ((if (r1 == #xFEFF)
183 ;; R1 is a BOM for big endian, but we can't keep that 210 ;; R1 is a BOM for big endian, but we can't keep that
184 ;; character in the output because it can't be 211 ;; character in the output because it can't be
185 ;; distinguished with the normal U+FEFF. So, we keep 212 ;; distinguished with the normal U+FEFF. So, we keep
186 ;; #xFFFF instead. 213 ;; #xFFFF instead.
187 ((r1 = #xFFFF) 214 ((r1 = #xFFFF)
188 ,@utf-16-decode-ucs) 215 ,@utf-16-decode-ucs
189 ;; R1 a normal Unicode character. 216 (write-multibyte-character r0 r1))
217 ;; R1 is a normal Unicode character.
190 (,@utf-16-decode-ucs 218 (,@utf-16-decode-ucs
191 (translate-character utf-translation-table-for-decode r0 r1))) 219 (if (r0 >= 0)
192 (write-multibyte-character r0 r1) 220 ((translate-character utf-translation-table-for-decode r0 r1)
193 ,utf-16be-decode-loop)))) 221 (write-multibyte-character r0 r1)))))
222 ,@utf-16be-decode-loop)))
223 (if (r3 >= 0)
224 ((r0 = r3)
225 (call ccl-mule-utf-untrans))))
194 "Like ccl-decode-utf-16be/le but check the first BOM.") 226 "Like ccl-decode-utf-16be/le but check the first BOM.")
195 227
196(makunbound 'utf-16-decode-ucs) ; done with it 228(makunbound 'utf-16-decode-ucs) ; done with it
197(makunbound 'utf-16le-decode-loop) 229(makunbound 'utf-16le-decode-loop)
198(makunbound 'utf-16be-decode-loop) 230(makunbound 'utf-16be-decode-loop)
199 231
232;; UTF-16 decoder generates an UTF-8 sequence represented by a
233;; sequence eight-bit-control/graphic chars for an invalid byte (the
234;; last byte of an odd length source) and an untranslatable character
235;; (including an invalid surrogate-pair code-point).
236;;
237;; This CCL parses that sequence (the first byte is already in r1),
238;; and if the sequence represents an untranslatable character, it sets
239;; r1 to the original invalid code or untranslated Unicode character
240;; code, sets r2 to -1 (to prevent r2 and r3 are written), set2 r5 to
241;; -1 (to tell the caller that there's no pre-read character).
242;;
243;; If the sequence represents an invalid byte, it sets r1 to -1, r2 to
244;; the byte, sets r3 and r5 to -1.
245;;
246;; Otherwise, don't change r1, set r2 and r3 to already read
247;; eight-bit-control/graphic characters (if any), set r5 and r6 to the
248;; last character that invalidates the UTF-8 form.
249;;
250;; Note: For UTF-8 validation, we only check if a character is
251;; eight-bit-control/graphic or not. It may result in incorrect
252;; handling of random binary data, but such a data can't be encoded by
253;; UTF-16 anyway. At least, UTF-16 decoder doesn't generate such a
254;; sequence even if a source contains invalid byte-sequence.
255
256(define-ccl-program ccl-mule-utf-16-encode-untrans
257 `(0
258 ((r2 = -1)
259 ;; Read the 2nd byte.
260 (read-multibyte-character r5 r6)
261 (r0 = (r5 != ,(charset-id 'eight-bit-control)))
262 (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0)
263 ((r2 = r1)
264 (r3 = -1)
265 (r1 = -1)
266 (end))) ; invalid UTF-8
267
268 (r3 = -1)
269 (r2 = r6)
270 (if (r1 <= #xE0)
271 ;; 2-byte UTF-8, i.e. originally an invalid byte.
272 ((r2 &= #x3F)
273 (r2 |= ((r1 & #x1F) << 6))
274 (r1 = -1)
275 (r5 = -1)
276 (end)))
277
278 ;; Read the 3rd byte.
279 (read-multibyte-character r5 r6)
280 (r0 = (r5 != ,(charset-id 'eight-bit-control)))
281 (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0)
282 ((end))) ; invalid UTF-8
283
284 (if (r1 < #xF0) ; valid 3-byte UTF-8
285 ((r1 = ((r1 & #x0F) << 12))
286 (r1 |= ((r2 & #x3F) << 6))
287 (r1 |= (r6 & #x3F))
288 (r2 = -1)
289 (r5 = -1)
290 (end)))
291
292 (r3 = r6)
293 ;; Read the 4th byte.
294 (read-multibyte-character r5 r6)
295 (r0 = (r5 != ,(charset-id 'eight-bit-control)))
296 (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0)
297 (end)) ; livalid UTF-8
298
299 ;; valid 4-byte UTF-8
300 (r1 = ((r1 & #x07) << 18))
301 (r1 |= ((r2 & #x3F) << 12))
302 (r1 |= ((r3 & #x3F) << 6))
303 (r1 |= (r6 & #x3F))
304 (r2 = -1)
305 (r5 = -1)
306 (end))
307
308 (if (r1 >= 0)
309 ((write r1)
310 (if (r2 >= 0)
311 ((write r2)
312 (if (r3 >= 0)
313 (write r3))))))))
314
200(eval-and-compile 315(eval-and-compile
201(defconst utf-16-decode-to-ucs 316(defconst utf-16-decode-to-ucs
202 ;; CCL which, given the result of a multibyte read in r0 and r1, 317 ;; Read a character and set r1 to the corresponding Unicode code.
203 ;; sets r0 to the character's Unicode if the charset is one of the 318 ;; If r5 is not negative, it means that we have already read a
204 ;; basic utf-8 coding system ones. Otherwise set to U+fffd. 319 ;; character into r5 and r6.
205 `(if (r0 == ,(charset-id 'ascii)) 320 ;; If an invalid eight-bit-control/graphic sequence is found, r2 and
206 (r0 = r1) 321 ;; r3 may contain a byte to written out, r5 and r6 may contain a
207 (if (r0 == ,(charset-id 'latin-iso8859-1)) 322 ;; pre-read character. Usually they are set to -1.
208 (r0 = (r1 + 128)) 323 `((if (r5 < 0)
209 (if (r0 == ,(charset-id 'eight-bit-control)) 324 (read-multibyte-character r0 r1)
210 (r0 = r1) 325 ((r0 = r5)
211 (if (r0 == ,(charset-id 'eight-bit-graphic)) 326 (r1 = r6)
212 (r0 = r1) 327 (r5 = -1)))
213 ((r2 = (r1 & #x7f)) 328 (lookup-character utf-subst-table-for-encode r0 r1)
214 (r1 >>= 7) 329 (r2 = -1)
215 (r3 = ((r1 - 32) * 96)) 330 (if (r7 > 0)
216 (r3 += (r2 - 32)) 331 (r1 = r0)
217 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff)) 332 ((translate-character utf-translation-table-for-encode r0 r1)
218 (r0 = (r3 + #x100)) 333 (if (r0 == ,(charset-id 'ascii))
219 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff)) 334 nil
220 (r0 = (r3 + #x2500)) 335 (if (r0 == ,(charset-id 'latin-iso8859-1))
221 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) 336 (r1 += 128)
222 (r0 = (r3 + #xe000)) 337 (if (r0 == ,(charset-id 'eight-bit-control))
223 (r0 = #xfffd)))))))))) 338 nil
339 (if (r0 == ,(charset-id 'eight-bit-graphic))
340 (call ccl-mule-utf-16-encode-untrans)
341 ((r2 = ((r1 & #x7f) - 32))
342 (r3 = ((((r1 >> 7) - 32) * 96) + r2))
343 (r2 = -1)
344 (r5 = -1)
345 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
346 (r1 = (r3 + #x100))
347 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
348 (r1 = (r3 + #x2500))
349 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
350 (r1 = (r3 + #xe000))
351 (r1 = #xfffd)))))))))))))
224 352
225(defconst utf-16le-encode-loop 353(defconst utf-16le-encode-loop
226 `(loop 354 `((r5 = -1)
227 (read-multibyte-character r0 r1) 355 (loop
228 (lookup-character utf-subst-table-for-encode r0 r1) 356 ,@utf-16-decode-to-ucs
229 (if (r7 == 0) 357 (if (r1 >= #x10000)
230 ((translate-character utf-translation-table-for-encode r0 r1) 358 ((r1 -= #x10000)
231 ,utf-16-decode-to-ucs)) 359 (r0 = ((r1 >> 10) + #xD800))
232 (write (r0 & 255)) 360 (write (r0 & 255))
233 (write (r0 >> 8)) 361 (write (r0 >> 8))
234 (repeat))) 362 (r1 = ((r1 & #x3FF) + #xDC00))))
363 (if (r1 >= 0)
364 ((write (r1 & 255))
365 (write (r1 >> 8))))
366 (if (r2 >= 0)
367 ((write r2)
368 (if (r3 >= 0)
369 (write r3))))
370 (repeat))))
235 371
236(defconst utf-16be-encode-loop 372(defconst utf-16be-encode-loop
237 `(loop 373 `((r5 = -1)
238 (read-multibyte-character r0 r1) 374 (loop
239 (lookup-character utf-subst-table-for-encode r0 r1) 375 ,@utf-16-decode-to-ucs
240 (if (r7 == 0) 376 (if (r1 >= #x10000)
241 ((translate-character utf-translation-table-for-encode r0 r1) 377 ((r1 -= #x10000)
242 ,utf-16-decode-to-ucs)) 378 (r0 = ((r1 >> 10) + #xD800))
243 (write (r0 >> 8)) 379 (write (r0 >> 8))
244 (write (r0 & 255)) 380 (write (r0 & 255))
245 (repeat))) 381 (r1 = ((r1 & #x3FF) + #xDC00))))
382 (if (r1 >= 0)
383 ((write (r1 >> 8))
384 (write (r1 & 255))))
385 (if (r2 >= 0)
386 ((write r2)
387 (if (r3 >= 0)
388 (write r3))))
389 (repeat))))
246) 390)
247 391
248 392
@@ -270,7 +414,7 @@ Others are encoded as U+FFFD.")
270 `(1 414 `(1
271 ((write #xFF) 415 ((write #xFF)
272 (write #xFE) 416 (write #xFE)
273 ,utf-16le-encode-loop)) 417 ,@utf-16le-encode-loop))
274 "Encode to UTF-16 (little endian with signature). 418 "Encode to UTF-16 (little endian with signature).
275Characters from the charsets ascii, eight-bit-control, 419Characters from the charsets ascii, eight-bit-control,
276eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded 420eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded
@@ -282,7 +426,7 @@ Others are encoded as U+FFFD.")
282 `(1 426 `(1
283 ((write #xFE) 427 ((write #xFE)
284 (write #xFF) 428 (write #xFF)
285 ,utf-16be-encode-loop)) 429 ,@utf-16be-encode-loop))
286 "Encode to UTF-16 (big endian with signature). 430 "Encode to UTF-16 (big endian with signature).
287Characters from the charsets ascii, eight-bit-control, 431Characters from the charsets ascii, eight-bit-control,
288eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded 432eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded
@@ -296,6 +440,7 @@ Others are encoded as U+FFFD.")
296 440
297(defun mule-utf-16-post-read-conversion (length) 441(defun mule-utf-16-post-read-conversion (length)
298 (when (> length 0) 442 (when (> length 0)
443 (setq length (utf-8-post-read-conversion length))
299 (let ((char (following-char))) 444 (let ((char (following-char)))
300 (cond ((= char (decode-char 'ucs #xFFFE)) 445 (cond ((= char (decode-char 'ucs #xFFFE))
301 (delete-char 1) 446 (delete-char 1)
@@ -329,29 +474,34 @@ if they are re-encoded.
329 474
330On encoding (e.g. writing a file), Emacs characters not belonging to 475On encoding (e.g. writing a file), Emacs characters not belonging to
331any of the character sets listed above are encoded into the byte 476any of the character sets listed above are encoded into the byte
332sequence representing U+FFFD (REPLACEMENT CHARACTER).")) 477sequence representing U+FFFD (REPLACEMENT CHARACTER).")
478 (props `((safe-charsets
479 ascii
480 eight-bit-control
481 eight-bit-graphic
482 latin-iso8859-1
483 mule-unicode-0100-24ff
484 mule-unicode-2500-33ff
485 mule-unicode-e000-ffff
486 ,@(if utf-translate-cjk-mode
487 utf-translate-cjk-charsets))
488 (valid-codes (0 . 255))
489 (mime-text-unsuitable . t)
490 (pre-write-conversion . utf-8-pre-write-conversion)
491 (dependency unify-8859-on-encoding-mode
492 unify-8859-on-decoding-mode
493 utf-fragment-on-decoding
494 utf-translate-cjk-mode))))
333 (make-coding-system 495 (make-coding-system
334 'mule-utf-16le 4 496 'mule-utf-16le 4
335 ?u ; Mule-UCS uses ?U, but code-pages uses that for koi8-u. 497 ?u ; Mule-UCS uses ?U, but code-pages uses that for koi8-u.
336 (concat 498 (concat
337 "UTF-16LE encoding for Emacs-supported Unicode characters." 499 "UTF-16LE encoding for Emacs-supported Unicode characters."
338 doc) 500 doc)
339
340 '(ccl-decode-mule-utf-16le . ccl-encode-mule-utf-16le) 501 '(ccl-decode-mule-utf-16le . ccl-encode-mule-utf-16le)
341 '((safe-charsets 502 `(,@props
342 ascii 503 (post-read-conversion . utf-8-post-read-conversion)
343 eight-bit-control 504 (mime-charset . utf-16le)))
344 latin-iso8859-1
345 mule-unicode-0100-24ff
346 mule-unicode-2500-33ff
347 mule-unicode-e000-ffff)
348 (mime-charset . utf-16le)
349 (mime-text-unsuitable . t)
350 (valid-codes (0 . 255))
351 (dependency unify-8859-on-encoding-mode
352 unify-8859-on-decoding-mode
353 utf-fragment-on-decoding
354 utf-translate-cjk-mode)))
355 505
356 (make-coding-system 506 (make-coding-system
357 'mule-utf-16be 4 ?u 507 'mule-utf-16be 4 ?u
@@ -360,19 +510,9 @@ sequence representing U+FFFD (REPLACEMENT CHARACTER)."))
360 doc) 510 doc)
361 511
362 '(ccl-decode-mule-utf-16be . ccl-encode-mule-utf-16be) 512 '(ccl-decode-mule-utf-16be . ccl-encode-mule-utf-16be)
363 '((safe-charsets 513 `(,@props
364 ascii 514 (post-read-conversion . utf-8-post-read-conversion)
365 eight-bit-control 515 (mime-charset . utf-16be)))
366 latin-iso8859-1
367 mule-unicode-0100-24ff
368 mule-unicode-2500-33ff
369 mule-unicode-e000-ffff)
370 (mime-charset . utf-16be)
371 (valid-codes (0 . 255))
372 (dependency unify-8859-on-encoding-mode
373 unify-8859-on-decoding-mode
374 utf-fragment-on-decoding
375 utf-translate-cjk-mode)))
376 516
377 (make-coding-system 517 (make-coding-system
378 'mule-utf-16le-with-signature 4 ?u 518 'mule-utf-16le-with-signature 4 ?u
@@ -382,21 +522,10 @@ sequence representing U+FFFD (REPLACEMENT CHARACTER)."))
382 522
383 '(ccl-decode-mule-utf-16le-with-signature 523 '(ccl-decode-mule-utf-16le-with-signature
384 . ccl-encode-mule-utf-16le-with-signature) 524 . ccl-encode-mule-utf-16le-with-signature)
385 '((safe-charsets 525 `(,@props
386 ascii 526 (post-read-conversion . utf-8-post-read-conversion)
387 eight-bit-control
388 latin-iso8859-1
389 mule-unicode-0100-24ff
390 mule-unicode-2500-33ff
391 mule-unicode-e000-ffff)
392 (coding-category . coding-category-utf-16-le) 527 (coding-category . coding-category-utf-16-le)
393 (mime-charset . utf-16) 528 (mime-charset . utf-16)))
394 (mime-text-unsuitable . t)
395 (valid-codes (0 . 255))
396 (dependency unify-8859-on-encoding-mode
397 unify-8859-on-decoding-mode
398 utf-fragment-on-decoding
399 utf-translate-cjk-mode)))
400 529
401 (make-coding-system 530 (make-coding-system
402 'mule-utf-16be-with-signature 4 ?u 531 'mule-utf-16be-with-signature 4 ?u
@@ -406,20 +535,10 @@ sequence representing U+FFFD (REPLACEMENT CHARACTER)."))
406 535
407 '(ccl-decode-mule-utf-16be-with-signature 536 '(ccl-decode-mule-utf-16be-with-signature
408 . ccl-encode-mule-utf-16be-with-signature) 537 . ccl-encode-mule-utf-16be-with-signature)
409 '((safe-charsets 538 `(,@props
410 ascii 539 (post-read-conversion . utf-8-post-read-conversion)
411 eight-bit-control
412 latin-iso8859-1
413 mule-unicode-0100-24ff
414 mule-unicode-2500-33ff
415 mule-unicode-e000-ffff)
416 (coding-category . coding-category-utf-16-be) 540 (coding-category . coding-category-utf-16-be)
417 (mime-charset . utf-16) 541 (mime-charset . utf-16)))
418 (valid-codes (0 . 255))
419 (dependency unify-8859-on-encoding-mode
420 unify-8859-on-decoding-mode
421 utf-fragment-on-decoding
422 utf-translate-cjk-mode)))
423 542
424 (make-coding-system 543 (make-coding-system
425 'mule-utf-16 4 ?u 544 'mule-utf-16 4 ?u
@@ -428,22 +547,10 @@ sequence representing U+FFFD (REPLACEMENT CHARACTER)."))
428 doc) 547 doc)
429 548
430 '(ccl-decode-mule-utf-16 . ccl-encode-mule-utf-16be-with-signature) 549 '(ccl-decode-mule-utf-16 . ccl-encode-mule-utf-16be-with-signature)
431 '((safe-charsets 550 `(,@props
432 ascii 551 (post-read-conversion . mule-utf-16-post-read-conversion)
433 eight-bit-control
434 latin-iso8859-1
435 mule-unicode-0100-24ff
436 mule-unicode-2500-33ff
437 mule-unicode-e000-ffff)
438 (coding-category . coding-category-utf-16-be) 552 (coding-category . coding-category-utf-16-be)
439 (mime-charset . utf-16) 553 (mime-charset . utf-16)))
440 (mime-text-unsuitable . t)
441 (valid-codes (0 . 255))
442 (dependency unify-8859-on-encoding-mode
443 unify-8859-on-decoding-mode
444 utf-fragment-on-decoding
445 utf-translate-cjk-mode)
446 (post-read-conversion . mule-utf-16-post-read-conversion)))
447) 554)
448 555
449(define-coding-system-alias 'utf-16le 'mule-utf-16le) 556(define-coding-system-alias 'utf-16le 'mule-utf-16le)