about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- lisp/international/utf-16.el 159
1 file changed, 85 insertions, 74 deletions
diff --git a/lisp/international/utf-16.el b/lisp/international/utf-16.el
index 7786795f4a5..6ccac419f57 100644
--- a/lisp/international/utf-16.el
+++ b/lisp/international/utf-16.el
@@ -68,48 +68,49 @@
68;; Needed in macro expansion, so can't be let-bound. Zapped after use. 68;; Needed in macro expansion, so can't be let-bound. Zapped after use.
69(eval-and-compile 69(eval-and-compile
70(defconst utf-16-decode-ucs 70(defconst utf-16-decode-ucs
71 ;; We have the unicode in r1. Output is character codes in r0, r1, 71 ;; We have the unicode in r1. Output is charset ID in r0, code point
72 ;; and r2 if appropriate. 72 ;; in r1.
73 `((lookup-integer utf-8-subst-table r0 r3) 73 `((lookup-integer utf-subst-table-for-decode r1 r3)
74 (if r7 (r1 = r3)) ; got a translation 74 (if r7 ; got a translation
75 (if (r1 < 128) 75 ((r0 = r1) (r1 = r3))
76 (r0 = ,(charset-id 'ascii)) 76 (if (r1 < 128)
77 (if (r1 < 160) 77 (r0 = ,(charset-id 'ascii))
78 (r0 = ,(charset-id 'eight-bit-control)) 78 (if (r1 < 160)
79 (if (r1 < 256) 79 (r0 = ,(charset-id 'eight-bit-control))
80 ((r0 = ,(charset-id 'latin-iso8859-1)) 80 (if (r1 < 256)
81 (r1 -= 128)) 81 ((r0 = ,(charset-id 'latin-iso8859-1))
82 (if (r1 < #x2500) 82 (r1 -= 128))
83 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) 83 (if (r1 < #x2500)
84 (r1 -= #x100) 84 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
85 (r2 = (((r1 / 96) + 32) << 7)) 85 (r1 -= #x100)
86 (r1 %= 96) 86 (r2 = (((r1 / 96) + 32) << 7))
87 (r1 += (r2 + 32))) 87 (r1 %= 96)
88 (if (r1 < #x3400) 88 (r1 += (r2 + 32)))
89 ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) 89 (if (r1 < #x3400)
90 (r1 -= #x2500) 90 ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
91 (r2 = (((r1 / 96) + 32) << 7)) 91 (r1 -= #x2500)
92 (r1 %= 96) 92 (r2 = (((r1 / 96) + 32) << 7))
93 (r1 += (r2 + 32))) 93 (r1 %= 96)
94 (if (r1 < #xd800) ; 2 untranslated bytes 94 (r1 += (r2 + 32)))
95;; ;; Assume this is rare, so don't worry about the 95 (if (r1 < #xd800) ; 2 untranslated bytes
96;; ;; overhead of the call. 96 ;; ;; Assume this is rare, so don't worry about the
97;; (call mule-utf-16-untrans) 97 ;; ;; overhead of the call.
98 ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) 98 ;; (call mule-utf-16-untrans)
99 (r1 = 15037)) ; U+fffd 99 ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
100 (if (r1 < #xe000) ; surrogate 100 (r1 = 15037)) ; U+fffd
101;; ((call mule-utf-16-untrans) 101 (if (r1 < #xe000) ; surrogate
102;; (write-multibyte-character r0 r1) 102 ;; ((call mule-utf-16-untrans)
103;; (read r3 r4) 103 ;; (write-multibyte-character r0 r1)
104;; (call mule-utf-16-untrans)) 104 ;; (read r3 r4)
105 ((read r3 r4) 105 ;; (call mule-utf-16-untrans))
106 (r0 = ,(charset-id 'mule-unicode-e000-ffff)) 106 ((read r3 r4)
107 (r1 = 15037)) 107 (r0 = ,(charset-id 'mule-unicode-e000-ffff))
108 ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) 108 (r1 = 15037))
109 (r1 -= #xe000) 109 ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
110 (r2 = (((r1 / 96) + 32) << 7)) 110 (r1 -= #xe000)
111 (r1 %= 96) 111 (r2 = (((r1 / 96) + 32) << 7))
112 (r1 += (r2 + 32))))))))))))) 112 (r1 %= 96)
113 (r1 += (r2 + 32))))))))))))))
113 114
114(define-ccl-program ccl-decode-mule-utf-16-le 115(define-ccl-program ccl-decode-mule-utf-16-le
115 `(2 ; 2 bytes -> 1 to 4 bytes 116 `(2 ; 2 bytes -> 1 to 4 bytes
@@ -118,14 +119,14 @@
118 (read r3 r4) 119 (read r3 r4)
119 (r1 = (r4 <8 r3)) 120 (r1 = (r4 <8 r3))
120 ,utf-16-decode-ucs 121 ,utf-16-decode-ucs
121 (translate-character utf-8-translation-table-for-decode r0 r1) 122 (translate-character utf-translation-table-for-decode r0 r1)
122 (write-multibyte-character r0 r1) 123 (write-multibyte-character r0 r1)
123 (repeat)))) 124 (repeat))))
124 "Decode little endian UTF-16 (ignoring signature bytes). 125 "Decode little endian UTF-16 (ignoring signature bytes).
125Basic decoding is done into the charsets ascii, latin-iso8859-1 and 126Basic decoding is done into the charsets ascii, latin-iso8859-1 and
126mule-unicode-*. Un-representable Unicode characters are 127mule-unicode-*. Un-representable Unicode characters are decoded as
127decoded as U+fffd. The result is run through translation table 128U+fffd. The result is run through the translation-table named
128`utf-8-translation-table-for-decode' if that is defined.") 129`utf-translation-table-for-decode'.")
129 130
130(define-ccl-program ccl-decode-mule-utf-16-be 131(define-ccl-program ccl-decode-mule-utf-16-be
131 `(2 ; 2 bytes -> 1 to 4 bytes 132 `(2 ; 2 bytes -> 1 to 4 bytes
@@ -134,14 +135,14 @@ decoded as U+fffd. The result is run through translation table
134 (read r3 r4) 135 (read r3 r4)
135 (r1 = (r3 <8 r4)) 136 (r1 = (r3 <8 r4))
136 ,utf-16-decode-ucs 137 ,utf-16-decode-ucs
137 (translate-character utf-8-translation-table-for-decode r0 r1) 138 (translate-character utf-translation-table-for-decode r0 r1)
138 (write-multibyte-character r0 r1) 139 (write-multibyte-character r0 r1)
139 (repeat)))) 140 (repeat))))
140 "Decode big endian UTF-16 (ignoring signature bytes). 141 "Decode big endian UTF-16 (ignoring signature bytes).
141Basic decoding is done into the charsets ascii, latin-iso8859-1 and 142Basic decoding is done into the charsets ascii, latin-iso8859-1 and
142mule-unicode-*. Un-representable Unicode characters are 143mule-unicode-*. Un-representable Unicode characters are
143decoded as U+fffd. The result is run through translation table 144decoded as U+fffd. The result is run through the translation-table of
144`utf-8-non-latin-8859-table'.") 145name `utf-translation-table-for-decode'.")
145 146
146(makunbound 'utf-16-decode-ucs) ; done with it 147(makunbound 'utf-16-decode-ucs) ; done with it
147 148
@@ -176,15 +177,18 @@ decoded as U+fffd. The result is run through translation table
176 (write #xfe) 177 (write #xfe)
177 (loop 178 (loop
178 (read-multibyte-character r0 r1) 179 (read-multibyte-character r0 r1)
179 (translate-character ucs-mule-to-mule-unicode r0 r1) 180 (lookup-character utf-subst-table-for-encode r0 r1)
180 ,utf-16-decode-to-ucs 181 (if (r7 == 0)
182 ((translate-character utf-translation-table-for-encode r0 r1)
183 ,utf-16-decode-to-ucs))
181 (write (r0 & 255)) 184 (write (r0 & 255))
182 (write (r0 >> 8)) 185 (write (r0 >> 8))
183 (repeat)))) 186 (repeat))))
184 "Encode to little endian UTF-16 with signature. 187 "Encode to little endian UTF-16 with signature.
185Characters from the charsets ascii, eight-bit-control, 188Characters from the charsets ascii, eight-bit-control,
186eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded 189eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded
187after translation through the table `ucs-mule-to-mule-unicode'. 190after translation through the translation-table of name
191`utf-translation-table-for-encode'.
188Others are encoded as U+FFFD.") 192Others are encoded as U+FFFD.")
189 193
190(define-ccl-program ccl-encode-mule-utf-16-be 194(define-ccl-program ccl-encode-mule-utf-16-be
@@ -193,15 +197,18 @@ Others are encoded as U+FFFD.")
193 (write #xff) 197 (write #xff)
194 (loop 198 (loop
195 (read-multibyte-character r0 r1) 199 (read-multibyte-character r0 r1)
196 (translate-character ucs-mule-to-mule-unicode r0 r1) 200 (lookup-character utf-subst-table-for-encode r0 r1)
197 ,utf-16-decode-to-ucs 201 (if (r7 == 0)
202 ((translate-character utf-translation-table-for-encode r0 r1)
203 ,utf-16-decode-to-ucs))
198 (write (r0 >> 8)) 204 (write (r0 >> 8))
199 (write (r0 & 255)) 205 (write (r0 & 255))
200 (repeat)))) 206 (repeat))))
201 "Encode to big endian UTF-16 with signature. 207 "Encode to big endian UTF-16 with signature.
202Characters from the charsets ascii, eight-bit-control, 208Characters from the charsets ascii, eight-bit-control,
203eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded 209eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded
204after translation through the table `ucs-mule-to-mule-unicode'. 210after translation through the translation-table named
211`utf-translation-table-for-encode'.
205Others are encoded as U+FFFD.") 212Others are encoded as U+FFFD.")
206 213
207(makunbound 'utf-16-decode-to-ucs) 214(makunbound 'utf-16-decode-to-ucs)
@@ -210,20 +217,19 @@ Others are encoded as U+FFFD.")
210 217
211Assumes and ignores the leading two-byte signature. 218Assumes and ignores the leading two-byte signature.
212 219
213The supported Emacs character sets are the following, plus others 220It supports Unicode characters of these ranges:
214which may be included in the translation table 221 U+0000..U+33FF, U+E000..U+FFFF.
215`ucs-mule-to-mule-unicode': 222They correspond to these Emacs character sets:
216 ascii 223 ascii, latin-iso8859-1, mule-unicode-0100-24ff,
217 eight-bit-control 224 mule-unicode-2500-33ff, mule-unicode-e000-ffff
218 latin-iso8859-1 225
219 mule-unicode-0100-24ff 226On decoding (e.g. reading a file), Unicode characters not in the above
220 mule-unicode-2500-33ff 227ranges are decoded as U+FFFD, effectively corrupting the data
221 mule-unicode-e000-ffff 228if they are re-encoded.
222 229
223Note that Unicode characters out of the ranges U+0000-U+33FF and 230On encoding (e.g. writing a file), Emacs characters not belonging to
224U+E200-U+FFFF are decoded as U+FFFD, effectively corrupting the data 231any of the character sets listed above are encoded into the byte
225if they are re-encoded. Emacs characters without Unicode conversions 232sequence representing U+FFFD (REPLACEMENT CHARACTER)."))
226are encoded as U+FFFD."))
227 (make-coding-system 233 (make-coding-system
228 'mule-utf-16-le 4 234 'mule-utf-16-le 4
229 ?u ; Mule-UCS uses ?U, but code-pages uses that for koi8-u. 235 ?u ; Mule-UCS uses ?U, but code-pages uses that for koi8-u.
@@ -242,7 +248,11 @@ are encoded as U+FFFD."))
242 (mime-charset . utf-16le) 248 (mime-charset . utf-16le)
243 (coding-category . coding-category-utf-16-le) 249 (coding-category . coding-category-utf-16-le)
244 (valid-codes (0 . 255)) 250 (valid-codes (0 . 255))
245 (pre-write-conversion . utf-16-le-pre-write-conversion))) 251 (pre-write-conversion . utf-16-le-pre-write-conversion)
252 (dependency unify-8859-on-encoding-mode
253 unify-8859-on-decoding-mode
254 utf-fragment-on-decoding
255 utf-translate-cjk)))
246 256
247 (make-coding-system 257 (make-coding-system
248 'mule-utf-16-be 4 ?u 258 'mule-utf-16-be 4 ?u
@@ -261,10 +271,11 @@ are encoded as U+FFFD."))
261 (mime-charset . utf-16be) 271 (mime-charset . utf-16be)
262 (coding-category . coding-category-utf-16-be) 272 (coding-category . coding-category-utf-16-be)
263 (valid-codes (0 . 255)) 273 (valid-codes (0 . 255))
264 (pre-write-conversion . utf-16-be-pre-write-conversion))) 274 (pre-write-conversion . utf-16-be-pre-write-conversion)
265 275 (dependency unify-8859-on-encoding-mode
266 (register-char-codings 'mule-utf-16-le ucs-mule-to-mule-unicode) 276 unify-8859-on-decoding-mode
267 (register-char-codings 'mule-utf-16-be ucs-mule-to-mule-unicode)) 277 utf-fragment-on-decoding
278 utf-translate-cjk))))
268 279
269(define-coding-system-alias 'utf-16-le 'mule-utf-16-le) 280(define-coding-system-alias 'utf-16-le 'mule-utf-16-le)
270(define-coding-system-alias 'utf-16-be 'mule-utf-16-be) 281(define-coding-system-alias 'utf-16-be 'mule-utf-16-be)