aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Dave Love 2001-02-20 20:55:06 +0000
committer Dave Love 2001-02-20 20:55:06 +0000
commit c49b8288474283f2ef0414798d4755a450193e99 (patch)
tree 3b69d50a44e4fa927aa83ca240cbc85d75a35bcb
parent da4cf7c5a0ffd4f8c836bf4318517a0771953688 (diff)
download emacs-c49b8288474283f2ef0414798d4755a450193e99.tar.gz
emacs-c49b8288474283f2ef0414798d4755a450193e99.zip
Doc and commentary fixes.
-rw-r--r-- lisp/international/utf-8.el 44
1 files changed, 29 insertions, 15 deletions
diff --git a/lisp/international/utf-8.el b/lisp/international/utf-8.el
index 0d7bcb8692a..c56e13c9475 100644
--- a/lisp/international/utf-8.el
+++ b/lisp/international/utf-8.el
@@ -3,7 +3,7 @@
3;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN. 3;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
4;; Licensed to the Free Software Foundation. 4;; Licensed to the Free Software Foundation.
5 5
6;; Keywords: multilingual, Unicode, UTF-8 6;; Keywords: multilingual, Unicode, UTF-8, i18n
7 7
8;; This file is part of GNU Emacs. 8;; This file is part of GNU Emacs.
9 9
@@ -25,7 +25,7 @@
25;;; Commentary: 25;;; Commentary:
26 26
27;; The coding-system `mule-utf-8' supports encoding/decoding of the 27;; The coding-system `mule-utf-8' supports encoding/decoding of the
28;; following character sets: 28;; following character sets to and from UTF-8:
29;; 29;;
30;; ascii 30;; ascii
31;; eight-bit-control 31;; eight-bit-control
@@ -35,12 +35,16 @@
35;; mule-unicode-e000-ffff 35;; mule-unicode-e000-ffff
36;; 36;;
37;; Characters of other character sets cannot be encoded with 37;; Characters of other character sets cannot be encoded with
38;; mule-utf-8. 38;; mule-utf-8. Note that the mule-unicode charsets currently lack
39;; case and syntax information, so things like `downcase' will only
40;; work for characters from ASCII and Latin-1.
39;; 41;;
40;; On decoding, Unicode characters that do not fit in above character 42;; On decoding, Unicode characters that do not fit into the above
41;; sets are handled as `eight-bit-control' or `eight-bit-graphic' 43;; character sets are handled as `eight-bit-control' or
42;; characters to retain original information (i.e. original byte 44;; `eight-bit-graphic' characters to retain the information about the
43;; sequence). 45;; original byte sequence.
46
47;; UTF-8 is defined in RFC 2279. A sketch of the encoding is:
44 48
45;; scalar | utf-8 49;; scalar | utf-8
46;; value | 1st byte | 2nd byte | 3rd byte 50;; value | 1st byte | 2nd byte | 3rd byte
@@ -174,7 +178,9 @@
174 178
175 (repeat)))) 179 (repeat))))
176 180
177 "CCL program to decode UTF-8 into ascii, eight-bit-control, latin-iso8859-1 and mule-unicode-*.") 181 "CCL program to decode UTF-8.
182Decoding is done into the charsets ascii, eight-bit-control,
183latin-iso8859-1 and mule-unicode-* only.")
178 184
179(define-ccl-program ccl-encode-mule-utf-8 185(define-ccl-program ccl-encode-mule-utf-8
180 `(1 186 `(1
@@ -251,20 +257,22 @@
251 ;; ff 0000 0000 1111 1111 1101 1111 1011 1111 257 ;; ff 0000 0000 1111 1111 1101 1111 1011 1111
252 (write r1) 258 (write r1)
253 259
254 ;; unsupported character. 260 ;; Unsupported character.
255 ;; output U+FFFD, which is `ef bf bd' in UTF-8 261 ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
256 ;; actually it never reach here
257 ((write #xef) 262 ((write #xef)
258 (write #xbf) 263 (write #xbf)
259 (write #xbd))))))))) 264 (write #xbd)))))))))
260 (repeat))) 265 (repeat)))
261 266
262 "CCL program to encode ascii, eight-bit-control, latin-iso8859-1 and mule-unicode-*. into UTF-8.") 267 "CCL program to encode into UTF-8.
268Only characters from the charsets ascii, eight-bit-control,
269latin-iso8859-1 and mule-unicode-* are recognized. Others are encoded
270as U+FFFD.")
263 271
264(make-coding-system 272(make-coding-system
265 'mule-utf-8 4 ?u 273 'mule-utf-8 4 ?u
266 "UTF-8 encoding for Emacs-supported Unicode characters. 274 "UTF-8 encoding for Emacs-supported Unicode characters.
267Supported character sets are: 275The supported Emacs character sets are:
268 ascii 276 ascii
269 eight-bit-control 277 eight-bit-control
270 eight-bit-graphic 278 eight-bit-graphic
@@ -273,8 +281,14 @@ Supported character sets are:
273 mule-unicode-2500-33ff 281 mule-unicode-2500-33ff
274 mule-unicode-e000-ffff 282 mule-unicode-e000-ffff
275 283
276Unicode characters out of these ranges are decoded 284Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF
277into eight-bit-control or eight-bit-graphic." 285are decoded into sequences of eight-bit-control and eight-bit-graphic
286characters to preserve their byte sequences. Emacs characters out of
287these ranges are encoded into U+FFFD.
288
289Note that, currently, characters in the mule-unicode charsets have no
290syntax and case information. Thus, for instance, upper- and
291lower-casing commands won't work with them."
278 292
279 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) 293 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
280 '((safe-charsets 294 '((safe-charsets