diff options
| author | Dave Love | 2001-02-20 20:55:06 +0000 |
|---|---|---|
| committer | Dave Love | 2001-02-20 20:55:06 +0000 |
| commit | c49b8288474283f2ef0414798d4755a450193e99 (patch) | |
| tree | 3b69d50a44e4fa927aa83ca240cbc85d75a35bcb | |
| parent | da4cf7c5a0ffd4f8c836bf4318517a0771953688 (diff) | |
| download | emacs-c49b8288474283f2ef0414798d4755a450193e99.tar.gz emacs-c49b8288474283f2ef0414798d4755a450193e99.zip | |
Doc and commentary fixes.
| -rw-r--r-- | lisp/international/utf-8.el | 44 |
1 file changed, 29 insertions, 15 deletions
diff --git a/lisp/international/utf-8.el b/lisp/international/utf-8.el index 0d7bcb8692a..c56e13c9475 100644 --- a/lisp/international/utf-8.el +++ b/lisp/international/utf-8.el | |||
| @@ -3,7 +3,7 @@ | |||
| 3 | ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN. | 3 | ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN. |
| 4 | ;; Licensed to the Free Software Foundation. | 4 | ;; Licensed to the Free Software Foundation. |
| 5 | 5 | ||
| 6 | ;; Keywords: multilingual, Unicode, UTF-8 | 6 | ;; Keywords: multilingual, Unicode, UTF-8, i18n |
| 7 | 7 | ||
| 8 | ;; This file is part of GNU Emacs. | 8 | ;; This file is part of GNU Emacs. |
| 9 | 9 | ||
| @@ -25,7 +25,7 @@ | |||
| 25 | ;;; Commentary: | 25 | ;;; Commentary: |
| 26 | 26 | ||
| 27 | ;; The coding-system `mule-utf-8' supports encoding/decoding of the | 27 | ;; The coding-system `mule-utf-8' supports encoding/decoding of the |
| 28 | ;; following character sets: | 28 | ;; following character sets to and from UTF-8: |
| 29 | ;; | 29 | ;; |
| 30 | ;; ascii | 30 | ;; ascii |
| 31 | ;; eight-bit-control | 31 | ;; eight-bit-control |
| @@ -35,12 +35,16 @@ | |||
| 35 | ;; mule-unicode-e000-ffff | 35 | ;; mule-unicode-e000-ffff |
| 36 | ;; | 36 | ;; |
| 37 | ;; Characters of other character sets cannot be encoded with | 37 | ;; Characters of other character sets cannot be encoded with |
| 38 | ;; mule-utf-8. | 38 | ;; mule-utf-8. Note that the mule-unicode charsets currently lack |
| 39 | ;; case and syntax information, so things like `downcase' will only | ||
| 40 | ;; work for characters from ASCII and Latin-1. | ||
| 39 | ;; | 41 | ;; |
| 40 | ;; On decoding, Unicode characters that do not fit in above character | 42 | ;; On decoding, Unicode characters that do not fit into the above |
| 41 | ;; sets are handled as `eight-bit-control' or `eight-bit-graphic' | 43 | ;; character sets are handled as `eight-bit-control' or |
| 42 | ;; characters to retain original information (i.e. original byte | 44 | ;; `eight-bit-graphic' characters to retain the information about the |
| 43 | ;; sequence). | 45 | ;; original byte sequence. |
| 46 | |||
| 47 | ;; UTF-8 is defined in RFC 2279. A sketch of the encoding is: | ||
| 44 | 48 | ||
| 45 | ;; scalar | utf-8 | 49 | ;; scalar | utf-8 |
| 46 | ;; value | 1st byte | 2nd byte | 3rd byte | 50 | ;; value | 1st byte | 2nd byte | 3rd byte |
| @@ -174,7 +178,9 @@ | |||
| 174 | 178 | ||
| 175 | (repeat)))) | 179 | (repeat)))) |
| 176 | 180 | ||
| 177 | "CCL program to decode UTF-8 into ascii, eight-bit-control, latin-iso8859-1 and mule-unicode-*.") | 181 | "CCL program to decode UTF-8. |
| 182 | Decoding is done into the charsets ascii, eight-bit-control, | ||
| 183 | latin-iso8859-1 and mule-unicode-* only.") | ||
| 178 | 184 | ||
| 179 | (define-ccl-program ccl-encode-mule-utf-8 | 185 | (define-ccl-program ccl-encode-mule-utf-8 |
| 180 | `(1 | 186 | `(1 |
| @@ -251,20 +257,22 @@ | |||
| 251 | ;; ff 0000 0000 1111 1111 1101 1111 1011 1111 | 257 | ;; ff 0000 0000 1111 1111 1101 1111 1011 1111 |
| 252 | (write r1) | 258 | (write r1) |
| 253 | 259 | ||
| 254 | ;; unsupported character. | 260 | ;; Unsupported character. |
| 255 | ;; output U+FFFD, which is `ef bf bd' in UTF-8 | 261 | ;; Output U+FFFD, which is `ef bf bd' in UTF-8. |
| 256 | ;; actually it never reach here | ||
| 257 | ((write #xef) | 262 | ((write #xef) |
| 258 | (write #xbf) | 263 | (write #xbf) |
| 259 | (write #xbd))))))))) | 264 | (write #xbd))))))))) |
| 260 | (repeat))) | 265 | (repeat))) |
| 261 | 266 | ||
| 262 | "CCL program to encode ascii, eight-bit-control, latin-iso8859-1 and mule-unicode-*. into UTF-8.") | 267 | "CCL program to encode into UTF-8. |
| 268 | Only characters from the charsets ascii, eight-bit-control, | ||
| 269 | latin-iso8859-1 and mule-unicode-* are recognized. Others are encoded | ||
| 270 | as U+FFFD.") | ||
| 263 | 271 | ||
| 264 | (make-coding-system | 272 | (make-coding-system |
| 265 | 'mule-utf-8 4 ?u | 273 | 'mule-utf-8 4 ?u |
| 266 | "UTF-8 encoding for Emacs-supported Unicode characters. | 274 | "UTF-8 encoding for Emacs-supported Unicode characters. |
| 267 | Supported character sets are: | 275 | The supported Emacs character sets are: |
| 268 | ascii | 276 | ascii |
| 269 | eight-bit-control | 277 | eight-bit-control |
| 270 | eight-bit-graphic | 278 | eight-bit-graphic |
| @@ -273,8 +281,14 @@ Supported character sets are: | |||
| 273 | mule-unicode-2500-33ff | 281 | mule-unicode-2500-33ff |
| 274 | mule-unicode-e000-ffff | 282 | mule-unicode-e000-ffff |
| 275 | 283 | ||
| 276 | Unicode characters out of these ranges are decoded | 284 | Unicode characters out of the ranges U+0000-U+33FF and U+E000-U+FFFF |
| 277 | into eight-bit-control or eight-bit-graphic." | 285 | are decoded into sequences of eight-bit-control and eight-bit-graphic |
| 286 | characters to preserve their byte sequences. Emacs characters out of | ||
| 287 | these ranges are encoded into U+FFFD. | ||
| 288 | |||
| 289 | Note that, currently, characters in the mule-unicode charsets have no | ||
| 290 | syntax and case information. Thus, for instance, upper- and | ||
| 291 | lower-casing commands won't work with them." | ||
| 278 | 292 | ||
| 279 | '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) | 293 | '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) |
| 280 | '((safe-charsets | 294 | '((safe-charsets |