diff options
| author | Richard M. Stallman | 1997-07-13 20:43:31 +0000 |
|---|---|---|
| committer | Richard M. Stallman | 1997-07-13 20:43:31 +0000 |
| commit | f4dee5826e43e51c29bbaa4edf653a811ed95f72 (patch) | |
| tree | 931a582a23624c31faa224c8e44fb5dc770a30f3 /src/coding.c | |
| parent | c49f3d5a65a9faf3827610d0b80683088a9574d4 (diff) | |
| download | emacs-f4dee5826e43e51c29bbaa4edf653a811ed95f72.tar.gz emacs-f4dee5826e43e51c29bbaa4edf653a811ed95f72.zip | |
Comment changes.
Diffstat (limited to 'src/coding.c')
| -rw-r--r-- | src/coding.c | 118 |
1 files changed, 58 insertions, 60 deletions
diff --git a/src/coding.c b/src/coding.c index a2ed7aa038e..d3093a58960 100644 --- a/src/coding.c +++ b/src/coding.c | |||
| @@ -44,41 +44,40 @@ Boston, MA 02111-1307, USA. */ | |||
| 44 | 0. Emacs' internal format (emacs-mule) | 44 | 0. Emacs' internal format (emacs-mule) |
| 45 | 45 | ||
| 46 | Emacs itself holds a multi-lingual character in a buffer and a string | 46 | Emacs itself holds a multi-lingual character in a buffer and a string |
| 47 | in a special format. Details are described in the section 2. | 47 | in a special format. Details are described in section 2. |
| 48 | 48 | ||
| 49 | 1. ISO2022 | 49 | 1. ISO2022 |
| 50 | 50 | ||
| 51 | The most famous coding system for multiple character sets. X's | 51 | The most famous coding system for multiple character sets. X's |
| 52 | Compound Text, various EUCs (Extended Unix Code), and such coding | 52 | Compound Text, various EUCs (Extended Unix Code), and coding |
| 53 | systems used in Internet communication as ISO-2022-JP are all | 53 | systems used in Internet communication such as ISO-2022-JP are |
| 54 | variants of ISO2022. Details are described in the section 3. | 54 | all variants of ISO2022. Details are described in section 3. |
| 55 | 55 | ||
| 56 | 2. SJIS (or Shift-JIS or MS-Kanji-Code) | 56 | 2. SJIS (or Shift-JIS or MS-Kanji-Code) |
| 57 | 57 | ||
| 58 | A coding system to encode character sets: ASCII, JISX0201, and | 58 | A coding system to encode character sets: ASCII, JISX0201, and |
| 59 | JISX0208. Widely used for PC's in Japan. Details are described in | 59 | JISX0208. Widely used for PC's in Japan. Details are described in |
| 60 | the section 4. | 60 | section 4. |
| 61 | 61 | ||
| 62 | 3. BIG5 | 62 | 3. BIG5 |
| 63 | 63 | ||
| 64 | A coding system to encode character sets: ASCII and Big5. Widely | 64 | A coding system to encode character sets: ASCII and Big5. Widely |
| 65 | used by Chinese (mainly in Taiwan and Hong Kong). Details are | 65 | used by Chinese (mainly in Taiwan and Hong Kong). Details are |
| 66 | described in the section 4. In this file, when written as "BIG5" | 66 | described in section 4. In this file, when we write "BIG5" |
| 67 | (all uppercase), it means the coding system, and when written as | 67 | (all uppercase), we mean the coding system, and when we write |
| 68 | "Big5" (capitalized), it means the character set. | 68 | "Big5" (capitalized), we mean the character set. |
| 69 | 69 | ||
| 70 | 4. Else | 70 | 4. Other |
| 71 | 71 | ||
| 72 | If a user want to read/write a text encoded in a coding system not | 72 | If a user wants to read/write a text encoded in a coding system not |
| 73 | listed above, he can supply a decoder and an encoder for it in CCL | 73 | listed above, he can supply a decoder and an encoder for it in CCL |
| 74 | (Code Conversion Language) programs. Emacs executes the CCL program | 74 | (Code Conversion Language) programs. Emacs executes the CCL program |
| 75 | while reading/writing. | 75 | while reading/writing. |
| 76 | 76 | ||
| 77 | Emacs represent a coding-system by a Lisp symbol that has a property | 77 | Emacs represents a coding-system by a Lisp symbol that has a property |
| 78 | `coding-system'. But, before actually using the coding-system, the | 78 | `coding-system'. But, before actually using the coding-system, the |
| 79 | information about it is set in a structure of type `struct | 79 | information about it is set in a structure of type `struct |
| 80 | coding_system' for rapid processing. See the section 6 for more | 80 | coding_system' for rapid processing. See section 6 for more details. |
| 81 | detail. | ||
| 82 | 81 | ||
| 83 | */ | 82 | */ |
| 84 | 83 | ||
| @@ -86,14 +85,13 @@ Boston, MA 02111-1307, USA. */ | |||
| 86 | 85 | ||
| 87 | How end-of-line of a text is encoded depends on a system. For | 86 | How end-of-line of a text is encoded depends on a system. For |
| 88 | instance, Unix's format is just one byte of `line-feed' code, | 87 | instance, Unix's format is just one byte of `line-feed' code, |
| 89 | whereas DOS's format is two bytes sequence of `carriage-return' and | 88 | whereas DOS's format is a two-byte sequence of `carriage-return' and |
| 90 | `line-feed' codes. MacOS's format is one byte of `carriage-return'. | 89 | `line-feed' codes. MacOS's format is one byte of `carriage-return'. |
| 91 | 90 | ||
| 92 | Since how characters in a text is encoded and how end-of-line is | 91 | Since text character encoding and end-of-line encoding are |
| 93 | encoded is independent, any coding system described above can take | 92 | independent, any coding system described above can take |
| 94 | any format of end-of-line. So, Emacs has information of format of | 93 | any format of end-of-line. So, Emacs has information of format of |
| 95 | end-of-line in each coding-system. See the section 6 for more | 94 | end-of-line in each coding-system. See section 6 for more details. |
| 96 | detail. | ||
| 97 | 95 | ||
| 98 | */ | 96 | */ |
| 99 | 97 | ||
| @@ -117,10 +115,10 @@ detect_coding_emacs_mule (src, src_end) | |||
| 117 | 115 | ||
| 118 | These functions decode SRC_BYTES length text at SOURCE encoded in | 116 | These functions decode SRC_BYTES length text at SOURCE encoded in |
| 119 | CODING to Emacs' internal format (emacs-mule). The resulting text | 117 | CODING to Emacs' internal format (emacs-mule). The resulting text |
| 120 | goes to a place pointed by DESTINATION, the length of which should | 118 | goes to a place pointed to by DESTINATION, the length of which should |
| 121 | not exceed DST_BYTES. The bytes actually processed is returned as | 119 | not exceed DST_BYTES. The number of bytes actually processed is |
| 122 | *CONSUMED. The return value is the length of the decoded text. | 120 | returned as *CONSUMED. The return value is the length of the decoded |
| 123 | Below is a template of these functions. */ | 121 | text. Below is a template of these functions. */ |
| 124 | #if 0 | 122 | #if 0 |
| 125 | decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed) | 123 | decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed) |
| 126 | struct coding_system *coding; | 124 | struct coding_system *coding; |
| @@ -136,10 +134,10 @@ decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed) | |||
| 136 | 134 | ||
| 137 | These functions encode SRC_BYTES length text at SOURCE of Emacs' | 135 | These functions encode SRC_BYTES length text at SOURCE of Emacs' |
| 138 | internal format (emacs-mule) to CODING. The resulting text goes to | 136 | internal format (emacs-mule) to CODING. The resulting text goes to |
| 139 | a place pointed by DESTINATION, the length of which should not | 137 | a place pointed to by DESTINATION, the length of which should not |
| 140 | exceed DST_BYTES. The bytes actually processed is returned as | 138 | exceed DST_BYTES. The number of bytes actually processed is |
| 141 | *CONSUMED. The return value is the length of the encoded text. | 139 | returned as *CONSUMED. The return value is the length of the |
| 142 | Below is a template of these functions. */ | 140 | encoded text. Below is a template of these functions. */ |
| 143 | #if 0 | 141 | #if 0 |
| 144 | encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed) | 142 | encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed) |
| 145 | struct coding_system *coding; | 143 | struct coding_system *coding; |
| @@ -200,7 +198,7 @@ encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed) | |||
| 200 | *dst++ = (c); \ | 198 | *dst++ = (c); \ |
| 201 | } while (0) | 199 | } while (0) |
| 202 | 200 | ||
| 203 | /* Decode one DIMENSION1 character of which charset is CHARSET and | 201 | /* Decode one DIMENSION1 character whose charset is CHARSET and whose |
| 204 | position-code is C. */ | 202 | position-code is C. */ |
| 205 | 203 | ||
| 206 | #define DECODE_CHARACTER_DIMENSION1(charset, c) \ | 204 | #define DECODE_CHARACTER_DIMENSION1(charset, c) \ |
| @@ -215,7 +213,7 @@ encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed) | |||
| 215 | *dst++ = (c) | 0x80; \ | 213 | *dst++ = (c) | 0x80; \ |
| 216 | } while (0) | 214 | } while (0) |
| 217 | 215 | ||
| 218 | /* Decode one DIMENSION2 character of which charset is CHARSET and | 216 | /* Decode one DIMENSION2 character whose charset is CHARSET and whose |
| 219 | position-codes are C1 and C2. */ | 217 | position-codes are C1 and C2. */ |
| 220 | 218 | ||
| 221 | #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2) \ | 219 | #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2) \ |
| @@ -337,25 +335,25 @@ Lisp_Object Vdefault_process_coding_system; | |||
| 337 | /*** 2. Emacs internal format (emacs-mule) handlers ***/ | 335 | /*** 2. Emacs internal format (emacs-mule) handlers ***/ |
| 338 | 336 | ||
| 339 | /* Emacs' internal format for encoding multiple character sets is a | 337 | /* Emacs' internal format for encoding multiple character sets is a |
| 340 | kind of multi-byte encoding, i.e. encoding a character by a sequence | 338 | kind of multi-byte encoding, i.e. characters are encoded by |
| 341 | of one-byte codes of variable length. ASCII characters and control | 339 | variable-length sequences of one-byte codes. ASCII characters |
| 342 | characters (e.g. `tab', `newline') are represented by one-byte as | 340 | and control characters (e.g. `tab', `newline') are represented by |
| 343 | is. It takes the range 0x00 through 0x7F. The other characters | 341 | one-byte sequences which are their ASCII codes, in the range 0x00 |
| 344 | are represented by a sequence of `base leading-code', optional | 342 | through 0x7F. The other characters are represented by a sequence |
| 345 | `extended leading-code', and one or two `position-code's. Length | 343 | of `base leading-code', optional `extended leading-code', and one |
| 346 | of the sequence is decided by the base leading-code. Leading-code | 344 | or two `position-code's. The length of the sequence is determined |
| 347 | takes the range 0x80 through 0x9F, whereas extended leading-code | 345 | by the base leading-code. Leading-code takes the range 0x80 |
| 348 | and position-code take the range 0xA0 through 0xFF. See the | 346 | through 0x9F, whereas extended leading-code and position-code take |
| 349 | document of `charset.h' for more detail about leading-code and | 347 | the range 0xA0 through 0xFF. See `charset.h' for more details |
| 350 | position-code. | 348 | about leading-code and position-code. |
| 351 | 349 | ||
| 352 | There's one exception in this rule. Special leading-code | 350 | There's one exception to this rule. Special leading-code |
| 353 | `leading-code-composition' denotes that the following several | 351 | `leading-code-composition' denotes that the following several |
| 354 | characters should be composed into one character. Leading-codes of | 352 | characters should be composed into one character. Leading-codes of |
| 355 | components (except for ASCII) are added 0x20. An ASCII character | 353 | components (except for ASCII) are added 0x20. An ASCII character |
| 356 | component is represented by a 2-byte sequence of `0xA0' and | 354 | component is represented by a 2-byte sequence of `0xA0' and |
| 357 | `ASCII-code + 0x80'. See also the document in `charset.h' for the | 355 | `ASCII-code + 0x80'. See also the comments in `charset.h' for the |
| 358 | detail of composite character. Hence, we can summarize the code | 356 | details of composite characters. Hence, we can summarize the code |
| 359 | range as follows: | 357 | range as follows: |
| 360 | 358 | ||
| 361 | --- CODE RANGE of Emacs' internal format --- | 359 | --- CODE RANGE of Emacs' internal format --- |
| @@ -447,21 +445,21 @@ detect_coding_emacs_mule (src, src_end) | |||
| 447 | /*** 3. ISO2022 handlers ***/ | 445 | /*** 3. ISO2022 handlers ***/ |
| 448 | 446 | ||
| 449 | /* The following note describes the coding system ISO2022 briefly. | 447 | /* The following note describes the coding system ISO2022 briefly. |
| 450 | Since the intension of this note is to help understanding of the | 448 | Since the intention of this note is to help in understanding |
| 451 | programs in this file, some parts are NOT ACCURATE or OVERLY | 449 | the programs in this file, some parts are NOT ACCURATE or OVERLY |
| 452 | SIMPLIFIED. For the thorough understanding, please refer to the | 450 | SIMPLIFIED. For the thorough understanding, please refer to the |
| 453 | original document of ISO2022. | 451 | original document of ISO2022. |
| 454 | 452 | ||
| 455 | ISO2022 provides many mechanisms to encode several character sets | 453 | ISO2022 provides many mechanisms to encode several character sets |
| 456 | in 7-bit and 8-bit environment. If one choose 7-bite environment, | 454 | in 7-bit and 8-bit environment. If one chooses a 7-bit environment, |
| 457 | all text is encoded by codes of less than 128. This may make the | 455 | all text is encoded by codes of less than 128. This may make the |
| 458 | encoded text a little bit longer, but the text get more stability | 456 | encoded text a little bit longer, but the text gets more stability |
| 459 | to pass through several gateways (some of them split MSB off). | 457 | to pass through several gateways (some of them strip off the MSB). |
| 460 | 458 | ||
| 461 | There are two kind of character set: control character set and | 459 | There are two kinds of character set: control character set and |
| 462 | graphic character set. The former contains control characters such | 460 | graphic character set. The former contains control characters such |
| 463 | as `newline' and `escape' to provide control functions (control | 461 | as `newline' and `escape' to provide control functions (control |
| 464 | functions are provided also by escape sequence). The latter | 462 | functions are provided also by escape sequences). The latter |
| 465 | contains graphic characters such as ' A' and '-'. Emacs recognizes | 463 | contains graphic characters such as ' A' and '-'. Emacs recognizes |
| 466 | two control character sets and many graphic character sets. | 464 | two control character sets and many graphic character sets. |
| 467 | 465 | ||
| @@ -565,7 +563,7 @@ detect_coding_emacs_mule (src, src_end) | |||
| 565 | '(' can be omitted. We call this as "short-form" here after. | 563 | '(' can be omitted. We call this as "short-form" here after. |
| 566 | 564 | ||
| 567 | Now you may notice that there are a lot of ways for encoding the | 565 | Now you may notice that there are a lot of ways for encoding the |
| 568 | same multilingual text in ISO2022. Actually, there exist many | 566 | same multilingual text in ISO2022. Actually, there exist many |
| 569 | coding systems such as Compound Text (used in X's inter client | 567 | coding systems such as Compound Text (used in X's inter client |
| 570 | communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR | 568 | communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR |
| 571 | (used in Korean Internet), EUC (Extended UNIX Code, used in Asian | 569 | (used in Korean Internet), EUC (Extended UNIX Code, used in Asian |
| @@ -1018,10 +1016,10 @@ decode_coding_iso2022 (coding, source, destination, | |||
| 1018 | return dst - destination; | 1016 | return dst - destination; |
| 1019 | } | 1017 | } |
| 1020 | 1018 | ||
| 1021 | /* ISO2022 encoding staffs. */ | 1019 | /* ISO2022 encoding stuff. */ |
| 1022 | 1020 | ||
| 1023 | /* | 1021 | /* |
| 1024 | It is not enough to say just "ISO2022" on encoding, but we have to | 1022 | It is not enough to say just "ISO2022" on encoding; we have to |
| 1025 | specify more details. In Emacs, each coding-system of ISO2022 | 1023 | specify more details. In Emacs, each coding-system of ISO2022 |
| 1026 | variant has the following specifications: | 1024 | variant has the following specifications: |
| 1027 | 1. Initial designation to G0 thru G3. | 1025 | 1. Initial designation to G0 thru G3. |
| @@ -1036,7 +1034,7 @@ decode_coding_iso2022 (coding, source, destination, | |||
| 1036 | 9. Use JISX0208-1983 in place of JISX0208-1978? | 1034 | 9. Use JISX0208-1983 in place of JISX0208-1978? |
| 1037 | These specifications are encoded in `coding->flags' as flag bits | 1035 | These specifications are encoded in `coding->flags' as flag bits |
| 1038 | defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more | 1036 | defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more |
| 1039 | detail. | 1037 | details. |
| 1040 | */ | 1038 | */ |
| 1041 | 1039 | ||
| 1042 | /* Produce codes (escape sequence) for designating CHARSET to graphic | 1040 | /* Produce codes (escape sequence) for designating CHARSET to graphic |
| @@ -1132,8 +1130,8 @@ decode_coding_iso2022 (coding, source, destination, | |||
| 1132 | CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \ | 1130 | CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \ |
| 1133 | } while (0) | 1131 | } while (0) |
| 1134 | 1132 | ||
| 1135 | /* Produce codes for a DIMENSION1 character of which character set is | 1133 | /* Produce codes for a DIMENSION1 character whose character set is |
| 1136 | CHARSET and position-code is C1. Designation and invocation | 1134 | CHARSET and whose position-code is C1. Designation and invocation |
| 1137 | sequences are also produced in advance if necessary. */ | 1135 | sequences are also produced in advance if necessary. */ |
| 1138 | 1136 | ||
| 1139 | 1137 | ||
| @@ -1166,8 +1164,8 @@ decode_coding_iso2022 (coding, source, destination, | |||
| 1166 | dst = encode_invocation_designation (charset, coding, dst); \ | 1164 | dst = encode_invocation_designation (charset, coding, dst); \ |
| 1167 | } while (1) | 1165 | } while (1) |
| 1168 | 1166 | ||
| 1169 | /* Produce codes for a DIMENSION2 character of which character set is | 1167 | /* Produce codes for a DIMENSION2 character whose character set is |
| 1170 | CHARSET and position-codes are C1 and C2. Designation and | 1168 | CHARSET and whose position-codes are C1 and C2. Designation and |
| 1171 | invocation codes are also produced in advance if necessary. */ | 1169 | invocation codes are also produced in advance if necessary. */ |
| 1172 | 1170 | ||
| 1173 | #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \ | 1171 | #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \ |
| @@ -1552,7 +1550,7 @@ encode_coding_iso2022 (coding, source, destination, | |||
| 1552 | 1550 | ||
| 1553 | /*** 4. SJIS and BIG5 handlers ***/ | 1551 | /*** 4. SJIS and BIG5 handlers ***/ |
| 1554 | 1552 | ||
| 1555 | /* Although SJIS and BIG5 are not ISO's coding system, They are used | 1553 | /* Although SJIS and BIG5 are not ISO coding systems, they are used |
| 1556 | quite widely. So, for the moment, Emacs supports them in the bare | 1554 | quite widely. So, for the moment, Emacs supports them in the bare |
| 1557 | C code. But, in the future, they may be supported only by CCL. */ | 1555 | C code. But, in the future, they may be supported only by CCL. */ |
| 1558 | 1556 | ||
| @@ -2167,7 +2165,7 @@ setup_coding_system (coding_system, coding) | |||
| 2167 | { | 2165 | { |
| 2168 | Lisp_Object type, eol_type; | 2166 | Lisp_Object type, eol_type; |
| 2169 | 2167 | ||
| 2170 | /* At first, set several fields default values. */ | 2168 | /* First, set several fields to default values. */ |
| 2171 | coding->require_flushing = 0; | 2169 | coding->require_flushing = 0; |
| 2172 | coding->last_block = 0; | 2170 | coding->last_block = 0; |
| 2173 | coding->selective = 0; | 2171 | coding->selective = 0; |