diff options
| author | Kenichi Handa | 2002-03-08 00:19:39 +0000 |
|---|---|---|
| committer | Kenichi Handa | 2002-03-08 00:19:39 +0000 |
| commit | e19c3639afcf187e326399bb31808d0eedb963cf (patch) | |
| tree | f21c431c6a01ca6be76b673f0c23abd8935775e2 /src/coding.c | |
| parent | ed9d8bdadcb6cc5b56a071154cf1e122fad63c93 (diff) | |
| download | emacs-e19c3639afcf187e326399bb31808d0eedb963cf.tar.gz emacs-e19c3639afcf187e326399bb31808d0eedb963cf.zip | |
(encode_coding_utf_8): Initialize produced_chars to 0.
(decode_coding_utf_16): Fix converting high and low bytes to
code-point.
(encode_coding_utf_16): Substitute coding->default_char for
non-Unicode characters.
(decode_coding): Don't call record_insert here.
(setup_coding_system): Initialize `surrogate' of
coding->spec.utf_16 to 0.
(EMIT_ONE_BYTE): Fix for multibyte case.
Diffstat (limited to 'src/coding.c')
| -rw-r--r-- | src/coding.c | 111 |
1 files changed, 59 insertions, 52 deletions
diff --git a/src/coding.c b/src/coding.c index 1c6a84d6f8d..60b2d3658c9 100644 --- a/src/coding.c +++ b/src/coding.c | |||
| @@ -46,31 +46,23 @@ Boston, MA 02111-1307, USA. */ | |||
| 46 | 46 | ||
| 47 | CODING SYSTEM | 47 | CODING SYSTEM |
| 48 | 48 | ||
| 49 | Coding system is an encoding mechanism of one or more character | 49 | Coding system is an object for an encoding mechanism that contains |
| 50 | sets. Here's a list of coding system types supported by Emacs. | 50 | information about how to convert byte sequence to character |
| 51 | When we say "decode", it means converting a text encoded by some | 51 | sequences and vice versa. When we say "decode", it means converting |
| 52 | coding system into Emacs' internal format (emacs-utf-8), and when we | 52 | a byte sequence of a specific coding system into a character |
| 53 | say "encode", it means converting a text of emacs-utf-8 to some | 53 | sequence that is represented by Emacs' internal coding system |
| 54 | other coding system. | 54 | `emacs-utf-8', and when we say "encode", it means converting a |
| 55 | 55 | character sequence of emacs-utf-8 to a byte sequence of a specific | |
| 56 | Emacs represents a coding system by a Lisp symbol. Each symbol is a | 56 | coding system. |
| 57 | key to the hash table Vcharset_hash_table. This hash table | 57 | |
| 58 | associates the symbol to the corresponding detailed specifications. | 58 | In Emacs Lisp, a coding system is represented by a Lisp symbol. In |
| 59 | 59 | C level, a coding system is represented by a vector of attributes | |
| 60 | Before using a coding system for decoding and encoding, we set up a | 60 | stored in the hash table Vcharset_hash_table. The conversion from a |
| 61 | structure of type `struct coding_system'. This structure keeps | 61 | coding system symbol to attributes vector is done by looking up |
| 62 | various information about a specific code conversion (e.g. the | 62 | Vcharset_hash_table by the symbol. |
| 63 | location of source and destination data). | 63 | |
| 64 | 64 | Coding systems are classified into the following types depending on | |
| 65 | Coding systems are classified into the following types by how to | 65 | the mechanism of encoding. Here's a brief description about type. |
| 66 | represent a character in a byte sequence. Here's a brief description | ||
| 67 | about type. | ||
| 68 | |||
| 69 | o Emacs' internal format (emacs-utf-8) | ||
| 70 | |||
| 71 | The extended UTF-8 which allows eight-bit raw bytes mixed with | ||
| 72 | character codes. Emacs holds characters in buffers and strings by | ||
| 73 | this format. | ||
| 74 | 66 | ||
| 75 | o UTF-8 | 67 | o UTF-8 |
| 76 | 68 | ||
| @@ -137,6 +129,13 @@ END-OF-LINE FORMAT | |||
| 137 | independent, any coding system described above can take any format | 129 | independent, any coding system described above can take any format |
| 138 | of end-of-line (except for no-conversion). | 130 | of end-of-line (except for no-conversion). |
| 139 | 131 | ||
| 132 | STRUCT CODING_SYSTEM | ||
| 133 | |||
| 134 | Before using a coding system for code conversion (i.e. decoding and | ||
| 135 | encoding), we set up a structure of type `struct coding_system'. | ||
| 136 | This structure keeps various information about a specific code | ||
| 137 | conversion (e.g. the location of source and destination data). | ||
| 138 | |||
| 140 | */ | 139 | */ |
| 141 | 140 | ||
| 142 | /* COMMON MACROS */ | 141 | /* COMMON MACROS */ |
| @@ -818,19 +817,27 @@ static int detected_mask[coding_category_raw_text] = | |||
| 818 | 817 | ||
| 819 | /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */ | 818 | /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */ |
| 820 | 819 | ||
| 821 | #define EMIT_TWO_BYTES(c1, c2) \ | 820 | #define EMIT_TWO_BYTES(c1, c2) \ |
| 822 | do { \ | 821 | do { \ |
| 823 | produced_chars += 2; \ | 822 | produced_chars += 2; \ |
| 824 | if (multibytep) \ | 823 | if (multibytep) \ |
| 825 | { \ | 824 | { \ |
| 826 | CHAR_STRING_ADVANCE ((int) (c1), dst); \ | 825 | int ch; \ |
| 827 | CHAR_STRING_ADVANCE ((int) (c2), dst); \ | 826 | \ |
| 828 | } \ | 827 | ch = (c1); \ |
| 829 | else \ | 828 | if (ch >= 0x80) \ |
| 830 | { \ | 829 | ch = BYTE8_TO_CHAR (ch); \ |
| 831 | *dst++ = (c1); \ | 830 | CHAR_STRING_ADVANCE (ch, dst); \ |
| 832 | *dst++ = (c2); \ | 831 | ch = (c2); \ |
| 833 | } \ | 832 | if (ch >= 0x80) \ |
| 833 | ch = BYTE8_TO_CHAR (ch); \ | ||
| 834 | CHAR_STRING_ADVANCE (ch, dst); \ | ||
| 835 | } \ | ||
| 836 | else \ | ||
| 837 | { \ | ||
| 838 | *dst++ = (c1); \ | ||
| 839 | *dst++ = (c2); \ | ||
| 840 | } \ | ||
| 834 | } while (0) | 841 | } while (0) |
| 835 | 842 | ||
| 836 | 843 | ||
| @@ -889,10 +896,14 @@ coding_set_source (coding) | |||
| 889 | coding->source = GAP_END_ADDR + coding->src_pos_byte; | 896 | coding->source = GAP_END_ADDR + coding->src_pos_byte; |
| 890 | else | 897 | else |
| 891 | { | 898 | { |
| 892 | if (coding->src_pos < GPT | 899 | struct buffer *buf = XBUFFER (coding->src_object); |
| 893 | && coding->src_pos + coding->src_chars >= GPT) | 900 | EMACS_INT beg_byte = BUF_BEG_BYTE (buf); |
| 894 | move_gap_both (coding->src_pos, coding->src_pos_byte); | 901 | EMACS_INT gpt_byte = BUF_GPT_BYTE (buf); |
| 895 | coding->source = BYTE_POS_ADDR (coding->src_pos_byte); | 902 | unsigned char *beg_addr = BUF_BEG_ADDR (buf); |
| 903 | |||
| 904 | coding->source = beg_addr + coding->src_pos_byte - 1; | ||
| 905 | if (coding->src_pos_byte >= gpt_byte) | ||
| 906 | coding->source += BUF_GAP_SIZE (buf); | ||
| 896 | } | 907 | } |
| 897 | } | 908 | } |
| 898 | else if (STRINGP (coding->src_object)) | 909 | else if (STRINGP (coding->src_object)) |
| @@ -1182,7 +1193,7 @@ encode_coding_utf_8 (coding) | |||
| 1182 | int *charbuf_end = charbuf + coding->charbuf_used; | 1193 | int *charbuf_end = charbuf + coding->charbuf_used; |
| 1183 | unsigned char *dst = coding->destination + coding->produced; | 1194 | unsigned char *dst = coding->destination + coding->produced; |
| 1184 | unsigned char *dst_end = coding->destination + coding->dst_bytes; | 1195 | unsigned char *dst_end = coding->destination + coding->dst_bytes; |
| 1185 | int produced_chars; | 1196 | int produced_chars = 0; |
| 1186 | int c; | 1197 | int c; |
| 1187 | 1198 | ||
| 1188 | if (multibytep) | 1199 | if (multibytep) |
| @@ -1290,7 +1301,7 @@ decode_coding_utf_16 (coding) | |||
| 1290 | src_base = src; | 1301 | src_base = src; |
| 1291 | ONE_MORE_BYTE (c1); | 1302 | ONE_MORE_BYTE (c1); |
| 1292 | ONE_MORE_BYTE (c2); | 1303 | ONE_MORE_BYTE (c2); |
| 1293 | c = (c1 << 16) | c2; | 1304 | c = (c1 << 8) | c2; |
| 1294 | if (bom == utf_16_with_bom) | 1305 | if (bom == utf_16_with_bom) |
| 1295 | { | 1306 | { |
| 1296 | if (endian == utf_16_big_endian | 1307 | if (endian == utf_16_big_endian |
| @@ -1333,7 +1344,7 @@ decode_coding_utf_16 (coding) | |||
| 1333 | ONE_MORE_BYTE (c1); | 1344 | ONE_MORE_BYTE (c1); |
| 1334 | ONE_MORE_BYTE (c2); | 1345 | ONE_MORE_BYTE (c2); |
| 1335 | c = (endian == utf_16_big_endian | 1346 | c = (endian == utf_16_big_endian |
| 1336 | ? ((c1 << 16) | c2) : ((c2 << 16) | c1)); | 1347 | ? ((c1 << 8) | c2) : ((c2 << 8) | c1)); |
| 1337 | if (surrogate) | 1348 | if (surrogate) |
| 1338 | { | 1349 | { |
| 1339 | if (! UTF_16_LOW_SURROGATE_P (c)) | 1350 | if (! UTF_16_LOW_SURROGATE_P (c)) |
| @@ -1404,8 +1415,8 @@ encode_coding_utf_16 (coding) | |||
| 1404 | { | 1415 | { |
| 1405 | ASSURE_DESTINATION (safe_room); | 1416 | ASSURE_DESTINATION (safe_room); |
| 1406 | c = *charbuf++; | 1417 | c = *charbuf++; |
| 1407 | if (c >= 0x110000) | 1418 | if (c >= MAX_UNICODE_CHAR) |
| 1408 | c = 0xFFFF; | 1419 | c = coding->default_char; |
| 1409 | 1420 | ||
| 1410 | if (c < 0x10000) | 1421 | if (c < 0x10000) |
| 1411 | { | 1422 | { |
| @@ -4504,6 +4515,7 @@ setup_coding_system (coding_system, coding) | |||
| 4504 | val = AREF (attrs, coding_attr_utf_16_endian); | 4515 | val = AREF (attrs, coding_attr_utf_16_endian); |
| 4505 | CODING_UTF_16_ENDIAN (coding) = (NILP (val) ? utf_16_big_endian | 4516 | CODING_UTF_16_ENDIAN (coding) = (NILP (val) ? utf_16_big_endian |
| 4506 | : utf_16_little_endian); | 4517 | : utf_16_little_endian); |
| 4518 | CODING_UTF_16_SURROGATE (coding) = 0; | ||
| 4507 | coding->detector = detect_coding_utf_16; | 4519 | coding->detector = detect_coding_utf_16; |
| 4508 | coding->decoder = decode_coding_utf_16; | 4520 | coding->decoder = decode_coding_utf_16; |
| 4509 | coding->encoder = encode_coding_utf_16; | 4521 | coding->encoder = encode_coding_utf_16; |
| @@ -5458,11 +5470,6 @@ decode_coding (coding) | |||
| 5458 | coding->consumed = coding->src_bytes; | 5470 | coding->consumed = coding->src_bytes; |
| 5459 | } | 5471 | } |
| 5460 | 5472 | ||
| 5461 | if (BUFFERP (coding->dst_object)) | ||
| 5462 | { | ||
| 5463 | record_insert (coding->dst_pos, coding->produced_char); | ||
| 5464 | } | ||
| 5465 | |||
| 5466 | return coding->result; | 5473 | return coding->result; |
| 5467 | } | 5474 | } |
| 5468 | 5475 | ||