aboutsummaryrefslogtreecommitdiffstats
path: root/src/coding.c
diff options
context:
space:
mode:
authorKenichi Handa2002-03-08 00:19:39 +0000
committerKenichi Handa2002-03-08 00:19:39 +0000
commite19c3639afcf187e326399bb31808d0eedb963cf (patch)
treef21c431c6a01ca6be76b673f0c23abd8935775e2 /src/coding.c
parented9d8bdadcb6cc5b56a071154cf1e122fad63c93 (diff)
downloademacs-e19c3639afcf187e326399bb31808d0eedb963cf.tar.gz
emacs-e19c3639afcf187e326399bb31808d0eedb963cf.zip
(encode_coding_utf_8): Initialize produced_chars to 0.
(decode_coding_utf_16): Fix converting high and low bytes to code-point.
(encode_coding_utf_16): Substitute coding->default_char for non-Unicode characters.
(decode_coding): Don't call record_insert here.
(setup_coding_system): Initialize `surrogate' of coding->spec.utf_16 to 0.
(EMIT_ONE_BYTE): Fix for multibyte case.
Diffstat (limited to 'src/coding.c')
-rw-r--r--src/coding.c111
1 files changed, 59 insertions, 52 deletions
diff --git a/src/coding.c b/src/coding.c
index 1c6a84d6f8d..60b2d3658c9 100644
--- a/src/coding.c
+++ b/src/coding.c
@@ -46,31 +46,23 @@ Boston, MA 02111-1307, USA. */
46 46
47CODING SYSTEM 47CODING SYSTEM
48 48
49 Coding system is an encoding mechanism of one or more character 49 Coding system is an object for an encoding mechanism that contains
50 sets. Here's a list of coding system types supported by Emacs. 50 information about how to convert byte sequence to character
51 When we say "decode", it means converting a text encoded by some 51 sequences and vice versa. When we say "decode", it means converting
52 coding system into Emacs' internal format (emacs-utf-8), and when we 52 a byte sequence of a specific coding system into a character
53 say "encode", it means converting a text of emacs-utf-8 to some 53 sequence that is represented by Emacs' internal coding system
54 other coding system. 54 `emacs-utf-8', and when we say "encode", it means converting a
55 55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 Emacs represents a coding system by a Lisp symbol. Each symbol is a 56 coding system.
57 key to the hash table Vcharset_hash_table. This hash table 57
58 associates the symbol to the corresponding detailed specifications. 58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 59 C level, a coding system is represented by a vector of attributes
60 Before using a coding system for decoding and encoding, we setup a 60 stored in the hash table Vcharset_hash_table. The conversion from a
61 structure of type `struct coding_system'. This structure keeps 61 coding system symbol to attributes vector is done by looking up
62 various information about a specific code conversion (e.g. the 62 Vcharset_hash_table by the symbol.
63 location of source and destination data). 63
64 64 Coding systems are classified into the following types depending on
65 Coding systems are classified into the following types by how to 65 the mechanism of encoding. Here's a brief description about type.
66 represent a character in a byte sequence. Here's a brief description
67 about type.
68
69 o Emacs' internal format (emacs-utf-8)
70
71 The extended UTF-8 which allows eight-bit raw bytes mixed with
72 character codes. Emacs holds characters in buffers and strings by
73 this format.
74 66
75 o UTF-8 67 o UTF-8
76 68
@@ -137,6 +129,13 @@ END-OF-LINE FORMAT
137 independent, any coding system described above can take any format 129 independent, any coding system described above can take any format
138 of end-of-line (except for no-conversion). 130 of end-of-line (except for no-conversion).
139 131
132STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
138
140*/ 139*/
141 140
142/* COMMON MACROS */ 141/* COMMON MACROS */
@@ -818,19 +817,27 @@ static int detected_mask[coding_category_raw_text] =
818 817
819/* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */ 818/* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
820 819
821#define EMIT_TWO_BYTES(c1, c2) \ 820#define EMIT_TWO_BYTES(c1, c2) \
822 do { \ 821 do { \
823 produced_chars += 2; \ 822 produced_chars += 2; \
824 if (multibytep) \ 823 if (multibytep) \
825 { \ 824 { \
826 CHAR_STRING_ADVANCE ((int) (c1), dst); \ 825 int ch; \
827 CHAR_STRING_ADVANCE ((int) (c2), dst); \ 826 \
828 } \ 827 ch = (c1); \
829 else \ 828 if (ch >= 0x80) \
830 { \ 829 ch = BYTE8_TO_CHAR (ch); \
831 *dst++ = (c1); \ 830 CHAR_STRING_ADVANCE (ch, dst); \
832 *dst++ = (c2); \ 831 ch = (c2); \
833 } \ 832 if (ch >= 0x80) \
833 ch = BYTE8_TO_CHAR (ch); \
834 CHAR_STRING_ADVANCE (ch, dst); \
835 } \
836 else \
837 { \
838 *dst++ = (c1); \
839 *dst++ = (c2); \
840 } \
834 } while (0) 841 } while (0)
835 842
836 843
@@ -889,10 +896,14 @@ coding_set_source (coding)
889 coding->source = GAP_END_ADDR + coding->src_pos_byte; 896 coding->source = GAP_END_ADDR + coding->src_pos_byte;
890 else 897 else
891 { 898 {
892 if (coding->src_pos < GPT 899 struct buffer *buf = XBUFFER (coding->src_object);
893 && coding->src_pos + coding->src_chars >= GPT) 900 EMACS_INT beg_byte = BUF_BEG_BYTE (buf);
894 move_gap_both (coding->src_pos, coding->src_pos_byte); 901 EMACS_INT gpt_byte = BUF_GPT_BYTE (buf);
895 coding->source = BYTE_POS_ADDR (coding->src_pos_byte); 902 unsigned char *beg_addr = BUF_BEG_ADDR (buf);
903
904 coding->source = beg_addr + coding->src_pos_byte - 1;
905 if (coding->src_pos_byte >= gpt_byte)
906 coding->source += BUF_GAP_SIZE (buf);
896 } 907 }
897 } 908 }
898 else if (STRINGP (coding->src_object)) 909 else if (STRINGP (coding->src_object))
@@ -1182,7 +1193,7 @@ encode_coding_utf_8 (coding)
1182 int *charbuf_end = charbuf + coding->charbuf_used; 1193 int *charbuf_end = charbuf + coding->charbuf_used;
1183 unsigned char *dst = coding->destination + coding->produced; 1194 unsigned char *dst = coding->destination + coding->produced;
1184 unsigned char *dst_end = coding->destination + coding->dst_bytes; 1195 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1185 int produced_chars; 1196 int produced_chars = 0;
1186 int c; 1197 int c;
1187 1198
1188 if (multibytep) 1199 if (multibytep)
@@ -1290,7 +1301,7 @@ decode_coding_utf_16 (coding)
1290 src_base = src; 1301 src_base = src;
1291 ONE_MORE_BYTE (c1); 1302 ONE_MORE_BYTE (c1);
1292 ONE_MORE_BYTE (c2); 1303 ONE_MORE_BYTE (c2);
1293 c = (c1 << 16) | c2; 1304 c = (c1 << 8) | c2;
1294 if (bom == utf_16_with_bom) 1305 if (bom == utf_16_with_bom)
1295 { 1306 {
1296 if (endian == utf_16_big_endian 1307 if (endian == utf_16_big_endian
@@ -1333,7 +1344,7 @@ decode_coding_utf_16 (coding)
1333 ONE_MORE_BYTE (c1); 1344 ONE_MORE_BYTE (c1);
1334 ONE_MORE_BYTE (c2); 1345 ONE_MORE_BYTE (c2);
1335 c = (endian == utf_16_big_endian 1346 c = (endian == utf_16_big_endian
1336 ? ((c1 << 16) | c2) : ((c2 << 16) | c1)); 1347 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1337 if (surrogate) 1348 if (surrogate)
1338 { 1349 {
1339 if (! UTF_16_LOW_SURROGATE_P (c)) 1350 if (! UTF_16_LOW_SURROGATE_P (c))
@@ -1404,8 +1415,8 @@ encode_coding_utf_16 (coding)
1404 { 1415 {
1405 ASSURE_DESTINATION (safe_room); 1416 ASSURE_DESTINATION (safe_room);
1406 c = *charbuf++; 1417 c = *charbuf++;
1407 if (c >= 0x110000) 1418 if (c >= MAX_UNICODE_CHAR)
1408 c = 0xFFFF; 1419 c = coding->default_char;
1409 1420
1410 if (c < 0x10000) 1421 if (c < 0x10000)
1411 { 1422 {
@@ -4504,6 +4515,7 @@ setup_coding_system (coding_system, coding)
4504 val = AREF (attrs, coding_attr_utf_16_endian); 4515 val = AREF (attrs, coding_attr_utf_16_endian);
4505 CODING_UTF_16_ENDIAN (coding) = (NILP (val) ? utf_16_big_endian 4516 CODING_UTF_16_ENDIAN (coding) = (NILP (val) ? utf_16_big_endian
4506 : utf_16_little_endian); 4517 : utf_16_little_endian);
4518 CODING_UTF_16_SURROGATE (coding) = 0;
4507 coding->detector = detect_coding_utf_16; 4519 coding->detector = detect_coding_utf_16;
4508 coding->decoder = decode_coding_utf_16; 4520 coding->decoder = decode_coding_utf_16;
4509 coding->encoder = encode_coding_utf_16; 4521 coding->encoder = encode_coding_utf_16;
@@ -5458,11 +5470,6 @@ decode_coding (coding)
5458 coding->consumed = coding->src_bytes; 5470 coding->consumed = coding->src_bytes;
5459 } 5471 }
5460 5472
5461 if (BUFFERP (coding->dst_object))
5462 {
5463 record_insert (coding->dst_pos, coding->produced_char);
5464 }
5465
5466 return coding->result; 5473 return coding->result;
5467} 5474}
5468 5475