(encode_coding_utf_8): Initialize produced_chars to 0.

(decode_coding_utf_16): Fix converting high and low bytes to code-point.
(encode_coding_utf_16): Substitute coding->default_char for non-Unicode characters.
(decode_coding): Don't call record_insert here.
(setup_coding_system): Initialize `surrogate' of coding->spec.utf_16 to 0.
(EMIT_ONE_BYTE): Fix for multibyte case.
author: Kenichi Handa 2002-03-08 00:19:39 +0000
committer: Kenichi Handa 2002-03-08 00:19:39 +0000
commit: e19c3639afcf187e326399bb31808d0eedb963cf (patch)
tree: f21c431c6a01ca6be76b673f0c23abd8935775e2 /src/coding.c
parent: ed9d8bdadcb6cc5b56a071154cf1e122fad63c93 (diff)
download: emacs-e19c3639afcf187e326399bb31808d0eedb963cf.tar.gz
emacs-e19c3639afcf187e326399bb31808d0eedb963cf.zip
1 files changed, 59 insertions, 52 deletions
diff --git a/src/coding.c b/src/coding.c
index 1c6a84d6f8d..60b2d3658c9 100644
--- a/src/coding.c
+++ b/src/coding.c
@@ -46,31 +46,23 @@ Boston, MA 02111-1307, USA.  */
 CODING SYSTEM
-  Coding system is an encoding mechanism of one or more character
+  Coding system is an object for a encoding mechanism that contains
-  sets.  Here's a list of coding system types supported by Emacs.
+  information about how to convert byte sequence to character
-  When we say "decode", it means converting a text encoded by some
+  sequences and vice versa.  When we say "decode", it means converting
-  coding system into Emacs' internal format (emacs-utf-8), and when we
+  a byte sequence of a specific coding system into a character
-  say "encode", it means converting a text of emacs-utf-8 to some
+  sequence that is represented by Emacs' internal coding system
-  other coding system.
+  `emacs-utf-8', and when we say "encode", it means converting a
+  character sequence of emacs-utf-8 to a byte sequence of a specific
-  Emacs represents a coding system by a Lisp symbol.  Each symbol is a
+  coding system.
-  key to the hash table Vcharset_hash_table.  This hash table
-  associates the symbol to the corresponding detailed specifications.
+  In Emacs Lisp, a coding system is represented by a Lisp symbol.  In
+  C level, a coding system is represented by a vector of attributes
-  Before using a coding system for decoding and encoding, we setup a
+  stored in the hash table Vcharset_hash_table.  The conversion from a
-  structure of type `struct coding_system'.  This structure keeps
+  coding system symbol to attributes vector is done by looking up
-  various information about a specific code conversion (e.g.  the
+  Vcharset_hash_table by the symbol.
-  location of source and destination data).
+  Coding systems are classified into the following types depending on
-  Coding systems are classified into the following types by how to
+  the mechanism of encoding.  Here's a brief descrition about type.
-  represent a character in a byte sequence.  Here's a brief descrition
-  about type.
-  o Emacs' internal format (emacs-utf-8)
-  The extended UTF-8 which allows eight-bit raw bytes mixed with
-  character codes.  Emacs holds characters in buffers and strings by
-  this format.
  o UTF-8
@@ -137,6 +129,13 @@ END-OF-LINE FORMAT
  independent, any coding system described above can take any format
  of end-of-line (except for no-conversion).
+STRUCT CODING_SYSTEM
+  Before using a coding system for code conversion (i.e. decoding and
+  encoding), we setup a structure of type `struct coding_system'.
+  This structure keeps various information about a specific code
+  conversion (e.g.  the location of source and destination data).
 */
 /* COMMON MACROS */
@@ -818,19 +817,27 @@ static int detected_mask[coding_category_raw_text] =
 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
-#define EMIT_TWO_BYTES(c1, c2)                  \
+#define EMIT_TWO_BYTES(c1, c2)          \
-  do {                                          \
+  do {                                  \
-    produced_chars += 2;                        \
+    produced_chars += 2;                \
-    if (multibytep)                             \
+    if (multibytep)                     \
-      {                                         \
+      {                                 \
-        CHAR_STRING_ADVANCE ((int) (c1), dst);  \
+        int ch;                         \
-        CHAR_STRING_ADVANCE ((int) (c2), dst);  \
+                                        \
-      }                                         \
+        ch = (c1);                      \
-    else                                        \
+        if (ch >= 0x80)                 \
-      {                                         \
+          ch = BYTE8_TO_CHAR (ch);      \
-        *dst++ = (c1);                          \
+        CHAR_STRING_ADVANCE (ch, dst);  \
-        *dst++ = (c2);                          \
+        ch = (c2);                      \
-      }                                         \
+        if (ch >= 0x80)                 \
+          ch = BYTE8_TO_CHAR (ch);      \
+        CHAR_STRING_ADVANCE (ch, dst);  \
+      }                                 \
+    else                                \
+      {                                 \
+        *dst++ = (c1);                  \
+        *dst++ = (c2);                  \
+      }                                 \
  } while (0)
@@ -889,10 +896,14 @@ coding_set_source (coding)
        coding->source = GAP_END_ADDR + coding->src_pos_byte;
      else
        {
-          if (coding->src_pos < GPT
+          struct buffer *buf = XBUFFER (coding->src_object);
-              && coding->src_pos + coding->src_chars >= GPT)
+          EMACS_INT beg_byte = BUF_BEG_BYTE (buf);
-            move_gap_both (coding->src_pos, coding->src_pos_byte);
+          EMACS_INT gpt_byte = BUF_GPT_BYTE (buf);
-          coding->source = BYTE_POS_ADDR (coding->src_pos_byte);
+          unsigned char *beg_addr = BUF_BEG_ADDR (buf);
+          coding->source = beg_addr + coding->src_pos_byte - 1;
+          if (coding->src_pos_byte >= gpt_byte)
+            coding->source += BUF_GAP_SIZE (buf);
        }
    }
  else if (STRINGP (coding->src_object))
@@ -1182,7 +1193,7 @@ encode_coding_utf_8 (coding)
  int *charbuf_end = charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
-  int produced_chars;
+  int produced_chars = 0;
  int c;
  if (multibytep)
@@ -1290,7 +1301,7 @@ decode_coding_utf_16 (coding)
      src_base = src;
      ONE_MORE_BYTE (c1);
      ONE_MORE_BYTE (c2);
-      c = (c1 << 16) | c2;
+      c = (c1 << 8) | c2;
      if (bom == utf_16_with_bom)
        {
          if (endian == utf_16_big_endian
@@ -1333,7 +1344,7 @@ decode_coding_utf_16 (coding)
      ONE_MORE_BYTE (c1);
      ONE_MORE_BYTE (c2);
      c = (endian == utf_16_big_endian
-           ? ((c1 << 16) | c2) : ((c2 << 16) | c1));
+           ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
      if (surrogate)
        {
          if (! UTF_16_LOW_SURROGATE_P (c))
@@ -1404,8 +1415,8 @@ encode_coding_utf_16 (coding)
    {
      ASSURE_DESTINATION (safe_room);
      c = *charbuf++;
-      if (c >= 0x110000)
+      if (c >= MAX_UNICODE_CHAR)
-        c = 0xFFFF;
+        c = coding->default_char;
      if (c < 0x10000)
        {
@@ -4504,6 +4515,7 @@ setup_coding_system (coding_system, coding)
      val = AREF (attrs, coding_attr_utf_16_endian);
      CODING_UTF_16_ENDIAN (coding) = (NILP (val) ? utf_16_big_endian
                                       : utf_16_little_endian);
+      CODING_UTF_16_SURROGATE (coding) = 0;
      coding->detector = detect_coding_utf_16;
      coding->decoder = decode_coding_utf_16;
      coding->encoder = encode_coding_utf_16;
@@ -5458,11 +5470,6 @@ decode_coding (coding)
      coding->consumed = coding->src_bytes;
    }
-  if (BUFFERP (coding->dst_object))
-    {
-      record_insert (coding->dst_pos, coding->produced_char);
-    }
  return coding->result;
 }
author	Kenichi Handa	2002-03-08 00:19:39 +0000
committer	Kenichi Handa	2002-03-08 00:19:39 +0000
commit	e19c3639afcf187e326399bb31808d0eedb963cf (patch)
tree	f21c431c6a01ca6be76b673f0c23abd8935775e2 /src/coding.c
parent	ed9d8bdadcb6cc5b56a071154cf1e122fad63c93 (diff)
download	emacs-e19c3639afcf187e326399bb31808d0eedb963cf.tar.gz emacs-e19c3639afcf187e326399bb31808d0eedb963cf.zip

diff --git a/src/coding.c b/src/coding.c index 1c6a84d6f8d..60b2d3658c9 100644 --- a/src/coding.c +++ b/src/coding.c
@@ -46,31 +46,23 @@ Boston, MA 02111-1307, USA. */
46		46
47	CODING SYSTEM	47	CODING SYSTEM
48		48
49	Coding system is an encoding mechanism of one or more character	49	Coding system is an object for a encoding mechanism that contains
50	sets. Here's a list of coding system types supported by Emacs.	50	information about how to convert byte sequence to character
51	When we say "decode", it means converting a text encoded by some	51	sequences and vice versa. When we say "decode", it means converting
52	coding system into Emacs' internal format (emacs-utf-8), and when we	52	a byte sequence of a specific coding system into a character
53	say "encode", it means converting a text of emacs-utf-8 to some	53	sequence that is represented by Emacs' internal coding system
54	other coding system.	54	`emacs-utf-8', and when we say "encode", it means converting a
55		55	character sequence of emacs-utf-8 to a byte sequence of a specific
56	Emacs represents a coding system by a Lisp symbol. Each symbol is a	56	coding system.
57	key to the hash table Vcharset_hash_table. This hash table	57
58	associates the symbol to the corresponding detailed specifications.	58	In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59		59	C level, a coding system is represented by a vector of attributes
60	Before using a coding system for decoding and encoding, we setup a	60	stored in the hash table Vcharset_hash_table. The conversion from a
61	structure of type `struct coding_system'. This structure keeps	61	coding system symbol to attributes vector is done by looking up
62	various information about a specific code conversion (e.g. the	62	Vcharset_hash_table by the symbol.
63	location of source and destination data).	63
64		64	Coding systems are classified into the following types depending on
65	Coding systems are classified into the following types by how to	65	the mechanism of encoding. Here's a brief descrition about type.
66	represent a character in a byte sequence. Here's a brief descrition
67	about type.
68
69	o Emacs' internal format (emacs-utf-8)
70
71	The extended UTF-8 which allows eight-bit raw bytes mixed with
72	character codes. Emacs holds characters in buffers and strings by
73	this format.
74		66
75	o UTF-8	67	o UTF-8
76		68
@@ -137,6 +129,13 @@ END-OF-LINE FORMAT
137	independent, any coding system described above can take any format	129	independent, any coding system described above can take any format
138	of end-of-line (except for no-conversion).	130	of end-of-line (except for no-conversion).
139		131
		132	STRUCT CODING_SYSTEM
		133
		134	Before using a coding system for code conversion (i.e. decoding and
		135	encoding), we setup a structure of type `struct coding_system'.
		136	This structure keeps various information about a specific code
		137	conversion (e.g. the location of source and destination data).
		138
140	*/	139	*/
141		140
142	/* COMMON MACROS */	141	/* COMMON MACROS */
@@ -818,19 +817,27 @@ static int detected_mask[coding_category_raw_text] =
818		817
819	/* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */	818	/* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
820		819
821	#define EMIT_TWO_BYTES(c1, c2) \	820	#define EMIT_TWO_BYTES(c1, c2) \
822	do { \	821	do { \
823	produced_chars += 2; \	822	produced_chars += 2; \
824	if (multibytep) \	823	if (multibytep) \
825	{ \	824	{ \
826	CHAR_STRING_ADVANCE ((int) (c1), dst); \	825	int ch; \
827	CHAR_STRING_ADVANCE ((int) (c2), dst); \	826	\
828	} \	827	ch = (c1); \
829	else \	828	if (ch >= 0x80) \
830	{ \	829	ch = BYTE8_TO_CHAR (ch); \
831	*dst++ = (c1); \	830	CHAR_STRING_ADVANCE (ch, dst); \
832	*dst++ = (c2); \	831	ch = (c2); \
833	} \	832	if (ch >= 0x80) \
		833	ch = BYTE8_TO_CHAR (ch); \
		834	CHAR_STRING_ADVANCE (ch, dst); \
		835	} \
		836	else \
		837	{ \
		838	*dst++ = (c1); \
		839	*dst++ = (c2); \
		840	} \
834	} while (0)	841	} while (0)
835		842
836		843
@@ -889,10 +896,14 @@ coding_set_source (coding)
889	coding->source = GAP_END_ADDR + coding->src_pos_byte;	896	coding->source = GAP_END_ADDR + coding->src_pos_byte;
890	else	897	else
891	{	898	{
892	if (coding->src_pos < GPT	899	struct buffer *buf = XBUFFER (coding->src_object);
893	&& coding->src_pos + coding->src_chars >= GPT)	900	EMACS_INT beg_byte = BUF_BEG_BYTE (buf);
894	move_gap_both (coding->src_pos, coding->src_pos_byte);	901	EMACS_INT gpt_byte = BUF_GPT_BYTE (buf);
895	coding->source = BYTE_POS_ADDR (coding->src_pos_byte);	902	unsigned char *beg_addr = BUF_BEG_ADDR (buf);
		903
		904	coding->source = beg_addr + coding->src_pos_byte - 1;
		905	if (coding->src_pos_byte >= gpt_byte)
		906	coding->source += BUF_GAP_SIZE (buf);
896	}	907	}
897	}	908	}
898	else if (STRINGP (coding->src_object))	909	else if (STRINGP (coding->src_object))
@@ -1182,7 +1193,7 @@ encode_coding_utf_8 (coding)
1182	int *charbuf_end = charbuf + coding->charbuf_used;	1193	int *charbuf_end = charbuf + coding->charbuf_used;
1183	unsigned char *dst = coding->destination + coding->produced;	1194	unsigned char *dst = coding->destination + coding->produced;
1184	unsigned char *dst_end = coding->destination + coding->dst_bytes;	1195	unsigned char *dst_end = coding->destination + coding->dst_bytes;
1185	int produced_chars;	1196	int produced_chars = 0;
1186	int c;	1197	int c;
1187		1198
1188	if (multibytep)	1199	if (multibytep)
@@ -1290,7 +1301,7 @@ decode_coding_utf_16 (coding)
1290	src_base = src;	1301	src_base = src;
1291	ONE_MORE_BYTE (c1);	1302	ONE_MORE_BYTE (c1);
1292	ONE_MORE_BYTE (c2);	1303	ONE_MORE_BYTE (c2);
1293	c = (c1 << 16) | c2;	1304	c = (c1 << 8) | c2;
1294	if (bom == utf_16_with_bom)	1305	if (bom == utf_16_with_bom)
1295	{	1306	{
1296	if (endian == utf_16_big_endian	1307	if (endian == utf_16_big_endian
@@ -1333,7 +1344,7 @@ decode_coding_utf_16 (coding)
1333	ONE_MORE_BYTE (c1);	1344	ONE_MORE_BYTE (c1);
1334	ONE_MORE_BYTE (c2);	1345	ONE_MORE_BYTE (c2);
1335	c = (endian == utf_16_big_endian	1346	c = (endian == utf_16_big_endian
1336	? ((c1 << 16) | c2) : ((c2 << 16) | c1));	1347	? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1337	if (surrogate)	1348	if (surrogate)
1338	{	1349	{
1339	if (! UTF_16_LOW_SURROGATE_P (c))	1350	if (! UTF_16_LOW_SURROGATE_P (c))
@@ -1404,8 +1415,8 @@ encode_coding_utf_16 (coding)
1404	{	1415	{
1405	ASSURE_DESTINATION (safe_room);	1416	ASSURE_DESTINATION (safe_room);
1406	c = *charbuf++;	1417	c = *charbuf++;
1407	if (c >= 0x110000)	1418	if (c >= MAX_UNICODE_CHAR)
1408	c = 0xFFFF;	1419	c = coding->default_char;
1409		1420
1410	if (c < 0x10000)	1421	if (c < 0x10000)
1411	{	1422	{
@@ -4504,6 +4515,7 @@ setup_coding_system (coding_system, coding)
4504	val = AREF (attrs, coding_attr_utf_16_endian);	4515	val = AREF (attrs, coding_attr_utf_16_endian);
4505	CODING_UTF_16_ENDIAN (coding) = (NILP (val) ? utf_16_big_endian	4516	CODING_UTF_16_ENDIAN (coding) = (NILP (val) ? utf_16_big_endian
4506	: utf_16_little_endian);	4517	: utf_16_little_endian);
		4518	CODING_UTF_16_SURROGATE (coding) = 0;
4507	coding->detector = detect_coding_utf_16;	4519	coding->detector = detect_coding_utf_16;
4508	coding->decoder = decode_coding_utf_16;	4520	coding->decoder = decode_coding_utf_16;
4509	coding->encoder = encode_coding_utf_16;	4521	coding->encoder = encode_coding_utf_16;
@@ -5458,11 +5470,6 @@ decode_coding (coding)
5458	coding->consumed = coding->src_bytes;	5470	coding->consumed = coding->src_bytes;
5459	}	5471	}
5460		5472
5461	if (BUFFERP (coding->dst_object))
5462	{
5463	record_insert (coding->dst_pos, coding->produced_char);
5464	}
5465
5466	return coding->result;	5473	return coding->result;
5467	}	5474	}
5468		5475