aboutsummaryrefslogtreecommitdiffstats
path: root/src/coding.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/coding.c')
-rw-r--r--src/coding.c1091
1 files changed, 692 insertions, 399 deletions
diff --git a/src/coding.c b/src/coding.c
index 6c898b878f4..8340e8dc271 100644
--- a/src/coding.c
+++ b/src/coding.c
@@ -144,26 +144,23 @@ STRUCT CODING_SYSTEM
144/*** GENERAL NOTES on `detect_coding_XXX ()' functions *** 144/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145 145
146 These functions check if a byte sequence specified as a source in 146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX. Return 1 if the data contains 147 CODING conforms to the format of XXX, and update the members of
148 a byte sequence which can be decoded into non-ASCII characters by 148 DETECT_INFO.
149 the coding system. Otherwise (i.e. the data contains only ASCII
150 characters or invalid sequence) return 0.
151 149
152 It also resets some bits of an integer pointed by MASK. The macros 150 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
153 CATEGORY_MASK_XXX specifies each bit of this integer.
154 151
155 Below is the template of these functions. */ 152 Below is the template of these functions. */
156 153
157#if 0 154#if 0
158static int 155static int
159detect_coding_XXX (coding, mask) 156detect_coding_XXX (coding, detect_info)
160 struct coding_system *coding; 157 struct coding_system *coding;
161 int *mask; 158 struct coding_detection_info *detect_info;
162{ 159{
163 unsigned char *src = coding->source; 160 unsigned char *src = coding->source;
164 unsigned char *src_end = coding->source + coding->src_bytes; 161 unsigned char *src_end = coding->source + coding->src_bytes;
165 int multibytep = coding->src_multibyte; 162 int multibytep = coding->src_multibyte;
166 int c; 163 int consumed_chars = 0;
167 int found = 0; 164 int found = 0;
168 ...; 165 ...;
169 166
@@ -172,18 +169,19 @@ detect_coding_XXX (coding, mask)
172 /* Get one byte from the source. If the source is exhausted, jump 169 /* Get one byte from the source. If the source is exhausted, jump
173 to no_more_source:. */ 170 to no_more_source:. */
174 ONE_MORE_BYTE (c); 171 ONE_MORE_BYTE (c);
175 /* Check if it conforms to XXX. If not, break the loop. */ 172
173 if (! __C_conforms_to_XXX___ (c))
174 break;
175 if (! __C_strongly_suggests_XXX__ (c))
176 found = CATEGORY_MASK_XXX;
176 } 177 }
177 /* As the data is invalid for XXX, reset a proper bits. */ 178 /* The byte sequence is invalid for XXX. */
178 *mask &= ~CODING_CATEGORY_XXX; 179 detect_info->rejected |= CATEGORY_MASK_XXX;
179 return 0; 180 return 0;
181
180 no_more_source: 182 no_more_source:
181 /* The source exhausted. */ 183 /* The source exhausted successfully. */
182 if (!found) 184 detect_info->found |= found;
183 /* ASCII characters only. */
184 return 0;
185 /* Some data should be decoded into non-ASCII characters. */
186 *mask &= CODING_CATEGORY_XXX;
187 return 1; 185 return 1;
188} 186}
189#endif 187#endif
@@ -408,31 +406,38 @@ Lisp_Object Vsjis_coding_system;
408Lisp_Object Vbig5_coding_system; 406Lisp_Object Vbig5_coding_system;
409 407
410 408
411static int detect_coding_utf_8 P_ ((struct coding_system *, int *)); 409static int detect_coding_utf_8 P_ ((struct coding_system *,
410 struct coding_detection_info *info));
412static void decode_coding_utf_8 P_ ((struct coding_system *)); 411static void decode_coding_utf_8 P_ ((struct coding_system *));
413static int encode_coding_utf_8 P_ ((struct coding_system *)); 412static int encode_coding_utf_8 P_ ((struct coding_system *));
414 413
415static int detect_coding_utf_16 P_ ((struct coding_system *, int *)); 414static int detect_coding_utf_16 P_ ((struct coding_system *,
415 struct coding_detection_info *info));
416static void decode_coding_utf_16 P_ ((struct coding_system *)); 416static void decode_coding_utf_16 P_ ((struct coding_system *));
417static int encode_coding_utf_16 P_ ((struct coding_system *)); 417static int encode_coding_utf_16 P_ ((struct coding_system *));
418 418
419static int detect_coding_iso_2022 P_ ((struct coding_system *, int *)); 419static int detect_coding_iso_2022 P_ ((struct coding_system *,
420 struct coding_detection_info *info));
420static void decode_coding_iso_2022 P_ ((struct coding_system *)); 421static void decode_coding_iso_2022 P_ ((struct coding_system *));
421static int encode_coding_iso_2022 P_ ((struct coding_system *)); 422static int encode_coding_iso_2022 P_ ((struct coding_system *));
422 423
423static int detect_coding_emacs_mule P_ ((struct coding_system *, int *)); 424static int detect_coding_emacs_mule P_ ((struct coding_system *,
425 struct coding_detection_info *info));
424static void decode_coding_emacs_mule P_ ((struct coding_system *)); 426static void decode_coding_emacs_mule P_ ((struct coding_system *));
425static int encode_coding_emacs_mule P_ ((struct coding_system *)); 427static int encode_coding_emacs_mule P_ ((struct coding_system *));
426 428
427static int detect_coding_sjis P_ ((struct coding_system *, int *)); 429static int detect_coding_sjis P_ ((struct coding_system *,
430 struct coding_detection_info *info));
428static void decode_coding_sjis P_ ((struct coding_system *)); 431static void decode_coding_sjis P_ ((struct coding_system *));
429static int encode_coding_sjis P_ ((struct coding_system *)); 432static int encode_coding_sjis P_ ((struct coding_system *));
430 433
431static int detect_coding_big5 P_ ((struct coding_system *, int *)); 434static int detect_coding_big5 P_ ((struct coding_system *,
435 struct coding_detection_info *info));
432static void decode_coding_big5 P_ ((struct coding_system *)); 436static void decode_coding_big5 P_ ((struct coding_system *));
433static int encode_coding_big5 P_ ((struct coding_system *)); 437static int encode_coding_big5 P_ ((struct coding_system *));
434 438
435static int detect_coding_ccl P_ ((struct coding_system *, int *)); 439static int detect_coding_ccl P_ ((struct coding_system *,
440 struct coding_detection_info *info));
436static void decode_coding_ccl P_ ((struct coding_system *)); 441static void decode_coding_ccl P_ ((struct coding_system *));
437static int encode_coding_ccl P_ ((struct coding_system *)); 442static int encode_coding_ccl P_ ((struct coding_system *));
438 443
@@ -631,6 +636,7 @@ enum coding_category
631#define CATEGORY_MASK_BIG5 (1 << coding_category_big5) 636#define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
632#define CATEGORY_MASK_CCL (1 << coding_category_ccl) 637#define CATEGORY_MASK_CCL (1 << coding_category_ccl)
633#define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule) 638#define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
639#define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
634 640
635/* This value is returned if detect_coding_mask () finds nothing other 641/* This value is returned if detect_coding_mask () finds nothing other
636 than ASCII characters. */ 642 than ASCII characters. */
@@ -1002,6 +1008,54 @@ alloc_destination (coding, nbytes, dst)
1002 return dst; 1008 return dst;
1003} 1009}
1004 1010
1011/** Macros for annotations. */
1012
1013/* Maximum length of annotation data (sum of annotations for
1014 composition and charset). */
1015#define MAX_ANNOTATION_LENGTH (5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 5)
1016
1017/* An annotation data is stored in the array coding->charbuf in this
1018 format:
1019 [ -LENGTH ANNOTATION_MASK FROM TO ... ]
1020 LENGTH is the number of elements in the annotation.
1021 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1022 FROM and TO specify the range of text annotated. They are relative
1023 to coding->src_pos (on encoding) or coding->dst_pos (on decoding).
1024
1025 The format of the following elements depends on ANNOTATION_MASK.
1026
1027 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1028 follow:
1029 ... METHOD [ COMPOSITION-COMPONENTS ... ]
1030 METHOD is one of enum composition_method.
1031 Optional COMPOSITION-COMPONENTS are characters and composition
1032 rules.
1033
1034 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1035 follows. */
1036
1037#define ADD_ANNOTATION_DATA(buf, len, mask, from, to) \
1038 do { \
1039 *(buf)++ = -(len); \
1040 *(buf)++ = (mask); \
1041 *(buf)++ = (from); \
1042 *(buf)++ = (to); \
1043 coding->annotated = 1; \
1044 } while (0);
1045
1046#define ADD_COMPOSITION_DATA(buf, from, to, method) \
1047 do { \
1048 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, from, to); \
1049 *buf++ = method; \
1050 } while (0)
1051
1052
1053#define ADD_CHARSET_DATA(buf, from, to, id) \
1054 do { \
1055 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_CHARSET_MASK, from, to); \
1056 *buf++ = id; \
1057 } while (0)
1058
1005 1059
1006/*** 2. Emacs' internal format (emacs-utf-8) ***/ 1060/*** 2. Emacs' internal format (emacs-utf-8) ***/
1007 1061
@@ -1011,8 +1065,8 @@ alloc_destination (coding, nbytes, dst)
1011/*** 3. UTF-8 ***/ 1065/*** 3. UTF-8 ***/
1012 1066
1013/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". 1067/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1014 Check if a text is encoded in UTF-8. If it is, return 1068 Check if a text is encoded in UTF-8. If it is, return 1, else
1015 CATEGORY_MASK_UTF_8, else return 0. */ 1069 return 0. */
1016 1070
1017#define UTF_8_1_OCTET_P(c) ((c) < 0x80) 1071#define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1018#define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80) 1072#define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
@@ -1022,9 +1076,9 @@ alloc_destination (coding, nbytes, dst)
1022#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8) 1076#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1023 1077
1024static int 1078static int
1025detect_coding_utf_8 (coding, mask) 1079detect_coding_utf_8 (coding, detect_info)
1026 struct coding_system *coding; 1080 struct coding_system *coding;
1027 int *mask; 1081 struct coding_detection_info *detect_info;
1028{ 1082{
1029 unsigned char *src = coding->source, *src_base = src; 1083 unsigned char *src = coding->source, *src_base = src;
1030 unsigned char *src_end = coding->source + coding->src_bytes; 1084 unsigned char *src_end = coding->source + coding->src_bytes;
@@ -1033,6 +1087,7 @@ detect_coding_utf_8 (coding, mask)
1033 int found = 0; 1087 int found = 0;
1034 int incomplete; 1088 int incomplete;
1035 1089
1090 detect_info->checked |= CATEGORY_MASK_UTF_8;
1036 /* A coding system of this category is always ASCII compatible. */ 1091 /* A coding system of this category is always ASCII compatible. */
1037 src += coding->head_ascii; 1092 src += coding->head_ascii;
1038 1093
@@ -1050,7 +1105,7 @@ detect_coding_utf_8 (coding, mask)
1050 break; 1105 break;
1051 if (UTF_8_2_OCTET_LEADING_P (c)) 1106 if (UTF_8_2_OCTET_LEADING_P (c))
1052 { 1107 {
1053 found++; 1108 found = CATEGORY_MASK_UTF_8;
1054 continue; 1109 continue;
1055 } 1110 }
1056 ONE_MORE_BYTE (c2); 1111 ONE_MORE_BYTE (c2);
@@ -1058,7 +1113,7 @@ detect_coding_utf_8 (coding, mask)
1058 break; 1113 break;
1059 if (UTF_8_3_OCTET_LEADING_P (c)) 1114 if (UTF_8_3_OCTET_LEADING_P (c))
1060 { 1115 {
1061 found++; 1116 found = CATEGORY_MASK_UTF_8;
1062 continue; 1117 continue;
1063 } 1118 }
1064 ONE_MORE_BYTE (c3); 1119 ONE_MORE_BYTE (c3);
@@ -1066,7 +1121,7 @@ detect_coding_utf_8 (coding, mask)
1066 break; 1121 break;
1067 if (UTF_8_4_OCTET_LEADING_P (c)) 1122 if (UTF_8_4_OCTET_LEADING_P (c))
1068 { 1123 {
1069 found++; 1124 found = CATEGORY_MASK_UTF_8;
1070 continue; 1125 continue;
1071 } 1126 }
1072 ONE_MORE_BYTE (c4); 1127 ONE_MORE_BYTE (c4);
@@ -1074,21 +1129,22 @@ detect_coding_utf_8 (coding, mask)
1074 break; 1129 break;
1075 if (UTF_8_5_OCTET_LEADING_P (c)) 1130 if (UTF_8_5_OCTET_LEADING_P (c))
1076 { 1131 {
1077 found++; 1132 found = CATEGORY_MASK_UTF_8;
1078 continue; 1133 continue;
1079 } 1134 }
1080 break; 1135 break;
1081 } 1136 }
1082 *mask &= ~CATEGORY_MASK_UTF_8; 1137 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1083 return 0; 1138 return 0;
1084 1139
1085 no_more_source: 1140 no_more_source:
1086 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK) 1141 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
1087 { 1142 {
1088 *mask &= ~CATEGORY_MASK_UTF_8; 1143 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1089 return 0; 1144 return 0;
1090 } 1145 }
1091 return found; 1146 detect_info->found |= found;
1147 return 1;
1092} 1148}
1093 1149
1094 1150
@@ -1269,10 +1325,8 @@ encode_coding_utf_8 (coding)
1269 1325
1270 1326
1271/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". 1327/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1272 Check if a text is encoded in UTF-16 Big Endian (endian == 1) or 1328 Check if a text is encoded in one of UTF-16 based coding systems.
1273 Little Endian (otherwise). If it is, return 1329 If it is, return 1, else return 0. */
1274 CATEGORY_MASK_UTF_16_BE or CATEGORY_MASK_UTF_16_LE,
1275 else return 0. */
1276 1330
1277#define UTF_16_HIGH_SURROGATE_P(val) \ 1331#define UTF_16_HIGH_SURROGATE_P(val) \
1278 (((val) & 0xFC00) == 0xD800) 1332 (((val) & 0xFC00) == 0xD800)
@@ -1287,9 +1341,9 @@ encode_coding_utf_8 (coding)
1287 1341
1288 1342
1289static int 1343static int
1290detect_coding_utf_16 (coding, mask) 1344detect_coding_utf_16 (coding, detect_info)
1291 struct coding_system *coding; 1345 struct coding_system *coding;
1292 int *mask; 1346 struct coding_detection_info *detect_info;
1293{ 1347{
1294 unsigned char *src = coding->source, *src_base = src; 1348 unsigned char *src = coding->source, *src_base = src;
1295 unsigned char *src_end = coding->source + coding->src_bytes; 1349 unsigned char *src_end = coding->source + coding->src_bytes;
@@ -1297,21 +1351,29 @@ detect_coding_utf_16 (coding, mask)
1297 int consumed_chars = 0; 1351 int consumed_chars = 0;
1298 int c1, c2; 1352 int c1, c2;
1299 1353
1300 *mask &= ~CATEGORY_MASK_UTF_16; 1354 detect_info->checked |= CATEGORY_MASK_UTF_16;
1301 1355
1356 if (coding->mode & CODING_MODE_LAST_BLOCK
1357 && (coding->src_bytes & 1))
1358 {
1359 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1360 return 0;
1361 }
1302 ONE_MORE_BYTE (c1); 1362 ONE_MORE_BYTE (c1);
1303 ONE_MORE_BYTE (c2); 1363 ONE_MORE_BYTE (c2);
1304 1364
1305 if ((c1 == 0xFF) && (c2 == 0xFE)) 1365 if ((c1 == 0xFF) && (c2 == 0xFE))
1306 *mask |= CATEGORY_MASK_UTF_16_LE; 1366 {
1367 detect_info->found |= CATEGORY_MASK_UTF_16_LE;
1368 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE;
1369 }
1307 else if ((c1 == 0xFE) && (c2 == 0xFF)) 1370 else if ((c1 == 0xFE) && (c2 == 0xFF))
1308 *mask |= CATEGORY_MASK_UTF_16_BE; 1371 {
1309 else 1372 detect_info->found |= CATEGORY_MASK_UTF_16_BE;
1310 *mask |= CATEGORY_MASK_UTF_16_BE_NOSIG | CATEGORY_MASK_UTF_16_LE_NOSIG; 1373 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE;
1311 return 1; 1374 }
1312
1313 no_more_source: 1375 no_more_source:
1314 return 0; 1376 return 1;
1315} 1377}
1316 1378
1317static void 1379static void
@@ -1559,10 +1621,10 @@ encode_coding_utf_16 (coding)
1559char emacs_mule_bytes[256]; 1621char emacs_mule_bytes[256];
1560 1622
1561int 1623int
1562emacs_mule_char (coding, src, nbytes, nchars) 1624emacs_mule_char (coding, src, nbytes, nchars, id)
1563 struct coding_system *coding; 1625 struct coding_system *coding;
1564 unsigned char *src; 1626 unsigned char *src;
1565 int *nbytes, *nchars; 1627 int *nbytes, *nchars, *id;
1566{ 1628{
1567 unsigned char *src_end = coding->source + coding->src_bytes; 1629 unsigned char *src_end = coding->source + coding->src_bytes;
1568 int multibytep = coding->src_multibyte; 1630 int multibytep = coding->src_multibyte;
@@ -1627,6 +1689,8 @@ emacs_mule_char (coding, src, nbytes, nchars)
1627 goto invalid_code; 1689 goto invalid_code;
1628 *nbytes = src - src_base; 1690 *nbytes = src - src_base;
1629 *nchars = consumed_chars; 1691 *nchars = consumed_chars;
1692 if (id)
1693 *id = charset->id;
1630 return c; 1694 return c;
1631 1695
1632 no_more_source: 1696 no_more_source:
@@ -1638,12 +1702,13 @@ emacs_mule_char (coding, src, nbytes, nchars)
1638 1702
1639 1703
1640/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". 1704/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1641 Check if a text is encoded in `emacs-mule'. */ 1705 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1706 else return 0. */
1642 1707
1643static int 1708static int
1644detect_coding_emacs_mule (coding, mask) 1709detect_coding_emacs_mule (coding, detect_info)
1645 struct coding_system *coding; 1710 struct coding_system *coding;
1646 int *mask; 1711 struct coding_detection_info *detect_info;
1647{ 1712{
1648 unsigned char *src = coding->source, *src_base = src; 1713 unsigned char *src = coding->source, *src_base = src;
1649 unsigned char *src_end = coding->source + coding->src_bytes; 1714 unsigned char *src_end = coding->source + coding->src_bytes;
@@ -1653,6 +1718,7 @@ detect_coding_emacs_mule (coding, mask)
1653 int found = 0; 1718 int found = 0;
1654 int incomplete; 1719 int incomplete;
1655 1720
1721 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1656 /* A coding system of this category is always ASCII compatible. */ 1722 /* A coding system of this category is always ASCII compatible. */
1657 src += coding->head_ascii; 1723 src += coding->head_ascii;
1658 1724
@@ -1680,7 +1746,7 @@ detect_coding_emacs_mule (coding, mask)
1680 1746
1681 if (src - src_base <= 4) 1747 if (src - src_base <= 4)
1682 break; 1748 break;
1683 found = 1; 1749 found = CATEGORY_MASK_EMACS_MULE;
1684 if (c == 0x80) 1750 if (c == 0x80)
1685 goto repeat; 1751 goto repeat;
1686 } 1752 }
@@ -1702,19 +1768,20 @@ detect_coding_emacs_mule (coding, mask)
1702 while (c >= 0xA0); 1768 while (c >= 0xA0);
1703 if (src - src_base != emacs_mule_bytes[*src_base]) 1769 if (src - src_base != emacs_mule_bytes[*src_base])
1704 break; 1770 break;
1705 found = 1; 1771 found = CATEGORY_MASK_EMACS_MULE;
1706 } 1772 }
1707 } 1773 }
1708 *mask &= ~CATEGORY_MASK_EMACS_MULE; 1774 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1709 return 0; 1775 return 0;
1710 1776
1711 no_more_source: 1777 no_more_source:
1712 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK) 1778 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
1713 { 1779 {
1714 *mask &= ~CATEGORY_MASK_EMACS_MULE; 1780 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1715 return 0; 1781 return 0;
1716 } 1782 }
1717 return found; 1783 detect_info->found |= found;
1784 return 1;
1718} 1785}
1719 1786
1720 1787
@@ -1735,7 +1802,7 @@ detect_coding_emacs_mule (coding, mask)
1735 \ 1802 \
1736 if (src == src_end) \ 1803 if (src == src_end) \
1737 break; \ 1804 break; \
1738 c = emacs_mule_char (coding, src, &nbytes, &nchars); \ 1805 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1739 if (c < 0) \ 1806 if (c < 0) \
1740 { \ 1807 { \
1741 if (c == -2) \ 1808 if (c == -2) \
@@ -1792,16 +1859,6 @@ detect_coding_emacs_mule (coding, mask)
1792 } while (0) 1859 } while (0)
1793 1860
1794 1861
1795#define ADD_COMPOSITION_DATA(buf, method, nchars) \
1796 do { \
1797 *buf++ = -5; \
1798 *buf++ = coding->produced_char + char_offset; \
1799 *buf++ = CODING_ANNOTATE_COMPOSITION_MASK; \
1800 *buf++ = method; \
1801 *buf++ = nchars; \
1802 } while (0)
1803
1804
1805#define DECODE_EMACS_MULE_21_COMPOSITION(c) \ 1862#define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1806 do { \ 1863 do { \
1807 /* Emacs 21 style format. The first three bytes at SRC are \ 1864 /* Emacs 21 style format. The first three bytes at SRC are \
@@ -1810,6 +1867,7 @@ detect_coding_emacs_mule (coding, mask)
1810 number of characters composed by this composition. */ \ 1867 number of characters composed by this composition. */ \
1811 enum composition_method method = c - 0xF2; \ 1868 enum composition_method method = c - 0xF2; \
1812 int *charbuf_base = charbuf; \ 1869 int *charbuf_base = charbuf; \
1870 int from, to; \
1813 int consumed_chars_limit; \ 1871 int consumed_chars_limit; \
1814 int nbytes, nchars; \ 1872 int nbytes, nchars; \
1815 \ 1873 \
@@ -1819,7 +1877,9 @@ detect_coding_emacs_mule (coding, mask)
1819 goto invalid_code; \ 1877 goto invalid_code; \
1820 ONE_MORE_BYTE (c); \ 1878 ONE_MORE_BYTE (c); \
1821 nchars = c - 0xA0; \ 1879 nchars = c - 0xA0; \
1822 ADD_COMPOSITION_DATA (charbuf, method, nchars); \ 1880 from = coding->produced + char_offset; \
1881 to = from + nchars; \
1882 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1823 consumed_chars_limit = consumed_chars_base + nbytes; \ 1883 consumed_chars_limit = consumed_chars_base + nbytes; \
1824 if (method != COMPOSITION_RELATIVE) \ 1884 if (method != COMPOSITION_RELATIVE) \
1825 { \ 1885 { \
@@ -1843,9 +1903,11 @@ detect_coding_emacs_mule (coding, mask)
1843 do { \ 1903 do { \
1844 /* Emacs 20 style format for relative composition. */ \ 1904 /* Emacs 20 style format for relative composition. */ \
1845 /* Store multibyte form of characters to be composed. */ \ 1905 /* Store multibyte form of characters to be composed. */ \
1906 enum composition_method method = COMPOSITION_RELATIVE; \
1846 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \ 1907 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1847 int *buf = components; \ 1908 int *buf = components; \
1848 int i, j; \ 1909 int i, j; \
1910 int from, to; \
1849 \ 1911 \
1850 src = src_base; \ 1912 src = src_base; \
1851 ONE_MORE_BYTE (c); /* skip 0x80 */ \ 1913 ONE_MORE_BYTE (c); /* skip 0x80 */ \
@@ -1853,7 +1915,9 @@ detect_coding_emacs_mule (coding, mask)
1853 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \ 1915 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1854 if (i < 2) \ 1916 if (i < 2) \
1855 goto invalid_code; \ 1917 goto invalid_code; \
1856 ADD_COMPOSITION_DATA (charbuf, COMPOSITION_RELATIVE, i); \ 1918 from = coding->produced_char + char_offset; \
1919 to = from + i; \
1920 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1857 for (j = 0; j < i; j++) \ 1921 for (j = 0; j < i; j++) \
1858 *charbuf++ = components[j]; \ 1922 *charbuf++ = components[j]; \
1859 } while (0) 1923 } while (0)
@@ -1863,9 +1927,11 @@ detect_coding_emacs_mule (coding, mask)
1863 do { \ 1927 do { \
1864 /* Emacs 20 style format for rule-base composition. */ \ 1928 /* Emacs 20 style format for rule-base composition. */ \
1865 /* Store multibyte form of characters to be composed. */ \ 1929 /* Store multibyte form of characters to be composed. */ \
1930 enum composition_method method = COMPOSITION_WITH_RULE; \
1866 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \ 1931 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1867 int *buf = components; \ 1932 int *buf = components; \
1868 int i, j; \ 1933 int i, j; \
1934 int from, to; \
1869 \ 1935 \
1870 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \ 1936 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1871 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \ 1937 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
@@ -1877,7 +1943,9 @@ detect_coding_emacs_mule (coding, mask)
1877 goto invalid_code; \ 1943 goto invalid_code; \
1878 if (charbuf + i + (i / 2) + 1 < charbuf_end) \ 1944 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
1879 goto no_more_source; \ 1945 goto no_more_source; \
1880 ADD_COMPOSITION_DATA (buf, COMPOSITION_WITH_RULE, i); \ 1946 from = coding->produced_char + char_offset; \
1947 to = from + i; \
1948 ADD_COMPOSITION_DATA (buf, from, to, method); \
1881 for (j = 0; j < i; j++) \ 1949 for (j = 0; j < i; j++) \
1882 *charbuf++ = components[j]; \ 1950 *charbuf++ = components[j]; \
1883 for (j = 0; j < i; j += 2) \ 1951 for (j = 0; j < i; j += 2) \
@@ -1893,11 +1961,13 @@ decode_coding_emacs_mule (coding)
1893 unsigned char *src_end = coding->source + coding->src_bytes; 1961 unsigned char *src_end = coding->source + coding->src_bytes;
1894 unsigned char *src_base; 1962 unsigned char *src_base;
1895 int *charbuf = coding->charbuf; 1963 int *charbuf = coding->charbuf;
1896 int *charbuf_end = charbuf + coding->charbuf_size; 1964 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
1897 int consumed_chars = 0, consumed_chars_base; 1965 int consumed_chars = 0, consumed_chars_base;
1898 int char_offset = 0;
1899 int multibytep = coding->src_multibyte; 1966 int multibytep = coding->src_multibyte;
1900 Lisp_Object attrs, eol_type, charset_list; 1967 Lisp_Object attrs, eol_type, charset_list;
1968 int char_offset = coding->produced_char;
1969 int last_offset = char_offset;
1970 int last_id = charset_ascii;
1901 1971
1902 CODING_GET_INFO (coding, attrs, eol_type, charset_list); 1972 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1903 1973
@@ -1935,8 +2005,6 @@ decode_coding_emacs_mule (coding)
1935 } 2005 }
1936 else if (c == 0x80) 2006 else if (c == 0x80)
1937 { 2007 {
1938 if (charbuf + 5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 > charbuf_end)
1939 break;
1940 ONE_MORE_BYTE (c); 2008 ONE_MORE_BYTE (c);
1941 if (c - 0xF2 >= COMPOSITION_RELATIVE 2009 if (c - 0xF2 >= COMPOSITION_RELATIVE
1942 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS) 2010 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
@@ -1947,20 +2015,28 @@ decode_coding_emacs_mule (coding)
1947 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c); 2015 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
1948 else 2016 else
1949 goto invalid_code; 2017 goto invalid_code;
1950 coding->annotated = 1;
1951 } 2018 }
1952 else if (c < 0xA0 && emacs_mule_bytes[c] > 1) 2019 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
1953 { 2020 {
1954 int nbytes, nchars; 2021 int nbytes, nchars;
2022 int id;
2023
1955 src = src_base; 2024 src = src_base;
1956 consumed_chars = consumed_chars_base; 2025 consumed_chars = consumed_chars_base;
1957 c = emacs_mule_char (coding, src, &nbytes, &nchars); 2026 c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
1958 if (c < 0) 2027 if (c < 0)
1959 { 2028 {
1960 if (c == -2) 2029 if (c == -2)
1961 break; 2030 break;
1962 goto invalid_code; 2031 goto invalid_code;
1963 } 2032 }
2033 if (last_id != id)
2034 {
2035 if (last_id != charset_ascii)
2036 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2037 last_id = id;
2038 last_offset = char_offset;
2039 }
1964 *charbuf++ = c; 2040 *charbuf++ = c;
1965 src += nbytes; 2041 src += nbytes;
1966 consumed_chars += nchars; 2042 consumed_chars += nchars;
@@ -1973,10 +2049,13 @@ decode_coding_emacs_mule (coding)
1973 consumed_chars = consumed_chars_base; 2049 consumed_chars = consumed_chars_base;
1974 ONE_MORE_BYTE (c); 2050 ONE_MORE_BYTE (c);
1975 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c); 2051 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2052 char_offset++;
1976 coding->errors++; 2053 coding->errors++;
1977 } 2054 }
1978 2055
1979 no_more_source: 2056 no_more_source:
2057 if (last_id != charset_ascii)
2058 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
1980 coding->consumed_char += consumed_chars_base; 2059 coding->consumed_char += consumed_chars_base;
1981 coding->consumed = src_base - coding->source; 2060 coding->consumed = src_base - coding->source;
1982 coding->charbuf_used = charbuf - coding->charbuf; 2061 coding->charbuf_used = charbuf - coding->charbuf;
@@ -2011,6 +2090,7 @@ encode_coding_emacs_mule (coding)
2011 int produced_chars = 0; 2090 int produced_chars = 0;
2012 Lisp_Object attrs, eol_type, charset_list; 2091 Lisp_Object attrs, eol_type, charset_list;
2013 int c; 2092 int c;
2093 int preferred_charset_id = -1;
2014 2094
2015 CODING_GET_INFO (coding, attrs, eol_type, charset_list); 2095 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2016 2096
@@ -2018,6 +2098,29 @@ encode_coding_emacs_mule (coding)
2018 { 2098 {
2019 ASSURE_DESTINATION (safe_room); 2099 ASSURE_DESTINATION (safe_room);
2020 c = *charbuf++; 2100 c = *charbuf++;
2101
2102 if (c < 0)
2103 {
2104 /* Handle an annotation. */
2105 switch (*charbuf)
2106 {
2107 case CODING_ANNOTATE_COMPOSITION_MASK:
2108 /* Not yet implemented. */
2109 break;
2110 case CODING_ANNOTATE_CHARSET_MASK:
2111 preferred_charset_id = charbuf[3];
2112 if (preferred_charset_id >= 0
2113 && NILP (Fmemq (make_number (preferred_charset_id),
2114 charset_list)))
2115 preferred_charset_id = -1;
2116 break;
2117 default:
2118 abort ();
2119 }
2120 charbuf += -c - 1;
2121 continue;
2122 }
2123
2021 if (ASCII_CHAR_P (c)) 2124 if (ASCII_CHAR_P (c))
2022 EMIT_ONE_ASCII_BYTE (c); 2125 EMIT_ONE_ASCII_BYTE (c);
2023 else if (CHAR_BYTE8_P (c)) 2126 else if (CHAR_BYTE8_P (c))
@@ -2033,7 +2136,14 @@ encode_coding_emacs_mule (coding)
2033 int emacs_mule_id; 2136 int emacs_mule_id;
2034 unsigned char leading_codes[2]; 2137 unsigned char leading_codes[2];
2035 2138
2036 charset = char_charset (c, charset_list, &code); 2139 if (preferred_charset_id >= 0)
2140 {
2141 charset = CHARSET_FROM_ID (preferred_charset_id);
2142 if (! CHAR_CHARSET_P (c, charset))
2143 charset = char_charset (c, charset_list, NULL);
2144 }
2145 else
2146 charset = char_charset (c, charset_list, &code);
2037 if (! charset) 2147 if (! charset)
2038 { 2148 {
2039 c = coding->default_char; 2149 c = coding->default_char;
@@ -2319,32 +2429,26 @@ setup_iso_safe_charsets (attrs)
2319 2429
2320 2430
2321/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". 2431/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2322 Check if a text is encoded in ISO2022. If it is, returns an 2432 Check if a text is encoded in one of ISO-2022 based coding systems.
2323 integer in which appropriate flag bits any of: 2433 If it is, return 1, else return 0. */
2324 CATEGORY_MASK_ISO_7
2325 CATEGORY_MASK_ISO_7_TIGHT
2326 CATEGORY_MASK_ISO_8_1
2327 CATEGORY_MASK_ISO_8_2
2328 CATEGORY_MASK_ISO_7_ELSE
2329 CATEGORY_MASK_ISO_8_ELSE
2330 are set. If a code which should never appear in ISO2022 is found,
2331 returns 0. */
2332 2434
2333static int 2435static int
2334detect_coding_iso_2022 (coding, mask) 2436detect_coding_iso_2022 (coding, detect_info)
2335 struct coding_system *coding; 2437 struct coding_system *coding;
2336 int *mask; 2438 struct coding_detection_info *detect_info;
2337{ 2439{
2338 unsigned char *src = coding->source, *src_base = src; 2440 unsigned char *src = coding->source, *src_base = src;
2339 unsigned char *src_end = coding->source + coding->src_bytes; 2441 unsigned char *src_end = coding->source + coding->src_bytes;
2340 int multibytep = coding->src_multibyte; 2442 int multibytep = coding->src_multibyte;
2341 int mask_iso = CATEGORY_MASK_ISO; 2443 int single_shifting = 0;
2342 int mask_found = 0, mask_8bit_found = 0;
2343 int reg[4], shift_out = 0, single_shifting = 0;
2344 int id; 2444 int id;
2345 int c, c1; 2445 int c, c1;
2346 int consumed_chars = 0; 2446 int consumed_chars = 0;
2347 int i; 2447 int i;
2448 int rejected = 0;
2449 int found = 0;
2450
2451 detect_info->checked |= CATEGORY_MASK_ISO;
2348 2452
2349 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++) 2453 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2350 { 2454 {
@@ -2363,8 +2467,7 @@ detect_coding_iso_2022 (coding, mask)
2363 /* A coding system of this category is always ASCII compatible. */ 2467 /* A coding system of this category is always ASCII compatible. */
2364 src += coding->head_ascii; 2468 src += coding->head_ascii;
2365 2469
2366 reg[0] = charset_ascii, reg[1] = reg[2] = reg[3] = -1; 2470 while (rejected != CATEGORY_MASK_ISO)
2367 while (mask_iso && src < src_end)
2368 { 2471 {
2369 ONE_MORE_BYTE (c); 2472 ONE_MORE_BYTE (c);
2370 switch (c) 2473 switch (c)
@@ -2382,7 +2485,6 @@ detect_coding_iso_2022 (coding, mask)
2382 || (id = iso_charset_table[0][c >= ','][c1]) < 0) 2485 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2383 /* Invalid designation sequence. Just ignore. */ 2486 /* Invalid designation sequence. Just ignore. */
2384 break; 2487 break;
2385 reg[(c - '(') % 4] = id;
2386 } 2488 }
2387 else if (c == '$') 2489 else if (c == '$')
2388 { 2490 {
@@ -2390,7 +2492,7 @@ detect_coding_iso_2022 (coding, mask)
2390 ONE_MORE_BYTE (c); 2492 ONE_MORE_BYTE (c);
2391 if (c >= '@' && c <= 'B') 2493 if (c >= '@' && c <= 'B')
2392 /* Designation for JISX0208.1978, GB2312, or JISX0208. */ 2494 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2393 reg[0] = id = iso_charset_table[1][0][c]; 2495 id = iso_charset_table[1][0][c];
2394 else if (c >= '(' && c <= '/') 2496 else if (c >= '(' && c <= '/')
2395 { 2497 {
2396 ONE_MORE_BYTE (c1); 2498 ONE_MORE_BYTE (c1);
@@ -2398,116 +2500,86 @@ detect_coding_iso_2022 (coding, mask)
2398 || (id = iso_charset_table[1][c >= ','][c1]) < 0) 2500 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2399 /* Invalid designation sequence. Just ignore. */ 2501 /* Invalid designation sequence. Just ignore. */
2400 break; 2502 break;
2401 reg[(c - '(') % 4] = id;
2402 } 2503 }
2403 else 2504 else
2404 /* Invalid designation sequence. Just ignore. */ 2505 /* Invalid designation sequence. Just ignore it. */
2405 break; 2506 break;
2406 } 2507 }
2407 else if (c == 'N' || c == 'O') 2508 else if (c == 'N' || c == 'O')
2408 { 2509 {
2409 /* ESC <Fe> for SS2 or SS3. */ 2510 /* ESC <Fe> for SS2 or SS3. */
2410 mask_iso &= CATEGORY_MASK_ISO_7_ELSE; 2511 single_shifting = 1;
2512 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2411 break; 2513 break;
2412 } 2514 }
2413 else if (c >= '0' && c <= '4') 2515 else if (c >= '0' && c <= '4')
2414 { 2516 {
2415 /* ESC <Fp> for start/end composition. */ 2517 /* ESC <Fp> for start/end composition. */
2416 mask_found |= CATEGORY_MASK_ISO; 2518 found |= CATEGORY_MASK_ISO;
2417 break; 2519 break;
2418 } 2520 }
2419 else 2521 else
2420 { 2522 {
2421 /* Invalid escape sequence. */ 2523 /* Invalid escape sequence. Just ignore it. */
2422 mask_iso &= ~CATEGORY_MASK_ISO_ESCAPE;
2423 break; 2524 break;
2424 } 2525 }
2425 2526
2426 /* We found a valid designation sequence for CHARSET. */ 2527 /* We found a valid designation sequence for CHARSET. */
2427 mask_iso &= ~CATEGORY_MASK_ISO_8BIT; 2528 rejected |= CATEGORY_MASK_ISO_8BIT;
2428 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7], 2529 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2429 id)) 2530 id))
2430 mask_found |= CATEGORY_MASK_ISO_7; 2531 found |= CATEGORY_MASK_ISO_7;
2431 else 2532 else
2432 mask_iso &= ~CATEGORY_MASK_ISO_7; 2533 rejected |= CATEGORY_MASK_ISO_7;
2433 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight], 2534 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2434 id)) 2535 id))
2435 mask_found |= CATEGORY_MASK_ISO_7_TIGHT; 2536 found |= CATEGORY_MASK_ISO_7_TIGHT;
2436 else 2537 else
2437 mask_iso &= ~CATEGORY_MASK_ISO_7_TIGHT; 2538 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2438 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else], 2539 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2439 id)) 2540 id))
2440 mask_found |= CATEGORY_MASK_ISO_7_ELSE; 2541 found |= CATEGORY_MASK_ISO_7_ELSE;
2441 else 2542 else
2442 mask_iso &= ~CATEGORY_MASK_ISO_7_ELSE; 2543 rejected |= CATEGORY_MASK_ISO_7_ELSE;
2443 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else], 2544 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2444 id)) 2545 id))
2445 mask_found |= CATEGORY_MASK_ISO_8_ELSE; 2546 found |= CATEGORY_MASK_ISO_8_ELSE;
2446 else 2547 else
2447 mask_iso &= ~CATEGORY_MASK_ISO_8_ELSE; 2548 rejected |= CATEGORY_MASK_ISO_8_ELSE;
2448 break; 2549 break;
2449 2550
2450 case ISO_CODE_SO: 2551 case ISO_CODE_SO:
2451 if (inhibit_iso_escape_detection)
2452 break;
2453 single_shifting = 0;
2454 if (shift_out == 0
2455 && (reg[1] >= 0
2456 || SHIFT_OUT_OK (coding_category_iso_7_else)
2457 || SHIFT_OUT_OK (coding_category_iso_8_else)))
2458 {
2459 /* Locking shift out. */
2460 mask_iso &= ~CATEGORY_MASK_ISO_7BIT;
2461 mask_found |= CATEGORY_MASK_ISO_ELSE;
2462 }
2463 break;
2464
2465 case ISO_CODE_SI: 2552 case ISO_CODE_SI:
2553 /* Locking shift out/in. */
2466 if (inhibit_iso_escape_detection) 2554 if (inhibit_iso_escape_detection)
2467 break; 2555 break;
2468 single_shifting = 0; 2556 single_shifting = 0;
2469 if (shift_out == 1) 2557 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2470 { 2558 found |= CATEGORY_MASK_ISO_ELSE;
2471 /* Locking shift in. */
2472 mask_iso &= ~CATEGORY_MASK_ISO_7BIT;
2473 mask_found |= CATEGORY_MASK_ISO_ELSE;
2474 }
2475 break; 2559 break;
2476 2560
2477 case ISO_CODE_CSI: 2561 case ISO_CODE_CSI:
2562 /* Control sequence introducer. */
2478 single_shifting = 0; 2563 single_shifting = 0;
2564 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2565 found |= CATEGORY_MASK_ISO_8_ELSE;
2566 goto check_extra_latin;
2567
2568
2479 case ISO_CODE_SS2: 2569 case ISO_CODE_SS2:
2480 case ISO_CODE_SS3: 2570 case ISO_CODE_SS3:
2481 { 2571 /* Single shift. */
2482 int newmask = CATEGORY_MASK_ISO_8_ELSE; 2572 if (inhibit_iso_escape_detection)
2483 2573 break;
2484 mask_8bit_found = 1; 2574 single_shifting = 1;
2485 if (inhibit_iso_escape_detection) 2575 rejected |= CATEGORY_MASK_ISO_7BIT;
2486 break; 2576 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2487 if (c != ISO_CODE_CSI) 2577 & CODING_ISO_FLAG_SINGLE_SHIFT)
2488 { 2578 found |= CATEGORY_MASK_ISO_8_1;
2489 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1]) 2579 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2490 & CODING_ISO_FLAG_SINGLE_SHIFT) 2580 & CODING_ISO_FLAG_SINGLE_SHIFT)
2491 newmask |= CATEGORY_MASK_ISO_8_1; 2581 found |= CATEGORY_MASK_ISO_8_2;
2492 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2]) 2582 goto check_extra_latin;
2493 & CODING_ISO_FLAG_SINGLE_SHIFT)
2494 newmask |= CATEGORY_MASK_ISO_8_2;
2495 single_shifting = 1;
2496 }
2497 if (VECTORP (Vlatin_extra_code_table)
2498 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2499 {
2500 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2501 & CODING_ISO_FLAG_LATIN_EXTRA)
2502 newmask |= CATEGORY_MASK_ISO_8_1;
2503 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2504 & CODING_ISO_FLAG_LATIN_EXTRA)
2505 newmask |= CATEGORY_MASK_ISO_8_2;
2506 }
2507 mask_iso &= newmask;
2508 mask_found |= newmask;
2509 }
2510 break;
2511 2583
2512 default: 2584 default:
2513 if (c < 0x80) 2585 if (c < 0x80)
@@ -2515,39 +2587,16 @@ detect_coding_iso_2022 (coding, mask)
2515 single_shifting = 0; 2587 single_shifting = 0;
2516 break; 2588 break;
2517 } 2589 }
2518 else if (c < 0xA0) 2590 if (c >= 0xA0)
2519 { 2591 {
2520 single_shifting = 0; 2592 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2521 mask_8bit_found = 1; 2593 found |= CATEGORY_MASK_ISO_8_1;
2522 if (VECTORP (Vlatin_extra_code_table)
2523 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2524 {
2525 int newmask = 0;
2526
2527 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2528 & CODING_ISO_FLAG_LATIN_EXTRA)
2529 newmask |= CATEGORY_MASK_ISO_8_1;
2530 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2531 & CODING_ISO_FLAG_LATIN_EXTRA)
2532 newmask |= CATEGORY_MASK_ISO_8_2;
2533 mask_iso &= newmask;
2534 mask_found |= newmask;
2535 }
2536 else
2537 return 0;
2538 }
2539 else
2540 {
2541 mask_iso &= ~(CATEGORY_MASK_ISO_7BIT
2542 | CATEGORY_MASK_ISO_7_ELSE);
2543 mask_found |= CATEGORY_MASK_ISO_8_1;
2544 mask_8bit_found = 1;
2545 /* Check the length of succeeding codes of the range 2594 /* Check the length of succeeding codes of the range
2546 0xA0..0FF. If the byte length is odd, we exclude 2595 0xA0..0FF. If the byte length is even, we include
2547 CATEGORY_MASK_ISO_8_2. We can check this only 2596 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2548 when we are not single shifting. */ 2597 only when we are not single shifting. */
2549 if (!single_shifting 2598 if (! single_shifting
2550 && mask_iso & CATEGORY_MASK_ISO_8_2) 2599 && ! (rejected & CATEGORY_MASK_ISO_8_2))
2551 { 2600 {
2552 int i = 1; 2601 int i = 1;
2553 while (src < src_end) 2602 while (src < src_end)
@@ -2559,26 +2608,38 @@ detect_coding_iso_2022 (coding, mask)
2559 } 2608 }
2560 2609
2561 if (i & 1 && src < src_end) 2610 if (i & 1 && src < src_end)
2562 mask_iso &= ~CATEGORY_MASK_ISO_8_2; 2611 rejected |= CATEGORY_MASK_ISO_8_2;
2563 else 2612 else
2564 mask_found |= CATEGORY_MASK_ISO_8_2; 2613 found |= CATEGORY_MASK_ISO_8_2;
2565 } 2614 }
2615 break;
2566 } 2616 }
2567 break; 2617 check_extra_latin:
2618 single_shifting = 0;
2619 if (! VECTORP (Vlatin_extra_code_table)
2620 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2621 {
2622 rejected = CATEGORY_MASK_ISO;
2623 break;
2624 }
2625 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2626 & CODING_ISO_FLAG_LATIN_EXTRA)
2627 found |= CATEGORY_MASK_ISO_8_1;
2628 else
2629 rejected |= CATEGORY_MASK_ISO_8_1;
2630 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2631 & CODING_ISO_FLAG_LATIN_EXTRA)
2632 found |= CATEGORY_MASK_ISO_8_2;
2633 else
2634 rejected |= CATEGORY_MASK_ISO_8_2;
2568 } 2635 }
2569 } 2636 }
2637 detect_info->rejected |= CATEGORY_MASK_ISO;
2638 return 0;
2639
2570 no_more_source: 2640 no_more_source:
2571 if (!mask_iso) 2641 detect_info->rejected |= rejected;
2572 { 2642 detect_info->found |= (found & ~rejected);
2573 *mask &= ~CATEGORY_MASK_ISO;
2574 return 0;
2575 }
2576 if (!mask_found)
2577 return 0;
2578 *mask &= ~CATEGORY_MASK_ISO;
2579 *mask |= mask_iso & mask_found;
2580 if (! mask_8bit_found)
2581 *mask &= ~(CATEGORY_MASK_ISO_8BIT | CATEGORY_MASK_ISO_8_ELSE);
2582 return 1; 2643 return 1;
2583} 2644}
2584 2645
@@ -2694,8 +2755,10 @@ detect_coding_iso_2022 (coding, mask)
2694 : (component_idx + 1) / 2); \ 2755 : (component_idx + 1) / 2); \
2695 int i; \ 2756 int i; \
2696 int *saved_charbuf = charbuf; \ 2757 int *saved_charbuf = charbuf; \
2758 int from = coding->produced_char + char_offset; \
2759 int to = from + nchars; \
2697 \ 2760 \
2698 ADD_COMPOSITION_DATA (charbuf, method, nchars); \ 2761 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
2699 if (method != COMPOSITION_RELATIVE) \ 2762 if (method != COMPOSITION_RELATIVE) \
2700 { \ 2763 { \
2701 if (component_len == 0) \ 2764 if (component_len == 0) \
@@ -2752,9 +2815,9 @@ decode_coding_iso_2022 (coding)
2752 unsigned char *src_end = coding->source + coding->src_bytes; 2815 unsigned char *src_end = coding->source + coding->src_bytes;
2753 unsigned char *src_base; 2816 unsigned char *src_base;
2754 int *charbuf = coding->charbuf; 2817 int *charbuf = coding->charbuf;
2755 int *charbuf_end = charbuf + coding->charbuf_size - 4; 2818 int *charbuf_end
2819 = charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2756 int consumed_chars = 0, consumed_chars_base; 2820 int consumed_chars = 0, consumed_chars_base;
2757 int char_offset = 0;
2758 int multibytep = coding->src_multibyte; 2821 int multibytep = coding->src_multibyte;
2759 /* Charsets invoked to graphic plane 0 and 1 respectively. */ 2822 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2760 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0); 2823 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
@@ -2774,6 +2837,9 @@ decode_coding_iso_2022 (coding)
2774 int component_idx; 2837 int component_idx;
2775 int component_len; 2838 int component_len;
2776 Lisp_Object attrs, eol_type, charset_list; 2839 Lisp_Object attrs, eol_type, charset_list;
2840 int char_offset = coding->produced_char;
2841 int last_offset = char_offset;
2842 int last_id = charset_ascii;
2777 2843
2778 CODING_GET_INFO (coding, attrs, eol_type, charset_list); 2844 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2779 setup_iso_safe_charsets (attrs); 2845 setup_iso_safe_charsets (attrs);
@@ -3051,6 +3117,15 @@ decode_coding_iso_2022 (coding)
3051 } 3117 }
3052 } 3118 }
3053 3119
3120 if (charset->id != charset_ascii
3121 && last_id != charset->id)
3122 {
3123 if (last_id != charset_ascii)
3124 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3125 last_id = charset->id;
3126 last_offset = char_offset;
3127 }
3128
3054 /* Now we know CHARSET and 1st position code C1 of a character. 3129 /* Now we know CHARSET and 1st position code C1 of a character.
3055 Produce a decoded character while getting 2nd position code 3130 Produce a decoded character while getting 2nd position code
3056 C2 if necessary. */ 3131 C2 if necessary. */
@@ -3082,6 +3157,7 @@ decode_coding_iso_2022 (coding)
3082 *charbuf++ = *src_base; 3157 *charbuf++ = *src_base;
3083 else 3158 else
3084 *charbuf++ = BYTE8_TO_CHAR (*src_base); 3159 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3160 char_offset++;
3085 } 3161 }
3086 } 3162 }
3087 else if (composition_state == COMPOSING_NO) 3163 else if (composition_state == COMPOSING_NO)
@@ -3105,10 +3181,13 @@ decode_coding_iso_2022 (coding)
3105 consumed_chars = consumed_chars_base; 3181 consumed_chars = consumed_chars_base;
3106 ONE_MORE_BYTE (c); 3182 ONE_MORE_BYTE (c);
3107 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c); 3183 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3184 char_offset++;
3108 coding->errors++; 3185 coding->errors++;
3109 } 3186 }
3110 3187
3111 no_more_source: 3188 no_more_source:
3189 if (last_id != charset_ascii)
3190 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3112 coding->consumed_char += consumed_chars_base; 3191 coding->consumed_char += consumed_chars_base;
3113 coding->consumed = src_base - coding->source; 3192 coding->consumed = src_base - coding->source;
3114 coding->charbuf_used = charbuf - coding->charbuf; 3193 coding->charbuf_used = charbuf - coding->charbuf;
@@ -3530,9 +3609,12 @@ encode_coding_iso_2022 (coding)
3530 Lisp_Object attrs, eol_type, charset_list; 3609 Lisp_Object attrs, eol_type, charset_list;
3531 int ascii_compatible; 3610 int ascii_compatible;
3532 int c; 3611 int c;
3612 int preferred_charset_id = -1;
3533 3613
3534 CODING_GET_INFO (coding, attrs, eol_type, charset_list); 3614 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3535 setup_iso_safe_charsets (attrs); 3615 setup_iso_safe_charsets (attrs);
3616 /* Charset list may have been changed. */
3617 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
3536 coding->safe_charsets 3618 coding->safe_charsets
3537 = (char *) XSTRING (CODING_ATTR_SAFE_CHARSETS(attrs))->data; 3619 = (char *) XSTRING (CODING_ATTR_SAFE_CHARSETS(attrs))->data;
3538 3620
@@ -3555,6 +3637,28 @@ encode_coding_iso_2022 (coding)
3555 3637
3556 c = *charbuf++; 3638 c = *charbuf++;
3557 3639
3640 if (c < 0)
3641 {
3642 /* Handle an annotation. */
3643 switch (*charbuf)
3644 {
3645 case CODING_ANNOTATE_COMPOSITION_MASK:
3646 /* Not yet implemented. */
3647 break;
3648 case CODING_ANNOTATE_CHARSET_MASK:
3649 preferred_charset_id = charbuf[3];
3650 if (preferred_charset_id >= 0
3651 && NILP (Fmemq (make_number (preferred_charset_id),
3652 charset_list)))
3653 preferred_charset_id = -1;
3654 break;
3655 default:
3656 abort ();
3657 }
3658 charbuf += -c - 1;
3659 continue;
3660 }
3661
3558 /* Now encode the character C. */ 3662 /* Now encode the character C. */
3559 if (c < 0x20 || c == 0x7F) 3663 if (c < 0x20 || c == 0x7F)
3560 { 3664 {
@@ -3595,8 +3699,16 @@ encode_coding_iso_2022 (coding)
3595 } 3699 }
3596 else 3700 else
3597 { 3701 {
3598 struct charset *charset = char_charset (c, charset_list, NULL); 3702 struct charset *charset;
3599 3703
3704 if (preferred_charset_id >= 0)
3705 {
3706 charset = CHARSET_FROM_ID (preferred_charset_id);
3707 if (! CHAR_CHARSET_P (c, charset))
3708 charset = char_charset (c, charset_list, NULL);
3709 }
3710 else
3711 charset = char_charset (c, charset_list, NULL);
3600 if (!charset) 3712 if (!charset)
3601 { 3713 {
3602 if (coding->mode & CODING_MODE_SAFE_ENCODING) 3714 if (coding->mode & CODING_MODE_SAFE_ENCODING)
@@ -3669,9 +3781,9 @@ encode_coding_iso_2022 (coding)
3669 CATEGORY_MASK_SJIS, else return 0. */ 3781 CATEGORY_MASK_SJIS, else return 0. */
3670 3782
3671static int 3783static int
3672detect_coding_sjis (coding, mask) 3784detect_coding_sjis (coding, detect_info)
3673 struct coding_system *coding; 3785 struct coding_system *coding;
3674 int *mask; 3786 struct coding_detection_info *detect_info;
3675{ 3787{
3676 unsigned char *src = coding->source, *src_base = src; 3788 unsigned char *src = coding->source, *src_base = src;
3677 unsigned char *src_end = coding->source + coding->src_bytes; 3789 unsigned char *src_end = coding->source + coding->src_bytes;
@@ -3681,6 +3793,7 @@ detect_coding_sjis (coding, mask)
3681 int c; 3793 int c;
3682 int incomplete; 3794 int incomplete;
3683 3795
3796 detect_info->checked |= CATEGORY_MASK_SJIS;
3684 /* A coding system of this category is always ASCII compatible. */ 3797 /* A coding system of this category is always ASCII compatible. */
3685 src += coding->head_ascii; 3798 src += coding->head_ascii;
3686 3799
@@ -3696,23 +3809,24 @@ detect_coding_sjis (coding, mask)
3696 ONE_MORE_BYTE (c); 3809 ONE_MORE_BYTE (c);
3697 if (c < 0x40 || c == 0x7F || c > 0xFC) 3810 if (c < 0x40 || c == 0x7F || c > 0xFC)
3698 break; 3811 break;
3699 found = 1; 3812 found = CATEGORY_MASK_SJIS;
3700 } 3813 }
3701 else if (c >= 0xA0 && c < 0xE0) 3814 else if (c >= 0xA0 && c < 0xE0)
3702 found = 1; 3815 found = CATEGORY_MASK_SJIS;
3703 else 3816 else
3704 break; 3817 break;
3705 } 3818 }
3706 *mask &= ~CATEGORY_MASK_SJIS; 3819 detect_info->rejected |= CATEGORY_MASK_SJIS;
3707 return 0; 3820 return 0;
3708 3821
3709 no_more_source: 3822 no_more_source:
3710 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK) 3823 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
3711 { 3824 {
3712 *mask &= ~CATEGORY_MASK_SJIS; 3825 detect_info->rejected |= CATEGORY_MASK_SJIS;
3713 return 0; 3826 return 0;
3714 } 3827 }
3715 return found; 3828 detect_info->found |= found;
3829 return 1;
3716} 3830}
3717 3831
3718/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". 3832/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
@@ -3720,9 +3834,9 @@ detect_coding_sjis (coding, mask)
3720 CATEGORY_MASK_BIG5, else return 0. */ 3834 CATEGORY_MASK_BIG5, else return 0. */
3721 3835
3722static int 3836static int
3723detect_coding_big5 (coding, mask) 3837detect_coding_big5 (coding, detect_info)
3724 struct coding_system *coding; 3838 struct coding_system *coding;
3725 int *mask; 3839 struct coding_detection_info *detect_info;
3726{ 3840{
3727 unsigned char *src = coding->source, *src_base = src; 3841 unsigned char *src = coding->source, *src_base = src;
3728 unsigned char *src_end = coding->source + coding->src_bytes; 3842 unsigned char *src_end = coding->source + coding->src_bytes;
@@ -3732,6 +3846,7 @@ detect_coding_big5 (coding, mask)
3732 int c; 3846 int c;
3733 int incomplete; 3847 int incomplete;
3734 3848
3849 detect_info->checked |= CATEGORY_MASK_BIG5;
3735 /* A coding system of this category is always ASCII compatible. */ 3850 /* A coding system of this category is always ASCII compatible. */
3736 src += coding->head_ascii; 3851 src += coding->head_ascii;
3737 3852
@@ -3747,21 +3862,22 @@ detect_coding_big5 (coding, mask)
3747 ONE_MORE_BYTE (c); 3862 ONE_MORE_BYTE (c);
3748 if (c < 0x40 || (c >= 0x7F && c <= 0xA0)) 3863 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
3749 return 0; 3864 return 0;
3750 found = 1; 3865 found = CATEGORY_MASK_BIG5;
3751 } 3866 }
3752 else 3867 else
3753 break; 3868 break;
3754 } 3869 }
3755 *mask &= ~CATEGORY_MASK_BIG5; 3870 detect_info->rejected |= CATEGORY_MASK_BIG5;
3756 return 0; 3871 return 0;
3757 3872
3758 no_more_source: 3873 no_more_source:
3759 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK) 3874 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
3760 { 3875 {
3761 *mask &= ~CATEGORY_MASK_BIG5; 3876 detect_info->rejected |= CATEGORY_MASK_BIG5;
3762 return 0; 3877 return 0;
3763 } 3878 }
3764 return found; 3879 detect_info->found |= found;
3880 return 1;
3765} 3881}
3766 3882
3767/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". 3883/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
@@ -3775,11 +3891,14 @@ decode_coding_sjis (coding)
3775 unsigned char *src_end = coding->source + coding->src_bytes; 3891 unsigned char *src_end = coding->source + coding->src_bytes;
3776 unsigned char *src_base; 3892 unsigned char *src_base;
3777 int *charbuf = coding->charbuf; 3893 int *charbuf = coding->charbuf;
3778 int *charbuf_end = charbuf + coding->charbuf_size; 3894 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
3779 int consumed_chars = 0, consumed_chars_base; 3895 int consumed_chars = 0, consumed_chars_base;
3780 int multibytep = coding->src_multibyte; 3896 int multibytep = coding->src_multibyte;
3781 struct charset *charset_roman, *charset_kanji, *charset_kana; 3897 struct charset *charset_roman, *charset_kanji, *charset_kana;
3782 Lisp_Object attrs, eol_type, charset_list, val; 3898 Lisp_Object attrs, eol_type, charset_list, val;
3899 int char_offset = coding->produced_char;
3900 int last_offset = char_offset;
3901 int last_id = charset_ascii;
3783 3902
3784 CODING_GET_INFO (coding, attrs, eol_type, charset_list); 3903 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3785 3904
@@ -3842,9 +3961,18 @@ decode_coding_sjis (coding)
3842 charset = charset_kana; 3961 charset = charset_kana;
3843 } 3962 }
3844 } 3963 }
3964 if (charset->id != charset_ascii
3965 && last_id != charset->id)
3966 {
3967 if (last_id != charset_ascii)
3968 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3969 last_id = charset->id;
3970 last_offset = char_offset;
3971 }
3845 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c); 3972 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
3846 } 3973 }
3847 *charbuf++ = c; 3974 *charbuf++ = c;
3975 char_offset++;
3848 continue; 3976 continue;
3849 3977
3850 invalid_code: 3978 invalid_code:
@@ -3852,10 +3980,13 @@ decode_coding_sjis (coding)
3852 consumed_chars = consumed_chars_base; 3980 consumed_chars = consumed_chars_base;
3853 ONE_MORE_BYTE (c); 3981 ONE_MORE_BYTE (c);
3854 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c); 3982 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3983 char_offset++;
3855 coding->errors++; 3984 coding->errors++;
3856 } 3985 }
3857 3986
3858 no_more_source: 3987 no_more_source:
3988 if (last_id != charset_ascii)
3989 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3859 coding->consumed_char += consumed_chars_base; 3990 coding->consumed_char += consumed_chars_base;
3860 coding->consumed = src_base - coding->source; 3991 coding->consumed = src_base - coding->source;
3861 coding->charbuf_used = charbuf - coding->charbuf; 3992 coding->charbuf_used = charbuf - coding->charbuf;
@@ -3869,11 +4000,14 @@ decode_coding_big5 (coding)
3869 unsigned char *src_end = coding->source + coding->src_bytes; 4000 unsigned char *src_end = coding->source + coding->src_bytes;
3870 unsigned char *src_base; 4001 unsigned char *src_base;
3871 int *charbuf = coding->charbuf; 4002 int *charbuf = coding->charbuf;
3872 int *charbuf_end = charbuf + coding->charbuf_size; 4003 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
3873 int consumed_chars = 0, consumed_chars_base; 4004 int consumed_chars = 0, consumed_chars_base;
3874 int multibytep = coding->src_multibyte; 4005 int multibytep = coding->src_multibyte;
3875 struct charset *charset_roman, *charset_big5; 4006 struct charset *charset_roman, *charset_big5;
3876 Lisp_Object attrs, eol_type, charset_list, val; 4007 Lisp_Object attrs, eol_type, charset_list, val;
4008 int char_offset = coding->produced_char;
4009 int last_offset = char_offset;
4010 int last_id = charset_ascii;
3877 4011
3878 CODING_GET_INFO (coding, attrs, eol_type, charset_list); 4012 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3879 val = charset_list; 4013 val = charset_list;
@@ -3923,10 +4057,19 @@ decode_coding_big5 (coding)
3923 c = c << 8 | c1; 4057 c = c << 8 | c1;
3924 charset = charset_big5; 4058 charset = charset_big5;
3925 } 4059 }
4060 if (charset->id != charset_ascii
4061 && last_id != charset->id)
4062 {
4063 if (last_id != charset_ascii)
4064 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4065 last_id = charset->id;
4066 last_offset = char_offset;
4067 }
3926 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c); 4068 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
3927 } 4069 }
3928 4070
3929 *charbuf++ = c; 4071 *charbuf++ = c;
4072 char_offset++;
3930 continue; 4073 continue;
3931 4074
3932 invalid_code: 4075 invalid_code:
@@ -3934,10 +4077,13 @@ decode_coding_big5 (coding)
3934 consumed_chars = consumed_chars_base; 4077 consumed_chars = consumed_chars_base;
3935 ONE_MORE_BYTE (c); 4078 ONE_MORE_BYTE (c);
3936 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c); 4079 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4080 char_offset++;
3937 coding->errors++; 4081 coding->errors++;
3938 } 4082 }
3939 4083
3940 no_more_source: 4084 no_more_source:
4085 if (last_id != charset_ascii)
4086 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3941 coding->consumed_char += consumed_chars_base; 4087 coding->consumed_char += consumed_chars_base;
3942 coding->consumed = src_base - coding->source; 4088 coding->consumed = src_base - coding->source;
3943 coding->charbuf_used = charbuf - coding->charbuf; 4089 coding->charbuf_used = charbuf - coding->charbuf;
@@ -4106,9 +4252,9 @@ encode_coding_big5 (coding)
4106 CATEGORY_MASK_CCL, else return 0. */ 4252 CATEGORY_MASK_CCL, else return 0. */
4107 4253
4108static int 4254static int
4109detect_coding_ccl (coding, mask) 4255detect_coding_ccl (coding, detect_info)
4110 struct coding_system *coding; 4256 struct coding_system *coding;
4111 int *mask; 4257 struct coding_detection_info *detect_info;
4112{ 4258{
4113 unsigned char *src = coding->source, *src_base = src; 4259 unsigned char *src = coding->source, *src_base = src;
4114 unsigned char *src_end = coding->source + coding->src_bytes; 4260 unsigned char *src_end = coding->source + coding->src_bytes;
@@ -4119,6 +4265,8 @@ detect_coding_ccl (coding, mask)
4119 int head_ascii = coding->head_ascii; 4265 int head_ascii = coding->head_ascii;
4120 Lisp_Object attrs; 4266 Lisp_Object attrs;
4121 4267
4268 detect_info->checked |= CATEGORY_MASK_CCL;
4269
4122 coding = &coding_categories[coding_category_ccl]; 4270 coding = &coding_categories[coding_category_ccl];
4123 attrs = CODING_ID_ATTRS (coding->id); 4271 attrs = CODING_ID_ATTRS (coding->id);
4124 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))) 4272 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
@@ -4130,14 +4278,15 @@ detect_coding_ccl (coding, mask)
4130 ONE_MORE_BYTE (c); 4278 ONE_MORE_BYTE (c);
4131 if (! valids[c]) 4279 if (! valids[c])
4132 break; 4280 break;
4133 if (!found && valids[c] > 1) 4281 if ((valids[c] > 1))
4134 found = 1; 4282 found = CATEGORY_MASK_CCL;
4135 } 4283 }
4136 *mask &= ~CATEGORY_MASK_CCL; 4284 detect_info->rejected |= CATEGORY_MASK_CCL;
4137 return 0; 4285 return 0;
4138 4286
4139 no_more_source: 4287 no_more_source:
4140 return found; 4288 detect_info->found |= found;
4289 return 1;
4141} 4290}
4142 4291
4143static void 4292static void
@@ -4375,10 +4524,14 @@ encode_coding_raw_text (coding)
4375 return 0; 4524 return 0;
4376} 4525}
4377 4526
4527/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4528 Check if a text is encoded in a charset-based coding system. If it
4529 is, return 1, else return 0. */
4530
4378static int 4531static int
4379detect_coding_charset (coding, mask) 4532detect_coding_charset (coding, detect_info)
4380 struct coding_system *coding; 4533 struct coding_system *coding;
4381 int *mask; 4534 struct coding_detection_info *detect_info;
4382{ 4535{
4383 unsigned char *src = coding->source, *src_base = src; 4536 unsigned char *src = coding->source, *src_base = src;
4384 unsigned char *src_end = coding->source + coding->src_bytes; 4537 unsigned char *src_end = coding->source + coding->src_bytes;
@@ -4387,6 +4540,8 @@ detect_coding_charset (coding, mask)
4387 Lisp_Object attrs, valids; 4540 Lisp_Object attrs, valids;
4388 int found = 0; 4541 int found = 0;
4389 4542
4543 detect_info->checked |= CATEGORY_MASK_CHARSET;
4544
4390 coding = &coding_categories[coding_category_charset]; 4545 coding = &coding_categories[coding_category_charset];
4391 attrs = CODING_ID_ATTRS (coding->id); 4546 attrs = CODING_ID_ATTRS (coding->id);
4392 valids = AREF (attrs, coding_attr_charset_valids); 4547 valids = AREF (attrs, coding_attr_charset_valids);
@@ -4402,13 +4557,14 @@ detect_coding_charset (coding, mask)
4402 if (NILP (AREF (valids, c))) 4557 if (NILP (AREF (valids, c)))
4403 break; 4558 break;
4404 if (c >= 0x80) 4559 if (c >= 0x80)
4405 found = 1; 4560 found = CATEGORY_MASK_CHARSET;
4406 } 4561 }
4407 *mask &= ~CATEGORY_MASK_CHARSET; 4562 detect_info->rejected |= CATEGORY_MASK_CHARSET;
4408 return 0; 4563 return 0;
4409 4564
4410 no_more_source: 4565 no_more_source:
4411 return (found || NILP (CODING_ATTR_ASCII_COMPAT (attrs))); 4566 detect_info->found |= found;
4567 return 1;
4412} 4568}
4413 4569
4414static void 4570static void
@@ -4419,10 +4575,13 @@ decode_coding_charset (coding)
4419 unsigned char *src_end = coding->source + coding->src_bytes; 4575 unsigned char *src_end = coding->source + coding->src_bytes;
4420 unsigned char *src_base; 4576 unsigned char *src_base;
4421 int *charbuf = coding->charbuf; 4577 int *charbuf = coding->charbuf;
4422 int *charbuf_end = charbuf + coding->charbuf_size; 4578 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4423 int consumed_chars = 0, consumed_chars_base; 4579 int consumed_chars = 0, consumed_chars_base;
4424 int multibytep = coding->src_multibyte; 4580 int multibytep = coding->src_multibyte;
4425 Lisp_Object attrs, eol_type, charset_list, valids; 4581 Lisp_Object attrs, eol_type, charset_list, valids;
4582 int char_offset = coding->produced_char;
4583 int last_offset = char_offset;
4584 int last_id = charset_ascii;
4426 4585
4427 CODING_GET_INFO (coding, attrs, eol_type, charset_list); 4586 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4428 valids = AREF (attrs, coding_attr_charset_valids); 4587 valids = AREF (attrs, coding_attr_charset_valids);
@@ -4503,8 +4662,17 @@ decode_coding_charset (coding)
4503 } 4662 }
4504 if (c < 0) 4663 if (c < 0)
4505 goto invalid_code; 4664 goto invalid_code;
4665 if (charset->id != charset_ascii
4666 && last_id != charset->id)
4667 {
4668 if (last_id != charset_ascii)
4669 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4670 last_id = charset->id;
4671 last_offset = char_offset;
4672 }
4506 } 4673 }
4507 *charbuf++ = c; 4674 *charbuf++ = c;
4675 char_offset++;
4508 continue; 4676 continue;
4509 4677
4510 invalid_code: 4678 invalid_code:
@@ -4512,10 +4680,13 @@ decode_coding_charset (coding)
4512 consumed_chars = consumed_chars_base; 4680 consumed_chars = consumed_chars_base;
4513 ONE_MORE_BYTE (c); 4681 ONE_MORE_BYTE (c);
4514 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c); 4682 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4683 char_offset++;
4515 coding->errors++; 4684 coding->errors++;
4516 } 4685 }
4517 4686
4518 no_more_source: 4687 no_more_source:
4688 if (last_id != charset_ascii)
4689 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4519 coding->consumed_char += consumed_chars_base; 4690 coding->consumed_char += consumed_chars_base;
4520 coding->consumed = src_base - coding->source; 4691 coding->consumed = src_base - coding->source;
4521 coding->charbuf_used = charbuf - coding->charbuf; 4692 coding->charbuf_used = charbuf - coding->charbuf;
@@ -4632,6 +4803,7 @@ setup_coding_system (coding_system, coding)
4632 { 4803 {
4633 int i; 4804 int i;
4634 int flags = XINT (AREF (attrs, coding_attr_iso_flags)); 4805 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4806 enum coding_category category = XINT (CODING_ATTR_CATEGORY (attrs));
4635 4807
4636 /* Invoke graphic register 0 to plane 0. */ 4808 /* Invoke graphic register 0 to plane 0. */
4637 CODING_ISO_INVOCATION (coding, 0) = 0; 4809 CODING_ISO_INVOCATION (coding, 0) = 0;
@@ -4655,6 +4827,8 @@ setup_coding_system (coding_system, coding)
4655 | CODING_REQUIRE_FLUSHING_MASK); 4827 | CODING_REQUIRE_FLUSHING_MASK);
4656 if (flags & CODING_ISO_FLAG_COMPOSITION) 4828 if (flags & CODING_ISO_FLAG_COMPOSITION)
4657 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK; 4829 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4830 if (flags & CODING_ISO_FLAG_DESIGNATION)
4831 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
4658 if (flags & CODING_ISO_FLAG_FULL_SUPPORT) 4832 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4659 { 4833 {
4660 setup_iso_safe_charsets (attrs); 4834 setup_iso_safe_charsets (attrs);
@@ -4930,9 +5104,12 @@ coding_inherit_eol_type (coding_system, parent)
4930#define EOL_SEEN_CR 2 5104#define EOL_SEEN_CR 2
4931#define EOL_SEEN_CRLF 4 5105#define EOL_SEEN_CRLF 4
4932 5106
4933/* Detect how end-of-line of a text of length CODING->src_bytes 5107/* Detect how end-of-line of a text of length SRC_BYTES pointed by
4934 pointed by CODING->source is encoded. Return one of 5108 SOURCE is encoded. If CATEGORY is one of
4935 EOL_SEEN_XXX. */ 5109 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5110 two bytes each, else they are encoded by one byte each.
5111
5112 Return one of EOL_SEEN_XXX. */
4936 5113
4937#define MAX_EOL_CHECK_COUNT 3 5114#define MAX_EOL_CHECK_COUNT 3
4938 5115
@@ -5057,7 +5234,6 @@ detect_coding (coding)
5057 now. */ 5234 now. */
5058 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided)) 5235 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5059 { 5236 {
5060 int mask = CATEGORY_MASK_ANY;
5061 int c, i; 5237 int c, i;
5062 5238
5063 for (src = coding->source; src < src_end; src++) 5239 for (src = coding->source; src < src_end; src++)
@@ -5072,46 +5248,43 @@ detect_coding (coding)
5072 5248
5073 if (coding->head_ascii < coding->src_bytes) 5249 if (coding->head_ascii < coding->src_bytes)
5074 { 5250 {
5075 int detected = 0; 5251 struct coding_detection_info detect_info;
5252 enum coding_category category;
5253 struct coding_system *this;
5076 5254
5255 detect_info.checked = detect_info.found = detect_info.rejected = 0;
5077 for (i = 0; i < coding_category_raw_text; i++) 5256 for (i = 0; i < coding_category_raw_text; i++)
5078 { 5257 {
5079 enum coding_category category = coding_priorities[i]; 5258 category = coding_priorities[i];
5080 struct coding_system *this = coding_categories + category; 5259 this = coding_categories + category;
5081
5082 if (this->id < 0) 5260 if (this->id < 0)
5083 { 5261 {
5084 /* No coding system of this category is defined. */ 5262 /* No coding system of this category is defined. */
5085 mask &= ~(1 << category); 5263 detect_info.rejected |= (1 << category);
5086 } 5264 }
5087 else if (category >= coding_category_raw_text 5265 else if (category >= coding_category_raw_text)
5088 || detected & (1 << category))
5089 continue; 5266 continue;
5090 else 5267 else if (detect_info.checked & (1 << category))
5091 { 5268 {
5092 detected |= detected_mask[category]; 5269 if (detect_info.found & (1 << category))
5093 if ((*(this->detector)) (coding, &mask) 5270 break;
5094 && (mask & (1 << category)))
5095 {
5096 mask = 1 << category;
5097 break;
5098 }
5099 } 5271 }
5272 else if ((*(this->detector)) (coding, &detect_info)
5273 && detect_info.found & (1 << category))
5274 break;
5100 } 5275 }
5101 if (! mask) 5276 if (i < coding_category_raw_text)
5277 setup_coding_system (CODING_ID_NAME (this->id), coding);
5278 else if (detect_info.rejected == CATEGORY_MASK_ANY)
5102 setup_coding_system (Qraw_text, coding); 5279 setup_coding_system (Qraw_text, coding);
5103 else if (mask != CATEGORY_MASK_ANY) 5280 else if (detect_info.rejected)
5104 for (i = 0; i < coding_category_raw_text; i++) 5281 for (i = 0; i < coding_category_raw_text; i++)
5105 { 5282 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5106 enum coding_category category = coding_priorities[i]; 5283 {
5107 struct coding_system *this = coding_categories + category; 5284 this = coding_categories + coding_priorities[i];
5108 5285 setup_coding_system (CODING_ID_NAME (this->id), coding);
5109 if (mask & (1 << category)) 5286 break;
5110 { 5287 }
5111 setup_coding_system (CODING_ID_NAME (this->id), coding);
5112 break;
5113 }
5114 }
5115 } 5288 }
5116 } 5289 }
5117 5290
@@ -5408,9 +5581,9 @@ produce_chars (coding)
5408 return produced_chars; 5581 return produced_chars;
5409} 5582}
5410 5583
5411/* [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN ] 5584/* Compose text in CODING->object according to the annotation data at
5412 or 5585 CHARBUF. CHARBUF is an array:
5413 [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN COMPONENTS... ] 5586 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5414 */ 5587 */
5415 5588
5416static INLINE void 5589static INLINE void
@@ -5418,18 +5591,15 @@ produce_composition (coding, charbuf)
5418 struct coding_system *coding; 5591 struct coding_system *coding;
5419 int *charbuf; 5592 int *charbuf;
5420{ 5593{
5421 Lisp_Object buffer;
5422 int len; 5594 int len;
5423 EMACS_INT pos; 5595 EMACS_INT from, to;
5424 enum composition_method method; 5596 enum composition_method method;
5425 int cmp_len;
5426 Lisp_Object components; 5597 Lisp_Object components;
5427 5598
5428 buffer = coding->dst_object;
5429 len = -charbuf[0]; 5599 len = -charbuf[0];
5430 pos = coding->dst_pos + charbuf[1]; 5600 from = coding->dst_pos + charbuf[2];
5431 method = (enum composition_method) (charbuf[3]); 5601 to = coding->dst_pos + charbuf[3];
5432 cmp_len = charbuf[4]; 5602 method = (enum composition_method) (charbuf[4]);
5433 5603
5434 if (method == COMPOSITION_RELATIVE) 5604 if (method == COMPOSITION_RELATIVE)
5435 components = Qnil; 5605 components = Qnil;
@@ -5445,65 +5615,30 @@ produce_composition (coding, charbuf)
5445 components = (method == COMPOSITION_WITH_ALTCHARS 5615 components = (method == COMPOSITION_WITH_ALTCHARS
5446 ? Fstring (len, args) : Fvector (len, args)); 5616 ? Fstring (len, args) : Fvector (len, args));
5447 } 5617 }
5448 compose_text (pos, pos + cmp_len, components, Qnil, Qnil); 5618 compose_text (from, to, components, Qnil, coding->dst_object);
5449} 5619}
5450 5620
5451static int *
5452save_composition_data (buf, buf_end, prop)
5453 int *buf, *buf_end;
5454 Lisp_Object prop;
5455{
5456 enum composition_method method = COMPOSITION_METHOD (prop);
5457 int cmp_len = COMPOSITION_LENGTH (prop);
5458
5459 if (buf + 4 + (MAX_COMPOSITION_COMPONENTS * 2 - 1) > buf_end)
5460 return NULL;
5461 5621
5462 buf[1] = CODING_ANNOTATE_COMPOSITION_MASK; 5622/* Put `charset' property on text in CODING->object according to
5463 buf[2] = method; 5623 the annotation data at CHARBUF. CHARBUF is an array:
5464 buf[3] = cmp_len; 5624 [ -LENGTH ANNOTATION_MASK FROM TO CHARSET-ID ]
5465 5625 */
5466 if (method == COMPOSITION_RELATIVE)
5467 buf[0] = 4;
5468 else
5469 {
5470 Lisp_Object components;
5471 int len, i;
5472 5626
5473 components = COMPOSITION_COMPONENTS (prop); 5627static INLINE void
5474 if (VECTORP (components)) 5628produce_charset (coding, charbuf)
5475 { 5629 struct coding_system *coding;
5476 len = XVECTOR (components)->size; 5630 int *charbuf;
5477 for (i = 0; i < len; i++) 5631{
5478 buf[4 + i] = XINT (AREF (components, i)); 5632 EMACS_INT from = coding->dst_pos + charbuf[2];
5479 } 5633 EMACS_INT to = coding->dst_pos + charbuf[3];
5480 else if (STRINGP (components)) 5634 struct charset *charset = CHARSET_FROM_ID (charbuf[4]);
5481 {
5482 int i_byte;
5483 5635
5484 len = XSTRING (components)->size; 5636 Fput_text_property (make_number (from), make_number (to),
5485 i = i_byte = 0; 5637 Qcharset, CHARSET_NAME (charset),
5486 while (i < len) 5638 coding->dst_object);
5487 FETCH_STRING_CHAR_ADVANCE (buf[4 + i], components, i, i_byte);
5488 }
5489 else if (INTEGERP (components))
5490 {
5491 len = 1;
5492 buf[4] = XINT (components);
5493 }
5494 else if (CONSP (components))
5495 {
5496 for (len = 0; CONSP (components);
5497 len++, components = XCDR (components))
5498 buf[4 + len] = XINT (XCAR (components));
5499 }
5500 else
5501 abort ();
5502 buf[0] = 4 + len;
5503 }
5504 return (buf + buf[0]);
5505} 5639}
5506 5640
5641
5507#define CHARBUF_SIZE 0x4000 5642#define CHARBUF_SIZE 0x4000
5508 5643
5509#define ALLOC_CONVERSION_WORK_AREA(coding) \ 5644#define ALLOC_CONVERSION_WORK_AREA(coding) \
@@ -5534,6 +5669,9 @@ produce_annotation (coding)
5534 int *charbuf = coding->charbuf; 5669 int *charbuf = coding->charbuf;
5535 int *charbuf_end = charbuf + coding->charbuf_used; 5670 int *charbuf_end = charbuf + coding->charbuf_used;
5536 5671
5672 if (NILP (coding->dst_object))
5673 return;
5674
5537 while (charbuf < charbuf_end) 5675 while (charbuf < charbuf_end)
5538 { 5676 {
5539 if (*charbuf >= 0) 5677 if (*charbuf >= 0)
@@ -5541,11 +5679,14 @@ produce_annotation (coding)
5541 else 5679 else
5542 { 5680 {
5543 int len = -*charbuf; 5681 int len = -*charbuf;
5544 switch (charbuf[2]) 5682 switch (charbuf[1])
5545 { 5683 {
5546 case CODING_ANNOTATE_COMPOSITION_MASK: 5684 case CODING_ANNOTATE_COMPOSITION_MASK:
5547 produce_composition (coding, charbuf); 5685 produce_composition (coding, charbuf);
5548 break; 5686 break;
5687 case CODING_ANNOTATE_CHARSET_MASK:
5688 produce_charset (coding, charbuf);
5689 break;
5549 default: 5690 default:
5550 abort (); 5691 abort ();
5551 } 5692 }
@@ -5669,41 +5810,159 @@ decode_coding (coding)
5669 return coding->result; 5810 return coding->result;
5670} 5811}
5671 5812
5813
5814/* Extract annotation data from a composition starting at POS and
5815 ending before LIMIT of CODING->src_object (buffer or string), store
5816 the data in BUF, set *STOP to the starting position of the next
5817 composition (if any) or to LIMIT, and return the address of the
5818 next element of BUF.
5819
5820 If no such annotation is found, set *STOP to the starting
5821 position of a composition after POS (if any) or to LIMIT, and
5822 return BUF. */
5823
5824static INLINE int *
5825handle_composition_annotation (pos, limit, coding, buf, stop)
5826 EMACS_INT pos, limit;
5827 struct coding_system *coding;
5828 int *buf;
5829 EMACS_INT *stop;
5830{
5831 EMACS_INT start, end;
5832 Lisp_Object prop;
5833
5834 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
5835 || end > limit)
5836 *stop = limit;
5837 else if (start > pos)
5838 *stop = start;
5839 else
5840 {
5841 if (start == pos)
5842 {
5843 /* We found a composition. Store the corresponding
5844 annotation data in BUF. */
5845 int *head = buf;
5846 enum composition_method method = COMPOSITION_METHOD (prop);
5847 int nchars = COMPOSITION_LENGTH (prop);
5848
5849 ADD_COMPOSITION_DATA (buf, 0, nchars, method);
5850 if (method != COMPOSITION_RELATIVE)
5851 {
5852 Lisp_Object components;
5853 int len, i, i_byte;
5854
5855 components = COMPOSITION_COMPONENTS (prop);
5856 if (VECTORP (components))
5857 {
5858 len = XVECTOR (components)->size;
5859 for (i = 0; i < len; i++)
5860 *buf++ = XINT (AREF (components, i));
5861 }
5862 else if (STRINGP (components))
5863 {
5864 len = XSTRING (components)->size;
5865 i = i_byte = 0;
5866 while (i < len)
5867 {
5868 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
5869 buf++;
5870 }
5871 }
5872 else if (INTEGERP (components))
5873 {
5874 len = 1;
5875 *buf++ = XINT (components);
5876 }
5877 else if (CONSP (components))
5878 {
5879 for (len = 0; CONSP (components);
5880 len++, components = XCDR (components))
5881 *buf++ = XINT (XCAR (components));
5882 }
5883 else
5884 abort ();
5885 *head -= len;
5886 }
5887 }
5888
5889 if (find_composition (end, limit, &start, &end, &prop,
5890 coding->src_object)
5891 && end <= limit)
5892 *stop = start;
5893 else
5894 *stop = limit;
5895 }
5896 return buf;
5897}
5898
5899
5900/* Extract annotation data from a text property `charset' at POS of
5901 CODING->src_object (buffer or string), store the data in BUF, set
5902 *STOP to the position where the value of `charset' property changes
5903 (limiting by LIMIT), and return the address of the next element of
5904 BUF.
5905
5906 If the property value is nil, set *STOP to the position where the
5907 property value is non-nil (limiting by LIMIT), and return BUF. */
5908
5909static INLINE int *
5910handle_charset_annotation (pos, limit, coding, buf, stop)
5911 EMACS_INT pos, limit;
5912 struct coding_system *coding;
5913 int *buf;
5914 EMACS_INT *stop;
5915{
5916 Lisp_Object val, next;
5917 int id;
5918
5919 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
5920 if (! NILP (val) && CHARSETP (val))
5921 id = XINT (CHARSET_SYMBOL_ID (val));
5922 else
5923 id = -1;
5924 ADD_CHARSET_DATA (buf, 0, 0, id);
5925 next = Fnext_single_property_change (make_number (pos), Qcharset,
5926 coding->src_object,
5927 make_number (limit));
5928 *stop = XINT (next);
5929 return buf;
5930}
5931
5932
5672static void 5933static void
5673consume_chars (coding) 5934consume_chars (coding)
5674 struct coding_system *coding; 5935 struct coding_system *coding;
5675{ 5936{
5676 int *buf = coding->charbuf; 5937 int *buf = coding->charbuf;
5677 /* -1 is to compensate for CRLF. */ 5938 int *buf_end = coding->charbuf + coding->charbuf_size;
5678 int *buf_end = coding->charbuf + coding->charbuf_size - 1;
5679 const unsigned char *src = coding->source + coding->consumed; 5939 const unsigned char *src = coding->source + coding->consumed;
5680 int pos = coding->src_pos + coding->consumed_char; 5940 EMACS_INT pos = coding->src_pos + coding->consumed_char;
5681 int end_pos = coding->src_pos + coding->src_chars; 5941 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
5682 int multibytep = coding->src_multibyte; 5942 int multibytep = coding->src_multibyte;
5683 Lisp_Object eol_type; 5943 Lisp_Object eol_type;
5684 int c; 5944 int c;
5685 int start, end, stop; 5945 EMACS_INT stop, stop_composition, stop_charset;
5686 Lisp_Object object, prop; 5946 int id;
5687 5947
5688 eol_type = CODING_ID_EOL_TYPE (coding->id); 5948 eol_type = CODING_ID_EOL_TYPE (coding->id);
5689 if (VECTORP (eol_type)) 5949 if (VECTORP (eol_type))
5690 eol_type = Qunix; 5950 eol_type = Qunix;
5691 5951
5692 object = coding->src_object;
5693
5694 /* Note: composition handling is not yet implemented. */ 5952 /* Note: composition handling is not yet implemented. */
5695 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK; 5953 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
5696 5954
5697 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK 5955 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
5698 && find_composition (pos, end_pos, &start, &end, &prop, object) 5956 stop = stop_composition = pos;
5699 && end <= end_pos 5957 else
5700 && (start >= pos 5958 stop = stop_composition = end_pos;
5701 || (find_composition (end, end_pos, &start, &end, &prop, object) 5959 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
5702 && end <= end_pos))) 5960 stop = stop_charset = pos;
5703 stop = start;
5704 else 5961 else
5705 stop = end_pos; 5962 stop_charset = end_pos;
5706 5963
5964 /* Compensate for CRLF and annotation. */
5965 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
5707 while (buf < buf_end) 5966 while (buf < buf_end)
5708 { 5967 {
5709 if (pos == stop) 5968 if (pos == stop)
@@ -5712,15 +5971,14 @@ consume_chars (coding)
5712 5971
5713 if (pos == end_pos) 5972 if (pos == end_pos)
5714 break; 5973 break;
5715 p = save_composition_data (buf, buf_end, prop); 5974 if (pos == stop_composition)
5716 if (p == NULL) 5975 buf = handle_composition_annotation (pos, end_pos, coding,
5717 break; 5976 buf, &stop_composition);
5718 buf = p; 5977 if (pos == stop_charset)
5719 if (find_composition (end, end_pos, &start, &end, &prop, object) 5978 buf = handle_charset_annotation (pos, end_pos, coding,
5720 && end <= end_pos) 5979 buf, &stop_charset);
5721 stop = start; 5980 stop = (stop_composition < stop_charset
5722 else 5981 ? stop_composition : stop_charset);
5723 stop = end_pos;
5724 } 5982 }
5725 5983
5726 if (! multibytep) 5984 if (! multibytep)
@@ -6162,16 +6420,16 @@ encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6162 else if (BUFFERP (src_object)) 6420 else if (BUFFERP (src_object))
6163 { 6421 {
6164 set_buffer_internal (XBUFFER (src_object)); 6422 set_buffer_internal (XBUFFER (src_object));
6165 if (from != GPT)
6166 move_gap_both (from, from_byte);
6167 if (EQ (src_object, dst_object)) 6423 if (EQ (src_object, dst_object))
6168 { 6424 {
6169 del_range_both (from, from_byte, to, to_byte, 1); 6425 coding->src_object = del_range_1 (from, to, 1, 1);
6170 coding->src_pos = -chars; 6426 coding->src_pos = 0;
6171 coding->src_pos_byte = -bytes; 6427 coding->src_pos_byte = 0;
6172 } 6428 }
6173 else 6429 else
6174 { 6430 {
6431 if (from < GPT && to >= GPT)
6432 move_gap_both (from, from_byte);
6175 coding->src_pos = from; 6433 coding->src_pos = from;
6176 coding->src_pos_byte = from_byte; 6434 coding->src_pos_byte = from_byte;
6177 } 6435 }
@@ -6320,12 +6578,11 @@ detect_coding_system (src, src_bytes, highest, multibytep, coding_system)
6320{ 6578{
6321 unsigned char *src_end = src + src_bytes; 6579 unsigned char *src_end = src + src_bytes;
6322 int mask = CATEGORY_MASK_ANY; 6580 int mask = CATEGORY_MASK_ANY;
6323 int detected = 0;
6324 int c, i;
6325 Lisp_Object attrs, eol_type; 6581 Lisp_Object attrs, eol_type;
6326 Lisp_Object val; 6582 Lisp_Object val;
6327 struct coding_system coding; 6583 struct coding_system coding;
6328 int id; 6584 int id;
6585 struct coding_detection_info detect_info;
6329 6586
6330 if (NILP (coding_system)) 6587 if (NILP (coding_system))
6331 coding_system = Qundecided; 6588 coding_system = Qundecided;
@@ -6340,9 +6597,15 @@ detect_coding_system (src, src_bytes, highest, multibytep, coding_system)
6340 coding.consumed = 0; 6597 coding.consumed = 0;
6341 coding.mode |= CODING_MODE_LAST_BLOCK; 6598 coding.mode |= CODING_MODE_LAST_BLOCK;
6342 6599
6600 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6601
6343 /* At first, detect text-format if necessary. */ 6602 /* At first, detect text-format if necessary. */
6344 if (XINT (CODING_ATTR_CATEGORY (attrs)) == coding_category_undecided) 6603 if (XINT (CODING_ATTR_CATEGORY (attrs)) == coding_category_undecided)
6345 { 6604 {
6605 enum coding_category category;
6606 struct coding_system *this;
6607 int c, i;
6608
6346 for (; src < src_end; src++) 6609 for (; src < src_end; src++)
6347 { 6610 {
6348 c = *src; 6611 c = *src;
@@ -6357,64 +6620,92 @@ detect_coding_system (src, src_bytes, highest, multibytep, coding_system)
6357 if (src < src_end) 6620 if (src < src_end)
6358 for (i = 0; i < coding_category_raw_text; i++) 6621 for (i = 0; i < coding_category_raw_text; i++)
6359 { 6622 {
6360 enum coding_category category = coding_priorities[i]; 6623 category = coding_priorities[i];
6361 struct coding_system *this = coding_categories + category; 6624 this = coding_categories + category;
6362 6625
6363 if (this->id < 0) 6626 if (this->id < 0)
6364 { 6627 {
6365 /* No coding system of this category is defined. */ 6628 /* No coding system of this category is defined. */
6366 mask &= ~(1 << category); 6629 detect_info.rejected |= (1 << category);
6367 } 6630 }
6368 else if (category >= coding_category_raw_text 6631 else if (category >= coding_category_raw_text)
6369 || detected & (1 << category))
6370 continue; 6632 continue;
6633 else if (detect_info.checked & (1 << category))
6634 {
6635 if (highest
6636 && (detect_info.found & (1 << category)))
6637 break;
6638 }
6371 else 6639 else
6372 { 6640 {
6373 detected |= detected_mask[category]; 6641 if ((*(this->detector)) (&coding, &detect_info)
6374 if ((*(coding_categories[category].detector)) (&coding, &mask)
6375 && highest 6642 && highest
6376 && (mask & (1 << category))) 6643 && (detect_info.found & (1 << category)))
6377 { 6644 break;
6378 mask = 1 << category;
6379 break;
6380 }
6381 } 6645 }
6382 } 6646 }
6383 6647
6384 if (!mask) 6648
6649 if (detect_info.rejected == CATEGORY_MASK_ANY)
6385 { 6650 {
6651 detect_info.found = CATEGORY_MASK_RAW_TEXT;
6386 id = coding_categories[coding_category_raw_text].id; 6652 id = coding_categories[coding_category_raw_text].id;
6387 val = Fcons (make_number (id), Qnil); 6653 val = Fcons (make_number (id), Qnil);
6388 } 6654 }
6389 else if (mask == CATEGORY_MASK_ANY) 6655 else if (! detect_info.rejected && ! detect_info.found)
6390 { 6656 {
6657 detect_info.found = CATEGORY_MASK_ANY;
6391 id = coding_categories[coding_category_undecided].id; 6658 id = coding_categories[coding_category_undecided].id;
6392 val = Fcons (make_number (id), Qnil); 6659 val = Fcons (make_number (id), Qnil);
6393 } 6660 }
6394 else if (highest) 6661 else if (highest)
6395 { 6662 {
6396 for (i = 0; i < coding_category_raw_text; i++) 6663 if (detect_info.found)
6397 if (mask & (1 << coding_priorities[i])) 6664 {
6398 { 6665 detect_info.found = 1 << category;
6399 id = coding_categories[coding_priorities[i]].id; 6666 val = Fcons (make_number (this->id), Qnil);
6400 val = Fcons (make_number (id), Qnil); 6667 }
6401 break; 6668 else
6402 } 6669 for (i = 0; i < coding_category_raw_text; i++)
6403 } 6670 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6671 {
6672 detect_info.found = 1 << coding_priorities[i];
6673 id = coding_categories[coding_priorities[i]].id;
6674 val = Fcons (make_number (id), Qnil);
6675 break;
6676 }
6677 }
6404 else 6678 else
6405 { 6679 {
6680 int mask = detect_info.rejected | detect_info.found;
6681 int found = 0;
6406 val = Qnil; 6682 val = Qnil;
6683
6407 for (i = coding_category_raw_text - 1; i >= 0; i--) 6684 for (i = coding_category_raw_text - 1; i >= 0; i--)
6408 if (mask & (1 << coding_priorities[i])) 6685 {
6409 { 6686 category = coding_priorities[i];
6410 id = coding_categories[coding_priorities[i]].id; 6687 if (! (mask & (1 << category)))
6411 val = Fcons (make_number (id), val); 6688 {
6412 } 6689 found |= 1 << category;
6690 id = coding_categories[category].id;
6691 val = Fcons (make_number (id), val);
6692 }
6693 }
6694 for (i = coding_category_raw_text - 1; i >= 0; i--)
6695 {
6696 category = coding_priorities[i];
6697 if (detect_info.found & (1 << category))
6698 {
6699 id = coding_categories[category].id;
6700 val = Fcons (make_number (id), val);
6701 }
6702 }
6703 detect_info.found |= found;
6413 } 6704 }
6414 } 6705 }
6415 else 6706 else
6416 { 6707 {
6417 mask = 1 << XINT (CODING_ATTR_CATEGORY (attrs)); 6708 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
6418 val = Fcons (make_number (coding.id), Qnil); 6709 val = Fcons (make_number (coding.id), Qnil);
6419 } 6710 }
6420 6711
@@ -6425,13 +6716,15 @@ detect_coding_system (src, src_bytes, highest, multibytep, coding_system)
6425 6716
6426 if (VECTORP (eol_type)) 6717 if (VECTORP (eol_type))
6427 { 6718 {
6428 if (mask & ~CATEGORY_MASK_UTF_16) 6719 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
6429 normal_eol = detect_eol (coding.source, src_bytes, 6720 normal_eol = detect_eol (coding.source, src_bytes,
6430 coding_category_raw_text); 6721 coding_category_raw_text);
6431 if (mask & (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_BE_NOSIG)) 6722 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
6723 | CATEGORY_MASK_UTF_16_BE_NOSIG))
6432 utf_16_be_eol = detect_eol (coding.source, src_bytes, 6724 utf_16_be_eol = detect_eol (coding.source, src_bytes,
6433 coding_category_utf_16_be); 6725 coding_category_utf_16_be);
6434 if (mask & (CATEGORY_MASK_UTF_16_LE | CATEGORY_MASK_UTF_16_LE_NOSIG)) 6726 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
6727 | CATEGORY_MASK_UTF_16_LE_NOSIG))
6435 utf_16_le_eol = detect_eol (coding.source, src_bytes, 6728 utf_16_le_eol = detect_eol (coding.source, src_bytes,
6436 coding_category_utf_16_le); 6729 coding_category_utf_16_le);
6437 } 6730 }