diff options
| -rw-r--r-- | src/coding.c | 1091 |
1 files changed, 692 insertions, 399 deletions
diff --git a/src/coding.c b/src/coding.c index 6c898b878f4..8340e8dc271 100644 --- a/src/coding.c +++ b/src/coding.c | |||
| @@ -144,26 +144,23 @@ STRUCT CODING_SYSTEM | |||
| 144 | /*** GENERAL NOTES on `detect_coding_XXX ()' functions *** | 144 | /*** GENERAL NOTES on `detect_coding_XXX ()' functions *** |
| 145 | 145 | ||
| 146 | These functions check if a byte sequence specified as a source in | 146 | These functions check if a byte sequence specified as a source in |
| 147 | CODING conforms to the format of XXX. Return 1 if the data contains | 147 | CODING conforms to the format of XXX, and update the members of |
| 148 | a byte sequence which can be decoded into non-ASCII characters by | 148 | DETECT_INFO. |
| 149 | the coding system. Otherwize (i.e. the data contains only ASCII | ||
| 150 | characters or invalid sequence) return 0. | ||
| 151 | 149 | ||
| 152 | It also resets some bits of an integer pointed by MASK. The macros | 150 | Return 1 if the byte sequence conforms to XXX, otherwise return 0. |
| 153 | CATEGORY_MASK_XXX specifies each bit of this integer. | ||
| 154 | 151 | ||
| 155 | Below is the template of these functions. */ | 152 | Below is the template of these functions. */ |
| 156 | 153 | ||
| 157 | #if 0 | 154 | #if 0 |
| 158 | static int | 155 | static int |
| 159 | detect_coding_XXX (coding, mask) | 156 | detect_coding_XXX (coding, detect_info) |
| 160 | struct coding_system *coding; | 157 | struct coding_system *coding; |
| 161 | int *mask; | 158 | struct coding_detection_info *detect_info; |
| 162 | { | 159 | { |
| 163 | unsigned char *src = coding->source; | 160 | unsigned char *src = coding->source; |
| 164 | unsigned char *src_end = coding->source + coding->src_bytes; | 161 | unsigned char *src_end = coding->source + coding->src_bytes; |
| 165 | int multibytep = coding->src_multibyte; | 162 | int multibytep = coding->src_multibyte; |
| 166 | int c; | 163 | int consumed_chars = 0; |
| 167 | int found = 0; | 164 | int found = 0; |
| 168 | ...; | 165 | ...; |
| 169 | 166 | ||
| @@ -172,18 +169,19 @@ detect_coding_XXX (coding, mask) | |||
| 172 | /* Get one byte from the source. If the source is exhausted, jump | 169 | /* Get one byte from the source. If the source is exhausted, jump |
| 173 | to no_more_source:. */ | 170 | to no_more_source:. */ |
| 174 | ONE_MORE_BYTE (c); | 171 | ONE_MORE_BYTE (c); |
| 175 | /* Check if it conforms to XXX. If not, break the loop. */ | 172 | |
| 173 | if (! __C_conforms_to_XXX___ (c)) | ||
| 174 | break; | ||
| 175 | if (! __C_strongly_suggests_XXX__ (c)) | ||
| 176 | found = CATEGORY_MASK_XXX; | ||
| 176 | } | 177 | } |
| 177 | /* As the data is invalid for XXX, reset a proper bits. */ | 178 | /* The byte sequence is invalid for XXX. */ |
| 178 | *mask &= ~CODING_CATEGORY_XXX; | 179 | detect_info->rejected |= CATEGORY_MASK_XXX; |
| 179 | return 0; | 180 | return 0; |
| 181 | |||
| 180 | no_more_source: | 182 | no_more_source: |
| 181 | /* The source exhausted. */ | 183 | /* The source exhausted successfully. */ |
| 182 | if (!found) | 184 | detect_info->found |= found; |
| 183 | /* ASCII characters only. */ | ||
| 184 | return 0; | ||
| 185 | /* Some data should be decoded into non-ASCII characters. */ | ||
| 186 | *mask &= CODING_CATEGORY_XXX; | ||
| 187 | return 1; | 185 | return 1; |
| 188 | } | 186 | } |
| 189 | #endif | 187 | #endif |
| @@ -408,31 +406,38 @@ Lisp_Object Vsjis_coding_system; | |||
| 408 | Lisp_Object Vbig5_coding_system; | 406 | Lisp_Object Vbig5_coding_system; |
| 409 | 407 | ||
| 410 | 408 | ||
| 411 | static int detect_coding_utf_8 P_ ((struct coding_system *, int *)); | 409 | static int detect_coding_utf_8 P_ ((struct coding_system *, |
| 410 | struct coding_detection_info *info)); | ||
| 412 | static void decode_coding_utf_8 P_ ((struct coding_system *)); | 411 | static void decode_coding_utf_8 P_ ((struct coding_system *)); |
| 413 | static int encode_coding_utf_8 P_ ((struct coding_system *)); | 412 | static int encode_coding_utf_8 P_ ((struct coding_system *)); |
| 414 | 413 | ||
| 415 | static int detect_coding_utf_16 P_ ((struct coding_system *, int *)); | 414 | static int detect_coding_utf_16 P_ ((struct coding_system *, |
| 415 | struct coding_detection_info *info)); | ||
| 416 | static void decode_coding_utf_16 P_ ((struct coding_system *)); | 416 | static void decode_coding_utf_16 P_ ((struct coding_system *)); |
| 417 | static int encode_coding_utf_16 P_ ((struct coding_system *)); | 417 | static int encode_coding_utf_16 P_ ((struct coding_system *)); |
| 418 | 418 | ||
| 419 | static int detect_coding_iso_2022 P_ ((struct coding_system *, int *)); | 419 | static int detect_coding_iso_2022 P_ ((struct coding_system *, |
| 420 | struct coding_detection_info *info)); | ||
| 420 | static void decode_coding_iso_2022 P_ ((struct coding_system *)); | 421 | static void decode_coding_iso_2022 P_ ((struct coding_system *)); |
| 421 | static int encode_coding_iso_2022 P_ ((struct coding_system *)); | 422 | static int encode_coding_iso_2022 P_ ((struct coding_system *)); |
| 422 | 423 | ||
| 423 | static int detect_coding_emacs_mule P_ ((struct coding_system *, int *)); | 424 | static int detect_coding_emacs_mule P_ ((struct coding_system *, |
| 425 | struct coding_detection_info *info)); | ||
| 424 | static void decode_coding_emacs_mule P_ ((struct coding_system *)); | 426 | static void decode_coding_emacs_mule P_ ((struct coding_system *)); |
| 425 | static int encode_coding_emacs_mule P_ ((struct coding_system *)); | 427 | static int encode_coding_emacs_mule P_ ((struct coding_system *)); |
| 426 | 428 | ||
| 427 | static int detect_coding_sjis P_ ((struct coding_system *, int *)); | 429 | static int detect_coding_sjis P_ ((struct coding_system *, |
| 430 | struct coding_detection_info *info)); | ||
| 428 | static void decode_coding_sjis P_ ((struct coding_system *)); | 431 | static void decode_coding_sjis P_ ((struct coding_system *)); |
| 429 | static int encode_coding_sjis P_ ((struct coding_system *)); | 432 | static int encode_coding_sjis P_ ((struct coding_system *)); |
| 430 | 433 | ||
| 431 | static int detect_coding_big5 P_ ((struct coding_system *, int *)); | 434 | static int detect_coding_big5 P_ ((struct coding_system *, |
| 435 | struct coding_detection_info *info)); | ||
| 432 | static void decode_coding_big5 P_ ((struct coding_system *)); | 436 | static void decode_coding_big5 P_ ((struct coding_system *)); |
| 433 | static int encode_coding_big5 P_ ((struct coding_system *)); | 437 | static int encode_coding_big5 P_ ((struct coding_system *)); |
| 434 | 438 | ||
| 435 | static int detect_coding_ccl P_ ((struct coding_system *, int *)); | 439 | static int detect_coding_ccl P_ ((struct coding_system *, |
| 440 | struct coding_detection_info *info)); | ||
| 436 | static void decode_coding_ccl P_ ((struct coding_system *)); | 441 | static void decode_coding_ccl P_ ((struct coding_system *)); |
| 437 | static int encode_coding_ccl P_ ((struct coding_system *)); | 442 | static int encode_coding_ccl P_ ((struct coding_system *)); |
| 438 | 443 | ||
| @@ -631,6 +636,7 @@ enum coding_category | |||
| 631 | #define CATEGORY_MASK_BIG5 (1 << coding_category_big5) | 636 | #define CATEGORY_MASK_BIG5 (1 << coding_category_big5) |
| 632 | #define CATEGORY_MASK_CCL (1 << coding_category_ccl) | 637 | #define CATEGORY_MASK_CCL (1 << coding_category_ccl) |
| 633 | #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule) | 638 | #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule) |
| 639 | #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text) | ||
| 634 | 640 | ||
| 635 | /* This value is returned if detect_coding_mask () find nothing other | 641 | /* This value is returned if detect_coding_mask () find nothing other |
| 636 | than ASCII characters. */ | 642 | than ASCII characters. */ |
| @@ -1002,6 +1008,54 @@ alloc_destination (coding, nbytes, dst) | |||
| 1002 | return dst; | 1008 | return dst; |
| 1003 | } | 1009 | } |
| 1004 | 1010 | ||
| 1011 | /** Macros for annotations. */ | ||
| 1012 | |||
| 1013 | /* Maximum length of annotation data (sum of annotations for | ||
| 1014 | composition and charset). */ | ||
| 1015 | #define MAX_ANNOTATION_LENGTH (5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 5) | ||
| 1016 | |||
| 1017 | /* An annotation data is stored in the array coding->charbuf in this | ||
| 1018 | format: | ||
| 1019 | [ -LENGTH ANNOTATION_MASK FROM TO ... ] | ||
| 1020 | LENGTH is the number of elements in the annotation. | ||
| 1021 | ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK. | ||
| 1022 | FROM and TO specify the range of text annotated. They are relative | ||
| 1023 | to coding->src_pos (on encoding) or coding->dst_pos (on decoding). | ||
| 1024 | |||
| 1025 | The format of the following elements depends on ANNOTATION_MASK. | ||
| 1026 | |||
| 1027 | In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements | ||
| 1028 | follow: | ||
| 1029 | ... METHOD [ COMPOSITION-COMPONENTS ... ] | ||
| 1030 | METHOD is one of enum composition_method. | ||
| 1031 | Optional COMPOSITION-COMPONENTS are characters and composition | ||
| 1032 | rules. | ||
| 1033 | |||
| 1034 | In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID | ||
| 1035 | follows. */ | ||
| 1036 | |||
| 1037 | #define ADD_ANNOTATION_DATA(buf, len, mask, from, to) \ | ||
| 1038 | do { \ | ||
| 1039 | *(buf)++ = -(len); \ | ||
| 1040 | *(buf)++ = (mask); \ | ||
| 1041 | *(buf)++ = (from); \ | ||
| 1042 | *(buf)++ = (to); \ | ||
| 1043 | coding->annotated = 1; \ | ||
| 1044 | } while (0); | ||
| 1045 | |||
| 1046 | #define ADD_COMPOSITION_DATA(buf, from, to, method) \ | ||
| 1047 | do { \ | ||
| 1048 | ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, from, to); \ | ||
| 1049 | *buf++ = method; \ | ||
| 1050 | } while (0) | ||
| 1051 | |||
| 1052 | |||
| 1053 | #define ADD_CHARSET_DATA(buf, from, to, id) \ | ||
| 1054 | do { \ | ||
| 1055 | ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_CHARSET_MASK, from, to); \ | ||
| 1056 | *buf++ = id; \ | ||
| 1057 | } while (0) | ||
| 1058 | |||
| 1005 | 1059 | ||
| 1006 | /*** 2. Emacs' internal format (emacs-utf-8) ***/ | 1060 | /*** 2. Emacs' internal format (emacs-utf-8) ***/ |
| 1007 | 1061 | ||
| @@ -1011,8 +1065,8 @@ alloc_destination (coding, nbytes, dst) | |||
| 1011 | /*** 3. UTF-8 ***/ | 1065 | /*** 3. UTF-8 ***/ |
| 1012 | 1066 | ||
| 1013 | /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". | 1067 | /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". |
| 1014 | Check if a text is encoded in UTF-8. If it is, return | 1068 | Check if a text is encoded in UTF-8. If it is, return 1, else |
| 1015 | CATEGORY_MASK_UTF_8, else return 0. */ | 1069 | return 0. */ |
| 1016 | 1070 | ||
| 1017 | #define UTF_8_1_OCTET_P(c) ((c) < 0x80) | 1071 | #define UTF_8_1_OCTET_P(c) ((c) < 0x80) |
| 1018 | #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80) | 1072 | #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80) |
| @@ -1022,9 +1076,9 @@ alloc_destination (coding, nbytes, dst) | |||
| 1022 | #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8) | 1076 | #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8) |
| 1023 | 1077 | ||
| 1024 | static int | 1078 | static int |
| 1025 | detect_coding_utf_8 (coding, mask) | 1079 | detect_coding_utf_8 (coding, detect_info) |
| 1026 | struct coding_system *coding; | 1080 | struct coding_system *coding; |
| 1027 | int *mask; | 1081 | struct coding_detection_info *detect_info; |
| 1028 | { | 1082 | { |
| 1029 | unsigned char *src = coding->source, *src_base = src; | 1083 | unsigned char *src = coding->source, *src_base = src; |
| 1030 | unsigned char *src_end = coding->source + coding->src_bytes; | 1084 | unsigned char *src_end = coding->source + coding->src_bytes; |
| @@ -1033,6 +1087,7 @@ detect_coding_utf_8 (coding, mask) | |||
| 1033 | int found = 0; | 1087 | int found = 0; |
| 1034 | int incomplete; | 1088 | int incomplete; |
| 1035 | 1089 | ||
| 1090 | detect_info->checked |= CATEGORY_MASK_UTF_8; | ||
| 1036 | /* A coding system of this category is always ASCII compatible. */ | 1091 | /* A coding system of this category is always ASCII compatible. */ |
| 1037 | src += coding->head_ascii; | 1092 | src += coding->head_ascii; |
| 1038 | 1093 | ||
| @@ -1050,7 +1105,7 @@ detect_coding_utf_8 (coding, mask) | |||
| 1050 | break; | 1105 | break; |
| 1051 | if (UTF_8_2_OCTET_LEADING_P (c)) | 1106 | if (UTF_8_2_OCTET_LEADING_P (c)) |
| 1052 | { | 1107 | { |
| 1053 | found++; | 1108 | found = CATEGORY_MASK_UTF_8; |
| 1054 | continue; | 1109 | continue; |
| 1055 | } | 1110 | } |
| 1056 | ONE_MORE_BYTE (c2); | 1111 | ONE_MORE_BYTE (c2); |
| @@ -1058,7 +1113,7 @@ detect_coding_utf_8 (coding, mask) | |||
| 1058 | break; | 1113 | break; |
| 1059 | if (UTF_8_3_OCTET_LEADING_P (c)) | 1114 | if (UTF_8_3_OCTET_LEADING_P (c)) |
| 1060 | { | 1115 | { |
| 1061 | found++; | 1116 | found = CATEGORY_MASK_UTF_8; |
| 1062 | continue; | 1117 | continue; |
| 1063 | } | 1118 | } |
| 1064 | ONE_MORE_BYTE (c3); | 1119 | ONE_MORE_BYTE (c3); |
| @@ -1066,7 +1121,7 @@ detect_coding_utf_8 (coding, mask) | |||
| 1066 | break; | 1121 | break; |
| 1067 | if (UTF_8_4_OCTET_LEADING_P (c)) | 1122 | if (UTF_8_4_OCTET_LEADING_P (c)) |
| 1068 | { | 1123 | { |
| 1069 | found++; | 1124 | found = CATEGORY_MASK_UTF_8; |
| 1070 | continue; | 1125 | continue; |
| 1071 | } | 1126 | } |
| 1072 | ONE_MORE_BYTE (c4); | 1127 | ONE_MORE_BYTE (c4); |
| @@ -1074,21 +1129,22 @@ detect_coding_utf_8 (coding, mask) | |||
| 1074 | break; | 1129 | break; |
| 1075 | if (UTF_8_5_OCTET_LEADING_P (c)) | 1130 | if (UTF_8_5_OCTET_LEADING_P (c)) |
| 1076 | { | 1131 | { |
| 1077 | found++; | 1132 | found = CATEGORY_MASK_UTF_8; |
| 1078 | continue; | 1133 | continue; |
| 1079 | } | 1134 | } |
| 1080 | break; | 1135 | break; |
| 1081 | } | 1136 | } |
| 1082 | *mask &= ~CATEGORY_MASK_UTF_8; | 1137 | detect_info->rejected |= CATEGORY_MASK_UTF_8; |
| 1083 | return 0; | 1138 | return 0; |
| 1084 | 1139 | ||
| 1085 | no_more_source: | 1140 | no_more_source: |
| 1086 | if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK) | 1141 | if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK) |
| 1087 | { | 1142 | { |
| 1088 | *mask &= ~CATEGORY_MASK_UTF_8; | 1143 | detect_info->rejected |= CATEGORY_MASK_UTF_8; |
| 1089 | return 0; | 1144 | return 0; |
| 1090 | } | 1145 | } |
| 1091 | return found; | 1146 | detect_info->found |= found; |
| 1147 | return 1; | ||
| 1092 | } | 1148 | } |
| 1093 | 1149 | ||
| 1094 | 1150 | ||
| @@ -1269,10 +1325,8 @@ encode_coding_utf_8 (coding) | |||
| 1269 | 1325 | ||
| 1270 | 1326 | ||
| 1271 | /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". | 1327 | /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". |
| 1272 | Check if a text is encoded in UTF-16 Big Endian (endian == 1) or | 1328 | Check if a text is encoded in one of UTF-16 based coding systems. |
| 1273 | Little Endian (otherwise). If it is, return | 1329 | If it is, return 1, else return 0. */ |
| 1274 | CATEGORY_MASK_UTF_16_BE or CATEGORY_MASK_UTF_16_LE, | ||
| 1275 | else return 0. */ | ||
| 1276 | 1330 | ||
| 1277 | #define UTF_16_HIGH_SURROGATE_P(val) \ | 1331 | #define UTF_16_HIGH_SURROGATE_P(val) \ |
| 1278 | (((val) & 0xFC00) == 0xD800) | 1332 | (((val) & 0xFC00) == 0xD800) |
| @@ -1287,9 +1341,9 @@ encode_coding_utf_8 (coding) | |||
| 1287 | 1341 | ||
| 1288 | 1342 | ||
| 1289 | static int | 1343 | static int |
| 1290 | detect_coding_utf_16 (coding, mask) | 1344 | detect_coding_utf_16 (coding, detect_info) |
| 1291 | struct coding_system *coding; | 1345 | struct coding_system *coding; |
| 1292 | int *mask; | 1346 | struct coding_detection_info *detect_info; |
| 1293 | { | 1347 | { |
| 1294 | unsigned char *src = coding->source, *src_base = src; | 1348 | unsigned char *src = coding->source, *src_base = src; |
| 1295 | unsigned char *src_end = coding->source + coding->src_bytes; | 1349 | unsigned char *src_end = coding->source + coding->src_bytes; |
| @@ -1297,21 +1351,29 @@ detect_coding_utf_16 (coding, mask) | |||
| 1297 | int consumed_chars = 0; | 1351 | int consumed_chars = 0; |
| 1298 | int c1, c2; | 1352 | int c1, c2; |
| 1299 | 1353 | ||
| 1300 | *mask &= ~CATEGORY_MASK_UTF_16; | 1354 | detect_info->checked |= CATEGORY_MASK_UTF_16; |
| 1301 | 1355 | ||
| 1356 | if (coding->mode & CODING_MODE_LAST_BLOCK | ||
| 1357 | && (coding->src_bytes & 1)) | ||
| 1358 | { | ||
| 1359 | detect_info->rejected |= CATEGORY_MASK_UTF_16; | ||
| 1360 | return 0; | ||
| 1361 | } | ||
| 1302 | ONE_MORE_BYTE (c1); | 1362 | ONE_MORE_BYTE (c1); |
| 1303 | ONE_MORE_BYTE (c2); | 1363 | ONE_MORE_BYTE (c2); |
| 1304 | 1364 | ||
| 1305 | if ((c1 == 0xFF) && (c2 == 0xFE)) | 1365 | if ((c1 == 0xFF) && (c2 == 0xFE)) |
| 1306 | *mask |= CATEGORY_MASK_UTF_16_LE; | 1366 | { |
| 1367 | detect_info->found |= CATEGORY_MASK_UTF_16_LE; | ||
| 1368 | detect_info->rejected |= CATEGORY_MASK_UTF_16_BE; | ||
| 1369 | } | ||
| 1307 | else if ((c1 == 0xFE) && (c2 == 0xFF)) | 1370 | else if ((c1 == 0xFE) && (c2 == 0xFF)) |
| 1308 | *mask |= CATEGORY_MASK_UTF_16_BE; | 1371 | { |
| 1309 | else | 1372 | detect_info->found |= CATEGORY_MASK_UTF_16_BE; |
| 1310 | *mask |= CATEGORY_MASK_UTF_16_BE_NOSIG | CATEGORY_MASK_UTF_16_LE_NOSIG; | 1373 | detect_info->rejected |= CATEGORY_MASK_UTF_16_LE; |
| 1311 | return 1; | 1374 | } |
| 1312 | |||
| 1313 | no_more_source: | 1375 | no_more_source: |
| 1314 | return 0; | 1376 | return 1; |
| 1315 | } | 1377 | } |
| 1316 | 1378 | ||
| 1317 | static void | 1379 | static void |
| @@ -1559,10 +1621,10 @@ encode_coding_utf_16 (coding) | |||
| 1559 | char emacs_mule_bytes[256]; | 1621 | char emacs_mule_bytes[256]; |
| 1560 | 1622 | ||
| 1561 | int | 1623 | int |
| 1562 | emacs_mule_char (coding, src, nbytes, nchars) | 1624 | emacs_mule_char (coding, src, nbytes, nchars, id) |
| 1563 | struct coding_system *coding; | 1625 | struct coding_system *coding; |
| 1564 | unsigned char *src; | 1626 | unsigned char *src; |
| 1565 | int *nbytes, *nchars; | 1627 | int *nbytes, *nchars, *id; |
| 1566 | { | 1628 | { |
| 1567 | unsigned char *src_end = coding->source + coding->src_bytes; | 1629 | unsigned char *src_end = coding->source + coding->src_bytes; |
| 1568 | int multibytep = coding->src_multibyte; | 1630 | int multibytep = coding->src_multibyte; |
| @@ -1627,6 +1689,8 @@ emacs_mule_char (coding, src, nbytes, nchars) | |||
| 1627 | goto invalid_code; | 1689 | goto invalid_code; |
| 1628 | *nbytes = src - src_base; | 1690 | *nbytes = src - src_base; |
| 1629 | *nchars = consumed_chars; | 1691 | *nchars = consumed_chars; |
| 1692 | if (id) | ||
| 1693 | *id = charset->id; | ||
| 1630 | return c; | 1694 | return c; |
| 1631 | 1695 | ||
| 1632 | no_more_source: | 1696 | no_more_source: |
| @@ -1638,12 +1702,13 @@ emacs_mule_char (coding, src, nbytes, nchars) | |||
| 1638 | 1702 | ||
| 1639 | 1703 | ||
| 1640 | /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". | 1704 | /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". |
| 1641 | Check if a text is encoded in `emacs-mule'. */ | 1705 | Check if a text is encoded in `emacs-mule'. If it is, return 1, |
| 1706 | else return 0. */ | ||
| 1642 | 1707 | ||
| 1643 | static int | 1708 | static int |
| 1644 | detect_coding_emacs_mule (coding, mask) | 1709 | detect_coding_emacs_mule (coding, detect_info) |
| 1645 | struct coding_system *coding; | 1710 | struct coding_system *coding; |
| 1646 | int *mask; | 1711 | struct coding_detection_info *detect_info; |
| 1647 | { | 1712 | { |
| 1648 | unsigned char *src = coding->source, *src_base = src; | 1713 | unsigned char *src = coding->source, *src_base = src; |
| 1649 | unsigned char *src_end = coding->source + coding->src_bytes; | 1714 | unsigned char *src_end = coding->source + coding->src_bytes; |
| @@ -1653,6 +1718,7 @@ detect_coding_emacs_mule (coding, mask) | |||
| 1653 | int found = 0; | 1718 | int found = 0; |
| 1654 | int incomplete; | 1719 | int incomplete; |
| 1655 | 1720 | ||
| 1721 | detect_info->checked |= CATEGORY_MASK_EMACS_MULE; | ||
| 1656 | /* A coding system of this category is always ASCII compatible. */ | 1722 | /* A coding system of this category is always ASCII compatible. */ |
| 1657 | src += coding->head_ascii; | 1723 | src += coding->head_ascii; |
| 1658 | 1724 | ||
| @@ -1680,7 +1746,7 @@ detect_coding_emacs_mule (coding, mask) | |||
| 1680 | 1746 | ||
| 1681 | if (src - src_base <= 4) | 1747 | if (src - src_base <= 4) |
| 1682 | break; | 1748 | break; |
| 1683 | found = 1; | 1749 | found = CATEGORY_MASK_EMACS_MULE; |
| 1684 | if (c == 0x80) | 1750 | if (c == 0x80) |
| 1685 | goto repeat; | 1751 | goto repeat; |
| 1686 | } | 1752 | } |
| @@ -1702,19 +1768,20 @@ detect_coding_emacs_mule (coding, mask) | |||
| 1702 | while (c >= 0xA0); | 1768 | while (c >= 0xA0); |
| 1703 | if (src - src_base != emacs_mule_bytes[*src_base]) | 1769 | if (src - src_base != emacs_mule_bytes[*src_base]) |
| 1704 | break; | 1770 | break; |
| 1705 | found = 1; | 1771 | found = CATEGORY_MASK_EMACS_MULE; |
| 1706 | } | 1772 | } |
| 1707 | } | 1773 | } |
| 1708 | *mask &= ~CATEGORY_MASK_EMACS_MULE; | 1774 | detect_info->rejected |= CATEGORY_MASK_EMACS_MULE; |
| 1709 | return 0; | 1775 | return 0; |
| 1710 | 1776 | ||
| 1711 | no_more_source: | 1777 | no_more_source: |
| 1712 | if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK) | 1778 | if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK) |
| 1713 | { | 1779 | { |
| 1714 | *mask &= ~CATEGORY_MASK_EMACS_MULE; | 1780 | detect_info->rejected |= CATEGORY_MASK_EMACS_MULE; |
| 1715 | return 0; | 1781 | return 0; |
| 1716 | } | 1782 | } |
| 1717 | return found; | 1783 | detect_info->found |= found; |
| 1784 | return 1; | ||
| 1718 | } | 1785 | } |
| 1719 | 1786 | ||
| 1720 | 1787 | ||
| @@ -1735,7 +1802,7 @@ detect_coding_emacs_mule (coding, mask) | |||
| 1735 | \ | 1802 | \ |
| 1736 | if (src == src_end) \ | 1803 | if (src == src_end) \ |
| 1737 | break; \ | 1804 | break; \ |
| 1738 | c = emacs_mule_char (coding, src, &nbytes, &nchars); \ | 1805 | c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\ |
| 1739 | if (c < 0) \ | 1806 | if (c < 0) \ |
| 1740 | { \ | 1807 | { \ |
| 1741 | if (c == -2) \ | 1808 | if (c == -2) \ |
| @@ -1792,16 +1859,6 @@ detect_coding_emacs_mule (coding, mask) | |||
| 1792 | } while (0) | 1859 | } while (0) |
| 1793 | 1860 | ||
| 1794 | 1861 | ||
| 1795 | #define ADD_COMPOSITION_DATA(buf, method, nchars) \ | ||
| 1796 | do { \ | ||
| 1797 | *buf++ = -5; \ | ||
| 1798 | *buf++ = coding->produced_char + char_offset; \ | ||
| 1799 | *buf++ = CODING_ANNOTATE_COMPOSITION_MASK; \ | ||
| 1800 | *buf++ = method; \ | ||
| 1801 | *buf++ = nchars; \ | ||
| 1802 | } while (0) | ||
| 1803 | |||
| 1804 | |||
| 1805 | #define DECODE_EMACS_MULE_21_COMPOSITION(c) \ | 1862 | #define DECODE_EMACS_MULE_21_COMPOSITION(c) \ |
| 1806 | do { \ | 1863 | do { \ |
| 1807 | /* Emacs 21 style format. The first three bytes at SRC are \ | 1864 | /* Emacs 21 style format. The first three bytes at SRC are \ |
| @@ -1810,6 +1867,7 @@ detect_coding_emacs_mule (coding, mask) | |||
| 1810 | number of characters composed by this composition. */ \ | 1867 | number of characters composed by this composition. */ \ |
| 1811 | enum composition_method method = c - 0xF2; \ | 1868 | enum composition_method method = c - 0xF2; \ |
| 1812 | int *charbuf_base = charbuf; \ | 1869 | int *charbuf_base = charbuf; \ |
| 1870 | int from, to; \ | ||
| 1813 | int consumed_chars_limit; \ | 1871 | int consumed_chars_limit; \ |
| 1814 | int nbytes, nchars; \ | 1872 | int nbytes, nchars; \ |
| 1815 | \ | 1873 | \ |
| @@ -1819,7 +1877,9 @@ detect_coding_emacs_mule (coding, mask) | |||
| 1819 | goto invalid_code; \ | 1877 | goto invalid_code; \ |
| 1820 | ONE_MORE_BYTE (c); \ | 1878 | ONE_MORE_BYTE (c); \ |
| 1821 | nchars = c - 0xA0; \ | 1879 | nchars = c - 0xA0; \ |
| 1822 | ADD_COMPOSITION_DATA (charbuf, method, nchars); \ | 1880 | from = coding->produced + char_offset; \ |
| 1881 | to = from + nchars; \ | ||
| 1882 | ADD_COMPOSITION_DATA (charbuf, from, to, method); \ | ||
| 1823 | consumed_chars_limit = consumed_chars_base + nbytes; \ | 1883 | consumed_chars_limit = consumed_chars_base + nbytes; \ |
| 1824 | if (method != COMPOSITION_RELATIVE) \ | 1884 | if (method != COMPOSITION_RELATIVE) \ |
| 1825 | { \ | 1885 | { \ |
| @@ -1843,9 +1903,11 @@ detect_coding_emacs_mule (coding, mask) | |||
| 1843 | do { \ | 1903 | do { \ |
| 1844 | /* Emacs 20 style format for relative composition. */ \ | 1904 | /* Emacs 20 style format for relative composition. */ \ |
| 1845 | /* Store multibyte form of characters to be composed. */ \ | 1905 | /* Store multibyte form of characters to be composed. */ \ |
| 1906 | enum composition_method method = COMPOSITION_RELATIVE; \ | ||
| 1846 | int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \ | 1907 | int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \ |
| 1847 | int *buf = components; \ | 1908 | int *buf = components; \ |
| 1848 | int i, j; \ | 1909 | int i, j; \ |
| 1910 | int from, to; \ | ||
| 1849 | \ | 1911 | \ |
| 1850 | src = src_base; \ | 1912 | src = src_base; \ |
| 1851 | ONE_MORE_BYTE (c); /* skip 0x80 */ \ | 1913 | ONE_MORE_BYTE (c); /* skip 0x80 */ \ |
| @@ -1853,7 +1915,9 @@ detect_coding_emacs_mule (coding, mask) | |||
| 1853 | DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \ | 1915 | DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \ |
| 1854 | if (i < 2) \ | 1916 | if (i < 2) \ |
| 1855 | goto invalid_code; \ | 1917 | goto invalid_code; \ |
| 1856 | ADD_COMPOSITION_DATA (charbuf, COMPOSITION_RELATIVE, i); \ | 1918 | from = coding->produced_char + char_offset; \ |
| 1919 | to = from + i; \ | ||
| 1920 | ADD_COMPOSITION_DATA (charbuf, from, to, method); \ | ||
| 1857 | for (j = 0; j < i; j++) \ | 1921 | for (j = 0; j < i; j++) \ |
| 1858 | *charbuf++ = components[j]; \ | 1922 | *charbuf++ = components[j]; \ |
| 1859 | } while (0) | 1923 | } while (0) |
| @@ -1863,9 +1927,11 @@ detect_coding_emacs_mule (coding, mask) | |||
| 1863 | do { \ | 1927 | do { \ |
| 1864 | /* Emacs 20 style format for rule-base composition. */ \ | 1928 | /* Emacs 20 style format for rule-base composition. */ \ |
| 1865 | /* Store multibyte form of characters to be composed. */ \ | 1929 | /* Store multibyte form of characters to be composed. */ \ |
| 1930 | enum composition_method method = COMPOSITION_WITH_RULE; \ | ||
| 1866 | int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \ | 1931 | int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \ |
| 1867 | int *buf = components; \ | 1932 | int *buf = components; \ |
| 1868 | int i, j; \ | 1933 | int i, j; \ |
| 1934 | int from, to; \ | ||
| 1869 | \ | 1935 | \ |
| 1870 | DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \ | 1936 | DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \ |
| 1871 | for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \ | 1937 | for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \ |
| @@ -1877,7 +1943,9 @@ detect_coding_emacs_mule (coding, mask) | |||
| 1877 | goto invalid_code; \ | 1943 | goto invalid_code; \ |
| 1878 | if (charbuf + i + (i / 2) + 1 < charbuf_end) \ | 1944 | if (charbuf + i + (i / 2) + 1 < charbuf_end) \ |
| 1879 | goto no_more_source; \ | 1945 | goto no_more_source; \ |
| 1880 | ADD_COMPOSITION_DATA (buf, COMPOSITION_WITH_RULE, i); \ | 1946 | from = coding->produced_char + char_offset; \ |
| 1947 | to = from + i; \ | ||
| 1948 | ADD_COMPOSITION_DATA (buf, from, to, method); \ | ||
| 1881 | for (j = 0; j < i; j++) \ | 1949 | for (j = 0; j < i; j++) \ |
| 1882 | *charbuf++ = components[j]; \ | 1950 | *charbuf++ = components[j]; \ |
| 1883 | for (j = 0; j < i; j += 2) \ | 1951 | for (j = 0; j < i; j += 2) \ |
| @@ -1893,11 +1961,13 @@ decode_coding_emacs_mule (coding) | |||
| 1893 | unsigned char *src_end = coding->source + coding->src_bytes; | 1961 | unsigned char *src_end = coding->source + coding->src_bytes; |
| 1894 | unsigned char *src_base; | 1962 | unsigned char *src_base; |
| 1895 | int *charbuf = coding->charbuf; | 1963 | int *charbuf = coding->charbuf; |
| 1896 | int *charbuf_end = charbuf + coding->charbuf_size; | 1964 | int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH; |
| 1897 | int consumed_chars = 0, consumed_chars_base; | 1965 | int consumed_chars = 0, consumed_chars_base; |
| 1898 | int char_offset = 0; | ||
| 1899 | int multibytep = coding->src_multibyte; | 1966 | int multibytep = coding->src_multibyte; |
| 1900 | Lisp_Object attrs, eol_type, charset_list; | 1967 | Lisp_Object attrs, eol_type, charset_list; |
| 1968 | int char_offset = coding->produced_char; | ||
| 1969 | int last_offset = char_offset; | ||
| 1970 | int last_id = charset_ascii; | ||
| 1901 | 1971 | ||
| 1902 | CODING_GET_INFO (coding, attrs, eol_type, charset_list); | 1972 | CODING_GET_INFO (coding, attrs, eol_type, charset_list); |
| 1903 | 1973 | ||
| @@ -1935,8 +2005,6 @@ decode_coding_emacs_mule (coding) | |||
| 1935 | } | 2005 | } |
| 1936 | else if (c == 0x80) | 2006 | else if (c == 0x80) |
| 1937 | { | 2007 | { |
| 1938 | if (charbuf + 5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 > charbuf_end) | ||
| 1939 | break; | ||
| 1940 | ONE_MORE_BYTE (c); | 2008 | ONE_MORE_BYTE (c); |
| 1941 | if (c - 0xF2 >= COMPOSITION_RELATIVE | 2009 | if (c - 0xF2 >= COMPOSITION_RELATIVE |
| 1942 | && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS) | 2010 | && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS) |
| @@ -1947,20 +2015,28 @@ decode_coding_emacs_mule (coding) | |||
| 1947 | DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c); | 2015 | DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c); |
| 1948 | else | 2016 | else |
| 1949 | goto invalid_code; | 2017 | goto invalid_code; |
| 1950 | coding->annotated = 1; | ||
| 1951 | } | 2018 | } |
| 1952 | else if (c < 0xA0 && emacs_mule_bytes[c] > 1) | 2019 | else if (c < 0xA0 && emacs_mule_bytes[c] > 1) |
| 1953 | { | 2020 | { |
| 1954 | int nbytes, nchars; | 2021 | int nbytes, nchars; |
| 2022 | int id; | ||
| 2023 | |||
| 1955 | src = src_base; | 2024 | src = src_base; |
| 1956 | consumed_chars = consumed_chars_base; | 2025 | consumed_chars = consumed_chars_base; |
| 1957 | c = emacs_mule_char (coding, src, &nbytes, &nchars); | 2026 | c = emacs_mule_char (coding, src, &nbytes, &nchars, &id); |
| 1958 | if (c < 0) | 2027 | if (c < 0) |
| 1959 | { | 2028 | { |
| 1960 | if (c == -2) | 2029 | if (c == -2) |
| 1961 | break; | 2030 | break; |
| 1962 | goto invalid_code; | 2031 | goto invalid_code; |
| 1963 | } | 2032 | } |
| 2033 | if (last_id != id) | ||
| 2034 | { | ||
| 2035 | if (last_id != charset_ascii) | ||
| 2036 | ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id); | ||
| 2037 | last_id = id; | ||
| 2038 | last_offset = char_offset; | ||
| 2039 | } | ||
| 1964 | *charbuf++ = c; | 2040 | *charbuf++ = c; |
| 1965 | src += nbytes; | 2041 | src += nbytes; |
| 1966 | consumed_chars += nchars; | 2042 | consumed_chars += nchars; |
| @@ -1973,10 +2049,13 @@ decode_coding_emacs_mule (coding) | |||
| 1973 | consumed_chars = consumed_chars_base; | 2049 | consumed_chars = consumed_chars_base; |
| 1974 | ONE_MORE_BYTE (c); | 2050 | ONE_MORE_BYTE (c); |
| 1975 | *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c); | 2051 | *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c); |
| 2052 | char_offset++; | ||
| 1976 | coding->errors++; | 2053 | coding->errors++; |
| 1977 | } | 2054 | } |
| 1978 | 2055 | ||
| 1979 | no_more_source: | 2056 | no_more_source: |
| 2057 | if (last_id != charset_ascii) | ||
| 2058 | ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id); | ||
| 1980 | coding->consumed_char += consumed_chars_base; | 2059 | coding->consumed_char += consumed_chars_base; |
| 1981 | coding->consumed = src_base - coding->source; | 2060 | coding->consumed = src_base - coding->source; |
| 1982 | coding->charbuf_used = charbuf - coding->charbuf; | 2061 | coding->charbuf_used = charbuf - coding->charbuf; |
| @@ -2011,6 +2090,7 @@ encode_coding_emacs_mule (coding) | |||
| 2011 | int produced_chars = 0; | 2090 | int produced_chars = 0; |
| 2012 | Lisp_Object attrs, eol_type, charset_list; | 2091 | Lisp_Object attrs, eol_type, charset_list; |
| 2013 | int c; | 2092 | int c; |
| 2093 | int preferred_charset_id = -1; | ||
| 2014 | 2094 | ||
| 2015 | CODING_GET_INFO (coding, attrs, eol_type, charset_list); | 2095 | CODING_GET_INFO (coding, attrs, eol_type, charset_list); |
| 2016 | 2096 | ||
| @@ -2018,6 +2098,29 @@ encode_coding_emacs_mule (coding) | |||
| 2018 | { | 2098 | { |
| 2019 | ASSURE_DESTINATION (safe_room); | 2099 | ASSURE_DESTINATION (safe_room); |
| 2020 | c = *charbuf++; | 2100 | c = *charbuf++; |
| 2101 | |||
| 2102 | if (c < 0) | ||
| 2103 | { | ||
| 2104 | /* Handle an annotation. */ | ||
| 2105 | switch (*charbuf) | ||
| 2106 | { | ||
| 2107 | case CODING_ANNOTATE_COMPOSITION_MASK: | ||
| 2108 | /* Not yet implemented. */ | ||
| 2109 | break; | ||
| 2110 | case CODING_ANNOTATE_CHARSET_MASK: | ||
| 2111 | preferred_charset_id = charbuf[3]; | ||
| 2112 | if (preferred_charset_id >= 0 | ||
| 2113 | && NILP (Fmemq (make_number (preferred_charset_id), | ||
| 2114 | charset_list))) | ||
| 2115 | preferred_charset_id = -1; | ||
| 2116 | break; | ||
| 2117 | default: | ||
| 2118 | abort (); | ||
| 2119 | } | ||
| 2120 | charbuf += -c - 1; | ||
| 2121 | continue; | ||
| 2122 | } | ||
| 2123 | |||
| 2021 | if (ASCII_CHAR_P (c)) | 2124 | if (ASCII_CHAR_P (c)) |
| 2022 | EMIT_ONE_ASCII_BYTE (c); | 2125 | EMIT_ONE_ASCII_BYTE (c); |
| 2023 | else if (CHAR_BYTE8_P (c)) | 2126 | else if (CHAR_BYTE8_P (c)) |
| @@ -2033,7 +2136,14 @@ encode_coding_emacs_mule (coding) | |||
| 2033 | int emacs_mule_id; | 2136 | int emacs_mule_id; |
| 2034 | unsigned char leading_codes[2]; | 2137 | unsigned char leading_codes[2]; |
| 2035 | 2138 | ||
| 2036 | charset = char_charset (c, charset_list, &code); | 2139 | if (preferred_charset_id >= 0) |
| 2140 | { | ||
| 2141 | charset = CHARSET_FROM_ID (preferred_charset_id); | ||
| 2142 | if (! CHAR_CHARSET_P (c, charset)) | ||
| 2143 | charset = char_charset (c, charset_list, NULL); | ||
| 2144 | } | ||
| 2145 | else | ||
| 2146 | charset = char_charset (c, charset_list, &code); | ||
| 2037 | if (! charset) | 2147 | if (! charset) |
| 2038 | { | 2148 | { |
| 2039 | c = coding->default_char; | 2149 | c = coding->default_char; |
| @@ -2319,32 +2429,26 @@ setup_iso_safe_charsets (attrs) | |||
| 2319 | 2429 | ||
| 2320 | 2430 | ||
| 2321 | /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". | 2431 | /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". |
| 2322 | Check if a text is encoded in ISO2022. If it is, returns an | 2432 | Check if a text is encoded in one of ISO-2022 based coding systems. |
| 2323 | integer in which appropriate flag bits any of: | 2433 | If it is, return 1, else return 0. */ |
| 2324 | CATEGORY_MASK_ISO_7 | ||
| 2325 | CATEGORY_MASK_ISO_7_TIGHT | ||
| 2326 | CATEGORY_MASK_ISO_8_1 | ||
| 2327 | CATEGORY_MASK_ISO_8_2 | ||
| 2328 | CATEGORY_MASK_ISO_7_ELSE | ||
| 2329 | CATEGORY_MASK_ISO_8_ELSE | ||
| 2330 | are set. If a code which should never appear in ISO2022 is found, | ||
| 2331 | returns 0. */ | ||
| 2332 | 2434 | ||
| 2333 | static int | 2435 | static int |
| 2334 | detect_coding_iso_2022 (coding, mask) | 2436 | detect_coding_iso_2022 (coding, detect_info) |
| 2335 | struct coding_system *coding; | 2437 | struct coding_system *coding; |
| 2336 | int *mask; | 2438 | struct coding_detection_info *detect_info; |
| 2337 | { | 2439 | { |
| 2338 | unsigned char *src = coding->source, *src_base = src; | 2440 | unsigned char *src = coding->source, *src_base = src; |
| 2339 | unsigned char *src_end = coding->source + coding->src_bytes; | 2441 | unsigned char *src_end = coding->source + coding->src_bytes; |
| 2340 | int multibytep = coding->src_multibyte; | 2442 | int multibytep = coding->src_multibyte; |
| 2341 | int mask_iso = CATEGORY_MASK_ISO; | 2443 | int single_shifting = 0; |
| 2342 | int mask_found = 0, mask_8bit_found = 0; | ||
| 2343 | int reg[4], shift_out = 0, single_shifting = 0; | ||
| 2344 | int id; | 2444 | int id; |
| 2345 | int c, c1; | 2445 | int c, c1; |
| 2346 | int consumed_chars = 0; | 2446 | int consumed_chars = 0; |
| 2347 | int i; | 2447 | int i; |
| 2448 | int rejected = 0; | ||
| 2449 | int found = 0; | ||
| 2450 | |||
| 2451 | detect_info->checked |= CATEGORY_MASK_ISO; | ||
| 2348 | 2452 | ||
| 2349 | for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++) | 2453 | for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++) |
| 2350 | { | 2454 | { |
| @@ -2363,8 +2467,7 @@ detect_coding_iso_2022 (coding, mask) | |||
| 2363 | /* A coding system of this category is always ASCII compatible. */ | 2467 | /* A coding system of this category is always ASCII compatible. */ |
| 2364 | src += coding->head_ascii; | 2468 | src += coding->head_ascii; |
| 2365 | 2469 | ||
| 2366 | reg[0] = charset_ascii, reg[1] = reg[2] = reg[3] = -1; | 2470 | while (rejected != CATEGORY_MASK_ISO) |
| 2367 | while (mask_iso && src < src_end) | ||
| 2368 | { | 2471 | { |
| 2369 | ONE_MORE_BYTE (c); | 2472 | ONE_MORE_BYTE (c); |
| 2370 | switch (c) | 2473 | switch (c) |
| @@ -2382,7 +2485,6 @@ detect_coding_iso_2022 (coding, mask) | |||
| 2382 | || (id = iso_charset_table[0][c >= ','][c1]) < 0) | 2485 | || (id = iso_charset_table[0][c >= ','][c1]) < 0) |
| 2383 | /* Invalid designation sequence. Just ignore. */ | 2486 | /* Invalid designation sequence. Just ignore. */ |
| 2384 | break; | 2487 | break; |
| 2385 | reg[(c - '(') % 4] = id; | ||
| 2386 | } | 2488 | } |
| 2387 | else if (c == '$') | 2489 | else if (c == '$') |
| 2388 | { | 2490 | { |
| @@ -2390,7 +2492,7 @@ detect_coding_iso_2022 (coding, mask) | |||
| 2390 | ONE_MORE_BYTE (c); | 2492 | ONE_MORE_BYTE (c); |
| 2391 | if (c >= '@' && c <= 'B') | 2493 | if (c >= '@' && c <= 'B') |
| 2392 | /* Designation for JISX0208.1978, GB2312, or JISX0208. */ | 2494 | /* Designation for JISX0208.1978, GB2312, or JISX0208. */ |
| 2393 | reg[0] = id = iso_charset_table[1][0][c]; | 2495 | id = iso_charset_table[1][0][c]; |
| 2394 | else if (c >= '(' && c <= '/') | 2496 | else if (c >= '(' && c <= '/') |
| 2395 | { | 2497 | { |
| 2396 | ONE_MORE_BYTE (c1); | 2498 | ONE_MORE_BYTE (c1); |
| @@ -2398,116 +2500,86 @@ detect_coding_iso_2022 (coding, mask) | |||
| 2398 | || (id = iso_charset_table[1][c >= ','][c1]) < 0) | 2500 | || (id = iso_charset_table[1][c >= ','][c1]) < 0) |
| 2399 | /* Invalid designation sequence. Just ignore. */ | 2501 | /* Invalid designation sequence. Just ignore. */ |
| 2400 | break; | 2502 | break; |
| 2401 | reg[(c - '(') % 4] = id; | ||
| 2402 | } | 2503 | } |
| 2403 | else | 2504 | else |
| 2404 | /* Invalid designation sequence. Just ignore. */ | 2505 | /* Invalid designation sequence. Just ignore it. */ |
| 2405 | break; | 2506 | break; |
| 2406 | } | 2507 | } |
| 2407 | else if (c == 'N' || c == 'O') | 2508 | else if (c == 'N' || c == 'O') |
| 2408 | { | 2509 | { |
| 2409 | /* ESC <Fe> for SS2 or SS3. */ | 2510 | /* ESC <Fe> for SS2 or SS3. */ |
| 2410 | mask_iso &= CATEGORY_MASK_ISO_7_ELSE; | 2511 | single_shifting = 1; |
| 2512 | rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT; | ||
| 2411 | break; | 2513 | break; |
| 2412 | } | 2514 | } |
| 2413 | else if (c >= '0' && c <= '4') | 2515 | else if (c >= '0' && c <= '4') |
| 2414 | { | 2516 | { |
| 2415 | /* ESC <Fp> for start/end composition. */ | 2517 | /* ESC <Fp> for start/end composition. */ |
| 2416 | mask_found |= CATEGORY_MASK_ISO; | 2518 | found |= CATEGORY_MASK_ISO; |
| 2417 | break; | 2519 | break; |
| 2418 | } | 2520 | } |
| 2419 | else | 2521 | else |
| 2420 | { | 2522 | { |
| 2421 | /* Invalid escape sequence. */ | 2523 | /* Invalid escape sequence. Just ignore it. */ |
| 2422 | mask_iso &= ~CATEGORY_MASK_ISO_ESCAPE; | ||
| 2423 | break; | 2524 | break; |
| 2424 | } | 2525 | } |
| 2425 | 2526 | ||
| 2426 | /* We found a valid designation sequence for CHARSET. */ | 2527 | /* We found a valid designation sequence for CHARSET. */ |
| 2427 | mask_iso &= ~CATEGORY_MASK_ISO_8BIT; | 2528 | rejected |= CATEGORY_MASK_ISO_8BIT; |
| 2428 | if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7], | 2529 | if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7], |
| 2429 | id)) | 2530 | id)) |
| 2430 | mask_found |= CATEGORY_MASK_ISO_7; | 2531 | found |= CATEGORY_MASK_ISO_7; |
| 2431 | else | 2532 | else |
| 2432 | mask_iso &= ~CATEGORY_MASK_ISO_7; | 2533 | rejected |= CATEGORY_MASK_ISO_7; |
| 2433 | if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight], | 2534 | if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight], |
| 2434 | id)) | 2535 | id)) |
| 2435 | mask_found |= CATEGORY_MASK_ISO_7_TIGHT; | 2536 | found |= CATEGORY_MASK_ISO_7_TIGHT; |
| 2436 | else | 2537 | else |
| 2437 | mask_iso &= ~CATEGORY_MASK_ISO_7_TIGHT; | 2538 | rejected |= CATEGORY_MASK_ISO_7_TIGHT; |
| 2438 | if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else], | 2539 | if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else], |
| 2439 | id)) | 2540 | id)) |
| 2440 | mask_found |= CATEGORY_MASK_ISO_7_ELSE; | 2541 | found |= CATEGORY_MASK_ISO_7_ELSE; |
| 2441 | else | 2542 | else |
| 2442 | mask_iso &= ~CATEGORY_MASK_ISO_7_ELSE; | 2543 | rejected |= CATEGORY_MASK_ISO_7_ELSE; |
| 2443 | if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else], | 2544 | if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else], |
| 2444 | id)) | 2545 | id)) |
| 2445 | mask_found |= CATEGORY_MASK_ISO_8_ELSE; | 2546 | found |= CATEGORY_MASK_ISO_8_ELSE; |
| 2446 | else | 2547 | else |
| 2447 | mask_iso &= ~CATEGORY_MASK_ISO_8_ELSE; | 2548 | rejected |= CATEGORY_MASK_ISO_8_ELSE; |
| 2448 | break; | 2549 | break; |
| 2449 | 2550 | ||
| 2450 | case ISO_CODE_SO: | 2551 | case ISO_CODE_SO: |
| 2451 | if (inhibit_iso_escape_detection) | ||
| 2452 | break; | ||
| 2453 | single_shifting = 0; | ||
| 2454 | if (shift_out == 0 | ||
| 2455 | && (reg[1] >= 0 | ||
| 2456 | || SHIFT_OUT_OK (coding_category_iso_7_else) | ||
| 2457 | || SHIFT_OUT_OK (coding_category_iso_8_else))) | ||
| 2458 | { | ||
| 2459 | /* Locking shift out. */ | ||
| 2460 | mask_iso &= ~CATEGORY_MASK_ISO_7BIT; | ||
| 2461 | mask_found |= CATEGORY_MASK_ISO_ELSE; | ||
| 2462 | } | ||
| 2463 | break; | ||
| 2464 | |||
| 2465 | case ISO_CODE_SI: | 2552 | case ISO_CODE_SI: |
| 2553 | /* Locking shift out/in. */ | ||
| 2466 | if (inhibit_iso_escape_detection) | 2554 | if (inhibit_iso_escape_detection) |
| 2467 | break; | 2555 | break; |
| 2468 | single_shifting = 0; | 2556 | single_shifting = 0; |
| 2469 | if (shift_out == 1) | 2557 | rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT; |
| 2470 | { | 2558 | found |= CATEGORY_MASK_ISO_ELSE; |
| 2471 | /* Locking shift in. */ | ||
| 2472 | mask_iso &= ~CATEGORY_MASK_ISO_7BIT; | ||
| 2473 | mask_found |= CATEGORY_MASK_ISO_ELSE; | ||
| 2474 | } | ||
| 2475 | break; | 2559 | break; |
| 2476 | 2560 | ||
| 2477 | case ISO_CODE_CSI: | 2561 | case ISO_CODE_CSI: |
| 2562 | /* Control sequence introducer. */ | ||
| 2478 | single_shifting = 0; | 2563 | single_shifting = 0; |
| 2564 | rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE; | ||
| 2565 | found |= CATEGORY_MASK_ISO_8_ELSE; | ||
| 2566 | goto check_extra_latin; | ||
| 2567 | |||
| 2568 | |||
| 2479 | case ISO_CODE_SS2: | 2569 | case ISO_CODE_SS2: |
| 2480 | case ISO_CODE_SS3: | 2570 | case ISO_CODE_SS3: |
| 2481 | { | 2571 | /* Single shift. */ |
| 2482 | int newmask = CATEGORY_MASK_ISO_8_ELSE; | 2572 | if (inhibit_iso_escape_detection) |
| 2483 | 2573 | break; | |
| 2484 | mask_8bit_found = 1; | 2574 | single_shifting = 1; |
| 2485 | if (inhibit_iso_escape_detection) | 2575 | rejected |= CATEGORY_MASK_ISO_7BIT; |
| 2486 | break; | 2576 | if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1]) |
| 2487 | if (c != ISO_CODE_CSI) | 2577 | & CODING_ISO_FLAG_SINGLE_SHIFT) |
| 2488 | { | 2578 | found |= CATEGORY_MASK_ISO_8_1; |
| 2489 | if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1]) | 2579 | if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2]) |
| 2490 | & CODING_ISO_FLAG_SINGLE_SHIFT) | 2580 | & CODING_ISO_FLAG_SINGLE_SHIFT) |
| 2491 | newmask |= CATEGORY_MASK_ISO_8_1; | 2581 | found |= CATEGORY_MASK_ISO_8_2; |
| 2492 | if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2]) | 2582 | goto check_extra_latin; |
| 2493 | & CODING_ISO_FLAG_SINGLE_SHIFT) | ||
| 2494 | newmask |= CATEGORY_MASK_ISO_8_2; | ||
| 2495 | single_shifting = 1; | ||
| 2496 | } | ||
| 2497 | if (VECTORP (Vlatin_extra_code_table) | ||
| 2498 | && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) | ||
| 2499 | { | ||
| 2500 | if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1]) | ||
| 2501 | & CODING_ISO_FLAG_LATIN_EXTRA) | ||
| 2502 | newmask |= CATEGORY_MASK_ISO_8_1; | ||
| 2503 | if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2]) | ||
| 2504 | & CODING_ISO_FLAG_LATIN_EXTRA) | ||
| 2505 | newmask |= CATEGORY_MASK_ISO_8_2; | ||
| 2506 | } | ||
| 2507 | mask_iso &= newmask; | ||
| 2508 | mask_found |= newmask; | ||
| 2509 | } | ||
| 2510 | break; | ||
| 2511 | 2583 | ||
| 2512 | default: | 2584 | default: |
| 2513 | if (c < 0x80) | 2585 | if (c < 0x80) |
| @@ -2515,39 +2587,16 @@ detect_coding_iso_2022 (coding, mask) | |||
| 2515 | single_shifting = 0; | 2587 | single_shifting = 0; |
| 2516 | break; | 2588 | break; |
| 2517 | } | 2589 | } |
| 2518 | else if (c < 0xA0) | 2590 | if (c >= 0xA0) |
| 2519 | { | 2591 | { |
| 2520 | single_shifting = 0; | 2592 | rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE; |
| 2521 | mask_8bit_found = 1; | 2593 | found |= CATEGORY_MASK_ISO_8_1; |
| 2522 | if (VECTORP (Vlatin_extra_code_table) | ||
| 2523 | && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) | ||
| 2524 | { | ||
| 2525 | int newmask = 0; | ||
| 2526 | |||
| 2527 | if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1]) | ||
| 2528 | & CODING_ISO_FLAG_LATIN_EXTRA) | ||
| 2529 | newmask |= CATEGORY_MASK_ISO_8_1; | ||
| 2530 | if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2]) | ||
| 2531 | & CODING_ISO_FLAG_LATIN_EXTRA) | ||
| 2532 | newmask |= CATEGORY_MASK_ISO_8_2; | ||
| 2533 | mask_iso &= newmask; | ||
| 2534 | mask_found |= newmask; | ||
| 2535 | } | ||
| 2536 | else | ||
| 2537 | return 0; | ||
| 2538 | } | ||
| 2539 | else | ||
| 2540 | { | ||
| 2541 | mask_iso &= ~(CATEGORY_MASK_ISO_7BIT | ||
| 2542 | | CATEGORY_MASK_ISO_7_ELSE); | ||
| 2543 | mask_found |= CATEGORY_MASK_ISO_8_1; | ||
| 2544 | mask_8bit_found = 1; | ||
| 2545 | /* Check the length of succeeding codes of the range | 2594 | /* Check the length of succeeding codes of the range |
| 2546 | 0xA0..0FF. If the byte length is odd, we exclude | 2595 | 0xA0..0xFF. If the byte length is even, we include |
| 2547 | CATEGORY_MASK_ISO_8_2. We can check this only | 2596 | CATEGORY_MASK_ISO_8_2 in `found'. We can check this |
| 2548 | when we are not single shifting. */ | 2597 | only when we are not single shifting. */ |
| 2549 | if (!single_shifting | 2598 | if (! single_shifting |
| 2550 | && mask_iso & CATEGORY_MASK_ISO_8_2) | 2599 | && ! (rejected & CATEGORY_MASK_ISO_8_2)) |
| 2551 | { | 2600 | { |
| 2552 | int i = 1; | 2601 | int i = 1; |
| 2553 | while (src < src_end) | 2602 | while (src < src_end) |
| @@ -2559,26 +2608,38 @@ detect_coding_iso_2022 (coding, mask) | |||
| 2559 | } | 2608 | } |
| 2560 | 2609 | ||
| 2561 | if (i & 1 && src < src_end) | 2610 | if (i & 1 && src < src_end) |
| 2562 | mask_iso &= ~CATEGORY_MASK_ISO_8_2; | 2611 | rejected |= CATEGORY_MASK_ISO_8_2; |
| 2563 | else | 2612 | else |
| 2564 | mask_found |= CATEGORY_MASK_ISO_8_2; | 2613 | found |= CATEGORY_MASK_ISO_8_2; |
| 2565 | } | 2614 | } |
| 2615 | break; | ||
| 2566 | } | 2616 | } |
| 2567 | break; | 2617 | check_extra_latin: |
| 2618 | single_shifting = 0; | ||
| 2619 | if (! VECTORP (Vlatin_extra_code_table) | ||
| 2620 | || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) | ||
| 2621 | { | ||
| 2622 | rejected = CATEGORY_MASK_ISO; | ||
| 2623 | break; | ||
| 2624 | } | ||
| 2625 | if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1]) | ||
| 2626 | & CODING_ISO_FLAG_LATIN_EXTRA) | ||
| 2627 | found |= CATEGORY_MASK_ISO_8_1; | ||
| 2628 | else | ||
| 2629 | rejected |= CATEGORY_MASK_ISO_8_1; | ||
| 2630 | if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2]) | ||
| 2631 | & CODING_ISO_FLAG_LATIN_EXTRA) | ||
| 2632 | found |= CATEGORY_MASK_ISO_8_2; | ||
| 2633 | else | ||
| 2634 | rejected |= CATEGORY_MASK_ISO_8_2; | ||
| 2568 | } | 2635 | } |
| 2569 | } | 2636 | } |
| 2637 | detect_info->rejected |= CATEGORY_MASK_ISO; | ||
| 2638 | return 0; | ||
| 2639 | |||
| 2570 | no_more_source: | 2640 | no_more_source: |
| 2571 | if (!mask_iso) | 2641 | detect_info->rejected |= rejected; |
| 2572 | { | 2642 | detect_info->found |= (found & ~rejected); |
| 2573 | *mask &= ~CATEGORY_MASK_ISO; | ||
| 2574 | return 0; | ||
| 2575 | } | ||
| 2576 | if (!mask_found) | ||
| 2577 | return 0; | ||
| 2578 | *mask &= ~CATEGORY_MASK_ISO; | ||
| 2579 | *mask |= mask_iso & mask_found; | ||
| 2580 | if (! mask_8bit_found) | ||
| 2581 | *mask &= ~(CATEGORY_MASK_ISO_8BIT | CATEGORY_MASK_ISO_8_ELSE); | ||
| 2582 | return 1; | 2643 | return 1; |
| 2583 | } | 2644 | } |
| 2584 | 2645 | ||
| @@ -2694,8 +2755,10 @@ detect_coding_iso_2022 (coding, mask) | |||
| 2694 | : (component_idx + 1) / 2); \ | 2755 | : (component_idx + 1) / 2); \ |
| 2695 | int i; \ | 2756 | int i; \ |
| 2696 | int *saved_charbuf = charbuf; \ | 2757 | int *saved_charbuf = charbuf; \ |
| 2758 | int from = coding->produced_char + char_offset; \ | ||
| 2759 | int to = from + nchars; \ | ||
| 2697 | \ | 2760 | \ |
| 2698 | ADD_COMPOSITION_DATA (charbuf, method, nchars); \ | 2761 | ADD_COMPOSITION_DATA (charbuf, from, to, method); \ |
| 2699 | if (method != COMPOSITION_RELATIVE) \ | 2762 | if (method != COMPOSITION_RELATIVE) \ |
| 2700 | { \ | 2763 | { \ |
| 2701 | if (component_len == 0) \ | 2764 | if (component_len == 0) \ |
| @@ -2752,9 +2815,9 @@ decode_coding_iso_2022 (coding) | |||
| 2752 | unsigned char *src_end = coding->source + coding->src_bytes; | 2815 | unsigned char *src_end = coding->source + coding->src_bytes; |
| 2753 | unsigned char *src_base; | 2816 | unsigned char *src_base; |
| 2754 | int *charbuf = coding->charbuf; | 2817 | int *charbuf = coding->charbuf; |
| 2755 | int *charbuf_end = charbuf + coding->charbuf_size - 4; | 2818 | int *charbuf_end |
| 2819 | = charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH; | ||
| 2756 | int consumed_chars = 0, consumed_chars_base; | 2820 | int consumed_chars = 0, consumed_chars_base; |
| 2757 | int char_offset = 0; | ||
| 2758 | int multibytep = coding->src_multibyte; | 2821 | int multibytep = coding->src_multibyte; |
| 2759 | /* Charsets invoked to graphic plane 0 and 1 respectively. */ | 2822 | /* Charsets invoked to graphic plane 0 and 1 respectively. */ |
| 2760 | int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0); | 2823 | int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0); |
| @@ -2774,6 +2837,9 @@ decode_coding_iso_2022 (coding) | |||
| 2774 | int component_idx; | 2837 | int component_idx; |
| 2775 | int component_len; | 2838 | int component_len; |
| 2776 | Lisp_Object attrs, eol_type, charset_list; | 2839 | Lisp_Object attrs, eol_type, charset_list; |
| 2840 | int char_offset = coding->produced_char; | ||
| 2841 | int last_offset = char_offset; | ||
| 2842 | int last_id = charset_ascii; | ||
| 2777 | 2843 | ||
| 2778 | CODING_GET_INFO (coding, attrs, eol_type, charset_list); | 2844 | CODING_GET_INFO (coding, attrs, eol_type, charset_list); |
| 2779 | setup_iso_safe_charsets (attrs); | 2845 | setup_iso_safe_charsets (attrs); |
| @@ -3051,6 +3117,15 @@ decode_coding_iso_2022 (coding) | |||
| 3051 | } | 3117 | } |
| 3052 | } | 3118 | } |
| 3053 | 3119 | ||
| 3120 | if (charset->id != charset_ascii | ||
| 3121 | && last_id != charset->id) | ||
| 3122 | { | ||
| 3123 | if (last_id != charset_ascii) | ||
| 3124 | ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id); | ||
| 3125 | last_id = charset->id; | ||
| 3126 | last_offset = char_offset; | ||
| 3127 | } | ||
| 3128 | |||
| 3054 | /* Now we know CHARSET and 1st position code C1 of a character. | 3129 | /* Now we know CHARSET and 1st position code C1 of a character. |
| 3055 | Produce a decoded character while getting 2nd position code | 3130 | Produce a decoded character while getting 2nd position code |
| 3056 | C2 if necessary. */ | 3131 | C2 if necessary. */ |
| @@ -3082,6 +3157,7 @@ decode_coding_iso_2022 (coding) | |||
| 3082 | *charbuf++ = *src_base; | 3157 | *charbuf++ = *src_base; |
| 3083 | else | 3158 | else |
| 3084 | *charbuf++ = BYTE8_TO_CHAR (*src_base); | 3159 | *charbuf++ = BYTE8_TO_CHAR (*src_base); |
| 3160 | char_offset++; | ||
| 3085 | } | 3161 | } |
| 3086 | } | 3162 | } |
| 3087 | else if (composition_state == COMPOSING_NO) | 3163 | else if (composition_state == COMPOSING_NO) |
| @@ -3105,10 +3181,13 @@ decode_coding_iso_2022 (coding) | |||
| 3105 | consumed_chars = consumed_chars_base; | 3181 | consumed_chars = consumed_chars_base; |
| 3106 | ONE_MORE_BYTE (c); | 3182 | ONE_MORE_BYTE (c); |
| 3107 | *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c); | 3183 | *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c); |
| 3184 | char_offset++; | ||
| 3108 | coding->errors++; | 3185 | coding->errors++; |
| 3109 | } | 3186 | } |
| 3110 | 3187 | ||
| 3111 | no_more_source: | 3188 | no_more_source: |
| 3189 | if (last_id != charset_ascii) | ||
| 3190 | ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id); | ||
| 3112 | coding->consumed_char += consumed_chars_base; | 3191 | coding->consumed_char += consumed_chars_base; |
| 3113 | coding->consumed = src_base - coding->source; | 3192 | coding->consumed = src_base - coding->source; |
| 3114 | coding->charbuf_used = charbuf - coding->charbuf; | 3193 | coding->charbuf_used = charbuf - coding->charbuf; |
| @@ -3530,9 +3609,12 @@ encode_coding_iso_2022 (coding) | |||
| 3530 | Lisp_Object attrs, eol_type, charset_list; | 3609 | Lisp_Object attrs, eol_type, charset_list; |
| 3531 | int ascii_compatible; | 3610 | int ascii_compatible; |
| 3532 | int c; | 3611 | int c; |
| 3612 | int preferred_charset_id = -1; | ||
| 3533 | 3613 | ||
| 3534 | CODING_GET_INFO (coding, attrs, eol_type, charset_list); | 3614 | CODING_GET_INFO (coding, attrs, eol_type, charset_list); |
| 3535 | setup_iso_safe_charsets (attrs); | 3615 | setup_iso_safe_charsets (attrs); |
| 3616 | /* Charset list may have been changed. */ | ||
| 3617 | charset_list = CODING_ATTR_CHARSET_LIST (attrs); \ | ||
| 3536 | coding->safe_charsets | 3618 | coding->safe_charsets |
| 3537 | = (char *) XSTRING (CODING_ATTR_SAFE_CHARSETS(attrs))->data; | 3619 | = (char *) XSTRING (CODING_ATTR_SAFE_CHARSETS(attrs))->data; |
| 3538 | 3620 | ||
| @@ -3555,6 +3637,28 @@ encode_coding_iso_2022 (coding) | |||
| 3555 | 3637 | ||
| 3556 | c = *charbuf++; | 3638 | c = *charbuf++; |
| 3557 | 3639 | ||
| 3640 | if (c < 0) | ||
| 3641 | { | ||
| 3642 | /* Handle an annotation. */ | ||
| 3643 | switch (*charbuf) | ||
| 3644 | { | ||
| 3645 | case CODING_ANNOTATE_COMPOSITION_MASK: | ||
| 3646 | /* Not yet implemented. */ | ||
| 3647 | break; | ||
| 3648 | case CODING_ANNOTATE_CHARSET_MASK: | ||
| 3649 | preferred_charset_id = charbuf[3]; | ||
| 3650 | if (preferred_charset_id >= 0 | ||
| 3651 | && NILP (Fmemq (make_number (preferred_charset_id), | ||
| 3652 | charset_list))) | ||
| 3653 | preferred_charset_id = -1; | ||
| 3654 | break; | ||
| 3655 | default: | ||
| 3656 | abort (); | ||
| 3657 | } | ||
| 3658 | charbuf += -c - 1; | ||
| 3659 | continue; | ||
| 3660 | } | ||
| 3661 | |||
| 3558 | /* Now encode the character C. */ | 3662 | /* Now encode the character C. */ |
| 3559 | if (c < 0x20 || c == 0x7F) | 3663 | if (c < 0x20 || c == 0x7F) |
| 3560 | { | 3664 | { |
| @@ -3595,8 +3699,16 @@ encode_coding_iso_2022 (coding) | |||
| 3595 | } | 3699 | } |
| 3596 | else | 3700 | else |
| 3597 | { | 3701 | { |
| 3598 | struct charset *charset = char_charset (c, charset_list, NULL); | 3702 | struct charset *charset; |
| 3599 | 3703 | ||
| 3704 | if (preferred_charset_id >= 0) | ||
| 3705 | { | ||
| 3706 | charset = CHARSET_FROM_ID (preferred_charset_id); | ||
| 3707 | if (! CHAR_CHARSET_P (c, charset)) | ||
| 3708 | charset = char_charset (c, charset_list, NULL); | ||
| 3709 | } | ||
| 3710 | else | ||
| 3711 | charset = char_charset (c, charset_list, NULL); | ||
| 3600 | if (!charset) | 3712 | if (!charset) |
| 3601 | { | 3713 | { |
| 3602 | if (coding->mode & CODING_MODE_SAFE_ENCODING) | 3714 | if (coding->mode & CODING_MODE_SAFE_ENCODING) |
| @@ -3669,9 +3781,9 @@ encode_coding_iso_2022 (coding) | |||
| 3669 | CATEGORY_MASK_SJIS, else return 0. */ | 3781 | CATEGORY_MASK_SJIS, else return 0. */ |
| 3670 | 3782 | ||
| 3671 | static int | 3783 | static int |
| 3672 | detect_coding_sjis (coding, mask) | 3784 | detect_coding_sjis (coding, detect_info) |
| 3673 | struct coding_system *coding; | 3785 | struct coding_system *coding; |
| 3674 | int *mask; | 3786 | struct coding_detection_info *detect_info; |
| 3675 | { | 3787 | { |
| 3676 | unsigned char *src = coding->source, *src_base = src; | 3788 | unsigned char *src = coding->source, *src_base = src; |
| 3677 | unsigned char *src_end = coding->source + coding->src_bytes; | 3789 | unsigned char *src_end = coding->source + coding->src_bytes; |
| @@ -3681,6 +3793,7 @@ detect_coding_sjis (coding, mask) | |||
| 3681 | int c; | 3793 | int c; |
| 3682 | int incomplete; | 3794 | int incomplete; |
| 3683 | 3795 | ||
| 3796 | detect_info->checked |= CATEGORY_MASK_SJIS; | ||
| 3684 | /* A coding system of this category is always ASCII compatible. */ | 3797 | /* A coding system of this category is always ASCII compatible. */ |
| 3685 | src += coding->head_ascii; | 3798 | src += coding->head_ascii; |
| 3686 | 3799 | ||
| @@ -3696,23 +3809,24 @@ detect_coding_sjis (coding, mask) | |||
| 3696 | ONE_MORE_BYTE (c); | 3809 | ONE_MORE_BYTE (c); |
| 3697 | if (c < 0x40 || c == 0x7F || c > 0xFC) | 3810 | if (c < 0x40 || c == 0x7F || c > 0xFC) |
| 3698 | break; | 3811 | break; |
| 3699 | found = 1; | 3812 | found = CATEGORY_MASK_SJIS; |
| 3700 | } | 3813 | } |
| 3701 | else if (c >= 0xA0 && c < 0xE0) | 3814 | else if (c >= 0xA0 && c < 0xE0) |
| 3702 | found = 1; | 3815 | found = CATEGORY_MASK_SJIS; |
| 3703 | else | 3816 | else |
| 3704 | break; | 3817 | break; |
| 3705 | } | 3818 | } |
| 3706 | *mask &= ~CATEGORY_MASK_SJIS; | 3819 | detect_info->rejected |= CATEGORY_MASK_SJIS; |
| 3707 | return 0; | 3820 | return 0; |
| 3708 | 3821 | ||
| 3709 | no_more_source: | 3822 | no_more_source: |
| 3710 | if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK) | 3823 | if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK) |
| 3711 | { | 3824 | { |
| 3712 | *mask &= ~CATEGORY_MASK_SJIS; | 3825 | detect_info->rejected |= CATEGORY_MASK_SJIS; |
| 3713 | return 0; | 3826 | return 0; |
| 3714 | } | 3827 | } |
| 3715 | return found; | 3828 | detect_info->found |= found; |
| 3829 | return 1; | ||
| 3716 | } | 3830 | } |
| 3717 | 3831 | ||
| 3718 | /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". | 3832 | /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". |
| @@ -3720,9 +3834,9 @@ detect_coding_sjis (coding, mask) | |||
| 3720 | CATEGORY_MASK_BIG5, else return 0. */ | 3834 | CATEGORY_MASK_BIG5, else return 0. */ |
| 3721 | 3835 | ||
| 3722 | static int | 3836 | static int |
| 3723 | detect_coding_big5 (coding, mask) | 3837 | detect_coding_big5 (coding, detect_info) |
| 3724 | struct coding_system *coding; | 3838 | struct coding_system *coding; |
| 3725 | int *mask; | 3839 | struct coding_detection_info *detect_info; |
| 3726 | { | 3840 | { |
| 3727 | unsigned char *src = coding->source, *src_base = src; | 3841 | unsigned char *src = coding->source, *src_base = src; |
| 3728 | unsigned char *src_end = coding->source + coding->src_bytes; | 3842 | unsigned char *src_end = coding->source + coding->src_bytes; |
| @@ -3732,6 +3846,7 @@ detect_coding_big5 (coding, mask) | |||
| 3732 | int c; | 3846 | int c; |
| 3733 | int incomplete; | 3847 | int incomplete; |
| 3734 | 3848 | ||
| 3849 | detect_info->checked |= CATEGORY_MASK_BIG5; | ||
| 3735 | /* A coding system of this category is always ASCII compatible. */ | 3850 | /* A coding system of this category is always ASCII compatible. */ |
| 3736 | src += coding->head_ascii; | 3851 | src += coding->head_ascii; |
| 3737 | 3852 | ||
| @@ -3747,21 +3862,22 @@ detect_coding_big5 (coding, mask) | |||
| 3747 | ONE_MORE_BYTE (c); | 3862 | ONE_MORE_BYTE (c); |
| 3748 | if (c < 0x40 || (c >= 0x7F && c <= 0xA0)) | 3863 | if (c < 0x40 || (c >= 0x7F && c <= 0xA0)) |
| 3749 | return 0; | 3864 | return 0; |
| 3750 | found = 1; | 3865 | found = CATEGORY_MASK_BIG5; |
| 3751 | } | 3866 | } |
| 3752 | else | 3867 | else |
| 3753 | break; | 3868 | break; |
| 3754 | } | 3869 | } |
| 3755 | *mask &= ~CATEGORY_MASK_BIG5; | 3870 | detect_info->rejected |= CATEGORY_MASK_BIG5; |
| 3756 | return 0; | 3871 | return 0; |
| 3757 | 3872 | ||
| 3758 | no_more_source: | 3873 | no_more_source: |
| 3759 | if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK) | 3874 | if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK) |
| 3760 | { | 3875 | { |
| 3761 | *mask &= ~CATEGORY_MASK_BIG5; | 3876 | detect_info->rejected |= CATEGORY_MASK_BIG5; |
| 3762 | return 0; | 3877 | return 0; |
| 3763 | } | 3878 | } |
| 3764 | return found; | 3879 | detect_info->found |= found; |
| 3880 | return 1; | ||
| 3765 | } | 3881 | } |
| 3766 | 3882 | ||
| 3767 | /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". | 3883 | /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". |
| @@ -3775,11 +3891,14 @@ decode_coding_sjis (coding) | |||
| 3775 | unsigned char *src_end = coding->source + coding->src_bytes; | 3891 | unsigned char *src_end = coding->source + coding->src_bytes; |
| 3776 | unsigned char *src_base; | 3892 | unsigned char *src_base; |
| 3777 | int *charbuf = coding->charbuf; | 3893 | int *charbuf = coding->charbuf; |
| 3778 | int *charbuf_end = charbuf + coding->charbuf_size; | 3894 | int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH; |
| 3779 | int consumed_chars = 0, consumed_chars_base; | 3895 | int consumed_chars = 0, consumed_chars_base; |
| 3780 | int multibytep = coding->src_multibyte; | 3896 | int multibytep = coding->src_multibyte; |
| 3781 | struct charset *charset_roman, *charset_kanji, *charset_kana; | 3897 | struct charset *charset_roman, *charset_kanji, *charset_kana; |
| 3782 | Lisp_Object attrs, eol_type, charset_list, val; | 3898 | Lisp_Object attrs, eol_type, charset_list, val; |
| 3899 | int char_offset = coding->produced_char; | ||
| 3900 | int last_offset = char_offset; | ||
| 3901 | int last_id = charset_ascii; | ||
| 3783 | 3902 | ||
| 3784 | CODING_GET_INFO (coding, attrs, eol_type, charset_list); | 3903 | CODING_GET_INFO (coding, attrs, eol_type, charset_list); |
| 3785 | 3904 | ||
| @@ -3842,9 +3961,18 @@ decode_coding_sjis (coding) | |||
| 3842 | charset = charset_kana; | 3961 | charset = charset_kana; |
| 3843 | } | 3962 | } |
| 3844 | } | 3963 | } |
| 3964 | if (charset->id != charset_ascii | ||
| 3965 | && last_id != charset->id) | ||
| 3966 | { | ||
| 3967 | if (last_id != charset_ascii) | ||
| 3968 | ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id); | ||
| 3969 | last_id = charset->id; | ||
| 3970 | last_offset = char_offset; | ||
| 3971 | } | ||
| 3845 | CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c); | 3972 | CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c); |
| 3846 | } | 3973 | } |
| 3847 | *charbuf++ = c; | 3974 | *charbuf++ = c; |
| 3975 | char_offset++; | ||
| 3848 | continue; | 3976 | continue; |
| 3849 | 3977 | ||
| 3850 | invalid_code: | 3978 | invalid_code: |
| @@ -3852,10 +3980,13 @@ decode_coding_sjis (coding) | |||
| 3852 | consumed_chars = consumed_chars_base; | 3980 | consumed_chars = consumed_chars_base; |
| 3853 | ONE_MORE_BYTE (c); | 3981 | ONE_MORE_BYTE (c); |
| 3854 | *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c); | 3982 | *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c); |
| 3983 | char_offset++; | ||
| 3855 | coding->errors++; | 3984 | coding->errors++; |
| 3856 | } | 3985 | } |
| 3857 | 3986 | ||
| 3858 | no_more_source: | 3987 | no_more_source: |
| 3988 | if (last_id != charset_ascii) | ||
| 3989 | ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id); | ||
| 3859 | coding->consumed_char += consumed_chars_base; | 3990 | coding->consumed_char += consumed_chars_base; |
| 3860 | coding->consumed = src_base - coding->source; | 3991 | coding->consumed = src_base - coding->source; |
| 3861 | coding->charbuf_used = charbuf - coding->charbuf; | 3992 | coding->charbuf_used = charbuf - coding->charbuf; |
| @@ -3869,11 +4000,14 @@ decode_coding_big5 (coding) | |||
| 3869 | unsigned char *src_end = coding->source + coding->src_bytes; | 4000 | unsigned char *src_end = coding->source + coding->src_bytes; |
| 3870 | unsigned char *src_base; | 4001 | unsigned char *src_base; |
| 3871 | int *charbuf = coding->charbuf; | 4002 | int *charbuf = coding->charbuf; |
| 3872 | int *charbuf_end = charbuf + coding->charbuf_size; | 4003 | int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH; |
| 3873 | int consumed_chars = 0, consumed_chars_base; | 4004 | int consumed_chars = 0, consumed_chars_base; |
| 3874 | int multibytep = coding->src_multibyte; | 4005 | int multibytep = coding->src_multibyte; |
| 3875 | struct charset *charset_roman, *charset_big5; | 4006 | struct charset *charset_roman, *charset_big5; |
| 3876 | Lisp_Object attrs, eol_type, charset_list, val; | 4007 | Lisp_Object attrs, eol_type, charset_list, val; |
| 4008 | int char_offset = coding->produced_char; | ||
| 4009 | int last_offset = char_offset; | ||
| 4010 | int last_id = charset_ascii; | ||
| 3877 | 4011 | ||
| 3878 | CODING_GET_INFO (coding, attrs, eol_type, charset_list); | 4012 | CODING_GET_INFO (coding, attrs, eol_type, charset_list); |
| 3879 | val = charset_list; | 4013 | val = charset_list; |
| @@ -3923,10 +4057,19 @@ decode_coding_big5 (coding) | |||
| 3923 | c = c << 8 | c1; | 4057 | c = c << 8 | c1; |
| 3924 | charset = charset_big5; | 4058 | charset = charset_big5; |
| 3925 | } | 4059 | } |
| 4060 | if (charset->id != charset_ascii | ||
| 4061 | && last_id != charset->id) | ||
| 4062 | { | ||
| 4063 | if (last_id != charset_ascii) | ||
| 4064 | ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id); | ||
| 4065 | last_id = charset->id; | ||
| 4066 | last_offset = char_offset; | ||
| 4067 | } | ||
| 3926 | CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c); | 4068 | CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c); |
| 3927 | } | 4069 | } |
| 3928 | 4070 | ||
| 3929 | *charbuf++ = c; | 4071 | *charbuf++ = c; |
| 4072 | char_offset++; | ||
| 3930 | continue; | 4073 | continue; |
| 3931 | 4074 | ||
| 3932 | invalid_code: | 4075 | invalid_code: |
| @@ -3934,10 +4077,13 @@ decode_coding_big5 (coding) | |||
| 3934 | consumed_chars = consumed_chars_base; | 4077 | consumed_chars = consumed_chars_base; |
| 3935 | ONE_MORE_BYTE (c); | 4078 | ONE_MORE_BYTE (c); |
| 3936 | *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c); | 4079 | *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c); |
| 4080 | char_offset++; | ||
| 3937 | coding->errors++; | 4081 | coding->errors++; |
| 3938 | } | 4082 | } |
| 3939 | 4083 | ||
| 3940 | no_more_source: | 4084 | no_more_source: |
| 4085 | if (last_id != charset_ascii) | ||
| 4086 | ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id); | ||
| 3941 | coding->consumed_char += consumed_chars_base; | 4087 | coding->consumed_char += consumed_chars_base; |
| 3942 | coding->consumed = src_base - coding->source; | 4088 | coding->consumed = src_base - coding->source; |
| 3943 | coding->charbuf_used = charbuf - coding->charbuf; | 4089 | coding->charbuf_used = charbuf - coding->charbuf; |
| @@ -4106,9 +4252,9 @@ encode_coding_big5 (coding) | |||
| 4106 | CATEGORY_MASK_CCL, else return 0. */ | 4252 | CATEGORY_MASK_CCL, else return 0. */ |
| 4107 | 4253 | ||
| 4108 | static int | 4254 | static int |
| 4109 | detect_coding_ccl (coding, mask) | 4255 | detect_coding_ccl (coding, detect_info) |
| 4110 | struct coding_system *coding; | 4256 | struct coding_system *coding; |
| 4111 | int *mask; | 4257 | struct coding_detection_info *detect_info; |
| 4112 | { | 4258 | { |
| 4113 | unsigned char *src = coding->source, *src_base = src; | 4259 | unsigned char *src = coding->source, *src_base = src; |
| 4114 | unsigned char *src_end = coding->source + coding->src_bytes; | 4260 | unsigned char *src_end = coding->source + coding->src_bytes; |
| @@ -4119,6 +4265,8 @@ detect_coding_ccl (coding, mask) | |||
| 4119 | int head_ascii = coding->head_ascii; | 4265 | int head_ascii = coding->head_ascii; |
| 4120 | Lisp_Object attrs; | 4266 | Lisp_Object attrs; |
| 4121 | 4267 | ||
| 4268 | detect_info->checked |= CATEGORY_MASK_CCL; | ||
| 4269 | |||
| 4122 | coding = &coding_categories[coding_category_ccl]; | 4270 | coding = &coding_categories[coding_category_ccl]; |
| 4123 | attrs = CODING_ID_ATTRS (coding->id); | 4271 | attrs = CODING_ID_ATTRS (coding->id); |
| 4124 | if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))) | 4272 | if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))) |
| @@ -4130,14 +4278,15 @@ detect_coding_ccl (coding, mask) | |||
| 4130 | ONE_MORE_BYTE (c); | 4278 | ONE_MORE_BYTE (c); |
| 4131 | if (! valids[c]) | 4279 | if (! valids[c]) |
| 4132 | break; | 4280 | break; |
| 4133 | if (!found && valids[c] > 1) | 4281 | if ((valids[c] > 1)) |
| 4134 | found = 1; | 4282 | found = CATEGORY_MASK_CCL; |
| 4135 | } | 4283 | } |
| 4136 | *mask &= ~CATEGORY_MASK_CCL; | 4284 | detect_info->rejected |= CATEGORY_MASK_CCL; |
| 4137 | return 0; | 4285 | return 0; |
| 4138 | 4286 | ||
| 4139 | no_more_source: | 4287 | no_more_source: |
| 4140 | return found; | 4288 | detect_info->found |= found; |
| 4289 | return 1; | ||
| 4141 | } | 4290 | } |
| 4142 | 4291 | ||
| 4143 | static void | 4292 | static void |
| @@ -4375,10 +4524,14 @@ encode_coding_raw_text (coding) | |||
| 4375 | return 0; | 4524 | return 0; |
| 4376 | } | 4525 | } |
| 4377 | 4526 | ||
| 4527 | /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". | ||
| 4528 | Check if a text is encoded in a charset-based coding system. If it | ||
| 4529 | is, return 1, else return 0. */ | ||
| 4530 | |||
| 4378 | static int | 4531 | static int |
| 4379 | detect_coding_charset (coding, mask) | 4532 | detect_coding_charset (coding, detect_info) |
| 4380 | struct coding_system *coding; | 4533 | struct coding_system *coding; |
| 4381 | int *mask; | 4534 | struct coding_detection_info *detect_info; |
| 4382 | { | 4535 | { |
| 4383 | unsigned char *src = coding->source, *src_base = src; | 4536 | unsigned char *src = coding->source, *src_base = src; |
| 4384 | unsigned char *src_end = coding->source + coding->src_bytes; | 4537 | unsigned char *src_end = coding->source + coding->src_bytes; |
| @@ -4387,6 +4540,8 @@ detect_coding_charset (coding, mask) | |||
| 4387 | Lisp_Object attrs, valids; | 4540 | Lisp_Object attrs, valids; |
| 4388 | int found = 0; | 4541 | int found = 0; |
| 4389 | 4542 | ||
| 4543 | detect_info->checked |= CATEGORY_MASK_CHARSET; | ||
| 4544 | |||
| 4390 | coding = &coding_categories[coding_category_charset]; | 4545 | coding = &coding_categories[coding_category_charset]; |
| 4391 | attrs = CODING_ID_ATTRS (coding->id); | 4546 | attrs = CODING_ID_ATTRS (coding->id); |
| 4392 | valids = AREF (attrs, coding_attr_charset_valids); | 4547 | valids = AREF (attrs, coding_attr_charset_valids); |
| @@ -4402,13 +4557,14 @@ detect_coding_charset (coding, mask) | |||
| 4402 | if (NILP (AREF (valids, c))) | 4557 | if (NILP (AREF (valids, c))) |
| 4403 | break; | 4558 | break; |
| 4404 | if (c >= 0x80) | 4559 | if (c >= 0x80) |
| 4405 | found = 1; | 4560 | found = CATEGORY_MASK_CHARSET; |
| 4406 | } | 4561 | } |
| 4407 | *mask &= ~CATEGORY_MASK_CHARSET; | 4562 | detect_info->rejected |= CATEGORY_MASK_CHARSET; |
| 4408 | return 0; | 4563 | return 0; |
| 4409 | 4564 | ||
| 4410 | no_more_source: | 4565 | no_more_source: |
| 4411 | return (found || NILP (CODING_ATTR_ASCII_COMPAT (attrs))); | 4566 | detect_info->found |= found; |
| 4567 | return 1; | ||
| 4412 | } | 4568 | } |
| 4413 | 4569 | ||
| 4414 | static void | 4570 | static void |
| @@ -4419,10 +4575,13 @@ decode_coding_charset (coding) | |||
| 4419 | unsigned char *src_end = coding->source + coding->src_bytes; | 4575 | unsigned char *src_end = coding->source + coding->src_bytes; |
| 4420 | unsigned char *src_base; | 4576 | unsigned char *src_base; |
| 4421 | int *charbuf = coding->charbuf; | 4577 | int *charbuf = coding->charbuf; |
| 4422 | int *charbuf_end = charbuf + coding->charbuf_size; | 4578 | int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH; |
| 4423 | int consumed_chars = 0, consumed_chars_base; | 4579 | int consumed_chars = 0, consumed_chars_base; |
| 4424 | int multibytep = coding->src_multibyte; | 4580 | int multibytep = coding->src_multibyte; |
| 4425 | Lisp_Object attrs, eol_type, charset_list, valids; | 4581 | Lisp_Object attrs, eol_type, charset_list, valids; |
| 4582 | int char_offset = coding->produced_char; | ||
| 4583 | int last_offset = char_offset; | ||
| 4584 | int last_id = charset_ascii; | ||
| 4426 | 4585 | ||
| 4427 | CODING_GET_INFO (coding, attrs, eol_type, charset_list); | 4586 | CODING_GET_INFO (coding, attrs, eol_type, charset_list); |
| 4428 | valids = AREF (attrs, coding_attr_charset_valids); | 4587 | valids = AREF (attrs, coding_attr_charset_valids); |
| @@ -4503,8 +4662,17 @@ decode_coding_charset (coding) | |||
| 4503 | } | 4662 | } |
| 4504 | if (c < 0) | 4663 | if (c < 0) |
| 4505 | goto invalid_code; | 4664 | goto invalid_code; |
| 4665 | if (charset->id != charset_ascii | ||
| 4666 | && last_id != charset->id) | ||
| 4667 | { | ||
| 4668 | if (last_id != charset_ascii) | ||
| 4669 | ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id); | ||
| 4670 | last_id = charset->id; | ||
| 4671 | last_offset = char_offset; | ||
| 4672 | } | ||
| 4506 | } | 4673 | } |
| 4507 | *charbuf++ = c; | 4674 | *charbuf++ = c; |
| 4675 | char_offset++; | ||
| 4508 | continue; | 4676 | continue; |
| 4509 | 4677 | ||
| 4510 | invalid_code: | 4678 | invalid_code: |
| @@ -4512,10 +4680,13 @@ decode_coding_charset (coding) | |||
| 4512 | consumed_chars = consumed_chars_base; | 4680 | consumed_chars = consumed_chars_base; |
| 4513 | ONE_MORE_BYTE (c); | 4681 | ONE_MORE_BYTE (c); |
| 4514 | *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c); | 4682 | *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c); |
| 4683 | char_offset++; | ||
| 4515 | coding->errors++; | 4684 | coding->errors++; |
| 4516 | } | 4685 | } |
| 4517 | 4686 | ||
| 4518 | no_more_source: | 4687 | no_more_source: |
| 4688 | if (last_id != charset_ascii) | ||
| 4689 | ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id); | ||
| 4519 | coding->consumed_char += consumed_chars_base; | 4690 | coding->consumed_char += consumed_chars_base; |
| 4520 | coding->consumed = src_base - coding->source; | 4691 | coding->consumed = src_base - coding->source; |
| 4521 | coding->charbuf_used = charbuf - coding->charbuf; | 4692 | coding->charbuf_used = charbuf - coding->charbuf; |
| @@ -4632,6 +4803,7 @@ setup_coding_system (coding_system, coding) | |||
| 4632 | { | 4803 | { |
| 4633 | int i; | 4804 | int i; |
| 4634 | int flags = XINT (AREF (attrs, coding_attr_iso_flags)); | 4805 | int flags = XINT (AREF (attrs, coding_attr_iso_flags)); |
| 4806 | enum coding_category category = XINT (CODING_ATTR_CATEGORY (attrs)); | ||
| 4635 | 4807 | ||
| 4636 | /* Invoke graphic register 0 to plane 0. */ | 4808 | /* Invoke graphic register 0 to plane 0. */ |
| 4637 | CODING_ISO_INVOCATION (coding, 0) = 0; | 4809 | CODING_ISO_INVOCATION (coding, 0) = 0; |
| @@ -4655,6 +4827,8 @@ setup_coding_system (coding_system, coding) | |||
| 4655 | | CODING_REQUIRE_FLUSHING_MASK); | 4827 | | CODING_REQUIRE_FLUSHING_MASK); |
| 4656 | if (flags & CODING_ISO_FLAG_COMPOSITION) | 4828 | if (flags & CODING_ISO_FLAG_COMPOSITION) |
| 4657 | coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK; | 4829 | coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK; |
| 4830 | if (flags & CODING_ISO_FLAG_DESIGNATION) | ||
| 4831 | coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK; | ||
| 4658 | if (flags & CODING_ISO_FLAG_FULL_SUPPORT) | 4832 | if (flags & CODING_ISO_FLAG_FULL_SUPPORT) |
| 4659 | { | 4833 | { |
| 4660 | setup_iso_safe_charsets (attrs); | 4834 | setup_iso_safe_charsets (attrs); |
| @@ -4930,9 +5104,12 @@ coding_inherit_eol_type (coding_system, parent) | |||
| 4930 | #define EOL_SEEN_CR 2 | 5104 | #define EOL_SEEN_CR 2 |
| 4931 | #define EOL_SEEN_CRLF 4 | 5105 | #define EOL_SEEN_CRLF 4 |
| 4932 | 5106 | ||
| 4933 | /* Detect how end-of-line of a text of length CODING->src_bytes | 5107 | /* Detect how end-of-line of a text of length SRC_BYTES pointed by |
| 4934 | pointed by CODING->source is encoded. Return one of | 5108 | SOURCE is encoded. If CATEGORY is one of |
| 4935 | EOL_SEEN_XXX. */ | 5109 | coding_category_utf_16_XXXX, assume that CR and LF are encoded by |
| 5110 | two-byte, else they are encoded by one-byte. | ||
| 5111 | |||
| 5112 | Return one of EOL_SEEN_XXX. */ | ||
| 4936 | 5113 | ||
| 4937 | #define MAX_EOL_CHECK_COUNT 3 | 5114 | #define MAX_EOL_CHECK_COUNT 3 |
| 4938 | 5115 | ||
| @@ -5057,7 +5234,6 @@ detect_coding (coding) | |||
| 5057 | now. */ | 5234 | now. */ |
| 5058 | if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided)) | 5235 | if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided)) |
| 5059 | { | 5236 | { |
| 5060 | int mask = CATEGORY_MASK_ANY; | ||
| 5061 | int c, i; | 5237 | int c, i; |
| 5062 | 5238 | ||
| 5063 | for (src = coding->source; src < src_end; src++) | 5239 | for (src = coding->source; src < src_end; src++) |
| @@ -5072,46 +5248,43 @@ detect_coding (coding) | |||
| 5072 | 5248 | ||
| 5073 | if (coding->head_ascii < coding->src_bytes) | 5249 | if (coding->head_ascii < coding->src_bytes) |
| 5074 | { | 5250 | { |
| 5075 | int detected = 0; | 5251 | struct coding_detection_info detect_info; |
| 5252 | enum coding_category category; | ||
| 5253 | struct coding_system *this; | ||
| 5076 | 5254 | ||
| 5255 | detect_info.checked = detect_info.found = detect_info.rejected = 0; | ||
| 5077 | for (i = 0; i < coding_category_raw_text; i++) | 5256 | for (i = 0; i < coding_category_raw_text; i++) |
| 5078 | { | 5257 | { |
| 5079 | enum coding_category category = coding_priorities[i]; | 5258 | category = coding_priorities[i]; |
| 5080 | struct coding_system *this = coding_categories + category; | 5259 | this = coding_categories + category; |
| 5081 | |||
| 5082 | if (this->id < 0) | 5260 | if (this->id < 0) |
| 5083 | { | 5261 | { |
| 5084 | /* No coding system of this category is defined. */ | 5262 | /* No coding system of this category is defined. */ |
| 5085 | mask &= ~(1 << category); | 5263 | detect_info.rejected |= (1 << category); |
| 5086 | } | 5264 | } |
| 5087 | else if (category >= coding_category_raw_text | 5265 | else if (category >= coding_category_raw_text) |
| 5088 | || detected & (1 << category)) | ||
| 5089 | continue; | 5266 | continue; |
| 5090 | else | 5267 | else if (detect_info.checked & (1 << category)) |
| 5091 | { | 5268 | { |
| 5092 | detected |= detected_mask[category]; | 5269 | if (detect_info.found & (1 << category)) |
| 5093 | if ((*(this->detector)) (coding, &mask) | 5270 | break; |
| 5094 | && (mask & (1 << category))) | ||
| 5095 | { | ||
| 5096 | mask = 1 << category; | ||
| 5097 | break; | ||
| 5098 | } | ||
| 5099 | } | 5271 | } |
| 5272 | else if ((*(this->detector)) (coding, &detect_info) | ||
| 5273 | && detect_info.found & (1 << category)) | ||
| 5274 | break; | ||
| 5100 | } | 5275 | } |
| 5101 | if (! mask) | 5276 | if (i < coding_category_raw_text) |
| 5277 | setup_coding_system (CODING_ID_NAME (this->id), coding); | ||
| 5278 | else if (detect_info.rejected == CATEGORY_MASK_ANY) | ||
| 5102 | setup_coding_system (Qraw_text, coding); | 5279 | setup_coding_system (Qraw_text, coding); |
| 5103 | else if (mask != CATEGORY_MASK_ANY) | 5280 | else if (detect_info.rejected) |
| 5104 | for (i = 0; i < coding_category_raw_text; i++) | 5281 | for (i = 0; i < coding_category_raw_text; i++) |
| 5105 | { | 5282 | if (! (detect_info.rejected & (1 << coding_priorities[i]))) |
| 5106 | enum coding_category category = coding_priorities[i]; | 5283 | { |
| 5107 | struct coding_system *this = coding_categories + category; | 5284 | this = coding_categories + coding_priorities[i]; |
| 5108 | 5285 | setup_coding_system (CODING_ID_NAME (this->id), coding); | |
| 5109 | if (mask & (1 << category)) | 5286 | break; |
| 5110 | { | 5287 | } |
| 5111 | setup_coding_system (CODING_ID_NAME (this->id), coding); | ||
| 5112 | break; | ||
| 5113 | } | ||
| 5114 | } | ||
| 5115 | } | 5288 | } |
| 5116 | } | 5289 | } |
| 5117 | 5290 | ||
| @@ -5408,9 +5581,9 @@ produce_chars (coding) | |||
| 5408 | return produced_chars; | 5581 | return produced_chars; |
| 5409 | } | 5582 | } |
| 5410 | 5583 | ||
| 5411 | /* [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN ] | 5584 | /* Compose text in CODING->object according to the annotation data at |
| 5412 | or | 5585 | CHARBUF. CHARBUF is an array: |
| 5413 | [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN COMPONENTS... ] | 5586 | [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ] |
| 5414 | */ | 5587 | */ |
| 5415 | 5588 | ||
| 5416 | static INLINE void | 5589 | static INLINE void |
| @@ -5418,18 +5591,15 @@ produce_composition (coding, charbuf) | |||
| 5418 | struct coding_system *coding; | 5591 | struct coding_system *coding; |
| 5419 | int *charbuf; | 5592 | int *charbuf; |
| 5420 | { | 5593 | { |
| 5421 | Lisp_Object buffer; | ||
| 5422 | int len; | 5594 | int len; |
| 5423 | EMACS_INT pos; | 5595 | EMACS_INT from, to; |
| 5424 | enum composition_method method; | 5596 | enum composition_method method; |
| 5425 | int cmp_len; | ||
| 5426 | Lisp_Object components; | 5597 | Lisp_Object components; |
| 5427 | 5598 | ||
| 5428 | buffer = coding->dst_object; | ||
| 5429 | len = -charbuf[0]; | 5599 | len = -charbuf[0]; |
| 5430 | pos = coding->dst_pos + charbuf[1]; | 5600 | from = coding->dst_pos + charbuf[2]; |
| 5431 | method = (enum composition_method) (charbuf[3]); | 5601 | to = coding->dst_pos + charbuf[3]; |
| 5432 | cmp_len = charbuf[4]; | 5602 | method = (enum composition_method) (charbuf[4]); |
| 5433 | 5603 | ||
| 5434 | if (method == COMPOSITION_RELATIVE) | 5604 | if (method == COMPOSITION_RELATIVE) |
| 5435 | components = Qnil; | 5605 | components = Qnil; |
| @@ -5445,65 +5615,30 @@ produce_composition (coding, charbuf) | |||
| 5445 | components = (method == COMPOSITION_WITH_ALTCHARS | 5615 | components = (method == COMPOSITION_WITH_ALTCHARS |
| 5446 | ? Fstring (len, args) : Fvector (len, args)); | 5616 | ? Fstring (len, args) : Fvector (len, args)); |
| 5447 | } | 5617 | } |
| 5448 | compose_text (pos, pos + cmp_len, components, Qnil, Qnil); | 5618 | compose_text (from, to, components, Qnil, coding->dst_object); |
| 5449 | } | 5619 | } |
| 5450 | 5620 | ||
| 5451 | static int * | ||
| 5452 | save_composition_data (buf, buf_end, prop) | ||
| 5453 | int *buf, *buf_end; | ||
| 5454 | Lisp_Object prop; | ||
| 5455 | { | ||
| 5456 | enum composition_method method = COMPOSITION_METHOD (prop); | ||
| 5457 | int cmp_len = COMPOSITION_LENGTH (prop); | ||
| 5458 | |||
| 5459 | if (buf + 4 + (MAX_COMPOSITION_COMPONENTS * 2 - 1) > buf_end) | ||
| 5460 | return NULL; | ||
| 5461 | 5621 | ||
| 5462 | buf[1] = CODING_ANNOTATE_COMPOSITION_MASK; | 5622 | /* Put `charset' property on text in CODING->object according to |
| 5463 | buf[2] = method; | 5623 | the annotation data at CHARBUF. CHARBUF is an array: |
| 5464 | buf[3] = cmp_len; | 5624 | [ -LENGTH ANNOTATION_MASK FROM TO CHARSET-ID ] |
| 5465 | 5625 | */ | |
| 5466 | if (method == COMPOSITION_RELATIVE) | ||
| 5467 | buf[0] = 4; | ||
| 5468 | else | ||
| 5469 | { | ||
| 5470 | Lisp_Object components; | ||
| 5471 | int len, i; | ||
| 5472 | 5626 | ||
| 5473 | components = COMPOSITION_COMPONENTS (prop); | 5627 | static INLINE void |
| 5474 | if (VECTORP (components)) | 5628 | produce_charset (coding, charbuf) |
| 5475 | { | 5629 | struct coding_system *coding; |
| 5476 | len = XVECTOR (components)->size; | 5630 | int *charbuf; |
| 5477 | for (i = 0; i < len; i++) | 5631 | { |
| 5478 | buf[4 + i] = XINT (AREF (components, i)); | 5632 | EMACS_INT from = coding->dst_pos + charbuf[2]; |
| 5479 | } | 5633 | EMACS_INT to = coding->dst_pos + charbuf[3]; |
| 5480 | else if (STRINGP (components)) | 5634 | struct charset *charset = CHARSET_FROM_ID (charbuf[4]); |
| 5481 | { | ||
| 5482 | int i_byte; | ||
| 5483 | 5635 | ||
| 5484 | len = XSTRING (components)->size; | 5636 | Fput_text_property (make_number (from), make_number (to), |
| 5485 | i = i_byte = 0; | 5637 | Qcharset, CHARSET_NAME (charset), |
| 5486 | while (i < len) | 5638 | coding->dst_object); |
| 5487 | FETCH_STRING_CHAR_ADVANCE (buf[4 + i], components, i, i_byte); | ||
| 5488 | } | ||
| 5489 | else if (INTEGERP (components)) | ||
| 5490 | { | ||
| 5491 | len = 1; | ||
| 5492 | buf[4] = XINT (components); | ||
| 5493 | } | ||
| 5494 | else if (CONSP (components)) | ||
| 5495 | { | ||
| 5496 | for (len = 0; CONSP (components); | ||
| 5497 | len++, components = XCDR (components)) | ||
| 5498 | buf[4 + len] = XINT (XCAR (components)); | ||
| 5499 | } | ||
| 5500 | else | ||
| 5501 | abort (); | ||
| 5502 | buf[0] = 4 + len; | ||
| 5503 | } | ||
| 5504 | return (buf + buf[0]); | ||
| 5505 | } | 5639 | } |
| 5506 | 5640 | ||
| 5641 | |||
| 5507 | #define CHARBUF_SIZE 0x4000 | 5642 | #define CHARBUF_SIZE 0x4000 |
| 5508 | 5643 | ||
| 5509 | #define ALLOC_CONVERSION_WORK_AREA(coding) \ | 5644 | #define ALLOC_CONVERSION_WORK_AREA(coding) \ |
| @@ -5534,6 +5669,9 @@ produce_annotation (coding) | |||
| 5534 | int *charbuf = coding->charbuf; | 5669 | int *charbuf = coding->charbuf; |
| 5535 | int *charbuf_end = charbuf + coding->charbuf_used; | 5670 | int *charbuf_end = charbuf + coding->charbuf_used; |
| 5536 | 5671 | ||
| 5672 | if (NILP (coding->dst_object)) | ||
| 5673 | return; | ||
| 5674 | |||
| 5537 | while (charbuf < charbuf_end) | 5675 | while (charbuf < charbuf_end) |
| 5538 | { | 5676 | { |
| 5539 | if (*charbuf >= 0) | 5677 | if (*charbuf >= 0) |
| @@ -5541,11 +5679,14 @@ produce_annotation (coding) | |||
| 5541 | else | 5679 | else |
| 5542 | { | 5680 | { |
| 5543 | int len = -*charbuf; | 5681 | int len = -*charbuf; |
| 5544 | switch (charbuf[2]) | 5682 | switch (charbuf[1]) |
| 5545 | { | 5683 | { |
| 5546 | case CODING_ANNOTATE_COMPOSITION_MASK: | 5684 | case CODING_ANNOTATE_COMPOSITION_MASK: |
| 5547 | produce_composition (coding, charbuf); | 5685 | produce_composition (coding, charbuf); |
| 5548 | break; | 5686 | break; |
| 5687 | case CODING_ANNOTATE_CHARSET_MASK: | ||
| 5688 | produce_charset (coding, charbuf); | ||
| 5689 | break; | ||
| 5549 | default: | 5690 | default: |
| 5550 | abort (); | 5691 | abort (); |
| 5551 | } | 5692 | } |
| @@ -5669,41 +5810,159 @@ decode_coding (coding) | |||
| 5669 | return coding->result; | 5810 | return coding->result; |
| 5670 | } | 5811 | } |
| 5671 | 5812 | ||
| 5813 | |||
| 5814 | /* Extract annotation data from a composition starting at POS and | ||
| 5815 | ending before LIMIT of CODING->src_object (buffer or string), store | ||
| 5816 | the data in BUF, set *STOP to a starting position of the next | ||
| 5817 | composition (if any) or to LIMIT, and return the address of the | ||
| 5818 | next element of BUF. | ||
| 5819 | |||
| 5820 | If such an annotation is not found, set *STOP to a starting | ||
| 5821 | position of a composition after POS (if any) or to LIMIT, and | ||
| 5822 | return BUF. */ | ||
| 5823 | |||
| 5824 | static INLINE int * | ||
| 5825 | handle_composition_annotation (pos, limit, coding, buf, stop) | ||
| 5826 | EMACS_INT pos, limit; | ||
| 5827 | struct coding_system *coding; | ||
| 5828 | int *buf; | ||
| 5829 | EMACS_INT *stop; | ||
| 5830 | { | ||
| 5831 | EMACS_INT start, end; | ||
| 5832 | Lisp_Object prop; | ||
| 5833 | |||
| 5834 | if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object) | ||
| 5835 | || end > limit) | ||
| 5836 | *stop = limit; | ||
| 5837 | else if (start > pos) | ||
| 5838 | *stop = start; | ||
| 5839 | else | ||
| 5840 | { | ||
| 5841 | if (start == pos) | ||
| 5842 | { | ||
| 5843 | /* We found a composition. Store the corresponding | ||
| 5844 | annotation data in BUF. */ | ||
| 5845 | int *head = buf; | ||
| 5846 | enum composition_method method = COMPOSITION_METHOD (prop); | ||
| 5847 | int nchars = COMPOSITION_LENGTH (prop); | ||
| 5848 | |||
| 5849 | ADD_COMPOSITION_DATA (buf, 0, nchars, method); | ||
| 5850 | if (method != COMPOSITION_RELATIVE) | ||
| 5851 | { | ||
| 5852 | Lisp_Object components; | ||
| 5853 | int len, i, i_byte; | ||
| 5854 | |||
| 5855 | components = COMPOSITION_COMPONENTS (prop); | ||
| 5856 | if (VECTORP (components)) | ||
| 5857 | { | ||
| 5858 | len = XVECTOR (components)->size; | ||
| 5859 | for (i = 0; i < len; i++) | ||
| 5860 | *buf++ = XINT (AREF (components, i)); | ||
| 5861 | } | ||
| 5862 | else if (STRINGP (components)) | ||
| 5863 | { | ||
| 5864 | len = XSTRING (components)->size; | ||
| 5865 | i = i_byte = 0; | ||
| 5866 | while (i < len) | ||
| 5867 | { | ||
| 5868 | FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte); | ||
| 5869 | buf++; | ||
| 5870 | } | ||
| 5871 | } | ||
| 5872 | else if (INTEGERP (components)) | ||
| 5873 | { | ||
| 5874 | len = 1; | ||
| 5875 | *buf++ = XINT (components); | ||
| 5876 | } | ||
| 5877 | else if (CONSP (components)) | ||
| 5878 | { | ||
| 5879 | for (len = 0; CONSP (components); | ||
| 5880 | len++, components = XCDR (components)) | ||
| 5881 | *buf++ = XINT (XCAR (components)); | ||
| 5882 | } | ||
| 5883 | else | ||
| 5884 | abort (); | ||
| 5885 | *head -= len; | ||
| 5886 | } | ||
| 5887 | } | ||
| 5888 | |||
| 5889 | if (find_composition (end, limit, &start, &end, &prop, | ||
| 5890 | coding->src_object) | ||
| 5891 | && end <= limit) | ||
| 5892 | *stop = start; | ||
| 5893 | else | ||
| 5894 | *stop = limit; | ||
| 5895 | } | ||
| 5896 | return buf; | ||
| 5897 | } | ||
| 5898 | |||
| 5899 | |||
| 5900 | /* Extract annotation data from a text property `charset' at POS of | ||
| 5901 | CODING->src_object (buffer or string), store the data in BUF, set | ||
| 5902 | *STOP to the position where the value of `charset' property changes | ||
| 5903 | (limiting by LIMIT), and return the address of the next element of | ||
| 5904 | BUF. | ||
| 5905 | |||
| 5906 | If the property value is nil, set *STOP to the position where the | ||
| 5907 | property value is non-nil (limiting by LIMIT), and return BUF. */ | ||
| 5908 | |||
| 5909 | static INLINE int * | ||
| 5910 | handle_charset_annotation (pos, limit, coding, buf, stop) | ||
| 5911 | EMACS_INT pos, limit; | ||
| 5912 | struct coding_system *coding; | ||
| 5913 | int *buf; | ||
| 5914 | EMACS_INT *stop; | ||
| 5915 | { | ||
| 5916 | Lisp_Object val, next; | ||
| 5917 | int id; | ||
| 5918 | |||
| 5919 | val = Fget_text_property (make_number (pos), Qcharset, coding->src_object); | ||
| 5920 | if (! NILP (val) && CHARSETP (val)) | ||
| 5921 | id = XINT (CHARSET_SYMBOL_ID (val)); | ||
| 5922 | else | ||
| 5923 | id = -1; | ||
| 5924 | ADD_CHARSET_DATA (buf, 0, 0, id); | ||
| 5925 | next = Fnext_single_property_change (make_number (pos), Qcharset, | ||
| 5926 | coding->src_object, | ||
| 5927 | make_number (limit)); | ||
| 5928 | *stop = XINT (next); | ||
| 5929 | return buf; | ||
| 5930 | } | ||
| 5931 | |||
| 5932 | |||
| 5672 | static void | 5933 | static void |
| 5673 | consume_chars (coding) | 5934 | consume_chars (coding) |
| 5674 | struct coding_system *coding; | 5935 | struct coding_system *coding; |
| 5675 | { | 5936 | { |
| 5676 | int *buf = coding->charbuf; | 5937 | int *buf = coding->charbuf; |
| 5677 | /* -1 is to compensate for CRLF. */ | 5938 | int *buf_end = coding->charbuf + coding->charbuf_size; |
| 5678 | int *buf_end = coding->charbuf + coding->charbuf_size - 1; | ||
| 5679 | const unsigned char *src = coding->source + coding->consumed; | 5939 | const unsigned char *src = coding->source + coding->consumed; |
| 5680 | int pos = coding->src_pos + coding->consumed_char; | 5940 | EMACS_INT pos = coding->src_pos + coding->consumed_char; |
| 5681 | int end_pos = coding->src_pos + coding->src_chars; | 5941 | EMACS_INT end_pos = coding->src_pos + coding->src_chars; |
| 5682 | int multibytep = coding->src_multibyte; | 5942 | int multibytep = coding->src_multibyte; |
| 5683 | Lisp_Object eol_type; | 5943 | Lisp_Object eol_type; |
| 5684 | int c; | 5944 | int c; |
| 5685 | int start, end, stop; | 5945 | EMACS_INT stop, stop_composition, stop_charset; |
| 5686 | Lisp_Object object, prop; | 5946 | int id; |
| 5687 | 5947 | ||
| 5688 | eol_type = CODING_ID_EOL_TYPE (coding->id); | 5948 | eol_type = CODING_ID_EOL_TYPE (coding->id); |
| 5689 | if (VECTORP (eol_type)) | 5949 | if (VECTORP (eol_type)) |
| 5690 | eol_type = Qunix; | 5950 | eol_type = Qunix; |
| 5691 | 5951 | ||
| 5692 | object = coding->src_object; | ||
| 5693 | |||
| 5694 | /* Note: composition handling is not yet implemented. */ | 5952 | /* Note: composition handling is not yet implemented. */ |
| 5695 | coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK; | 5953 | coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK; |
| 5696 | 5954 | ||
| 5697 | if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK | 5955 | if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK) |
| 5698 | && find_composition (pos, end_pos, &start, &end, &prop, object) | 5956 | stop = stop_composition = pos; |
| 5699 | && end <= end_pos | 5957 | else |
| 5700 | && (start >= pos | 5958 | stop = stop_composition = end_pos; |
| 5701 | || (find_composition (end, end_pos, &start, &end, &prop, object) | 5959 | if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK) |
| 5702 | && end <= end_pos))) | 5960 | stop = stop_charset = pos; |
| 5703 | stop = start; | ||
| 5704 | else | 5961 | else |
| 5705 | stop = end_pos; | 5962 | stop_charset = end_pos; |
| 5706 | 5963 | ||
| 5964 | /* Compensate for CRLF and annotation. */ | ||
| 5965 | buf_end -= 1 + MAX_ANNOTATION_LENGTH; | ||
| 5707 | while (buf < buf_end) | 5966 | while (buf < buf_end) |
| 5708 | { | 5967 | { |
| 5709 | if (pos == stop) | 5968 | if (pos == stop) |
| @@ -5712,15 +5971,14 @@ consume_chars (coding) | |||
| 5712 | 5971 | ||
| 5713 | if (pos == end_pos) | 5972 | if (pos == end_pos) |
| 5714 | break; | 5973 | break; |
| 5715 | p = save_composition_data (buf, buf_end, prop); | 5974 | if (pos == stop_composition) |
| 5716 | if (p == NULL) | 5975 | buf = handle_composition_annotation (pos, end_pos, coding, |
| 5717 | break; | 5976 | buf, &stop_composition); |
| 5718 | buf = p; | 5977 | if (pos == stop_charset) |
| 5719 | if (find_composition (end, end_pos, &start, &end, &prop, object) | 5978 | buf = handle_charset_annotation (pos, end_pos, coding, |
| 5720 | && end <= end_pos) | 5979 | buf, &stop_charset); |
| 5721 | stop = start; | 5980 | stop = (stop_composition < stop_charset |
| 5722 | else | 5981 | ? stop_composition : stop_charset); |
| 5723 | stop = end_pos; | ||
| 5724 | } | 5982 | } |
| 5725 | 5983 | ||
| 5726 | if (! multibytep) | 5984 | if (! multibytep) |
| @@ -6162,16 +6420,16 @@ encode_coding_object (coding, src_object, from, from_byte, to, to_byte, | |||
| 6162 | else if (BUFFERP (src_object)) | 6420 | else if (BUFFERP (src_object)) |
| 6163 | { | 6421 | { |
| 6164 | set_buffer_internal (XBUFFER (src_object)); | 6422 | set_buffer_internal (XBUFFER (src_object)); |
| 6165 | if (from != GPT) | ||
| 6166 | move_gap_both (from, from_byte); | ||
| 6167 | if (EQ (src_object, dst_object)) | 6423 | if (EQ (src_object, dst_object)) |
| 6168 | { | 6424 | { |
| 6169 | del_range_both (from, from_byte, to, to_byte, 1); | 6425 | coding->src_object = del_range_1 (from, to, 1, 1); |
| 6170 | coding->src_pos = -chars; | 6426 | coding->src_pos = 0; |
| 6171 | coding->src_pos_byte = -bytes; | 6427 | coding->src_pos_byte = 0; |
| 6172 | } | 6428 | } |
| 6173 | else | 6429 | else |
| 6174 | { | 6430 | { |
| 6431 | if (from < GPT && to >= GPT) | ||
| 6432 | move_gap_both (from, from_byte); | ||
| 6175 | coding->src_pos = from; | 6433 | coding->src_pos = from; |
| 6176 | coding->src_pos_byte = from_byte; | 6434 | coding->src_pos_byte = from_byte; |
| 6177 | } | 6435 | } |
| @@ -6320,12 +6578,11 @@ detect_coding_system (src, src_bytes, highest, multibytep, coding_system) | |||
| 6320 | { | 6578 | { |
| 6321 | unsigned char *src_end = src + src_bytes; | 6579 | unsigned char *src_end = src + src_bytes; |
| 6322 | int mask = CATEGORY_MASK_ANY; | 6580 | int mask = CATEGORY_MASK_ANY; |
| 6323 | int detected = 0; | ||
| 6324 | int c, i; | ||
| 6325 | Lisp_Object attrs, eol_type; | 6581 | Lisp_Object attrs, eol_type; |
| 6326 | Lisp_Object val; | 6582 | Lisp_Object val; |
| 6327 | struct coding_system coding; | 6583 | struct coding_system coding; |
| 6328 | int id; | 6584 | int id; |
| 6585 | struct coding_detection_info detect_info; | ||
| 6329 | 6586 | ||
| 6330 | if (NILP (coding_system)) | 6587 | if (NILP (coding_system)) |
| 6331 | coding_system = Qundecided; | 6588 | coding_system = Qundecided; |
| @@ -6340,9 +6597,15 @@ detect_coding_system (src, src_bytes, highest, multibytep, coding_system) | |||
| 6340 | coding.consumed = 0; | 6597 | coding.consumed = 0; |
| 6341 | coding.mode |= CODING_MODE_LAST_BLOCK; | 6598 | coding.mode |= CODING_MODE_LAST_BLOCK; |
| 6342 | 6599 | ||
| 6600 | detect_info.checked = detect_info.found = detect_info.rejected = 0; | ||
| 6601 | |||
| 6343 | /* At first, detect text-format if necessary. */ | 6602 | /* At first, detect text-format if necessary. */ |
| 6344 | if (XINT (CODING_ATTR_CATEGORY (attrs)) == coding_category_undecided) | 6603 | if (XINT (CODING_ATTR_CATEGORY (attrs)) == coding_category_undecided) |
| 6345 | { | 6604 | { |
| 6605 | enum coding_category category; | ||
| 6606 | struct coding_system *this; | ||
| 6607 | int c, i; | ||
| 6608 | |||
| 6346 | for (; src < src_end; src++) | 6609 | for (; src < src_end; src++) |
| 6347 | { | 6610 | { |
| 6348 | c = *src; | 6611 | c = *src; |
| @@ -6357,64 +6620,92 @@ detect_coding_system (src, src_bytes, highest, multibytep, coding_system) | |||
| 6357 | if (src < src_end) | 6620 | if (src < src_end) |
| 6358 | for (i = 0; i < coding_category_raw_text; i++) | 6621 | for (i = 0; i < coding_category_raw_text; i++) |
| 6359 | { | 6622 | { |
| 6360 | enum coding_category category = coding_priorities[i]; | 6623 | category = coding_priorities[i]; |
| 6361 | struct coding_system *this = coding_categories + category; | 6624 | this = coding_categories + category; |
| 6362 | 6625 | ||
| 6363 | if (this->id < 0) | 6626 | if (this->id < 0) |
| 6364 | { | 6627 | { |
| 6365 | /* No coding system of this category is defined. */ | 6628 | /* No coding system of this category is defined. */ |
| 6366 | mask &= ~(1 << category); | 6629 | detect_info.rejected |= (1 << category); |
| 6367 | } | 6630 | } |
| 6368 | else if (category >= coding_category_raw_text | 6631 | else if (category >= coding_category_raw_text) |
| 6369 | || detected & (1 << category)) | ||
| 6370 | continue; | 6632 | continue; |
| 6633 | else if (detect_info.checked & (1 << category)) | ||
| 6634 | { | ||
| 6635 | if (highest | ||
| 6636 | && (detect_info.found & (1 << category))) | ||
| 6637 | break; | ||
| 6638 | } | ||
| 6371 | else | 6639 | else |
| 6372 | { | 6640 | { |
| 6373 | detected |= detected_mask[category]; | 6641 | if ((*(this->detector)) (&coding, &detect_info) |
| 6374 | if ((*(coding_categories[category].detector)) (&coding, &mask) | ||
| 6375 | && highest | 6642 | && highest |
| 6376 | && (mask & (1 << category))) | 6643 | && (detect_info.found & (1 << category))) |
| 6377 | { | 6644 | break; |
| 6378 | mask = 1 << category; | ||
| 6379 | break; | ||
| 6380 | } | ||
| 6381 | } | 6645 | } |
| 6382 | } | 6646 | } |
| 6383 | 6647 | ||
| 6384 | if (!mask) | 6648 | |
| 6649 | if (detect_info.rejected == CATEGORY_MASK_ANY) | ||
| 6385 | { | 6650 | { |
| 6651 | detect_info.found = CATEGORY_MASK_RAW_TEXT; | ||
| 6386 | id = coding_categories[coding_category_raw_text].id; | 6652 | id = coding_categories[coding_category_raw_text].id; |
| 6387 | val = Fcons (make_number (id), Qnil); | 6653 | val = Fcons (make_number (id), Qnil); |
| 6388 | } | 6654 | } |
| 6389 | else if (mask == CATEGORY_MASK_ANY) | 6655 | else if (! detect_info.rejected && ! detect_info.found) |
| 6390 | { | 6656 | { |
| 6657 | detect_info.found = CATEGORY_MASK_ANY; | ||
| 6391 | id = coding_categories[coding_category_undecided].id; | 6658 | id = coding_categories[coding_category_undecided].id; |
| 6392 | val = Fcons (make_number (id), Qnil); | 6659 | val = Fcons (make_number (id), Qnil); |
| 6393 | } | 6660 | } |
| 6394 | else if (highest) | 6661 | else if (highest) |
| 6395 | { | 6662 | { |
| 6396 | for (i = 0; i < coding_category_raw_text; i++) | 6663 | if (detect_info.found) |
| 6397 | if (mask & (1 << coding_priorities[i])) | 6664 | { |
| 6398 | { | 6665 | detect_info.found = 1 << category; |
| 6399 | id = coding_categories[coding_priorities[i]].id; | 6666 | val = Fcons (make_number (this->id), Qnil); |
| 6400 | val = Fcons (make_number (id), Qnil); | 6667 | } |
| 6401 | break; | 6668 | else |
| 6402 | } | 6669 | for (i = 0; i < coding_category_raw_text; i++) |
| 6403 | } | 6670 | if (! (detect_info.rejected & (1 << coding_priorities[i]))) |
| 6671 | { | ||
| 6672 | detect_info.found = 1 << coding_priorities[i]; | ||
| 6673 | id = coding_categories[coding_priorities[i]].id; | ||
| 6674 | val = Fcons (make_number (id), Qnil); | ||
| 6675 | break; | ||
| 6676 | } | ||
| 6677 | } | ||
| 6404 | else | 6678 | else |
| 6405 | { | 6679 | { |
| 6680 | int mask = detect_info.rejected | detect_info.found; | ||
| 6681 | int found = 0; | ||
| 6406 | val = Qnil; | 6682 | val = Qnil; |
| 6683 | |||
| 6407 | for (i = coding_category_raw_text - 1; i >= 0; i--) | 6684 | for (i = coding_category_raw_text - 1; i >= 0; i--) |
| 6408 | if (mask & (1 << coding_priorities[i])) | 6685 | { |
| 6409 | { | 6686 | category = coding_priorities[i]; |
| 6410 | id = coding_categories[coding_priorities[i]].id; | 6687 | if (! (mask & (1 << category))) |
| 6411 | val = Fcons (make_number (id), val); | 6688 | { |
| 6412 | } | 6689 | found |= 1 << category; |
| 6690 | id = coding_categories[category].id; | ||
| 6691 | val = Fcons (make_number (id), val); | ||
| 6692 | } | ||
| 6693 | } | ||
| 6694 | for (i = coding_category_raw_text - 1; i >= 0; i--) | ||
| 6695 | { | ||
| 6696 | category = coding_priorities[i]; | ||
| 6697 | if (detect_info.found & (1 << category)) | ||
| 6698 | { | ||
| 6699 | id = coding_categories[category].id; | ||
| 6700 | val = Fcons (make_number (id), val); | ||
| 6701 | } | ||
| 6702 | } | ||
| 6703 | detect_info.found |= found; | ||
| 6413 | } | 6704 | } |
| 6414 | } | 6705 | } |
| 6415 | else | 6706 | else |
| 6416 | { | 6707 | { |
| 6417 | mask = 1 << XINT (CODING_ATTR_CATEGORY (attrs)); | 6708 | detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs)); |
| 6418 | val = Fcons (make_number (coding.id), Qnil); | 6709 | val = Fcons (make_number (coding.id), Qnil); |
| 6419 | } | 6710 | } |
| 6420 | 6711 | ||
| @@ -6425,13 +6716,15 @@ detect_coding_system (src, src_bytes, highest, multibytep, coding_system) | |||
| 6425 | 6716 | ||
| 6426 | if (VECTORP (eol_type)) | 6717 | if (VECTORP (eol_type)) |
| 6427 | { | 6718 | { |
| 6428 | if (mask & ~CATEGORY_MASK_UTF_16) | 6719 | if (detect_info.found & ~CATEGORY_MASK_UTF_16) |
| 6429 | normal_eol = detect_eol (coding.source, src_bytes, | 6720 | normal_eol = detect_eol (coding.source, src_bytes, |
| 6430 | coding_category_raw_text); | 6721 | coding_category_raw_text); |
| 6431 | if (mask & (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_BE_NOSIG)) | 6722 | if (detect_info.found & (CATEGORY_MASK_UTF_16_BE |
| 6723 | | CATEGORY_MASK_UTF_16_BE_NOSIG)) | ||
| 6432 | utf_16_be_eol = detect_eol (coding.source, src_bytes, | 6724 | utf_16_be_eol = detect_eol (coding.source, src_bytes, |
| 6433 | coding_category_utf_16_be); | 6725 | coding_category_utf_16_be); |
| 6434 | if (mask & (CATEGORY_MASK_UTF_16_LE | CATEGORY_MASK_UTF_16_LE_NOSIG)) | 6726 | if (detect_info.found & (CATEGORY_MASK_UTF_16_LE |
| 6727 | | CATEGORY_MASK_UTF_16_LE_NOSIG)) | ||
| 6435 | utf_16_le_eol = detect_eol (coding.source, src_bytes, | 6728 | utf_16_le_eol = detect_eol (coding.source, src_bytes, |
| 6436 | coding_category_utf_16_le); | 6729 | coding_category_utf_16_le); |
| 6437 | } | 6730 | } |