diff options
| author | Kenichi Handa | 2002-10-10 09:05:37 +0000 |
|---|---|---|
| committer | Kenichi Handa | 2002-10-10 09:05:37 +0000 |
| commit | 89528eb3a79d75ad7478f4aa7ff2adbf69b8b599 (patch) | |
| tree | 8d5e84efbd7f2a21d4a048e6ef8fad8a74428c64 /src | |
| parent | e28811311e74a6a2e24f038b406e130ebc00290f (diff) | |
| download | emacs-89528eb3a79d75ad7478f4aa7ff2adbf69b8b599.tar.gz emacs-89528eb3a79d75ad7478f4aa7ff2adbf69b8b599.zip | |
(detect_coding_utf_8): Check incomplete byte sequence.
Don't update *mask when correctly detected.
(detect_coding_utf_16): Likewise.
(detect_coding_emacs_mule): Likewise.
(detect_coding_iso_2022): Likewise.
(detect_coding_sjis): Likewise.
(detect_coding_big5): Likewise.
(detect_coding_ccl): Likewise.
(decode_coding_sjis): Fix decoding of katakana-jisx0201.
(detect_eol): Delete the argument CODING, and add the argument
CATEGORY.
(detect_coding): Adjusted for the changes above.
(detect_coding_system): Likewise.
Diffstat (limited to 'src')
| -rw-r--r-- | src/coding.c | 275 |
1 files changed, 175 insertions, 100 deletions
diff --git a/src/coding.c b/src/coding.c index 7660fc01919..d23a5ff544c 100644 --- a/src/coding.c +++ b/src/coding.c | |||
| @@ -1031,6 +1031,7 @@ detect_coding_utf_8 (coding, mask) | |||
| 1031 | int multibytep = coding->src_multibyte; | 1031 | int multibytep = coding->src_multibyte; |
| 1032 | int consumed_chars = 0; | 1032 | int consumed_chars = 0; |
| 1033 | int found = 0; | 1033 | int found = 0; |
| 1034 | int incomplete; | ||
| 1034 | 1035 | ||
| 1035 | /* A coding system of this category is always ASCII compatible. */ | 1036 | /* A coding system of this category is always ASCII compatible. */ |
| 1036 | src += coding->head_ascii; | 1037 | src += coding->head_ascii; |
| @@ -1039,9 +1040,11 @@ detect_coding_utf_8 (coding, mask) | |||
| 1039 | { | 1040 | { |
| 1040 | int c, c1, c2, c3, c4; | 1041 | int c, c1, c2, c3, c4; |
| 1041 | 1042 | ||
| 1043 | incomplete = 0; | ||
| 1042 | ONE_MORE_BYTE (c); | 1044 | ONE_MORE_BYTE (c); |
| 1043 | if (UTF_8_1_OCTET_P (c)) | 1045 | if (UTF_8_1_OCTET_P (c)) |
| 1044 | continue; | 1046 | continue; |
| 1047 | incomplete = 1; | ||
| 1045 | ONE_MORE_BYTE (c1); | 1048 | ONE_MORE_BYTE (c1); |
| 1046 | if (! UTF_8_EXTRA_OCTET_P (c1)) | 1049 | if (! UTF_8_EXTRA_OCTET_P (c1)) |
| 1047 | break; | 1050 | break; |
| @@ -1080,10 +1083,12 @@ detect_coding_utf_8 (coding, mask) | |||
| 1080 | return 0; | 1083 | return 0; |
| 1081 | 1084 | ||
| 1082 | no_more_source: | 1085 | no_more_source: |
| 1083 | if (! found) | 1086 | if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK) |
| 1084 | return 0; | 1087 | { |
| 1085 | *mask &= CATEGORY_MASK_UTF_8; | 1088 | *mask &= ~CATEGORY_MASK_UTF_8; |
| 1086 | return 1; | 1089 | return 0; |
| 1090 | } | ||
| 1091 | return found; | ||
| 1087 | } | 1092 | } |
| 1088 | 1093 | ||
| 1089 | 1094 | ||
| @@ -1289,19 +1294,19 @@ detect_coding_utf_16 (coding, mask) | |||
| 1289 | int consumed_chars = 0; | 1294 | int consumed_chars = 0; |
| 1290 | int c1, c2; | 1295 | int c1, c2; |
| 1291 | 1296 | ||
| 1297 | *mask &= ~CATEGORY_MASK_UTF_16; | ||
| 1298 | |||
| 1292 | ONE_MORE_BYTE (c1); | 1299 | ONE_MORE_BYTE (c1); |
| 1293 | ONE_MORE_BYTE (c2); | 1300 | ONE_MORE_BYTE (c2); |
| 1294 | 1301 | ||
| 1295 | if ((c1 == 0xFF) && (c2 == 0xFE)) | 1302 | if ((c1 == 0xFF) && (c2 == 0xFE)) |
| 1296 | { | 1303 | *mask |= CATEGORY_MASK_UTF_16_LE; |
| 1297 | *mask &= CATEGORY_MASK_UTF_16_LE; | ||
| 1298 | return 1; | ||
| 1299 | } | ||
| 1300 | else if ((c1 == 0xFE) && (c2 == 0xFF)) | 1304 | else if ((c1 == 0xFE) && (c2 == 0xFF)) |
| 1301 | { | 1305 | *mask |= CATEGORY_MASK_UTF_16_BE; |
| 1302 | *mask &= CATEGORY_MASK_UTF_16_BE; | 1306 | else |
| 1303 | return 1; | 1307 | *mask |= CATEGORY_MASK_UTF_16_BE_NOSIG | CATEGORY_MASK_UTF_16_LE_NOSIG; |
| 1304 | } | 1308 | return 1; |
| 1309 | |||
| 1305 | no_more_source: | 1310 | no_more_source: |
| 1306 | return 0; | 1311 | return 0; |
| 1307 | } | 1312 | } |
| @@ -1643,13 +1648,16 @@ detect_coding_emacs_mule (coding, mask) | |||
| 1643 | int consumed_chars = 0; | 1648 | int consumed_chars = 0; |
| 1644 | int c; | 1649 | int c; |
| 1645 | int found = 0; | 1650 | int found = 0; |
| 1651 | int incomplete; | ||
| 1646 | 1652 | ||
| 1647 | /* A coding system of this category is always ASCII compatible. */ | 1653 | /* A coding system of this category is always ASCII compatible. */ |
| 1648 | src += coding->head_ascii; | 1654 | src += coding->head_ascii; |
| 1649 | 1655 | ||
| 1650 | while (1) | 1656 | while (1) |
| 1651 | { | 1657 | { |
| 1658 | incomplete = 0; | ||
| 1652 | ONE_MORE_BYTE (c); | 1659 | ONE_MORE_BYTE (c); |
| 1660 | incomplete = 1; | ||
| 1653 | 1661 | ||
| 1654 | if (c == 0x80) | 1662 | if (c == 0x80) |
| 1655 | { | 1663 | { |
| @@ -1698,10 +1706,12 @@ detect_coding_emacs_mule (coding, mask) | |||
| 1698 | return 0; | 1706 | return 0; |
| 1699 | 1707 | ||
| 1700 | no_more_source: | 1708 | no_more_source: |
| 1701 | if (!found) | 1709 | if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK) |
| 1702 | return 0; | 1710 | { |
| 1703 | *mask &= CATEGORY_MASK_EMACS_MULE; | 1711 | *mask &= ~CATEGORY_MASK_EMACS_MULE; |
| 1704 | return 1; | 1712 | return 0; |
| 1713 | } | ||
| 1714 | return found; | ||
| 1705 | } | 1715 | } |
| 1706 | 1716 | ||
| 1707 | 1717 | ||
| @@ -2465,6 +2475,7 @@ detect_coding_iso_2022 (coding, mask) | |||
| 2465 | { | 2475 | { |
| 2466 | int newmask = CATEGORY_MASK_ISO_8_ELSE; | 2476 | int newmask = CATEGORY_MASK_ISO_8_ELSE; |
| 2467 | 2477 | ||
| 2478 | mask_8bit_found = 1; | ||
| 2468 | if (inhibit_iso_escape_detection) | 2479 | if (inhibit_iso_escape_detection) |
| 2469 | break; | 2480 | break; |
| 2470 | if (c != ISO_CODE_CSI) | 2481 | if (c != ISO_CODE_CSI) |
| @@ -2558,7 +2569,8 @@ detect_coding_iso_2022 (coding, mask) | |||
| 2558 | } | 2569 | } |
| 2559 | if (!mask_found) | 2570 | if (!mask_found) |
| 2560 | return 0; | 2571 | return 0; |
| 2561 | *mask &= mask_iso & mask_found; | 2572 | *mask &= ~CATEGORY_MASK_ISO; |
| 2573 | *mask |= mask_iso & mask_found; | ||
| 2562 | if (! mask_8bit_found) | 2574 | if (! mask_8bit_found) |
| 2563 | *mask &= ~(CATEGORY_MASK_ISO_8BIT | CATEGORY_MASK_ISO_8_ELSE); | 2575 | *mask &= ~(CATEGORY_MASK_ISO_8BIT | CATEGORY_MASK_ISO_8_ELSE); |
| 2564 | return 1; | 2576 | return 1; |
| @@ -3658,13 +3670,16 @@ detect_coding_sjis (coding, mask) | |||
| 3658 | int consumed_chars = 0; | 3670 | int consumed_chars = 0; |
| 3659 | int found = 0; | 3671 | int found = 0; |
| 3660 | int c; | 3672 | int c; |
| 3673 | int incomplete; | ||
| 3661 | 3674 | ||
| 3662 | /* A coding system of this category is always ASCII compatible. */ | 3675 | /* A coding system of this category is always ASCII compatible. */ |
| 3663 | src += coding->head_ascii; | 3676 | src += coding->head_ascii; |
| 3664 | 3677 | ||
| 3665 | while (1) | 3678 | while (1) |
| 3666 | { | 3679 | { |
| 3680 | incomplete = 0; | ||
| 3667 | ONE_MORE_BYTE (c); | 3681 | ONE_MORE_BYTE (c); |
| 3682 | incomplete = 1; | ||
| 3668 | if (c < 0x80) | 3683 | if (c < 0x80) |
| 3669 | continue; | 3684 | continue; |
| 3670 | if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF)) | 3685 | if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF)) |
| @@ -3683,10 +3698,12 @@ detect_coding_sjis (coding, mask) | |||
| 3683 | return 0; | 3698 | return 0; |
| 3684 | 3699 | ||
| 3685 | no_more_source: | 3700 | no_more_source: |
| 3686 | if (!found) | 3701 | if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK) |
| 3687 | return 0; | 3702 | { |
| 3688 | *mask &= CATEGORY_MASK_SJIS; | 3703 | *mask &= ~CATEGORY_MASK_SJIS; |
| 3689 | return 1; | 3704 | return 0; |
| 3705 | } | ||
| 3706 | return found; | ||
| 3690 | } | 3707 | } |
| 3691 | 3708 | ||
| 3692 | /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". | 3709 | /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". |
| @@ -3704,13 +3721,16 @@ detect_coding_big5 (coding, mask) | |||
| 3704 | int consumed_chars = 0; | 3721 | int consumed_chars = 0; |
| 3705 | int found = 0; | 3722 | int found = 0; |
| 3706 | int c; | 3723 | int c; |
| 3724 | int incomplete; | ||
| 3707 | 3725 | ||
| 3708 | /* A coding system of this category is always ASCII compatible. */ | 3726 | /* A coding system of this category is always ASCII compatible. */ |
| 3709 | src += coding->head_ascii; | 3727 | src += coding->head_ascii; |
| 3710 | 3728 | ||
| 3711 | while (1) | 3729 | while (1) |
| 3712 | { | 3730 | { |
| 3731 | incomplete = 0; | ||
| 3713 | ONE_MORE_BYTE (c); | 3732 | ONE_MORE_BYTE (c); |
| 3733 | incomplete = 1; | ||
| 3714 | if (c < 0x80) | 3734 | if (c < 0x80) |
| 3715 | continue; | 3735 | continue; |
| 3716 | if (c >= 0xA1) | 3736 | if (c >= 0xA1) |
| @@ -3727,10 +3747,12 @@ detect_coding_big5 (coding, mask) | |||
| 3727 | return 0; | 3747 | return 0; |
| 3728 | 3748 | ||
| 3729 | no_more_source: | 3749 | no_more_source: |
| 3730 | if (!found) | 3750 | if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK) |
| 3731 | return 0; | 3751 | { |
| 3732 | *mask &= CATEGORY_MASK_BIG5; | 3752 | *mask &= ~CATEGORY_MASK_BIG5; |
| 3733 | return 1; | 3753 | return 0; |
| 3754 | } | ||
| 3755 | return found; | ||
| 3734 | } | 3756 | } |
| 3735 | 3757 | ||
| 3736 | /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". | 3758 | /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". |
| @@ -3754,8 +3776,8 @@ decode_coding_sjis (coding) | |||
| 3754 | 3776 | ||
| 3755 | val = charset_list; | 3777 | val = charset_list; |
| 3756 | charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val); | 3778 | charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val); |
| 3757 | charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val); | 3779 | charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val); |
| 3758 | charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))); | 3780 | charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))); |
| 3759 | 3781 | ||
| 3760 | while (1) | 3782 | while (1) |
| 3761 | { | 3783 | { |
| @@ -3802,8 +3824,11 @@ decode_coding_sjis (coding) | |||
| 3802 | charset = charset_kanji; | 3824 | charset = charset_kanji; |
| 3803 | } | 3825 | } |
| 3804 | else | 3826 | else |
| 3805 | /* SJIS -> JISX0201-Kana */ | 3827 | { |
| 3806 | charset = charset_kana; | 3828 | /* SJIS -> JISX0201-Kana */ |
| 3829 | c &= 0x7F; | ||
| 3830 | charset = charset_kana; | ||
| 3831 | } | ||
| 3807 | } | 3832 | } |
| 3808 | CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c); | 3833 | CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c); |
| 3809 | } | 3834 | } |
| @@ -4097,10 +4122,7 @@ detect_coding_ccl (coding, mask) | |||
| 4097 | return 0; | 4122 | return 0; |
| 4098 | 4123 | ||
| 4099 | no_more_source: | 4124 | no_more_source: |
| 4100 | if (!found) | 4125 | return found; |
| 4101 | return 0; | ||
| 4102 | *mask &= CATEGORY_MASK_CCL; | ||
| 4103 | return 1; | ||
| 4104 | } | 4126 | } |
| 4105 | 4127 | ||
| 4106 | static void | 4128 | static void |
| @@ -4368,7 +4390,6 @@ detect_coding_charset (coding, mask) | |||
| 4368 | return 0; | 4390 | return 0; |
| 4369 | 4391 | ||
| 4370 | no_more_source: | 4392 | no_more_source: |
| 4371 | *mask &= CATEGORY_MASK_CHARSET; | ||
| 4372 | return 1; | 4393 | return 1; |
| 4373 | } | 4394 | } |
| 4374 | 4395 | ||
| @@ -4894,25 +4915,22 @@ coding_inherit_eol_type (coding_system, parent) | |||
| 4894 | #define MAX_EOL_CHECK_COUNT 3 | 4915 | #define MAX_EOL_CHECK_COUNT 3 |
| 4895 | 4916 | ||
| 4896 | static int | 4917 | static int |
| 4897 | detect_eol (coding, source, src_bytes) | 4918 | detect_eol (source, src_bytes, category) |
| 4898 | struct coding_system *coding; | ||
| 4899 | unsigned char *source; | 4919 | unsigned char *source; |
| 4900 | EMACS_INT src_bytes; | 4920 | EMACS_INT src_bytes; |
| 4921 | enum coding_category category; | ||
| 4901 | { | 4922 | { |
| 4902 | Lisp_Object attrs, coding_type; | ||
| 4903 | unsigned char *src = source, *src_end = src + src_bytes; | 4923 | unsigned char *src = source, *src_end = src + src_bytes; |
| 4904 | unsigned char c; | 4924 | unsigned char c; |
| 4905 | int total = 0; | 4925 | int total = 0; |
| 4906 | int eol_seen = EOL_SEEN_NONE; | 4926 | int eol_seen = EOL_SEEN_NONE; |
| 4907 | 4927 | ||
| 4908 | attrs = CODING_ID_ATTRS (coding->id); | 4928 | if ((1 << category) & CATEGORY_MASK_UTF_16) |
| 4909 | coding_type = CODING_ATTR_TYPE (attrs); | ||
| 4910 | |||
| 4911 | if (EQ (coding_type, Qccl)) | ||
| 4912 | { | 4929 | { |
| 4913 | int msb, lsb; | 4930 | int msb, lsb; |
| 4914 | 4931 | ||
| 4915 | msb = coding->spec.utf_16.endian == utf_16_little_endian; | 4932 | msb = category == (coding_category_utf_16_le |
| 4933 | | coding_category_utf_16_le_nosig); | ||
| 4916 | lsb = 1 - msb; | 4934 | lsb = 1 - msb; |
| 4917 | 4935 | ||
| 4918 | while (src + 1 < src_end) | 4936 | while (src + 1 < src_end) |
| @@ -5039,19 +5057,19 @@ detect_coding (coding) | |||
| 5039 | enum coding_category category = coding_priorities[i]; | 5057 | enum coding_category category = coding_priorities[i]; |
| 5040 | struct coding_system *this = coding_categories + category; | 5058 | struct coding_system *this = coding_categories + category; |
| 5041 | 5059 | ||
| 5042 | if (category >= coding_category_raw_text | ||
| 5043 | || detected & (1 << category)) | ||
| 5044 | continue; | ||
| 5045 | |||
| 5046 | if (this->id < 0) | 5060 | if (this->id < 0) |
| 5047 | { | 5061 | { |
| 5048 | /* No coding system of this category is defined. */ | 5062 | /* No coding system of this category is defined. */ |
| 5049 | mask &= ~(1 << category); | 5063 | mask &= ~(1 << category); |
| 5050 | } | 5064 | } |
| 5065 | else if (category >= coding_category_raw_text | ||
| 5066 | || detected & (1 << category)) | ||
| 5067 | continue; | ||
| 5051 | else | 5068 | else |
| 5052 | { | 5069 | { |
| 5053 | detected |= detected_mask[category]; | 5070 | detected |= detected_mask[category]; |
| 5054 | if ((*(this->detector)) (coding, &mask)) | 5071 | if ((*(this->detector)) (coding, &mask) |
| 5072 | && (mask & (1 << category))) | ||
| 5055 | break; | 5073 | break; |
| 5056 | } | 5074 | } |
| 5057 | } | 5075 | } |
| @@ -5081,7 +5099,8 @@ detect_coding (coding) | |||
| 5081 | if (VECTORP (CODING_ID_EOL_TYPE (coding->id)) | 5099 | if (VECTORP (CODING_ID_EOL_TYPE (coding->id)) |
| 5082 | && ! EQ (coding_type, Qccl)) | 5100 | && ! EQ (coding_type, Qccl)) |
| 5083 | { | 5101 | { |
| 5084 | int eol_seen = detect_eol (coding, coding->source, coding->src_bytes); | 5102 | int eol_seen = detect_eol (coding->source, coding->src_bytes, |
| 5103 | XINT (CODING_ATTR_CATEGORY (attrs))); | ||
| 5085 | 5104 | ||
| 5086 | if (eol_seen != EOL_SEEN_NONE) | 5105 | if (eol_seen != EOL_SEEN_NONE) |
| 5087 | adjust_coding_eol_type (coding, eol_seen); | 5106 | adjust_coding_eol_type (coding, eol_seen); |
| @@ -6245,6 +6264,22 @@ The value of property should be a vector of length 5. */) | |||
| 6245 | } | 6264 | } |
| 6246 | 6265 | ||
| 6247 | 6266 | ||
| 6267 | /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If | ||
| 6268 | HIGHEST is nonzero, return the coding system of the highest | ||
| 6269 | priority among the detected coding systems. Otherwise return a | ||
| 6270 | list of detected coding systems sorted by their priorities. If | ||
| 6271 | MULTIBYTEP is nonzero, it is assumed that the bytes are in correct | ||
| 6272 | multibyte form but contain only ASCII and eight-bit chars. | ||
| 6273 | Otherwise, the bytes are raw bytes. | ||
| 6274 | |||
| 6275 | CODING-SYSTEM controls the detection as below: | ||
| 6276 | |||
| 6277 | If it is nil, detect both text-format and eol-format. If the | ||
| 6278 | text-format part of CODING-SYSTEM is already specified | ||
| 6279 | (e.g. `iso-latin-1'), detect only eol-format. If the eol-format | ||
| 6280 | part of CODING-SYSTEM is already specified (e.g. `undecided-unix'), | ||
| 6281 | detect only text-format. */ | ||
| 6282 | |||
| 6248 | Lisp_Object | 6283 | Lisp_Object |
| 6249 | detect_coding_system (src, src_bytes, highest, multibytep, coding_system) | 6284 | detect_coding_system (src, src_bytes, highest, multibytep, coding_system) |
| 6250 | unsigned char *src; | 6285 | unsigned char *src; |
| @@ -6259,31 +6294,33 @@ detect_coding_system (src, src_bytes, highest, multibytep, coding_system) | |||
| 6259 | Lisp_Object attrs, eol_type; | 6294 | Lisp_Object attrs, eol_type; |
| 6260 | Lisp_Object val; | 6295 | Lisp_Object val; |
| 6261 | struct coding_system coding; | 6296 | struct coding_system coding; |
| 6297 | int id; | ||
| 6262 | 6298 | ||
| 6263 | if (NILP (coding_system)) | 6299 | if (NILP (coding_system)) |
| 6264 | coding_system = Qundecided; | 6300 | coding_system = Qundecided; |
| 6265 | setup_coding_system (coding_system, &coding); | 6301 | setup_coding_system (coding_system, &coding); |
| 6266 | attrs = CODING_ID_ATTRS (coding.id); | 6302 | attrs = CODING_ID_ATTRS (coding.id); |
| 6267 | eol_type = CODING_ID_EOL_TYPE (coding.id); | 6303 | eol_type = CODING_ID_EOL_TYPE (coding.id); |
| 6304 | coding_system = CODING_ATTR_BASE_NAME (attrs); | ||
| 6268 | 6305 | ||
| 6269 | coding.source = src; | 6306 | coding.source = src; |
| 6270 | coding.src_bytes = src_bytes; | 6307 | coding.src_bytes = src_bytes; |
| 6271 | coding.src_multibyte = multibytep; | 6308 | coding.src_multibyte = multibytep; |
| 6272 | coding.consumed = 0; | 6309 | coding.consumed = 0; |
| 6310 | coding.mode |= CODING_MODE_LAST_BLOCK; | ||
| 6273 | 6311 | ||
| 6274 | if (XINT (CODING_ATTR_CATEGORY (attrs)) != coding_category_undecided) | 6312 | /* At first, detect text-format if necessary. */ |
| 6275 | { | 6313 | if (XINT (CODING_ATTR_CATEGORY (attrs)) == coding_category_undecided) |
| 6276 | mask = 1 << XINT (CODING_ATTR_CATEGORY (attrs)); | ||
| 6277 | } | ||
| 6278 | else | ||
| 6279 | { | 6314 | { |
| 6280 | coding_system = Qnil; | ||
| 6281 | for (; src < src_end; src++) | 6315 | for (; src < src_end; src++) |
| 6282 | { | 6316 | { |
| 6283 | c = *src; | 6317 | c = *src; |
| 6284 | if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC | 6318 | if (c & 0x80 |
| 6285 | || c == ISO_CODE_SI | 6319 | || (c < 0x20 && (c == ISO_CODE_ESC |
| 6286 | || c == ISO_CODE_SO))) | 6320 | || c == ISO_CODE_SI |
| 6321 | || c == ISO_CODE_SO | ||
| 6322 | /* Most UTF-16 text contains '\0'. */ | ||
| 6323 | || !c))) | ||
| 6287 | break; | 6324 | break; |
| 6288 | } | 6325 | } |
| 6289 | coding.head_ascii = src - coding.source; | 6326 | coding.head_ascii = src - coding.source; |
| @@ -6294,84 +6331,122 @@ detect_coding_system (src, src_bytes, highest, multibytep, coding_system) | |||
| 6294 | enum coding_category category = coding_priorities[i]; | 6331 | enum coding_category category = coding_priorities[i]; |
| 6295 | struct coding_system *this = coding_categories + category; | 6332 | struct coding_system *this = coding_categories + category; |
| 6296 | 6333 | ||
| 6297 | if (category >= coding_category_raw_text | ||
| 6298 | || detected & (1 << category)) | ||
| 6299 | continue; | ||
| 6300 | |||
| 6301 | if (this->id < 0) | 6334 | if (this->id < 0) |
| 6302 | { | 6335 | { |
| 6303 | /* No coding system of this category is defined. */ | 6336 | /* No coding system of this category is defined. */ |
| 6304 | mask &= ~(1 << category); | 6337 | mask &= ~(1 << category); |
| 6305 | } | 6338 | } |
| 6339 | else if (category >= coding_category_raw_text | ||
| 6340 | || detected & (1 << category)) | ||
| 6341 | continue; | ||
| 6306 | else | 6342 | else |
| 6307 | { | 6343 | { |
| 6308 | detected |= detected_mask[category]; | 6344 | detected |= detected_mask[category]; |
| 6309 | if ((*(coding_categories[category].detector)) (&coding, &mask) | 6345 | if ((*(coding_categories[category].detector)) (&coding, &mask) |
| 6310 | && highest) | 6346 | && highest |
| 6347 | && (mask & (1 << category))) | ||
| 6311 | { | 6348 | { |
| 6312 | mask &= detected_mask[category]; | 6349 | mask = 1 << category; |
| 6313 | break; | 6350 | break; |
| 6314 | } | 6351 | } |
| 6315 | } | 6352 | } |
| 6316 | } | 6353 | } |
| 6317 | } | ||
| 6318 | 6354 | ||
| 6319 | if (!mask) | 6355 | if (!mask) |
| 6320 | val = Fcons (make_number (coding_category_raw_text), Qnil); | 6356 | { |
| 6321 | else if (mask == CATEGORY_MASK_ANY) | 6357 | id = coding_categories[coding_category_raw_text].id; |
| 6322 | val = Fcons (make_number (coding_category_undecided), Qnil); | 6358 | val = Fcons (make_number (id), Qnil); |
| 6323 | else if (highest) | 6359 | } |
| 6324 | { | 6360 | else if (mask == CATEGORY_MASK_ANY) |
| 6325 | for (i = 0; i < coding_category_raw_text; i++) | 6361 | { |
| 6326 | if (mask & (1 << coding_priorities[i])) | 6362 | id = coding_categories[coding_category_undecided].id; |
| 6327 | { | 6363 | val = Fcons (make_number (id), Qnil); |
| 6328 | val = Fcons (make_number (coding_priorities[i]), Qnil); | 6364 | } |
| 6329 | break; | 6365 | else if (highest) |
| 6330 | } | 6366 | { |
| 6331 | } | 6367 | for (i = 0; i < coding_category_raw_text; i++) |
| 6368 | if (mask & (1 << coding_priorities[i])) | ||
| 6369 | { | ||
| 6370 | id = coding_categories[coding_priorities[i]].id; | ||
| 6371 | val = Fcons (make_number (id), Qnil); | ||
| 6372 | break; | ||
| 6373 | } | ||
| 6374 | } | ||
| 6375 | else | ||
| 6376 | { | ||
| 6377 | val = Qnil; | ||
| 6378 | for (i = coding_category_raw_text - 1; i >= 0; i--) | ||
| 6379 | if (mask & (1 << coding_priorities[i])) | ||
| 6380 | { | ||
| 6381 | id = coding_categories[coding_priorities[i]].id; | ||
| 6382 | val = Fcons (make_number (id), val); | ||
| 6383 | } | ||
| 6384 | } | ||
| 6385 | } | ||
| 6332 | else | 6386 | else |
| 6333 | { | 6387 | { |
| 6334 | val = Qnil; | 6388 | mask = 1 << XINT (CODING_ATTR_CATEGORY (attrs)); |
| 6335 | for (i = coding_category_raw_text - 1; i >= 0; i--) | 6389 | val = Fcons (make_number (coding.id), Qnil); |
| 6336 | if (mask & (1 << coding_priorities[i])) | ||
| 6337 | val = Fcons (make_number (coding_priorities[i]), val); | ||
| 6338 | } | 6390 | } |
| 6339 | 6391 | ||
| 6392 | /* Then, detect eol-format if necessary. */ | ||
| 6340 | { | 6393 | { |
| 6341 | int one_byte_eol = -1, two_byte_eol = -1; | 6394 | int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol; |
| 6342 | Lisp_Object tail; | 6395 | Lisp_Object tail; |
| 6343 | 6396 | ||
| 6397 | if (VECTORP (eol_type)) | ||
| 6398 | { | ||
| 6399 | if (mask & ~CATEGORY_MASK_UTF_16) | ||
| 6400 | normal_eol = detect_eol (coding.source, src_bytes, | ||
| 6401 | coding_category_raw_text); | ||
| 6402 | if (mask & (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_BE_NOSIG)) | ||
| 6403 | utf_16_be_eol = detect_eol (coding.source, src_bytes, | ||
| 6404 | coding_category_utf_16_be); | ||
| 6405 | if (mask & (CATEGORY_MASK_UTF_16_LE | CATEGORY_MASK_UTF_16_LE_NOSIG)) | ||
| 6406 | utf_16_le_eol = detect_eol (coding.source, src_bytes, | ||
| 6407 | coding_category_utf_16_le); | ||
| 6408 | } | ||
| 6409 | else | ||
| 6410 | { | ||
| 6411 | if (EQ (eol_type, Qunix)) | ||
| 6412 | normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF; | ||
| 6413 | else if (EQ (eol_type, Qdos)) | ||
| 6414 | normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF; | ||
| 6415 | else | ||
| 6416 | normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR; | ||
| 6417 | } | ||
| 6418 | |||
| 6344 | for (tail = val; CONSP (tail); tail = XCDR (tail)) | 6419 | for (tail = val; CONSP (tail); tail = XCDR (tail)) |
| 6345 | { | 6420 | { |
| 6346 | struct coding_system *this | 6421 | enum coding_category category; |
| 6347 | = (NILP (coding_system) ? coding_categories + XINT (XCAR (tail)) | ||
| 6348 | : &coding); | ||
| 6349 | int this_eol; | 6422 | int this_eol; |
| 6350 | 6423 | ||
| 6351 | attrs = CODING_ID_ATTRS (this->id); | 6424 | id = XINT (XCAR (tail)); |
| 6352 | eol_type = CODING_ID_EOL_TYPE (this->id); | 6425 | attrs = CODING_ID_ATTRS (id); |
| 6353 | XSETCAR (tail, CODING_ID_NAME (this->id)); | 6426 | category = XINT (CODING_ATTR_CATEGORY (attrs)); |
| 6427 | eol_type = CODING_ID_EOL_TYPE (id); | ||
| 6354 | if (VECTORP (eol_type)) | 6428 | if (VECTORP (eol_type)) |
| 6355 | { | 6429 | { |
| 6356 | if (EQ (CODING_ATTR_TYPE (attrs), Qutf_16)) | 6430 | if (category == coding_category_utf_16_be |
| 6357 | { | 6431 | || category == coding_category_utf_16_be_nosig) |
| 6358 | if (two_byte_eol < 0) | 6432 | this_eol = utf_16_be_eol; |
| 6359 | two_byte_eol = detect_eol (this, coding.source, src_bytes); | 6433 | else if (category == coding_category_utf_16_le |
| 6360 | this_eol = two_byte_eol; | 6434 | || category == coding_category_utf_16_le_nosig) |
| 6361 | } | 6435 | this_eol = utf_16_le_eol; |
| 6362 | else | 6436 | else |
| 6363 | { | 6437 | this_eol = normal_eol; |
| 6364 | if (one_byte_eol < 0) | 6438 | |
| 6365 | one_byte_eol =detect_eol (this, coding.source, src_bytes); | ||
| 6366 | this_eol = one_byte_eol; | ||
| 6367 | } | ||
| 6368 | if (this_eol == EOL_SEEN_LF) | 6439 | if (this_eol == EOL_SEEN_LF) |
| 6369 | XSETCAR (tail, AREF (eol_type, 0)); | 6440 | XSETCAR (tail, AREF (eol_type, 0)); |
| 6370 | else if (this_eol == EOL_SEEN_CRLF) | 6441 | else if (this_eol == EOL_SEEN_CRLF) |
| 6371 | XSETCAR (tail, AREF (eol_type, 1)); | 6442 | XSETCAR (tail, AREF (eol_type, 1)); |
| 6372 | else if (this_eol == EOL_SEEN_CR) | 6443 | else if (this_eol == EOL_SEEN_CR) |
| 6373 | XSETCAR (tail, AREF (eol_type, 2)); | 6444 | XSETCAR (tail, AREF (eol_type, 2)); |
| 6445 | else | ||
| 6446 | XSETCAR (tail, CODING_ID_NAME (id)); | ||
| 6374 | } | 6447 | } |
| 6448 | else | ||
| 6449 | XSETCAR (tail, CODING_ID_NAME (id)); | ||
| 6375 | } | 6450 | } |
| 6376 | } | 6451 | } |
| 6377 | 6452 | ||