diff options
Diffstat (limited to 'src/coding.c')
| -rw-r--r-- | src/coding.c | 409 |
1 files changed, 312 insertions, 97 deletions
diff --git a/src/coding.c b/src/coding.c index 6cfcec905a1..42fd81b6322 100644 --- a/src/coding.c +++ b/src/coding.c | |||
| @@ -1125,6 +1125,14 @@ alloc_destination (struct coding_system *coding, ptrdiff_t nbytes, | |||
| 1125 | *buf++ = id; \ | 1125 | *buf++ = id; \ |
| 1126 | } while (0) | 1126 | } while (0) |
| 1127 | 1127 | ||
| 1128 | |||
| 1129 | /* Bitmasks for coding->eol_seen. */ | ||
| 1130 | |||
| 1131 | #define EOL_SEEN_NONE 0 | ||
| 1132 | #define EOL_SEEN_LF 1 | ||
| 1133 | #define EOL_SEEN_CR 2 | ||
| 1134 | #define EOL_SEEN_CRLF 4 | ||
| 1135 | |||
| 1128 | 1136 | ||
| 1129 | /*** 2. Emacs' internal format (emacs-utf-8) ***/ | 1137 | /*** 2. Emacs' internal format (emacs-utf-8) ***/ |
| 1130 | 1138 | ||
| @@ -1147,6 +1155,9 @@ alloc_destination (struct coding_system *coding, ptrdiff_t nbytes, | |||
| 1147 | #define UTF_8_BOM_2 0xBB | 1155 | #define UTF_8_BOM_2 0xBB |
| 1148 | #define UTF_8_BOM_3 0xBF | 1156 | #define UTF_8_BOM_3 0xBF |
| 1149 | 1157 | ||
| 1158 | /* Unlike the other detect_coding_XXX, this function counts the number of | |
| 1159 | characters and checks the EOL format. */ | |
| 1160 | |||
| 1150 | static bool | 1161 | static bool |
| 1151 | detect_coding_utf_8 (struct coding_system *coding, | 1162 | detect_coding_utf_8 (struct coding_system *coding, |
| 1152 | struct coding_detection_info *detect_info) | 1163 | struct coding_detection_info *detect_info) |
| @@ -1156,11 +1167,23 @@ detect_coding_utf_8 (struct coding_system *coding, | |||
| 1156 | bool multibytep = coding->src_multibyte; | 1167 | bool multibytep = coding->src_multibyte; |
| 1157 | ptrdiff_t consumed_chars = 0; | 1168 | ptrdiff_t consumed_chars = 0; |
| 1158 | bool bom_found = 0; | 1169 | bool bom_found = 0; |
| 1159 | bool found = 0; | 1170 | int nchars = coding->head_ascii; |
| 1171 | int eol_seen = coding->eol_seen; | ||
| 1160 | 1172 | ||
| 1161 | detect_info->checked |= CATEGORY_MASK_UTF_8; | 1173 | detect_info->checked |= CATEGORY_MASK_UTF_8; |
| 1162 | /* A coding system of this category is always ASCII compatible. */ | 1174 | /* A coding system of this category is always ASCII compatible. */ |
| 1163 | src += coding->head_ascii; | 1175 | src += nchars; |
| 1176 | |||
| 1177 | if (src == coding->source /* BOM should be at the head. */ | ||
| 1178 | && src + 3 < src_end /* BOM is 3-byte long. */ | ||
| 1179 | && src[0] == UTF_8_BOM_1 | ||
| 1180 | && src[1] == UTF_8_BOM_2 | ||
| 1181 | && src[2] == UTF_8_BOM_3) | ||
| 1182 | { | ||
| 1183 | bom_found = 1; | ||
| 1184 | src += 3; | ||
| 1185 | nchars++; | ||
| 1186 | } | ||
| 1164 | 1187 | ||
| 1165 | while (1) | 1188 | while (1) |
| 1166 | { | 1189 | { |
| @@ -1169,13 +1192,29 @@ detect_coding_utf_8 (struct coding_system *coding, | |||
| 1169 | src_base = src; | 1192 | src_base = src; |
| 1170 | ONE_MORE_BYTE (c); | 1193 | ONE_MORE_BYTE (c); |
| 1171 | if (c < 0 || UTF_8_1_OCTET_P (c)) | 1194 | if (c < 0 || UTF_8_1_OCTET_P (c)) |
| 1172 | continue; | 1195 | { |
| 1196 | nchars++; | ||
| 1197 | if (c == '\r') | ||
| 1198 | { | ||
| 1199 | if (src < src_end && *src == '\n') | ||
| 1200 | { | ||
| 1201 | eol_seen |= EOL_SEEN_CRLF; | ||
| 1202 | src++; | ||
| 1203 | nchars++; | ||
| 1204 | } | ||
| 1205 | else | ||
| 1206 | eol_seen |= EOL_SEEN_CR; | ||
| 1207 | } | ||
| 1208 | else if (c == '\n') | ||
| 1209 | eol_seen |= EOL_SEEN_LF; | ||
| 1210 | continue; | ||
| 1211 | } | ||
| 1173 | ONE_MORE_BYTE (c1); | 1212 | ONE_MORE_BYTE (c1); |
| 1174 | if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1)) | 1213 | if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1)) |
| 1175 | break; | 1214 | break; |
| 1176 | if (UTF_8_2_OCTET_LEADING_P (c)) | 1215 | if (UTF_8_2_OCTET_LEADING_P (c)) |
| 1177 | { | 1216 | { |
| 1178 | found = 1; | 1217 | nchars++; |
| 1179 | continue; | 1218 | continue; |
| 1180 | } | 1219 | } |
| 1181 | ONE_MORE_BYTE (c2); | 1220 | ONE_MORE_BYTE (c2); |
| @@ -1183,10 +1222,7 @@ detect_coding_utf_8 (struct coding_system *coding, | |||
| 1183 | break; | 1222 | break; |
| 1184 | if (UTF_8_3_OCTET_LEADING_P (c)) | 1223 | if (UTF_8_3_OCTET_LEADING_P (c)) |
| 1185 | { | 1224 | { |
| 1186 | found = 1; | 1225 | nchars++; |
| 1187 | if (src_base == coding->source | ||
| 1188 | && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3) | ||
| 1189 | bom_found = 1; | ||
| 1190 | continue; | 1226 | continue; |
| 1191 | } | 1227 | } |
| 1192 | ONE_MORE_BYTE (c3); | 1228 | ONE_MORE_BYTE (c3); |
| @@ -1194,7 +1230,7 @@ detect_coding_utf_8 (struct coding_system *coding, | |||
| 1194 | break; | 1230 | break; |
| 1195 | if (UTF_8_4_OCTET_LEADING_P (c)) | 1231 | if (UTF_8_4_OCTET_LEADING_P (c)) |
| 1196 | { | 1232 | { |
| 1197 | found = 1; | 1233 | nchars++; |
| 1198 | continue; | 1234 | continue; |
| 1199 | } | 1235 | } |
| 1200 | ONE_MORE_BYTE (c4); | 1236 | ONE_MORE_BYTE (c4); |
| @@ -1202,7 +1238,7 @@ detect_coding_utf_8 (struct coding_system *coding, | |||
| 1202 | break; | 1238 | break; |
| 1203 | if (UTF_8_5_OCTET_LEADING_P (c)) | 1239 | if (UTF_8_5_OCTET_LEADING_P (c)) |
| 1204 | { | 1240 | { |
| 1205 | found = 1; | 1241 | nchars++; |
| 1206 | continue; | 1242 | continue; |
| 1207 | } | 1243 | } |
| 1208 | break; | 1244 | break; |
| @@ -1219,14 +1255,17 @@ detect_coding_utf_8 (struct coding_system *coding, | |||
| 1219 | if (bom_found) | 1255 | if (bom_found) |
| 1220 | { | 1256 | { |
| 1221 | /* The first character 0xFEFF doesn't necessarily mean a BOM. */ | 1257 | /* The first character 0xFEFF doesn't necessarily mean a BOM. */ |
| 1222 | detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG; | 1258 | detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG; |
| 1223 | } | 1259 | } |
| 1224 | else | 1260 | else |
| 1225 | { | 1261 | { |
| 1226 | detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG; | 1262 | detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG; |
| 1227 | if (found) | 1263 | if (nchars < src_end - coding->source) |
| 1228 | detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG; | 1264 | /* Fewer characters were found than source bytes, which |
| 1265 | means that we found valid non-ASCII characters. */ | |
| 1266 | detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG; | ||
| 1229 | } | 1267 | } |
| 1268 | coding->detected_utf8_chars = nchars; | ||
| 1230 | return 1; | 1269 | return 1; |
| 1231 | } | 1270 | } |
| 1232 | 1271 | ||
| @@ -3887,6 +3926,14 @@ decode_coding_iso_2022 (struct coding_system *coding) | |||
| 3887 | *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c); | 3926 | *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c); |
| 3888 | char_offset++; | 3927 | char_offset++; |
| 3889 | coding->errors++; | 3928 | coding->errors++; |
| 3929 | /* Reset the invocation and designation status to the safest | ||
| 3930 | one; i.e. designate ASCII to the graphic register 0, and | ||
| 3931 | invoke that register to the graphic plane 0. This typically | ||
| 3932 | helps the case that a designation sequence for ASCII "ESC ( | |
| 3933 | B" is somehow broken (e.g. broken by a newline). */ | ||
| 3934 | CODING_ISO_INVOCATION (coding, 0) = 0; | ||
| 3935 | CODING_ISO_DESIGNATION (coding, 0) = charset_ascii; | ||
| 3936 | charset_id_0 = charset_ascii; | ||
| 3890 | continue; | 3937 | continue; |
| 3891 | 3938 | ||
| 3892 | break_loop: | 3939 | break_loop: |
| @@ -5614,7 +5661,6 @@ setup_coding_system (Lisp_Object coding_system, struct coding_system *coding) | |||
| 5614 | eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id); | 5661 | eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id); |
| 5615 | 5662 | ||
| 5616 | coding->mode = 0; | 5663 | coding->mode = 0; |
| 5617 | coding->head_ascii = -1; | ||
| 5618 | if (VECTORP (eol_type)) | 5664 | if (VECTORP (eol_type)) |
| 5619 | coding->common_flags = (CODING_REQUIRE_DECODING_MASK | 5665 | coding->common_flags = (CODING_REQUIRE_DECODING_MASK |
| 5620 | | CODING_REQUIRE_DETECTION_MASK); | 5666 | | CODING_REQUIRE_DETECTION_MASK); |
| @@ -6066,51 +6112,40 @@ complement_process_encoding_system (Lisp_Object coding_system) | |||
| 6066 | 6112 | ||
| 6067 | */ | 6113 | */ |
| 6068 | 6114 | ||
| 6069 | #define EOL_SEEN_NONE 0 | 6115 | static Lisp_Object adjust_coding_eol_type (struct coding_system *coding, |
| 6070 | #define EOL_SEEN_LF 1 | 6116 | int eol_seen); |
| 6071 | #define EOL_SEEN_CR 2 | ||
| 6072 | #define EOL_SEEN_CRLF 4 | ||
| 6073 | 6117 | ||
| 6074 | 6118 | ||
| 6075 | static Lisp_Object adjust_coding_eol_type (struct coding_system *coding, int eol_seen); | 6119 | /* Return the number of ASCII characters at the head of the source. |
| 6120 | By side effects, set coding->head_ascii and update | ||
| 6121 | coding->eol_seen. The value of coding->eol_seen is "logical or" of | ||
| 6122 | EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is | ||
| 6123 | reliable only when all the source bytes are ASCII. */ | ||
| 6076 | 6124 | ||
| 6077 | 6125 | static int | |
| 6078 | /* Return 1 if all the source bytes are ASCII, and return 0 otherwise. | 6126 | check_ascii (struct coding_system *coding) |
| 6079 | By side effects, set coding->head_ascii and coding->eol_seen. The | ||
| 6080 | value of coding->eol_seen is "logical or" of EOL_SEEN_LF, | ||
| 6081 | EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is reliable only when | ||
| 6082 | all the source bytes are ASCII. */ | ||
| 6083 | |||
| 6084 | static bool | ||
| 6085 | detect_ascii (struct coding_system *coding) | ||
| 6086 | { | 6127 | { |
| 6087 | const unsigned char *src, *end; | 6128 | const unsigned char *src, *end; |
| 6088 | Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id); | 6129 | Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id); |
| 6089 | int eol_seen; | 6130 | int eol_seen = coding->eol_seen; |
| 6090 | 6131 | ||
| 6091 | eol_seen = (VECTORP (eol_type) ? EOL_SEEN_NONE | ||
| 6092 | : EQ (eol_type, Qunix) ? EOL_SEEN_LF | ||
| 6093 | : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF | ||
| 6094 | : EOL_SEEN_CR); | ||
| 6095 | coding_set_source (coding); | 6132 | coding_set_source (coding); |
| 6096 | src = coding->source; | 6133 | src = coding->source; |
| 6097 | end = src + coding->src_bytes; | 6134 | end = src + coding->src_bytes; |
| 6098 | 6135 | ||
| 6099 | if (inhibit_eol_conversion) | 6136 | if (inhibit_eol_conversion |
| 6137 | || SYMBOLP (eol_type)) | ||
| 6100 | { | 6138 | { |
| 6101 | /* We don't have to check EOL format. */ | 6139 | /* We don't have to check EOL format. */ |
| 6102 | while (src < end && !( *src & 0x80)) src++; | 6140 | while (src < end && !( *src & 0x80)) |
| 6103 | eol_seen = EOL_SEEN_LF; | 6141 | { |
| 6104 | adjust_coding_eol_type (coding, eol_seen); | 6142 | if (*src++ == '\n') |
| 6105 | } | 6143 | eol_seen |= EOL_SEEN_LF; |
| 6106 | else if (eol_seen != EOL_SEEN_NONE) | 6144 | } |
| 6107 | { | ||
| 6108 | /* We don't have to check EOL format either. */ | ||
| 6109 | while (src < end && !(*src & 0x80)) src++; | ||
| 6110 | } | 6145 | } |
| 6111 | else | 6146 | else |
| 6112 | { | 6147 | { |
| 6113 | end--; /* We look ahead one byte. */ | 6148 | end--; /* We look ahead one byte for "CR LF". */ |
| 6114 | while (src < end) | 6149 | while (src < end) |
| 6115 | { | 6150 | { |
| 6116 | int c = *src; | 6151 | int c = *src; |
| @@ -6118,6 +6153,69 @@ detect_ascii (struct coding_system *coding) | |||
| 6118 | if (c & 0x80) | 6153 | if (c & 0x80) |
| 6119 | break; | 6154 | break; |
| 6120 | src++; | 6155 | src++; |
| 6156 | if (c == '\r') | ||
| 6157 | { | ||
| 6158 | if (*src == '\n') | ||
| 6159 | { | ||
| 6160 | eol_seen |= EOL_SEEN_CRLF; | ||
| 6161 | src++; | ||
| 6162 | } | ||
| 6163 | else | ||
| 6164 | eol_seen |= EOL_SEEN_CR; | ||
| 6165 | } | ||
| 6166 | else if (c == '\n') | ||
| 6167 | eol_seen |= EOL_SEEN_LF; | ||
| 6168 | } | ||
| 6169 | if (src == end) | ||
| 6170 | { | ||
| 6171 | int c = *src; | ||
| 6172 | |||
| 6173 | /* All bytes but the last one C are ASCII. */ | ||
| 6174 | if (! (c & 0x80)) | ||
| 6175 | { | ||
| 6176 | if (c == '\r') | ||
| 6177 | eol_seen |= EOL_SEEN_CR; | ||
| 6178 | else if (c == '\n') | ||
| 6179 | eol_seen |= EOL_SEEN_LF; | ||
| 6180 | src++; | ||
| 6181 | } | ||
| 6182 | } | ||
| 6183 | } | ||
| 6184 | coding->head_ascii = src - coding->source; | ||
| 6185 | coding->eol_seen = eol_seen; | ||
| 6186 | return (coding->head_ascii); | ||
| 6187 | } | ||
| 6188 | |||
| 6189 | |||
| 6190 | /* Return the number of characters at the source if all the bytes are | ||
| 6191 | valid UTF-8 (of Unicode range). Otherwise, return -1. By side | ||
| 6192 | effects, update coding->eol_seen. The value of coding->eol_seen is | ||
| 6193 | "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but | ||
| 6194 | the value is reliable only when all the source bytes are valid | ||
| 6195 | UTF-8. */ | ||
| 6196 | |||
| 6197 | static int | ||
| 6198 | check_utf_8 (struct coding_system *coding) | ||
| 6199 | { | ||
| 6200 | const unsigned char *src, *end; | ||
| 6201 | int eol_seen; | ||
| 6202 | int nchars = coding->head_ascii; | ||
| 6203 | |||
| 6204 | if (coding->head_ascii < 0) | ||
| 6205 | check_ascii (coding); | ||
| 6206 | else | ||
| 6207 | coding_set_source (coding); | ||
| 6208 | src = coding->source + coding->head_ascii; | ||
| 6209 | /* We look ahead one byte for CR LF. */ | ||
| 6210 | end = coding->source + coding->src_bytes - 1; | ||
| 6211 | eol_seen = coding->eol_seen; | ||
| 6212 | while (src < end) | ||
| 6213 | { | ||
| 6214 | int c = *src; | ||
| 6215 | |||
| 6216 | if (UTF_8_1_OCTET_P (*src)) | ||
| 6217 | { | ||
| 6218 | src++; | ||
| 6121 | if (c < 0x20) | 6219 | if (c < 0x20) |
| 6122 | { | 6220 | { |
| 6123 | if (c == '\r') | 6221 | if (c == '\r') |
| @@ -6126,6 +6224,7 @@ detect_ascii (struct coding_system *coding) | |||
| 6126 | { | 6224 | { |
| 6127 | eol_seen |= EOL_SEEN_CRLF; | 6225 | eol_seen |= EOL_SEEN_CRLF; |
| 6128 | src++; | 6226 | src++; |
| 6227 | nchars++; | ||
| 6129 | } | 6228 | } |
| 6130 | else | 6229 | else |
| 6131 | eol_seen |= EOL_SEEN_CR; | 6230 | eol_seen |= EOL_SEEN_CR; |
| @@ -6134,27 +6233,58 @@ detect_ascii (struct coding_system *coding) | |||
| 6134 | eol_seen |= EOL_SEEN_LF; | 6233 | eol_seen |= EOL_SEEN_LF; |
| 6135 | } | 6234 | } |
| 6136 | } | 6235 | } |
| 6137 | if (src > end) | 6236 | else if (UTF_8_2_OCTET_LEADING_P (c)) |
| 6138 | /* The last two bytes are CR LF, which means that we have | ||
| 6139 | scanned all bytes. */ | ||
| 6140 | end++; | ||
| 6141 | else if (src == end) | ||
| 6142 | { | 6237 | { |
| 6143 | end++; | 6238 | if (c < 0xC2 /* overlong sequence */ |
| 6144 | if (! (*src & 0x80)) | 6239 | || src + 1 >= end |
| 6145 | { | 6240 | || ! UTF_8_EXTRA_OCTET_P (src[1])) |
| 6146 | if (*src == '\r') | 6241 | return -1; |
| 6147 | eol_seen |= EOL_SEEN_CR; | 6242 | src += 2; |
| 6148 | else if (*src == '\n') | ||
| 6149 | eol_seen |= EOL_SEEN_LF; | ||
| 6150 | src++; | ||
| 6151 | } | ||
| 6152 | } | 6243 | } |
| 6153 | adjust_coding_eol_type (coding, eol_seen); | 6244 | else if (UTF_8_3_OCTET_LEADING_P (c)) |
| 6245 | { | ||
| 6246 | if (src + 2 >= end | ||
| 6247 | || ! (UTF_8_EXTRA_OCTET_P (src[1]) | ||
| 6248 | && UTF_8_EXTRA_OCTET_P (src[2]))) | ||
| 6249 | return -1; | ||
| 6250 | c = (((c & 0xF) << 12) | ||
| 6251 | | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F)); | ||
| 6252 | if (c < 0x800 /* overlong sequence */ | ||
| 6253 | || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */ | ||
| 6254 | return -1; | ||
| 6255 | src += 3; | ||
| 6256 | } | ||
| 6257 | else if (UTF_8_4_OCTET_LEADING_P (c)) | ||
| 6258 | { | ||
| 6259 | if (src + 3 >= end | ||
| 6260 | || ! (UTF_8_EXTRA_OCTET_P (src[1]) | ||
| 6261 | && UTF_8_EXTRA_OCTET_P (src[2]) | ||
| 6262 | && UTF_8_EXTRA_OCTET_P (src[3]))) | ||
| 6263 | return -1; | ||
| 6264 | c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12) | ||
| 6265 | | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)); | ||
| 6266 | if (c < 0x10000 /* overlong sequence */ | ||
| 6267 | || c >= 0x110000) /* non-Unicode character */ | ||
| 6268 | return -1; | ||
| 6269 | src += 4; | ||
| 6270 | } | ||
| 6271 | else | ||
| 6272 | return -1; | ||
| 6273 | nchars++; | ||
| 6274 | } | ||
| 6275 | |||
| 6276 | if (src == end) | ||
| 6277 | { | ||
| 6278 | if (! UTF_8_1_OCTET_P (*src)) | ||
| 6279 | return -1; | ||
| 6280 | nchars++; | ||
| 6281 | if (*src == '\r') | ||
| 6282 | eol_seen |= EOL_SEEN_CR; | ||
| 6283 | else if (*src == '\n') | ||
| 6284 | eol_seen |= EOL_SEEN_LF; | ||
| 6154 | } | 6285 | } |
| 6155 | coding->head_ascii = src - coding->source; | ||
| 6156 | coding->eol_seen = eol_seen; | 6286 | coding->eol_seen = eol_seen; |
| 6157 | return (src == end); | 6287 | return nchars; |
| 6158 | } | 6288 | } |
| 6159 | 6289 | ||
| 6160 | 6290 | ||
| @@ -6269,6 +6399,9 @@ adjust_coding_eol_type (struct coding_system *coding, int eol_seen) | |||
| 6269 | Lisp_Object eol_type; | 6399 | Lisp_Object eol_type; |
| 6270 | 6400 | ||
| 6271 | eol_type = CODING_ID_EOL_TYPE (coding->id); | 6401 | eol_type = CODING_ID_EOL_TYPE (coding->id); |
| 6402 | if (! VECTORP (eol_type)) | ||
| 6403 | /* Already adjusted. */ | ||
| 6404 | return eol_type; | ||
| 6272 | if (eol_seen & EOL_SEEN_LF) | 6405 | if (eol_seen & EOL_SEEN_LF) |
| 6273 | { | 6406 | { |
| 6274 | coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0)); | 6407 | coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0)); |
| @@ -6296,6 +6429,8 @@ detect_coding (struct coding_system *coding) | |||
| 6296 | { | 6429 | { |
| 6297 | const unsigned char *src, *src_end; | 6430 | const unsigned char *src, *src_end; |
| 6298 | unsigned int saved_mode = coding->mode; | 6431 | unsigned int saved_mode = coding->mode; |
| 6432 | Lisp_Object found = Qnil; | ||
| 6433 | Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id); | ||
| 6299 | 6434 | ||
| 6300 | coding->consumed = coding->consumed_char = 0; | 6435 | coding->consumed = coding->consumed_char = 0; |
| 6301 | coding->produced = coding->produced_char = 0; | 6436 | coding->produced = coding->produced_char = 0; |
| @@ -6303,6 +6438,7 @@ detect_coding (struct coding_system *coding) | |||
| 6303 | 6438 | ||
| 6304 | src_end = coding->source + coding->src_bytes; | 6439 | src_end = coding->source + coding->src_bytes; |
| 6305 | 6440 | ||
| 6441 | coding->eol_seen = EOL_SEEN_NONE; | ||
| 6306 | /* If we have not yet decided the text encoding type, detect it | 6442 | /* If we have not yet decided the text encoding type, detect it |
| 6307 | now. */ | 6443 | now. */ |
| 6308 | if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided)) | 6444 | if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided)) |
| @@ -6312,7 +6448,6 @@ detect_coding (struct coding_system *coding) | |||
| 6312 | bool null_byte_found = 0, eight_bit_found = 0; | 6448 | bool null_byte_found = 0, eight_bit_found = 0; |
| 6313 | 6449 | ||
| 6314 | coding->head_ascii = 0; | 6450 | coding->head_ascii = 0; |
| 6315 | coding->eol_seen = EOL_SEEN_NONE; | ||
| 6316 | detect_info.checked = detect_info.found = detect_info.rejected = 0; | 6451 | detect_info.checked = detect_info.found = detect_info.rejected = 0; |
| 6317 | for (src = coding->source; src < src_end; src++) | 6452 | for (src = coding->source; src < src_end; src++) |
| 6318 | { | 6453 | { |
| @@ -6360,7 +6495,8 @@ detect_coding (struct coding_system *coding) | |||
| 6360 | { | 6495 | { |
| 6361 | coding->eol_seen |= EOL_SEEN_CRLF; | 6496 | coding->eol_seen |= EOL_SEEN_CRLF; |
| 6362 | src++; | 6497 | src++; |
| 6363 | coding->head_ascii++; | 6498 | if (! eight_bit_found) |
| 6499 | coding->head_ascii++; | ||
| 6364 | } | 6500 | } |
| 6365 | else | 6501 | else |
| 6366 | coding->eol_seen |= EOL_SEEN_CR; | 6502 | coding->eol_seen |= EOL_SEEN_CR; |
| @@ -6422,32 +6558,58 @@ detect_coding (struct coding_system *coding) | |||
| 6422 | } | 6558 | } |
| 6423 | else if ((*(this->detector)) (coding, &detect_info) | 6559 | else if ((*(this->detector)) (coding, &detect_info) |
| 6424 | && detect_info.found & (1 << category)) | 6560 | && detect_info.found & (1 << category)) |
| 6425 | { | 6561 | break; |
| 6426 | if (category == coding_category_utf_16_auto) | ||
| 6427 | { | ||
| 6428 | if (detect_info.found & CATEGORY_MASK_UTF_16_LE) | ||
| 6429 | category = coding_category_utf_16_le; | ||
| 6430 | else | ||
| 6431 | category = coding_category_utf_16_be; | ||
| 6432 | } | ||
| 6433 | break; | ||
| 6434 | } | ||
| 6435 | } | 6562 | } |
| 6436 | } | 6563 | } |
| 6437 | 6564 | ||
| 6438 | if (i < coding_category_raw_text) | 6565 | if (i < coding_category_raw_text) |
| 6439 | setup_coding_system (CODING_ID_NAME (this->id), coding); | 6566 | { |
| 6567 | if (category == coding_category_utf_8_auto) | ||
| 6568 | { | ||
| 6569 | Lisp_Object coding_systems; | ||
| 6570 | |||
| 6571 | coding_systems = AREF (CODING_ID_ATTRS (this->id), | ||
| 6572 | coding_attr_utf_bom); | ||
| 6573 | if (CONSP (coding_systems)) | ||
| 6574 | { | ||
| 6575 | if (detect_info.found & CATEGORY_MASK_UTF_8_SIG) | ||
| 6576 | found = XCAR (coding_systems); | ||
| 6577 | else | ||
| 6578 | found = XCDR (coding_systems); | ||
| 6579 | } | ||
| 6580 | else | ||
| 6581 | found = CODING_ID_NAME (this->id); | ||
| 6582 | } | ||
| 6583 | else if (category == coding_category_utf_16_auto) | ||
| 6584 | { | ||
| 6585 | Lisp_Object coding_systems; | ||
| 6586 | |||
| 6587 | coding_systems = AREF (CODING_ID_ATTRS (this->id), | ||
| 6588 | coding_attr_utf_bom); | ||
| 6589 | if (CONSP (coding_systems)) | ||
| 6590 | { | ||
| 6591 | if (detect_info.found & CATEGORY_MASK_UTF_16_LE) | ||
| 6592 | found = XCAR (coding_systems); | ||
| 6593 | else if (detect_info.found & CATEGORY_MASK_UTF_16_BE) | ||
| 6594 | found = XCDR (coding_systems); | ||
| 6595 | } | ||
| 6596 | else | ||
| 6597 | found = CODING_ID_NAME (this->id); | ||
| 6598 | } | ||
| 6599 | else | ||
| 6600 | found = CODING_ID_NAME (this->id); | ||
| 6601 | } | ||
| 6440 | else if (null_byte_found) | 6602 | else if (null_byte_found) |
| 6441 | setup_coding_system (Qno_conversion, coding); | 6603 | found = Qno_conversion; |
| 6442 | else if ((detect_info.rejected & CATEGORY_MASK_ANY) | 6604 | else if ((detect_info.rejected & CATEGORY_MASK_ANY) |
| 6443 | == CATEGORY_MASK_ANY) | 6605 | == CATEGORY_MASK_ANY) |
| 6444 | setup_coding_system (Qraw_text, coding); | 6606 | found = Qraw_text; |
| 6445 | else if (detect_info.rejected) | 6607 | else if (detect_info.rejected) |
| 6446 | for (i = 0; i < coding_category_raw_text; i++) | 6608 | for (i = 0; i < coding_category_raw_text; i++) |
| 6447 | if (! (detect_info.rejected & (1 << coding_priorities[i]))) | 6609 | if (! (detect_info.rejected & (1 << coding_priorities[i]))) |
| 6448 | { | 6610 | { |
| 6449 | this = coding_categories + coding_priorities[i]; | 6611 | this = coding_categories + coding_priorities[i]; |
| 6450 | setup_coding_system (CODING_ID_NAME (this->id), coding); | 6612 | found = CODING_ID_NAME (this->id); |
| 6451 | break; | 6613 | break; |
| 6452 | } | 6614 | } |
| 6453 | } | 6615 | } |
| @@ -6461,9 +6623,10 @@ detect_coding (struct coding_system *coding) | |||
| 6461 | coding_systems | 6623 | coding_systems |
| 6462 | = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom); | 6624 | = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom); |
| 6463 | detect_info.found = detect_info.rejected = 0; | 6625 | detect_info.found = detect_info.rejected = 0; |
| 6464 | if (detect_ascii (coding)) | 6626 | if (check_ascii (coding) == coding->src_bytes) |
| 6465 | { | 6627 | { |
| 6466 | setup_coding_system (XCDR (coding_systems), coding); | 6628 | if (CONSP (coding_systems)) |
| 6629 | found = XCDR (coding_systems); | ||
| 6467 | } | 6630 | } |
| 6468 | else | 6631 | else |
| 6469 | { | 6632 | { |
| @@ -6471,9 +6634,9 @@ detect_coding (struct coding_system *coding) | |||
| 6471 | && detect_coding_utf_8 (coding, &detect_info)) | 6634 | && detect_coding_utf_8 (coding, &detect_info)) |
| 6472 | { | 6635 | { |
| 6473 | if (detect_info.found & CATEGORY_MASK_UTF_8_SIG) | 6636 | if (detect_info.found & CATEGORY_MASK_UTF_8_SIG) |
| 6474 | setup_coding_system (XCAR (coding_systems), coding); | 6637 | found = XCAR (coding_systems); |
| 6475 | else | 6638 | else |
| 6476 | setup_coding_system (XCDR (coding_systems), coding); | 6639 | found = XCDR (coding_systems); |
| 6477 | } | 6640 | } |
| 6478 | } | 6641 | } |
| 6479 | } | 6642 | } |
| @@ -6487,16 +6650,28 @@ detect_coding (struct coding_system *coding) | |||
| 6487 | = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom); | 6650 | = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom); |
| 6488 | detect_info.found = detect_info.rejected = 0; | 6651 | detect_info.found = detect_info.rejected = 0; |
| 6489 | coding->head_ascii = 0; | 6652 | coding->head_ascii = 0; |
| 6490 | coding->eol_seen = EOL_SEEN_NONE; | ||
| 6491 | if (CONSP (coding_systems) | 6653 | if (CONSP (coding_systems) |
| 6492 | && detect_coding_utf_16 (coding, &detect_info)) | 6654 | && detect_coding_utf_16 (coding, &detect_info)) |
| 6493 | { | 6655 | { |
| 6494 | if (detect_info.found & CATEGORY_MASK_UTF_16_LE) | 6656 | if (detect_info.found & CATEGORY_MASK_UTF_16_LE) |
| 6495 | setup_coding_system (XCAR (coding_systems), coding); | 6657 | found = XCAR (coding_systems); |
| 6496 | else if (detect_info.found & CATEGORY_MASK_UTF_16_BE) | 6658 | else if (detect_info.found & CATEGORY_MASK_UTF_16_BE) |
| 6497 | setup_coding_system (XCDR (coding_systems), coding); | 6659 | found = XCDR (coding_systems); |
| 6498 | } | 6660 | } |
| 6499 | } | 6661 | } |
| 6662 | |||
| 6663 | if (! NILP (found)) | ||
| 6664 | { | ||
| 6665 | int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE | ||
| 6666 | : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF | ||
| 6667 | : EQ (eol_type, Qmac) ? EOL_SEEN_CR | ||
| 6668 | : EOL_SEEN_LF); | ||
| 6669 | |||
| 6670 | setup_coding_system (found, coding); | ||
| 6671 | if (specified_eol != EOL_SEEN_NONE) | ||
| 6672 | adjust_coding_eol_type (coding, specified_eol); | ||
| 6673 | } | ||
| 6674 | |||
| 6500 | coding->mode = saved_mode; | 6675 | coding->mode = saved_mode; |
| 6501 | } | 6676 | } |
| 6502 | 6677 | ||
| @@ -7617,19 +7792,55 @@ decode_coding_gap (struct coding_system *coding, | |||
| 7617 | coding->dst_pos_byte = PT_BYTE; | 7792 | coding->dst_pos_byte = PT_BYTE; |
| 7618 | coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters)); | 7793 | coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters)); |
| 7619 | 7794 | ||
| 7795 | coding->head_ascii = -1; | ||
| 7796 | coding->detected_utf8_chars = -1; | ||
| 7797 | coding->eol_seen = EOL_SEEN_NONE; | ||
| 7620 | if (CODING_REQUIRE_DETECTION (coding)) | 7798 | if (CODING_REQUIRE_DETECTION (coding)) |
| 7621 | detect_coding (coding); | 7799 | detect_coding (coding); |
| 7622 | attrs = CODING_ID_ATTRS (coding->id); | 7800 | attrs = CODING_ID_ATTRS (coding->id); |
| 7623 | if (! disable_ascii_optimization) | 7801 | if (! disable_ascii_optimization |
| 7624 | { | 7802 | && ! coding->src_multibyte |
| 7625 | if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)) | 7803 | && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)) |
| 7626 | && NILP (CODING_ATTR_POST_READ (attrs)) | 7804 | && NILP (CODING_ATTR_POST_READ (attrs)) |
| 7627 | && NILP (get_translation_table (attrs, 0, NULL)) | 7805 | && NILP (get_translation_table (attrs, 0, NULL))) |
| 7628 | && (coding->head_ascii >= 0 /* We've already called detect_coding */ | 7806 | { |
| 7629 | ? coding->head_ascii == bytes | 7807 | chars = coding->head_ascii; |
| 7630 | : detect_ascii (coding))) | 7808 | if (chars < 0) |
| 7809 | chars = check_ascii (coding); | ||
| 7810 | if (chars != bytes) | ||
| 7811 | { | ||
| 7812 | /* There exists a non-ASCII byte. */ | ||
| 7813 | if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8)) | ||
| 7814 | { | ||
| 7815 | if (coding->detected_utf8_chars >= 0) | ||
| 7816 | chars = coding->detected_utf8_chars; | ||
| 7817 | else | ||
| 7818 | chars = check_utf_8 (coding); | ||
| 7819 | if (CODING_UTF_8_BOM (coding) != utf_without_bom | ||
| 7820 | && coding->head_ascii == 0 | ||
| 7821 | && coding->source[0] == UTF_8_BOM_1 | ||
| 7822 | && coding->source[1] == UTF_8_BOM_2 | ||
| 7823 | && coding->source[2] == UTF_8_BOM_3) | ||
| 7824 | { | ||
| 7825 | chars--; | ||
| 7826 | bytes -= 3; | ||
| 7827 | coding->src_bytes -= 3; | ||
| 7828 | } | ||
| 7829 | } | ||
| 7830 | else | ||
| 7831 | chars = -1; | ||
| 7832 | } | ||
| 7833 | if (chars >= 0) | ||
| 7631 | { | 7834 | { |
| 7632 | if (coding->eol_seen == EOL_SEEN_CR) | 7835 | Lisp_Object eol_type; |
| 7836 | |||
| 7837 | eol_type = CODING_ID_EOL_TYPE (coding->id); | ||
| 7838 | if (VECTORP (eol_type)) | ||
| 7839 | { | ||
| 7840 | if (coding->eol_seen != EOL_SEEN_NONE) | ||
| 7841 | eol_type = adjust_coding_eol_type (coding, coding->eol_seen); | ||
| 7842 | } | ||
| 7843 | if (EQ (eol_type, Qmac)) | ||
| 7633 | { | 7844 | { |
| 7634 | unsigned char *src_end = GAP_END_ADDR; | 7845 | unsigned char *src_end = GAP_END_ADDR; |
| 7635 | unsigned char *src = src_end - coding->src_bytes; | 7846 | unsigned char *src = src_end - coding->src_bytes; |
| @@ -7640,22 +7851,26 @@ decode_coding_gap (struct coding_system *coding, | |||
| 7640 | src[-1] = '\n'; | 7851 | src[-1] = '\n'; |
| 7641 | } | 7852 | } |
| 7642 | } | 7853 | } |
| 7643 | else if (coding->eol_seen == EOL_SEEN_CRLF) | 7854 | else if (EQ (eol_type, Qdos)) |
| 7644 | { | 7855 | { |
| 7645 | unsigned char *src = GAP_END_ADDR; | 7856 | unsigned char *src = GAP_END_ADDR; |
| 7646 | unsigned char *src_beg = src - coding->src_bytes; | 7857 | unsigned char *src_beg = src - coding->src_bytes; |
| 7647 | unsigned char *dst = src; | 7858 | unsigned char *dst = src; |
| 7859 | ptrdiff_t diff; | ||
| 7648 | 7860 | ||
| 7649 | while (src_beg < src) | 7861 | while (src_beg < src) |
| 7650 | { | 7862 | { |
| 7651 | *--dst = *--src; | 7863 | *--dst = *--src; |
| 7652 | if (*src == '\n') | 7864 | if (*src == '\n' && src > src_beg && src[-1] == '\r') |
| 7653 | src--; | 7865 | src--; |
| 7654 | } | 7866 | } |
| 7655 | bytes -= dst - src; | 7867 | diff = dst - src; |
| 7868 | bytes -= diff; | ||
| 7869 | chars -= diff; | ||
| 7656 | } | 7870 | } |
| 7657 | coding->produced_char = coding->produced = bytes; | 7871 | coding->produced = bytes; |
| 7658 | insert_from_gap (bytes, bytes, 1); | 7872 | coding->produced_char = chars; |
| 7873 | insert_from_gap (chars, bytes, 1); | ||
| 7659 | return; | 7874 | return; |
| 7660 | } | 7875 | } |
| 7661 | } | 7876 | } |