aboutsummaryrefslogtreecommitdiffstats
path: root/src/coding.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/coding.c')
-rw-r--r--src/coding.c409
1 files changed, 312 insertions, 97 deletions
diff --git a/src/coding.c b/src/coding.c
index 6cfcec905a1..42fd81b6322 100644
--- a/src/coding.c
+++ b/src/coding.c
@@ -1125,6 +1125,14 @@ alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1125 *buf++ = id; \ 1125 *buf++ = id; \
1126 } while (0) 1126 } while (0)
1127 1127
1128
1129/* Bitmasks for coding->eol_seen. */
1130
1131#define EOL_SEEN_NONE 0
1132#define EOL_SEEN_LF 1
1133#define EOL_SEEN_CR 2
1134#define EOL_SEEN_CRLF 4
1135
1128 1136
1129/*** 2. Emacs' internal format (emacs-utf-8) ***/ 1137/*** 2. Emacs' internal format (emacs-utf-8) ***/
1130 1138
@@ -1147,6 +1155,9 @@ alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1147#define UTF_8_BOM_2 0xBB 1155#define UTF_8_BOM_2 0xBB
1148#define UTF_8_BOM_3 0xBF 1156#define UTF_8_BOM_3 0xBF
1149 1157
1158/* Unlike the other detect_coding_XXX, this function counts the number of
1159 characters and checks the EOL format. */
1160
1150static bool 1161static bool
1151detect_coding_utf_8 (struct coding_system *coding, 1162detect_coding_utf_8 (struct coding_system *coding,
1152 struct coding_detection_info *detect_info) 1163 struct coding_detection_info *detect_info)
@@ -1156,11 +1167,23 @@ detect_coding_utf_8 (struct coding_system *coding,
1156 bool multibytep = coding->src_multibyte; 1167 bool multibytep = coding->src_multibyte;
1157 ptrdiff_t consumed_chars = 0; 1168 ptrdiff_t consumed_chars = 0;
1158 bool bom_found = 0; 1169 bool bom_found = 0;
1159 bool found = 0; 1170 int nchars = coding->head_ascii;
1171 int eol_seen = coding->eol_seen;
1160 1172
1161 detect_info->checked |= CATEGORY_MASK_UTF_8; 1173 detect_info->checked |= CATEGORY_MASK_UTF_8;
1162 /* A coding system of this category is always ASCII compatible. */ 1174 /* A coding system of this category is always ASCII compatible. */
1163 src += coding->head_ascii; 1175 src += nchars;
1176
1177 if (src == coding->source /* BOM should be at the head. */
1178 && src + 3 < src_end /* BOM is 3-byte long. */
1179 && src[0] == UTF_8_BOM_1
1180 && src[1] == UTF_8_BOM_2
1181 && src[2] == UTF_8_BOM_3)
1182 {
1183 bom_found = 1;
1184 src += 3;
1185 nchars++;
1186 }
1164 1187
1165 while (1) 1188 while (1)
1166 { 1189 {
@@ -1169,13 +1192,29 @@ detect_coding_utf_8 (struct coding_system *coding,
1169 src_base = src; 1192 src_base = src;
1170 ONE_MORE_BYTE (c); 1193 ONE_MORE_BYTE (c);
1171 if (c < 0 || UTF_8_1_OCTET_P (c)) 1194 if (c < 0 || UTF_8_1_OCTET_P (c))
1172 continue; 1195 {
1196 nchars++;
1197 if (c == '\r')
1198 {
1199 if (src < src_end && *src == '\n')
1200 {
1201 eol_seen |= EOL_SEEN_CRLF;
1202 src++;
1203 nchars++;
1204 }
1205 else
1206 eol_seen |= EOL_SEEN_CR;
1207 }
1208 else if (c == '\n')
1209 eol_seen |= EOL_SEEN_LF;
1210 continue;
1211 }
1173 ONE_MORE_BYTE (c1); 1212 ONE_MORE_BYTE (c1);
1174 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1)) 1213 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1175 break; 1214 break;
1176 if (UTF_8_2_OCTET_LEADING_P (c)) 1215 if (UTF_8_2_OCTET_LEADING_P (c))
1177 { 1216 {
1178 found = 1; 1217 nchars++;
1179 continue; 1218 continue;
1180 } 1219 }
1181 ONE_MORE_BYTE (c2); 1220 ONE_MORE_BYTE (c2);
@@ -1183,10 +1222,7 @@ detect_coding_utf_8 (struct coding_system *coding,
1183 break; 1222 break;
1184 if (UTF_8_3_OCTET_LEADING_P (c)) 1223 if (UTF_8_3_OCTET_LEADING_P (c))
1185 { 1224 {
1186 found = 1; 1225 nchars++;
1187 if (src_base == coding->source
1188 && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1189 bom_found = 1;
1190 continue; 1226 continue;
1191 } 1227 }
1192 ONE_MORE_BYTE (c3); 1228 ONE_MORE_BYTE (c3);
@@ -1194,7 +1230,7 @@ detect_coding_utf_8 (struct coding_system *coding,
1194 break; 1230 break;
1195 if (UTF_8_4_OCTET_LEADING_P (c)) 1231 if (UTF_8_4_OCTET_LEADING_P (c))
1196 { 1232 {
1197 found = 1; 1233 nchars++;
1198 continue; 1234 continue;
1199 } 1235 }
1200 ONE_MORE_BYTE (c4); 1236 ONE_MORE_BYTE (c4);
@@ -1202,7 +1238,7 @@ detect_coding_utf_8 (struct coding_system *coding,
1202 break; 1238 break;
1203 if (UTF_8_5_OCTET_LEADING_P (c)) 1239 if (UTF_8_5_OCTET_LEADING_P (c))
1204 { 1240 {
1205 found = 1; 1241 nchars++;
1206 continue; 1242 continue;
1207 } 1243 }
1208 break; 1244 break;
@@ -1219,14 +1255,17 @@ detect_coding_utf_8 (struct coding_system *coding,
1219 if (bom_found) 1255 if (bom_found)
1220 { 1256 {
1221 /* The first character 0xFFFE doesn't necessarily mean a BOM. */ 1257 /* The first character 0xFFFE doesn't necessarily mean a BOM. */
1222 detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG; 1258 detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1223 } 1259 }
1224 else 1260 else
1225 { 1261 {
1226 detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG; 1262 detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1227 if (found) 1263 if (nchars < src_end - coding->source)
1228 detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG; 1264 /* The found characters are less than source bytes, which
1265 means that we found a valid non-ASCII character. */
1266 detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
1229 } 1267 }
1268 coding->detected_utf8_chars = nchars;
1230 return 1; 1269 return 1;
1231} 1270}
1232 1271
@@ -3887,6 +3926,14 @@ decode_coding_iso_2022 (struct coding_system *coding)
3887 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c); 3926 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3888 char_offset++; 3927 char_offset++;
3889 coding->errors++; 3928 coding->errors++;
3929 /* Reset the invocation and designation status to the safest
3930 one; i.e. designate ASCII to the graphic register 0, and
3931 invoke that register to the graphic plane 0. This typically
3932 helps the case that a designation sequence for ASCII "ESC (
3933 B" is somehow broken (e.g. broken by a newline). */
3934 CODING_ISO_INVOCATION (coding, 0) = 0;
3935 CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
3936 charset_id_0 = charset_ascii;
3890 continue; 3937 continue;
3891 3938
3892 break_loop: 3939 break_loop:
@@ -5614,7 +5661,6 @@ setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5614 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id); 5661 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5615 5662
5616 coding->mode = 0; 5663 coding->mode = 0;
5617 coding->head_ascii = -1;
5618 if (VECTORP (eol_type)) 5664 if (VECTORP (eol_type))
5619 coding->common_flags = (CODING_REQUIRE_DECODING_MASK 5665 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5620 | CODING_REQUIRE_DETECTION_MASK); 5666 | CODING_REQUIRE_DETECTION_MASK);
@@ -6066,51 +6112,40 @@ complement_process_encoding_system (Lisp_Object coding_system)
6066 6112
6067*/ 6113*/
6068 6114
6069#define EOL_SEEN_NONE 0 6115static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6070#define EOL_SEEN_LF 1 6116 int eol_seen);
6071#define EOL_SEEN_CR 2
6072#define EOL_SEEN_CRLF 4
6073 6117
6074 6118
6075static Lisp_Object adjust_coding_eol_type (struct coding_system *coding, int eol_seen); 6119/* Return the number of ASCII characters at the head of the source.
6120 By side effects, set coding->head_ascii and update
6121 coding->eol_seen. The value of coding->eol_seen is "logical or" of
6122 EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6123 reliable only when all the source bytes are ASCII. */
6076 6124
6077 6125static int
6078/* Return 1 if all the source bytes are ASCII, and return 0 otherwize. 6126check_ascii (struct coding_system *coding)
6079 By side effects, set coding->head_ascii and coding->eol_seen. The
6080 value of coding->eol_seen is "logical or" of EOL_SEEN_LF,
6081 EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is reliable only when
6082 all the source bytes are ASCII. */
6083
6084static bool
6085detect_ascii (struct coding_system *coding)
6086{ 6127{
6087 const unsigned char *src, *end; 6128 const unsigned char *src, *end;
6088 Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id); 6129 Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6089 int eol_seen; 6130 int eol_seen = coding->eol_seen;
6090 6131
6091 eol_seen = (VECTORP (eol_type) ? EOL_SEEN_NONE
6092 : EQ (eol_type, Qunix) ? EOL_SEEN_LF
6093 : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6094 : EOL_SEEN_CR);
6095 coding_set_source (coding); 6132 coding_set_source (coding);
6096 src = coding->source; 6133 src = coding->source;
6097 end = src + coding->src_bytes; 6134 end = src + coding->src_bytes;
6098 6135
6099 if (inhibit_eol_conversion) 6136 if (inhibit_eol_conversion
6137 || SYMBOLP (eol_type))
6100 { 6138 {
6101 /* We don't have to check EOL format. */ 6139 /* We don't have to check EOL format. */
6102 while (src < end && !( *src & 0x80)) src++; 6140 while (src < end && !( *src & 0x80))
6103 eol_seen = EOL_SEEN_LF; 6141 {
6104 adjust_coding_eol_type (coding, eol_seen); 6142 if (*src++ == '\n')
6105 } 6143 eol_seen |= EOL_SEEN_LF;
6106 else if (eol_seen != EOL_SEEN_NONE) 6144 }
6107 {
6108 /* We don't have to check EOL format either. */
6109 while (src < end && !(*src & 0x80)) src++;
6110 } 6145 }
6111 else 6146 else
6112 { 6147 {
6113 end--; /* We look ahead one byte. */ 6148 end--; /* We look ahead one byte for "CR LF". */
6114 while (src < end) 6149 while (src < end)
6115 { 6150 {
6116 int c = *src; 6151 int c = *src;
@@ -6118,6 +6153,69 @@ detect_ascii (struct coding_system *coding)
6118 if (c & 0x80) 6153 if (c & 0x80)
6119 break; 6154 break;
6120 src++; 6155 src++;
6156 if (c == '\r')
6157 {
6158 if (*src == '\n')
6159 {
6160 eol_seen |= EOL_SEEN_CRLF;
6161 src++;
6162 }
6163 else
6164 eol_seen |= EOL_SEEN_CR;
6165 }
6166 else if (c == '\n')
6167 eol_seen |= EOL_SEEN_LF;
6168 }
6169 if (src == end)
6170 {
6171 int c = *src;
6172
6173 /* All bytes but the last one C are ASCII. */
6174 if (! (c & 0x80))
6175 {
6176 if (c == '\r')
6177 eol_seen |= EOL_SEEN_CR;
6178 else if (c == '\n')
6179 eol_seen |= EOL_SEEN_LF;
6180 src++;
6181 }
6182 }
6183 }
6184 coding->head_ascii = src - coding->source;
6185 coding->eol_seen = eol_seen;
6186 return (coding->head_ascii);
6187}
6188
6189
6190/* Return the number of characters at the source if all the bytes are
6191 valid UTF-8 (of Unicode range). Otherwise, return -1. By side
6192 effects, update coding->eol_seen. The value of coding->eol_seen is
6193 "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6194 the value is reliable only when all the source bytes are valid
6195 UTF-8. */
6196
6197static int
6198check_utf_8 (struct coding_system *coding)
6199{
6200 const unsigned char *src, *end;
6201 int eol_seen;
6202 int nchars = coding->head_ascii;
6203
6204 if (coding->head_ascii < 0)
6205 check_ascii (coding);
6206 else
6207 coding_set_source (coding);
6208 src = coding->source + coding->head_ascii;
6209 /* We look ahead one byte for CR LF. */
6210 end = coding->source + coding->src_bytes - 1;
6211 eol_seen = coding->eol_seen;
6212 while (src < end)
6213 {
6214 int c = *src;
6215
6216 if (UTF_8_1_OCTET_P (*src))
6217 {
6218 src++;
6121 if (c < 0x20) 6219 if (c < 0x20)
6122 { 6220 {
6123 if (c == '\r') 6221 if (c == '\r')
@@ -6126,6 +6224,7 @@ detect_ascii (struct coding_system *coding)
6126 { 6224 {
6127 eol_seen |= EOL_SEEN_CRLF; 6225 eol_seen |= EOL_SEEN_CRLF;
6128 src++; 6226 src++;
6227 nchars++;
6129 } 6228 }
6130 else 6229 else
6131 eol_seen |= EOL_SEEN_CR; 6230 eol_seen |= EOL_SEEN_CR;
@@ -6134,27 +6233,58 @@ detect_ascii (struct coding_system *coding)
6134 eol_seen |= EOL_SEEN_LF; 6233 eol_seen |= EOL_SEEN_LF;
6135 } 6234 }
6136 } 6235 }
6137 if (src > end) 6236 else if (UTF_8_2_OCTET_LEADING_P (c))
6138 /* The last two bytes are CR LF, which means that we have
6139 scanned all bytes. */
6140 end++;
6141 else if (src == end)
6142 { 6237 {
6143 end++; 6238 if (c < 0xC2 /* overlong sequence */
6144 if (! (*src & 0x80)) 6239 || src + 1 >= end
6145 { 6240 || ! UTF_8_EXTRA_OCTET_P (src[1]))
6146 if (*src == '\r') 6241 return -1;
6147 eol_seen |= EOL_SEEN_CR; 6242 src += 2;
6148 else if (*src == '\n')
6149 eol_seen |= EOL_SEEN_LF;
6150 src++;
6151 }
6152 } 6243 }
6153 adjust_coding_eol_type (coding, eol_seen); 6244 else if (UTF_8_3_OCTET_LEADING_P (c))
6245 {
6246 if (src + 2 >= end
6247 || ! (UTF_8_EXTRA_OCTET_P (src[1])
6248 && UTF_8_EXTRA_OCTET_P (src[2])))
6249 return -1;
6250 c = (((c & 0xF) << 12)
6251 | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6252 if (c < 0x800 /* overlong sequence */
6253 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6254 return -1;
6255 src += 3;
6256 }
6257 else if (UTF_8_4_OCTET_LEADING_P (c))
6258 {
6259 if (src + 3 >= end
6260 || ! (UTF_8_EXTRA_OCTET_P (src[1])
6261 && UTF_8_EXTRA_OCTET_P (src[2])
6262 && UTF_8_EXTRA_OCTET_P (src[3])))
6263 return -1;
6264 c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6265 | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6266 if (c < 0x10000 /* overlong sequence */
6267 || c >= 0x110000) /* non-Unicode character */
6268 return -1;
6269 src += 4;
6270 }
6271 else
6272 return -1;
6273 nchars++;
6274 }
6275
6276 if (src == end)
6277 {
6278 if (! UTF_8_1_OCTET_P (*src))
6279 return -1;
6280 nchars++;
6281 if (*src == '\r')
6282 eol_seen |= EOL_SEEN_CR;
6283 else if (*src == '\n')
6284 eol_seen |= EOL_SEEN_LF;
6154 } 6285 }
6155 coding->head_ascii = src - coding->source;
6156 coding->eol_seen = eol_seen; 6286 coding->eol_seen = eol_seen;
6157 return (src == end); 6287 return nchars;
6158} 6288}
6159 6289
6160 6290
@@ -6269,6 +6399,9 @@ adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6269 Lisp_Object eol_type; 6399 Lisp_Object eol_type;
6270 6400
6271 eol_type = CODING_ID_EOL_TYPE (coding->id); 6401 eol_type = CODING_ID_EOL_TYPE (coding->id);
6402 if (! VECTORP (eol_type))
6403 /* Already adjusted. */
6404 return eol_type;
6272 if (eol_seen & EOL_SEEN_LF) 6405 if (eol_seen & EOL_SEEN_LF)
6273 { 6406 {
6274 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0)); 6407 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
@@ -6296,6 +6429,8 @@ detect_coding (struct coding_system *coding)
6296{ 6429{
6297 const unsigned char *src, *src_end; 6430 const unsigned char *src, *src_end;
6298 unsigned int saved_mode = coding->mode; 6431 unsigned int saved_mode = coding->mode;
6432 Lisp_Object found = Qnil;
6433 Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6299 6434
6300 coding->consumed = coding->consumed_char = 0; 6435 coding->consumed = coding->consumed_char = 0;
6301 coding->produced = coding->produced_char = 0; 6436 coding->produced = coding->produced_char = 0;
@@ -6303,6 +6438,7 @@ detect_coding (struct coding_system *coding)
6303 6438
6304 src_end = coding->source + coding->src_bytes; 6439 src_end = coding->source + coding->src_bytes;
6305 6440
6441 coding->eol_seen = EOL_SEEN_NONE;
6306 /* If we have not yet decided the text encoding type, detect it 6442 /* If we have not yet decided the text encoding type, detect it
6307 now. */ 6443 now. */
6308 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided)) 6444 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
@@ -6312,7 +6448,6 @@ detect_coding (struct coding_system *coding)
6312 bool null_byte_found = 0, eight_bit_found = 0; 6448 bool null_byte_found = 0, eight_bit_found = 0;
6313 6449
6314 coding->head_ascii = 0; 6450 coding->head_ascii = 0;
6315 coding->eol_seen = EOL_SEEN_NONE;
6316 detect_info.checked = detect_info.found = detect_info.rejected = 0; 6451 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6317 for (src = coding->source; src < src_end; src++) 6452 for (src = coding->source; src < src_end; src++)
6318 { 6453 {
@@ -6360,7 +6495,8 @@ detect_coding (struct coding_system *coding)
6360 { 6495 {
6361 coding->eol_seen |= EOL_SEEN_CRLF; 6496 coding->eol_seen |= EOL_SEEN_CRLF;
6362 src++; 6497 src++;
6363 coding->head_ascii++; 6498 if (! eight_bit_found)
6499 coding->head_ascii++;
6364 } 6500 }
6365 else 6501 else
6366 coding->eol_seen |= EOL_SEEN_CR; 6502 coding->eol_seen |= EOL_SEEN_CR;
@@ -6422,32 +6558,58 @@ detect_coding (struct coding_system *coding)
6422 } 6558 }
6423 else if ((*(this->detector)) (coding, &detect_info) 6559 else if ((*(this->detector)) (coding, &detect_info)
6424 && detect_info.found & (1 << category)) 6560 && detect_info.found & (1 << category))
6425 { 6561 break;
6426 if (category == coding_category_utf_16_auto)
6427 {
6428 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6429 category = coding_category_utf_16_le;
6430 else
6431 category = coding_category_utf_16_be;
6432 }
6433 break;
6434 }
6435 } 6562 }
6436 } 6563 }
6437 6564
6438 if (i < coding_category_raw_text) 6565 if (i < coding_category_raw_text)
6439 setup_coding_system (CODING_ID_NAME (this->id), coding); 6566 {
6567 if (category == coding_category_utf_8_auto)
6568 {
6569 Lisp_Object coding_systems;
6570
6571 coding_systems = AREF (CODING_ID_ATTRS (this->id),
6572 coding_attr_utf_bom);
6573 if (CONSP (coding_systems))
6574 {
6575 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6576 found = XCAR (coding_systems);
6577 else
6578 found = XCDR (coding_systems);
6579 }
6580 else
6581 found = CODING_ID_NAME (this->id);
6582 }
6583 else if (category == coding_category_utf_16_auto)
6584 {
6585 Lisp_Object coding_systems;
6586
6587 coding_systems = AREF (CODING_ID_ATTRS (this->id),
6588 coding_attr_utf_bom);
6589 if (CONSP (coding_systems))
6590 {
6591 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6592 found = XCAR (coding_systems);
6593 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6594 found = XCDR (coding_systems);
6595 }
6596 else
6597 found = CODING_ID_NAME (this->id);
6598 }
6599 else
6600 found = CODING_ID_NAME (this->id);
6601 }
6440 else if (null_byte_found) 6602 else if (null_byte_found)
6441 setup_coding_system (Qno_conversion, coding); 6603 found = Qno_conversion;
6442 else if ((detect_info.rejected & CATEGORY_MASK_ANY) 6604 else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6443 == CATEGORY_MASK_ANY) 6605 == CATEGORY_MASK_ANY)
6444 setup_coding_system (Qraw_text, coding); 6606 found = Qraw_text;
6445 else if (detect_info.rejected) 6607 else if (detect_info.rejected)
6446 for (i = 0; i < coding_category_raw_text; i++) 6608 for (i = 0; i < coding_category_raw_text; i++)
6447 if (! (detect_info.rejected & (1 << coding_priorities[i]))) 6609 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6448 { 6610 {
6449 this = coding_categories + coding_priorities[i]; 6611 this = coding_categories + coding_priorities[i];
6450 setup_coding_system (CODING_ID_NAME (this->id), coding); 6612 found = CODING_ID_NAME (this->id);
6451 break; 6613 break;
6452 } 6614 }
6453 } 6615 }
@@ -6461,9 +6623,10 @@ detect_coding (struct coding_system *coding)
6461 coding_systems 6623 coding_systems
6462 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom); 6624 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6463 detect_info.found = detect_info.rejected = 0; 6625 detect_info.found = detect_info.rejected = 0;
6464 if (detect_ascii (coding)) 6626 if (check_ascii (coding) == coding->src_bytes)
6465 { 6627 {
6466 setup_coding_system (XCDR (coding_systems), coding); 6628 if (CONSP (coding_systems))
6629 found = XCDR (coding_systems);
6467 } 6630 }
6468 else 6631 else
6469 { 6632 {
@@ -6471,9 +6634,9 @@ detect_coding (struct coding_system *coding)
6471 && detect_coding_utf_8 (coding, &detect_info)) 6634 && detect_coding_utf_8 (coding, &detect_info))
6472 { 6635 {
6473 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG) 6636 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6474 setup_coding_system (XCAR (coding_systems), coding); 6637 found = XCAR (coding_systems);
6475 else 6638 else
6476 setup_coding_system (XCDR (coding_systems), coding); 6639 found = XCDR (coding_systems);
6477 } 6640 }
6478 } 6641 }
6479 } 6642 }
@@ -6487,16 +6650,28 @@ detect_coding (struct coding_system *coding)
6487 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom); 6650 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6488 detect_info.found = detect_info.rejected = 0; 6651 detect_info.found = detect_info.rejected = 0;
6489 coding->head_ascii = 0; 6652 coding->head_ascii = 0;
6490 coding->eol_seen = EOL_SEEN_NONE;
6491 if (CONSP (coding_systems) 6653 if (CONSP (coding_systems)
6492 && detect_coding_utf_16 (coding, &detect_info)) 6654 && detect_coding_utf_16 (coding, &detect_info))
6493 { 6655 {
6494 if (detect_info.found & CATEGORY_MASK_UTF_16_LE) 6656 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6495 setup_coding_system (XCAR (coding_systems), coding); 6657 found = XCAR (coding_systems);
6496 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE) 6658 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6497 setup_coding_system (XCDR (coding_systems), coding); 6659 found = XCDR (coding_systems);
6498 } 6660 }
6499 } 6661 }
6662
6663 if (! NILP (found))
6664 {
6665 int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6666 : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6667 : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6668 : EOL_SEEN_LF);
6669
6670 setup_coding_system (found, coding);
6671 if (specified_eol != EOL_SEEN_NONE)
6672 adjust_coding_eol_type (coding, specified_eol);
6673 }
6674
6500 coding->mode = saved_mode; 6675 coding->mode = saved_mode;
6501} 6676}
6502 6677
@@ -7617,19 +7792,55 @@ decode_coding_gap (struct coding_system *coding,
7617 coding->dst_pos_byte = PT_BYTE; 7792 coding->dst_pos_byte = PT_BYTE;
7618 coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters)); 7793 coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7619 7794
7795 coding->head_ascii = -1;
7796 coding->detected_utf8_chars = -1;
7797 coding->eol_seen = EOL_SEEN_NONE;
7620 if (CODING_REQUIRE_DETECTION (coding)) 7798 if (CODING_REQUIRE_DETECTION (coding))
7621 detect_coding (coding); 7799 detect_coding (coding);
7622 attrs = CODING_ID_ATTRS (coding->id); 7800 attrs = CODING_ID_ATTRS (coding->id);
7623 if (! disable_ascii_optimization) 7801 if (! disable_ascii_optimization
7624 { 7802 && ! coding->src_multibyte
7625 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)) 7803 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7626 && NILP (CODING_ATTR_POST_READ (attrs)) 7804 && NILP (CODING_ATTR_POST_READ (attrs))
7627 && NILP (get_translation_table (attrs, 0, NULL)) 7805 && NILP (get_translation_table (attrs, 0, NULL)))
7628 && (coding->head_ascii >= 0 /* We've already called detect_coding */ 7806 {
7629 ? coding->head_ascii == bytes 7807 chars = coding->head_ascii;
7630 : detect_ascii (coding))) 7808 if (chars < 0)
7809 chars = check_ascii (coding);
7810 if (chars != bytes)
7811 {
7812 /* There exists a non-ASCII byte. */
7813 if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8))
7814 {
7815 if (coding->detected_utf8_chars >= 0)
7816 chars = coding->detected_utf8_chars;
7817 else
7818 chars = check_utf_8 (coding);
7819 if (CODING_UTF_8_BOM (coding) != utf_without_bom
7820 && coding->head_ascii == 0
7821 && coding->source[0] == UTF_8_BOM_1
7822 && coding->source[1] == UTF_8_BOM_2
7823 && coding->source[2] == UTF_8_BOM_3)
7824 {
7825 chars--;
7826 bytes -= 3;
7827 coding->src_bytes -= 3;
7828 }
7829 }
7830 else
7831 chars = -1;
7832 }
7833 if (chars >= 0)
7631 { 7834 {
7632 if (coding->eol_seen == EOL_SEEN_CR) 7835 Lisp_Object eol_type;
7836
7837 eol_type = CODING_ID_EOL_TYPE (coding->id);
7838 if (VECTORP (eol_type))
7839 {
7840 if (coding->eol_seen != EOL_SEEN_NONE)
7841 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
7842 }
7843 if (EQ (eol_type, Qmac))
7633 { 7844 {
7634 unsigned char *src_end = GAP_END_ADDR; 7845 unsigned char *src_end = GAP_END_ADDR;
7635 unsigned char *src = src_end - coding->src_bytes; 7846 unsigned char *src = src_end - coding->src_bytes;
@@ -7640,22 +7851,26 @@ decode_coding_gap (struct coding_system *coding,
7640 src[-1] = '\n'; 7851 src[-1] = '\n';
7641 } 7852 }
7642 } 7853 }
7643 else if (coding->eol_seen == EOL_SEEN_CRLF) 7854 else if (EQ (eol_type, Qdos))
7644 { 7855 {
7645 unsigned char *src = GAP_END_ADDR; 7856 unsigned char *src = GAP_END_ADDR;
7646 unsigned char *src_beg = src - coding->src_bytes; 7857 unsigned char *src_beg = src - coding->src_bytes;
7647 unsigned char *dst = src; 7858 unsigned char *dst = src;
7859 ptrdiff_t diff;
7648 7860
7649 while (src_beg < src) 7861 while (src_beg < src)
7650 { 7862 {
7651 *--dst = *--src; 7863 *--dst = *--src;
7652 if (*src == '\n') 7864 if (*src == '\n' && src > src_beg && src[-1] == '\r')
7653 src--; 7865 src--;
7654 } 7866 }
7655 bytes -= dst - src; 7867 diff = dst - src;
7868 bytes -= diff;
7869 chars -= diff;
7656 } 7870 }
7657 coding->produced_char = coding->produced = bytes; 7871 coding->produced = bytes;
7658 insert_from_gap (bytes, bytes, 1); 7872 coding->produced_char = chars;
7873 insert_from_gap (chars, bytes, 1);
7659 return; 7874 return;
7660 } 7875 }
7661 } 7876 }