Fix the setting of buffer-file-coding-system on, for instance, C-x RET c unix RET _FILE_OF_DOS_EOL_TYPE_ RET.

author: Kenichi Handa 2013-05-22 23:53:21 +0900
committer: Kenichi Handa 2013-05-22 23:53:21 +0900
commit: e6d2f1553635a746396f2f4261dde31e03e0fdd1 (patch)
tree: 00882ebfc0d82b37593f64bee4aee51c49b5f19b /src/coding.c
parent: 59c886717271b57d661027685d203a3dd5cfafa7 (diff)
download: emacs-e6d2f1553635a746396f2f4261dde31e03e0fdd1.tar.gz
emacs-e6d2f1553635a746396f2f4261dde31e03e0fdd1.zip
1 files changed, 154 insertions, 67 deletions
diff --git a/src/coding.c b/src/coding.c
index f6664e179b7..42fd81b6322 100644
--- a/src/coding.c
+++ b/src/coding.c
@@ -1125,6 +1125,14 @@ alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
    *buf++ = id;                                                        \
  } while (0)
+/* Bitmasks for coding->eol_seen.  */
+#define EOL_SEEN_NONE   0
+#define EOL_SEEN_LF     1
+#define EOL_SEEN_CR     2
+#define EOL_SEEN_CRLF   4
 /*** 2. Emacs' internal format (emacs-utf-8) ***/
@@ -1147,6 +1155,9 @@ alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
 #define UTF_8_BOM_2 0xBB
 #define UTF_8_BOM_3 0xBF
+/* Unlike the other detect_coding_XXX, this function counts number of
+   characters and check EOL format.  */
 static bool
 detect_coding_utf_8 (struct coding_system *coding,
                     struct coding_detection_info *detect_info)
@@ -1156,11 +1167,23 @@ detect_coding_utf_8 (struct coding_system *coding,
  bool multibytep = coding->src_multibyte;
  ptrdiff_t consumed_chars = 0;
  bool bom_found = 0;
-  bool found = 0;
+  int nchars = coding->head_ascii;
+  int eol_seen = coding->eol_seen;
  detect_info->checked |= CATEGORY_MASK_UTF_8;
  /* A coding system of this category is always ASCII compatible.  */
-  src += coding->head_ascii;
+  src += nchars;
+  if (src == coding->source     /* BOM should be at the head.  */
+      && src + 3 < src_end      /* BOM is 3-byte long.  */
+      && src[0] == UTF_8_BOM_1
+      && src[1] == UTF_8_BOM_2
+      && src[2] == UTF_8_BOM_3)
+    {
+      bom_found = 1;
+      src += 3;
+      nchars++;
+    }
  while (1)
    {
@@ -1169,13 +1192,29 @@ detect_coding_utf_8 (struct coding_system *coding,
      src_base = src;
      ONE_MORE_BYTE (c);
      if (c < 0 || UTF_8_1_OCTET_P (c))
-        continue;
+        {
+          nchars++;
+          if (c == '\r')
+            {
+              if (src < src_end && *src == '\n')
+                {
+                  eol_seen |= EOL_SEEN_CRLF;
+                  src++;
+                  nchars++;
+                }
+              else
+                eol_seen |= EOL_SEEN_CR;
+            }
+          else if (c == '\n')
+            eol_seen |= EOL_SEEN_LF;
+          continue;
+        }
      ONE_MORE_BYTE (c1);
      if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
        break;
      if (UTF_8_2_OCTET_LEADING_P (c))
        {
-          found = 1;
+          nchars++;
          continue;
        }
      ONE_MORE_BYTE (c2);
@@ -1183,10 +1222,7 @@ detect_coding_utf_8 (struct coding_system *coding,
        break;
      if (UTF_8_3_OCTET_LEADING_P (c))
        {
-          found = 1;
+          nchars++;
-          if (src_base == coding->source
-              && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
-            bom_found = 1;
          continue;
        }
      ONE_MORE_BYTE (c3);
@@ -1194,7 +1230,7 @@ detect_coding_utf_8 (struct coding_system *coding,
        break;
      if (UTF_8_4_OCTET_LEADING_P (c))
        {
-          found = 1;
+          nchars++;
          continue;
        }
      ONE_MORE_BYTE (c4);
@@ -1202,7 +1238,7 @@ detect_coding_utf_8 (struct coding_system *coding,
        break;
      if (UTF_8_5_OCTET_LEADING_P (c))
        {
-          found = 1;
+          nchars++;
          continue;
        }
      break;
@@ -1219,14 +1255,17 @@ detect_coding_utf_8 (struct coding_system *coding,
  if (bom_found)
    {
      /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
-      detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
+      detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
    }
  else
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
-      if (found)
+      if (nchars < src_end - coding->source)
-        detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
+        /* The found characters are less than source bytes, which
+           means that we found a valid non-ASCII characters.  */
+        detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
    }
+  coding->detected_utf8_chars = nchars;
  return 1;
 }
@@ -5622,7 +5661,6 @@ setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
  eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
  coding->mode = 0;
-  coding->head_ascii = -1;
  if (VECTORP (eol_type))
    coding->common_flags = (CODING_REQUIRE_DECODING_MASK
                            | CODING_REQUIRE_DETECTION_MASK);
@@ -6074,46 +6112,35 @@ complement_process_encoding_system (Lisp_Object coding_system)
 */
-#define EOL_SEEN_NONE   0
-#define EOL_SEEN_LF     1
-#define EOL_SEEN_CR     2
-#define EOL_SEEN_CRLF   4
 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
                                           int eol_seen);
 /* Return the number of ASCII characters at the head of the source.
-   By side effects, set coding->head_ascii and coding->eol_seen.  The
+   By side effects, set coding->head_ascii and update
-   value of coding->eol_seen is "logical or" of EOL_SEEN_LF,
+   coding->eol_seen.  The value of coding->eol_seen is "logical or" of
-   EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is reliable only when
+   EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
-   all the source bytes are ASCII.  */
+   reliable only when all the source bytes are ASCII.  */
 static int
 check_ascii (struct coding_system *coding)
 {
  const unsigned char *src, *end;
  Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
-  int eol_seen;
+  int eol_seen = coding->eol_seen;
-  eol_seen = (VECTORP (eol_type) ? EOL_SEEN_NONE
-              : EQ (eol_type, Qunix) ? EOL_SEEN_LF
-              : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
-              : EOL_SEEN_CR);
  coding_set_source (coding);
  src = coding->source;
  end = src + coding->src_bytes;
  if (inhibit_eol_conversion
-      || eol_seen != EOL_SEEN_NONE)
+      || SYMBOLP (eol_type))
    {
      /* We don't have to check EOL format.  */
-      while (src < end && !( *src & 0x80)) src++;
+      while (src < end && !( *src & 0x80))
-      if (inhibit_eol_conversion)
        {
-          eol_seen = EOL_SEEN_LF;
+          if (*src++ == '\n')
-          adjust_coding_eol_type (coding, eol_seen);
+            eol_seen |= EOL_SEEN_LF;
        }
    }
  else
@@ -6171,7 +6198,7 @@ static int
 check_utf_8 (struct coding_system *coding)
 {
  const unsigned char *src, *end;
-  int eol_seen = coding->eol_seen;
+  int eol_seen;
  int nchars = coding->head_ascii;
  if (coding->head_ascii < 0)
@@ -6181,7 +6208,7 @@ check_utf_8 (struct coding_system *coding)
  src = coding->source + coding->head_ascii;
  /* We look ahead one byte for CR LF.  */
  end = coding->source + coding->src_bytes - 1;
+  eol_seen = coding->eol_seen;
  while (src < end)
    {
      int c = *src;
@@ -6402,6 +6429,8 @@ detect_coding (struct coding_system *coding)
 {
  const unsigned char *src, *src_end;
  unsigned int saved_mode = coding->mode;
+  Lisp_Object found = Qnil;
+  Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
  coding->consumed = coding->consumed_char = 0;
  coding->produced = coding->produced_char = 0;
@@ -6409,6 +6438,7 @@ detect_coding (struct coding_system *coding)
  src_end = coding->source + coding->src_bytes;
+  coding->eol_seen = EOL_SEEN_NONE;
  /* If we have not yet decided the text encoding type, detect it
     now.  */
  if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
@@ -6418,7 +6448,6 @@ detect_coding (struct coding_system *coding)
      bool null_byte_found = 0, eight_bit_found = 0;
      coding->head_ascii = 0;
-      coding->eol_seen = EOL_SEEN_NONE;
      detect_info.checked = detect_info.found = detect_info.rejected = 0;
      for (src = coding->source; src < src_end; src++)
        {
@@ -6529,32 +6558,58 @@ detect_coding (struct coding_system *coding)
                    }
                  else if ((*(this->detector)) (coding, &detect_info)
                           && detect_info.found & (1 << category))
-                    {
+                    break;
-                      if (category == coding_category_utf_16_auto)
-                        {
-                          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
-                            category = coding_category_utf_16_le;
-                          else
-                            category = coding_category_utf_16_be;
-                        }
-                      break;
-                    }
                }
            }
          if (i < coding_category_raw_text)
-            setup_coding_system (CODING_ID_NAME (this->id), coding);
+            {
+              if (category == coding_category_utf_8_auto)
+                {
+                  Lisp_Object coding_systems;
+                  coding_systems = AREF (CODING_ID_ATTRS (this->id),
+                                         coding_attr_utf_bom);
+                  if (CONSP (coding_systems))
+                    {
+                      if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
+                        found = XCAR (coding_systems);
+                      else
+                        found = XCDR (coding_systems);
+                    }
+                  else
+                    found = CODING_ID_NAME (this->id);
+                }
+              else if (category == coding_category_utf_16_auto)
+                {
+                  Lisp_Object coding_systems;
+                  coding_systems = AREF (CODING_ID_ATTRS (this->id),
+                                         coding_attr_utf_bom);
+                  if (CONSP (coding_systems))
+                    {
+                      if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
+                        found = XCAR (coding_systems);
+                      else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
+                        found = XCDR (coding_systems);
+                    }
+                  else
+                    found = CODING_ID_NAME (this->id);
+                }
+              else
+                found = CODING_ID_NAME (this->id);
+            }
          else if (null_byte_found)
-            setup_coding_system (Qno_conversion, coding);
+            found = Qno_conversion;
          else if ((detect_info.rejected & CATEGORY_MASK_ANY)
                   == CATEGORY_MASK_ANY)
-            setup_coding_system (Qraw_text, coding);
+            found = Qraw_text;
          else if (detect_info.rejected)
            for (i = 0; i < coding_category_raw_text; i++)
              if (! (detect_info.rejected & (1 << coding_priorities[i])))
                {
                  this = coding_categories + coding_priorities[i];
-                  setup_coding_system (CODING_ID_NAME (this->id), coding);
+                  found = CODING_ID_NAME (this->id);
                  break;
                }
        }
@@ -6570,12 +6625,8 @@ detect_coding (struct coding_system *coding)
      detect_info.found = detect_info.rejected = 0;
      if (check_ascii (coding) == coding->src_bytes)
        {
-          int head_ascii = coding->head_ascii;
+          if (CONSP (coding_systems))
+            found = XCDR (coding_systems);
-          if (coding->eol_seen != EOL_SEEN_NONE)
-            adjust_coding_eol_type (coding, coding->eol_seen);
-          setup_coding_system (XCDR (coding_systems), coding);
-          coding->head_ascii = head_ascii;
        }
      else
        {
@@ -6583,9 +6634,9 @@ detect_coding (struct coding_system *coding)
              && detect_coding_utf_8 (coding, &detect_info))
            {
              if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
-                setup_coding_system (XCAR (coding_systems), coding);
+                found = XCAR (coding_systems);
              else
-                setup_coding_system (XCDR (coding_systems), coding);
+                found = XCDR (coding_systems);
            }
        }
    }
@@ -6599,16 +6650,28 @@ detect_coding (struct coding_system *coding)
        = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
      detect_info.found = detect_info.rejected = 0;
      coding->head_ascii = 0;
-      coding->eol_seen = EOL_SEEN_NONE;
      if (CONSP (coding_systems)
          && detect_coding_utf_16 (coding, &detect_info))
        {
          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
-            setup_coding_system (XCAR (coding_systems), coding);
+            found = XCAR (coding_systems);
          else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
-            setup_coding_system (XCDR (coding_systems), coding);
+            found = XCDR (coding_systems);
        }
    }
+  if (! NILP (found))
+    {
+      int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
+                           : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
+                           : EQ (eol_type, Qmac) ? EOL_SEEN_CR
+                           : EOL_SEEN_LF);
+      setup_coding_system (found, coding);
+      if (specified_eol != EOL_SEEN_NONE)
+        adjust_coding_eol_type (coding, specified_eol);
+    }
  coding->mode = saved_mode;
 }
@@ -7729,6 +7792,9 @@ decode_coding_gap (struct coding_system *coding,
  coding->dst_pos_byte = PT_BYTE;
  coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
+  coding->head_ascii = -1;
+  coding->detected_utf8_chars = -1;
+  coding->eol_seen = EOL_SEEN_NONE;
  if (CODING_REQUIRE_DETECTION (coding))
    detect_coding (coding);
  attrs = CODING_ID_ATTRS (coding->id);
@@ -7743,17 +7809,38 @@ decode_coding_gap (struct coding_system *coding,
        chars = check_ascii (coding);
      if (chars != bytes)
        {
+          /* There exists a non-ASCII byte.  */
          if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8))
-            chars = check_utf_8 (coding);
+            {
+              if (coding->detected_utf8_chars >= 0)
+                chars = coding->detected_utf8_chars;
+              else
+                chars = check_utf_8 (coding);
+              if (CODING_UTF_8_BOM (coding) != utf_without_bom
+                  && coding->head_ascii == 0
+                  && coding->source[0] == UTF_8_BOM_1
+                  && coding->source[1] == UTF_8_BOM_2
+                  && coding->source[2] == UTF_8_BOM_3)
+                {
+                  chars--;
+                  bytes -= 3;
+                  coding->src_bytes -= 3;
+                }
+            }
          else
            chars = -1;
        }
      if (chars >= 0)
        {
-          if (coding->eol_seen != EOL_SEEN_NONE)
+          Lisp_Object eol_type;
-            adjust_coding_eol_type (coding, coding->eol_seen);
-          if (coding->eol_seen == EOL_SEEN_CR)
+          eol_type = CODING_ID_EOL_TYPE (coding->id);
+          if (VECTORP (eol_type))
+            {
+              if (coding->eol_seen != EOL_SEEN_NONE)
+                eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
+            }
+          if (EQ (eol_type, Qmac))
            {
              unsigned char *src_end = GAP_END_ADDR;
              unsigned char *src = src_end - coding->src_bytes;
@@ -7764,7 +7851,7 @@ decode_coding_gap (struct coding_system *coding,
                    src[-1] = '\n';
                }
            }
-          else if (coding->eol_seen == EOL_SEEN_CRLF)
+          else if (EQ (eol_type, Qdos))
            {
              unsigned char *src = GAP_END_ADDR;
              unsigned char *src_beg = src - coding->src_bytes;
author	Kenichi Handa	2013-05-22 23:53:21 +0900
committer	Kenichi Handa	2013-05-22 23:53:21 +0900
commit	e6d2f1553635a746396f2f4261dde31e03e0fdd1 (patch)
tree	00882ebfc0d82b37593f64bee4aee51c49b5f19b /src/coding.c
parent	59c886717271b57d661027685d203a3dd5cfafa7 (diff)
download	emacs-e6d2f1553635a746396f2f4261dde31e03e0fdd1.tar.gz emacs-e6d2f1553635a746396f2f4261dde31e03e0fdd1.zip

diff --git a/src/coding.c b/src/coding.c index f6664e179b7..42fd81b6322 100644 --- a/src/coding.c +++ b/src/coding.c
@@ -1125,6 +1125,14 @@ alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1125	*buf++ = id; \	1125	*buf++ = id; \
1126	} while (0)	1126	} while (0)
1127		1127
		1128
		1129	/* Bitmasks for coding->eol_seen. */
		1130
		1131	#define EOL_SEEN_NONE 0
		1132	#define EOL_SEEN_LF 1
		1133	#define EOL_SEEN_CR 2
		1134	#define EOL_SEEN_CRLF 4
		1135
1128		1136
1129	/*** 2. Emacs' internal format (emacs-utf-8) ***/	1137	/*** 2. Emacs' internal format (emacs-utf-8) ***/
1130		1138
@@ -1147,6 +1155,9 @@ alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1147	#define UTF_8_BOM_2 0xBB	1155	#define UTF_8_BOM_2 0xBB
1148	#define UTF_8_BOM_3 0xBF	1156	#define UTF_8_BOM_3 0xBF
1149		1157
		1158	/* Unlike the other detect_coding_XXX, this function counts number of
		1159	characters and check EOL format. */
		1160
1150	static bool	1161	static bool
1151	detect_coding_utf_8 (struct coding_system *coding,	1162	detect_coding_utf_8 (struct coding_system *coding,
1152	struct coding_detection_info *detect_info)	1163	struct coding_detection_info *detect_info)
@@ -1156,11 +1167,23 @@ detect_coding_utf_8 (struct coding_system *coding,
1156	bool multibytep = coding->src_multibyte;	1167	bool multibytep = coding->src_multibyte;
1157	ptrdiff_t consumed_chars = 0;	1168	ptrdiff_t consumed_chars = 0;
1158	bool bom_found = 0;	1169	bool bom_found = 0;
1159	bool found = 0;	1170	int nchars = coding->head_ascii;
		1171	int eol_seen = coding->eol_seen;
1160		1172
1161	detect_info->checked |= CATEGORY_MASK_UTF_8;	1173	detect_info->checked |= CATEGORY_MASK_UTF_8;
1162	/* A coding system of this category is always ASCII compatible. */	1174	/* A coding system of this category is always ASCII compatible. */
1163	src += coding->head_ascii;	1175	src += nchars;
		1176
		1177	if (src == coding->source /* BOM should be at the head. */
		1178	&& src + 3 < src_end /* BOM is 3-byte long. */
		1179	&& src[0] == UTF_8_BOM_1
		1180	&& src[1] == UTF_8_BOM_2
		1181	&& src[2] == UTF_8_BOM_3)
		1182	{
		1183	bom_found = 1;
		1184	src += 3;
		1185	nchars++;
		1186	}
1164		1187
1165	while (1)	1188	while (1)
1166	{	1189	{
@@ -1169,13 +1192,29 @@ detect_coding_utf_8 (struct coding_system *coding,
1169	src_base = src;	1192	src_base = src;
1170	ONE_MORE_BYTE (c);	1193	ONE_MORE_BYTE (c);
1171	if (c < 0 || UTF_8_1_OCTET_P (c))	1194	if (c < 0 || UTF_8_1_OCTET_P (c))
1172	continue;	1195	{
		1196	nchars++;
		1197	if (c == '\r')
		1198	{
		1199	if (src < src_end && *src == '\n')
		1200	{
		1201	eol_seen |= EOL_SEEN_CRLF;
		1202	src++;
		1203	nchars++;
		1204	}
		1205	else
		1206	eol_seen |= EOL_SEEN_CR;
		1207	}
		1208	else if (c == '\n')
		1209	eol_seen |= EOL_SEEN_LF;
		1210	continue;
		1211	}
1173	ONE_MORE_BYTE (c1);	1212	ONE_MORE_BYTE (c1);
1174	if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))	1213	if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1175	break;	1214	break;
1176	if (UTF_8_2_OCTET_LEADING_P (c))	1215	if (UTF_8_2_OCTET_LEADING_P (c))
1177	{	1216	{
1178	found = 1;	1217	nchars++;
1179	continue;	1218	continue;
1180	}	1219	}
1181	ONE_MORE_BYTE (c2);	1220	ONE_MORE_BYTE (c2);
@@ -1183,10 +1222,7 @@ detect_coding_utf_8 (struct coding_system *coding,
1183	break;	1222	break;
1184	if (UTF_8_3_OCTET_LEADING_P (c))	1223	if (UTF_8_3_OCTET_LEADING_P (c))
1185	{	1224	{
1186	found = 1;	1225	nchars++;
1187	if (src_base == coding->source
1188	&& c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1189	bom_found = 1;
1190	continue;	1226	continue;
1191	}	1227	}
1192	ONE_MORE_BYTE (c3);	1228	ONE_MORE_BYTE (c3);
@@ -1194,7 +1230,7 @@ detect_coding_utf_8 (struct coding_system *coding,
1194	break;	1230	break;
1195	if (UTF_8_4_OCTET_LEADING_P (c))	1231	if (UTF_8_4_OCTET_LEADING_P (c))
1196	{	1232	{
1197	found = 1;	1233	nchars++;
1198	continue;	1234	continue;
1199	}	1235	}
1200	ONE_MORE_BYTE (c4);	1236	ONE_MORE_BYTE (c4);
@@ -1202,7 +1238,7 @@ detect_coding_utf_8 (struct coding_system *coding,
1202	break;	1238	break;
1203	if (UTF_8_5_OCTET_LEADING_P (c))	1239	if (UTF_8_5_OCTET_LEADING_P (c))
1204	{	1240	{
1205	found = 1;	1241	nchars++;
1206	continue;	1242	continue;
1207	}	1243	}
1208	break;	1244	break;
@@ -1219,14 +1255,17 @@ detect_coding_utf_8 (struct coding_system *coding,
1219	if (bom_found)	1255	if (bom_found)
1220	{	1256	{
1221	/* The first character 0xFFFE doesn't necessarily mean a BOM. */	1257	/* The first character 0xFFFE doesn't necessarily mean a BOM. */
1222	detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;	1258	detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1223	}	1259	}
1224	else	1260	else
1225	{	1261	{
1226	detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;	1262	detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1227	if (found)	1263	if (nchars < src_end - coding->source)
1228	detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;	1264	/* The found characters are less than source bytes, which
		1265	means that we found a valid non-ASCII characters. */
		1266	detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
1229	}	1267	}
		1268	coding->detected_utf8_chars = nchars;
1230	return 1;	1269	return 1;
1231	}	1270	}
1232		1271
@@ -5622,7 +5661,6 @@ setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5622	eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);	5661	eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5623		5662
5624	coding->mode = 0;	5663	coding->mode = 0;
5625	coding->head_ascii = -1;
5626	if (VECTORP (eol_type))	5664	if (VECTORP (eol_type))
5627	coding->common_flags = (CODING_REQUIRE_DECODING_MASK	5665	coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5628	| CODING_REQUIRE_DETECTION_MASK);	5666	| CODING_REQUIRE_DETECTION_MASK);
@@ -6074,46 +6112,35 @@ complement_process_encoding_system (Lisp_Object coding_system)
6074		6112
6075	*/	6113	*/
6076		6114
6077	#define EOL_SEEN_NONE 0
6078	#define EOL_SEEN_LF 1
6079	#define EOL_SEEN_CR 2
6080	#define EOL_SEEN_CRLF 4
6081
6082
6083	static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,	6115	static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6084	int eol_seen);	6116	int eol_seen);
6085		6117
6086		6118
6087	/* Return the number of ASCII characters at the head of the source.	6119	/* Return the number of ASCII characters at the head of the source.
6088	By side effects, set coding->head_ascii and coding->eol_seen. The	6120	By side effects, set coding->head_ascii and update
6089	value of coding->eol_seen is "logical or" of EOL_SEEN_LF,	6121	coding->eol_seen. The value of coding->eol_seen is "logical or" of
6090	EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is reliable only when	6122	EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6091	all the source bytes are ASCII. */	6123	reliable only when all the source bytes are ASCII. */
6092		6124
6093	static int	6125	static int
6094	check_ascii (struct coding_system *coding)	6126	check_ascii (struct coding_system *coding)
6095	{	6127	{
6096	const unsigned char *src, *end;	6128	const unsigned char *src, *end;
6097	Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);	6129	Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6098	int eol_seen;	6130	int eol_seen = coding->eol_seen;
6099		6131
6100	eol_seen = (VECTORP (eol_type) ? EOL_SEEN_NONE
6101	: EQ (eol_type, Qunix) ? EOL_SEEN_LF
6102	: EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6103	: EOL_SEEN_CR);
6104	coding_set_source (coding);	6132	coding_set_source (coding);
6105	src = coding->source;	6133	src = coding->source;
6106	end = src + coding->src_bytes;	6134	end = src + coding->src_bytes;
6107		6135
6108	if (inhibit_eol_conversion	6136	if (inhibit_eol_conversion
6109	|| eol_seen != EOL_SEEN_NONE)	6137	|| SYMBOLP (eol_type))
6110	{	6138	{
6111	/* We don't have to check EOL format. */	6139	/* We don't have to check EOL format. */
6112	while (src < end && !( *src & 0x80)) src++;	6140	while (src < end && !( *src & 0x80))
6113	if (inhibit_eol_conversion)
6114	{	6141	{
6115	eol_seen = EOL_SEEN_LF;	6142	if (*src++ == '\n')
6116	adjust_coding_eol_type (coding, eol_seen);	6143	eol_seen |= EOL_SEEN_LF;
6117	}	6144	}
6118	}	6145	}
6119	else	6146	else
@@ -6171,7 +6198,7 @@ static int
6171	check_utf_8 (struct coding_system *coding)	6198	check_utf_8 (struct coding_system *coding)
6172	{	6199	{
6173	const unsigned char *src, *end;	6200	const unsigned char *src, *end;
6174	int eol_seen = coding->eol_seen;	6201	int eol_seen;
6175	int nchars = coding->head_ascii;	6202	int nchars = coding->head_ascii;
6176		6203
6177	if (coding->head_ascii < 0)	6204	if (coding->head_ascii < 0)
@@ -6181,7 +6208,7 @@ check_utf_8 (struct coding_system *coding)
6181	src = coding->source + coding->head_ascii;	6208	src = coding->source + coding->head_ascii;
6182	/* We look ahead one byte for CR LF. */	6209	/* We look ahead one byte for CR LF. */
6183	end = coding->source + coding->src_bytes - 1;	6210	end = coding->source + coding->src_bytes - 1;
6184		6211	eol_seen = coding->eol_seen;
6185	while (src < end)	6212	while (src < end)
6186	{	6213	{
6187	int c = *src;	6214	int c = *src;
@@ -6402,6 +6429,8 @@ detect_coding (struct coding_system *coding)
6402	{	6429	{
6403	const unsigned char *src, *src_end;	6430	const unsigned char *src, *src_end;
6404	unsigned int saved_mode = coding->mode;	6431	unsigned int saved_mode = coding->mode;
		6432	Lisp_Object found = Qnil;
		6433	Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6405		6434
6406	coding->consumed = coding->consumed_char = 0;	6435	coding->consumed = coding->consumed_char = 0;
6407	coding->produced = coding->produced_char = 0;	6436	coding->produced = coding->produced_char = 0;
@@ -6409,6 +6438,7 @@ detect_coding (struct coding_system *coding)
6409		6438
6410	src_end = coding->source + coding->src_bytes;	6439	src_end = coding->source + coding->src_bytes;
6411		6440
		6441	coding->eol_seen = EOL_SEEN_NONE;
6412	/* If we have not yet decided the text encoding type, detect it	6442	/* If we have not yet decided the text encoding type, detect it
6413	now. */	6443	now. */
6414	if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))	6444	if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
@@ -6418,7 +6448,6 @@ detect_coding (struct coding_system *coding)
6418	bool null_byte_found = 0, eight_bit_found = 0;	6448	bool null_byte_found = 0, eight_bit_found = 0;
6419		6449
6420	coding->head_ascii = 0;	6450	coding->head_ascii = 0;
6421	coding->eol_seen = EOL_SEEN_NONE;
6422	detect_info.checked = detect_info.found = detect_info.rejected = 0;	6451	detect_info.checked = detect_info.found = detect_info.rejected = 0;
6423	for (src = coding->source; src < src_end; src++)	6452	for (src = coding->source; src < src_end; src++)
6424	{	6453	{
@@ -6529,32 +6558,58 @@ detect_coding (struct coding_system *coding)
6529	}	6558	}
6530	else if ((*(this->detector)) (coding, &detect_info)	6559	else if ((*(this->detector)) (coding, &detect_info)
6531	&& detect_info.found & (1 << category))	6560	&& detect_info.found & (1 << category))
6532	{	6561	break;
6533	if (category == coding_category_utf_16_auto)
6534	{
6535	if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6536	category = coding_category_utf_16_le;
6537	else
6538	category = coding_category_utf_16_be;
6539	}
6540	break;
6541	}
6542	}	6562	}
6543	}	6563	}
6544		6564
6545	if (i < coding_category_raw_text)	6565	if (i < coding_category_raw_text)
6546	setup_coding_system (CODING_ID_NAME (this->id), coding);	6566	{
		6567	if (category == coding_category_utf_8_auto)
		6568	{
		6569	Lisp_Object coding_systems;
		6570
		6571	coding_systems = AREF (CODING_ID_ATTRS (this->id),
		6572	coding_attr_utf_bom);
		6573	if (CONSP (coding_systems))
		6574	{
		6575	if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
		6576	found = XCAR (coding_systems);
		6577	else
		6578	found = XCDR (coding_systems);
		6579	}
		6580	else
		6581	found = CODING_ID_NAME (this->id);
		6582	}
		6583	else if (category == coding_category_utf_16_auto)
		6584	{
		6585	Lisp_Object coding_systems;
		6586
		6587	coding_systems = AREF (CODING_ID_ATTRS (this->id),
		6588	coding_attr_utf_bom);
		6589	if (CONSP (coding_systems))
		6590	{
		6591	if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
		6592	found = XCAR (coding_systems);
		6593	else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
		6594	found = XCDR (coding_systems);
		6595	}
		6596	else
		6597	found = CODING_ID_NAME (this->id);
		6598	}
		6599	else
		6600	found = CODING_ID_NAME (this->id);
		6601	}
6547	else if (null_byte_found)	6602	else if (null_byte_found)
6548	setup_coding_system (Qno_conversion, coding);	6603	found = Qno_conversion;
6549	else if ((detect_info.rejected & CATEGORY_MASK_ANY)	6604	else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6550	== CATEGORY_MASK_ANY)	6605	== CATEGORY_MASK_ANY)
6551	setup_coding_system (Qraw_text, coding);	6606	found = Qraw_text;
6552	else if (detect_info.rejected)	6607	else if (detect_info.rejected)
6553	for (i = 0; i < coding_category_raw_text; i++)	6608	for (i = 0; i < coding_category_raw_text; i++)
6554	if (! (detect_info.rejected & (1 << coding_priorities[i])))	6609	if (! (detect_info.rejected & (1 << coding_priorities[i])))
6555	{	6610	{
6556	this = coding_categories + coding_priorities[i];	6611	this = coding_categories + coding_priorities[i];
6557	setup_coding_system (CODING_ID_NAME (this->id), coding);	6612	found = CODING_ID_NAME (this->id);
6558	break;	6613	break;
6559	}	6614	}
6560	}	6615	}
@@ -6570,12 +6625,8 @@ detect_coding (struct coding_system *coding)
6570	detect_info.found = detect_info.rejected = 0;	6625	detect_info.found = detect_info.rejected = 0;
6571	if (check_ascii (coding) == coding->src_bytes)	6626	if (check_ascii (coding) == coding->src_bytes)
6572	{	6627	{
6573	int head_ascii = coding->head_ascii;	6628	if (CONSP (coding_systems))
6574		6629	found = XCDR (coding_systems);
6575	if (coding->eol_seen != EOL_SEEN_NONE)
6576	adjust_coding_eol_type (coding, coding->eol_seen);
6577	setup_coding_system (XCDR (coding_systems), coding);
6578	coding->head_ascii = head_ascii;
6579	}	6630	}
6580	else	6631	else
6581	{	6632	{
@@ -6583,9 +6634,9 @@ detect_coding (struct coding_system *coding)
6583	&& detect_coding_utf_8 (coding, &detect_info))	6634	&& detect_coding_utf_8 (coding, &detect_info))
6584	{	6635	{
6585	if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)	6636	if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6586	setup_coding_system (XCAR (coding_systems), coding);	6637	found = XCAR (coding_systems);
6587	else	6638	else
6588	setup_coding_system (XCDR (coding_systems), coding);	6639	found = XCDR (coding_systems);
6589	}	6640	}
6590	}	6641	}
6591	}	6642	}
@@ -6599,16 +6650,28 @@ detect_coding (struct coding_system *coding)
6599	= AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);	6650	= AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6600	detect_info.found = detect_info.rejected = 0;	6651	detect_info.found = detect_info.rejected = 0;
6601	coding->head_ascii = 0;	6652	coding->head_ascii = 0;
6602	coding->eol_seen = EOL_SEEN_NONE;
6603	if (CONSP (coding_systems)	6653	if (CONSP (coding_systems)
6604	&& detect_coding_utf_16 (coding, &detect_info))	6654	&& detect_coding_utf_16 (coding, &detect_info))
6605	{	6655	{
6606	if (detect_info.found & CATEGORY_MASK_UTF_16_LE)	6656	if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6607	setup_coding_system (XCAR (coding_systems), coding);	6657	found = XCAR (coding_systems);
6608	else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)	6658	else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6609	setup_coding_system (XCDR (coding_systems), coding);	6659	found = XCDR (coding_systems);
6610	}	6660	}
6611	}	6661	}
		6662
		6663	if (! NILP (found))
		6664	{
		6665	int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
		6666	: EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
		6667	: EQ (eol_type, Qmac) ? EOL_SEEN_CR
		6668	: EOL_SEEN_LF);
		6669
		6670	setup_coding_system (found, coding);
		6671	if (specified_eol != EOL_SEEN_NONE)
		6672	adjust_coding_eol_type (coding, specified_eol);
		6673	}
		6674
6612	coding->mode = saved_mode;	6675	coding->mode = saved_mode;
6613	}	6676	}
6614		6677
@@ -7729,6 +7792,9 @@ decode_coding_gap (struct coding_system *coding,
7729	coding->dst_pos_byte = PT_BYTE;	7792	coding->dst_pos_byte = PT_BYTE;
7730	coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));	7793	coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7731		7794
		7795	coding->head_ascii = -1;
		7796	coding->detected_utf8_chars = -1;
		7797	coding->eol_seen = EOL_SEEN_NONE;
7732	if (CODING_REQUIRE_DETECTION (coding))	7798	if (CODING_REQUIRE_DETECTION (coding))
7733	detect_coding (coding);	7799	detect_coding (coding);
7734	attrs = CODING_ID_ATTRS (coding->id);	7800	attrs = CODING_ID_ATTRS (coding->id);
@@ -7743,17 +7809,38 @@ decode_coding_gap (struct coding_system *coding,
7743	chars = check_ascii (coding);	7809	chars = check_ascii (coding);
7744	if (chars != bytes)	7810	if (chars != bytes)
7745	{	7811	{
		7812	/* There exists a non-ASCII byte. */
7746	if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8))	7813	if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8))
7747	chars = check_utf_8 (coding);	7814	{
		7815	if (coding->detected_utf8_chars >= 0)
		7816	chars = coding->detected_utf8_chars;
		7817	else
		7818	chars = check_utf_8 (coding);
		7819	if (CODING_UTF_8_BOM (coding) != utf_without_bom
		7820	&& coding->head_ascii == 0
		7821	&& coding->source[0] == UTF_8_BOM_1
		7822	&& coding->source[1] == UTF_8_BOM_2
		7823	&& coding->source[2] == UTF_8_BOM_3)
		7824	{
		7825	chars--;
		7826	bytes -= 3;
		7827	coding->src_bytes -= 3;
		7828	}
		7829	}
7748	else	7830	else
7749	chars = -1;	7831	chars = -1;
7750	}	7832	}
7751	if (chars >= 0)	7833	if (chars >= 0)
7752	{	7834	{
7753	if (coding->eol_seen != EOL_SEEN_NONE)	7835	Lisp_Object eol_type;
7754	adjust_coding_eol_type (coding, coding->eol_seen);
7755		7836
7756	if (coding->eol_seen == EOL_SEEN_CR)	7837	eol_type = CODING_ID_EOL_TYPE (coding->id);
		7838	if (VECTORP (eol_type))
		7839	{
		7840	if (coding->eol_seen != EOL_SEEN_NONE)
		7841	eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
		7842	}
		7843	if (EQ (eol_type, Qmac))
7757	{	7844	{
7758	unsigned char *src_end = GAP_END_ADDR;	7845	unsigned char *src_end = GAP_END_ADDR;
7759	unsigned char *src = src_end - coding->src_bytes;	7846	unsigned char *src = src_end - coding->src_bytes;
@@ -7764,7 +7851,7 @@ decode_coding_gap (struct coding_system *coding,
7764	src[-1] = '\n';	7851	src[-1] = '\n';
7765	}	7852	}
7766	}	7853	}
7767	else if (coding->eol_seen == EOL_SEEN_CRLF)	7854	else if (EQ (eol_type, Qdos))
7768	{	7855	{
7769	unsigned char *src = GAP_END_ADDR;	7856	unsigned char *src = GAP_END_ADDR;
7770	unsigned char *src_beg = src - coding->src_bytes;	7857	unsigned char *src_beg = src - coding->src_bytes;