aboutsummaryrefslogtreecommitdiffstats
path: root/src/coding.c
diff options
context:
space:
mode:
author: Kenichi Handa, 2008-05-29 22:58:15 +0000
committer: Kenichi Handa, 2008-05-29 22:58:15 +0000
commit: a470d443806985bc57c2ae9f7bce0048013ebf67 (patch)
tree: dcd63c718f75e5a6a0a7f70751b8ee2642dc1e21 /src/coding.c
parent: e4215ddd0753fffed307dd643f66ce0a0b26b0c4 (diff)
download: emacs-a470d443806985bc57c2ae9f7bce0048013ebf67.tar.gz
download: emacs-a470d443806985bc57c2ae9f7bce0048013ebf67.zip
(CODING_UTF_8_BOM): New macro.
(enum coding_category): Delete coding_category_utf_8, add coding_category_utf_8_auto, coding_category_utf_8_nosig, and coding_category_utf_8_sig. (CATEGORY_MASK_UTF_8): Delete the old single-category definition. (CATEGORY_MASK_UTF_8_AUTO, CATEGORY_MASK_UTF_8_NOSIG) (CATEGORY_MASK_UTF_8_SIG): New macros. (CATEGORY_MASK_ANY): Delete CATEGORY_MASK_UTF_8, add CATEGORY_MASK_UTF_8_AUTO, CATEGORY_MASK_UTF_8_NOSIG, and CATEGORY_MASK_UTF_8_SIG. (CATEGORY_MASK_UTF_8): New macro, the union of the three UTF-8 category masks above. (UTF_BOM, UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3): New macros. (detect_coding_utf_8): Check BOM. (decode_coding_utf_8, encode_coding_utf_8): Handle BOM. (decode_coding_utf_16): Adjust for the renaming of enum utf_16_bom_type to utf_bom_type. (encode_coding_utf_16): Likewise. (setup_coding_system): Likewise. Set CODING_UTF_8_BOM (coding). (detect_coding, detect_coding_system): Handle utf-8-auto. (Fdefine_coding_system_internal): Handle `bom' property for utf-8. (syms_of_coding): Fix setting up of Vcoding_category_table.
Diffstat (limited to 'src/coding.c')
-rw-r--r-- src/coding.c 184
1 files changed, 159 insertions, 25 deletions
diff --git a/src/coding.c b/src/coding.c
index 1e31eda089b..7f9dc42ffa8 100644
--- a/src/coding.c
+++ b/src/coding.c
@@ -546,6 +546,9 @@ enum iso_code_class_type
546 character is prohibited by CODING_ISO_FLAG_SAFE. */ 546 character is prohibited by CODING_ISO_FLAG_SAFE. */
547#define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?' 547#define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
548 548
549/* UTF-8 section */
550#define CODING_UTF_8_BOM(coding) \
551 ((coding)->spec.utf_8_bom)
549 552
550/* UTF-16 section */ 553/* UTF-16 section */
551#define CODING_UTF_16_BOM(coding) \ 554#define CODING_UTF_16_BOM(coding) \
@@ -576,7 +579,9 @@ enum coding_category
576 coding_category_iso_8_2, 579 coding_category_iso_8_2,
577 coding_category_iso_7_else, 580 coding_category_iso_7_else,
578 coding_category_iso_8_else, 581 coding_category_iso_8_else,
579 coding_category_utf_8, 582 coding_category_utf_8_auto,
583 coding_category_utf_8_nosig,
584 coding_category_utf_8_sig,
580 coding_category_utf_16_auto, 585 coding_category_utf_16_auto,
581 coding_category_utf_16_be, 586 coding_category_utf_16_be,
582 coding_category_utf_16_le, 587 coding_category_utf_16_le,
@@ -600,7 +605,9 @@ enum coding_category
600#define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2) 605#define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
601#define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else) 606#define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
602#define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else) 607#define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
603#define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8) 608#define CATEGORY_MASK_UTF_8_AUTO (1 << coding_category_utf_8_auto)
609#define CATEGORY_MASK_UTF_8_NOSIG (1 << coding_category_utf_8_nosig)
610#define CATEGORY_MASK_UTF_8_SIG (1 << coding_category_utf_8_sig)
604#define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto) 611#define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
605#define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be) 612#define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
606#define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le) 613#define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
@@ -622,7 +629,9 @@ enum coding_category
622 | CATEGORY_MASK_ISO_8_2 \ 629 | CATEGORY_MASK_ISO_8_2 \
623 | CATEGORY_MASK_ISO_7_ELSE \ 630 | CATEGORY_MASK_ISO_7_ELSE \
624 | CATEGORY_MASK_ISO_8_ELSE \ 631 | CATEGORY_MASK_ISO_8_ELSE \
625 | CATEGORY_MASK_UTF_8 \ 632 | CATEGORY_MASK_UTF_8_AUTO \
633 | CATEGORY_MASK_UTF_8_NOSIG \
634 | CATEGORY_MASK_UTF_8_SIG \
626 | CATEGORY_MASK_UTF_16_AUTO \ 635 | CATEGORY_MASK_UTF_16_AUTO \
627 | CATEGORY_MASK_UTF_16_BE \ 636 | CATEGORY_MASK_UTF_16_BE \
628 | CATEGORY_MASK_UTF_16_LE \ 637 | CATEGORY_MASK_UTF_16_LE \
@@ -662,6 +671,10 @@ enum coding_category
662 | CATEGORY_MASK_UTF_16_BE_NOSIG \ 671 | CATEGORY_MASK_UTF_16_BE_NOSIG \
663 | CATEGORY_MASK_UTF_16_LE_NOSIG) 672 | CATEGORY_MASK_UTF_16_LE_NOSIG)
664 673
674#define CATEGORY_MASK_UTF_8 \
675 (CATEGORY_MASK_UTF_8_AUTO \
676 | CATEGORY_MASK_UTF_8_NOSIG \
677 | CATEGORY_MASK_UTF_8_SIG)
665 678
666/* List of symbols `coding-category-xxx' ordered by priority. This 679/* List of symbols `coding-category-xxx' ordered by priority. This
667 variable is exposed to Emacs Lisp. */ 680 variable is exposed to Emacs Lisp. */
@@ -1214,6 +1227,11 @@ alloc_destination (coding, nbytes, dst)
1214#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0) 1227#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1215#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8) 1228#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1216 1229
1230#define UTF_BOM 0xFEFF
1231#define UTF_8_BOM_1 0xEF
1232#define UTF_8_BOM_2 0xBB
1233#define UTF_8_BOM_3 0xBF
1234
1217static int 1235static int
1218detect_coding_utf_8 (coding, detect_info) 1236detect_coding_utf_8 (coding, detect_info)
1219 struct coding_system *coding; 1237 struct coding_system *coding;
@@ -1223,6 +1241,7 @@ detect_coding_utf_8 (coding, detect_info)
1223 const unsigned char *src_end = coding->source + coding->src_bytes; 1241 const unsigned char *src_end = coding->source + coding->src_bytes;
1224 int multibytep = coding->src_multibyte; 1242 int multibytep = coding->src_multibyte;
1225 int consumed_chars = 0; 1243 int consumed_chars = 0;
1244 int bom_found = 0;
1226 int found = 0; 1245 int found = 0;
1227 1246
1228 detect_info->checked |= CATEGORY_MASK_UTF_8; 1247 detect_info->checked |= CATEGORY_MASK_UTF_8;
@@ -1242,7 +1261,7 @@ detect_coding_utf_8 (coding, detect_info)
1242 break; 1261 break;
1243 if (UTF_8_2_OCTET_LEADING_P (c)) 1262 if (UTF_8_2_OCTET_LEADING_P (c))
1244 { 1263 {
1245 found = CATEGORY_MASK_UTF_8; 1264 found = 1;
1246 continue; 1265 continue;
1247 } 1266 }
1248 ONE_MORE_BYTE (c2); 1267 ONE_MORE_BYTE (c2);
@@ -1250,7 +1269,10 @@ detect_coding_utf_8 (coding, detect_info)
1250 break; 1269 break;
1251 if (UTF_8_3_OCTET_LEADING_P (c)) 1270 if (UTF_8_3_OCTET_LEADING_P (c))
1252 { 1271 {
1253 found = CATEGORY_MASK_UTF_8; 1272 found = 1;
1273 if (src_base == coding->source
1274 && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1275 bom_found = 1;
1254 continue; 1276 continue;
1255 } 1277 }
1256 ONE_MORE_BYTE (c3); 1278 ONE_MORE_BYTE (c3);
@@ -1258,7 +1280,7 @@ detect_coding_utf_8 (coding, detect_info)
1258 break; 1280 break;
1259 if (UTF_8_4_OCTET_LEADING_P (c)) 1281 if (UTF_8_4_OCTET_LEADING_P (c))
1260 { 1282 {
1261 found = CATEGORY_MASK_UTF_8; 1283 found = 1;
1262 continue; 1284 continue;
1263 } 1285 }
1264 ONE_MORE_BYTE (c4); 1286 ONE_MORE_BYTE (c4);
@@ -1266,7 +1288,7 @@ detect_coding_utf_8 (coding, detect_info)
1266 break; 1288 break;
1267 if (UTF_8_5_OCTET_LEADING_P (c)) 1289 if (UTF_8_5_OCTET_LEADING_P (c))
1268 { 1290 {
1269 found = CATEGORY_MASK_UTF_8; 1291 found = 1;
1270 continue; 1292 continue;
1271 } 1293 }
1272 break; 1294 break;
@@ -1280,7 +1302,16 @@ detect_coding_utf_8 (coding, detect_info)
1280 detect_info->rejected |= CATEGORY_MASK_UTF_8; 1302 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1281 return 0; 1303 return 0;
1282 } 1304 }
1283 detect_info->found |= found; 1305 if (bom_found)
1306 {
1307 /* The first character 0xFEFF doesn't necessarily mean a BOM. */
1308 detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1309 }
1310 else
1311 {
1312 detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1313 detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
1314 }
1284 return 1; 1315 return 1;
1285} 1316}
1286 1317
@@ -1296,12 +1327,46 @@ decode_coding_utf_8 (coding)
1296 int *charbuf_end = coding->charbuf + coding->charbuf_size; 1327 int *charbuf_end = coding->charbuf + coding->charbuf_size;
1297 int consumed_chars = 0, consumed_chars_base; 1328 int consumed_chars = 0, consumed_chars_base;
1298 int multibytep = coding->src_multibyte; 1329 int multibytep = coding->src_multibyte;
1330 enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1299 Lisp_Object attr, charset_list; 1331 Lisp_Object attr, charset_list;
1300 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); 1332 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1301 int byte_after_cr = -1; 1333 int byte_after_cr = -1;
1302 1334
1303 CODING_GET_INFO (coding, attr, charset_list); 1335 CODING_GET_INFO (coding, attr, charset_list);
1304 1336
1337 if (bom != utf_without_bom)
1338 {
1339 int c1, c2, c3;
1340
1341 src_base = src;
1342 ONE_MORE_BYTE (c1);
1343 if (! UTF_8_3_OCTET_LEADING_P (c1))
1344 src = src_base;
1345 else
1346 {
1347 ONE_MORE_BYTE (c2);
1348 if (! UTF_8_EXTRA_OCTET_P (c2))
1349 src = src_base;
1350 else
1351 {
1352 ONE_MORE_BYTE (c3);
1353 if (! UTF_8_EXTRA_OCTET_P (c3))
1354 src = src_base;
1355 else
1356 {
1357 if ((c1 != UTF_8_BOM_1)
1358 || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1359 src = src_base;
1360 else
1361 CODING_UTF_8_BOM (coding) = utf_without_bom;
1362 }
1363 }
1364 }
1365 }
1366 CODING_UTF_8_BOM (coding) = utf_without_bom;
1367
1368
1369
1305 while (1) 1370 while (1)
1306 { 1371 {
1307 int c, c1, c2, c3, c4, c5; 1372 int c, c1, c2, c3, c4, c5;
@@ -1415,6 +1480,13 @@ encode_coding_utf_8 (coding)
1415 int produced_chars = 0; 1480 int produced_chars = 0;
1416 int c; 1481 int c;
1417 1482
1483 if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1484 {
1485 ASSURE_DESTINATION (3);
1486 EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1487 CODING_UTF_8_BOM (coding) = utf_without_bom;
1488 }
1489
1418 if (multibytep) 1490 if (multibytep)
1419 { 1491 {
1420 int safe_room = MAX_MULTIBYTE_LENGTH * 2; 1492 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
@@ -1566,7 +1638,7 @@ decode_coding_utf_16 (coding)
1566 int *charbuf_end = coding->charbuf + coding->charbuf_size; 1638 int *charbuf_end = coding->charbuf + coding->charbuf_size;
1567 int consumed_chars = 0, consumed_chars_base; 1639 int consumed_chars = 0, consumed_chars_base;
1568 int multibytep = coding->src_multibyte; 1640 int multibytep = coding->src_multibyte;
1569 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding); 1641 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1570 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding); 1642 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1571 int surrogate = CODING_UTF_16_SURROGATE (coding); 1643 int surrogate = CODING_UTF_16_SURROGATE (coding);
1572 Lisp_Object attr, charset_list; 1644 Lisp_Object attr, charset_list;
@@ -1575,7 +1647,7 @@ decode_coding_utf_16 (coding)
1575 1647
1576 CODING_GET_INFO (coding, attr, charset_list); 1648 CODING_GET_INFO (coding, attr, charset_list);
1577 1649
1578 if (bom == utf_16_with_bom) 1650 if (bom == utf_with_bom)
1579 { 1651 {
1580 int c, c1, c2; 1652 int c, c1, c2;
1581 1653
@@ -1592,13 +1664,13 @@ decode_coding_utf_16 (coding)
1592 src = src_base; 1664 src = src_base;
1593 coding->errors++; 1665 coding->errors++;
1594 } 1666 }
1595 CODING_UTF_16_BOM (coding) = utf_16_without_bom; 1667 CODING_UTF_16_BOM (coding) = utf_without_bom;
1596 } 1668 }
1597 else if (bom == utf_16_detect_bom) 1669 else if (bom == utf_detect_bom)
1598 { 1670 {
1599 /* We have already tried to detect BOM and failed in 1671 /* We have already tried to detect BOM and failed in
1600 detect_coding. */ 1672 detect_coding. */
1601 CODING_UTF_16_BOM (coding) = utf_16_without_bom; 1673 CODING_UTF_16_BOM (coding) = utf_without_bom;
1602 } 1674 }
1603 1675
1604 while (1) 1676 while (1)
@@ -1688,7 +1760,7 @@ encode_coding_utf_16 (coding)
1688 unsigned char *dst = coding->destination + coding->produced; 1760 unsigned char *dst = coding->destination + coding->produced;
1689 unsigned char *dst_end = coding->destination + coding->dst_bytes; 1761 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1690 int safe_room = 8; 1762 int safe_room = 8;
1691 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding); 1763 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1692 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian; 1764 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1693 int produced_chars = 0; 1765 int produced_chars = 0;
1694 Lisp_Object attrs, charset_list; 1766 Lisp_Object attrs, charset_list;
@@ -1696,14 +1768,14 @@ encode_coding_utf_16 (coding)
1696 1768
1697 CODING_GET_INFO (coding, attrs, charset_list); 1769 CODING_GET_INFO (coding, attrs, charset_list);
1698 1770
1699 if (bom != utf_16_without_bom) 1771 if (bom != utf_without_bom)
1700 { 1772 {
1701 ASSURE_DESTINATION (safe_room); 1773 ASSURE_DESTINATION (safe_room);
1702 if (big_endian) 1774 if (big_endian)
1703 EMIT_TWO_BYTES (0xFE, 0xFF); 1775 EMIT_TWO_BYTES (0xFE, 0xFF);
1704 else 1776 else
1705 EMIT_TWO_BYTES (0xFF, 0xFE); 1777 EMIT_TWO_BYTES (0xFF, 0xFE);
1706 CODING_UTF_16_BOM (coding) = utf_16_without_bom; 1778 CODING_UTF_16_BOM (coding) = utf_without_bom;
1707 } 1779 }
1708 1780
1709 while (charbuf < charbuf_end) 1781 while (charbuf < charbuf_end)
@@ -5272,18 +5344,24 @@ setup_coding_system (coding_system, coding)
5272 } 5344 }
5273 else if (EQ (coding_type, Qutf_8)) 5345 else if (EQ (coding_type, Qutf_8))
5274 { 5346 {
5347 val = AREF (attrs, coding_attr_utf_bom);
5348 CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5349 : EQ (val, Qt) ? utf_with_bom
5350 : utf_without_bom);
5275 coding->detector = detect_coding_utf_8; 5351 coding->detector = detect_coding_utf_8;
5276 coding->decoder = decode_coding_utf_8; 5352 coding->decoder = decode_coding_utf_8;
5277 coding->encoder = encode_coding_utf_8; 5353 coding->encoder = encode_coding_utf_8;
5278 coding->common_flags 5354 coding->common_flags
5279 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK); 5355 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5356 if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5357 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5280 } 5358 }
5281 else if (EQ (coding_type, Qutf_16)) 5359 else if (EQ (coding_type, Qutf_16))
5282 { 5360 {
5283 val = AREF (attrs, coding_attr_utf_16_bom); 5361 val = AREF (attrs, coding_attr_utf_bom);
5284 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom 5362 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5285 : EQ (val, Qt) ? utf_16_with_bom 5363 : EQ (val, Qt) ? utf_with_bom
5286 : utf_16_without_bom); 5364 : utf_without_bom);
5287 val = AREF (attrs, coding_attr_utf_16_endian); 5365 val = AREF (attrs, coding_attr_utf_16_endian);
5288 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian 5366 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5289 : utf_16_little_endian); 5367 : utf_16_little_endian);
@@ -5293,7 +5371,7 @@ setup_coding_system (coding_system, coding)
5293 coding->encoder = encode_coding_utf_16; 5371 coding->encoder = encode_coding_utf_16;
5294 coding->common_flags 5372 coding->common_flags
5295 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK); 5373 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5296 if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom) 5374 if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5297 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK; 5375 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5298 } 5376 }
5299 else if (EQ (coding_type, Qccl)) 5377 else if (EQ (coding_type, Qccl))
@@ -5828,14 +5906,34 @@ detect_coding (coding)
5828 } 5906 }
5829 } 5907 }
5830 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id))) 5908 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5909 == coding_category_utf_8_auto)
5910 {
5911 Lisp_Object coding_systems;
5912 struct coding_detection_info detect_info;
5913
5914 coding_systems
5915 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
5916 detect_info.found = detect_info.rejected = 0;
5917 coding->head_ascii = 0;
5918 if (CONSP (coding_systems)
5919 && detect_coding_utf_8 (coding, &detect_info))
5920 {
5921 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
5922 setup_coding_system (XCAR (coding_systems), coding);
5923 else
5924 setup_coding_system (XCDR (coding_systems), coding);
5925 }
5926 }
5927 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5831 == coding_category_utf_16_auto) 5928 == coding_category_utf_16_auto)
5832 { 5929 {
5833 Lisp_Object coding_systems; 5930 Lisp_Object coding_systems;
5834 struct coding_detection_info detect_info; 5931 struct coding_detection_info detect_info;
5835 5932
5836 coding_systems 5933 coding_systems
5837 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom); 5934 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
5838 detect_info.found = detect_info.rejected = 0; 5935 detect_info.found = detect_info.rejected = 0;
5936 coding->head_ascii = 0;
5839 if (CONSP (coding_systems) 5937 if (CONSP (coding_systems)
5840 && detect_coding_utf_16 (coding, &detect_info)) 5938 && detect_coding_utf_16 (coding, &detect_info))
5841 { 5939 {
@@ -7724,6 +7822,19 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
7724 detect_info.found |= found; 7822 detect_info.found |= found;
7725 } 7823 }
7726 } 7824 }
7825 else if (base_category == coding_category_utf_8_auto)
7826 {
7827 if (detect_coding_utf_8 (&coding, &detect_info))
7828 {
7829 struct coding_system *this;
7830
7831 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
7832 this = coding_categories + coding_category_utf_8_sig;
7833 else
7834 this = coding_categories + coding_category_utf_8_nosig;
7835 val = Fcons (make_number (this->id), Qnil);
7836 }
7837 }
7727 else if (base_category == coding_category_utf_16_auto) 7838 else if (base_category == coding_category_utf_16_auto)
7728 { 7839 {
7729 if (detect_coding_utf_16 (&coding, &detect_info)) 7840 if (detect_coding_utf_16 (&coding, &detect_info))
@@ -9154,7 +9265,7 @@ usage: (define-coding-system-internal ...) */)
9154 val = XCDR (bom); 9265 val = XCDR (bom);
9155 CHECK_CODING_SYSTEM (val); 9266 CHECK_CODING_SYSTEM (val);
9156 } 9267 }
9157 ASET (attrs, coding_attr_utf_16_bom, bom); 9268 ASET (attrs, coding_attr_utf_bom, bom);
9158 9269
9159 endian = args[coding_arg_utf16_endian]; 9270 endian = args[coding_arg_utf16_endian];
9160 CHECK_SYMBOL (endian); 9271 CHECK_SYMBOL (endian);
@@ -9333,8 +9444,27 @@ usage: (define-coding-system-internal ...) */)
9333 } 9444 }
9334 else if (EQ (coding_type, Qutf_8)) 9445 else if (EQ (coding_type, Qutf_8))
9335 { 9446 {
9336 category = coding_category_utf_8; 9447 Lisp_Object bom;
9448
9337 CODING_ATTR_ASCII_COMPAT (attrs) = Qt; 9449 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9450
9451 if (nargs < coding_arg_utf8_max)
9452 goto short_args;
9453
9454 bom = args[coding_arg_utf8_bom];
9455 if (! NILP (bom) && ! EQ (bom, Qt))
9456 {
9457 CHECK_CONS (bom);
9458 val = XCAR (bom);
9459 CHECK_CODING_SYSTEM (val);
9460 val = XCDR (bom);
9461 CHECK_CODING_SYSTEM (val);
9462 }
9463 ASET (attrs, coding_attr_utf_bom, bom);
9464
9465 category = (CONSP (bom) ? coding_category_utf_8_auto
9466 : NILP (bom) ? coding_category_utf_8_nosig
9467 : coding_category_utf_8_sig);
9338 } 9468 }
9339 else if (EQ (coding_type, Qundecided)) 9469 else if (EQ (coding_type, Qundecided))
9340 category = coding_category_undecided; 9470 category = coding_category_undecided;
@@ -9755,8 +9885,12 @@ syms_of_coding ()
9755 intern ("coding-category-iso-7-else")); 9885 intern ("coding-category-iso-7-else"));
9756 ASET (Vcoding_category_table, coding_category_iso_8_else, 9886 ASET (Vcoding_category_table, coding_category_iso_8_else,
9757 intern ("coding-category-iso-8-else")); 9887 intern ("coding-category-iso-8-else"));
9758 ASET (Vcoding_category_table, coding_category_utf_8, 9888 ASET (Vcoding_category_table, coding_category_utf_8_auto,
9889 intern ("coding-category-utf-8-auto"));
9890 ASET (Vcoding_category_table, coding_category_utf_8_nosig,
9759 intern ("coding-category-utf-8")); 9891 intern ("coding-category-utf-8"));
9892 ASET (Vcoding_category_table, coding_category_utf_8_sig,
9893 intern ("coding-category-utf-8-sig"));
9760 ASET (Vcoding_category_table, coding_category_utf_16_be, 9894 ASET (Vcoding_category_table, coding_category_utf_16_be,
9761 intern ("coding-category-utf-16-be")); 9895 intern ("coding-category-utf-16-be"));
9762 ASET (Vcoding_category_table, coding_category_utf_16_auto, 9896 ASET (Vcoding_category_table, coding_category_utf_16_auto,