diff options
| author | Kenichi Handa | 2008-05-29 22:58:15 +0000 |
|---|---|---|
| committer | Kenichi Handa | 2008-05-29 22:58:15 +0000 |
| commit | a470d443806985bc57c2ae9f7bce0048013ebf67 (patch) | |
| tree | dcd63c718f75e5a6a0a7f70751b8ee2642dc1e21 /src/coding.c | |
| parent | e4215ddd0753fffed307dd643f66ce0a0b26b0c4 (diff) | |
| download | emacs-a470d443806985bc57c2ae9f7bce0048013ebf67.tar.gz emacs-a470d443806985bc57c2ae9f7bce0048013ebf67.zip | |
(CODING_UTF_8_BOM): New macro.
(enum coding_category): Delete coding_category_utf_8, add
coding_category_utf_8_auto, coding_category_utf_8_nosig, and
coding_category_utf_8_sig.
(CATEGORY_MASK_UTF_8): Delete it.
(CATEGORY_MASK_UTF_8_AUTO, CATEGORY_MASK_UTF_8_NOSIG)
(CATEGORY_MASK_UTF_8_SIG): New macros.
(CATEGORY_MASK_ANY): Delete CATEGORY_MASK_UTF_8, add
CATEGORY_MASK_UTF_8_AUTO, CATEGORY_MASK_UTF_8_NOSIG, and
CATEGORY_MASK_UTF_8_SIG.
(CATEGORY_MASK_UTF_8): New macro, now the union of
CATEGORY_MASK_UTF_8_AUTO, CATEGORY_MASK_UTF_8_NOSIG, and
CATEGORY_MASK_UTF_8_SIG.
(UTF_BOM, UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3): New macros.
(detect_coding_utf_8): Check BOM.
(decode_coding_utf_8, encode_coding_utf_8): Handle BOM.
(decode_coding_utf_16): Adjust for the renaming of enum
utf_16_bom_type to utf_bom_type.
(encode_coding_utf_16): Likewise.
(setup_coding_system): Likewise. Set CODING_UTF_8_BOM (coding).
(detect_coding, detect_coding_system): Handle utf-8-auto.
(Fdefine_coding_system_internal): Handle `bom' property for utf-8.
(syms_of_coding): Fix setting up of Vcoding_category_table.
Diffstat (limited to 'src/coding.c')
| -rw-r--r-- | src/coding.c | 184 |
1 files changed, 159 insertions, 25 deletions
diff --git a/src/coding.c b/src/coding.c index 1e31eda089b..7f9dc42ffa8 100644 --- a/src/coding.c +++ b/src/coding.c | |||
| @@ -546,6 +546,9 @@ enum iso_code_class_type | |||
| 546 | character is prohibited by CODING_ISO_FLAG_SAFE. */ | 546 | character is prohibited by CODING_ISO_FLAG_SAFE. */ |
| 547 | #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?' | 547 | #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?' |
| 548 | 548 | ||
| 549 | /* UTF-8 section */ | ||
| 550 | #define CODING_UTF_8_BOM(coding) \ | ||
| 551 | ((coding)->spec.utf_8_bom) | ||
| 549 | 552 | ||
| 550 | /* UTF-16 section */ | 553 | /* UTF-16 section */ |
| 551 | #define CODING_UTF_16_BOM(coding) \ | 554 | #define CODING_UTF_16_BOM(coding) \ |
| @@ -576,7 +579,9 @@ enum coding_category | |||
| 576 | coding_category_iso_8_2, | 579 | coding_category_iso_8_2, |
| 577 | coding_category_iso_7_else, | 580 | coding_category_iso_7_else, |
| 578 | coding_category_iso_8_else, | 581 | coding_category_iso_8_else, |
| 579 | coding_category_utf_8, | 582 | coding_category_utf_8_auto, |
| 583 | coding_category_utf_8_nosig, | ||
| 584 | coding_category_utf_8_sig, | ||
| 580 | coding_category_utf_16_auto, | 585 | coding_category_utf_16_auto, |
| 581 | coding_category_utf_16_be, | 586 | coding_category_utf_16_be, |
| 582 | coding_category_utf_16_le, | 587 | coding_category_utf_16_le, |
| @@ -600,7 +605,9 @@ enum coding_category | |||
| 600 | #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2) | 605 | #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2) |
| 601 | #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else) | 606 | #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else) |
| 602 | #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else) | 607 | #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else) |
| 603 | #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8) | 608 | #define CATEGORY_MASK_UTF_8_AUTO (1 << coding_category_utf_8_auto) |
| 609 | #define CATEGORY_MASK_UTF_8_NOSIG (1 << coding_category_utf_8_nosig) | ||
| 610 | #define CATEGORY_MASK_UTF_8_SIG (1 << coding_category_utf_8_sig) | ||
| 604 | #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto) | 611 | #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto) |
| 605 | #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be) | 612 | #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be) |
| 606 | #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le) | 613 | #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le) |
| @@ -622,7 +629,9 @@ enum coding_category | |||
| 622 | | CATEGORY_MASK_ISO_8_2 \ | 629 | | CATEGORY_MASK_ISO_8_2 \ |
| 623 | | CATEGORY_MASK_ISO_7_ELSE \ | 630 | | CATEGORY_MASK_ISO_7_ELSE \ |
| 624 | | CATEGORY_MASK_ISO_8_ELSE \ | 631 | | CATEGORY_MASK_ISO_8_ELSE \ |
| 625 | | CATEGORY_MASK_UTF_8 \ | 632 | | CATEGORY_MASK_UTF_8_AUTO \ |
| 633 | | CATEGORY_MASK_UTF_8_NOSIG \ | ||
| 634 | | CATEGORY_MASK_UTF_8_SIG \ | ||
| 626 | | CATEGORY_MASK_UTF_16_AUTO \ | 635 | | CATEGORY_MASK_UTF_16_AUTO \ |
| 627 | | CATEGORY_MASK_UTF_16_BE \ | 636 | | CATEGORY_MASK_UTF_16_BE \ |
| 628 | | CATEGORY_MASK_UTF_16_LE \ | 637 | | CATEGORY_MASK_UTF_16_LE \ |
| @@ -662,6 +671,10 @@ enum coding_category | |||
| 662 | | CATEGORY_MASK_UTF_16_BE_NOSIG \ | 671 | | CATEGORY_MASK_UTF_16_BE_NOSIG \ |
| 663 | | CATEGORY_MASK_UTF_16_LE_NOSIG) | 672 | | CATEGORY_MASK_UTF_16_LE_NOSIG) |
| 664 | 673 | ||
| 674 | #define CATEGORY_MASK_UTF_8 \ | ||
| 675 | (CATEGORY_MASK_UTF_8_AUTO \ | ||
| 676 | | CATEGORY_MASK_UTF_8_NOSIG \ | ||
| 677 | | CATEGORY_MASK_UTF_8_SIG) | ||
| 665 | 678 | ||
| 666 | /* List of symbols `coding-category-xxx' ordered by priority. This | 679 | /* List of symbols `coding-category-xxx' ordered by priority. This |
| 667 | variable is exposed to Emacs Lisp. */ | 680 | variable is exposed to Emacs Lisp. */ |
| @@ -1214,6 +1227,11 @@ alloc_destination (coding, nbytes, dst) | |||
| 1214 | #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0) | 1227 | #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0) |
| 1215 | #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8) | 1228 | #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8) |
| 1216 | 1229 | ||
| 1230 | #define UTF_BOM 0xFEFF | ||
| 1231 | #define UTF_8_BOM_1 0xEF | ||
| 1232 | #define UTF_8_BOM_2 0xBB | ||
| 1233 | #define UTF_8_BOM_3 0xBF | ||
| 1234 | |||
| 1217 | static int | 1235 | static int |
| 1218 | detect_coding_utf_8 (coding, detect_info) | 1236 | detect_coding_utf_8 (coding, detect_info) |
| 1219 | struct coding_system *coding; | 1237 | struct coding_system *coding; |
| @@ -1223,6 +1241,7 @@ detect_coding_utf_8 (coding, detect_info) | |||
| 1223 | const unsigned char *src_end = coding->source + coding->src_bytes; | 1241 | const unsigned char *src_end = coding->source + coding->src_bytes; |
| 1224 | int multibytep = coding->src_multibyte; | 1242 | int multibytep = coding->src_multibyte; |
| 1225 | int consumed_chars = 0; | 1243 | int consumed_chars = 0; |
| 1244 | int bom_found = 0; | ||
| 1226 | int found = 0; | 1245 | int found = 0; |
| 1227 | 1246 | ||
| 1228 | detect_info->checked |= CATEGORY_MASK_UTF_8; | 1247 | detect_info->checked |= CATEGORY_MASK_UTF_8; |
| @@ -1242,7 +1261,7 @@ detect_coding_utf_8 (coding, detect_info) | |||
| 1242 | break; | 1261 | break; |
| 1243 | if (UTF_8_2_OCTET_LEADING_P (c)) | 1262 | if (UTF_8_2_OCTET_LEADING_P (c)) |
| 1244 | { | 1263 | { |
| 1245 | found = CATEGORY_MASK_UTF_8; | 1264 | found = 1; |
| 1246 | continue; | 1265 | continue; |
| 1247 | } | 1266 | } |
| 1248 | ONE_MORE_BYTE (c2); | 1267 | ONE_MORE_BYTE (c2); |
| @@ -1250,7 +1269,10 @@ detect_coding_utf_8 (coding, detect_info) | |||
| 1250 | break; | 1269 | break; |
| 1251 | if (UTF_8_3_OCTET_LEADING_P (c)) | 1270 | if (UTF_8_3_OCTET_LEADING_P (c)) |
| 1252 | { | 1271 | { |
| 1253 | found = CATEGORY_MASK_UTF_8; | 1272 | found = 1; |
| 1273 | if (src_base == coding->source | ||
| 1274 | && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3) | ||
| 1275 | bom_found = 1; | ||
| 1254 | continue; | 1276 | continue; |
| 1255 | } | 1277 | } |
| 1256 | ONE_MORE_BYTE (c3); | 1278 | ONE_MORE_BYTE (c3); |
| @@ -1258,7 +1280,7 @@ detect_coding_utf_8 (coding, detect_info) | |||
| 1258 | break; | 1280 | break; |
| 1259 | if (UTF_8_4_OCTET_LEADING_P (c)) | 1281 | if (UTF_8_4_OCTET_LEADING_P (c)) |
| 1260 | { | 1282 | { |
| 1261 | found = CATEGORY_MASK_UTF_8; | 1283 | found = 1; |
| 1262 | continue; | 1284 | continue; |
| 1263 | } | 1285 | } |
| 1264 | ONE_MORE_BYTE (c4); | 1286 | ONE_MORE_BYTE (c4); |
| @@ -1266,7 +1288,7 @@ detect_coding_utf_8 (coding, detect_info) | |||
| 1266 | break; | 1288 | break; |
| 1267 | if (UTF_8_5_OCTET_LEADING_P (c)) | 1289 | if (UTF_8_5_OCTET_LEADING_P (c)) |
| 1268 | { | 1290 | { |
| 1269 | found = CATEGORY_MASK_UTF_8; | 1291 | found = 1; |
| 1270 | continue; | 1292 | continue; |
| 1271 | } | 1293 | } |
| 1272 | break; | 1294 | break; |
| @@ -1280,7 +1302,16 @@ detect_coding_utf_8 (coding, detect_info) | |||
| 1280 | detect_info->rejected |= CATEGORY_MASK_UTF_8; | 1302 | detect_info->rejected |= CATEGORY_MASK_UTF_8; |
| 1281 | return 0; | 1303 | return 0; |
| 1282 | } | 1304 | } |
| 1283 | detect_info->found |= found; | 1305 | if (bom_found) |
| 1306 | { | ||
| 1307 | /* The first character 0xFEFF doesn't necessarily mean a BOM. */ | ||
| 1308 | detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG; | ||
| 1309 | } | ||
| 1310 | else | ||
| 1311 | { | ||
| 1312 | detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG; | ||
| 1313 | detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG; | ||
| 1314 | } | ||
| 1284 | return 1; | 1315 | return 1; |
| 1285 | } | 1316 | } |
| 1286 | 1317 | ||
| @@ -1296,12 +1327,46 @@ decode_coding_utf_8 (coding) | |||
| 1296 | int *charbuf_end = coding->charbuf + coding->charbuf_size; | 1327 | int *charbuf_end = coding->charbuf + coding->charbuf_size; |
| 1297 | int consumed_chars = 0, consumed_chars_base; | 1328 | int consumed_chars = 0, consumed_chars_base; |
| 1298 | int multibytep = coding->src_multibyte; | 1329 | int multibytep = coding->src_multibyte; |
| 1330 | enum utf_bom_type bom = CODING_UTF_8_BOM (coding); | ||
| 1299 | Lisp_Object attr, charset_list; | 1331 | Lisp_Object attr, charset_list; |
| 1300 | int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); | 1332 | int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); |
| 1301 | int byte_after_cr = -1; | 1333 | int byte_after_cr = -1; |
| 1302 | 1334 | ||
| 1303 | CODING_GET_INFO (coding, attr, charset_list); | 1335 | CODING_GET_INFO (coding, attr, charset_list); |
| 1304 | 1336 | ||
| 1337 | if (bom != utf_without_bom) | ||
| 1338 | { | ||
| 1339 | int c1, c2, c3; | ||
| 1340 | |||
| 1341 | src_base = src; | ||
| 1342 | ONE_MORE_BYTE (c1); | ||
| 1343 | if (! UTF_8_3_OCTET_LEADING_P (c1)) | ||
| 1344 | src = src_base; | ||
| 1345 | else | ||
| 1346 | { | ||
| 1347 | ONE_MORE_BYTE (c2); | ||
| 1348 | if (! UTF_8_EXTRA_OCTET_P (c2)) | ||
| 1349 | src = src_base; | ||
| 1350 | else | ||
| 1351 | { | ||
| 1352 | ONE_MORE_BYTE (c3); | ||
| 1353 | if (! UTF_8_EXTRA_OCTET_P (c3)) | ||
| 1354 | src = src_base; | ||
| 1355 | else | ||
| 1356 | { | ||
| 1357 | if ((c1 != UTF_8_BOM_1) | ||
| 1358 | || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3)) | ||
| 1359 | src = src_base; | ||
| 1360 | else | ||
| 1361 | CODING_UTF_8_BOM (coding) = utf_without_bom; | ||
| 1362 | } | ||
| 1363 | } | ||
| 1364 | } | ||
| 1365 | } | ||
| 1366 | CODING_UTF_8_BOM (coding) = utf_without_bom; | ||
| 1367 | |||
| 1368 | |||
| 1369 | |||
| 1305 | while (1) | 1370 | while (1) |
| 1306 | { | 1371 | { |
| 1307 | int c, c1, c2, c3, c4, c5; | 1372 | int c, c1, c2, c3, c4, c5; |
| @@ -1415,6 +1480,13 @@ encode_coding_utf_8 (coding) | |||
| 1415 | int produced_chars = 0; | 1480 | int produced_chars = 0; |
| 1416 | int c; | 1481 | int c; |
| 1417 | 1482 | ||
| 1483 | if (CODING_UTF_8_BOM (coding) == utf_with_bom) | ||
| 1484 | { | ||
| 1485 | ASSURE_DESTINATION (3); | ||
| 1486 | EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3); | ||
| 1487 | CODING_UTF_8_BOM (coding) = utf_without_bom; | ||
| 1488 | } | ||
| 1489 | |||
| 1418 | if (multibytep) | 1490 | if (multibytep) |
| 1419 | { | 1491 | { |
| 1420 | int safe_room = MAX_MULTIBYTE_LENGTH * 2; | 1492 | int safe_room = MAX_MULTIBYTE_LENGTH * 2; |
| @@ -1566,7 +1638,7 @@ decode_coding_utf_16 (coding) | |||
| 1566 | int *charbuf_end = coding->charbuf + coding->charbuf_size; | 1638 | int *charbuf_end = coding->charbuf + coding->charbuf_size; |
| 1567 | int consumed_chars = 0, consumed_chars_base; | 1639 | int consumed_chars = 0, consumed_chars_base; |
| 1568 | int multibytep = coding->src_multibyte; | 1640 | int multibytep = coding->src_multibyte; |
| 1569 | enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding); | 1641 | enum utf_bom_type bom = CODING_UTF_16_BOM (coding); |
| 1570 | enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding); | 1642 | enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding); |
| 1571 | int surrogate = CODING_UTF_16_SURROGATE (coding); | 1643 | int surrogate = CODING_UTF_16_SURROGATE (coding); |
| 1572 | Lisp_Object attr, charset_list; | 1644 | Lisp_Object attr, charset_list; |
| @@ -1575,7 +1647,7 @@ decode_coding_utf_16 (coding) | |||
| 1575 | 1647 | ||
| 1576 | CODING_GET_INFO (coding, attr, charset_list); | 1648 | CODING_GET_INFO (coding, attr, charset_list); |
| 1577 | 1649 | ||
| 1578 | if (bom == utf_16_with_bom) | 1650 | if (bom == utf_with_bom) |
| 1579 | { | 1651 | { |
| 1580 | int c, c1, c2; | 1652 | int c, c1, c2; |
| 1581 | 1653 | ||
| @@ -1592,13 +1664,13 @@ decode_coding_utf_16 (coding) | |||
| 1592 | src = src_base; | 1664 | src = src_base; |
| 1593 | coding->errors++; | 1665 | coding->errors++; |
| 1594 | } | 1666 | } |
| 1595 | CODING_UTF_16_BOM (coding) = utf_16_without_bom; | 1667 | CODING_UTF_16_BOM (coding) = utf_without_bom; |
| 1596 | } | 1668 | } |
| 1597 | else if (bom == utf_16_detect_bom) | 1669 | else if (bom == utf_detect_bom) |
| 1598 | { | 1670 | { |
| 1599 | /* We have already tried to detect BOM and failed in | 1671 | /* We have already tried to detect BOM and failed in |
| 1600 | detect_coding. */ | 1672 | detect_coding. */ |
| 1601 | CODING_UTF_16_BOM (coding) = utf_16_without_bom; | 1673 | CODING_UTF_16_BOM (coding) = utf_without_bom; |
| 1602 | } | 1674 | } |
| 1603 | 1675 | ||
| 1604 | while (1) | 1676 | while (1) |
| @@ -1688,7 +1760,7 @@ encode_coding_utf_16 (coding) | |||
| 1688 | unsigned char *dst = coding->destination + coding->produced; | 1760 | unsigned char *dst = coding->destination + coding->produced; |
| 1689 | unsigned char *dst_end = coding->destination + coding->dst_bytes; | 1761 | unsigned char *dst_end = coding->destination + coding->dst_bytes; |
| 1690 | int safe_room = 8; | 1762 | int safe_room = 8; |
| 1691 | enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding); | 1763 | enum utf_bom_type bom = CODING_UTF_16_BOM (coding); |
| 1692 | int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian; | 1764 | int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian; |
| 1693 | int produced_chars = 0; | 1765 | int produced_chars = 0; |
| 1694 | Lisp_Object attrs, charset_list; | 1766 | Lisp_Object attrs, charset_list; |
| @@ -1696,14 +1768,14 @@ encode_coding_utf_16 (coding) | |||
| 1696 | 1768 | ||
| 1697 | CODING_GET_INFO (coding, attrs, charset_list); | 1769 | CODING_GET_INFO (coding, attrs, charset_list); |
| 1698 | 1770 | ||
| 1699 | if (bom != utf_16_without_bom) | 1771 | if (bom != utf_without_bom) |
| 1700 | { | 1772 | { |
| 1701 | ASSURE_DESTINATION (safe_room); | 1773 | ASSURE_DESTINATION (safe_room); |
| 1702 | if (big_endian) | 1774 | if (big_endian) |
| 1703 | EMIT_TWO_BYTES (0xFE, 0xFF); | 1775 | EMIT_TWO_BYTES (0xFE, 0xFF); |
| 1704 | else | 1776 | else |
| 1705 | EMIT_TWO_BYTES (0xFF, 0xFE); | 1777 | EMIT_TWO_BYTES (0xFF, 0xFE); |
| 1706 | CODING_UTF_16_BOM (coding) = utf_16_without_bom; | 1778 | CODING_UTF_16_BOM (coding) = utf_without_bom; |
| 1707 | } | 1779 | } |
| 1708 | 1780 | ||
| 1709 | while (charbuf < charbuf_end) | 1781 | while (charbuf < charbuf_end) |
| @@ -5272,18 +5344,24 @@ setup_coding_system (coding_system, coding) | |||
| 5272 | } | 5344 | } |
| 5273 | else if (EQ (coding_type, Qutf_8)) | 5345 | else if (EQ (coding_type, Qutf_8)) |
| 5274 | { | 5346 | { |
| 5347 | val = AREF (attrs, coding_attr_utf_bom); | ||
| 5348 | CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom | ||
| 5349 | : EQ (val, Qt) ? utf_with_bom | ||
| 5350 | : utf_without_bom); | ||
| 5275 | coding->detector = detect_coding_utf_8; | 5351 | coding->detector = detect_coding_utf_8; |
| 5276 | coding->decoder = decode_coding_utf_8; | 5352 | coding->decoder = decode_coding_utf_8; |
| 5277 | coding->encoder = encode_coding_utf_8; | 5353 | coding->encoder = encode_coding_utf_8; |
| 5278 | coding->common_flags | 5354 | coding->common_flags |
| 5279 | |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK); | 5355 | |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK); |
| 5356 | if (CODING_UTF_8_BOM (coding) == utf_detect_bom) | ||
| 5357 | coding->common_flags |= CODING_REQUIRE_DETECTION_MASK; | ||
| 5280 | } | 5358 | } |
| 5281 | else if (EQ (coding_type, Qutf_16)) | 5359 | else if (EQ (coding_type, Qutf_16)) |
| 5282 | { | 5360 | { |
| 5283 | val = AREF (attrs, coding_attr_utf_16_bom); | 5361 | val = AREF (attrs, coding_attr_utf_bom); |
| 5284 | CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom | 5362 | CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom |
| 5285 | : EQ (val, Qt) ? utf_16_with_bom | 5363 | : EQ (val, Qt) ? utf_with_bom |
| 5286 | : utf_16_without_bom); | 5364 | : utf_without_bom); |
| 5287 | val = AREF (attrs, coding_attr_utf_16_endian); | 5365 | val = AREF (attrs, coding_attr_utf_16_endian); |
| 5288 | CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian | 5366 | CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian |
| 5289 | : utf_16_little_endian); | 5367 | : utf_16_little_endian); |
| @@ -5293,7 +5371,7 @@ setup_coding_system (coding_system, coding) | |||
| 5293 | coding->encoder = encode_coding_utf_16; | 5371 | coding->encoder = encode_coding_utf_16; |
| 5294 | coding->common_flags | 5372 | coding->common_flags |
| 5295 | |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK); | 5373 | |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK); |
| 5296 | if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom) | 5374 | if (CODING_UTF_16_BOM (coding) == utf_detect_bom) |
| 5297 | coding->common_flags |= CODING_REQUIRE_DETECTION_MASK; | 5375 | coding->common_flags |= CODING_REQUIRE_DETECTION_MASK; |
| 5298 | } | 5376 | } |
| 5299 | else if (EQ (coding_type, Qccl)) | 5377 | else if (EQ (coding_type, Qccl)) |
| @@ -5828,14 +5906,34 @@ detect_coding (coding) | |||
| 5828 | } | 5906 | } |
| 5829 | } | 5907 | } |
| 5830 | else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id))) | 5908 | else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id))) |
| 5909 | == coding_category_utf_8_auto) | ||
| 5910 | { | ||
| 5911 | Lisp_Object coding_systems; | ||
| 5912 | struct coding_detection_info detect_info; | ||
| 5913 | |||
| 5914 | coding_systems | ||
| 5915 | = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom); | ||
| 5916 | detect_info.found = detect_info.rejected = 0; | ||
| 5917 | coding->head_ascii = 0; | ||
| 5918 | if (CONSP (coding_systems) | ||
| 5919 | && detect_coding_utf_8 (coding, &detect_info)) | ||
| 5920 | { | ||
| 5921 | if (detect_info.found & CATEGORY_MASK_UTF_8_SIG) | ||
| 5922 | setup_coding_system (XCAR (coding_systems), coding); | ||
| 5923 | else | ||
| 5924 | setup_coding_system (XCDR (coding_systems), coding); | ||
| 5925 | } | ||
| 5926 | } | ||
| 5927 | else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id))) | ||
| 5831 | == coding_category_utf_16_auto) | 5928 | == coding_category_utf_16_auto) |
| 5832 | { | 5929 | { |
| 5833 | Lisp_Object coding_systems; | 5930 | Lisp_Object coding_systems; |
| 5834 | struct coding_detection_info detect_info; | 5931 | struct coding_detection_info detect_info; |
| 5835 | 5932 | ||
| 5836 | coding_systems | 5933 | coding_systems |
| 5837 | = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom); | 5934 | = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom); |
| 5838 | detect_info.found = detect_info.rejected = 0; | 5935 | detect_info.found = detect_info.rejected = 0; |
| 5936 | coding->head_ascii = 0; | ||
| 5839 | if (CONSP (coding_systems) | 5937 | if (CONSP (coding_systems) |
| 5840 | && detect_coding_utf_16 (coding, &detect_info)) | 5938 | && detect_coding_utf_16 (coding, &detect_info)) |
| 5841 | { | 5939 | { |
| @@ -7724,6 +7822,19 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep, | |||
| 7724 | detect_info.found |= found; | 7822 | detect_info.found |= found; |
| 7725 | } | 7823 | } |
| 7726 | } | 7824 | } |
| 7825 | else if (base_category == coding_category_utf_8_auto) | ||
| 7826 | { | ||
| 7827 | if (detect_coding_utf_8 (&coding, &detect_info)) | ||
| 7828 | { | ||
| 7829 | struct coding_system *this; | ||
| 7830 | |||
| 7831 | if (detect_info.found & CATEGORY_MASK_UTF_8_SIG) | ||
| 7832 | this = coding_categories + coding_category_utf_8_sig; | ||
| 7833 | else | ||
| 7834 | this = coding_categories + coding_category_utf_8_nosig; | ||
| 7835 | val = Fcons (make_number (this->id), Qnil); | ||
| 7836 | } | ||
| 7837 | } | ||
| 7727 | else if (base_category == coding_category_utf_16_auto) | 7838 | else if (base_category == coding_category_utf_16_auto) |
| 7728 | { | 7839 | { |
| 7729 | if (detect_coding_utf_16 (&coding, &detect_info)) | 7840 | if (detect_coding_utf_16 (&coding, &detect_info)) |
| @@ -9154,7 +9265,7 @@ usage: (define-coding-system-internal ...) */) | |||
| 9154 | val = XCDR (bom); | 9265 | val = XCDR (bom); |
| 9155 | CHECK_CODING_SYSTEM (val); | 9266 | CHECK_CODING_SYSTEM (val); |
| 9156 | } | 9267 | } |
| 9157 | ASET (attrs, coding_attr_utf_16_bom, bom); | 9268 | ASET (attrs, coding_attr_utf_bom, bom); |
| 9158 | 9269 | ||
| 9159 | endian = args[coding_arg_utf16_endian]; | 9270 | endian = args[coding_arg_utf16_endian]; |
| 9160 | CHECK_SYMBOL (endian); | 9271 | CHECK_SYMBOL (endian); |
| @@ -9333,8 +9444,27 @@ usage: (define-coding-system-internal ...) */) | |||
| 9333 | } | 9444 | } |
| 9334 | else if (EQ (coding_type, Qutf_8)) | 9445 | else if (EQ (coding_type, Qutf_8)) |
| 9335 | { | 9446 | { |
| 9336 | category = coding_category_utf_8; | 9447 | Lisp_Object bom; |
| 9448 | |||
| 9337 | CODING_ATTR_ASCII_COMPAT (attrs) = Qt; | 9449 | CODING_ATTR_ASCII_COMPAT (attrs) = Qt; |
| 9450 | |||
| 9451 | if (nargs < coding_arg_utf8_max) | ||
| 9452 | goto short_args; | ||
| 9453 | |||
| 9454 | bom = args[coding_arg_utf8_bom]; | ||
| 9455 | if (! NILP (bom) && ! EQ (bom, Qt)) | ||
| 9456 | { | ||
| 9457 | CHECK_CONS (bom); | ||
| 9458 | val = XCAR (bom); | ||
| 9459 | CHECK_CODING_SYSTEM (val); | ||
| 9460 | val = XCDR (bom); | ||
| 9461 | CHECK_CODING_SYSTEM (val); | ||
| 9462 | } | ||
| 9463 | ASET (attrs, coding_attr_utf_bom, bom); | ||
| 9464 | |||
| 9465 | category = (CONSP (bom) ? coding_category_utf_8_auto | ||
| 9466 | : NILP (bom) ? coding_category_utf_8_nosig | ||
| 9467 | : coding_category_utf_8_sig); | ||
| 9338 | } | 9468 | } |
| 9339 | else if (EQ (coding_type, Qundecided)) | 9469 | else if (EQ (coding_type, Qundecided)) |
| 9340 | category = coding_category_undecided; | 9470 | category = coding_category_undecided; |
| @@ -9755,8 +9885,12 @@ syms_of_coding () | |||
| 9755 | intern ("coding-category-iso-7-else")); | 9885 | intern ("coding-category-iso-7-else")); |
| 9756 | ASET (Vcoding_category_table, coding_category_iso_8_else, | 9886 | ASET (Vcoding_category_table, coding_category_iso_8_else, |
| 9757 | intern ("coding-category-iso-8-else")); | 9887 | intern ("coding-category-iso-8-else")); |
| 9758 | ASET (Vcoding_category_table, coding_category_utf_8, | 9888 | ASET (Vcoding_category_table, coding_category_utf_8_auto, |
| 9889 | intern ("coding-category-utf-8-auto")); | ||
| 9890 | ASET (Vcoding_category_table, coding_category_utf_8_nosig, | ||
| 9759 | intern ("coding-category-utf-8")); | 9891 | intern ("coding-category-utf-8")); |
| 9892 | ASET (Vcoding_category_table, coding_category_utf_8_sig, | ||
| 9893 | intern ("coding-category-utf-8-sig")); | ||
| 9760 | ASET (Vcoding_category_table, coding_category_utf_16_be, | 9894 | ASET (Vcoding_category_table, coding_category_utf_16_be, |
| 9761 | intern ("coding-category-utf-16-be")); | 9895 | intern ("coding-category-utf-16-be")); |
| 9762 | ASET (Vcoding_category_table, coding_category_utf_16_auto, | 9896 | ASET (Vcoding_category_table, coding_category_utf_16_auto, |