diff options
Diffstat (limited to 'src/coding.c')
| -rw-r--r-- | src/coding.c | 313 |
1 files changed, 205 insertions, 108 deletions
diff --git a/src/coding.c b/src/coding.c index 2fe14444f33..10443090eff 100644 --- a/src/coding.c +++ b/src/coding.c | |||
| @@ -625,6 +625,7 @@ enum coding_category | |||
| 625 | | CATEGORY_MASK_ISO_7_ELSE \ | 625 | | CATEGORY_MASK_ISO_7_ELSE \ |
| 626 | | CATEGORY_MASK_ISO_8_ELSE \ | 626 | | CATEGORY_MASK_ISO_8_ELSE \ |
| 627 | | CATEGORY_MASK_UTF_8 \ | 627 | | CATEGORY_MASK_UTF_8 \ |
| 628 | | CATEGORY_MASK_UTF_16_AUTO \ | ||
| 628 | | CATEGORY_MASK_UTF_16_BE \ | 629 | | CATEGORY_MASK_UTF_16_BE \ |
| 629 | | CATEGORY_MASK_UTF_16_LE \ | 630 | | CATEGORY_MASK_UTF_16_LE \ |
| 630 | | CATEGORY_MASK_UTF_16_BE_NOSIG \ | 631 | | CATEGORY_MASK_UTF_16_BE_NOSIG \ |
| @@ -657,7 +658,8 @@ enum coding_category | |||
| 657 | | CATEGORY_MASK_ISO_ELSE) | 658 | | CATEGORY_MASK_ISO_ELSE) |
| 658 | 659 | ||
| 659 | #define CATEGORY_MASK_UTF_16 \ | 660 | #define CATEGORY_MASK_UTF_16 \ |
| 660 | (CATEGORY_MASK_UTF_16_BE \ | 661 | (CATEGORY_MASK_UTF_16_AUTO \ |
| 662 | | CATEGORY_MASK_UTF_16_BE \ | ||
| 661 | | CATEGORY_MASK_UTF_16_LE \ | 663 | | CATEGORY_MASK_UTF_16_LE \ |
| 662 | | CATEGORY_MASK_UTF_16_BE_NOSIG \ | 664 | | CATEGORY_MASK_UTF_16_BE_NOSIG \ |
| 663 | | CATEGORY_MASK_UTF_16_LE_NOSIG) | 665 | | CATEGORY_MASK_UTF_16_LE_NOSIG) |
| @@ -1513,11 +1515,44 @@ detect_coding_utf_16 (coding, detect_info) | |||
| 1513 | | CATEGORY_MASK_UTF_16_BE_NOSIG | 1515 | | CATEGORY_MASK_UTF_16_BE_NOSIG |
| 1514 | | CATEGORY_MASK_UTF_16_LE_NOSIG); | 1516 | | CATEGORY_MASK_UTF_16_LE_NOSIG); |
| 1515 | } | 1517 | } |
| 1516 | else if (c1 >= 0 && c2 >= 0) | 1518 | else |
| 1517 | { | 1519 | { |
| 1520 | /* We check the dispersion of Eth and Oth bytes where E is even and | ||
| 1521 | O is odd. If both are high, we assume binary data. */ | |
| 1522 | unsigned char e[256], o[256]; | ||
| 1523 | unsigned e_num = 1, o_num = 1; | ||
| 1524 | |||
| 1525 | memset (e, 0, 256); | ||
| 1526 | memset (o, 0, 256); | ||
| 1527 | e[c1] = 1; | ||
| 1528 | o[c2] = 1; | ||
| 1529 | |||
| 1518 | detect_info->rejected | 1530 | detect_info->rejected |
| 1519 | |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE); | 1531 | |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE); |
| 1532 | |||
| 1533 | while (1) | ||
| 1534 | { | ||
| 1535 | ONE_MORE_BYTE (c1); | ||
| 1536 | ONE_MORE_BYTE (c2); | ||
| 1537 | if (! e[c1]) | ||
| 1538 | { | ||
| 1539 | e[c1] = 1; | ||
| 1540 | e_num++; | ||
| 1541 | if (e_num >= 128) | ||
| 1542 | break; | ||
| 1543 | } | ||
| 1544 | if (! o[c2]) | ||
| 1545 | { | ||
| 1546 | o[c1] = 1; | ||
| 1547 | o_num++; | ||
| 1548 | if (o_num >= 128) | ||
| 1549 | break; | ||
| 1550 | } | ||
| 1551 | } | ||
| 1552 | detect_info->rejected |= CATEGORY_MASK_UTF_16; | ||
| 1553 | return 0; | ||
| 1520 | } | 1554 | } |
| 1555 | |||
| 1521 | no_more_source: | 1556 | no_more_source: |
| 1522 | return 1; | 1557 | return 1; |
| 1523 | } | 1558 | } |
| @@ -5677,32 +5712,53 @@ detect_coding (coding) | |||
| 5677 | { | 5712 | { |
| 5678 | int c, i; | 5713 | int c, i; |
| 5679 | struct coding_detection_info detect_info; | 5714 | struct coding_detection_info detect_info; |
| 5715 | int null_byte_found = 0, eight_bit_found = 0; | ||
| 5680 | 5716 | ||
| 5681 | detect_info.checked = detect_info.found = detect_info.rejected = 0; | 5717 | detect_info.checked = detect_info.found = detect_info.rejected = 0; |
| 5682 | for (i = 0, src = coding->source; src < src_end; i++, src++) | 5718 | coding->head_ascii = -1; |
| 5719 | for (src = coding->source; src < src_end; src++) | ||
| 5683 | { | 5720 | { |
| 5684 | c = *src; | 5721 | c = *src; |
| 5685 | if (c & 0x80) | 5722 | if (c & 0x80) |
| 5686 | break; | ||
| 5687 | if (c < 0x20 | ||
| 5688 | && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) | ||
| 5689 | && ! inhibit_iso_escape_detection | ||
| 5690 | && ! detect_info.checked) | ||
| 5691 | { | 5723 | { |
| 5692 | coding->head_ascii = src - (coding->source + coding->consumed); | 5724 | eight_bit_found = 1; |
| 5693 | if (detect_coding_iso_2022 (coding, &detect_info)) | 5725 | if (coding->head_ascii < 0) |
| 5726 | coding->head_ascii = src - coding->source; | ||
| 5727 | if (null_byte_found) | ||
| 5728 | break; | ||
| 5729 | } | ||
| 5730 | else if (c < 0x20) | ||
| 5731 | { | ||
| 5732 | if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) | ||
| 5733 | && ! inhibit_iso_escape_detection | ||
| 5734 | && ! detect_info.checked) | ||
| 5694 | { | 5735 | { |
| 5695 | /* We have scanned the whole data. */ | 5736 | if (coding->head_ascii < 0) |
| 5696 | if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE)) | 5737 | coding->head_ascii = src - coding->source; |
| 5697 | /* We didn't find an 8-bit code. */ | 5738 | if (detect_coding_iso_2022 (coding, &detect_info)) |
| 5698 | src = src_end; | 5739 | { |
| 5699 | break; | 5740 | /* We have scanned the whole data. */ |
| 5741 | if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE)) | ||
| 5742 | /* We didn't find an 8-bit code. We may have | ||
| 5743 | found a null-byte, but it's very rare that | ||
| 5744 | a binary file conforms to ISO-2022. */ | |
| 5745 | src = src_end; | ||
| 5746 | break; | ||
| 5747 | } | ||
| 5748 | } | ||
| 5749 | else if (! c) | ||
| 5750 | { | ||
| 5751 | null_byte_found = 1; | ||
| 5752 | if (eight_bit_found) | ||
| 5753 | break; | ||
| 5700 | } | 5754 | } |
| 5701 | } | 5755 | } |
| 5702 | } | 5756 | } |
| 5703 | coding->head_ascii = src - (coding->source + coding->consumed); | 5757 | if (coding->head_ascii < 0) |
| 5758 | coding->head_ascii = src - coding->source; | ||
| 5704 | 5759 | ||
| 5705 | if (coding->head_ascii < coding->src_bytes | 5760 | if (null_byte_found || eight_bit_found |
| 5761 | || coding->head_ascii < coding->src_bytes | ||
| 5706 | || detect_info.found) | 5762 | || detect_info.found) |
| 5707 | { | 5763 | { |
| 5708 | enum coding_category category; | 5764 | enum coding_category category; |
| @@ -5718,48 +5774,58 @@ detect_coding (coding) | |||
| 5718 | break; | 5774 | break; |
| 5719 | } | 5775 | } |
| 5720 | else | 5776 | else |
| 5721 | for (i = 0; i < coding_category_raw_text; i++) | 5777 | { |
| 5722 | { | 5778 | if (null_byte_found) |
| 5723 | category = coding_priorities[i]; | ||
| 5724 | this = coding_categories + category; | ||
| 5725 | if (this->id < 0) | ||
| 5726 | { | ||
| 5727 | /* No coding system of this category is defined. */ | ||
| 5728 | detect_info.rejected |= (1 << category); | ||
| 5729 | } | ||
| 5730 | else if (category >= coding_category_raw_text) | ||
| 5731 | continue; | ||
| 5732 | else if (detect_info.checked & (1 << category)) | ||
| 5733 | { | ||
| 5734 | if (detect_info.found & (1 << category)) | ||
| 5735 | break; | ||
| 5736 | } | ||
| 5737 | else if ((*(this->detector)) (coding, &detect_info) | ||
| 5738 | && detect_info.found & (1 << category)) | ||
| 5739 | { | ||
| 5740 | if (category == coding_category_utf_16_auto) | ||
| 5741 | { | ||
| 5742 | if (detect_info.found & CATEGORY_MASK_UTF_16_LE) | ||
| 5743 | category = coding_category_utf_16_le; | ||
| 5744 | else | ||
| 5745 | category = coding_category_utf_16_be; | ||
| 5746 | } | ||
| 5747 | break; | ||
| 5748 | } | ||
| 5749 | } | ||
| 5750 | |||
| 5751 | if (i < coding_category_raw_text) | ||
| 5752 | setup_coding_system (CODING_ID_NAME (this->id), coding); | ||
| 5753 | else if (detect_info.rejected == CATEGORY_MASK_ANY) | ||
| 5754 | setup_coding_system (Qraw_text, coding); | ||
| 5755 | else if (detect_info.rejected) | ||
| 5756 | for (i = 0; i < coding_category_raw_text; i++) | ||
| 5757 | if (! (detect_info.rejected & (1 << coding_priorities[i]))) | ||
| 5758 | { | 5779 | { |
| 5759 | this = coding_categories + coding_priorities[i]; | 5780 | detect_info.checked |= ~CATEGORY_MASK_UTF_16; |
| 5760 | setup_coding_system (CODING_ID_NAME (this->id), coding); | 5781 | detect_info.rejected |= ~CATEGORY_MASK_UTF_16; |
| 5761 | break; | ||
| 5762 | } | 5782 | } |
| 5783 | for (i = 0; i < coding_category_raw_text; i++) | ||
| 5784 | { | ||
| 5785 | category = coding_priorities[i]; | ||
| 5786 | this = coding_categories + category; | ||
| 5787 | if (this->id < 0) | ||
| 5788 | { | ||
| 5789 | /* No coding system of this category is defined. */ | ||
| 5790 | detect_info.rejected |= (1 << category); | ||
| 5791 | } | ||
| 5792 | else if (category >= coding_category_raw_text) | ||
| 5793 | continue; | ||
| 5794 | else if (detect_info.checked & (1 << category)) | ||
| 5795 | { | ||
| 5796 | if (detect_info.found & (1 << category)) | ||
| 5797 | break; | ||
| 5798 | } | ||
| 5799 | else if ((*(this->detector)) (coding, &detect_info) | ||
| 5800 | && detect_info.found & (1 << category)) | ||
| 5801 | { | ||
| 5802 | if (category == coding_category_utf_16_auto) | ||
| 5803 | { | ||
| 5804 | if (detect_info.found & CATEGORY_MASK_UTF_16_LE) | ||
| 5805 | category = coding_category_utf_16_le; | ||
| 5806 | else | ||
| 5807 | category = coding_category_utf_16_be; | ||
| 5808 | } | ||
| 5809 | break; | ||
| 5810 | } | ||
| 5811 | } | ||
| 5812 | |||
| 5813 | if (i < coding_category_raw_text) | ||
| 5814 | setup_coding_system (CODING_ID_NAME (this->id), coding); | ||
| 5815 | else if (null_byte_found) | ||
| 5816 | setup_coding_system (Qno_conversion, coding); | ||
| 5817 | else if ((detect_info.rejected & CATEGORY_MASK_ANY) | ||
| 5818 | == CATEGORY_MASK_ANY) | ||
| 5819 | setup_coding_system (Qraw_text, coding); | ||
| 5820 | else if (detect_info.rejected) | ||
| 5821 | for (i = 0; i < coding_category_raw_text; i++) | ||
| 5822 | if (! (detect_info.rejected & (1 << coding_priorities[i]))) | ||
| 5823 | { | ||
| 5824 | this = coding_categories + coding_priorities[i]; | ||
| 5825 | setup_coding_system (CODING_ID_NAME (this->id), coding); | ||
| 5826 | break; | ||
| 5827 | } | ||
| 5828 | } | ||
| 5763 | } | 5829 | } |
| 5764 | } | 5830 | } |
| 5765 | else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id))) | 5831 | else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id))) |
| @@ -7472,6 +7538,7 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep, | |||
| 7472 | int id; | 7538 | int id; |
| 7473 | struct coding_detection_info detect_info; | 7539 | struct coding_detection_info detect_info; |
| 7474 | enum coding_category base_category; | 7540 | enum coding_category base_category; |
| 7541 | int null_byte_found = 0, eight_bit_found = 0; | ||
| 7475 | 7542 | ||
| 7476 | if (NILP (coding_system)) | 7543 | if (NILP (coding_system)) |
| 7477 | coding_system = Qundecided; | 7544 | coding_system = Qundecided; |
| @@ -7497,33 +7564,54 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep, | |||
| 7497 | struct coding_system *this; | 7564 | struct coding_system *this; |
| 7498 | int c, i; | 7565 | int c, i; |
| 7499 | 7566 | ||
| 7567 | coding.head_ascii = -1; | ||
| 7500 | /* Skip all ASCII bytes except for a few ISO2022 controls. */ | 7568 | /* Skip all ASCII bytes except for a few ISO2022 controls. */ |
| 7501 | for (i = 0; src < src_end; i++, src++) | 7569 | for (; src < src_end; src++) |
| 7502 | { | 7570 | { |
| 7503 | c = *src; | 7571 | c = *src; |
| 7504 | if (c & 0x80) | 7572 | if (c & 0x80) |
| 7505 | break; | ||
| 7506 | if (c < 0x20 | ||
| 7507 | && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) | ||
| 7508 | && ! inhibit_iso_escape_detection) | ||
| 7509 | { | 7573 | { |
| 7510 | coding.head_ascii = src - coding.source; | 7574 | eight_bit_found = 1; |
| 7511 | if (detect_coding_iso_2022 (&coding, &detect_info)) | 7575 | if (coding.head_ascii < 0) |
| 7576 | coding.head_ascii = src - coding.source; | ||
| 7577 | if (null_byte_found) | ||
| 7578 | break; | ||
| 7579 | } | ||
| 7580 | if (c < 0x20) | ||
| 7581 | { | ||
| 7582 | if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) | ||
| 7583 | && ! inhibit_iso_escape_detection | ||
| 7584 | && ! detect_info.checked) | ||
| 7512 | { | 7585 | { |
| 7513 | /* We have scanned the whole data. */ | 7586 | if (coding.head_ascii < 0) |
| 7514 | if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE)) | 7587 | coding.head_ascii = src - coding.source; |
| 7515 | /* We didn't find an 8-bit code. */ | 7588 | if (detect_coding_iso_2022 (&coding, &detect_info)) |
| 7516 | src = src_end; | 7589 | { |
| 7517 | break; | 7590 | /* We have scanned the whole data. */ |
| 7591 | if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE)) | ||
| 7592 | /* We didn't find an 8-bit code. We may have | ||
| 7593 | found a null-byte, but it's very rare that | ||
| 7594 | a binary file conforms to ISO-2022. */ | |
| 7595 | src = src_end; | ||
| 7596 | break; | ||
| 7597 | } | ||
| 7598 | } | ||
| 7599 | else if (! c) | ||
| 7600 | { | ||
| 7601 | null_byte_found = 1; | ||
| 7602 | if (eight_bit_found) | ||
| 7603 | break; | ||
| 7518 | } | 7604 | } |
| 7519 | } | 7605 | } |
| 7520 | } | 7606 | } |
| 7521 | coding.head_ascii = src - coding.source; | 7607 | if (coding.head_ascii < 0) |
| 7608 | coding.head_ascii = src - coding.source; | ||
| 7522 | 7609 | ||
| 7523 | if (src < src_end | 7610 | if (null_byte_found || eight_bit_found |
| 7611 | || coding.head_ascii < coding.src_bytes | ||
| 7524 | || detect_info.found) | 7612 | || detect_info.found) |
| 7525 | { | 7613 | { |
| 7526 | if (src == src_end) | 7614 | if (coding.head_ascii == coding.src_bytes) |
| 7527 | /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */ | 7615 | /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */ |
| 7528 | for (i = 0; i < coding_category_raw_text; i++) | 7616 | for (i = 0; i < coding_category_raw_text; i++) |
| 7529 | { | 7617 | { |
| @@ -7533,44 +7621,48 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep, | |||
| 7533 | break; | 7621 | break; |
| 7534 | } | 7622 | } |
| 7535 | else | 7623 | else |
| 7536 | for (i = 0; i < coding_category_raw_text; i++) | 7624 | { |
| 7537 | { | 7625 | if (null_byte_found) |
| 7538 | category = coding_priorities[i]; | 7626 | { |
| 7539 | this = coding_categories + category; | 7627 | detect_info.checked |= ~CATEGORY_MASK_UTF_16; |
| 7628 | detect_info.rejected |= ~CATEGORY_MASK_UTF_16; | ||
| 7629 | } | ||
| 7630 | for (i = 0; i < coding_category_raw_text; i++) | ||
| 7631 | { | ||
| 7632 | category = coding_priorities[i]; | ||
| 7633 | this = coding_categories + category; | ||
| 7540 | 7634 | ||
| 7541 | if (this->id < 0) | 7635 | if (this->id < 0) |
| 7542 | { | 7636 | { |
| 7543 | /* No coding system of this category is defined. */ | 7637 | /* No coding system of this category is defined. */ |
| 7544 | detect_info.rejected |= (1 << category); | 7638 | detect_info.rejected |= (1 << category); |
| 7545 | } | 7639 | } |
| 7546 | else if (category >= coding_category_raw_text) | 7640 | else if (category >= coding_category_raw_text) |
| 7547 | continue; | 7641 | continue; |
| 7548 | else if (detect_info.checked & (1 << category)) | 7642 | else if (detect_info.checked & (1 << category)) |
| 7549 | { | 7643 | { |
| 7550 | if (highest | 7644 | if (highest |
| 7551 | && (detect_info.found & (1 << category))) | 7645 | && (detect_info.found & (1 << category))) |
| 7552 | break; | ||
| 7553 | } | ||
| 7554 | else | ||
| 7555 | { | ||
| 7556 | if ((*(this->detector)) (&coding, &detect_info) | ||
| 7557 | && highest | ||
| 7558 | && (detect_info.found & (1 << category))) | ||
| 7559 | { | ||
| 7560 | if (category == coding_category_utf_16_auto) | ||
| 7561 | { | ||
| 7562 | if (detect_info.found & CATEGORY_MASK_UTF_16_LE) | ||
| 7563 | category = coding_category_utf_16_le; | ||
| 7564 | else | ||
| 7565 | category = coding_category_utf_16_be; | ||
| 7566 | } | ||
| 7567 | break; | 7646 | break; |
| 7568 | } | 7647 | } |
| 7569 | } | 7648 | else if ((*(this->detector)) (&coding, &detect_info) |
| 7570 | } | 7649 | && highest |
| 7650 | && (detect_info.found & (1 << category))) | ||
| 7651 | { | ||
| 7652 | if (category == coding_category_utf_16_auto) | ||
| 7653 | { | ||
| 7654 | if (detect_info.found & CATEGORY_MASK_UTF_16_LE) | ||
| 7655 | category = coding_category_utf_16_le; | ||
| 7656 | else | ||
| 7657 | category = coding_category_utf_16_be; | ||
| 7658 | } | ||
| 7659 | break; | ||
| 7660 | } | ||
| 7661 | } | ||
| 7662 | } | ||
| 7571 | } | 7663 | } |
| 7572 | 7664 | ||
| 7573 | if (detect_info.rejected == CATEGORY_MASK_ANY) | 7665 | if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY) |
| 7574 | { | 7666 | { |
| 7575 | detect_info.found = CATEGORY_MASK_RAW_TEXT; | 7667 | detect_info.found = CATEGORY_MASK_RAW_TEXT; |
| 7576 | id = coding_categories[coding_category_raw_text].id; | 7668 | id = coding_categories[coding_category_raw_text].id; |
| @@ -7659,8 +7751,13 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep, | |||
| 7659 | if (VECTORP (eol_type)) | 7751 | if (VECTORP (eol_type)) |
| 7660 | { | 7752 | { |
| 7661 | if (detect_info.found & ~CATEGORY_MASK_UTF_16) | 7753 | if (detect_info.found & ~CATEGORY_MASK_UTF_16) |
| 7662 | normal_eol = detect_eol (coding.source, src_bytes, | 7754 | { |
| 7663 | coding_category_raw_text); | 7755 | if (null_byte_found) |
| 7756 | normal_eol = EOL_SEEN_LF; | ||
| 7757 | else | ||
| 7758 | normal_eol = detect_eol (coding.source, src_bytes, | ||
| 7759 | coding_category_raw_text); | ||
| 7760 | } | ||
| 7664 | if (detect_info.found & (CATEGORY_MASK_UTF_16_BE | 7761 | if (detect_info.found & (CATEGORY_MASK_UTF_16_BE |
| 7665 | | CATEGORY_MASK_UTF_16_BE_NOSIG)) | 7762 | | CATEGORY_MASK_UTF_16_BE_NOSIG)) |
| 7666 | utf_16_be_eol = detect_eol (coding.source, src_bytes, | 7763 | utf_16_be_eol = detect_eol (coding.source, src_bytes, |