aboutsummaryrefslogtreecommitdiffstats
path: root/src/coding.c
diff options
context:
space:
mode:
author: Kenichi Handa, 2008-04-03 12:31:27 +0000
committer: Kenichi Handa, 2008-04-03 12:31:27 +0000
commit: 2f3cbb326b9519d3ae4ef8049ec6f2bb5b99e028 (patch)
tree: 3f3f5d525e474e9403dd99c07cdfbf590f3a23ee /src/coding.c
parent: 164ce7fab306e1d10e8be4a8d192adc0a99bfc67 (diff)
download: emacs-2f3cbb326b9519d3ae4ef8049ec6f2bb5b99e028.tar.gz
emacs-2f3cbb326b9519d3ae4ef8049ec6f2bb5b99e028.zip
(CATEGORY_MASK_ANY): Add CATEGORY_MASK_UTF_16_AUTO.
(CATEGORY_MASK_UTF_16): Likewise. (detect_coding_utf_16): Add heuristics to reject utf-16 for a binary file. (detect_coding): Add null-byte detection for a binary file. (detect_coding_system): Likewise.
Diffstat (limited to 'src/coding.c')
-rw-r--r-- src/coding.c 313
1 file changed, 205 insertions, 108 deletions
diff --git a/src/coding.c b/src/coding.c
index 2fe14444f33..10443090eff 100644
--- a/src/coding.c
+++ b/src/coding.c
@@ -625,6 +625,7 @@ enum coding_category
625 | CATEGORY_MASK_ISO_7_ELSE \ 625 | CATEGORY_MASK_ISO_7_ELSE \
626 | CATEGORY_MASK_ISO_8_ELSE \ 626 | CATEGORY_MASK_ISO_8_ELSE \
627 | CATEGORY_MASK_UTF_8 \ 627 | CATEGORY_MASK_UTF_8 \
628 | CATEGORY_MASK_UTF_16_AUTO \
628 | CATEGORY_MASK_UTF_16_BE \ 629 | CATEGORY_MASK_UTF_16_BE \
629 | CATEGORY_MASK_UTF_16_LE \ 630 | CATEGORY_MASK_UTF_16_LE \
630 | CATEGORY_MASK_UTF_16_BE_NOSIG \ 631 | CATEGORY_MASK_UTF_16_BE_NOSIG \
@@ -657,7 +658,8 @@ enum coding_category
657 | CATEGORY_MASK_ISO_ELSE) 658 | CATEGORY_MASK_ISO_ELSE)
658 659
659#define CATEGORY_MASK_UTF_16 \ 660#define CATEGORY_MASK_UTF_16 \
660 (CATEGORY_MASK_UTF_16_BE \ 661 (CATEGORY_MASK_UTF_16_AUTO \
662 | CATEGORY_MASK_UTF_16_BE \
661 | CATEGORY_MASK_UTF_16_LE \ 663 | CATEGORY_MASK_UTF_16_LE \
662 | CATEGORY_MASK_UTF_16_BE_NOSIG \ 664 | CATEGORY_MASK_UTF_16_BE_NOSIG \
663 | CATEGORY_MASK_UTF_16_LE_NOSIG) 665 | CATEGORY_MASK_UTF_16_LE_NOSIG)
@@ -1513,11 +1515,44 @@ detect_coding_utf_16 (coding, detect_info)
1513 | CATEGORY_MASK_UTF_16_BE_NOSIG 1515 | CATEGORY_MASK_UTF_16_BE_NOSIG
1514 | CATEGORY_MASK_UTF_16_LE_NOSIG); 1516 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1515 } 1517 }
1516 else if (c1 >= 0 && c2 >= 0) 1518 else
1517 { 1519 {
1520 /* We check the dispersion of Eth and Oth bytes where E is even and
1521 O is odd. If both are high, we assume binary data.*/
1522 unsigned char e[256], o[256];
1523 unsigned e_num = 1, o_num = 1;
1524
1525 memset (e, 0, 256);
1526 memset (o, 0, 256);
1527 e[c1] = 1;
1528 o[c2] = 1;
1529
1518 detect_info->rejected 1530 detect_info->rejected
1519 |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE); 1531 |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
1532
1533 while (1)
1534 {
1535 ONE_MORE_BYTE (c1);
1536 ONE_MORE_BYTE (c2);
1537 if (! e[c1])
1538 {
1539 e[c1] = 1;
1540 e_num++;
1541 if (e_num >= 128)
1542 break;
1543 }
1544 if (! o[c2])
1545 {
1546 o[c1] = 1;
1547 o_num++;
1548 if (o_num >= 128)
1549 break;
1550 }
1551 }
1552 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1553 return 0;
1520 } 1554 }
1555
1521 no_more_source: 1556 no_more_source:
1522 return 1; 1557 return 1;
1523} 1558}
@@ -5677,32 +5712,53 @@ detect_coding (coding)
5677 { 5712 {
5678 int c, i; 5713 int c, i;
5679 struct coding_detection_info detect_info; 5714 struct coding_detection_info detect_info;
5715 int null_byte_found = 0, eight_bit_found = 0;
5680 5716
5681 detect_info.checked = detect_info.found = detect_info.rejected = 0; 5717 detect_info.checked = detect_info.found = detect_info.rejected = 0;
5682 for (i = 0, src = coding->source; src < src_end; i++, src++) 5718 coding->head_ascii = -1;
5719 for (src = coding->source; src < src_end; src++)
5683 { 5720 {
5684 c = *src; 5721 c = *src;
5685 if (c & 0x80) 5722 if (c & 0x80)
5686 break;
5687 if (c < 0x20
5688 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
5689 && ! inhibit_iso_escape_detection
5690 && ! detect_info.checked)
5691 { 5723 {
5692 coding->head_ascii = src - (coding->source + coding->consumed); 5724 eight_bit_found = 1;
5693 if (detect_coding_iso_2022 (coding, &detect_info)) 5725 if (coding->head_ascii < 0)
5726 coding->head_ascii = src - coding->source;
5727 if (null_byte_found)
5728 break;
5729 }
5730 else if (c < 0x20)
5731 {
5732 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
5733 && ! inhibit_iso_escape_detection
5734 && ! detect_info.checked)
5694 { 5735 {
5695 /* We have scanned the whole data. */ 5736 if (coding->head_ascii < 0)
5696 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE)) 5737 coding->head_ascii = src - coding->source;
5697 /* We didn't find an 8-bit code. */ 5738 if (detect_coding_iso_2022 (coding, &detect_info))
5698 src = src_end; 5739 {
5699 break; 5740 /* We have scanned the whole data. */
5741 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
5742 /* We didn't find an 8-bit code. We may have
5743 found a null-byte, but it's very rare that
5744 a binary file conforms to ISO-2022. */
5745 src = src_end;
5746 break;
5747 }
5748 }
5749 else if (! c)
5750 {
5751 null_byte_found = 1;
5752 if (eight_bit_found)
5753 break;
5700 } 5754 }
5701 } 5755 }
5702 } 5756 }
5703 coding->head_ascii = src - (coding->source + coding->consumed); 5757 if (coding->head_ascii < 0)
5758 coding->head_ascii = src - coding->source;
5704 5759
5705 if (coding->head_ascii < coding->src_bytes 5760 if (null_byte_found || eight_bit_found
5761 || coding->head_ascii < coding->src_bytes
5706 || detect_info.found) 5762 || detect_info.found)
5707 { 5763 {
5708 enum coding_category category; 5764 enum coding_category category;
@@ -5718,48 +5774,58 @@ detect_coding (coding)
5718 break; 5774 break;
5719 } 5775 }
5720 else 5776 else
5721 for (i = 0; i < coding_category_raw_text; i++) 5777 {
5722 { 5778 if (null_byte_found)
5723 category = coding_priorities[i];
5724 this = coding_categories + category;
5725 if (this->id < 0)
5726 {
5727 /* No coding system of this category is defined. */
5728 detect_info.rejected |= (1 << category);
5729 }
5730 else if (category >= coding_category_raw_text)
5731 continue;
5732 else if (detect_info.checked & (1 << category))
5733 {
5734 if (detect_info.found & (1 << category))
5735 break;
5736 }
5737 else if ((*(this->detector)) (coding, &detect_info)
5738 && detect_info.found & (1 << category))
5739 {
5740 if (category == coding_category_utf_16_auto)
5741 {
5742 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5743 category = coding_category_utf_16_le;
5744 else
5745 category = coding_category_utf_16_be;
5746 }
5747 break;
5748 }
5749 }
5750
5751 if (i < coding_category_raw_text)
5752 setup_coding_system (CODING_ID_NAME (this->id), coding);
5753 else if (detect_info.rejected == CATEGORY_MASK_ANY)
5754 setup_coding_system (Qraw_text, coding);
5755 else if (detect_info.rejected)
5756 for (i = 0; i < coding_category_raw_text; i++)
5757 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5758 { 5779 {
5759 this = coding_categories + coding_priorities[i]; 5780 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
5760 setup_coding_system (CODING_ID_NAME (this->id), coding); 5781 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
5761 break;
5762 } 5782 }
5783 for (i = 0; i < coding_category_raw_text; i++)
5784 {
5785 category = coding_priorities[i];
5786 this = coding_categories + category;
5787 if (this->id < 0)
5788 {
5789 /* No coding system of this category is defined. */
5790 detect_info.rejected |= (1 << category);
5791 }
5792 else if (category >= coding_category_raw_text)
5793 continue;
5794 else if (detect_info.checked & (1 << category))
5795 {
5796 if (detect_info.found & (1 << category))
5797 break;
5798 }
5799 else if ((*(this->detector)) (coding, &detect_info)
5800 && detect_info.found & (1 << category))
5801 {
5802 if (category == coding_category_utf_16_auto)
5803 {
5804 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5805 category = coding_category_utf_16_le;
5806 else
5807 category = coding_category_utf_16_be;
5808 }
5809 break;
5810 }
5811 }
5812
5813 if (i < coding_category_raw_text)
5814 setup_coding_system (CODING_ID_NAME (this->id), coding);
5815 else if (null_byte_found)
5816 setup_coding_system (Qno_conversion, coding);
5817 else if ((detect_info.rejected & CATEGORY_MASK_ANY)
5818 == CATEGORY_MASK_ANY)
5819 setup_coding_system (Qraw_text, coding);
5820 else if (detect_info.rejected)
5821 for (i = 0; i < coding_category_raw_text; i++)
5822 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5823 {
5824 this = coding_categories + coding_priorities[i];
5825 setup_coding_system (CODING_ID_NAME (this->id), coding);
5826 break;
5827 }
5828 }
5763 } 5829 }
5764 } 5830 }
5765 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id))) 5831 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
@@ -7472,6 +7538,7 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
7472 int id; 7538 int id;
7473 struct coding_detection_info detect_info; 7539 struct coding_detection_info detect_info;
7474 enum coding_category base_category; 7540 enum coding_category base_category;
7541 int null_byte_found = 0, eight_bit_found = 0;
7475 7542
7476 if (NILP (coding_system)) 7543 if (NILP (coding_system))
7477 coding_system = Qundecided; 7544 coding_system = Qundecided;
@@ -7497,33 +7564,54 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
7497 struct coding_system *this; 7564 struct coding_system *this;
7498 int c, i; 7565 int c, i;
7499 7566
7567 coding.head_ascii = -1;
7500 /* Skip all ASCII bytes except for a few ISO2022 controls. */ 7568 /* Skip all ASCII bytes except for a few ISO2022 controls. */
7501 for (i = 0; src < src_end; i++, src++) 7569 for (; src < src_end; src++)
7502 { 7570 {
7503 c = *src; 7571 c = *src;
7504 if (c & 0x80) 7572 if (c & 0x80)
7505 break;
7506 if (c < 0x20
7507 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
7508 && ! inhibit_iso_escape_detection)
7509 { 7573 {
7510 coding.head_ascii = src - coding.source; 7574 eight_bit_found = 1;
7511 if (detect_coding_iso_2022 (&coding, &detect_info)) 7575 if (coding.head_ascii < 0)
7576 coding.head_ascii = src - coding.source;
7577 if (null_byte_found)
7578 break;
7579 }
7580 if (c < 0x20)
7581 {
7582 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
7583 && ! inhibit_iso_escape_detection
7584 && ! detect_info.checked)
7512 { 7585 {
7513 /* We have scanned the whole data. */ 7586 if (coding.head_ascii < 0)
7514 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE)) 7587 coding.head_ascii = src - coding.source;
7515 /* We didn't find an 8-bit code. */ 7588 if (detect_coding_iso_2022 (&coding, &detect_info))
7516 src = src_end; 7589 {
7517 break; 7590 /* We have scanned the whole data. */
7591 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
7592 /* We didn't find an 8-bit code. We may have
7593 found a null-byte, but it's very rare that
7594 a binary file conforms to ISO-2022. */
7595 src = src_end;
7596 break;
7597 }
7598 }
7599 else if (! c)
7600 {
7601 null_byte_found = 1;
7602 if (eight_bit_found)
7603 break;
7518 } 7604 }
7519 } 7605 }
7520 } 7606 }
7521 coding.head_ascii = src - coding.source; 7607 if (coding.head_ascii < 0)
7608 coding.head_ascii = src - coding.source;
7522 7609
7523 if (src < src_end 7610 if (null_byte_found || eight_bit_found
7611 || coding.head_ascii < coding.src_bytes
7524 || detect_info.found) 7612 || detect_info.found)
7525 { 7613 {
7526 if (src == src_end) 7614 if (coding.head_ascii == coding.src_bytes)
7527 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */ 7615 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
7528 for (i = 0; i < coding_category_raw_text; i++) 7616 for (i = 0; i < coding_category_raw_text; i++)
7529 { 7617 {
@@ -7533,44 +7621,48 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
7533 break; 7621 break;
7534 } 7622 }
7535 else 7623 else
7536 for (i = 0; i < coding_category_raw_text; i++) 7624 {
7537 { 7625 if (null_byte_found)
7538 category = coding_priorities[i]; 7626 {
7539 this = coding_categories + category; 7627 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
7628 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
7629 }
7630 for (i = 0; i < coding_category_raw_text; i++)
7631 {
7632 category = coding_priorities[i];
7633 this = coding_categories + category;
7540 7634
7541 if (this->id < 0) 7635 if (this->id < 0)
7542 { 7636 {
7543 /* No coding system of this category is defined. */ 7637 /* No coding system of this category is defined. */
7544 detect_info.rejected |= (1 << category); 7638 detect_info.rejected |= (1 << category);
7545 } 7639 }
7546 else if (category >= coding_category_raw_text) 7640 else if (category >= coding_category_raw_text)
7547 continue; 7641 continue;
7548 else if (detect_info.checked & (1 << category)) 7642 else if (detect_info.checked & (1 << category))
7549 { 7643 {
7550 if (highest 7644 if (highest
7551 && (detect_info.found & (1 << category))) 7645 && (detect_info.found & (1 << category)))
7552 break;
7553 }
7554 else
7555 {
7556 if ((*(this->detector)) (&coding, &detect_info)
7557 && highest
7558 && (detect_info.found & (1 << category)))
7559 {
7560 if (category == coding_category_utf_16_auto)
7561 {
7562 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7563 category = coding_category_utf_16_le;
7564 else
7565 category = coding_category_utf_16_be;
7566 }
7567 break; 7646 break;
7568 } 7647 }
7569 } 7648 else if ((*(this->detector)) (&coding, &detect_info)
7570 } 7649 && highest
7650 && (detect_info.found & (1 << category)))
7651 {
7652 if (category == coding_category_utf_16_auto)
7653 {
7654 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7655 category = coding_category_utf_16_le;
7656 else
7657 category = coding_category_utf_16_be;
7658 }
7659 break;
7660 }
7661 }
7662 }
7571 } 7663 }
7572 7664
7573 if (detect_info.rejected == CATEGORY_MASK_ANY) 7665 if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY)
7574 { 7666 {
7575 detect_info.found = CATEGORY_MASK_RAW_TEXT; 7667 detect_info.found = CATEGORY_MASK_RAW_TEXT;
7576 id = coding_categories[coding_category_raw_text].id; 7668 id = coding_categories[coding_category_raw_text].id;
@@ -7659,8 +7751,13 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
7659 if (VECTORP (eol_type)) 7751 if (VECTORP (eol_type))
7660 { 7752 {
7661 if (detect_info.found & ~CATEGORY_MASK_UTF_16) 7753 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
7662 normal_eol = detect_eol (coding.source, src_bytes, 7754 {
7663 coding_category_raw_text); 7755 if (null_byte_found)
7756 normal_eol = EOL_SEEN_LF;
7757 else
7758 normal_eol = detect_eol (coding.source, src_bytes,
7759 coding_category_raw_text);
7760 }
7664 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE 7761 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
7665 | CATEGORY_MASK_UTF_16_BE_NOSIG)) 7762 | CATEGORY_MASK_UTF_16_BE_NOSIG))
7666 utf_16_be_eol = detect_eol (coding.source, src_bytes, 7763 utf_16_be_eol = detect_eol (coding.source, src_bytes,