diff options
| -rw-r--r-- | src/regex-emacs.c | 24 | ||||
| -rw-r--r-- | test/src/regex-emacs-tests.el | 120 |
2 files changed, 140 insertions, 4 deletions
diff --git a/src/regex-emacs.c b/src/regex-emacs.c index c353a78fb4f..5887eaa30c7 100644 --- a/src/regex-emacs.c +++ b/src/regex-emacs.c | |||
| @@ -2794,6 +2794,7 @@ static int | |||
| 2794 | analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte) | 2794 | analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte) |
| 2795 | { | 2795 | { |
| 2796 | int j, k; | 2796 | int j, k; |
| 2797 | int nbits; | ||
| 2797 | bool not; | 2798 | bool not; |
| 2798 | 2799 | ||
| 2799 | /* If all elements for base leading-codes in fastmap is set, this | 2800 | /* If all elements for base leading-codes in fastmap is set, this |
| @@ -2854,7 +2855,14 @@ analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte) | |||
| 2854 | each byte is a character. Thus, this works in both | 2855 | each byte is a character. Thus, this works in both |
| 2855 | cases. */ | 2856 | cases. */ |
| 2856 | fastmap[p[1]] = 1; | 2857 | fastmap[p[1]] = 1; |
| 2857 | if (! multibyte) | 2858 | if (multibyte) |
| 2859 | { | ||
| 2860 | /* Cover the case of matching a raw char in a | ||
| 2861 | multibyte regexp against unibyte. */ | ||
| 2862 | if (CHAR_BYTE8_HEAD_P (p[1])) | ||
| 2863 | fastmap[CHAR_TO_BYTE8 (STRING_CHAR (p + 1))] = 1; | ||
| 2864 | } | ||
| 2865 | else | ||
| 2858 | { | 2866 | { |
| 2859 | /* For the case of matching this unibyte regex | 2867 | /* For the case of matching this unibyte regex |
| 2860 | against multibyte, we must set a leading code of | 2868 | against multibyte, we must set a leading code of |
| @@ -2886,11 +2894,18 @@ analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte) | |||
| 2886 | case charset: | 2894 | case charset: |
| 2887 | if (!fastmap) break; | 2895 | if (!fastmap) break; |
| 2888 | not = (re_opcode_t) *(p - 1) == charset_not; | 2896 | not = (re_opcode_t) *(p - 1) == charset_not; |
| 2889 | for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++; | 2897 | nbits = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH; |
| 2890 | j >= 0; j--) | 2898 | p++; |
| 2899 | for (j = 0; j < nbits; j++) | ||
| 2891 | if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not) | 2900 | if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not) |
| 2892 | fastmap[j] = 1; | 2901 | fastmap[j] = 1; |
| 2893 | 2902 | ||
| 2903 | /* To match raw bytes (in the 80..ff range) against multibyte | ||
| 2904 | strings, add their leading bytes to the fastmap. */ | ||
| 2905 | for (j = 0x80; j < nbits; j++) | ||
| 2906 | if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not) | ||
| 2907 | fastmap[CHAR_LEADING_CODE (BYTE8_TO_CHAR (j))] = 1; | ||
| 2908 | |||
| 2894 | if (/* Any leading code can possibly start a character | 2909 | if (/* Any leading code can possibly start a character |
| 2895 | which doesn't match the specified set of characters. */ | 2910 | which doesn't match the specified set of characters. */ |
| 2896 | not | 2911 | not |
| @@ -4251,8 +4266,9 @@ re_match_2_internal (struct re_pattern_buffer *bufp, | |||
| 4251 | } | 4266 | } |
| 4252 | p += pat_charlen; | 4267 | p += pat_charlen; |
| 4253 | d++; | 4268 | d++; |
| 4269 | mcnt -= pat_charlen; | ||
| 4254 | } | 4270 | } |
| 4255 | while (--mcnt); | 4271 | while (mcnt > 0); |
| 4256 | 4272 | ||
| 4257 | break; | 4273 | break; |
| 4258 | 4274 | ||
diff --git a/test/src/regex-emacs-tests.el b/test/src/regex-emacs-tests.el index 0ae50c94d4c..50ed3e870a5 100644 --- a/test/src/regex-emacs-tests.el +++ b/test/src/regex-emacs-tests.el | |||
| @@ -683,4 +683,124 @@ This evaluates the TESTS test cases from glibc." | |||
| 683 | (should-not (string-match "\\`x\\{65535\\}" (make-string 65534 ?x))) | 683 | (should-not (string-match "\\`x\\{65535\\}" (make-string 65534 ?x))) |
| 684 | (should-error (string-match "\\`x\\{65536\\}" "X") :type 'invalid-regexp)) | 684 | (should-error (string-match "\\`x\\{65536\\}" "X") :type 'invalid-regexp)) |
| 685 | 685 | ||
| 686 | (ert-deftest regexp-unibyte-unibyte () | ||
| 687 | "Test matching a unibyte regexp against a unibyte string." | ||
| 688 | ;; Sanity check: literals written this way must not be multibyte strings | ||
| 689 | (should-not (multibyte-string-p "ab")) | ||
| 690 | (should-not (multibyte-string-p "\xff")) | ||
| 691 | ;; ASCII: a literal character followed by a one-character alternative | ||
| 692 | (should (string-match "a[b]" "ab")) | ||
| 693 | ;; Raw byte as a literal: f1 matches itself but not the two bytes c1 b1 | ||
| 694 | (should (string-match "\xf1" "\xf1")) | ||
| 695 | (should-not (string-match "\xf1" "\xc1\xb1")) | ||
| 696 | ;; Raw byte inside a character alternative: same expectations as the literal | ||
| 697 | (should (string-match "[\xf1]" "\xf1")) | ||
| 698 | (should-not (string-match "[\xf1]" "\xc1\xb1")) | ||
| 699 | ;; Raw byte range 82..d3: bb is inside, ASCII and the neighbours 81 and d4 are outside | ||
| 700 | (should (string-match "[\x82-\xd3]" "\xbb")) | ||
| 701 | (should-not (string-match "[\x82-\xd3]" "a")) | ||
| 702 | (should-not (string-match "[\x82-\xd3]" "\x81")) | ||
| 703 | (should-not (string-match "[\x82-\xd3]" "\xd4")) | ||
| 704 | ;; Range from ASCII f to raw byte d3: q and bb are inside, e and d4 fall just outside | ||
| 705 | (should (string-match "[f-\xd3]" "q")) | ||
| 706 | (should (string-match "[f-\xd3]" "\xbb")) | ||
| 707 | (should-not (string-match "[f-\xd3]" "e")) | ||
| 708 | (should-not (string-match "[f-\xd3]" "\xd4"))) | ||
| 709 | |||
| 710 | (ert-deftest regexp-multibyte-multibyte () | ||
| 711 | "Test matching a multibyte regexp against a multibyte string." | ||
| 712 | ;; Sanity check: a literal containing non-ASCII characters must be multibyte | ||
| 713 | (should (multibyte-string-p "åü")) | ||
| 714 | ;; ASCII: both pattern and subject are converted with string-to-multibyte | ||
| 715 | (should (string-match (string-to-multibyte "a[b]") | ||
| 716 | (string-to-multibyte "ab"))) | ||
| 717 | ;; Unicode: characters match themselves, and a character must not match the raw bytes c3 bc | ||
| 718 | (should (string-match "å[ü]z" "åüz")) | ||
| 719 | (should-not (string-match "ü" (string-to-multibyte "\xc3\xbc"))) | ||
| 720 | ;; Raw byte as a literal: f1 matches itself only, and c1 b1 is rejected in both directions | ||
| 721 | (should (string-match (string-to-multibyte "\xf1") | ||
| 722 | (string-to-multibyte "\xf1"))) | ||
| 723 | (should-not (string-match (string-to-multibyte "\xf1") | ||
| 724 | (string-to-multibyte "\xc1\xb1"))) | ||
| 725 | (should-not (string-match (string-to-multibyte "\xc1\xb1") | ||
| 726 | (string-to-multibyte "\xf1"))) | ||
| 727 | ;; Raw byte inside a character alternative | ||
| 728 | (should (string-match (string-to-multibyte "[\xf1]") | ||
| 729 | (string-to-multibyte "\xf1"))) | ||
| 730 | ;; Raw byte range 82..d3. NOTE(review): subjects a, 81 and d4 are bare literals, not string-to-multibyte -- confirm intended | ||
| 731 | (should (string-match (string-to-multibyte "[\x82-\xd3]") | ||
| 732 | (string-to-multibyte "\xbb"))) | ||
| 733 | (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "a")) | ||
| 734 | (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "Å")) | ||
| 735 | (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "ü")) | ||
| 736 | (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "\x81")) | ||
| 737 | (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "\xd4")) | ||
| 738 | ;; ASCII-raw range: should exclude U+0100..U+10FFFF. NOTE(review): the Unicode subjects checked (Å, ü) are below U+0100, and e and d4 are bare literals | ||
| 739 | (should (string-match (string-to-multibyte "[f-\xd3]") | ||
| 740 | (string-to-multibyte "q"))) | ||
| 741 | (should (string-match (string-to-multibyte "[f-\xd3]") | ||
| 742 | (string-to-multibyte "\xbb"))) | ||
| 743 | (should-not (string-match (string-to-multibyte "[f-\xd3]") "e")) | ||
| 744 | (should-not (string-match (string-to-multibyte "[f-\xd3]") "Å")) | ||
| 745 | (should-not (string-match (string-to-multibyte "[f-\xd3]") "ü")) | ||
| 746 | (should-not (string-match (string-to-multibyte "[f-\xd3]") "\xd4")) | ||
| 747 | ;; Unicode-raw range: should be empty, so neither endpoint nor bb nor another Unicode character matches | ||
| 748 | (should-not (string-match "[å-\xd3]" "å")) | ||
| 749 | (should-not (string-match "[å-\xd3]" (string-to-multibyte "\xd3"))) | ||
| 750 | (should-not (string-match "[å-\xd3]" (string-to-multibyte "\xbb"))) | ||
| 751 | (should-not (string-match "[å-\xd3]" "ü")) | ||
| 752 | ;; No equivalence between raw bytes and latin-1: å and e5 never match each other, as literal or as alternative | ||
| 753 | (should-not (string-match "å" (string-to-multibyte "\xe5"))) | ||
| 754 | (should-not (string-match "[å]" (string-to-multibyte "\xe5"))) | ||
| 755 | (should-not (string-match "\xe5" "å")) | ||
| 756 | (should-not (string-match "[\xe5]" "å"))) | ||
| 757 | |||
| 758 | (ert-deftest regexp-unibyte-multibyte () | ||
| 759 | "Test matching a unibyte regexp against a multibyte string." | ||
| 760 | ;; ASCII: bare literal pattern against a subject converted with string-to-multibyte | ||
| 761 | (should (string-match "a[b]" (string-to-multibyte "ab"))) | ||
| 762 | ;; Unicode: the any-char and the negated alternative each have to consume one non-ASCII character | ||
| 763 | (should (string-match "a.[^b]c" (string-to-multibyte "aåüc"))) | ||
| 764 | ;; Raw byte as a literal: f1 matches the converted f1, and the pattern c1 b1 does not | ||
| 765 | (should (string-match "\xf1" (string-to-multibyte "\xf1"))) | ||
| 766 | (should-not (string-match "\xc1\xb1" (string-to-multibyte "\xf1"))) | ||
| 767 | ;; Raw byte inside a character alternative: two alternatives c1 then b1 must not match f1 | ||
| 768 | (should (string-match "[\xf1]" (string-to-multibyte "\xf1"))) | ||
| 769 | (should-not (string-match "[\xc1][\xb1]" (string-to-multibyte "\xf1"))) | ||
| 770 | ;; ASCII-raw range: should exclude U+0100..U+10FFFF. NOTE(review): the Unicode subjects checked (Å, ü) are below U+0100, and e and d4 are bare literals | ||
| 771 | (should (string-match "[f-\xd3]" (string-to-multibyte "q"))) | ||
| 772 | (should (string-match "[f-\xd3]" (string-to-multibyte "\xbb"))) | ||
| 773 | (should-not (string-match "[f-\xd3]" "e")) | ||
| 774 | (should-not (string-match "[f-\xd3]" "Å")) | ||
| 775 | (should-not (string-match "[f-\xd3]" "ü")) | ||
| 776 | (should-not (string-match "[f-\xd3]" "\xd4")) | ||
| 777 | ;; No equivalence between raw bytes and latin-1: pattern e5 must not match å, as literal or as alternative | ||
| 778 | (should-not (string-match "\xe5" "å")) | ||
| 779 | (should-not (string-match "[\xe5]" "å"))) | ||
| 780 | |||
| 781 | (ert-deftest regexp-multibyte-unibyte () | ||
| 782 | "Test matching a multibyte regexp against a unibyte string." | ||
| 783 | ;; ASCII: pattern converted with string-to-multibyte against a bare literal subject | ||
| 784 | (should (string-match (string-to-multibyte "a[b]") "ab")) | ||
| 785 | ;; Unicode: a negated Unicode alternative accepts ASCII, and a Unicode character must not match the bytes c3 bc | ||
| 786 | (should (string-match "a[^ü]c" "abc")) | ||
| 787 | (should-not (string-match "ü" "\xc3\xbc")) | ||
| 788 | ;; Raw byte as a literal: converted f1 matches the byte f1 but not the two bytes c1 b1 | ||
| 789 | (should (string-match (string-to-multibyte "\xf1") "\xf1")) | ||
| 790 | (should-not (string-match (string-to-multibyte "\xf1") "\xc1\xb1")) | ||
| 791 | ;; Raw byte inside a character alternative: same expectations as the literal | ||
| 792 | (should (string-match (string-to-multibyte "[\xf1]") "\xf1")) | ||
| 793 | (should-not (string-match (string-to-multibyte "[\xf1]") "\xc1\xb1")) | ||
| 794 | ;; ASCII-raw range: should exclude U+0100..U+10FFFF. Only the range ends are probed here (q, bb inside -- e, d4 outside) | ||
| 795 | (should (string-match (string-to-multibyte "[f-\xd3]") "q")) | ||
| 796 | (should (string-match (string-to-multibyte "[f-\xd3]") "\xbb")) | ||
| 797 | (should-not (string-match (string-to-multibyte "[f-\xd3]") "e")) | ||
| 798 | (should-not (string-match (string-to-multibyte "[f-\xd3]") "\xd4")) | ||
| 799 | ;; Unicode-raw range: should be empty, so neither the raw end d3 nor the byte bb matches | ||
| 800 | (should-not (string-match "[å-\xd3]" "\xd3")) | ||
| 801 | (should-not (string-match "[å-\xd3]" "\xbb")) | ||
| 802 | ;; No equivalence between raw bytes and latin-1: å must not match the byte e5, as literal or as alternative | ||
| 803 | (should-not (string-match "å" "\xe5")) | ||
| 804 | (should-not (string-match "[å]" "\xe5"))) | ||
| 805 | |||
| 686 | ;;; regex-emacs-tests.el ends here | 806 | ;;; regex-emacs-tests.el ends here |