about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- src/regex-emacs.c 24
-rw-r--r-- test/src/regex-emacs-tests.el 120
2 files changed, 140 insertions, 4 deletions
diff --git a/src/regex-emacs.c b/src/regex-emacs.c
index c353a78fb4f..5887eaa30c7 100644
--- a/src/regex-emacs.c
+++ b/src/regex-emacs.c
@@ -2794,6 +2794,7 @@ static int
2794analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte) 2794analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte)
2795{ 2795{
2796 int j, k; 2796 int j, k;
2797 int nbits;
2797 bool not; 2798 bool not;
2798 2799
2799 /* If all elements for base leading-codes in fastmap is set, this 2800 /* If all elements for base leading-codes in fastmap is set, this
@@ -2854,7 +2855,14 @@ analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte)
2854 each byte is a character. Thus, this works in both 2855 each byte is a character. Thus, this works in both
2855 cases. */ 2856 cases. */
2856 fastmap[p[1]] = 1; 2857 fastmap[p[1]] = 1;
2857 if (! multibyte) 2858 if (multibyte)
2859 {
2860 /* Cover the case of matching a raw char in a
2861 multibyte regexp against unibyte. */
2862 if (CHAR_BYTE8_HEAD_P (p[1]))
2863 fastmap[CHAR_TO_BYTE8 (STRING_CHAR (p + 1))] = 1;
2864 }
2865 else
2858 { 2866 {
2859 /* For the case of matching this unibyte regex 2867 /* For the case of matching this unibyte regex
2860 against multibyte, we must set a leading code of 2868 against multibyte, we must set a leading code of
@@ -2886,11 +2894,18 @@ analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte)
2886 case charset: 2894 case charset:
2887 if (!fastmap) break; 2895 if (!fastmap) break;
2888 not = (re_opcode_t) *(p - 1) == charset_not; 2896 not = (re_opcode_t) *(p - 1) == charset_not;
2889 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++; 2897 nbits = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
2890 j >= 0; j--) 2898 p++;
2899 for (j = 0; j < nbits; j++)
2891 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not) 2900 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
2892 fastmap[j] = 1; 2901 fastmap[j] = 1;
2893 2902
2903 /* To match raw bytes (in the 80..ff range) against multibyte
2904 strings, add their leading bytes to the fastmap. */
2905 for (j = 0x80; j < nbits; j++)
2906 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
2907 fastmap[CHAR_LEADING_CODE (BYTE8_TO_CHAR (j))] = 1;
2908
2894 if (/* Any leading code can possibly start a character 2909 if (/* Any leading code can possibly start a character
2895 which doesn't match the specified set of characters. */ 2910 which doesn't match the specified set of characters. */
2896 not 2911 not
@@ -4251,8 +4266,9 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
4251 } 4266 }
4252 p += pat_charlen; 4267 p += pat_charlen;
4253 d++; 4268 d++;
4269 mcnt -= pat_charlen;
4254 } 4270 }
4255 while (--mcnt); 4271 while (mcnt > 0);
4256 4272
4257 break; 4273 break;
4258 4274
diff --git a/test/src/regex-emacs-tests.el b/test/src/regex-emacs-tests.el
index 0ae50c94d4c..50ed3e870a5 100644
--- a/test/src/regex-emacs-tests.el
+++ b/test/src/regex-emacs-tests.el
@@ -683,4 +683,124 @@ This evaluates the TESTS test cases from glibc."
683 (should-not (string-match "\\`x\\{65535\\}" (make-string 65534 ?x))) 683 (should-not (string-match "\\`x\\{65535\\}" (make-string 65534 ?x)))
684 (should-error (string-match "\\`x\\{65536\\}" "X") :type 'invalid-regexp)) 684 (should-error (string-match "\\`x\\{65536\\}" "X") :type 'invalid-regexp))
685 685
686(ert-deftest regexp-unibyte-unibyte ()
687 "Test matching a unibyte regexp against a unibyte string."
688 ;; Sanity check: plain ASCII and \x-escaped string literals must read as unibyte
689 (should-not (multibyte-string-p "ab"))
690 (should-not (multibyte-string-p "\xff"))
691 ;; ASCII: a literal character followed by a one-element character alternative
692 (should (string-match "a[b]" "ab"))
693 ;; Raw: a raw byte matches itself but not the byte pair \xc1\xb1; presumably that pair is the internal multibyte encoding of \xf1 -- confirm
694 (should (string-match "\xf1" "\xf1"))
695 (should-not (string-match "\xf1" "\xc1\xb1"))
696 ;; Raw, char alt: the same two checks with the raw byte inside a character alternative
697 (should (string-match "[\xf1]" "\xf1"))
698 (should-not (string-match "[\xf1]" "\xc1\xb1"))
699 ;; Raw range: a byte inside the range matches; ASCII and the bytes just outside either end do not
700 (should (string-match "[\x82-\xd3]" "\xbb"))
701 (should-not (string-match "[\x82-\xd3]" "a"))
702 (should-not (string-match "[\x82-\xd3]" "\x81"))
703 (should-not (string-match "[\x82-\xd3]" "\xd4"))
704 ;; ASCII-raw range: runs from ASCII f up to raw \xd3; e and \xd4 lie just outside it
705 (should (string-match "[f-\xd3]" "q"))
706 (should (string-match "[f-\xd3]" "\xbb"))
707 (should-not (string-match "[f-\xd3]" "e"))
708 (should-not (string-match "[f-\xd3]" "\xd4")))
709
710(ert-deftest regexp-multibyte-multibyte ()
711 "Test matching a multibyte regexp against a multibyte string."
712 ;; Sanity check: a literal containing non-ASCII characters must read as multibyte
713 (should (multibyte-string-p "åü"))
714 ;; ASCII: both the regexp and the subject are passed through string-to-multibyte
715 (should (string-match (string-to-multibyte "a[b]")
716 (string-to-multibyte "ab")))
717 ;; Unicode: ü must not match the converted byte pair \xc3\xbc; presumably those are the UTF-8 bytes of ü -- confirm
718 (should (string-match "å[ü]z" "åüz"))
719 (should-not (string-match "ü" (string-to-multibyte "\xc3\xbc")))
720 ;; Raw: a converted raw byte matches itself and is not confused with the pair \xc1\xb1 in either direction
721 (should (string-match (string-to-multibyte "\xf1")
722 (string-to-multibyte "\xf1")))
723 (should-not (string-match (string-to-multibyte "\xf1")
724 (string-to-multibyte "\xc1\xb1")))
725 (should-not (string-match (string-to-multibyte "\xc1\xb1")
726 (string-to-multibyte "\xf1")))
727 ;; Raw, char alt: the raw byte inside a character alternative
728 (should (string-match (string-to-multibyte "[\xf1]")
729 (string-to-multibyte "\xf1")))
730 ;; Raw range. NOTE(review): the subjects a, \x81 and \xd4 below are bare literals, not wrapped in string-to-multibyte -- confirm this is intended in a multibyte-vs-multibyte test
731 (should (string-match (string-to-multibyte "[\x82-\xd3]")
732 (string-to-multibyte "\xbb")))
733 (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "a"))
734 (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "Å"))
735 (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "ü"))
736 (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "\x81"))
737 (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "\xd4"))
738 ;; ASCII-raw range: should exclude U+0100..U+10FFFF. NOTE(review): subjects e and \xd4 are bare literals without string-to-multibyte -- confirm
739 (should (string-match (string-to-multibyte "[f-\xd3]")
740 (string-to-multibyte "q")))
741 (should (string-match (string-to-multibyte "[f-\xd3]")
742 (string-to-multibyte "\xbb")))
743 (should-not (string-match (string-to-multibyte "[f-\xd3]") "e"))
744 (should-not (string-match (string-to-multibyte "[f-\xd3]") "Å"))
745 (should-not (string-match (string-to-multibyte "[f-\xd3]") "ü"))
746 (should-not (string-match (string-to-multibyte "[f-\xd3]") "\xd4"))
747 ;; Unicode-raw range: should be empty, so none of the subjects below may match, including both endpoints
748 (should-not (string-match "[å-\xd3]" "å"))
749 (should-not (string-match "[å-\xd3]" (string-to-multibyte "\xd3")))
750 (should-not (string-match "[å-\xd3]" (string-to-multibyte "\xbb")))
751 (should-not (string-match "[å-\xd3]" "ü"))
752 ;; No equivalence between raw bytes and latin-1: å and \xe5 must not match each other, bare or inside a character alternative
753 (should-not (string-match "å" (string-to-multibyte "\xe5")))
754 (should-not (string-match "[å]" (string-to-multibyte "\xe5")))
755 (should-not (string-match "\xe5" "å"))
756 (should-not (string-match "[\xe5]" "å")))
757
758(ert-deftest regexp-unibyte-multibyte ()
759 "Test matching a unibyte regexp against a multibyte string."
760 ;; ASCII: bare regexp literal against a subject passed through string-to-multibyte
761 (should (string-match "a[b]" (string-to-multibyte "ab")))
762 ;; Unicode: the . and the [^b] each have to consume one non-ASCII character of the subject
763 (should (string-match "a.[^b]c" (string-to-multibyte "aåüc")))
764 ;; Raw: the bare raw byte matches its converted counterpart; the byte pair \xc1\xb1 does not
765 (should (string-match "\xf1" (string-to-multibyte "\xf1")))
766 (should-not (string-match "\xc1\xb1" (string-to-multibyte "\xf1")))
767 ;; Raw, char alt: the same two checks using character alternatives
768 (should (string-match "[\xf1]" (string-to-multibyte "\xf1")))
769 (should-not (string-match "[\xc1][\xb1]" (string-to-multibyte "\xf1")))
770 ;; ASCII-raw range: should exclude U+0100..U+10FFFF. NOTE(review): the should-not subjects below are bare literals, not string-to-multibyte results -- confirm intent
771 (should (string-match "[f-\xd3]" (string-to-multibyte "q")))
772 (should (string-match "[f-\xd3]" (string-to-multibyte "\xbb")))
773 (should-not (string-match "[f-\xd3]" "e"))
774 (should-not (string-match "[f-\xd3]" "Å"))
775 (should-not (string-match "[f-\xd3]" "ü"))
776 (should-not (string-match "[f-\xd3]" "\xd4"))
777 ;; No equivalence between raw bytes and latin-1: raw \xe5 must not match the character å, bare or inside a character alternative
778 (should-not (string-match "\xe5" "å"))
779 (should-not (string-match "[\xe5]" "å")))
780
781(ert-deftest regexp-multibyte-unibyte ()
782 "Test matching a multibyte regexp against a unibyte string."
783 ;; ASCII: regexp passed through string-to-multibyte, subject left as a bare ASCII literal
784 (should (string-match (string-to-multibyte "a[b]") "ab"))
785 ;; Unicode: regexps containing ü, tried against an ASCII subject and against the bare byte pair \xc3\xbc
786 (should (string-match "a[^ü]c" "abc"))
787 (should-not (string-match "ü" "\xc3\xbc"))
788 ;; Raw: the converted raw byte matches the bare raw byte, but not the byte pair \xc1\xb1
789 (should (string-match (string-to-multibyte "\xf1") "\xf1"))
790 (should-not (string-match (string-to-multibyte "\xf1") "\xc1\xb1"))
791 ;; Raw, char alt: the same two checks using a character alternative
792 (should (string-match (string-to-multibyte "[\xf1]") "\xf1"))
793 (should-not (string-match (string-to-multibyte "[\xf1]") "\xc1\xb1"))
794 ;; ASCII-raw range: should exclude U+0100..U+10FFFF; only ASCII and raw-byte subjects are tried here
795 (should (string-match (string-to-multibyte "[f-\xd3]") "q"))
796 (should (string-match (string-to-multibyte "[f-\xd3]") "\xbb"))
797 (should-not (string-match (string-to-multibyte "[f-\xd3]") "e"))
798 (should-not (string-match (string-to-multibyte "[f-\xd3]") "\xd4"))
799 ;; Unicode-raw range: should be empty, so neither the raw endpoint \xd3 nor \xbb may match
800 (should-not (string-match "[å-\xd3]" "\xd3"))
801 (should-not (string-match "[å-\xd3]" "\xbb"))
802 ;; No equivalence between raw bytes and latin-1: å must not match the raw byte \xe5, bare or inside a character alternative
803 (should-not (string-match "å" "\xe5"))
804 (should-not (string-match "[å]" "\xe5")))
805
686;;; regex-emacs-tests.el ends here 806;;; regex-emacs-tests.el ends here