aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMattias Engdegård2019-06-28 10:20:55 +0200
committerMattias Engdegård2019-06-28 17:30:18 +0200
commita1f76adfb03c23bb4242928e8efe6193c301f0c1 (patch)
tree7e2a5c58656ffbe78d34dc58639d7cd5bf8f943a
parentaae5bf4438712c9fe761c5e4b5a871192852cd97 (diff)
downloademacs-a1f76adfb03c23bb4242928e8efe6193c301f0c1.tar.gz
emacs-a1f76adfb03c23bb4242928e8efe6193c301f0c1.zip
Correct regexp matching of raw bytes
Make regexp matching of raw bytes work in all combinations of unibyte and multibyte patterns and targets, as exact strings and in character alternatives (bug#3687). * src/regex-emacs.c (analyze_first): Include raw byte in fastmap when pattern is a multibyte exact string. Include leading byte in fastmap for raw bytes in character alternatives. (re_match_2_internal): Decrement the byte count by the number of bytes in the pattern character, not 1. * test/src/regex-emacs-tests.el (regexp-unibyte-unibyte) (regexp-multibyte-unibyte, regexp-unibyte-multibyte) (regexp-multibyte-multibyte): New tests.
-rw-r--r--src/regex-emacs.c24
-rw-r--r--test/src/regex-emacs-tests.el120
2 files changed, 140 insertions, 4 deletions
diff --git a/src/regex-emacs.c b/src/regex-emacs.c
index c353a78fb4f..5887eaa30c7 100644
--- a/src/regex-emacs.c
+++ b/src/regex-emacs.c
@@ -2794,6 +2794,7 @@ static int
2794analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte) 2794analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte)
2795{ 2795{
2796 int j, k; 2796 int j, k;
2797 int nbits;
2797 bool not; 2798 bool not;
2798 2799
2799 /* If all elements for base leading-codes in fastmap is set, this 2800 /* If all elements for base leading-codes in fastmap is set, this
@@ -2854,7 +2855,14 @@ analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte)
2854 each byte is a character. Thus, this works in both 2855 each byte is a character. Thus, this works in both
2855 cases. */ 2856 cases. */
2856 fastmap[p[1]] = 1; 2857 fastmap[p[1]] = 1;
2857 if (! multibyte) 2858 if (multibyte)
2859 {
2860 /* Cover the case of matching a raw char in a
2861 multibyte regexp against unibyte. */
2862 if (CHAR_BYTE8_HEAD_P (p[1]))
2863 fastmap[CHAR_TO_BYTE8 (STRING_CHAR (p + 1))] = 1;
2864 }
2865 else
2858 { 2866 {
2859 /* For the case of matching this unibyte regex 2867 /* For the case of matching this unibyte regex
2860 against multibyte, we must set a leading code of 2868 against multibyte, we must set a leading code of
@@ -2886,11 +2894,18 @@ analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte)
2886 case charset: 2894 case charset:
2887 if (!fastmap) break; 2895 if (!fastmap) break;
2888 not = (re_opcode_t) *(p - 1) == charset_not; 2896 not = (re_opcode_t) *(p - 1) == charset_not;
2889 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++; 2897 nbits = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
2890 j >= 0; j--) 2898 p++;
2899 for (j = 0; j < nbits; j++)
2891 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not) 2900 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
2892 fastmap[j] = 1; 2901 fastmap[j] = 1;
2893 2902
2903 /* To match raw bytes (in the 80..ff range) against multibyte
2904 strings, add their leading bytes to the fastmap. */
2905 for (j = 0x80; j < nbits; j++)
2906 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
2907 fastmap[CHAR_LEADING_CODE (BYTE8_TO_CHAR (j))] = 1;
2908
2894 if (/* Any leading code can possibly start a character 2909 if (/* Any leading code can possibly start a character
2895 which doesn't match the specified set of characters. */ 2910 which doesn't match the specified set of characters. */
2896 not 2911 not
@@ -4251,8 +4266,9 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
4251 } 4266 }
4252 p += pat_charlen; 4267 p += pat_charlen;
4253 d++; 4268 d++;
4269 mcnt -= pat_charlen;
4254 } 4270 }
4255 while (--mcnt); 4271 while (mcnt > 0);
4256 4272
4257 break; 4273 break;
4258 4274
diff --git a/test/src/regex-emacs-tests.el b/test/src/regex-emacs-tests.el
index 0ae50c94d4c..50ed3e870a5 100644
--- a/test/src/regex-emacs-tests.el
+++ b/test/src/regex-emacs-tests.el
@@ -683,4 +683,124 @@ This evaluates the TESTS test cases from glibc."
683 (should-not (string-match "\\`x\\{65535\\}" (make-string 65534 ?x))) 683 (should-not (string-match "\\`x\\{65535\\}" (make-string 65534 ?x)))
684 (should-error (string-match "\\`x\\{65536\\}" "X") :type 'invalid-regexp)) 684 (should-error (string-match "\\`x\\{65536\\}" "X") :type 'invalid-regexp))
685 685
686(ert-deftest regexp-unibyte-unibyte ()
687 "Test matching a unibyte regexp against a unibyte string."
688 ;; Sanity check
689 (should-not (multibyte-string-p "ab"))
690 (should-not (multibyte-string-p "\xff"))
691 ;; ASCII
692 (should (string-match "a[b]" "ab"))
693 ;; Raw
694 (should (string-match "\xf1" "\xf1"))
695 (should-not (string-match "\xf1" "\xc1\xb1"))
696 ;; Raw, char alt
697 (should (string-match "[\xf1]" "\xf1"))
698 (should-not (string-match "[\xf1]" "\xc1\xb1"))
699 ;; Raw range
700 (should (string-match "[\x82-\xd3]" "\xbb"))
701 (should-not (string-match "[\x82-\xd3]" "a"))
702 (should-not (string-match "[\x82-\xd3]" "\x81"))
703 (should-not (string-match "[\x82-\xd3]" "\xd4"))
704 ;; ASCII-raw range
705 (should (string-match "[f-\xd3]" "q"))
706 (should (string-match "[f-\xd3]" "\xbb"))
707 (should-not (string-match "[f-\xd3]" "e"))
708 (should-not (string-match "[f-\xd3]" "\xd4")))
709
710(ert-deftest regexp-multibyte-multibyte ()
711 "Test matching a multibyte regexp against a multibyte string."
712 ;; Sanity check
713 (should (multibyte-string-p "åü"))
714 ;; ASCII
715 (should (string-match (string-to-multibyte "a[b]")
716 (string-to-multibyte "ab")))
717 ;; Unicode
718 (should (string-match "å[ü]z" "åüz"))
719 (should-not (string-match "ü" (string-to-multibyte "\xc3\xbc")))
720 ;; Raw
721 (should (string-match (string-to-multibyte "\xf1")
722 (string-to-multibyte "\xf1")))
723 (should-not (string-match (string-to-multibyte "\xf1")
724 (string-to-multibyte "\xc1\xb1")))
725 (should-not (string-match (string-to-multibyte "\xc1\xb1")
726 (string-to-multibyte "\xf1")))
727 ;; Raw, char alt
728 (should (string-match (string-to-multibyte "[\xf1]")
729 (string-to-multibyte "\xf1")))
730 ;; Raw range
731 (should (string-match (string-to-multibyte "[\x82-\xd3]")
732 (string-to-multibyte "\xbb")))
733 (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "a"))
734 (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "Å"))
735 (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "ü"))
736 (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "\x81"))
737 (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "\xd4"))
738 ;; ASCII-raw range: should exclude U+0100..U+10FFFF
739 (should (string-match (string-to-multibyte "[f-\xd3]")
740 (string-to-multibyte "q")))
741 (should (string-match (string-to-multibyte "[f-\xd3]")
742 (string-to-multibyte "\xbb")))
743 (should-not (string-match (string-to-multibyte "[f-\xd3]") "e"))
744 (should-not (string-match (string-to-multibyte "[f-\xd3]") "Å"))
745 (should-not (string-match (string-to-multibyte "[f-\xd3]") "ü"))
746 (should-not (string-match (string-to-multibyte "[f-\xd3]") "\xd4"))
747 ;; Unicode-raw range: should be empty
748 (should-not (string-match "[å-\xd3]" "å"))
749 (should-not (string-match "[å-\xd3]" (string-to-multibyte "\xd3")))
750 (should-not (string-match "[å-\xd3]" (string-to-multibyte "\xbb")))
751 (should-not (string-match "[å-\xd3]" "ü"))
752 ;; No equivalence between raw bytes and latin-1
753 (should-not (string-match "å" (string-to-multibyte "\xe5")))
754 (should-not (string-match "[å]" (string-to-multibyte "\xe5")))
755 (should-not (string-match "\xe5" "å"))
756 (should-not (string-match "[\xe5]" "å")))
757
758(ert-deftest regexp-unibyte-multibyte ()
759 "Test matching a unibyte regexp against a multibyte string."
760 ;; ASCII
761 (should (string-match "a[b]" (string-to-multibyte "ab")))
762 ;; Unicode
763 (should (string-match "a.[^b]c" (string-to-multibyte "aåüc")))
764 ;; Raw
765 (should (string-match "\xf1" (string-to-multibyte "\xf1")))
766 (should-not (string-match "\xc1\xb1" (string-to-multibyte "\xf1")))
767 ;; Raw, char alt
768 (should (string-match "[\xf1]" (string-to-multibyte "\xf1")))
769 (should-not (string-match "[\xc1][\xb1]" (string-to-multibyte "\xf1")))
770 ;; ASCII-raw range: should exclude U+0100..U+10FFFF
771 (should (string-match "[f-\xd3]" (string-to-multibyte "q")))
772 (should (string-match "[f-\xd3]" (string-to-multibyte "\xbb")))
773 (should-not (string-match "[f-\xd3]" "e"))
774 (should-not (string-match "[f-\xd3]" "Å"))
775 (should-not (string-match "[f-\xd3]" "ü"))
776 (should-not (string-match "[f-\xd3]" "\xd4"))
777 ;; No equivalence between raw bytes and latin-1
778 (should-not (string-match "\xe5" "å"))
779 (should-not (string-match "[\xe5]" "å")))
780
781(ert-deftest regexp-multibyte-unibyte ()
782 "Test matching a multibyte regexp against a unibyte string."
783 ;; ASCII
784 (should (string-match (string-to-multibyte "a[b]") "ab"))
785 ;; Unicode
786 (should (string-match "a[^ü]c" "abc"))
787 (should-not (string-match "ü" "\xc3\xbc"))
788 ;; Raw
789 (should (string-match (string-to-multibyte "\xf1") "\xf1"))
790 (should-not (string-match (string-to-multibyte "\xf1") "\xc1\xb1"))
791 ;; Raw, char alt
792 (should (string-match (string-to-multibyte "[\xf1]") "\xf1"))
793 (should-not (string-match (string-to-multibyte "[\xf1]") "\xc1\xb1"))
794 ;; ASCII-raw range: should exclude U+0100..U+10FFFF
795 (should (string-match (string-to-multibyte "[f-\xd3]") "q"))
796 (should (string-match (string-to-multibyte "[f-\xd3]") "\xbb"))
797 (should-not (string-match (string-to-multibyte "[f-\xd3]") "e"))
798 (should-not (string-match (string-to-multibyte "[f-\xd3]") "\xd4"))
799 ;; Unicode-raw range: should be empty
800 (should-not (string-match "[å-\xd3]" "\xd3"))
801 (should-not (string-match "[å-\xd3]" "\xbb"))
802 ;; No equivalence between raw bytes and latin-1
803 (should-not (string-match "å" "\xe5"))
804 (should-not (string-match "[å]" "\xe5")))
805
686;;; regex-emacs-tests.el ends here 806;;; regex-emacs-tests.el ends here