Fix ‘[[:cc:]]*literal’ regex failing to match ‘literal’ (bug#24020)

The regex engine tries to optimise Kleene star by avoiding backtracking when it can detect that star’s operand cannot match what follows it in the pattern. For example, when ‘[[:alpha:]]*1’ tries to match a ‘foo’, the engine will test the longest match for ‘[[:alpha:]]*’, namely ’foo’ which is the entire string. Literal digit one still present in the pattern will however not match the remaining empty string. Normally, backtracking would be performed trying a shorter match for the character class (namely ‘fo’ leaving ‘o’ in the string), but since the engine knows whatever would be put back into the string cannot possibly match literal digit one so no backtracking will be attempted. In the regexes of the form ‘[[:CC:]]*X’, the optimisation can be applied if the character class CC does not match character X. In the above example, this holds because digit one is not in alpha character class. This test is performed by mutually_exclusive_p function but it did not check class bits of a charset opcode. This resulted in an assumption that character classes do not match multibyte characters. For example, it would incorrectly conclude that [[:alpha:]] doesn’t match ‘ż’. This, in turn, led to the aforementioned Kleene star optimisation being incorrectly applied in patterns such as ‘[[:graph:]]*☠’ (which should match ‘☠’ but doesn’t as can be tested by executing (string-match-p "[[:graph:]]*☠" "☠") which should return 0 but instead yields nil. This issue affects any class witch matches multibyte characters, i.e. if ‘[[:cc:]]’ matches a multibyte character X then ‘[[:cc:]]*X’ will fail to match ‘X’. * src/regex.c (executing_charset): A new function for executing the charset and charset_not opcodes. It performs check on the character taking into consideration existing bitmap, range table and class bits. It also advances the pointer in the regex bytecode past the parsed opcode. (CHARSET_LOOKUP_RANGE_TABLE_RAW, CHARSET_LOOKUP_RANGE_TABLE): Removed. Code now included in executing_charset. (mutually_exclusive_p, re_match_2_internal): Changed to take advantage of executing_charset function. * test/src/regex-tests.el: New file with tests for the character class matching.
author: Michal Nazarewicz 2016-07-18 15:59:26 +0200
committer: Michal Nazarewicz 2016-07-25 23:52:27 +0200
commit: 6dc6b0079ed3632ed9082bc79d8cb6fc96d33f43 (patch)
tree: ffad067337d44c6b559f474fb7421fa7fcf892c6
parent: b176d169347925d57ca63ab63b85d92e49a53c81 (diff)
download: emacs-6dc6b0079ed3632ed9082bc79d8cb6fc96d33f43.tar.gz
emacs-6dc6b0079ed3632ed9082bc79d8cb6fc96d33f43.zip
2 files changed, 185 insertions, 116 deletions
diff --git a/src/regex.c b/src/regex.c
index f92bcb7923b..297bf718848 100644
--- a/src/regex.c
+++ b/src/regex.c
@@ -783,44 +783,6 @@ extract_number_and_incr (re_char **source)
   and end.  */
 #define CHARSET_RANGE_TABLE_END(range_table, count)     \
  ((range_table) + (count) * 2 * 3)
-/* Test if C is in RANGE_TABLE.  A flag NOT is negated if C is in.
-   COUNT is number of ranges in RANGE_TABLE.  */
-#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count)      \
-  do                                                                    \
-    {                                                                   \
-      re_wchar_t range_start, range_end;                                \
-      re_char *rtp;                                                     \
-      re_char *range_table_end                                          \
-        = CHARSET_RANGE_TABLE_END ((range_table), (count));             \
-                                                                        \
-      for (rtp = (range_table); rtp < range_table_end; rtp += 2 * 3)    \
-        {                                                               \
-          EXTRACT_CHARACTER (range_start, rtp);                         \
-          EXTRACT_CHARACTER (range_end, rtp + 3);                       \
-                                                                        \
-          if (range_start <= (c) && (c) <= range_end)                   \
-            {                                                           \
-              (not) = !(not);                                           \
-              break;                                                    \
-            }                                                           \
-        }                                                               \
-    }                                                                   \
-  while (0)
-/* Test if C is in range table of CHARSET.  The flag NOT is negated if
-   C is listed in it.  */
-#define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset)                     \
-  do                                                                    \
-    {                                                                   \
-      /* Number of ranges in range table. */                            \
-      int count;                                                        \
-      re_char *range_table = CHARSET_RANGE_TABLE (charset);             \
-                                                                        \
-      EXTRACT_NUMBER_AND_INCR (count, range_table);                     \
-      CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count);  \
-    }                                                                   \
-  while (0)
 /* If DEBUG is defined, Regex prints many voluminous messages about what
   it is doing (if the variable `debug' is nonzero).  If linked with the
@@ -4661,6 +4623,93 @@ skip_noops (const_re_char *p, const_re_char *pend)
  return p;
 }
+/* Test if C matches charset op.  *PP points to the charset or chraset_not
+   opcode.  When the function finishes, *PP will be advanced past that opcode.
+   C is character to test (possibly after translations) and CORIG is original
+   character (i.e. without any translations).  UNIBYTE denotes whether c is
+   unibyte or multibyte character. */
+static bool
+execute_charset (const_re_char **pp, unsigned c, unsigned corig, bool unibyte)
+{
+  re_char *p = *pp, *rtp = NULL;
+  bool not = (re_opcode_t) *p == charset_not;
+  if (CHARSET_RANGE_TABLE_EXISTS_P (p))
+    {
+      int count;
+      rtp = CHARSET_RANGE_TABLE (p);
+      EXTRACT_NUMBER_AND_INCR (count, rtp);
+      *pp = CHARSET_RANGE_TABLE_END ((rtp), (count));
+    }
+  else
+    *pp += 2 + CHARSET_BITMAP_SIZE (p);
+  if (unibyte && c < (1 << BYTEWIDTH))
+    {                   /* Lookup bitmap.  */
+      /* Cast to `unsigned' instead of `unsigned char' in
+         case the bit list is a full 32 bytes long.  */
+      if (c < (unsigned) (CHARSET_BITMAP_SIZE (p) * BYTEWIDTH)
+          && p[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
+        return !not;
+    }
+#ifdef emacs
+  else if (rtp)
+    {
+      int class_bits = CHARSET_RANGE_TABLE_BITS (p);
+      re_wchar_t range_start, range_end;
+  /* Sort tests by the most commonly used classes with some adjustment to which
+     tests are easiest to perform.  Frequencies of character class names used in
+     Emacs sources as of 2016-07-15:
+     $ find \( -name \*.c -o -name \*.el \) -exec grep -h '\[:[a-z]*:]' {} + |
+           sed 's/]/]\n/g' |grep -o '\[:[a-z]*:]' |sort |uniq -c |sort -nr
+         213 [:alnum:]
+         104 [:alpha:]
+          62 [:space:]
+          39 [:digit:]
+          36 [:blank:]
+          26 [:upper:]
+          24 [:word:]
+          21 [:lower:]
+          10 [:punct:]
+          10 [:ascii:]
+           9 [:xdigit:]
+           4 [:nonascii:]
+           4 [:graph:]
+           2 [:print:]
+           2 [:cntrl:]
+           1 [:ff:]
+   */
+      if ((class_bits & BIT_MULTIBYTE) ||
+          (class_bits & BIT_ALNUM && ISALNUM (c)) ||
+          (class_bits & BIT_ALPHA && ISALPHA (c)) ||
+          (class_bits & BIT_SPACE && ISSPACE (c)) ||
+          (class_bits & BIT_WORD  && ISWORD  (c)) ||
+          ((class_bits & BIT_UPPER) &&
+           (ISUPPER (c) || (corig != c &&
+                            c == downcase (corig) && ISLOWER (c)))) ||
+          ((class_bits & BIT_LOWER) &&
+           (ISLOWER (c) || (corig != c &&
+                            c == upcase (corig) && ISUPPER(c)))) ||
+          (class_bits & BIT_PUNCT && ISPUNCT (c)) ||
+          (class_bits & BIT_GRAPH && ISGRAPH (c)) ||
+          (class_bits & BIT_PRINT && ISPRINT (c)))
+        return !not;
+      for (p = *pp; rtp < p; rtp += 2 * 3)
+        {
+          EXTRACT_CHARACTER (range_start, rtp);
+          EXTRACT_CHARACTER (range_end, rtp + 3);
+          if (range_start <= c && c <= range_end)
+            return !not;
+        }
+    }
+#endif /* emacs */
+  return not;
+}
 /* Non-zero if "p1 matches something" implies "p2 fails".  */
 static int
 mutually_exclusive_p (struct re_pattern_buffer *bufp, const_re_char *p1,
@@ -4718,22 +4767,7 @@ mutually_exclusive_p (struct re_pattern_buffer *bufp, const_re_char *p1,
        else if ((re_opcode_t) *p1 == charset
                 || (re_opcode_t) *p1 == charset_not)
          {
-            int not = (re_opcode_t) *p1 == charset_not;
+            if (!execute_charset (&p1, c, c, !multibyte))
-            /* Test if C is listed in charset (or charset_not)
-               at `p1'.  */
-            if (! multibyte || IS_REAL_ASCII (c))
-              {
-                if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
-                    && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
-                  not = !not;
-              }
-            else if (CHARSET_RANGE_TABLE_EXISTS_P (p1))
-              CHARSET_LOOKUP_RANGE_TABLE (not, c, p1);
-            /* `not' is equal to 1 if c would match, which means
-               that we can't change to pop_failure_jump.  */
-            if (!not)
              {
                DEBUG_PRINT ("   No match => fast loop.\n");
                return 1;
@@ -5439,32 +5473,13 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1,
        case charset_not:
          {
            register unsigned int c, corig;
-            boolean not = (re_opcode_t) *(p - 1) == charset_not;
            int len;
-            /* Start of actual range_table, or end of bitmap if there is no
-               range table.  */
-            re_char *range_table UNINIT;
-            /* Nonzero if there is a range table.  */
-            int range_table_exists;
-            /* Number of ranges of range table.  This is not included
-               in the initial byte-length of the command.  */
-            int count = 0;
            /* Whether matching against a unibyte character.  */
            boolean unibyte_char = false;
-            DEBUG_PRINT ("EXECUTING charset%s.\n", not ? "_not" : "");
+            DEBUG_PRINT ("EXECUTING charset%s.\n",
+                         (re_opcode_t) *(p - 1) == charset_not ? "_not" : "");
-            range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
-            if (range_table_exists)
-              {
-                range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap.  */
-                EXTRACT_NUMBER_AND_INCR (count, range_table);
-              }
            PREFETCH ();
            corig = c = RE_STRING_CHAR_AND_LENGTH (d, len, target_multibyte);
@@ -5498,47 +5513,9 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1,
                  unibyte_char = true;
              }
-            if (unibyte_char && c < (1 << BYTEWIDTH))
+            p -= 1;
-              {                 /* Lookup bitmap.  */
+            if (!execute_charset (&p, c, corig, unibyte_char))
-                /* Cast to `unsigned' instead of `unsigned char' in
+              goto fail;
-                   case the bit list is a full 32 bytes long.  */
-                if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
-                    && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
-                  not = !not;
-              }
-#ifdef emacs
-            else if (range_table_exists)
-              {
-                int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
-                if (  (class_bits & BIT_LOWER
-                       && (ISLOWER (c)
-                           || (corig != c
-                               && c == upcase (corig) && ISUPPER(c))))
-                    | (class_bits & BIT_MULTIBYTE)
-                    | (class_bits & BIT_PUNCT && ISPUNCT (c))
-                    | (class_bits & BIT_SPACE && ISSPACE (c))
-                    | (class_bits & BIT_UPPER
-                       && (ISUPPER (c)
-                           || (corig != c
-                               && c == downcase (corig) && ISLOWER (c))))
-                    | (class_bits & BIT_WORD  && ISWORD  (c))
-                    | (class_bits & BIT_ALPHA && ISALPHA (c))
-                    | (class_bits & BIT_ALNUM && ISALNUM (c))
-                    | (class_bits & BIT_GRAPH && ISGRAPH (c))
-                    | (class_bits & BIT_PRINT && ISPRINT (c)))
-                  not = !not;
-                else
-                  CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
-              }
-#endif /* emacs */
-            if (range_table_exists)
-              p = CHARSET_RANGE_TABLE_END (range_table, count);
-            else
-              p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
-            if (!not) goto fail;
            d += len;
          }
diff --git a/test/src/regex-tests.el b/test/src/regex-tests.el
new file mode 100644
index 00000000000..00165ab0512
--- /dev/null
+++ b/test/src/regex-tests.el
@@ -0,0 +1,92 @@
+;;; regex-tests.el --- tests for regex.c functions -*- lexical-binding: t -*-
+;; Copyright (C) 2015-2016 Free Software Foundation, Inc.
+;; This file is part of GNU Emacs.
+;; GNU Emacs is free software: you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation, either version 3 of the License, or
+;; (at your option) any later version.
+;; GNU Emacs is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+;; You should have received a copy of the GNU General Public License
+;; along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.
+;;; Code:
+(require 'ert)
+(ert-deftest regex-word-cc-fallback-test ()
+  "Test that ‘[[:cc:]]*x’ matches ‘x’ (bug#24020).
+Test that a regex of the form \"[[:cc:]]*x\" where CC is
+a character class which matches a multibyte character X, matches
+string \"x\".
+For example, ‘[[:word:]]*\u2620’ regex (note: \u2620 is a word
+character) must match a string \"\u2420\"."
+  (dolist (class '("[[:word:]]" "\\sw"))
+    (dolist (repeat '("*" "+"))
+      (dolist (suffix '("" "b" "bar" "\u2620"))
+        (dolist (string '("" "foo"))
+          (when (not (and (string-equal repeat "+")
+                          (string-equal string "")))
+            (should (string-match (concat "^" class repeat suffix "$")
+                                  (concat string suffix)))))))))
+(defun regex--test-cc (name matching not-matching)
+  (should (string-match-p (concat "^[[:" name ":]]*$") matching))
+  (should (string-match-p (concat "^[[:" name ":]]*?\u2622$")
+                          (concat matching "\u2622")))
+  (should (string-match-p (concat "^[^[:" name ":]]*$") not-matching))
+  (should (string-match-p (concat "^[^[:" name ":]]*\u2622$")
+                          (concat not-matching "\u2622")))
+  (with-temp-buffer
+    (insert matching)
+    (let ((p (point)))
+      (insert not-matching)
+      (goto-char (point-min))
+      (skip-chars-forward (concat "[:" name ":]"))
+      (should (equal (point) p))
+      (skip-chars-forward (concat "^[:" name ":]"))
+      (should (equal (point) (point-max)))
+      (goto-char (point-min))
+      (skip-chars-forward (concat "[:" name ":]\u2622"))
+      (should (or (equal (point) p) (equal (point) (1+ p)))))))
+(ert-deftest regex-character-classes ()
+  "Perform sanity test of regexes using character classes.
+Go over all the supported character classes and test whether the
+classes and their inversions match what they are supposed to
+match.  The test is done using `string-match-p' as well as
+`skip-chars-forward'."
+  (let (case-fold-search)
+    (regex--test-cc "alnum" "abcABC012łąka" "-, \t\n")
+    (regex--test-cc "alpha" "abcABCłąka" "-,012 \t\n")
+    (regex--test-cc "digit" "012" "abcABCłąka-, \t\n")
+    (regex--test-cc "xdigit" "0123aBc" "łąk-, \t\n")
+    (regex--test-cc "upper" "ABCŁĄKA" "abc012-, \t\n")
+    (regex--test-cc "lower" "abcłąka" "ABC012-, \t\n")
+    (regex--test-cc "word" "abcABC012\u2620" "-, \t\n")
+    (regex--test-cc "punct" ".,-" "abcABC012\u2620 \t\n")
+    (regex--test-cc "cntrl" "\1\2\t\n" ".,-abcABC012\u2620 ")
+    (regex--test-cc "graph" "abcłąka\u2620-," " \t\n\1")
+    (regex--test-cc "print" "abcłąka\u2620-, " "\t\n\1")
+    (regex--test-cc "space" " \t\n\u2001" "abcABCł0123")
+    (regex--test-cc "blank" " \t" "\n\u2001")
+    (regex--test-cc "ascii" "abcABC012 \t\n\1" "łą\u2620")
+    (regex--test-cc "nonascii" "łą\u2622" "abcABC012 \t\n\1")
+    (regex--test-cc "unibyte" "abcABC012 \t\n\1" "łą\u2622")
+    (regex--test-cc "multibyte" "łą\u2622" "abcABC012 \t\n\1")))
+;;; regex-tests.el ends here
author	Michal Nazarewicz	2016-07-18 15:59:26 +0200
committer	Michal Nazarewicz	2016-07-25 23:52:27 +0200
commit	6dc6b0079ed3632ed9082bc79d8cb6fc96d33f43 (patch)
tree	ffad067337d44c6b559f474fb7421fa7fcf892c6
parent	b176d169347925d57ca63ab63b85d92e49a53c81 (diff)
download	emacs-6dc6b0079ed3632ed9082bc79d8cb6fc96d33f43.tar.gz emacs-6dc6b0079ed3632ed9082bc79d8cb6fc96d33f43.zip

diff --git a/src/regex.c b/src/regex.c index f92bcb7923b..297bf718848 100644 --- a/src/regex.c +++ b/src/regex.c
@@ -783,44 +783,6 @@ extract_number_and_incr (re_char **source)
783	and end. */	783	and end. */
784	#define CHARSET_RANGE_TABLE_END(range_table, count) \	784	#define CHARSET_RANGE_TABLE_END(range_table, count) \
785	((range_table) + (count) * 2 * 3)	785	((range_table) + (count) * 2 * 3)
786
787	/* Test if C is in RANGE_TABLE. A flag NOT is negated if C is in.
788	COUNT is number of ranges in RANGE_TABLE. */
789	#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
790	do \
791	{ \
792	re_wchar_t range_start, range_end; \
793	re_char *rtp; \
794	re_char *range_table_end \
795	= CHARSET_RANGE_TABLE_END ((range_table), (count)); \
796	\
797	for (rtp = (range_table); rtp < range_table_end; rtp += 2 * 3) \
798	{ \
799	EXTRACT_CHARACTER (range_start, rtp); \
800	EXTRACT_CHARACTER (range_end, rtp + 3); \
801	\
802	if (range_start <= (c) && (c) <= range_end) \
803	{ \
804	(not) = !(not); \
805	break; \
806	} \
807	} \
808	} \
809	while (0)
810
811	/* Test if C is in range table of CHARSET. The flag NOT is negated if
812	C is listed in it. */
813	#define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
814	do \
815	{ \
816	/* Number of ranges in range table. */ \
817	int count; \
818	re_char *range_table = CHARSET_RANGE_TABLE (charset); \
819	\
820	EXTRACT_NUMBER_AND_INCR (count, range_table); \
821	CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
822	} \
823	while (0)
824		786
825	/* If DEBUG is defined, Regex prints many voluminous messages about what	787	/* If DEBUG is defined, Regex prints many voluminous messages about what
826	it is doing (if the variable `debug' is nonzero). If linked with the	788	it is doing (if the variable `debug' is nonzero). If linked with the
@@ -4661,6 +4623,93 @@ skip_noops (const_re_char p, const_re_char pend)
4661	return p;	4623	return p;
4662	}	4624	}
4663		4625
		4626	/* Test if C matches charset op. *PP points to the charset or chraset_not
		4627	opcode. When the function finishes, *PP will be advanced past that opcode.
		4628	C is character to test (possibly after translations) and CORIG is original
		4629	character (i.e. without any translations). UNIBYTE denotes whether c is
		4630	unibyte or multibyte character. */
		4631	static bool
		4632	execute_charset (const_re_char **pp, unsigned c, unsigned corig, bool unibyte)
		4633	{
		4634	re_char p = pp, *rtp = NULL;
		4635	bool not = (re_opcode_t) *p == charset_not;
		4636
		4637	if (CHARSET_RANGE_TABLE_EXISTS_P (p))
		4638	{
		4639	int count;
		4640	rtp = CHARSET_RANGE_TABLE (p);
		4641	EXTRACT_NUMBER_AND_INCR (count, rtp);
		4642	*pp = CHARSET_RANGE_TABLE_END ((rtp), (count));
		4643	}
		4644	else
		4645	*pp += 2 + CHARSET_BITMAP_SIZE (p);
		4646
		4647	if (unibyte && c < (1 << BYTEWIDTH))
		4648	{ /* Lookup bitmap. */
		4649	/* Cast to `unsigned' instead of `unsigned char' in
		4650	case the bit list is a full 32 bytes long. */
		4651	if (c < (unsigned) (CHARSET_BITMAP_SIZE (p) * BYTEWIDTH)
		4652	&& p[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
		4653	return !not;
		4654	}
		4655	#ifdef emacs
		4656	else if (rtp)
		4657	{
		4658	int class_bits = CHARSET_RANGE_TABLE_BITS (p);
		4659	re_wchar_t range_start, range_end;
		4660
		4661	/* Sort tests by the most commonly used classes with some adjustment to which
		4662	tests are easiest to perform. Frequencies of character class names used in
		4663	Emacs sources as of 2016-07-15:
		4664
		4665	$ find \( -name \.c -o -name \.el \) -exec grep -h '\[:[a-z]*:]' {} + \|
		4666	sed 's/]/]\n/g' \|grep -o '\[:[a-z]*:]' \|sort \|uniq -c \|sort -nr
		4667	213 [:alnum:]
		4668	104 [:alpha:]
		4669	62 [:space:]
		4670	39 [:digit:]
		4671	36 [:blank:]
		4672	26 [:upper:]
		4673	24 [:word:]
		4674	21 [:lower:]
		4675	10 [:punct:]
		4676	10 [:ascii:]
		4677	9 [:xdigit:]
		4678	4 [:nonascii:]
		4679	4 [:graph:]
		4680	2 [:print:]
		4681	2 [:cntrl:]
		4682	1 [:ff:]
		4683	*/
		4684
		4685	if ((class_bits & BIT_MULTIBYTE) \|\|
		4686	(class_bits & BIT_ALNUM && ISALNUM (c)) \|\|
		4687	(class_bits & BIT_ALPHA && ISALPHA (c)) \|\|
		4688	(class_bits & BIT_SPACE && ISSPACE (c)) \|\|
		4689	(class_bits & BIT_WORD && ISWORD (c)) \|\|
		4690	((class_bits & BIT_UPPER) &&
		4691	(ISUPPER (c) \|\| (corig != c &&
		4692	c == downcase (corig) && ISLOWER (c)))) \|\|
		4693	((class_bits & BIT_LOWER) &&
		4694	(ISLOWER (c) \|\| (corig != c &&
		4695	c == upcase (corig) && ISUPPER(c)))) \|\|
		4696	(class_bits & BIT_PUNCT && ISPUNCT (c)) \|\|
		4697	(class_bits & BIT_GRAPH && ISGRAPH (c)) \|\|
		4698	(class_bits & BIT_PRINT && ISPRINT (c)))
		4699	return !not;
		4700
		4701	for (p = pp; rtp < p; rtp += 2 3)
		4702	{
		4703	EXTRACT_CHARACTER (range_start, rtp);
		4704	EXTRACT_CHARACTER (range_end, rtp + 3);
		4705	if (range_start <= c && c <= range_end)
		4706	return !not;
		4707	}
		4708	}
		4709	#endif /* emacs */
		4710	return not;
		4711	}
		4712
4664	/* Non-zero if "p1 matches something" implies "p2 fails". */	4713	/* Non-zero if "p1 matches something" implies "p2 fails". */
4665	static int	4714	static int
4666	mutually_exclusive_p (struct re_pattern_buffer bufp, const_re_char p1,	4715	mutually_exclusive_p (struct re_pattern_buffer bufp, const_re_char p1,
@@ -4718,22 +4767,7 @@ mutually_exclusive_p (struct re_pattern_buffer bufp, const_re_char p1,
4718	else if ((re_opcode_t) *p1 == charset	4767	else if ((re_opcode_t) *p1 == charset
4719	\|\| (re_opcode_t) *p1 == charset_not)	4768	\|\| (re_opcode_t) *p1 == charset_not)
4720	{	4769	{
4721	int not = (re_opcode_t) *p1 == charset_not;	4770	if (!execute_charset (&p1, c, c, !multibyte))
4722
4723	/* Test if C is listed in charset (or charset_not)
4724	at `p1'. */
4725	if (! multibyte \|\| IS_REAL_ASCII (c))
4726	{
4727	if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
4728	&& p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4729	not = !not;
4730	}
4731	else if (CHARSET_RANGE_TABLE_EXISTS_P (p1))
4732	CHARSET_LOOKUP_RANGE_TABLE (not, c, p1);
4733
4734	/* `not' is equal to 1 if c would match, which means
4735	that we can't change to pop_failure_jump. */
4736	if (!not)
4737	{	4771	{
4738	DEBUG_PRINT (" No match => fast loop.\n");	4772	DEBUG_PRINT (" No match => fast loop.\n");
4739	return 1;	4773	return 1;
@@ -5439,32 +5473,13 @@ re_match_2_internal (struct re_pattern_buffer bufp, const_re_char string1,
5439	case charset_not:	5473	case charset_not:
5440	{	5474	{
5441	register unsigned int c, corig;	5475	register unsigned int c, corig;
5442	boolean not = (re_opcode_t) *(p - 1) == charset_not;
5443	int len;	5476	int len;
5444		5477
5445	/* Start of actual range_table, or end of bitmap if there is no
5446	range table. */
5447	re_char *range_table UNINIT;
5448
5449	/* Nonzero if there is a range table. */
5450	int range_table_exists;
5451
5452	/* Number of ranges of range table. This is not included
5453	in the initial byte-length of the command. */
5454	int count = 0;
5455
5456	/* Whether matching against a unibyte character. */	5478	/* Whether matching against a unibyte character. */
5457	boolean unibyte_char = false;	5479	boolean unibyte_char = false;
5458		5480
5459	DEBUG_PRINT ("EXECUTING charset%s.\n", not ? "_not" : "");	5481	DEBUG_PRINT ("EXECUTING charset%s.\n",
5460		5482	(re_opcode_t) *(p - 1) == charset_not ? "_not" : "");
5461	range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
5462
5463	if (range_table_exists)
5464	{
5465	range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
5466	EXTRACT_NUMBER_AND_INCR (count, range_table);
5467	}
5468		5483
5469	PREFETCH ();	5484	PREFETCH ();
5470	corig = c = RE_STRING_CHAR_AND_LENGTH (d, len, target_multibyte);	5485	corig = c = RE_STRING_CHAR_AND_LENGTH (d, len, target_multibyte);
@@ -5498,47 +5513,9 @@ re_match_2_internal (struct re_pattern_buffer bufp, const_re_char string1,
5498	unibyte_char = true;	5513	unibyte_char = true;
5499	}	5514	}
5500		5515
5501	if (unibyte_char && c < (1 << BYTEWIDTH))	5516	p -= 1;
5502	{ /* Lookup bitmap. */	5517	if (!execute_charset (&p, c, corig, unibyte_char))
5503	/* Cast to `unsigned' instead of `unsigned char' in	5518	goto fail;
5504	case the bit list is a full 32 bytes long. */
5505	if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
5506	&& p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5507	not = !not;
5508	}
5509	#ifdef emacs
5510	else if (range_table_exists)
5511	{
5512	int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
5513
5514	if ( (class_bits & BIT_LOWER
5515	&& (ISLOWER (c)
5516	\|\| (corig != c
5517	&& c == upcase (corig) && ISUPPER(c))))
5518	\| (class_bits & BIT_MULTIBYTE)
5519	\| (class_bits & BIT_PUNCT && ISPUNCT (c))
5520	\| (class_bits & BIT_SPACE && ISSPACE (c))
5521	\| (class_bits & BIT_UPPER
5522	&& (ISUPPER (c)
5523	\|\| (corig != c
5524	&& c == downcase (corig) && ISLOWER (c))))
5525	\| (class_bits & BIT_WORD && ISWORD (c))
5526	\| (class_bits & BIT_ALPHA && ISALPHA (c))
5527	\| (class_bits & BIT_ALNUM && ISALNUM (c))
5528	\| (class_bits & BIT_GRAPH && ISGRAPH (c))
5529	\| (class_bits & BIT_PRINT && ISPRINT (c)))
5530	not = !not;
5531	else
5532	CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
5533	}
5534	#endif /* emacs */
5535
5536	if (range_table_exists)
5537	p = CHARSET_RANGE_TABLE_END (range_table, count);
5538	else
5539	p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
5540
5541	if (!not) goto fail;
5542		5519
5543	d += len;	5520	d += len;
5544	}	5521	}


diff --git a/test/src/regex-tests.el b/test/src/regex-tests.el new file mode 100644 index 00000000000..00165ab0512 --- /dev/null +++ b/test/src/regex-tests.el
@@ -0,0 +1,92 @@
		1	;;; regex-tests.el --- tests for regex.c functions -- lexical-binding: t --
		2
		3	;; Copyright (C) 2015-2016 Free Software Foundation, Inc.
		4
		5	;; This file is part of GNU Emacs.
		6
		7	;; GNU Emacs is free software: you can redistribute it and/or modify
		8	;; it under the terms of the GNU General Public License as published by
		9	;; the Free Software Foundation, either version 3 of the License, or
		10	;; (at your option) any later version.
		11
		12	;; GNU Emacs is distributed in the hope that it will be useful,
		13	;; but WITHOUT ANY WARRANTY; without even the implied warranty of
		14	;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
		15	;; GNU General Public License for more details.
		16
		17	;; You should have received a copy of the GNU General Public License
		18	;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
		19
		20	;;; Code:
		21
		22	(require 'ert)
		23
		24	(ert-deftest regex-word-cc-fallback-test ()
		25	"Test that ‘[[:cc:]]*x’ matches ‘x’ (bug#24020).
		26
		27	Test that a regex of the form \"[[:cc:]]*x\" where CC is
		28	a character class which matches a multibyte character X, matches
		29	string \"x\".
		30
		31	For example, ‘[[:word:]]*\u2620’ regex (note: \u2620 is a word
		32	character) must match a string \"\u2420\"."
		33	(dolist (class '("[[:word:]]" "\\sw"))
		34	(dolist (repeat '("*" "+"))
		35	(dolist (suffix '("" "b" "bar" "\u2620"))
		36	(dolist (string '("" "foo"))
		37	(when (not (and (string-equal repeat "+")
		38	(string-equal string "")))
		39	(should (string-match (concat "^" class repeat suffix "$")
		40	(concat string suffix)))))))))
		41
		42	(defun regex--test-cc (name matching not-matching)
		43	(should (string-match-p (concat "^[[:" name ":]]*$") matching))
		44	(should (string-match-p (concat "^[[:" name ":]]*?\u2622$")
		45	(concat matching "\u2622")))
		46	(should (string-match-p (concat "^[^[:" name ":]]*$") not-matching))
		47	(should (string-match-p (concat "^[^[:" name ":]]*\u2622$")
		48	(concat not-matching "\u2622")))
		49	(with-temp-buffer
		50	(insert matching)
		51	(let ((p (point)))
		52	(insert not-matching)
		53	(goto-char (point-min))
		54	(skip-chars-forward (concat "[:" name ":]"))
		55	(should (equal (point) p))
		56	(skip-chars-forward (concat "^[:" name ":]"))
		57	(should (equal (point) (point-max)))
		58	(goto-char (point-min))
		59	(skip-chars-forward (concat "[:" name ":]\u2622"))
		60	(should (or (equal (point) p) (equal (point) (1+ p)))))))
		61
		62	(ert-deftest regex-character-classes ()
		63	"Perform sanity test of regexes using character classes.
		64
		65	Go over all the supported character classes and test whether the
		66	classes and their inversions match what they are supposed to
		67	match. The test is done using `string-match-p' as well as
		68	`skip-chars-forward'."
		69	(let (case-fold-search)
		70	(regex--test-cc "alnum" "abcABC012łąka" "-, \t\n")
		71	(regex--test-cc "alpha" "abcABCłąka" "-,012 \t\n")
		72	(regex--test-cc "digit" "012" "abcABCłąka-, \t\n")
		73	(regex--test-cc "xdigit" "0123aBc" "łąk-, \t\n")
		74	(regex--test-cc "upper" "ABCŁĄKA" "abc012-, \t\n")
		75	(regex--test-cc "lower" "abcłąka" "ABC012-, \t\n")
		76
		77	(regex--test-cc "word" "abcABC012\u2620" "-, \t\n")
		78
		79	(regex--test-cc "punct" ".,-" "abcABC012\u2620 \t\n")
		80	(regex--test-cc "cntrl" "\1\2\t\n" ".,-abcABC012\u2620 ")
		81	(regex--test-cc "graph" "abcłąka\u2620-," " \t\n\1")
		82	(regex--test-cc "print" "abcłąka\u2620-, " "\t\n\1")
		83
		84	(regex--test-cc "space" " \t\n\u2001" "abcABCł0123")
		85	(regex--test-cc "blank" " \t" "\n\u2001")
		86
		87	(regex--test-cc "ascii" "abcABC012 \t\n\1" "łą\u2620")
		88	(regex--test-cc "nonascii" "łą\u2622" "abcABC012 \t\n\1")
		89	(regex--test-cc "unibyte" "abcABC012 \t\n\1" "łą\u2622")
		90	(regex--test-cc "multibyte" "łą\u2622" "abcABC012 \t\n\1")))
		91
		92	;;; regex-tests.el ends here