aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMichal Nazarewicz2016-07-18 15:59:26 +0200
committerMichal Nazarewicz2016-07-25 23:52:27 +0200
commit6dc6b0079ed3632ed9082bc79d8cb6fc96d33f43 (patch)
treeffad067337d44c6b559f474fb7421fa7fcf892c6
parentb176d169347925d57ca63ab63b85d92e49a53c81 (diff)
downloademacs-6dc6b0079ed3632ed9082bc79d8cb6fc96d33f43.tar.gz
emacs-6dc6b0079ed3632ed9082bc79d8cb6fc96d33f43.zip
Fix ‘[[:cc:]]*literal’ regex failing to match ‘literal’ (bug#24020)
The regex engine tries to optimise Kleene star by avoiding backtracking when it can detect that star’s operand cannot match what follows it in the pattern. For example, when ‘[[:alpha:]]*1’ tries to match a ‘foo’, the engine will test the longest match for ‘[[:alpha:]]*’, namely ’foo’ which is the entire string. Literal digit one still present in the pattern will however not match the remaining empty string. Normally, backtracking would be performed trying a shorter match for the character class (namely ‘fo’ leaving ‘o’ in the string), but since the engine knows whatever would be put back into the string cannot possibly match literal digit one so no backtracking will be attempted. In the regexes of the form ‘[[:CC:]]*X’, the optimisation can be applied if the character class CC does not match character X. In the above example, this holds because digit one is not in alpha character class. This test is performed by mutually_exclusive_p function but it did not check class bits of a charset opcode. This resulted in an assumption that character classes do not match multibyte characters. For example, it would incorrectly conclude that [[:alpha:]] doesn’t match ‘ż’. This, in turn, led to the aforementioned Kleene star optimisation being incorrectly applied in patterns such as ‘[[:graph:]]*☠’ (which should match ‘☠’ but doesn’t as can be tested by executing (string-match-p "[[:graph:]]*☠" "☠") which should return 0 but instead yields nil. This issue affects any class witch matches multibyte characters, i.e. if ‘[[:cc:]]’ matches a multibyte character X then ‘[[:cc:]]*X’ will fail to match ‘X’. * src/regex.c (executing_charset): A new function for executing the charset and charset_not opcodes. It performs check on the character taking into consideration existing bitmap, range table and class bits. It also advances the pointer in the regex bytecode past the parsed opcode. (CHARSET_LOOKUP_RANGE_TABLE_RAW, CHARSET_LOOKUP_RANGE_TABLE): Removed. Code now included in executing_charset. (mutually_exclusive_p, re_match_2_internal): Changed to take advantage of executing_charset function. * test/src/regex-tests.el: New file with tests for the character class matching.
-rw-r--r--src/regex.c209
-rw-r--r--test/src/regex-tests.el92
2 files changed, 185 insertions, 116 deletions
diff --git a/src/regex.c b/src/regex.c
index f92bcb7923b..297bf718848 100644
--- a/src/regex.c
+++ b/src/regex.c
@@ -783,44 +783,6 @@ extract_number_and_incr (re_char **source)
783 and end. */ 783 and end. */
784#define CHARSET_RANGE_TABLE_END(range_table, count) \ 784#define CHARSET_RANGE_TABLE_END(range_table, count) \
785 ((range_table) + (count) * 2 * 3) 785 ((range_table) + (count) * 2 * 3)
786
787/* Test if C is in RANGE_TABLE. A flag NOT is negated if C is in.
788 COUNT is number of ranges in RANGE_TABLE. */
789#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
790 do \
791 { \
792 re_wchar_t range_start, range_end; \
793 re_char *rtp; \
794 re_char *range_table_end \
795 = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
796 \
797 for (rtp = (range_table); rtp < range_table_end; rtp += 2 * 3) \
798 { \
799 EXTRACT_CHARACTER (range_start, rtp); \
800 EXTRACT_CHARACTER (range_end, rtp + 3); \
801 \
802 if (range_start <= (c) && (c) <= range_end) \
803 { \
804 (not) = !(not); \
805 break; \
806 } \
807 } \
808 } \
809 while (0)
810
811/* Test if C is in range table of CHARSET. The flag NOT is negated if
812 C is listed in it. */
813#define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
814 do \
815 { \
816 /* Number of ranges in range table. */ \
817 int count; \
818 re_char *range_table = CHARSET_RANGE_TABLE (charset); \
819 \
820 EXTRACT_NUMBER_AND_INCR (count, range_table); \
821 CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
822 } \
823 while (0)
824 786
825/* If DEBUG is defined, Regex prints many voluminous messages about what 787/* If DEBUG is defined, Regex prints many voluminous messages about what
826 it is doing (if the variable `debug' is nonzero). If linked with the 788 it is doing (if the variable `debug' is nonzero). If linked with the
@@ -4661,6 +4623,93 @@ skip_noops (const_re_char *p, const_re_char *pend)
4661 return p; 4623 return p;
4662} 4624}
4663 4625
4626/* Test if C matches charset op. *PP points to the charset or chraset_not
4627 opcode. When the function finishes, *PP will be advanced past that opcode.
4628 C is character to test (possibly after translations) and CORIG is original
4629 character (i.e. without any translations). UNIBYTE denotes whether c is
4630 unibyte or multibyte character. */
4631static bool
4632execute_charset (const_re_char **pp, unsigned c, unsigned corig, bool unibyte)
4633{
4634 re_char *p = *pp, *rtp = NULL;
4635 bool not = (re_opcode_t) *p == charset_not;
4636
4637 if (CHARSET_RANGE_TABLE_EXISTS_P (p))
4638 {
4639 int count;
4640 rtp = CHARSET_RANGE_TABLE (p);
4641 EXTRACT_NUMBER_AND_INCR (count, rtp);
4642 *pp = CHARSET_RANGE_TABLE_END ((rtp), (count));
4643 }
4644 else
4645 *pp += 2 + CHARSET_BITMAP_SIZE (p);
4646
4647 if (unibyte && c < (1 << BYTEWIDTH))
4648 { /* Lookup bitmap. */
4649 /* Cast to `unsigned' instead of `unsigned char' in
4650 case the bit list is a full 32 bytes long. */
4651 if (c < (unsigned) (CHARSET_BITMAP_SIZE (p) * BYTEWIDTH)
4652 && p[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4653 return !not;
4654 }
4655#ifdef emacs
4656 else if (rtp)
4657 {
4658 int class_bits = CHARSET_RANGE_TABLE_BITS (p);
4659 re_wchar_t range_start, range_end;
4660
4661 /* Sort tests by the most commonly used classes with some adjustment to which
4662 tests are easiest to perform. Frequencies of character class names used in
4663 Emacs sources as of 2016-07-15:
4664
4665 $ find \( -name \*.c -o -name \*.el \) -exec grep -h '\[:[a-z]*:]' {} + |
4666 sed 's/]/]\n/g' |grep -o '\[:[a-z]*:]' |sort |uniq -c |sort -nr
4667 213 [:alnum:]
4668 104 [:alpha:]
4669 62 [:space:]
4670 39 [:digit:]
4671 36 [:blank:]
4672 26 [:upper:]
4673 24 [:word:]
4674 21 [:lower:]
4675 10 [:punct:]
4676 10 [:ascii:]
4677 9 [:xdigit:]
4678 4 [:nonascii:]
4679 4 [:graph:]
4680 2 [:print:]
4681 2 [:cntrl:]
4682 1 [:ff:]
4683 */
4684
4685 if ((class_bits & BIT_MULTIBYTE) ||
4686 (class_bits & BIT_ALNUM && ISALNUM (c)) ||
4687 (class_bits & BIT_ALPHA && ISALPHA (c)) ||
4688 (class_bits & BIT_SPACE && ISSPACE (c)) ||
4689 (class_bits & BIT_WORD && ISWORD (c)) ||
4690 ((class_bits & BIT_UPPER) &&
4691 (ISUPPER (c) || (corig != c &&
4692 c == downcase (corig) && ISLOWER (c)))) ||
4693 ((class_bits & BIT_LOWER) &&
4694 (ISLOWER (c) || (corig != c &&
4695 c == upcase (corig) && ISUPPER(c)))) ||
4696 (class_bits & BIT_PUNCT && ISPUNCT (c)) ||
4697 (class_bits & BIT_GRAPH && ISGRAPH (c)) ||
4698 (class_bits & BIT_PRINT && ISPRINT (c)))
4699 return !not;
4700
4701 for (p = *pp; rtp < p; rtp += 2 * 3)
4702 {
4703 EXTRACT_CHARACTER (range_start, rtp);
4704 EXTRACT_CHARACTER (range_end, rtp + 3);
4705 if (range_start <= c && c <= range_end)
4706 return !not;
4707 }
4708 }
4709#endif /* emacs */
4710 return not;
4711}
4712
4664/* Non-zero if "p1 matches something" implies "p2 fails". */ 4713/* Non-zero if "p1 matches something" implies "p2 fails". */
4665static int 4714static int
4666mutually_exclusive_p (struct re_pattern_buffer *bufp, const_re_char *p1, 4715mutually_exclusive_p (struct re_pattern_buffer *bufp, const_re_char *p1,
@@ -4718,22 +4767,7 @@ mutually_exclusive_p (struct re_pattern_buffer *bufp, const_re_char *p1,
4718 else if ((re_opcode_t) *p1 == charset 4767 else if ((re_opcode_t) *p1 == charset
4719 || (re_opcode_t) *p1 == charset_not) 4768 || (re_opcode_t) *p1 == charset_not)
4720 { 4769 {
4721 int not = (re_opcode_t) *p1 == charset_not; 4770 if (!execute_charset (&p1, c, c, !multibyte))
4722
4723 /* Test if C is listed in charset (or charset_not)
4724 at `p1'. */
4725 if (! multibyte || IS_REAL_ASCII (c))
4726 {
4727 if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
4728 && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4729 not = !not;
4730 }
4731 else if (CHARSET_RANGE_TABLE_EXISTS_P (p1))
4732 CHARSET_LOOKUP_RANGE_TABLE (not, c, p1);
4733
4734 /* `not' is equal to 1 if c would match, which means
4735 that we can't change to pop_failure_jump. */
4736 if (!not)
4737 { 4771 {
4738 DEBUG_PRINT (" No match => fast loop.\n"); 4772 DEBUG_PRINT (" No match => fast loop.\n");
4739 return 1; 4773 return 1;
@@ -5439,32 +5473,13 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1,
5439 case charset_not: 5473 case charset_not:
5440 { 5474 {
5441 register unsigned int c, corig; 5475 register unsigned int c, corig;
5442 boolean not = (re_opcode_t) *(p - 1) == charset_not;
5443 int len; 5476 int len;
5444 5477
5445 /* Start of actual range_table, or end of bitmap if there is no
5446 range table. */
5447 re_char *range_table UNINIT;
5448
5449 /* Nonzero if there is a range table. */
5450 int range_table_exists;
5451
5452 /* Number of ranges of range table. This is not included
5453 in the initial byte-length of the command. */
5454 int count = 0;
5455
5456 /* Whether matching against a unibyte character. */ 5478 /* Whether matching against a unibyte character. */
5457 boolean unibyte_char = false; 5479 boolean unibyte_char = false;
5458 5480
5459 DEBUG_PRINT ("EXECUTING charset%s.\n", not ? "_not" : ""); 5481 DEBUG_PRINT ("EXECUTING charset%s.\n",
5460 5482 (re_opcode_t) *(p - 1) == charset_not ? "_not" : "");
5461 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
5462
5463 if (range_table_exists)
5464 {
5465 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
5466 EXTRACT_NUMBER_AND_INCR (count, range_table);
5467 }
5468 5483
5469 PREFETCH (); 5484 PREFETCH ();
5470 corig = c = RE_STRING_CHAR_AND_LENGTH (d, len, target_multibyte); 5485 corig = c = RE_STRING_CHAR_AND_LENGTH (d, len, target_multibyte);
@@ -5498,47 +5513,9 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1,
5498 unibyte_char = true; 5513 unibyte_char = true;
5499 } 5514 }
5500 5515
5501 if (unibyte_char && c < (1 << BYTEWIDTH)) 5516 p -= 1;
5502 { /* Lookup bitmap. */ 5517 if (!execute_charset (&p, c, corig, unibyte_char))
5503 /* Cast to `unsigned' instead of `unsigned char' in 5518 goto fail;
5504 case the bit list is a full 32 bytes long. */
5505 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
5506 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5507 not = !not;
5508 }
5509#ifdef emacs
5510 else if (range_table_exists)
5511 {
5512 int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
5513
5514 if ( (class_bits & BIT_LOWER
5515 && (ISLOWER (c)
5516 || (corig != c
5517 && c == upcase (corig) && ISUPPER(c))))
5518 | (class_bits & BIT_MULTIBYTE)
5519 | (class_bits & BIT_PUNCT && ISPUNCT (c))
5520 | (class_bits & BIT_SPACE && ISSPACE (c))
5521 | (class_bits & BIT_UPPER
5522 && (ISUPPER (c)
5523 || (corig != c
5524 && c == downcase (corig) && ISLOWER (c))))
5525 | (class_bits & BIT_WORD && ISWORD (c))
5526 | (class_bits & BIT_ALPHA && ISALPHA (c))
5527 | (class_bits & BIT_ALNUM && ISALNUM (c))
5528 | (class_bits & BIT_GRAPH && ISGRAPH (c))
5529 | (class_bits & BIT_PRINT && ISPRINT (c)))
5530 not = !not;
5531 else
5532 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
5533 }
5534#endif /* emacs */
5535
5536 if (range_table_exists)
5537 p = CHARSET_RANGE_TABLE_END (range_table, count);
5538 else
5539 p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
5540
5541 if (!not) goto fail;
5542 5519
5543 d += len; 5520 d += len;
5544 } 5521 }
diff --git a/test/src/regex-tests.el b/test/src/regex-tests.el
new file mode 100644
index 00000000000..00165ab0512
--- /dev/null
+++ b/test/src/regex-tests.el
@@ -0,0 +1,92 @@
1;;; regex-tests.el --- tests for regex.c functions -*- lexical-binding: t -*-
2
3;; Copyright (C) 2015-2016 Free Software Foundation, Inc.
4
5;; This file is part of GNU Emacs.
6
7;; GNU Emacs is free software: you can redistribute it and/or modify
8;; it under the terms of the GNU General Public License as published by
9;; the Free Software Foundation, either version 3 of the License, or
10;; (at your option) any later version.
11
12;; GNU Emacs is distributed in the hope that it will be useful,
13;; but WITHOUT ANY WARRANTY; without even the implied warranty of
14;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15;; GNU General Public License for more details.
16
17;; You should have received a copy of the GNU General Public License
18;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
19
20;;; Code:
21
22(require 'ert)
23
24(ert-deftest regex-word-cc-fallback-test ()
25 "Test that ‘[[:cc:]]*x’ matches ‘x’ (bug#24020).
26
27Test that a regex of the form \"[[:cc:]]*x\" where CC is
28a character class which matches a multibyte character X, matches
29string \"x\".
30
31For example, ‘[[:word:]]*\u2620’ regex (note: \u2620 is a word
32character) must match a string \"\u2420\"."
33 (dolist (class '("[[:word:]]" "\\sw"))
34 (dolist (repeat '("*" "+"))
35 (dolist (suffix '("" "b" "bar" "\u2620"))
36 (dolist (string '("" "foo"))
37 (when (not (and (string-equal repeat "+")
38 (string-equal string "")))
39 (should (string-match (concat "^" class repeat suffix "$")
40 (concat string suffix)))))))))
41
42(defun regex--test-cc (name matching not-matching)
43 (should (string-match-p (concat "^[[:" name ":]]*$") matching))
44 (should (string-match-p (concat "^[[:" name ":]]*?\u2622$")
45 (concat matching "\u2622")))
46 (should (string-match-p (concat "^[^[:" name ":]]*$") not-matching))
47 (should (string-match-p (concat "^[^[:" name ":]]*\u2622$")
48 (concat not-matching "\u2622")))
49 (with-temp-buffer
50 (insert matching)
51 (let ((p (point)))
52 (insert not-matching)
53 (goto-char (point-min))
54 (skip-chars-forward (concat "[:" name ":]"))
55 (should (equal (point) p))
56 (skip-chars-forward (concat "^[:" name ":]"))
57 (should (equal (point) (point-max)))
58 (goto-char (point-min))
59 (skip-chars-forward (concat "[:" name ":]\u2622"))
60 (should (or (equal (point) p) (equal (point) (1+ p)))))))
61
62(ert-deftest regex-character-classes ()
63 "Perform sanity test of regexes using character classes.
64
65Go over all the supported character classes and test whether the
66classes and their inversions match what they are supposed to
67match. The test is done using `string-match-p' as well as
68`skip-chars-forward'."
69 (let (case-fold-search)
70 (regex--test-cc "alnum" "abcABC012łąka" "-, \t\n")
71 (regex--test-cc "alpha" "abcABCłąka" "-,012 \t\n")
72 (regex--test-cc "digit" "012" "abcABCłąka-, \t\n")
73 (regex--test-cc "xdigit" "0123aBc" "łąk-, \t\n")
74 (regex--test-cc "upper" "ABCŁĄKA" "abc012-, \t\n")
75 (regex--test-cc "lower" "abcłąka" "ABC012-, \t\n")
76
77 (regex--test-cc "word" "abcABC012\u2620" "-, \t\n")
78
79 (regex--test-cc "punct" ".,-" "abcABC012\u2620 \t\n")
80 (regex--test-cc "cntrl" "\1\2\t\n" ".,-abcABC012\u2620 ")
81 (regex--test-cc "graph" "abcłąka\u2620-," " \t\n\1")
82 (regex--test-cc "print" "abcłąka\u2620-, " "\t\n\1")
83
84 (regex--test-cc "space" " \t\n\u2001" "abcABCł0123")
85 (regex--test-cc "blank" " \t" "\n\u2001")
86
87 (regex--test-cc "ascii" "abcABC012 \t\n\1" "łą\u2620")
88 (regex--test-cc "nonascii" "łą\u2622" "abcABC012 \t\n\1")
89 (regex--test-cc "unibyte" "abcABC012 \t\n\1" "łą\u2622")
90 (regex--test-cc "multibyte" "łą\u2622" "abcABC012 \t\n\1")))
91
92;;; regex-tests.el ends here