diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/regex.c | 209 |
1 files changed, 93 insertions, 116 deletions
diff --git a/src/regex.c b/src/regex.c index f92bcb7923b..297bf718848 100644 --- a/src/regex.c +++ b/src/regex.c | |||
| @@ -783,44 +783,6 @@ extract_number_and_incr (re_char **source) | |||
| 783 | and end. */ | 783 | and end. */ |
| 784 | #define CHARSET_RANGE_TABLE_END(range_table, count) \ | 784 | #define CHARSET_RANGE_TABLE_END(range_table, count) \ |
| 785 | ((range_table) + (count) * 2 * 3) | 785 | ((range_table) + (count) * 2 * 3) |
| 786 | |||
| 787 | /* Test if C is in RANGE_TABLE. A flag NOT is negated if C is in. | ||
| 788 | COUNT is number of ranges in RANGE_TABLE. */ | ||
| 789 | #define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \ | ||
| 790 | do \ | ||
| 791 | { \ | ||
| 792 | re_wchar_t range_start, range_end; \ | ||
| 793 | re_char *rtp; \ | ||
| 794 | re_char *range_table_end \ | ||
| 795 | = CHARSET_RANGE_TABLE_END ((range_table), (count)); \ | ||
| 796 | \ | ||
| 797 | for (rtp = (range_table); rtp < range_table_end; rtp += 2 * 3) \ | ||
| 798 | { \ | ||
| 799 | EXTRACT_CHARACTER (range_start, rtp); \ | ||
| 800 | EXTRACT_CHARACTER (range_end, rtp + 3); \ | ||
| 801 | \ | ||
| 802 | if (range_start <= (c) && (c) <= range_end) \ | ||
| 803 | { \ | ||
| 804 | (not) = !(not); \ | ||
| 805 | break; \ | ||
| 806 | } \ | ||
| 807 | } \ | ||
| 808 | } \ | ||
| 809 | while (0) | ||
| 810 | |||
| 811 | /* Test if C is in range table of CHARSET. The flag NOT is negated if | ||
| 812 | C is listed in it. */ | ||
| 813 | #define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \ | ||
| 814 | do \ | ||
| 815 | { \ | ||
| 816 | /* Number of ranges in range table. */ \ | ||
| 817 | int count; \ | ||
| 818 | re_char *range_table = CHARSET_RANGE_TABLE (charset); \ | ||
| 819 | \ | ||
| 820 | EXTRACT_NUMBER_AND_INCR (count, range_table); \ | ||
| 821 | CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \ | ||
| 822 | } \ | ||
| 823 | while (0) | ||
| 824 | 786 | ||
| 825 | /* If DEBUG is defined, Regex prints many voluminous messages about what | 787 | /* If DEBUG is defined, Regex prints many voluminous messages about what |
| 826 | it is doing (if the variable `debug' is nonzero). If linked with the | 788 | it is doing (if the variable `debug' is nonzero). If linked with the |
| @@ -4661,6 +4623,93 @@ skip_noops (const_re_char *p, const_re_char *pend) | |||
| 4661 | return p; | 4623 | return p; |
| 4662 | } | 4624 | } |
| 4663 | 4625 | ||
| 4626 | /* Test if C matches charset op. *PP points to the charset or chraset_not | ||
| 4627 | opcode. When the function finishes, *PP will be advanced past that opcode. | ||
| 4628 | C is character to test (possibly after translations) and CORIG is original | ||
| 4629 | character (i.e. without any translations). UNIBYTE denotes whether c is | ||
| 4630 | unibyte or multibyte character. */ | ||
| 4631 | static bool | ||
| 4632 | execute_charset (const_re_char **pp, unsigned c, unsigned corig, bool unibyte) | ||
| 4633 | { | ||
| 4634 | re_char *p = *pp, *rtp = NULL; | ||
| 4635 | bool not = (re_opcode_t) *p == charset_not; | ||
| 4636 | |||
| 4637 | if (CHARSET_RANGE_TABLE_EXISTS_P (p)) | ||
| 4638 | { | ||
| 4639 | int count; | ||
| 4640 | rtp = CHARSET_RANGE_TABLE (p); | ||
| 4641 | EXTRACT_NUMBER_AND_INCR (count, rtp); | ||
| 4642 | *pp = CHARSET_RANGE_TABLE_END ((rtp), (count)); | ||
| 4643 | } | ||
| 4644 | else | ||
| 4645 | *pp += 2 + CHARSET_BITMAP_SIZE (p); | ||
| 4646 | |||
| 4647 | if (unibyte && c < (1 << BYTEWIDTH)) | ||
| 4648 | { /* Lookup bitmap. */ | ||
| 4649 | /* Cast to `unsigned' instead of `unsigned char' in | ||
| 4650 | case the bit list is a full 32 bytes long. */ | ||
| 4651 | if (c < (unsigned) (CHARSET_BITMAP_SIZE (p) * BYTEWIDTH) | ||
| 4652 | && p[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) | ||
| 4653 | return !not; | ||
| 4654 | } | ||
| 4655 | #ifdef emacs | ||
| 4656 | else if (rtp) | ||
| 4657 | { | ||
| 4658 | int class_bits = CHARSET_RANGE_TABLE_BITS (p); | ||
| 4659 | re_wchar_t range_start, range_end; | ||
| 4660 | |||
| 4661 | /* Sort tests by the most commonly used classes with some adjustment to which | ||
| 4662 | tests are easiest to perform. Frequencies of character class names used in | ||
| 4663 | Emacs sources as of 2016-07-15: | ||
| 4664 | |||
| 4665 | $ find \( -name \*.c -o -name \*.el \) -exec grep -h '\[:[a-z]*:]' {} + | | ||
| 4666 | sed 's/]/]\n/g' |grep -o '\[:[a-z]*:]' |sort |uniq -c |sort -nr | ||
| 4667 | 213 [:alnum:] | ||
| 4668 | 104 [:alpha:] | ||
| 4669 | 62 [:space:] | ||
| 4670 | 39 [:digit:] | ||
| 4671 | 36 [:blank:] | ||
| 4672 | 26 [:upper:] | ||
| 4673 | 24 [:word:] | ||
| 4674 | 21 [:lower:] | ||
| 4675 | 10 [:punct:] | ||
| 4676 | 10 [:ascii:] | ||
| 4677 | 9 [:xdigit:] | ||
| 4678 | 4 [:nonascii:] | ||
| 4679 | 4 [:graph:] | ||
| 4680 | 2 [:print:] | ||
| 4681 | 2 [:cntrl:] | ||
| 4682 | 1 [:ff:] | ||
| 4683 | */ | ||
| 4684 | |||
| 4685 | if ((class_bits & BIT_MULTIBYTE) || | ||
| 4686 | (class_bits & BIT_ALNUM && ISALNUM (c)) || | ||
| 4687 | (class_bits & BIT_ALPHA && ISALPHA (c)) || | ||
| 4688 | (class_bits & BIT_SPACE && ISSPACE (c)) || | ||
| 4689 | (class_bits & BIT_WORD && ISWORD (c)) || | ||
| 4690 | ((class_bits & BIT_UPPER) && | ||
| 4691 | (ISUPPER (c) || (corig != c && | ||
| 4692 | c == downcase (corig) && ISLOWER (c)))) || | ||
| 4693 | ((class_bits & BIT_LOWER) && | ||
| 4694 | (ISLOWER (c) || (corig != c && | ||
| 4695 | c == upcase (corig) && ISUPPER(c)))) || | ||
| 4696 | (class_bits & BIT_PUNCT && ISPUNCT (c)) || | ||
| 4697 | (class_bits & BIT_GRAPH && ISGRAPH (c)) || | ||
| 4698 | (class_bits & BIT_PRINT && ISPRINT (c))) | ||
| 4699 | return !not; | ||
| 4700 | |||
| 4701 | for (p = *pp; rtp < p; rtp += 2 * 3) | ||
| 4702 | { | ||
| 4703 | EXTRACT_CHARACTER (range_start, rtp); | ||
| 4704 | EXTRACT_CHARACTER (range_end, rtp + 3); | ||
| 4705 | if (range_start <= c && c <= range_end) | ||
| 4706 | return !not; | ||
| 4707 | } | ||
| 4708 | } | ||
| 4709 | #endif /* emacs */ | ||
| 4710 | return not; | ||
| 4711 | } | ||
| 4712 | |||
| 4664 | /* Non-zero if "p1 matches something" implies "p2 fails". */ | 4713 | /* Non-zero if "p1 matches something" implies "p2 fails". */ |
| 4665 | static int | 4714 | static int |
| 4666 | mutually_exclusive_p (struct re_pattern_buffer *bufp, const_re_char *p1, | 4715 | mutually_exclusive_p (struct re_pattern_buffer *bufp, const_re_char *p1, |
| @@ -4718,22 +4767,7 @@ mutually_exclusive_p (struct re_pattern_buffer *bufp, const_re_char *p1, | |||
| 4718 | else if ((re_opcode_t) *p1 == charset | 4767 | else if ((re_opcode_t) *p1 == charset |
| 4719 | || (re_opcode_t) *p1 == charset_not) | 4768 | || (re_opcode_t) *p1 == charset_not) |
| 4720 | { | 4769 | { |
| 4721 | int not = (re_opcode_t) *p1 == charset_not; | 4770 | if (!execute_charset (&p1, c, c, !multibyte)) |
| 4722 | |||
| 4723 | /* Test if C is listed in charset (or charset_not) | ||
| 4724 | at `p1'. */ | ||
| 4725 | if (! multibyte || IS_REAL_ASCII (c)) | ||
| 4726 | { | ||
| 4727 | if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH | ||
| 4728 | && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) | ||
| 4729 | not = !not; | ||
| 4730 | } | ||
| 4731 | else if (CHARSET_RANGE_TABLE_EXISTS_P (p1)) | ||
| 4732 | CHARSET_LOOKUP_RANGE_TABLE (not, c, p1); | ||
| 4733 | |||
| 4734 | /* `not' is equal to 1 if c would match, which means | ||
| 4735 | that we can't change to pop_failure_jump. */ | ||
| 4736 | if (!not) | ||
| 4737 | { | 4771 | { |
| 4738 | DEBUG_PRINT (" No match => fast loop.\n"); | 4772 | DEBUG_PRINT (" No match => fast loop.\n"); |
| 4739 | return 1; | 4773 | return 1; |
| @@ -5439,32 +5473,13 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1, | |||
| 5439 | case charset_not: | 5473 | case charset_not: |
| 5440 | { | 5474 | { |
| 5441 | register unsigned int c, corig; | 5475 | register unsigned int c, corig; |
| 5442 | boolean not = (re_opcode_t) *(p - 1) == charset_not; | ||
| 5443 | int len; | 5476 | int len; |
| 5444 | 5477 | ||
| 5445 | /* Start of actual range_table, or end of bitmap if there is no | ||
| 5446 | range table. */ | ||
| 5447 | re_char *range_table UNINIT; | ||
| 5448 | |||
| 5449 | /* Nonzero if there is a range table. */ | ||
| 5450 | int range_table_exists; | ||
| 5451 | |||
| 5452 | /* Number of ranges of range table. This is not included | ||
| 5453 | in the initial byte-length of the command. */ | ||
| 5454 | int count = 0; | ||
| 5455 | |||
| 5456 | /* Whether matching against a unibyte character. */ | 5478 | /* Whether matching against a unibyte character. */ |
| 5457 | boolean unibyte_char = false; | 5479 | boolean unibyte_char = false; |
| 5458 | 5480 | ||
| 5459 | DEBUG_PRINT ("EXECUTING charset%s.\n", not ? "_not" : ""); | 5481 | DEBUG_PRINT ("EXECUTING charset%s.\n", |
| 5460 | 5482 | (re_opcode_t) *(p - 1) == charset_not ? "_not" : ""); | |
| 5461 | range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]); | ||
| 5462 | |||
| 5463 | if (range_table_exists) | ||
| 5464 | { | ||
| 5465 | range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */ | ||
| 5466 | EXTRACT_NUMBER_AND_INCR (count, range_table); | ||
| 5467 | } | ||
| 5468 | 5483 | ||
| 5469 | PREFETCH (); | 5484 | PREFETCH (); |
| 5470 | corig = c = RE_STRING_CHAR_AND_LENGTH (d, len, target_multibyte); | 5485 | corig = c = RE_STRING_CHAR_AND_LENGTH (d, len, target_multibyte); |
| @@ -5498,47 +5513,9 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1, | |||
| 5498 | unibyte_char = true; | 5513 | unibyte_char = true; |
| 5499 | } | 5514 | } |
| 5500 | 5515 | ||
| 5501 | if (unibyte_char && c < (1 << BYTEWIDTH)) | 5516 | p -= 1; |
| 5502 | { /* Lookup bitmap. */ | 5517 | if (!execute_charset (&p, c, corig, unibyte_char)) |
| 5503 | /* Cast to `unsigned' instead of `unsigned char' in | 5518 | goto fail; |
| 5504 | case the bit list is a full 32 bytes long. */ | ||
| 5505 | if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH) | ||
| 5506 | && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) | ||
| 5507 | not = !not; | ||
| 5508 | } | ||
| 5509 | #ifdef emacs | ||
| 5510 | else if (range_table_exists) | ||
| 5511 | { | ||
| 5512 | int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]); | ||
| 5513 | |||
| 5514 | if ( (class_bits & BIT_LOWER | ||
| 5515 | && (ISLOWER (c) | ||
| 5516 | || (corig != c | ||
| 5517 | && c == upcase (corig) && ISUPPER(c)))) | ||
| 5518 | | (class_bits & BIT_MULTIBYTE) | ||
| 5519 | | (class_bits & BIT_PUNCT && ISPUNCT (c)) | ||
| 5520 | | (class_bits & BIT_SPACE && ISSPACE (c)) | ||
| 5521 | | (class_bits & BIT_UPPER | ||
| 5522 | && (ISUPPER (c) | ||
| 5523 | || (corig != c | ||
| 5524 | && c == downcase (corig) && ISLOWER (c)))) | ||
| 5525 | | (class_bits & BIT_WORD && ISWORD (c)) | ||
| 5526 | | (class_bits & BIT_ALPHA && ISALPHA (c)) | ||
| 5527 | | (class_bits & BIT_ALNUM && ISALNUM (c)) | ||
| 5528 | | (class_bits & BIT_GRAPH && ISGRAPH (c)) | ||
| 5529 | | (class_bits & BIT_PRINT && ISPRINT (c))) | ||
| 5530 | not = !not; | ||
| 5531 | else | ||
| 5532 | CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count); | ||
| 5533 | } | ||
| 5534 | #endif /* emacs */ | ||
| 5535 | |||
| 5536 | if (range_table_exists) | ||
| 5537 | p = CHARSET_RANGE_TABLE_END (range_table, count); | ||
| 5538 | else | ||
| 5539 | p += CHARSET_BITMAP_SIZE (&p[-1]) + 1; | ||
| 5540 | |||
| 5541 | if (!not) goto fail; | ||
| 5542 | 5519 | ||
| 5543 | d += len; | 5520 | d += len; |
| 5544 | } | 5521 | } |