diff options
| author | Karl Heuer | 1997-02-20 07:26:24 +0000 |
|---|---|---|
| committer | Karl Heuer | 1997-02-20 07:26:24 +0000 |
| commit | 5679531d6c48cec2cfb8fe5317fe0ebea2841662 (patch) | |
| tree | 3238c24a5f81a05bd4666c755793beb9a7077fca /src | |
| parent | 4ed4686978bd18292e2bb7b87a7b0e0407ecb3b1 (diff) | |
| download | emacs-5679531d6c48cec2cfb8fe5317fe0ebea2841662.tar.gz emacs-5679531d6c48cec2cfb8fe5317fe0ebea2841662.zip | |
Include category.h and charset.h.
(compile_pattern_1): Handle new argument `multibyte'.
(compile_pattern): Handle the flag `enable-multibyte-characters'.
(Vascii_downcase_table): Declare external.
(fast_string_match_ignore_case): New function.
(skip_chars): Handle multibyte characters.
(trivial_regexp_p): Handle regular expression "\\Cc" and "\\CC"
for category.
Diffstat (limited to 'src')
| -rw-r--r-- | src/search.c | 184 |
1 files changed, 153 insertions, 31 deletions
diff --git a/src/search.c b/src/search.c index 1b2a6f299cb..f96e9e53bd9 100644 --- a/src/search.c +++ b/src/search.c | |||
| @@ -22,7 +22,9 @@ Boston, MA 02111-1307, USA. */ | |||
| 22 | #include <config.h> | 22 | #include <config.h> |
| 23 | #include "lisp.h" | 23 | #include "lisp.h" |
| 24 | #include "syntax.h" | 24 | #include "syntax.h" |
| 25 | #include "category.h" | ||
| 25 | #include "buffer.h" | 26 | #include "buffer.h" |
| 27 | #include "charset.h" | ||
| 26 | #include "region-cache.h" | 28 | #include "region-cache.h" |
| 27 | #include "commands.h" | 29 | #include "commands.h" |
| 28 | #include "blockinput.h" | 30 | #include "blockinput.h" |
| @@ -105,15 +107,19 @@ matcher_overflow () | |||
| 105 | If it is 0, we should compile the pattern not to record any | 107 | If it is 0, we should compile the pattern not to record any |
| 106 | subexpression bounds. | 108 | subexpression bounds. |
| 107 | POSIX is nonzero if we want full backtracking (POSIX style) | 109 | POSIX is nonzero if we want full backtracking (POSIX style) |
| 108 | for this pattern. 0 means backtrack only enough to get a valid match. */ | 110 | for this pattern. 0 means backtrack only enough to get a valid match. |
| 111 | MULTIBYTE is nonzero if we want to handle multibyte characters in | ||
| 112 | PATTERN. 0 means all multibyte characters are recognized just as | ||
| 113 | sequences of binary data. */ | ||
| 109 | 114 | ||
| 110 | static void | 115 | static void |
| 111 | compile_pattern_1 (cp, pattern, translate, regp, posix) | 116 | compile_pattern_1 (cp, pattern, translate, regp, posix, multibyte) |
| 112 | struct regexp_cache *cp; | 117 | struct regexp_cache *cp; |
| 113 | Lisp_Object pattern; | 118 | Lisp_Object pattern; |
| 114 | Lisp_Object *translate; | 119 | Lisp_Object *translate; |
| 115 | struct re_registers *regp; | 120 | struct re_registers *regp; |
| 116 | int posix; | 121 | int posix; |
| 122 | int multibyte; | ||
| 117 | { | 123 | { |
| 118 | CONST char *val; | 124 | CONST char *val; |
| 119 | reg_syntax_t old; | 125 | reg_syntax_t old; |
| @@ -121,6 +127,7 @@ compile_pattern_1 (cp, pattern, translate, regp, posix) | |||
| 121 | cp->regexp = Qnil; | 127 | cp->regexp = Qnil; |
| 122 | cp->buf.translate = translate; | 128 | cp->buf.translate = translate; |
| 123 | cp->posix = posix; | 129 | cp->posix = posix; |
| 130 | cp->buf.multibyte = multibyte; | ||
| 124 | BLOCK_INPUT; | 131 | BLOCK_INPUT; |
| 125 | old = re_set_syntax (RE_SYNTAX_EMACS | 132 | old = re_set_syntax (RE_SYNTAX_EMACS |
| 126 | | (posix ? 0 : RE_NO_POSIX_BACKTRACKING)); | 133 | | (posix ? 0 : RE_NO_POSIX_BACKTRACKING)); |
| @@ -153,6 +160,9 @@ compile_pattern (pattern, regp, translate, posix) | |||
| 153 | int posix; | 160 | int posix; |
| 154 | { | 161 | { |
| 155 | struct regexp_cache *cp, **cpp; | 162 | struct regexp_cache *cp, **cpp; |
| 163 | /* Should we check it here, or add an argument `multibyte' to this | ||
| 164 | function? */ | ||
| 165 | int multibyte = !NILP (current_buffer->enable_multibyte_characters); | ||
| 156 | 166 | ||
| 157 | for (cpp = &searchbuf_head; ; cpp = &cp->next) | 167 | for (cpp = &searchbuf_head; ; cpp = &cp->next) |
| 158 | { | 168 | { |
| @@ -160,13 +170,14 @@ compile_pattern (pattern, regp, translate, posix) | |||
| 160 | if (XSTRING (cp->regexp)->size == XSTRING (pattern)->size | 170 | if (XSTRING (cp->regexp)->size == XSTRING (pattern)->size |
| 161 | && !NILP (Fstring_equal (cp->regexp, pattern)) | 171 | && !NILP (Fstring_equal (cp->regexp, pattern)) |
| 162 | && cp->buf.translate == translate | 172 | && cp->buf.translate == translate |
| 163 | && cp->posix == posix) | 173 | && cp->posix == posix |
| 174 | && cp->buf.multibyte == multibyte) | ||
| 164 | break; | 175 | break; |
| 165 | 176 | ||
| 166 | /* If we're at the end of the cache, compile into the last cell. */ | 177 | /* If we're at the end of the cache, compile into the last cell. */ |
| 167 | if (cp->next == 0) | 178 | if (cp->next == 0) |
| 168 | { | 179 | { |
| 169 | compile_pattern_1 (cp, pattern, translate, regp, posix); | 180 | compile_pattern_1 (cp, pattern, translate, regp, posix, multibyte); |
| 170 | break; | 181 | break; |
| 171 | } | 182 | } |
| 172 | } | 183 | } |
| @@ -369,6 +380,29 @@ fast_string_match (regexp, string) | |||
| 369 | immediate_quit = 0; | 380 | immediate_quit = 0; |
| 370 | return val; | 381 | return val; |
| 371 | } | 382 | } |
| 383 | |||
| | | 384 | /* Match REGEXP against STRING (a NUL-terminated C string), searching all of STRING ignoring case, | ||
| | | 385 | and return the index of the match, or negative on failure. Case is ignored by compiling REGEXP with the contents of Vascii_downcase_table as the translate table. | ||
| | | 386 | This does not clobber the match data: no registers are passed to compile_pattern or re_search. */ | ||
| | | 387 | |||
| | | 388 | extern Lisp_Object Vascii_downcase_table; | ||
| | | 389 | |||
| | | 390 | int | ||
| | | 391 | fast_string_match_ignore_case (regexp, string) | ||
| | | 392 | Lisp_Object regexp; | ||
| | | 393 | char *string; | ||
| | | 394 | { | ||
| | | 395 | int val; | ||
| | | 396 | struct re_pattern_buffer *bufp; | ||
| | | 397 | int len = strlen (string); | ||
| | | 398 | |||
| | | 399 | bufp = compile_pattern (regexp, 0, | ||
| | | 400 | XCHAR_TABLE (Vascii_downcase_table)->contents, 0); | ||
| | | 401 | immediate_quit = 1; | ||
| | | 402 | val = re_search (bufp, string, len, 0, len, 0); | ||
| | | 403 | immediate_quit = 0; | ||
| | | 404 | return val; | ||
| | | 405 | } | ||
| 372 | 406 | ||
| 373 | /* max and min. */ | 407 | /* max and min. */ |
| 374 | 408 | ||
| @@ -502,8 +536,8 @@ scan_buffer (target, start, end, count, shortage, allow_quit) | |||
| 502 | 536 | ||
| 503 | { | 537 | { |
| 504 | /* The termination address of the dumb loop. */ | 538 | /* The termination address of the dumb loop. */ |
| 505 | register unsigned char *ceiling_addr = &FETCH_CHAR (ceiling) + 1; | 539 | register unsigned char *ceiling_addr = POS_ADDR (ceiling) + 1; |
| 506 | register unsigned char *cursor = &FETCH_CHAR (start); | 540 | register unsigned char *cursor = POS_ADDR (start); |
| 507 | unsigned char *base = cursor; | 541 | unsigned char *base = cursor; |
| 508 | 542 | ||
| 509 | while (cursor < ceiling_addr) | 543 | while (cursor < ceiling_addr) |
| @@ -566,8 +600,8 @@ scan_buffer (target, start, end, count, shortage, allow_quit) | |||
| 566 | 600 | ||
| 567 | { | 601 | { |
| 568 | /* The termination address of the dumb loop. */ | 602 | /* The termination address of the dumb loop. */ |
| 569 | register unsigned char *ceiling_addr = &FETCH_CHAR (ceiling); | 603 | register unsigned char *ceiling_addr = POS_ADDR (ceiling); |
| 570 | register unsigned char *cursor = &FETCH_CHAR (start - 1); | 604 | register unsigned char *cursor = POS_ADDR (start - 1); |
| 571 | unsigned char *base = cursor; | 605 | unsigned char *base = cursor; |
| 572 | 606 | ||
| 573 | while (cursor >= ceiling_addr) | 607 | while (cursor >= ceiling_addr) |
| @@ -693,7 +727,18 @@ skip_chars (forwardp, syntaxp, string, lim) | |||
| 693 | { | 727 | { |
| 694 | register unsigned char *p, *pend; | 728 | register unsigned char *p, *pend; |
| 695 | register unsigned char c; | 729 | register unsigned char c; |
| 730 | register int ch; | ||
| 696 | unsigned char fastmap[0400]; | 731 | unsigned char fastmap[0400]; |
| 732 | /* If SYNTAXP is 0, STRING may contain multi-byte form of characters | ||
| 733 | of which codes don't fit in FASTMAP. In that case, we set the | ||
| 734 | first byte of multibyte form (i.e. base leading-code) in FASTMAP | ||
| 735 | and set the actual ranges of characters in CHAR_RANGES. In the | ||
| 736 | form "X-Y" of STRING, both X and Y must belong to the same | ||
| 737 | character set because a range striding across character sets is | ||
| 738 | meaningless. */ | ||
| 739 | int *char_ranges | ||
| 740 | = (int *) alloca (XSTRING (string)->size * (sizeof (int)) * 2); | ||
| 741 | int n_char_ranges = 0; | ||
| 697 | int negate = 0; | 742 | int negate = 0; |
| 698 | register int i; | 743 | register int i; |
| 699 | 744 | ||
| @@ -724,11 +769,13 @@ skip_chars (forwardp, syntaxp, string, lim) | |||
| 724 | 769 | ||
| 725 | /* Find the characters specified and set their elements of fastmap. | 770 | /* Find the characters specified and set their elements of fastmap. |
| 726 | If syntaxp, each character counts as itself. | 771 | If syntaxp, each character counts as itself. |
| 727 | Otherwise, handle backslashes and ranges specially */ | 772 | Otherwise, handle backslashes and ranges specially. */ |
| 728 | 773 | ||
| 729 | while (p != pend) | 774 | while (p != pend) |
| 730 | { | 775 | { |
| 731 | c = *p++; | 776 | c = *p; |
| 777 | ch = STRING_CHAR (p, pend - p); | ||
| 778 | p += BYTES_BY_CHAR_HEAD (*p); | ||
| 732 | if (syntaxp) | 779 | if (syntaxp) |
| 733 | fastmap[syntax_spec_code[c]] = 1; | 780 | fastmap[syntax_spec_code[c]] = 1; |
| 734 | else | 781 | else |
| @@ -740,25 +787,49 @@ skip_chars (forwardp, syntaxp, string, lim) | |||
| 740 | } | 787 | } |
| 741 | if (p != pend && *p == '-') | 788 | if (p != pend && *p == '-') |
| 742 | { | 789 | { |
| 790 | unsigned int ch2; | ||
| 791 | |||
| 743 | p++; | 792 | p++; |
| 744 | if (p == pend) break; | 793 | if (p == pend) break; |
| 745 | while (c <= *p) | 794 | if (SINGLE_BYTE_CHAR_P (ch)) |
| 795 | while (c <= *p) | ||
| 796 | { | ||
| 797 | fastmap[c] = 1; | ||
| 798 | c++; | ||
| 799 | } | ||
| 800 | else | ||
| 746 | { | 801 | { |
| 747 | fastmap[c] = 1; | 802 | fastmap[c] = 1; /* C is the base leading-code. */ |
| 748 | c++; | 803 | ch2 = STRING_CHAR (p, pend - p); |
| 804 | if (ch <= ch2) | ||
| 805 | char_ranges[n_char_ranges++] = ch, | ||
| 806 | char_ranges[n_char_ranges++] = ch2; | ||
| 749 | } | 807 | } |
| 750 | p++; | 808 | p += BYTES_BY_CHAR_HEAD (*p); |
| 751 | } | 809 | } |
| 752 | else | 810 | else |
| 753 | fastmap[c] = 1; | 811 | { |
| 812 | fastmap[c] = 1; | ||
| 813 | if (!SINGLE_BYTE_CHAR_P (ch)) | ||
| 814 | char_ranges[n_char_ranges++] = ch, | ||
| 815 | char_ranges[n_char_ranges++] = ch; | ||
| 816 | } | ||
| 754 | } | 817 | } |
| 755 | } | 818 | } |
| 756 | 819 | ||
| 757 | /* If ^ was the first character, complement the fastmap. */ | 820 | /* If ^ was the first character, complement the fastmap. In |
| 821 | addition, as all multibyte characters have possibility of | ||
| 822 | matching, set all entries for base leading codes, which is | ||
| 823 | harmless even if SYNTAXP is 1. */ | ||
| 758 | 824 | ||
| 759 | if (negate) | 825 | if (negate) |
| 760 | for (i = 0; i < sizeof fastmap; i++) | 826 | for (i = 0; i < sizeof fastmap; i++) |
| 761 | fastmap[i] ^= 1; | 827 | { |
| 828 | if (!BASE_LEADING_CODE_P (i)) | ||
| 829 | fastmap[i] ^= 1; | ||
| 830 | else | ||
| 831 | fastmap[i] = 1; | ||
| 832 | } | ||
| 762 | 833 | ||
| 763 | { | 834 | { |
| 764 | int start_point = PT; | 835 | int start_point = PT; |
| @@ -771,26 +842,76 @@ skip_chars (forwardp, syntaxp, string, lim) | |||
| 771 | { | 842 | { |
| 772 | while (pos < XINT (lim) | 843 | while (pos < XINT (lim) |
| 773 | && fastmap[(int) SYNTAX (FETCH_CHAR (pos))]) | 844 | && fastmap[(int) SYNTAX (FETCH_CHAR (pos))]) |
| 774 | pos++; | 845 | INC_POS (pos); |
| 775 | } | 846 | } |
| 776 | else | 847 | else |
| 777 | { | 848 | { |
| 778 | while (pos > XINT (lim) | 849 | while (pos > XINT (lim)) |
| 779 | && fastmap[(int) SYNTAX (FETCH_CHAR (pos - 1))]) | 850 | { |
| 780 | pos--; | 851 | DEC_POS (pos); |
| 852 | if (!fastmap[(int) SYNTAX (FETCH_CHAR (pos))]) | ||
| 853 | { | ||
| 854 | INC_POS (pos); | ||
| 855 | break; | ||
| 856 | } | ||
| 857 | } | ||
| 781 | } | 858 | } |
| 782 | } | 859 | } |
| 783 | else | 860 | else |
| 784 | { | 861 | { |
| 785 | if (forwardp) | 862 | if (forwardp) |
| 786 | { | 863 | { |
| 787 | while (pos < XINT (lim) && fastmap[FETCH_CHAR (pos)]) | 864 | while (pos < XINT (lim) && fastmap[(c = FETCH_BYTE (pos))]) |
| 788 | pos++; | 865 | { |
| 866 | if (!BASE_LEADING_CODE_P (c)) | ||
| 867 | pos++; | ||
| 868 | else if (n_char_ranges) | ||
| 869 | { | ||
| | | 870 | /* We must check CHAR_RANGES for a multibyte | ||
| 871 | character. */ | ||
| 872 | ch = FETCH_MULTIBYTE_CHAR (pos); | ||
| 873 | for (i = 0; i < n_char_ranges; i += 2) | ||
| 874 | if ((ch >= char_ranges[i] && ch <= char_ranges[i + 1])) | ||
| 875 | break; | ||
| 876 | if (!(negate ^ (i < n_char_ranges))) | ||
| 877 | break; | ||
| 878 | |||
| 879 | INC_POS (pos); | ||
| 880 | } | ||
| 881 | else | ||
| 882 | { | ||
| 883 | if (!negate) break; | ||
| 884 | INC_POS (pos); | ||
| 885 | } | ||
| 886 | } | ||
| 789 | } | 887 | } |
| 790 | else | 888 | else |
| 791 | { | 889 | { |
| 792 | while (pos > XINT (lim) && fastmap[FETCH_CHAR (pos - 1)]) | 890 | while (pos > XINT (lim)) |
| 793 | pos--; | 891 | { |
| 892 | DEC_POS (pos); | ||
| 893 | if (fastmap[(c = FETCH_BYTE (pos))]) | ||
| 894 | { | ||
| 895 | if (!BASE_LEADING_CODE_P (c)) | ||
| 896 | ; | ||
| 897 | else if (n_char_ranges) | ||
| 898 | { | ||
| | | 899 | /* We must check CHAR_RANGES for a multibyte | ||
| 900 | character. */ | ||
| 901 | ch = FETCH_MULTIBYTE_CHAR (pos); | ||
| 902 | for (i = 0; i < n_char_ranges; i += 2) | ||
| 903 | if (ch >= char_ranges[i] && ch <= char_ranges[i + 1]) | ||
| 904 | break; | ||
| 905 | if (!(negate ^ (i < n_char_ranges))) | ||
| 906 | break; | ||
| 907 | } | ||
| 908 | else | ||
| 909 | if (!negate) | ||
| 910 | break; | ||
| 911 | } | ||
| 912 | else | ||
| 913 | break; | ||
| 914 | } | ||
| 794 | } | 915 | } |
| 795 | } | 916 | } |
| 796 | SET_PT (pos); | 917 | SET_PT (pos); |
| @@ -890,6 +1011,7 @@ trivial_regexp_p (regexp) | |||
| 890 | case '|': case '(': case ')': case '`': case '\'': case 'b': | 1011 | case '|': case '(': case ')': case '`': case '\'': case 'b': |
| 891 | case 'B': case '<': case '>': case 'w': case 'W': case 's': | 1012 | case 'B': case '<': case '>': case 'w': case 'W': case 's': |
| 892 | case 'S': case '=': | 1013 | case 'S': case '=': |
| 1014 | case 'c': case 'C': /* for categoryspec and notcategoryspec */ | ||
| 893 | case '1': case '2': case '3': case '4': case '5': | 1015 | case '1': case '2': case '3': case '4': case '5': |
| 894 | case '6': case '7': case '8': case '9': | 1016 | case '6': case '7': case '8': case '9': |
| 895 | return 0; | 1017 | return 0; |
| @@ -1165,8 +1287,8 @@ search_buffer (string, pos, lim, n, RE, trt, inverse_trt, posix) | |||
| 1165 | : max (lim, max (limit, pos - 20000))); | 1287 | : max (lim, max (limit, pos - 20000))); |
| 1166 | if ((limit - pos) * direction > 20) | 1288 | if ((limit - pos) * direction > 20) |
| 1167 | { | 1289 | { |
| 1168 | p_limit = &FETCH_CHAR (limit); | 1290 | p_limit = POS_ADDR (limit); |
| 1169 | p2 = (cursor = &FETCH_CHAR (pos)); | 1291 | p2 = (cursor = POS_ADDR (pos)); |
| 1170 | /* In this loop, pos + cursor - p2 is the surrogate for pos */ | 1292 | /* In this loop, pos + cursor - p2 is the surrogate for pos */ |
| 1171 | while (1) /* use one cursor setting as long as i can */ | 1293 | while (1) /* use one cursor setting as long as i can */ |
| 1172 | { | 1294 | { |
| @@ -1256,7 +1378,7 @@ search_buffer (string, pos, lim, n, RE, trt, inverse_trt, posix) | |||
| 1256 | /* (the reach is at most len + 21, and typically */ | 1378 | /* (the reach is at most len + 21, and typically */ |
| 1257 | /* does not exceed len) */ | 1379 | /* does not exceed len) */ |
| 1258 | while ((limit - pos) * direction >= 0) | 1380 | while ((limit - pos) * direction >= 0) |
| 1259 | pos += BM_tab[FETCH_CHAR(pos)]; | 1381 | pos += BM_tab[FETCH_BYTE (pos)]; |
| 1260 | /* now run the same tests to distinguish going off the */ | 1382 | /* now run the same tests to distinguish going off the */ |
| 1261 | /* end, a match or a phony match. */ | 1383 | /* end, a match or a phony match. */ |
| 1262 | if ((pos - limit) * direction <= len) | 1384 | if ((pos - limit) * direction <= len) |
| @@ -1269,8 +1391,8 @@ search_buffer (string, pos, lim, n, RE, trt, inverse_trt, posix) | |||
| 1269 | { | 1391 | { |
| 1270 | pos -= direction; | 1392 | pos -= direction; |
| 1271 | if (pat[i] != (trt != 0 | 1393 | if (pat[i] != (trt != 0 |
| 1272 | ? trt[FETCH_CHAR(pos)] | 1394 | ? trt[FETCH_BYTE (pos)] |
| 1273 | : FETCH_CHAR (pos))) | 1395 | : FETCH_BYTE (pos))) |
| 1274 | break; | 1396 | break; |
| 1275 | } | 1397 | } |
| 1276 | /* Above loop has moved POS part or all the way | 1398 | /* Above loop has moved POS part or all the way |
| @@ -1599,7 +1721,7 @@ since only regular expressions have distinguished subexpressions.") | |||
| 1599 | for (pos = search_regs.start[sub]; pos < last; pos++) | 1721 | for (pos = search_regs.start[sub]; pos < last; pos++) |
| 1600 | { | 1722 | { |
| 1601 | if (NILP (string)) | 1723 | if (NILP (string)) |
| 1602 | c = FETCH_CHAR (pos); | 1724 | c = FETCH_BYTE (pos); |
| 1603 | else | 1725 | else |
| 1604 | c = XSTRING (string)->data[pos]; | 1726 | c = XSTRING (string)->data[pos]; |
| 1605 | 1727 | ||