aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorKarl Heuer1997-02-20 07:26:24 +0000
committerKarl Heuer1997-02-20 07:26:24 +0000
commit5679531d6c48cec2cfb8fe5317fe0ebea2841662 (patch)
tree3238c24a5f81a05bd4666c755793beb9a7077fca /src
parent4ed4686978bd18292e2bb7b87a7b0e0407ecb3b1 (diff)
downloademacs-5679531d6c48cec2cfb8fe5317fe0ebea2841662.tar.gz
emacs-5679531d6c48cec2cfb8fe5317fe0ebea2841662.zip
Include category.h and charset.h.
(compile_pattern_1): Handle new argument `multibyte'. (compile_pattern): Handle the flag `enable-multibyte-characters'. (Vascii_downcase_table): Declare external. (fast_string_match_ignore_case): New function. (skip_chars): Handle multibyte characters. (trivial_regexp_p): Handle regular expression "\\Cc" and "\\CC" for category.
Diffstat (limited to 'src')
-rw-r--r--src/search.c184
1 files changed, 153 insertions, 31 deletions
diff --git a/src/search.c b/src/search.c
index 1b2a6f299cb..f96e9e53bd9 100644
--- a/src/search.c
+++ b/src/search.c
@@ -22,7 +22,9 @@ Boston, MA 02111-1307, USA. */
22#include <config.h> 22#include <config.h>
23#include "lisp.h" 23#include "lisp.h"
24#include "syntax.h" 24#include "syntax.h"
25#include "category.h"
25#include "buffer.h" 26#include "buffer.h"
27#include "charset.h"
26#include "region-cache.h" 28#include "region-cache.h"
27#include "commands.h" 29#include "commands.h"
28#include "blockinput.h" 30#include "blockinput.h"
@@ -105,15 +107,19 @@ matcher_overflow ()
105 If it is 0, we should compile the pattern not to record any 107 If it is 0, we should compile the pattern not to record any
106 subexpression bounds. 108 subexpression bounds.
107 POSIX is nonzero if we want full backtracking (POSIX style) 109 POSIX is nonzero if we want full backtracking (POSIX style)
108 for this pattern. 0 means backtrack only enough to get a valid match. */ 110 for this pattern. 0 means backtrack only enough to get a valid match.
111 MULTIBYTE is nonzero if we want to handle multibyte characters in
112 PATTERN. 0 means all multibyte characters are recognized just as
113 sequences of binary data. */
109 114
110static void 115static void
111compile_pattern_1 (cp, pattern, translate, regp, posix) 116compile_pattern_1 (cp, pattern, translate, regp, posix, multibyte)
112 struct regexp_cache *cp; 117 struct regexp_cache *cp;
113 Lisp_Object pattern; 118 Lisp_Object pattern;
114 Lisp_Object *translate; 119 Lisp_Object *translate;
115 struct re_registers *regp; 120 struct re_registers *regp;
116 int posix; 121 int posix;
122 int multibyte;
117{ 123{
118 CONST char *val; 124 CONST char *val;
119 reg_syntax_t old; 125 reg_syntax_t old;
@@ -121,6 +127,7 @@ compile_pattern_1 (cp, pattern, translate, regp, posix)
121 cp->regexp = Qnil; 127 cp->regexp = Qnil;
122 cp->buf.translate = translate; 128 cp->buf.translate = translate;
123 cp->posix = posix; 129 cp->posix = posix;
130 cp->buf.multibyte = multibyte;
124 BLOCK_INPUT; 131 BLOCK_INPUT;
125 old = re_set_syntax (RE_SYNTAX_EMACS 132 old = re_set_syntax (RE_SYNTAX_EMACS
126 | (posix ? 0 : RE_NO_POSIX_BACKTRACKING)); 133 | (posix ? 0 : RE_NO_POSIX_BACKTRACKING));
@@ -153,6 +160,9 @@ compile_pattern (pattern, regp, translate, posix)
153 int posix; 160 int posix;
154{ 161{
155 struct regexp_cache *cp, **cpp; 162 struct regexp_cache *cp, **cpp;
163 /* Should we check it here, or add an argument `multibyte' to this
164 function? */
165 int multibyte = !NILP (current_buffer->enable_multibyte_characters);
156 166
157 for (cpp = &searchbuf_head; ; cpp = &cp->next) 167 for (cpp = &searchbuf_head; ; cpp = &cp->next)
158 { 168 {
@@ -160,13 +170,14 @@ compile_pattern (pattern, regp, translate, posix)
160 if (XSTRING (cp->regexp)->size == XSTRING (pattern)->size 170 if (XSTRING (cp->regexp)->size == XSTRING (pattern)->size
161 && !NILP (Fstring_equal (cp->regexp, pattern)) 171 && !NILP (Fstring_equal (cp->regexp, pattern))
162 && cp->buf.translate == translate 172 && cp->buf.translate == translate
163 && cp->posix == posix) 173 && cp->posix == posix
174 && cp->buf.multibyte == multibyte)
164 break; 175 break;
165 176
166 /* If we're at the end of the cache, compile into the last cell. */ 177 /* If we're at the end of the cache, compile into the last cell. */
167 if (cp->next == 0) 178 if (cp->next == 0)
168 { 179 {
169 compile_pattern_1 (cp, pattern, translate, regp, posix); 180 compile_pattern_1 (cp, pattern, translate, regp, posix, multibyte);
170 break; 181 break;
171 } 182 }
172 } 183 }
@@ -369,6 +380,29 @@ fast_string_match (regexp, string)
369 immediate_quit = 0; 380 immediate_quit = 0;
370 return val; 381 return val;
371} 382}
383
384/* Match REGEXP against STRING, searching all of STRING ignoring case,
385 and return the index of the match, or negative on failure.
386 This does not clobber the match data. */
387
388extern Lisp_Object Vascii_downcase_table;
389
390int
391fast_string_match_ignore_case (regexp, string)
392 Lisp_Object regexp;
393 char *string;
394{
395 int val;
396 struct re_pattern_buffer *bufp;
397 int len = strlen (string);
398
399 bufp = compile_pattern (regexp, 0,
400 XCHAR_TABLE (Vascii_downcase_table)->contents, 0);
401 immediate_quit = 1;
402 val = re_search (bufp, string, len, 0, len, 0);
403 immediate_quit = 0;
404 return val;
405}
372 406
373/* max and min. */ 407/* max and min. */
374 408
@@ -502,8 +536,8 @@ scan_buffer (target, start, end, count, shortage, allow_quit)
502 536
503 { 537 {
504 /* The termination address of the dumb loop. */ 538 /* The termination address of the dumb loop. */
505 register unsigned char *ceiling_addr = &FETCH_CHAR (ceiling) + 1; 539 register unsigned char *ceiling_addr = POS_ADDR (ceiling) + 1;
506 register unsigned char *cursor = &FETCH_CHAR (start); 540 register unsigned char *cursor = POS_ADDR (start);
507 unsigned char *base = cursor; 541 unsigned char *base = cursor;
508 542
509 while (cursor < ceiling_addr) 543 while (cursor < ceiling_addr)
@@ -566,8 +600,8 @@ scan_buffer (target, start, end, count, shortage, allow_quit)
566 600
567 { 601 {
568 /* The termination address of the dumb loop. */ 602 /* The termination address of the dumb loop. */
569 register unsigned char *ceiling_addr = &FETCH_CHAR (ceiling); 603 register unsigned char *ceiling_addr = POS_ADDR (ceiling);
570 register unsigned char *cursor = &FETCH_CHAR (start - 1); 604 register unsigned char *cursor = POS_ADDR (start - 1);
571 unsigned char *base = cursor; 605 unsigned char *base = cursor;
572 606
573 while (cursor >= ceiling_addr) 607 while (cursor >= ceiling_addr)
@@ -693,7 +727,18 @@ skip_chars (forwardp, syntaxp, string, lim)
693{ 727{
694 register unsigned char *p, *pend; 728 register unsigned char *p, *pend;
695 register unsigned char c; 729 register unsigned char c;
730 register int ch;
696 unsigned char fastmap[0400]; 731 unsigned char fastmap[0400];
732 /* If SYNTAXP is 0, STRING may contain multibyte forms of characters
733 whose codes don't fit in FASTMAP. In that case, we set the
734 first byte of multibyte form (i.e. base leading-code) in FASTMAP
735 and set the actual ranges of characters in CHAR_RANGES. In the
736 form "X-Y" of STRING, both X and Y must belong to the same
737 character set because a range striding across character sets is
738 meaningless. */
739 int *char_ranges
740 = (int *) alloca (XSTRING (string)->size * (sizeof (int)) * 2);
741 int n_char_ranges = 0;
697 int negate = 0; 742 int negate = 0;
698 register int i; 743 register int i;
699 744
@@ -724,11 +769,13 @@ skip_chars (forwardp, syntaxp, string, lim)
724 769
725 /* Find the characters specified and set their elements of fastmap. 770 /* Find the characters specified and set their elements of fastmap.
726 If syntaxp, each character counts as itself. 771 If syntaxp, each character counts as itself.
727 Otherwise, handle backslashes and ranges specially */ 772 Otherwise, handle backslashes and ranges specially. */
728 773
729 while (p != pend) 774 while (p != pend)
730 { 775 {
731 c = *p++; 776 c = *p;
777 ch = STRING_CHAR (p, pend - p);
778 p += BYTES_BY_CHAR_HEAD (*p);
732 if (syntaxp) 779 if (syntaxp)
733 fastmap[syntax_spec_code[c]] = 1; 780 fastmap[syntax_spec_code[c]] = 1;
734 else 781 else
@@ -740,25 +787,49 @@ skip_chars (forwardp, syntaxp, string, lim)
740 } 787 }
741 if (p != pend && *p == '-') 788 if (p != pend && *p == '-')
742 { 789 {
790 unsigned int ch2;
791
743 p++; 792 p++;
744 if (p == pend) break; 793 if (p == pend) break;
745 while (c <= *p) 794 if (SINGLE_BYTE_CHAR_P (ch))
795 while (c <= *p)
796 {
797 fastmap[c] = 1;
798 c++;
799 }
800 else
746 { 801 {
747 fastmap[c] = 1; 802 fastmap[c] = 1; /* C is the base leading-code. */
748 c++; 803 ch2 = STRING_CHAR (p, pend - p);
804 if (ch <= ch2)
805 char_ranges[n_char_ranges++] = ch,
806 char_ranges[n_char_ranges++] = ch2;
749 } 807 }
750 p++; 808 p += BYTES_BY_CHAR_HEAD (*p);
751 } 809 }
752 else 810 else
753 fastmap[c] = 1; 811 {
812 fastmap[c] = 1;
813 if (!SINGLE_BYTE_CHAR_P (ch))
814 char_ranges[n_char_ranges++] = ch,
815 char_ranges[n_char_ranges++] = ch;
816 }
754 } 817 }
755 } 818 }
756 819
757 /* If ^ was the first character, complement the fastmap. */ 820 /* If ^ was the first character, complement the fastmap. In
821 addition, since any multibyte character might then match,
822 set all entries for base leading codes, which is
823 harmless even if SYNTAXP is 1. */
758 824
759 if (negate) 825 if (negate)
760 for (i = 0; i < sizeof fastmap; i++) 826 for (i = 0; i < sizeof fastmap; i++)
761 fastmap[i] ^= 1; 827 {
828 if (!BASE_LEADING_CODE_P (i))
829 fastmap[i] ^= 1;
830 else
831 fastmap[i] = 1;
832 }
762 833
763 { 834 {
764 int start_point = PT; 835 int start_point = PT;
@@ -771,26 +842,76 @@ skip_chars (forwardp, syntaxp, string, lim)
771 { 842 {
772 while (pos < XINT (lim) 843 while (pos < XINT (lim)
773 && fastmap[(int) SYNTAX (FETCH_CHAR (pos))]) 844 && fastmap[(int) SYNTAX (FETCH_CHAR (pos))])
774 pos++; 845 INC_POS (pos);
775 } 846 }
776 else 847 else
777 { 848 {
778 while (pos > XINT (lim) 849 while (pos > XINT (lim))
779 && fastmap[(int) SYNTAX (FETCH_CHAR (pos - 1))]) 850 {
780 pos--; 851 DEC_POS (pos);
852 if (!fastmap[(int) SYNTAX (FETCH_CHAR (pos))])
853 {
854 INC_POS (pos);
855 break;
856 }
857 }
781 } 858 }
782 } 859 }
783 else 860 else
784 { 861 {
785 if (forwardp) 862 if (forwardp)
786 { 863 {
787 while (pos < XINT (lim) && fastmap[FETCH_CHAR (pos)]) 864 while (pos < XINT (lim) && fastmap[(c = FETCH_BYTE (pos))])
788 pos++; 865 {
866 if (!BASE_LEADING_CODE_P (c))
867 pos++;
868 else if (n_char_ranges)
869 {
870 /* We must check CHAR_RANGES for a multibyte
871 character. */
872 ch = FETCH_MULTIBYTE_CHAR (pos);
873 for (i = 0; i < n_char_ranges; i += 2)
874 if ((ch >= char_ranges[i] && ch <= char_ranges[i + 1]))
875 break;
876 if (!(negate ^ (i < n_char_ranges)))
877 break;
878
879 INC_POS (pos);
880 }
881 else
882 {
883 if (!negate) break;
884 INC_POS (pos);
885 }
886 }
789 } 887 }
790 else 888 else
791 { 889 {
792 while (pos > XINT (lim) && fastmap[FETCH_CHAR (pos - 1)]) 890 while (pos > XINT (lim))
793 pos--; 891 {
892 DEC_POS (pos);
893 if (fastmap[(c = FETCH_BYTE (pos))])
894 {
895 if (!BASE_LEADING_CODE_P (c))
896 ;
897 else if (n_char_ranges)
898 {
899 /* We must check CHAR_RANGES for a multibyte
900 character. */
901 ch = FETCH_MULTIBYTE_CHAR (pos);
902 for (i = 0; i < n_char_ranges; i += 2)
903 if (ch >= char_ranges[i] && ch <= char_ranges[i + 1])
904 break;
905 if (!(negate ^ (i < n_char_ranges)))
906 break;
907 }
908 else
909 if (!negate)
910 break;
911 }
912 else
913 break;
914 }
794 } 915 }
795 } 916 }
796 SET_PT (pos); 917 SET_PT (pos);
@@ -890,6 +1011,7 @@ trivial_regexp_p (regexp)
890 case '|': case '(': case ')': case '`': case '\'': case 'b': 1011 case '|': case '(': case ')': case '`': case '\'': case 'b':
891 case 'B': case '<': case '>': case 'w': case 'W': case 's': 1012 case 'B': case '<': case '>': case 'w': case 'W': case 's':
892 case 'S': case '=': 1013 case 'S': case '=':
1014 case 'c': case 'C': /* for categoryspec and notcategoryspec */
893 case '1': case '2': case '3': case '4': case '5': 1015 case '1': case '2': case '3': case '4': case '5':
894 case '6': case '7': case '8': case '9': 1016 case '6': case '7': case '8': case '9':
895 return 0; 1017 return 0;
@@ -1165,8 +1287,8 @@ search_buffer (string, pos, lim, n, RE, trt, inverse_trt, posix)
1165 : max (lim, max (limit, pos - 20000))); 1287 : max (lim, max (limit, pos - 20000)));
1166 if ((limit - pos) * direction > 20) 1288 if ((limit - pos) * direction > 20)
1167 { 1289 {
1168 p_limit = &FETCH_CHAR (limit); 1290 p_limit = POS_ADDR (limit);
1169 p2 = (cursor = &FETCH_CHAR (pos)); 1291 p2 = (cursor = POS_ADDR (pos));
1170 /* In this loop, pos + cursor - p2 is the surrogate for pos */ 1292 /* In this loop, pos + cursor - p2 is the surrogate for pos */
1171 while (1) /* use one cursor setting as long as i can */ 1293 while (1) /* use one cursor setting as long as i can */
1172 { 1294 {
@@ -1256,7 +1378,7 @@ search_buffer (string, pos, lim, n, RE, trt, inverse_trt, posix)
1256 /* (the reach is at most len + 21, and typically */ 1378 /* (the reach is at most len + 21, and typically */
1257 /* does not exceed len) */ 1379 /* does not exceed len) */
1258 while ((limit - pos) * direction >= 0) 1380 while ((limit - pos) * direction >= 0)
1259 pos += BM_tab[FETCH_CHAR(pos)]; 1381 pos += BM_tab[FETCH_BYTE (pos)];
1260 /* now run the same tests to distinguish going off the */ 1382 /* now run the same tests to distinguish going off the */
1261 /* end, a match or a phony match. */ 1383 /* end, a match or a phony match. */
1262 if ((pos - limit) * direction <= len) 1384 if ((pos - limit) * direction <= len)
@@ -1269,8 +1391,8 @@ search_buffer (string, pos, lim, n, RE, trt, inverse_trt, posix)
1269 { 1391 {
1270 pos -= direction; 1392 pos -= direction;
1271 if (pat[i] != (trt != 0 1393 if (pat[i] != (trt != 0
1272 ? trt[FETCH_CHAR(pos)] 1394 ? trt[FETCH_BYTE (pos)]
1273 : FETCH_CHAR (pos))) 1395 : FETCH_BYTE (pos)))
1274 break; 1396 break;
1275 } 1397 }
1276 /* Above loop has moved POS part or all the way 1398 /* Above loop has moved POS part or all the way
@@ -1599,7 +1721,7 @@ since only regular expressions have distinguished subexpressions.")
1599 for (pos = search_regs.start[sub]; pos < last; pos++) 1721 for (pos = search_regs.start[sub]; pos < last; pos++)
1600 { 1722 {
1601 if (NILP (string)) 1723 if (NILP (string))
1602 c = FETCH_CHAR (pos); 1724 c = FETCH_BYTE (pos);
1603 else 1725 else
1604 c = XSTRING (string)->data[pos]; 1726 c = XSTRING (string)->data[pos];
1605 1727