aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
author Kenichi Handa 2002-10-01 01:30:13 +0000
committer Kenichi Handa 2002-10-01 01:30:13 +0000
commit f5f7578b06c5d8dc77039208430ac0fc23b3a533 (patch)
tree 74ad0a0d8fb7703010dcc5f1ffa17ca646e847ca /src
parent 0540dc5e89fb8cad1a6782075389508f891babf7 (diff)
download emacs-f5f7578b06c5d8dc77039208430ac0fc23b3a533.tar.gz
emacs-f5f7578b06c5d8dc77039208430ac0fc23b3a533.zip
(search_buffer): Fix case-fold-search of multibyte
characters. (boyer_moore): Rename the last argument to char_high_bits.
Diffstat (limited to 'src')
-rw-r--r-- src/search.c 55
1 files changed, 23 insertions, 32 deletions
diff --git a/src/search.c b/src/search.c
index c9fd6655c29..7b87a9b996a 100644
--- a/src/search.c
+++ b/src/search.c
@@ -1106,7 +1106,12 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
1106 unsigned char *patbuf; 1106 unsigned char *patbuf;
1107 int multibyte = !NILP (current_buffer->enable_multibyte_characters); 1107 int multibyte = !NILP (current_buffer->enable_multibyte_characters);
1108 unsigned char *base_pat = XSTRING (string)->data; 1108 unsigned char *base_pat = XSTRING (string)->data;
1109 int charset_base = -1; 1109 /* High bits of char, calculated by (CHAR & ~0x3F). Characters
1110 of the same high bits have the same sequence of bytes except
1111 for the last one. To do the BM search, all characters in STRING must
1112 have the same high bits (including their case
1113 translations). */
1114 int char_high_bits = -1;
1110 int boyer_moore_ok = 1; 1115 int boyer_moore_ok = 1;
1111 1116
1112 /* MULTIBYTE says whether the text to be searched is multibyte. 1117 /* MULTIBYTE says whether the text to be searched is multibyte.
@@ -1147,16 +1152,15 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
1147 /* Copy and optionally translate the pattern. */ 1152 /* Copy and optionally translate the pattern. */
1148 len = raw_pattern_size; 1153 len = raw_pattern_size;
1149 len_byte = raw_pattern_size_byte; 1154 len_byte = raw_pattern_size_byte;
1150 patbuf = (unsigned char *) alloca (len_byte); 1155 patbuf = (unsigned char *) alloca (len * MAX_MULTIBYTE_LENGTH);
1151 pat = patbuf; 1156 pat = patbuf;
1152 base_pat = raw_pattern; 1157 base_pat = raw_pattern;
1153 if (multibyte) 1158 if (multibyte)
1154 { 1159 {
1155 while (--len >= 0) 1160 while (--len >= 0)
1156 { 1161 {
1157 unsigned char str[MAX_MULTIBYTE_LENGTH];
1158 int c, translated, inverse; 1162 int c, translated, inverse;
1159 int in_charlen, charlen; 1163 int in_charlen;
1160 1164
1161 /* If we got here and the RE flag is set, it's because we're 1165 /* If we got here and the RE flag is set, it's because we're
1162 dealing with a regexp known to be trivial, so the backslash 1166 dealing with a regexp known to be trivial, so the backslash
@@ -1172,23 +1176,6 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
1172 1176
1173 /* Translate the character, if requested. */ 1177 /* Translate the character, if requested. */
1174 TRANSLATE (translated, trt, c); 1178 TRANSLATE (translated, trt, c);
1175 /* If translation changed the byte-length, go back
1176 to the original character. */
1177 charlen = CHAR_STRING (translated, str);
1178 if (in_charlen != charlen)
1179 {
1180 translated = c;
1181 charlen = CHAR_STRING (c, str);
1182 }
1183
1184 /* If we are searching for something strange,
1185 an invalid multibyte code, don't use boyer-moore. */
1186 if (! ASCII_BYTE_P (translated)
1187 && (charlen == 1 /* 8bit code */
1188 || charlen != in_charlen /* invalid multibyte code */
1189 ))
1190 boyer_moore_ok = 0;
1191
1192 TRANSLATE (inverse, inverse_trt, c); 1179 TRANSLATE (inverse, inverse_trt, c);
1193 1180
1194 /* Did this char actually get translated? 1181 /* Did this char actually get translated?
@@ -1197,18 +1184,22 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
1197 { 1184 {
1198 /* Keep track of which character set row 1185 /* Keep track of which character set row
1199 contains the characters that need translation. */ 1186 contains the characters that need translation. */
1200 int charset_base_code = c & ~0x3F; 1187 int this_high_bit = c & ~0x3F;
1201 if (charset_base == -1) 1188 int trt_high_bit = ((inverse != c ? inverse : translated)
1202 charset_base = charset_base_code; 1189 & ~0x3F);
1203 else if (charset_base != charset_base_code) 1190
1191 if (this_high_bit != trt_high_bit)
1192 boyer_moore_ok = 0;
1193 else if (char_high_bits == -1)
1194 char_high_bits = this_high_bit;
1195 else if (char_high_bits != this_high_bit)
1204 /* If two different rows appear, needing translation, 1196 /* If two different rows appear, needing translation,
1205 then we cannot use boyer_moore search. */ 1197 then we cannot use boyer_moore search. */
1206 boyer_moore_ok = 0; 1198 boyer_moore_ok = 0;
1207 } 1199 }
1208 1200
1209 /* Store this character into the translated pattern. */ 1201 /* Store this character into the translated pattern. */
1210 bcopy (str, pat, charlen); 1202 CHAR_STRING_ADVANCE (translated, pat);
1211 pat += charlen;
1212 base_pat += in_charlen; 1203 base_pat += in_charlen;
1213 len_byte -= in_charlen; 1204 len_byte -= in_charlen;
1214 } 1205 }
@@ -1216,7 +1207,7 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
1216 else 1207 else
1217 { 1208 {
1218 /* Unibyte buffer. */ 1209 /* Unibyte buffer. */
1219 charset_base = 0; 1210 char_high_bits = 0;
1220 while (--len >= 0) 1211 while (--len >= 0)
1221 { 1212 {
1222 int c, translated; 1213 int c, translated;
@@ -1242,7 +1233,7 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
1242 if (boyer_moore_ok) 1233 if (boyer_moore_ok)
1243 return boyer_moore (n, pat, len, len_byte, trt, inverse_trt, 1234 return boyer_moore (n, pat, len, len_byte, trt, inverse_trt,
1244 pos, pos_byte, lim, lim_byte, 1235 pos, pos_byte, lim, lim_byte,
1245 charset_base); 1236 char_high_bits);
1246 else 1237 else
1247 return simple_search (n, pat, len, len_byte, trt, 1238 return simple_search (n, pat, len, len_byte, trt,
1248 pos, pos_byte, lim, lim_byte); 1239 pos, pos_byte, lim, lim_byte);
@@ -1475,7 +1466,7 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte)
1475 1466
1476static int 1467static int
1477boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, 1468boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
1478 pos, pos_byte, lim, lim_byte, charset_base) 1469 pos, pos_byte, lim, lim_byte, char_high_bits)
1479 int n; 1470 int n;
1480 unsigned char *base_pat; 1471 unsigned char *base_pat;
1481 int len, len_byte; 1472 int len, len_byte;
@@ -1483,7 +1474,7 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
1483 Lisp_Object inverse_trt; 1474 Lisp_Object inverse_trt;
1484 int pos, pos_byte; 1475 int pos, pos_byte;
1485 int lim, lim_byte; 1476 int lim, lim_byte;
1486 int charset_base; 1477 int char_high_bits;
1487{ 1478{
1488 int direction = ((n > 0) ? 1 : -1); 1479 int direction = ((n > 0) ? 1 : -1);
1489 register int dirlen; 1480 register int dirlen;
@@ -1584,7 +1575,7 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
1584 while (! CHAR_HEAD_P (*charstart)) 1575 while (! CHAR_HEAD_P (*charstart))
1585 charstart--; 1576 charstart--;
1586 untranslated = STRING_CHAR (charstart, ptr - charstart + 1); 1577 untranslated = STRING_CHAR (charstart, ptr - charstart + 1);
1587 if (charset_base == (untranslated & ~0x3F)) 1578 if (char_high_bits == (untranslated & ~0x3F))
1588 { 1579 {
1589 TRANSLATE (ch, trt, untranslated); 1580 TRANSLATE (ch, trt, untranslated);
1590 if (! CHAR_HEAD_P (*ptr)) 1581 if (! CHAR_HEAD_P (*ptr))