diff options
| author | Kenichi Handa | 2002-10-01 01:30:13 +0000 |
|---|---|---|
| committer | Kenichi Handa | 2002-10-01 01:30:13 +0000 |
| commit | f5f7578b06c5d8dc77039208430ac0fc23b3a533 (patch) | |
| tree | 74ad0a0d8fb7703010dcc5f1ffa17ca646e847ca /src | |
| parent | 0540dc5e89fb8cad1a6782075389508f891babf7 (diff) | |
| download | emacs-f5f7578b06c5d8dc77039208430ac0fc23b3a533.tar.gz emacs-f5f7578b06c5d8dc77039208430ac0fc23b3a533.zip | |
(search_buffer): Fix case-fold-search of multibyte
characters.
(boyer_moore): Rename the last argument to char_high_bits.
Diffstat (limited to 'src')
| -rw-r--r-- | src/search.c | 55 |
1 files changed, 23 insertions, 32 deletions
diff --git a/src/search.c b/src/search.c index c9fd6655c29..7b87a9b996a 100644 --- a/src/search.c +++ b/src/search.c | |||
| @@ -1106,7 +1106,12 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, | |||
| 1106 | unsigned char *patbuf; | 1106 | unsigned char *patbuf; |
| 1107 | int multibyte = !NILP (current_buffer->enable_multibyte_characters); | 1107 | int multibyte = !NILP (current_buffer->enable_multibyte_characters); |
| 1108 | unsigned char *base_pat = XSTRING (string)->data; | 1108 | unsigned char *base_pat = XSTRING (string)->data; |
| 1109 | int charset_base = -1; | 1109 | /* High bits of char, calculated by (CHAR & ~0x3F). Characters |
| 1110 | of the same high bits have the same sequence of bytes but | ||
| 1111 | last. To do the BM search, all characters in STRING must | ||
| 1112 | have the same high bits (including their case | ||
| 1113 | translations). */ | ||
| 1114 | int char_high_bits = -1; | ||
| 1110 | int boyer_moore_ok = 1; | 1115 | int boyer_moore_ok = 1; |
| 1111 | 1116 | ||
| 1112 | /* MULTIBYTE says whether the text to be searched is multibyte. | 1117 | /* MULTIBYTE says whether the text to be searched is multibyte. |
| @@ -1147,16 +1152,15 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, | |||
| 1147 | /* Copy and optionally translate the pattern. */ | 1152 | /* Copy and optionally translate the pattern. */ |
| 1148 | len = raw_pattern_size; | 1153 | len = raw_pattern_size; |
| 1149 | len_byte = raw_pattern_size_byte; | 1154 | len_byte = raw_pattern_size_byte; |
| 1150 | patbuf = (unsigned char *) alloca (len_byte); | 1155 | patbuf = (unsigned char *) alloca (len * MAX_MULTIBYTE_LENGTH); |
| 1151 | pat = patbuf; | 1156 | pat = patbuf; |
| 1152 | base_pat = raw_pattern; | 1157 | base_pat = raw_pattern; |
| 1153 | if (multibyte) | 1158 | if (multibyte) |
| 1154 | { | 1159 | { |
| 1155 | while (--len >= 0) | 1160 | while (--len >= 0) |
| 1156 | { | 1161 | { |
| 1157 | unsigned char str[MAX_MULTIBYTE_LENGTH]; | ||
| 1158 | int c, translated, inverse; | 1162 | int c, translated, inverse; |
| 1159 | int in_charlen, charlen; | 1163 | int in_charlen; |
| 1160 | 1164 | ||
| 1161 | /* If we got here and the RE flag is set, it's because we're | 1165 | /* If we got here and the RE flag is set, it's because we're |
| 1162 | dealing with a regexp known to be trivial, so the backslash | 1166 | dealing with a regexp known to be trivial, so the backslash |
| @@ -1172,23 +1176,6 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, | |||
| 1172 | 1176 | ||
| 1173 | /* Translate the character, if requested. */ | 1177 | /* Translate the character, if requested. */ |
| 1174 | TRANSLATE (translated, trt, c); | 1178 | TRANSLATE (translated, trt, c); |
| 1175 | /* If translation changed the byte-length, go back | ||
| 1176 | to the original character. */ | ||
| 1177 | charlen = CHAR_STRING (translated, str); | ||
| 1178 | if (in_charlen != charlen) | ||
| 1179 | { | ||
| 1180 | translated = c; | ||
| 1181 | charlen = CHAR_STRING (c, str); | ||
| 1182 | } | ||
| 1183 | |||
| 1184 | /* If we are searching for something strange, | ||
| 1185 | an invalid multibyte code, don't use boyer-moore. */ | ||
| 1186 | if (! ASCII_BYTE_P (translated) | ||
| 1187 | && (charlen == 1 /* 8bit code */ | ||
| 1188 | || charlen != in_charlen /* invalid multibyte code */ | ||
| 1189 | )) | ||
| 1190 | boyer_moore_ok = 0; | ||
| 1191 | |||
| 1192 | TRANSLATE (inverse, inverse_trt, c); | 1179 | TRANSLATE (inverse, inverse_trt, c); |
| 1193 | 1180 | ||
| 1194 | /* Did this char actually get translated? | 1181 | /* Did this char actually get translated? |
| @@ -1197,18 +1184,22 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, | |||
| 1197 | { | 1184 | { |
| 1198 | /* Keep track of which character set row | 1185 | /* Keep track of which character set row |
| 1199 | contains the characters that need translation. */ | 1186 | contains the characters that need translation. */ |
| 1200 | int charset_base_code = c & ~0x3F; | 1187 | int this_high_bit = c & ~0x3F; |
| 1201 | if (charset_base == -1) | 1188 | int trt_high_bit = ((inverse != c ? inverse : translated) |
| 1202 | charset_base = charset_base_code; | 1189 | & ~0x3F); |
| 1203 | else if (charset_base != charset_base_code) | 1190 | |
| 1191 | if (this_high_bit != trt_high_bit) | ||
| 1192 | boyer_moore_ok = 0; | ||
| 1193 | else if (char_high_bits == -1) | ||
| 1194 | char_high_bits = this_high_bit; | ||
| 1195 | else if (char_high_bits != this_high_bit) | ||
| 1204 | /* If two different rows appear, needing translation, | 1196 | /* If two different rows appear, needing translation, |
| 1205 | then we cannot use boyer_moore search. */ | 1197 | then we cannot use boyer_moore search. */ |
| 1206 | boyer_moore_ok = 0; | 1198 | boyer_moore_ok = 0; |
| 1207 | } | 1199 | } |
| 1208 | 1200 | ||
| 1209 | /* Store this character into the translated pattern. */ | 1201 | /* Store this character into the translated pattern. */ |
| 1210 | bcopy (str, pat, charlen); | 1202 | CHAR_STRING_ADVANCE (translated, pat); |
| 1211 | pat += charlen; | ||
| 1212 | base_pat += in_charlen; | 1203 | base_pat += in_charlen; |
| 1213 | len_byte -= in_charlen; | 1204 | len_byte -= in_charlen; |
| 1214 | } | 1205 | } |
| @@ -1216,7 +1207,7 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, | |||
| 1216 | else | 1207 | else |
| 1217 | { | 1208 | { |
| 1218 | /* Unibyte buffer. */ | 1209 | /* Unibyte buffer. */ |
| 1219 | charset_base = 0; | 1210 | char_high_bits = 0; |
| 1220 | while (--len >= 0) | 1211 | while (--len >= 0) |
| 1221 | { | 1212 | { |
| 1222 | int c, translated; | 1213 | int c, translated; |
| @@ -1242,7 +1233,7 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, | |||
| 1242 | if (boyer_moore_ok) | 1233 | if (boyer_moore_ok) |
| 1243 | return boyer_moore (n, pat, len, len_byte, trt, inverse_trt, | 1234 | return boyer_moore (n, pat, len, len_byte, trt, inverse_trt, |
| 1244 | pos, pos_byte, lim, lim_byte, | 1235 | pos, pos_byte, lim, lim_byte, |
| 1245 | charset_base); | 1236 | char_high_bits); |
| 1246 | else | 1237 | else |
| 1247 | return simple_search (n, pat, len, len_byte, trt, | 1238 | return simple_search (n, pat, len, len_byte, trt, |
| 1248 | pos, pos_byte, lim, lim_byte); | 1239 | pos, pos_byte, lim, lim_byte); |
| @@ -1475,7 +1466,7 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte) | |||
| 1475 | 1466 | ||
| 1476 | static int | 1467 | static int |
| 1477 | boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, | 1468 | boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, |
| 1478 | pos, pos_byte, lim, lim_byte, charset_base) | 1469 | pos, pos_byte, lim, lim_byte, char_high_bits) |
| 1479 | int n; | 1470 | int n; |
| 1480 | unsigned char *base_pat; | 1471 | unsigned char *base_pat; |
| 1481 | int len, len_byte; | 1472 | int len, len_byte; |
| @@ -1483,7 +1474,7 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, | |||
| 1483 | Lisp_Object inverse_trt; | 1474 | Lisp_Object inverse_trt; |
| 1484 | int pos, pos_byte; | 1475 | int pos, pos_byte; |
| 1485 | int lim, lim_byte; | 1476 | int lim, lim_byte; |
| 1486 | int charset_base; | 1477 | int char_high_bits; |
| 1487 | { | 1478 | { |
| 1488 | int direction = ((n > 0) ? 1 : -1); | 1479 | int direction = ((n > 0) ? 1 : -1); |
| 1489 | register int dirlen; | 1480 | register int dirlen; |
| @@ -1584,7 +1575,7 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, | |||
| 1584 | while (! CHAR_HEAD_P (*charstart)) | 1575 | while (! CHAR_HEAD_P (*charstart)) |
| 1585 | charstart--; | 1576 | charstart--; |
| 1586 | untranslated = STRING_CHAR (charstart, ptr - charstart + 1); | 1577 | untranslated = STRING_CHAR (charstart, ptr - charstart + 1); |
| 1587 | if (charset_base == (untranslated & ~0x3F)) | 1578 | if (char_high_bits == (untranslated & ~0x3F)) |
| 1588 | { | 1579 | { |
| 1589 | TRANSLATE (ch, trt, untranslated); | 1580 | TRANSLATE (ch, trt, untranslated); |
| 1590 | if (! CHAR_HEAD_P (*ptr)) | 1581 | if (! CHAR_HEAD_P (*ptr)) |