diff options
| author | Kenichi Handa | 2005-04-09 01:38:58 +0000 |
|---|---|---|
| committer | Kenichi Handa | 2005-04-09 01:38:58 +0000 |
| commit | a17ae96ac4e644c0fc9eaef0b0e926c10e6969d2 (patch) | |
| tree | 593b37853150d5292557c035220882d8a74b9b2d | |
| parent | 4a2283bcc7998fef3788136a16a6c2b10df6fe6f (diff) | |
| download | emacs-a17ae96ac4e644c0fc9eaef0b0e926c10e6969d2.tar.gz emacs-a17ae96ac4e644c0fc9eaef0b0e926c10e6969d2.zip | |
(search_buffer): Fix the change for syncing with CVS
head.
(search_buffer): Likewise.
| -rw-r--r-- | src/search.c | 186 |
1 files changed, 101 insertions, 85 deletions
diff --git a/src/search.c b/src/search.c index fb149c665a9..b7d3e78ac1d 100644 --- a/src/search.c +++ b/src/search.c | |||
| @@ -1141,12 +1141,9 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, | |||
| 1141 | unsigned char *patbuf; | 1141 | unsigned char *patbuf; |
| 1142 | int multibyte = !NILP (current_buffer->enable_multibyte_characters); | 1142 | int multibyte = !NILP (current_buffer->enable_multibyte_characters); |
| 1143 | unsigned char *base_pat = SDATA (string); | 1143 | unsigned char *base_pat = SDATA (string); |
| 1144 | /* High bits of char; 0 for ASCII characters, (CHAR & ~0x3F) | 1144 | /* Set to nozero if we find a non-ASCII char that need |
| 1145 | otherwise. Characters of the same high bits have the same | 1145 | translation. */ |
| 1146 | sequence of bytes but last. To do the BM search, all | 1146 | int char_base = 0; |
| 1147 | characters in STRING must have the same high bits (including | ||
| 1148 | their case translations). */ | ||
| 1149 | int char_high_bits = -1; | ||
| 1150 | int boyer_moore_ok = 1; | 1147 | int boyer_moore_ok = 1; |
| 1151 | 1148 | ||
| 1152 | /* MULTIBYTE says whether the text to be searched is multibyte. | 1149 | /* MULTIBYTE says whether the text to be searched is multibyte. |
| @@ -1192,10 +1189,19 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, | |||
| 1192 | base_pat = raw_pattern; | 1189 | base_pat = raw_pattern; |
| 1193 | if (multibyte) | 1190 | if (multibyte) |
| 1194 | { | 1191 | { |
| 1192 | /* Fill patbuf by translated characters in STRING while | ||
| 1193 | checking if we can use boyer-moore search. If TRT is | ||
| 1194 | non-nil, we can use boyer-moore search only if TRT can be | ||
| 1195 | represented by the byte array of 256 elements. For that, | ||
| 1196 | all non-ASCII case-equivalents of all case-senstive | ||
| 1197 | characters in STRING must belong to the same charset and | ||
| 1198 | row. */ | ||
| 1199 | |||
| 1195 | while (--len >= 0) | 1200 | while (--len >= 0) |
| 1196 | { | 1201 | { |
| 1202 | unsigned char str_base[MAX_MULTIBYTE_LENGTH], *str; | ||
| 1197 | int c, translated, inverse; | 1203 | int c, translated, inverse; |
| 1198 | int in_charlen; | 1204 | int in_charlen, charlen; |
| 1199 | 1205 | ||
| 1200 | /* If we got here and the RE flag is set, it's because we're | 1206 | /* If we got here and the RE flag is set, it's because we're |
| 1201 | dealing with a regexp known to be trivial, so the backslash | 1207 | dealing with a regexp known to be trivial, so the backslash |
| @@ -1209,32 +1215,60 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, | |||
| 1209 | 1215 | ||
| 1210 | c = STRING_CHAR_AND_LENGTH (base_pat, len_byte, in_charlen); | 1216 | c = STRING_CHAR_AND_LENGTH (base_pat, len_byte, in_charlen); |
| 1211 | 1217 | ||
| 1212 | /* Translate the character, if requested. */ | 1218 | if (NILP (trt)) |
| 1213 | TRANSLATE (translated, trt, c); | ||
| 1214 | TRANSLATE (inverse, inverse_trt, c); | ||
| 1215 | |||
| 1216 | /* Did this char actually get translated? | ||
| 1217 | Would any other char get translated into it? */ | ||
| 1218 | if (translated != c || inverse != c) | ||
| 1219 | { | 1219 | { |
| 1220 | /* Keep track of which character set row | 1220 | str = base_pat; |
| 1221 | contains the characters that need translation. */ | 1221 | charlen = in_charlen; |
| 1222 | int this_high_bit = ASCII_CHAR_P (c) ? 0 : (c & ~0x3F); | 1222 | } |
| 1223 | int c1 = inverse != c ? inverse : translated; | 1223 | else |
| 1224 | int trt_high_bit = ASCII_CHAR_P (c1) ? 0 : (c1 & ~0x3F); | 1224 | { |
| 1225 | 1225 | /* Translate the character. */ | |
| 1226 | if (this_high_bit != trt_high_bit) | 1226 | TRANSLATE (translated, trt, c); |
| 1227 | boyer_moore_ok = 0; | 1227 | charlen = CHAR_STRING (translated, str_base); |
| 1228 | else if (char_high_bits == -1) | 1228 | str = str_base; |
| 1229 | char_high_bits = this_high_bit; | 1229 | |
| 1230 | else if (char_high_bits != this_high_bit) | 1230 | /* Check if C has any other case-equivalents. */ |
| 1231 | /* If two different rows appear, needing translation, | 1231 | TRANSLATE (inverse, inverse_trt, c); |
| 1232 | then we cannot use boyer_moore search. */ | 1232 | /* If so, check if we can use boyer-moore. */ |
| 1233 | boyer_moore_ok = 0; | 1233 | if (c != inverse && boyer_moore_ok) |
| 1234 | { | ||
| 1235 | /* Check if all equivalents belong to the same | ||
| 1236 | group of characters. Note that the check of C | ||
| 1237 | itself is done by the last iteration. Note | ||
| 1238 | also that we don't have to check ASCII | ||
| 1239 | characters because boyer-moore search can | ||
| 1240 | always handle their translation. */ | ||
| 1241 | while (1) | ||
| 1242 | { | ||
| 1243 | if (! ASCII_BYTE_P (inverse)) | ||
| 1244 | { | ||
| 1245 | if (CHAR_BYTE8_P (inverse)) | ||
| 1246 | { | ||
| 1247 | /* Boyer-moore search can't handle a | ||
| 1248 | translation of an eight-bit | ||
| 1249 | character. */ | ||
| 1250 | boyer_moore_ok = 0; | ||
| 1251 | break; | ||
| 1252 | } | ||
| 1253 | else if (char_base == 0) | ||
| 1254 | char_base = inverse & ~0x3F; | ||
| 1255 | else if ((inverse & ~0x3F) | ||
| 1256 | != char_base) | ||
| 1257 | { | ||
| 1258 | boyer_moore_ok = 0; | ||
| 1259 | break; | ||
| 1260 | } | ||
| 1261 | } | ||
| 1262 | if (c == inverse) | ||
| 1263 | break; | ||
| 1264 | TRANSLATE (inverse, inverse_trt, inverse); | ||
| 1265 | } | ||
| 1266 | } | ||
| 1234 | } | 1267 | } |
| 1235 | 1268 | ||
| 1236 | /* Store this character into the translated pattern. */ | 1269 | /* Store this character into the translated pattern. */ |
| 1237 | CHAR_STRING_ADVANCE (translated, pat); | 1270 | bcopy (str, pat, charlen); |
| 1271 | pat += charlen; | ||
| 1238 | base_pat += in_charlen; | 1272 | base_pat += in_charlen; |
| 1239 | len_byte -= in_charlen; | 1273 | len_byte -= in_charlen; |
| 1240 | } | 1274 | } |
| @@ -1242,7 +1276,7 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, | |||
| 1242 | else | 1276 | else |
| 1243 | { | 1277 | { |
| 1244 | /* Unibyte buffer. */ | 1278 | /* Unibyte buffer. */ |
| 1245 | char_high_bits = 0; | 1279 | char_base = 0; |
| 1246 | while (--len >= 0) | 1280 | while (--len >= 0) |
| 1247 | { | 1281 | { |
| 1248 | int c, translated; | 1282 | int c, translated; |
| @@ -1269,7 +1303,7 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, | |||
| 1269 | if (boyer_moore_ok) | 1303 | if (boyer_moore_ok) |
| 1270 | return boyer_moore (n, pat, len, len_byte, trt, inverse_trt, | 1304 | return boyer_moore (n, pat, len, len_byte, trt, inverse_trt, |
| 1271 | pos, pos_byte, lim, lim_byte, | 1305 | pos, pos_byte, lim, lim_byte, |
| 1272 | char_high_bits); | 1306 | char_base); |
| 1273 | else | 1307 | else |
| 1274 | return simple_search (n, pat, len, len_byte, trt, | 1308 | return simple_search (n, pat, len, len_byte, trt, |
| 1275 | pos, pos_byte, lim, lim_byte); | 1309 | pos, pos_byte, lim, lim_byte); |
| @@ -1499,13 +1533,13 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte) | |||
| 1499 | have nontrivial translation are the same aside from the last byte. | 1533 | have nontrivial translation are the same aside from the last byte. |
| 1500 | This makes it possible to translate just the last byte of a | 1534 | This makes it possible to translate just the last byte of a |
| 1501 | character, and do so after just a simple test of the context. | 1535 | character, and do so after just a simple test of the context. |
| 1502 | CHARSET_BASE is nonzero iff there is such a non-ASCII character. | 1536 | CHAR_BASE is nonzero iff there is such a non-ASCII character. |
| 1503 | 1537 | ||
| 1504 | If that criterion is not satisfied, do not call this function. */ | 1538 | If that criterion is not satisfied, do not call this function. */ |
| 1505 | 1539 | ||
| 1506 | static int | 1540 | static int |
| 1507 | boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, | 1541 | boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, |
| 1508 | pos, pos_byte, lim, lim_byte, char_high_bits) | 1542 | pos, pos_byte, lim, lim_byte, char_base) |
| 1509 | int n; | 1543 | int n; |
| 1510 | unsigned char *base_pat; | 1544 | unsigned char *base_pat; |
| 1511 | int len, len_byte; | 1545 | int len, len_byte; |
| @@ -1513,7 +1547,7 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, | |||
| 1513 | Lisp_Object inverse_trt; | 1547 | Lisp_Object inverse_trt; |
| 1514 | int pos, pos_byte; | 1548 | int pos, pos_byte; |
| 1515 | int lim, lim_byte; | 1549 | int lim, lim_byte; |
| 1516 | int char_high_bits; | 1550 | int char_base; |
| 1517 | { | 1551 | { |
| 1518 | int direction = ((n > 0) ? 1 : -1); | 1552 | int direction = ((n > 0) ? 1 : -1); |
| 1519 | register int dirlen; | 1553 | register int dirlen; |
| @@ -1528,11 +1562,12 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, | |||
| 1528 | unsigned char simple_translate[0400]; | 1562 | unsigned char simple_translate[0400]; |
| 1529 | /* These are set to the preceding bytes of a byte to be translated | 1563 | /* These are set to the preceding bytes of a byte to be translated |
| 1530 | if charset_base is nonzero. As the maximum byte length of a | 1564 | if charset_base is nonzero. As the maximum byte length of a |
| 1531 | multibyte character is 4, we have to check at most three previous | 1565 | multibyte character is 5, we have to check at most four previous |
| 1532 | bytes. */ | 1566 | bytes. */ |
| 1533 | int translate_prev_byte1 = 0; | 1567 | int translate_prev_byte1 = 0; |
| 1534 | int translate_prev_byte2 = 0; | 1568 | int translate_prev_byte2 = 0; |
| 1535 | int translate_prev_byte3 = 0; | 1569 | int translate_prev_byte3 = 0; |
| 1570 | int translate_prev_byte4 = 0; | ||
| 1536 | 1571 | ||
| 1537 | #ifdef C_ALLOCA | 1572 | #ifdef C_ALLOCA |
| 1538 | int BM_tab_space[0400]; | 1573 | int BM_tab_space[0400]; |
| @@ -1598,20 +1633,23 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, | |||
| 1598 | for (i = 0; i < 0400; i++) | 1633 | for (i = 0; i < 0400; i++) |
| 1599 | simple_translate[i] = i; | 1634 | simple_translate[i] = i; |
| 1600 | 1635 | ||
| 1601 | if (charset_base) | 1636 | if (char_base) |
| 1602 | { | 1637 | { |
| 1603 | /* Setup translate_prev_byte1/2/3 from CHARSET_BASE. Only a | 1638 | /* Setup translate_prev_byte1/2/3/4 from CHAR_BASE. Only a |
| 1604 | byte following them are the target of translation. */ | 1639 | byte following them are the target of translation. */ |
| 1605 | int sample_char = charset_base | 0x20; | ||
| 1606 | unsigned char str[MAX_MULTIBYTE_LENGTH]; | 1640 | unsigned char str[MAX_MULTIBYTE_LENGTH]; |
| 1607 | int len = CHAR_STRING (sample_char, str); | 1641 | int len = CHAR_STRING (char_base, str); |
| 1608 | 1642 | ||
| 1609 | translate_prev_byte1 = str[len - 2]; | 1643 | translate_prev_byte1 = str[len - 2]; |
| 1610 | if (len > 2) | 1644 | if (len > 2) |
| 1611 | { | 1645 | { |
| 1612 | translate_prev_byte2 = str[len - 3]; | 1646 | translate_prev_byte2 = str[len - 3]; |
| 1613 | if (len > 3) | 1647 | if (len > 3) |
| 1614 | translate_prev_byte3 = str[len - 4]; | 1648 | { |
| 1649 | translate_prev_byte3 = str[len - 4]; | ||
| 1650 | if (len > 4) | ||
| 1651 | translate_prev_byte4 = str[len - 5]; | ||
| 1652 | } | ||
| 1615 | } | 1653 | } |
| 1616 | } | 1654 | } |
| 1617 | 1655 | ||
| @@ -1624,66 +1662,44 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, | |||
| 1624 | i = infinity; | 1662 | i = infinity; |
| 1625 | if (! NILP (trt)) | 1663 | if (! NILP (trt)) |
| 1626 | { | 1664 | { |
| 1627 | int ch; | 1665 | /* If the byte currently looking at is a head of a character |
| 1628 | int untranslated; | 1666 | to check case-equivalents, set CH to that character. An |
| 1629 | int this_translated = 1; | 1667 | ASCII character and a non-ASCII character matching with |
| 1630 | 1668 | CHAR_BASE are to be checked. */ | |
| 1631 | if (multibyte | 1669 | int ch = -1; |
| 1632 | /* Is *PTR the last byte of a character? */ | 1670 | |
| 1633 | && (pat_end - ptr == 1 || CHAR_HEAD_P (ptr[1]))) | 1671 | if (ASCII_BYTE_P (*ptr) || ! multibyte) |
| 1634 | { | 1672 | ch = *ptr; |
| 1635 | unsigned char *charstart = ptr; | 1673 | else if (char_base && CHAR_HEAD_P (*ptr)) |
| 1636 | while (! CHAR_HEAD_P (*charstart)) | ||
| 1637 | charstart--; | ||
| 1638 | untranslated = STRING_CHAR (charstart, ptr - charstart + 1); | ||
| 1639 | if (char_high_bits | ||
| 1640 | == (ASCII_CHAR_P (untranslated) ? 0 : untranslated & ~0x3F)) | ||
| 1641 | { | ||
| 1642 | TRANSLATE (ch, trt, untranslated); | ||
| 1643 | if (! CHAR_HEAD_P (*ptr)) | ||
| 1644 | { | ||
| 1645 | translate_prev_byte = ptr[-1]; | ||
| 1646 | if (! CHAR_HEAD_P (translate_prev_byte)) | ||
| 1647 | translate_anteprev_byte = ptr[-2]; | ||
| 1648 | } | ||
| 1649 | } | ||
| 1650 | else | ||
| 1651 | { | ||
| 1652 | this_translated = 0; | ||
| 1653 | ch = *ptr; | ||
| 1654 | } | ||
| 1655 | } | ||
| 1656 | else if (!multibyte) | ||
| 1657 | TRANSLATE (ch, trt, *ptr); | ||
| 1658 | else | ||
| 1659 | { | 1674 | { |
| 1660 | ch = *ptr; | 1675 | ch = STRING_CHAR (ptr, pat_end - ptr); |
| 1661 | this_translated = 0; | 1676 | if (char_base != (ch & ~0x3F)) |
| 1677 | ch = -1; | ||
| 1662 | } | 1678 | } |
| 1663 | 1679 | ||
| 1664 | if (this_translated | 1680 | j = *ptr; |
| 1665 | && ch >= 0200) | ||
| 1666 | j = (ch & 0x3F) | 0200; | ||
| 1667 | else | ||
| 1668 | j = (unsigned char) ch; | ||
| 1669 | |||
| 1670 | if (i == infinity) | 1681 | if (i == infinity) |
| 1671 | stride_for_teases = BM_tab[j]; | 1682 | stride_for_teases = BM_tab[j]; |
| 1672 | 1683 | ||
| 1673 | BM_tab[j] = dirlen - i; | 1684 | BM_tab[j] = dirlen - i; |
| 1674 | /* A translation table is accompanied by its inverse -- see */ | 1685 | /* A translation table is accompanied by its inverse -- see */ |
| 1675 | /* comment following downcase_table for details */ | 1686 | /* comment following downcase_table for details */ |
| 1676 | if (this_translated) | 1687 | if (ch >= 0) |
| 1677 | { | 1688 | { |
| 1678 | int starting_ch = ch; | 1689 | int starting_ch = ch; |
| 1679 | int starting_j = j; | 1690 | int starting_j; |
| 1691 | |||
| 1692 | if (ch > 0400) | ||
| 1693 | starting_j = (ch & ~0x3F) | 0200; | ||
| 1694 | else | ||
| 1695 | starting_j = ch; | ||
| 1680 | while (1) | 1696 | while (1) |
| 1681 | { | 1697 | { |
| 1682 | TRANSLATE (ch, inverse_trt, ch); | 1698 | TRANSLATE (ch, inverse_trt, ch); |
| 1683 | if (ch > 0200) | 1699 | if (ch > 0400) |
| 1684 | j = (ch & 0x3F) | 0200; | 1700 | j = (ch & ~0x3F) | 0200; |
| 1685 | else | 1701 | else |
| 1686 | j = (unsigned char) ch; | 1702 | j = ch; |
| 1687 | 1703 | ||
| 1688 | /* For all the characters that map into CH, | 1704 | /* For all the characters that map into CH, |
| 1689 | set up simple_translate to map the last byte | 1705 | set up simple_translate to map the last byte |