diff options
Diffstat (limited to 'src/search.c')
| -rw-r--r-- | src/search.c | 223 |
1 files changed, 129 insertions, 94 deletions
diff --git a/src/search.c b/src/search.c index 7e990f2bfd7..d86a7cca7b2 100644 --- a/src/search.c +++ b/src/search.c | |||
| @@ -293,7 +293,7 @@ looking_at_1 (string, posix) | |||
| 293 | CHECK_STRING (string); | 293 | CHECK_STRING (string); |
| 294 | bufp = compile_pattern (string, &search_regs, | 294 | bufp = compile_pattern (string, &search_regs, |
| 295 | (!NILP (current_buffer->case_fold_search) | 295 | (!NILP (current_buffer->case_fold_search) |
| 296 | ? DOWNCASE_TABLE : Qnil), | 296 | ? current_buffer->case_canon_table : Qnil), |
| 297 | posix, | 297 | posix, |
| 298 | !NILP (current_buffer->enable_multibyte_characters)); | 298 | !NILP (current_buffer->enable_multibyte_characters)); |
| 299 | 299 | ||
| @@ -399,7 +399,7 @@ string_match_1 (regexp, string, start, posix) | |||
| 399 | 399 | ||
| 400 | bufp = compile_pattern (regexp, &search_regs, | 400 | bufp = compile_pattern (regexp, &search_regs, |
| 401 | (!NILP (current_buffer->case_fold_search) | 401 | (!NILP (current_buffer->case_fold_search) |
| 402 | ? DOWNCASE_TABLE : Qnil), | 402 | ? current_buffer->case_canon_table : Qnil), |
| 403 | posix, | 403 | posix, |
| 404 | STRING_MULTIBYTE (string)); | 404 | STRING_MULTIBYTE (string)); |
| 405 | immediate_quit = 1; | 405 | immediate_quit = 1; |
| @@ -499,7 +499,7 @@ fast_c_string_match_ignore_case (regexp, string) | |||
| 499 | regexp = string_make_unibyte (regexp); | 499 | regexp = string_make_unibyte (regexp); |
| 500 | re_match_object = Qt; | 500 | re_match_object = Qt; |
| 501 | bufp = compile_pattern (regexp, 0, | 501 | bufp = compile_pattern (regexp, 0, |
| 502 | Vascii_downcase_table, 0, | 502 | Vascii_canon_table, 0, |
| 503 | 0); | 503 | 0); |
| 504 | immediate_quit = 1; | 504 | immediate_quit = 1; |
| 505 | val = re_search (bufp, string, len, 0, len, 0); | 505 | val = re_search (bufp, string, len, 0, len, 0); |
| @@ -516,7 +516,7 @@ fast_string_match_ignore_case (regexp, string) | |||
| 516 | int val; | 516 | int val; |
| 517 | struct re_pattern_buffer *bufp; | 517 | struct re_pattern_buffer *bufp; |
| 518 | 518 | ||
| 519 | bufp = compile_pattern (regexp, 0, Vascii_downcase_table, | 519 | bufp = compile_pattern (regexp, 0, Vascii_canon_table, |
| 520 | 0, STRING_MULTIBYTE (string)); | 520 | 0, STRING_MULTIBYTE (string)); |
| 521 | immediate_quit = 1; | 521 | immediate_quit = 1; |
| 522 | re_match_object = string; | 522 | re_match_object = string; |
| @@ -1175,7 +1175,9 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, | |||
| 1175 | unsigned char *patbuf; | 1175 | unsigned char *patbuf; |
| 1176 | int multibyte = !NILP (current_buffer->enable_multibyte_characters); | 1176 | int multibyte = !NILP (current_buffer->enable_multibyte_characters); |
| 1177 | unsigned char *base_pat = SDATA (string); | 1177 | unsigned char *base_pat = SDATA (string); |
| 1178 | int charset_base = -1; | 1178 | /* Set to nozero if we find a non-ASCII char that need |
| 1179 | translation. */ | ||
| 1180 | int charset_base = 0; | ||
| 1179 | int boyer_moore_ok = 1; | 1181 | int boyer_moore_ok = 1; |
| 1180 | 1182 | ||
| 1181 | /* MULTIBYTE says whether the text to be searched is multibyte. | 1183 | /* MULTIBYTE says whether the text to be searched is multibyte. |
| @@ -1221,9 +1223,17 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, | |||
| 1221 | base_pat = raw_pattern; | 1223 | base_pat = raw_pattern; |
| 1222 | if (multibyte) | 1224 | if (multibyte) |
| 1223 | { | 1225 | { |
| 1226 | /* Fill patbuf by translated characters in STRING while | ||
| 1227 | checking if we can use boyer-moore search. If TRT is | ||
| 1228 | non-nil, we can use boyer-moore search only if TRT can be | ||
| 1229 | represented by the byte array of 256 elements. For that, | ||
| 1230 | all non-ASCII case-equivalents of all case-senstive | ||
| 1231 | characters in STRING must belong to the same charset and | ||
| 1232 | row. */ | ||
| 1233 | |||
| 1224 | while (--len >= 0) | 1234 | while (--len >= 0) |
| 1225 | { | 1235 | { |
| 1226 | unsigned char str[MAX_MULTIBYTE_LENGTH]; | 1236 | unsigned char str_base[MAX_MULTIBYTE_LENGTH], *str; |
| 1227 | int c, translated, inverse; | 1237 | int c, translated, inverse; |
| 1228 | int in_charlen, charlen; | 1238 | int in_charlen, charlen; |
| 1229 | 1239 | ||
| @@ -1233,50 +1243,62 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, | |||
| 1233 | if (RE && *base_pat == '\\') | 1243 | if (RE && *base_pat == '\\') |
| 1234 | { | 1244 | { |
| 1235 | len--; | 1245 | len--; |
| 1246 | raw_pattern_size--; | ||
| 1236 | len_byte--; | 1247 | len_byte--; |
| 1237 | base_pat++; | 1248 | base_pat++; |
| 1238 | } | 1249 | } |
| 1239 | 1250 | ||
| 1240 | c = STRING_CHAR_AND_LENGTH (base_pat, len_byte, in_charlen); | 1251 | c = STRING_CHAR_AND_LENGTH (base_pat, len_byte, in_charlen); |
| 1241 | 1252 | ||
| 1242 | /* Translate the character, if requested. */ | 1253 | if (NILP (trt)) |
| 1243 | TRANSLATE (translated, trt, c); | ||
| 1244 | /* If translation changed the byte-length, go back | ||
| 1245 | to the original character. */ | ||
| 1246 | charlen = CHAR_STRING (translated, str); | ||
| 1247 | if (in_charlen != charlen) | ||
| 1248 | { | 1254 | { |
| 1249 | translated = c; | 1255 | str = base_pat; |
| 1250 | charlen = CHAR_STRING (c, str); | 1256 | charlen = in_charlen; |
| 1251 | } | 1257 | } |
| 1252 | 1258 | else | |
| 1253 | /* If we are searching for something strange, | ||
| 1254 | an invalid multibyte code, don't use boyer-moore. */ | ||
| 1255 | if (! ASCII_BYTE_P (translated) | ||
| 1256 | && (charlen == 1 /* 8bit code */ | ||
| 1257 | || charlen != in_charlen /* invalid multibyte code */ | ||
| 1258 | )) | ||
| 1259 | boyer_moore_ok = 0; | ||
| 1260 | |||
| 1261 | TRANSLATE (inverse, inverse_trt, c); | ||
| 1262 | |||
| 1263 | /* Did this char actually get translated? | ||
| 1264 | Would any other char get translated into it? */ | ||
| 1265 | if (translated != c || inverse != c) | ||
| 1266 | { | 1259 | { |
| 1267 | /* Keep track of which character set row | 1260 | /* Translate the character. */ |
| 1268 | contains the characters that need translation. */ | 1261 | TRANSLATE (translated, trt, c); |
| 1269 | int charset_base_code = c & ~CHAR_FIELD3_MASK; | 1262 | charlen = CHAR_STRING (translated, str_base); |
| 1270 | int inverse_charset_base = inverse & ~CHAR_FIELD3_MASK; | 1263 | str = str_base; |
| 1271 | 1264 | ||
| 1272 | if (charset_base_code != inverse_charset_base) | 1265 | /* Check if C has any other case-equivalents. */ |
| 1273 | boyer_moore_ok = 0; | 1266 | TRANSLATE (inverse, inverse_trt, c); |
| 1274 | else if (charset_base == -1) | 1267 | /* If so, check if we can use boyer-moore. */ |
| 1275 | charset_base = charset_base_code; | 1268 | if (c != inverse && boyer_moore_ok) |
| 1276 | else if (charset_base != charset_base_code) | 1269 | { |
| 1277 | /* If two different rows appear, needing translation, | 1270 | /* Check if all equivalents belong to the same |
| 1278 | then we cannot use boyer_moore search. */ | 1271 | charset & row. Note that the check of C |
| 1279 | boyer_moore_ok = 0; | 1272 | itself is done by the last iteration. Note |
| 1273 | also that we don't have to check ASCII | ||
| 1274 | characters because boyer-moore search can | ||
| 1275 | always handle their translation. */ | ||
| 1276 | while (1) | ||
| 1277 | { | ||
| 1278 | if (! ASCII_BYTE_P (inverse)) | ||
| 1279 | { | ||
| 1280 | if (SINGLE_BYTE_CHAR_P (inverse)) | ||
| 1281 | { | ||
| 1282 | /* Boyer-moore search can't handle a | ||
| 1283 | translation of an eight-bit | ||
| 1284 | character. */ | ||
| 1285 | boyer_moore_ok = 0; | ||
| 1286 | break; | ||
| 1287 | } | ||
| 1288 | else if (charset_base == 0) | ||
| 1289 | charset_base = inverse & ~CHAR_FIELD3_MASK; | ||
| 1290 | else if ((inverse & ~CHAR_FIELD3_MASK) | ||
| 1291 | != charset_base) | ||
| 1292 | { | ||
| 1293 | boyer_moore_ok = 0; | ||
| 1294 | break; | ||
| 1295 | } | ||
| 1296 | } | ||
| 1297 | if (c == inverse) | ||
| 1298 | break; | ||
| 1299 | TRANSLATE (inverse, inverse_trt, inverse); | ||
| 1300 | } | ||
| 1301 | } | ||
| 1280 | } | 1302 | } |
| 1281 | 1303 | ||
| 1282 | /* Store this character into the translated pattern. */ | 1304 | /* Store this character into the translated pattern. */ |
| @@ -1300,6 +1322,7 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, | |||
| 1300 | if (RE && *base_pat == '\\') | 1322 | if (RE && *base_pat == '\\') |
| 1301 | { | 1323 | { |
| 1302 | len--; | 1324 | len--; |
| 1325 | raw_pattern_size--; | ||
| 1303 | base_pat++; | 1326 | base_pat++; |
| 1304 | } | 1327 | } |
| 1305 | c = *base_pat++; | 1328 | c = *base_pat++; |
| @@ -1533,16 +1556,18 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte) | |||
| 1533 | return n; | 1556 | return n; |
| 1534 | } | 1557 | } |
| 1535 | 1558 | ||
| 1536 | /* Do Boyer-Moore search N times for the string PAT, | 1559 | /* Do Boyer-Moore search N times for the string BASE_PAT, |
| 1537 | whose length is LEN/LEN_BYTE, | 1560 | whose length is LEN/LEN_BYTE, |
| 1538 | from buffer position POS/POS_BYTE until LIM/LIM_BYTE. | 1561 | from buffer position POS/POS_BYTE until LIM/LIM_BYTE. |
| 1539 | DIRECTION says which direction we search in. | 1562 | DIRECTION says which direction we search in. |
| 1540 | TRT and INVERSE_TRT are translation tables. | 1563 | TRT and INVERSE_TRT are translation tables. |
| 1564 | Characters in PAT are already translated by TRT. | ||
| 1541 | 1565 | ||
| 1542 | This kind of search works if all the characters in PAT that have | 1566 | This kind of search works if all the characters in BASE_PAT that |
| 1543 | nontrivial translation are the same aside from the last byte. This | 1567 | have nontrivial translation are the same aside from the last byte. |
| 1544 | makes it possible to translate just the last byte of a character, | 1568 | This makes it possible to translate just the last byte of a |
| 1545 | and do so after just a simple test of the context. | 1569 | character, and do so after just a simple test of the context. |
| 1570 | CHARSET_BASE is nonzero iff there is such a non-ASCII character. | ||
| 1546 | 1571 | ||
| 1547 | If that criterion is not satisfied, do not call this function. */ | 1572 | If that criterion is not satisfied, do not call this function. */ |
| 1548 | 1573 | ||
| @@ -1569,8 +1594,13 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, | |||
| 1569 | int multibyte = ! NILP (current_buffer->enable_multibyte_characters); | 1594 | int multibyte = ! NILP (current_buffer->enable_multibyte_characters); |
| 1570 | 1595 | ||
| 1571 | unsigned char simple_translate[0400]; | 1596 | unsigned char simple_translate[0400]; |
| 1572 | int translate_prev_byte = 0; | 1597 | /* These are set to the preceding bytes of a byte to be translated |
| 1573 | int translate_anteprev_byte = 0; | 1598 | if charset_base is nonzero. As the maximum byte length of a |
| 1599 | multibyte character is 4, we have to check at most three previous | ||
| 1600 | bytes. */ | ||
| 1601 | int translate_prev_byte1 = 0; | ||
| 1602 | int translate_prev_byte2 = 0; | ||
| 1603 | int translate_prev_byte3 = 0; | ||
| 1574 | 1604 | ||
| 1575 | #ifdef C_ALLOCA | 1605 | #ifdef C_ALLOCA |
| 1576 | int BM_tab_space[0400]; | 1606 | int BM_tab_space[0400]; |
| @@ -1636,6 +1666,23 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, | |||
| 1636 | for (i = 0; i < 0400; i++) | 1666 | for (i = 0; i < 0400; i++) |
| 1637 | simple_translate[i] = i; | 1667 | simple_translate[i] = i; |
| 1638 | 1668 | ||
| 1669 | if (charset_base) | ||
| 1670 | { | ||
| 1671 | /* Setup translate_prev_byte1/2/3 from CHARSET_BASE. Only a | ||
| 1672 | byte following them are the target of translation. */ | ||
| 1673 | int sample_char = charset_base | 0x20; | ||
| 1674 | unsigned char str[MAX_MULTIBYTE_LENGTH]; | ||
| 1675 | int len = CHAR_STRING (sample_char, str); | ||
| 1676 | |||
| 1677 | translate_prev_byte1 = str[len - 2]; | ||
| 1678 | if (len > 2) | ||
| 1679 | { | ||
| 1680 | translate_prev_byte2 = str[len - 3]; | ||
| 1681 | if (len > 3) | ||
| 1682 | translate_prev_byte3 = str[len - 4]; | ||
| 1683 | } | ||
| 1684 | } | ||
| 1685 | |||
| 1639 | i = 0; | 1686 | i = 0; |
| 1640 | while (i != infinity) | 1687 | while (i != infinity) |
| 1641 | { | 1688 | { |
| @@ -1645,57 +1692,37 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, | |||
| 1645 | i = infinity; | 1692 | i = infinity; |
| 1646 | if (! NILP (trt)) | 1693 | if (! NILP (trt)) |
| 1647 | { | 1694 | { |
| 1648 | int ch; | 1695 | /* If the byte currently looking at is a head of a character |
| 1649 | int untranslated; | 1696 | to check case-equivalents, set CH to that character. An |
| 1650 | int this_translated = 1; | 1697 | ASCII character and a non-ASCII character matching with |
| 1651 | 1698 | CHARSET_BASE are to be checked. */ | |
| 1652 | if (multibyte | 1699 | int ch = -1; |
| 1653 | /* Is *PTR the last byte of a character? */ | 1700 | |
| 1654 | && (pat_end - ptr == 1 || CHAR_HEAD_P (ptr[1]))) | 1701 | if (ASCII_BYTE_P (*ptr) || ! multibyte) |
| 1702 | ch = *ptr; | ||
| 1703 | else if (charset_base && CHAR_HEAD_P (*ptr)) | ||
| 1655 | { | 1704 | { |
| 1656 | unsigned char *charstart = ptr; | 1705 | ch = STRING_CHAR (ptr, pat_end - ptr); |
| 1657 | while (! CHAR_HEAD_P (*charstart)) | 1706 | if (charset_base != (ch & ~CHAR_FIELD3_MASK)) |
| 1658 | charstart--; | 1707 | ch = -1; |
| 1659 | untranslated = STRING_CHAR (charstart, ptr - charstart + 1); | ||
| 1660 | if (charset_base == (untranslated & ~CHAR_FIELD3_MASK)) | ||
| 1661 | { | ||
| 1662 | TRANSLATE (ch, trt, untranslated); | ||
| 1663 | if (! CHAR_HEAD_P (*ptr)) | ||
| 1664 | { | ||
| 1665 | translate_prev_byte = ptr[-1]; | ||
| 1666 | if (! CHAR_HEAD_P (translate_prev_byte)) | ||
| 1667 | translate_anteprev_byte = ptr[-2]; | ||
| 1668 | } | ||
| 1669 | } | ||
| 1670 | else | ||
| 1671 | { | ||
| 1672 | this_translated = 0; | ||
| 1673 | ch = *ptr; | ||
| 1674 | } | ||
| 1675 | } | 1708 | } |
| 1676 | else if (!multibyte) | ||
| 1677 | TRANSLATE (ch, trt, *ptr); | ||
| 1678 | else | ||
| 1679 | { | ||
| 1680 | ch = *ptr; | ||
| 1681 | this_translated = 0; | ||
| 1682 | } | ||
| 1683 | |||
| 1684 | if (ch > 0400) | ||
| 1685 | j = ((unsigned char) ch) | 0200; | ||
| 1686 | else | ||
| 1687 | j = (unsigned char) ch; | ||
| 1688 | 1709 | ||
| 1710 | j = *ptr; | ||
| 1689 | if (i == infinity) | 1711 | if (i == infinity) |
| 1690 | stride_for_teases = BM_tab[j]; | 1712 | stride_for_teases = BM_tab[j]; |
| 1691 | 1713 | ||
| 1692 | BM_tab[j] = dirlen - i; | 1714 | BM_tab[j] = dirlen - i; |
| 1693 | /* A translation table is accompanied by its inverse -- see */ | 1715 | /* A translation table is accompanied by its inverse -- see */ |
| 1694 | /* comment following downcase_table for details */ | 1716 | /* comment following downcase_table for details */ |
| 1695 | if (this_translated) | 1717 | if (ch >= 0) |
| 1696 | { | 1718 | { |
| 1697 | int starting_ch = ch; | 1719 | int starting_ch = ch; |
| 1698 | int starting_j = j; | 1720 | int starting_j; |
| 1721 | |||
| 1722 | if (ch > 0400) | ||
| 1723 | starting_j = ((unsigned char) ch) | 0200; | ||
| 1724 | else | ||
| 1725 | starting_j = (unsigned char) ch; | ||
| 1699 | while (1) | 1726 | while (1) |
| 1700 | { | 1727 | { |
| 1701 | TRANSLATE (ch, inverse_trt, ch); | 1728 | TRANSLATE (ch, inverse_trt, ch); |
| @@ -1821,9 +1848,13 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, | |||
| 1821 | || ((cursor == tail_end_ptr | 1848 | || ((cursor == tail_end_ptr |
| 1822 | || CHAR_HEAD_P (cursor[1])) | 1849 | || CHAR_HEAD_P (cursor[1])) |
| 1823 | && (CHAR_HEAD_P (cursor[0]) | 1850 | && (CHAR_HEAD_P (cursor[0]) |
| 1824 | || (translate_prev_byte == cursor[-1] | 1851 | /* Check if this is the last byte of |
| 1825 | && (CHAR_HEAD_P (translate_prev_byte) | 1852 | a translable character. */ |
| 1826 | || translate_anteprev_byte == cursor[-2]))))) | 1853 | || (translate_prev_byte1 == cursor[-1] |
| 1854 | && (CHAR_HEAD_P (translate_prev_byte1) | ||
| 1855 | || (translate_prev_byte2 == cursor[-2] | ||
| 1856 | && (CHAR_HEAD_P (translate_prev_byte2) | ||
| 1857 | || (translate_prev_byte3 == cursor[-3])))))))) | ||
| 1827 | ch = simple_translate[*cursor]; | 1858 | ch = simple_translate[*cursor]; |
| 1828 | else | 1859 | else |
| 1829 | ch = *cursor; | 1860 | ch = *cursor; |
| @@ -1901,9 +1932,13 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, | |||
| 1901 | || ((ptr == tail_end_ptr | 1932 | || ((ptr == tail_end_ptr |
| 1902 | || CHAR_HEAD_P (ptr[1])) | 1933 | || CHAR_HEAD_P (ptr[1])) |
| 1903 | && (CHAR_HEAD_P (ptr[0]) | 1934 | && (CHAR_HEAD_P (ptr[0]) |
| 1904 | || (translate_prev_byte == ptr[-1] | 1935 | /* Check if this is the last byte of a |
| 1905 | && (CHAR_HEAD_P (translate_prev_byte) | 1936 | translable character. */ |
| 1906 | || translate_anteprev_byte == ptr[-2]))))) | 1937 | || (translate_prev_byte1 == ptr[-1] |
| 1938 | && (CHAR_HEAD_P (translate_prev_byte1) | ||
| 1939 | || (translate_prev_byte2 == ptr[-2] | ||
| 1940 | && (CHAR_HEAD_P (translate_prev_byte2) | ||
| 1941 | || translate_prev_byte3 == ptr[-3]))))))) | ||
| 1907 | ch = simple_translate[*ptr]; | 1942 | ch = simple_translate[*ptr]; |
| 1908 | else | 1943 | else |
| 1909 | ch = *ptr; | 1944 | ch = *ptr; |