diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/search.c | 103 |
1 files changed, 62 insertions, 41 deletions
diff --git a/src/search.c b/src/search.c
index 29e24d75e71..0a28085308a 100644
--- a/src/search.c
+++ b/src/search.c
| @@ -1141,9 +1141,9 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, | |||
| 1141 | unsigned char *patbuf; | 1141 | unsigned char *patbuf; |
| 1142 | int multibyte = !NILP (current_buffer->enable_multibyte_characters); | 1142 | int multibyte = !NILP (current_buffer->enable_multibyte_characters); |
| 1143 | unsigned char *base_pat = SDATA (string); | 1143 | unsigned char *base_pat = SDATA (string); |
| 1144 | /* Set to nozero if we find a non-ASCII char that need | 1144 | /* Set to positive if we find a non-ASCII char that need |
| 1145 | translation. */ | 1145 | translation. Otherwise set to zero later. */ |
| 1146 | int char_base = 0; | 1146 | int char_base = -1; |
| 1147 | int boyer_moore_ok = 1; | 1147 | int boyer_moore_ok = 1; |
| 1148 | 1148 | ||
| 1149 | /* MULTIBYTE says whether the text to be searched is multibyte. | 1149 | /* MULTIBYTE says whether the text to be searched is multibyte. |
| @@ -1234,37 +1234,46 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, | |||
| 1234 | { | 1234 | { |
| 1235 | /* Check if all equivalents belong to the same | 1235 | /* Check if all equivalents belong to the same |
| 1236 | group of characters. Note that the check of C | 1236 | group of characters. Note that the check of C |
| 1237 | itself is done by the last iteration. Note | 1237 | itself is done by the last iteration. */ |
| 1238 | also that we don't have to check ASCII | 1238 | int this_char_base = -1; |
| 1239 | characters because boyer-moore search can | 1239 | |
| 1240 | always handle their translation. */ | 1240 | while (boyer_moore_ok) |
| 1241 | while (1) | ||
| 1242 | { | 1241 | { |
| 1243 | if (! ASCII_BYTE_P (inverse)) | 1242 | if (ASCII_BYTE_P (inverse)) |
| 1244 | { | 1243 | { |
| 1245 | if (CHAR_BYTE8_P (inverse)) | 1244 | if (this_char_base > 0) |
| 1246 | { | 1245 | boyer_moore_ok = 0; |
| 1247 | /* Boyer-moore search can't handle a | 1246 | else |
| 1248 | translation of an eight-bit | ||
| 1249 | character. */ | ||
| 1250 | boyer_moore_ok = 0; | ||
| 1251 | break; | ||
| 1252 | } | ||
| 1253 | else if (char_base == 0) | ||
| 1254 | char_base = inverse & ~0x3F; | ||
| 1255 | else if ((inverse & ~0x3F) | ||
| 1256 | != char_base) | ||
| 1257 | { | 1247 | { |
| 1258 | boyer_moore_ok = 0; | 1248 | this_char_base = 0; |
| 1259 | break; | 1249 | if (char_base < 0) |
| 1250 | char_base = this_char_base; | ||
| 1260 | } | 1251 | } |
| 1261 | } | 1252 | } |
| 1253 | else if (CHAR_BYTE8_P (inverse)) | ||
| 1254 | /* Boyer-moore search can't handle a | ||
| 1255 | translation of an eight-bit | ||
| 1256 | character. */ | ||
| 1257 | boyer_moore_ok = 0; | ||
| 1258 | else if (this_char_base < 0) | ||
| 1259 | { | ||
| 1260 | this_char_base = inverse & ~0x3F; | ||
| 1261 | if (char_base < 0) | ||
| 1262 | char_base = this_char_base; | ||
| 1263 | else if (char_base > 0 | ||
| 1264 | && this_char_base != char_base) | ||
| 1265 | boyer_moore_ok = 0; | ||
| 1266 | } | ||
| 1267 | else if ((inverse & ~0x3F) != this_char_base) | ||
| 1268 | boyer_moore_ok = 0; | ||
| 1262 | if (c == inverse) | 1269 | if (c == inverse) |
| 1263 | break; | 1270 | break; |
| 1264 | TRANSLATE (inverse, inverse_trt, inverse); | 1271 | TRANSLATE (inverse, inverse_trt, inverse); |
| 1265 | } | 1272 | } |
| 1266 | } | 1273 | } |
| 1267 | } | 1274 | } |
| 1275 | if (char_base < 0) | ||
| 1276 | char_base = 0; | ||
| 1268 | 1277 | ||
| 1269 | /* Store this character into the translated pattern. */ | 1278 | /* Store this character into the translated pattern. */ |
| 1270 | bcopy (str, pat, charlen); | 1279 | bcopy (str, pat, charlen); |
| @@ -1333,6 +1342,9 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte) | |||
| 1333 | { | 1342 | { |
| 1334 | int multibyte = ! NILP (current_buffer->enable_multibyte_characters); | 1343 | int multibyte = ! NILP (current_buffer->enable_multibyte_characters); |
| 1335 | int forward = n > 0; | 1344 | int forward = n > 0; |
| 1345 | /* Number of buffer bytes matched. Note that this may be different | ||
| 1346 | from len_byte in a multibyte buffer. */ | ||
| 1347 | int match_byte; | ||
| 1336 | 1348 | ||
| 1337 | if (lim > pos && multibyte) | 1349 | if (lim > pos && multibyte) |
| 1338 | while (n > 0) | 1350 | while (n > 0) |
| @@ -1372,8 +1384,9 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte) | |||
| 1372 | 1384 | ||
| 1373 | if (this_len == 0) | 1385 | if (this_len == 0) |
| 1374 | { | 1386 | { |
| 1387 | match_byte = this_pos_byte - pos_byte; | ||
| 1375 | pos += len; | 1388 | pos += len; |
| 1376 | pos_byte += len_byte; | 1389 | pos_byte += match_byte; |
| 1377 | break; | 1390 | break; |
| 1378 | } | 1391 | } |
| 1379 | 1392 | ||
| @@ -1410,6 +1423,7 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte) | |||
| 1410 | 1423 | ||
| 1411 | if (this_len == 0) | 1424 | if (this_len == 0) |
| 1412 | { | 1425 | { |
| 1426 | match_byte = len; | ||
| 1413 | pos += len; | 1427 | pos += len; |
| 1414 | break; | 1428 | break; |
| 1415 | } | 1429 | } |
| @@ -1435,6 +1449,7 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte) | |||
| 1435 | if (pos - len < lim) | 1449 | if (pos - len < lim) |
| 1436 | goto stop; | 1450 | goto stop; |
| 1437 | this_pos_byte = CHAR_TO_BYTE (this_pos); | 1451 | this_pos_byte = CHAR_TO_BYTE (this_pos); |
| 1452 | match_byte = pos_byte - this_pos_byte; | ||
| 1438 | 1453 | ||
| 1439 | while (this_len > 0) | 1454 | while (this_len > 0) |
| 1440 | { | 1455 | { |
| @@ -1460,7 +1475,7 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte) | |||
| 1460 | if (this_len == 0) | 1475 | if (this_len == 0) |
| 1461 | { | 1476 | { |
| 1462 | pos -= len; | 1477 | pos -= len; |
| 1463 | pos_byte -= len_byte; | 1478 | pos_byte -= match_byte; |
| 1464 | break; | 1479 | break; |
| 1465 | } | 1480 | } |
| 1466 | 1481 | ||
| @@ -1496,6 +1511,7 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte) | |||
| 1496 | 1511 | ||
| 1497 | if (this_len == 0) | 1512 | if (this_len == 0) |
| 1498 | { | 1513 | { |
| 1514 | match_byte = len; | ||
| 1499 | pos -= len; | 1515 | pos -= len; |
| 1500 | break; | 1516 | break; |
| 1501 | } | 1517 | } |
| @@ -1510,9 +1526,9 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte) | |||
| 1510 | if (n == 0) | 1526 | if (n == 0) |
| 1511 | { | 1527 | { |
| 1512 | if (forward) | 1528 | if (forward) |
| 1513 | set_search_regs ((multibyte ? pos_byte : pos) - len_byte, len_byte); | 1529 | set_search_regs ((multibyte ? pos_byte : pos) - match_byte, match_byte); |
| 1514 | else | 1530 | else |
| 1515 | set_search_regs (multibyte ? pos_byte : pos, len_byte); | 1531 | set_search_regs (multibyte ? pos_byte : pos, match_byte); |
| 1516 | 1532 | ||
| 1517 | return pos; | 1533 | return pos; |
| 1518 | } | 1534 | } |
| @@ -1561,7 +1577,7 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, | |||
| 1561 | 1577 | ||
| 1562 | unsigned char simple_translate[0400]; | 1578 | unsigned char simple_translate[0400]; |
| 1563 | /* These are set to the preceding bytes of a byte to be translated | 1579 | /* These are set to the preceding bytes of a byte to be translated |
| 1564 | if charset_base is nonzero. As the maximum byte length of a | 1580 | if char_base is nonzero. As the maximum byte length of a |
| 1565 | multibyte character is 5, we have to check at most four previous | 1581 | multibyte character is 5, we have to check at most four previous |
| 1566 | bytes. */ | 1582 | bytes. */ |
| 1567 | int translate_prev_byte1 = 0; | 1583 | int translate_prev_byte1 = 0; |
| @@ -1662,22 +1678,31 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, | |||
| 1662 | i = infinity; | 1678 | i = infinity; |
| 1663 | if (! NILP (trt)) | 1679 | if (! NILP (trt)) |
| 1664 | { | 1680 | { |
| 1665 | /* If the byte currently looking at is a head of a character | 1681 | /* If the byte currently looking at is the last of a |
| 1666 | to check case-equivalents, set CH to that character. An | 1682 | character to check case-equivalents, set CH to that |
| 1667 | ASCII character and a non-ASCII character matching with | 1683 | character. An ASCII character and a non-ASCII character |
| 1668 | CHAR_BASE are to be checked. */ | 1684 | matching with CHAR_BASE are to be checked. */ |
| 1669 | int ch = -1; | 1685 | int ch = -1; |
| 1670 | 1686 | ||
| 1671 | if (ASCII_BYTE_P (*ptr) || ! multibyte) | 1687 | if (ASCII_BYTE_P (*ptr) || ! multibyte) |
| 1672 | ch = *ptr; | 1688 | ch = *ptr; |
| 1673 | else if (char_base && CHAR_HEAD_P (*ptr)) | 1689 | else if (char_base |
| 1690 | && (pat_end - ptr) == 1 || CHAR_HEAD_P (ptr[1])) | ||
| 1674 | { | 1691 | { |
| 1675 | ch = STRING_CHAR (ptr, pat_end - ptr); | 1692 | unsigned char *charstart = ptr - 1; |
| 1693 | |||
| 1694 | while (! (CHAR_HEAD_P (*charstart))) | ||
| 1695 | charstart--; | ||
| 1696 | ch = STRING_CHAR (charstart, ptr - charstart + 1); | ||
| 1676 | if (char_base != (ch & ~0x3F)) | 1697 | if (char_base != (ch & ~0x3F)) |
| 1677 | ch = -1; | 1698 | ch = -1; |
| 1678 | } | 1699 | } |
| 1679 | 1700 | ||
| 1680 | j = *ptr; | 1701 | if (ch > 0400) |
| 1702 | j = (ch & 0x3F) | 0200; | ||
| 1703 | else | ||
| 1704 | j = *ptr; | ||
| 1705 | |||
| 1681 | if (i == infinity) | 1706 | if (i == infinity) |
| 1682 | stride_for_teases = BM_tab[j]; | 1707 | stride_for_teases = BM_tab[j]; |
| 1683 | 1708 | ||
| @@ -1687,17 +1712,13 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, | |||
| 1687 | if (ch >= 0) | 1712 | if (ch >= 0) |
| 1688 | { | 1713 | { |
| 1689 | int starting_ch = ch; | 1714 | int starting_ch = ch; |
| 1690 | int starting_j; | 1715 | int starting_j = j; |
| 1691 | 1716 | ||
| 1692 | if (ch > 0400) | ||
| 1693 | starting_j = (ch & ~0x3F) | 0200; | ||
| 1694 | else | ||
| 1695 | starting_j = ch; | ||
| 1696 | while (1) | 1717 | while (1) |
| 1697 | { | 1718 | { |
| 1698 | TRANSLATE (ch, inverse_trt, ch); | 1719 | TRANSLATE (ch, inverse_trt, ch); |
| 1699 | if (ch > 0400) | 1720 | if (ch > 0400) |
| 1700 | j = (ch & ~0x3F) | 0200; | 1721 | j = (ch & 0x3F) | 0200; |
| 1701 | else | 1722 | else |
| 1702 | j = ch; | 1723 | j = ch; |
| 1703 | 1724 | ||