aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorKenichi Handa2005-10-17 04:15:51 +0000
committerKenichi Handa2005-10-17 04:15:51 +0000
commit49e44e813481da2bedc4aebe4fd3b38ab7af052b (patch)
treeef445e7a2a206fcad325665d3b846cb7eb4b17f4 /src
parent4b4cc57c0133217f41c06bc3293b572d0a06ac4d (diff)
downloademacs-49e44e813481da2bedc4aebe4fd3b38ab7af052b.tar.gz
emacs-49e44e813481da2bedc4aebe4fd3b38ab7af052b.zip
(search_buffer): Give up BM search on case-fold-search
if one of the target characters has a case-equivalent of a different byte length, even if that target character is ASCII. (simple_search): Fix calculation of byte length of matched text. (boyer_moore): Fix handling of case-equivalent multibyte characters.
Diffstat (limited to 'src')
-rw-r--r--src/search.c103
1 files changed, 62 insertions, 41 deletions
diff --git a/src/search.c b/src/search.c
index 29e24d75e71..0a28085308a 100644
--- a/src/search.c
+++ b/src/search.c
@@ -1141,9 +1141,9 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
1141 unsigned char *patbuf; 1141 unsigned char *patbuf;
1142 int multibyte = !NILP (current_buffer->enable_multibyte_characters); 1142 int multibyte = !NILP (current_buffer->enable_multibyte_characters);
1143 unsigned char *base_pat = SDATA (string); 1143 unsigned char *base_pat = SDATA (string);
1144 /* Set to nozero if we find a non-ASCII char that need 1144 /* Set to positive if we find a non-ASCII char that need
1145 translation. */ 1145 translation. Otherwise set to zero later. */
1146 int char_base = 0; 1146 int char_base = -1;
1147 int boyer_moore_ok = 1; 1147 int boyer_moore_ok = 1;
1148 1148
1149 /* MULTIBYTE says whether the text to be searched is multibyte. 1149 /* MULTIBYTE says whether the text to be searched is multibyte.
@@ -1234,37 +1234,46 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
1234 { 1234 {
1235 /* Check if all equivalents belong to the same 1235 /* Check if all equivalents belong to the same
1236 group of characters. Note that the check of C 1236 group of characters. Note that the check of C
1237 itself is done by the last iteration. Note 1237 itself is done by the last iteration. */
1238 also that we don't have to check ASCII 1238 int this_char_base = -1;
1239 characters because boyer-moore search can 1239
1240 always handle their translation. */ 1240 while (boyer_moore_ok)
1241 while (1)
1242 { 1241 {
1243 if (! ASCII_BYTE_P (inverse)) 1242 if (ASCII_BYTE_P (inverse))
1244 { 1243 {
1245 if (CHAR_BYTE8_P (inverse)) 1244 if (this_char_base > 0)
1246 { 1245 boyer_moore_ok = 0;
1247 /* Boyer-moore search can't handle a 1246 else
1248 translation of an eight-bit
1249 character. */
1250 boyer_moore_ok = 0;
1251 break;
1252 }
1253 else if (char_base == 0)
1254 char_base = inverse & ~0x3F;
1255 else if ((inverse & ~0x3F)
1256 != char_base)
1257 { 1247 {
1258 boyer_moore_ok = 0; 1248 this_char_base = 0;
1259 break; 1249 if (char_base < 0)
1250 char_base = this_char_base;
1260 } 1251 }
1261 } 1252 }
1253 else if (CHAR_BYTE8_P (inverse))
1254 /* Boyer-moore search can't handle a
1255 translation of an eight-bit
1256 character. */
1257 boyer_moore_ok = 0;
1258 else if (this_char_base < 0)
1259 {
1260 this_char_base = inverse & ~0x3F;
1261 if (char_base < 0)
1262 char_base = this_char_base;
1263 else if (char_base > 0
1264 && this_char_base != char_base)
1265 boyer_moore_ok = 0;
1266 }
1267 else if ((inverse & ~0x3F) != this_char_base)
1268 boyer_moore_ok = 0;
1262 if (c == inverse) 1269 if (c == inverse)
1263 break; 1270 break;
1264 TRANSLATE (inverse, inverse_trt, inverse); 1271 TRANSLATE (inverse, inverse_trt, inverse);
1265 } 1272 }
1266 } 1273 }
1267 } 1274 }
1275 if (char_base < 0)
1276 char_base = 0;
1268 1277
1269 /* Store this character into the translated pattern. */ 1278 /* Store this character into the translated pattern. */
1270 bcopy (str, pat, charlen); 1279 bcopy (str, pat, charlen);
@@ -1333,6 +1342,9 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte)
1333{ 1342{
1334 int multibyte = ! NILP (current_buffer->enable_multibyte_characters); 1343 int multibyte = ! NILP (current_buffer->enable_multibyte_characters);
1335 int forward = n > 0; 1344 int forward = n > 0;
1345 /* Number of buffer bytes matched. Note that this may be different
1346 from len_byte in a multibyte buffer. */
1347 int match_byte;
1336 1348
1337 if (lim > pos && multibyte) 1349 if (lim > pos && multibyte)
1338 while (n > 0) 1350 while (n > 0)
@@ -1372,8 +1384,9 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte)
1372 1384
1373 if (this_len == 0) 1385 if (this_len == 0)
1374 { 1386 {
1387 match_byte = this_pos_byte - pos_byte;
1375 pos += len; 1388 pos += len;
1376 pos_byte += len_byte; 1389 pos_byte += match_byte;
1377 break; 1390 break;
1378 } 1391 }
1379 1392
@@ -1410,6 +1423,7 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte)
1410 1423
1411 if (this_len == 0) 1424 if (this_len == 0)
1412 { 1425 {
1426 match_byte = len;
1413 pos += len; 1427 pos += len;
1414 break; 1428 break;
1415 } 1429 }
@@ -1435,6 +1449,7 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte)
1435 if (pos - len < lim) 1449 if (pos - len < lim)
1436 goto stop; 1450 goto stop;
1437 this_pos_byte = CHAR_TO_BYTE (this_pos); 1451 this_pos_byte = CHAR_TO_BYTE (this_pos);
1452 match_byte = pos_byte - this_pos_byte;
1438 1453
1439 while (this_len > 0) 1454 while (this_len > 0)
1440 { 1455 {
@@ -1460,7 +1475,7 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte)
1460 if (this_len == 0) 1475 if (this_len == 0)
1461 { 1476 {
1462 pos -= len; 1477 pos -= len;
1463 pos_byte -= len_byte; 1478 pos_byte -= match_byte;
1464 break; 1479 break;
1465 } 1480 }
1466 1481
@@ -1496,6 +1511,7 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte)
1496 1511
1497 if (this_len == 0) 1512 if (this_len == 0)
1498 { 1513 {
1514 match_byte = len;
1499 pos -= len; 1515 pos -= len;
1500 break; 1516 break;
1501 } 1517 }
@@ -1510,9 +1526,9 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte)
1510 if (n == 0) 1526 if (n == 0)
1511 { 1527 {
1512 if (forward) 1528 if (forward)
1513 set_search_regs ((multibyte ? pos_byte : pos) - len_byte, len_byte); 1529 set_search_regs ((multibyte ? pos_byte : pos) - match_byte, match_byte);
1514 else 1530 else
1515 set_search_regs (multibyte ? pos_byte : pos, len_byte); 1531 set_search_regs (multibyte ? pos_byte : pos, match_byte);
1516 1532
1517 return pos; 1533 return pos;
1518 } 1534 }
@@ -1561,7 +1577,7 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
1561 1577
1562 unsigned char simple_translate[0400]; 1578 unsigned char simple_translate[0400];
1563 /* These are set to the preceding bytes of a byte to be translated 1579 /* These are set to the preceding bytes of a byte to be translated
1564 if charset_base is nonzero. As the maximum byte length of a 1580 if char_base is nonzero. As the maximum byte length of a
1565 multibyte character is 5, we have to check at most four previous 1581 multibyte character is 5, we have to check at most four previous
1566 bytes. */ 1582 bytes. */
1567 int translate_prev_byte1 = 0; 1583 int translate_prev_byte1 = 0;
@@ -1662,22 +1678,31 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
1662 i = infinity; 1678 i = infinity;
1663 if (! NILP (trt)) 1679 if (! NILP (trt))
1664 { 1680 {
1665 /* If the byte currently looking at is a head of a character 1681 /* If the byte currently looking at is the last of a
1666 to check case-equivalents, set CH to that character. An 1682 character to check case-equivalents, set CH to that
1667 ASCII character and a non-ASCII character matching with 1683 character. An ASCII character and a non-ASCII character
1668 CHAR_BASE are to be checked. */ 1684 matching with CHAR_BASE are to be checked. */
1669 int ch = -1; 1685 int ch = -1;
1670 1686
1671 if (ASCII_BYTE_P (*ptr) || ! multibyte) 1687 if (ASCII_BYTE_P (*ptr) || ! multibyte)
1672 ch = *ptr; 1688 ch = *ptr;
1673 else if (char_base && CHAR_HEAD_P (*ptr)) 1689 else if (char_base
1690 && (pat_end - ptr) == 1 || CHAR_HEAD_P (ptr[1]))
1674 { 1691 {
1675 ch = STRING_CHAR (ptr, pat_end - ptr); 1692 unsigned char *charstart = ptr - 1;
1693
1694 while (! (CHAR_HEAD_P (*charstart)))
1695 charstart--;
1696 ch = STRING_CHAR (charstart, ptr - charstart + 1);
1676 if (char_base != (ch & ~0x3F)) 1697 if (char_base != (ch & ~0x3F))
1677 ch = -1; 1698 ch = -1;
1678 } 1699 }
1679 1700
1680 j = *ptr; 1701 if (ch > 0400)
1702 j = (ch & 0x3F) | 0200;
1703 else
1704 j = *ptr;
1705
1681 if (i == infinity) 1706 if (i == infinity)
1682 stride_for_teases = BM_tab[j]; 1707 stride_for_teases = BM_tab[j];
1683 1708
@@ -1687,17 +1712,13 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
1687 if (ch >= 0) 1712 if (ch >= 0)
1688 { 1713 {
1689 int starting_ch = ch; 1714 int starting_ch = ch;
1690 int starting_j; 1715 int starting_j = j;
1691 1716
1692 if (ch > 0400)
1693 starting_j = (ch & ~0x3F) | 0200;
1694 else
1695 starting_j = ch;
1696 while (1) 1717 while (1)
1697 { 1718 {
1698 TRANSLATE (ch, inverse_trt, ch); 1719 TRANSLATE (ch, inverse_trt, ch);
1699 if (ch > 0400) 1720 if (ch > 0400)
1700 j = (ch & ~0x3F) | 0200; 1721 j = (ch & 0x3F) | 0200;
1701 else 1722 else
1702 j = ch; 1723 j = ch;
1703 1724