aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--src/search.c186
1 files changed, 101 insertions, 85 deletions
diff --git a/src/search.c b/src/search.c
index fb149c665a9..b7d3e78ac1d 100644
--- a/src/search.c
+++ b/src/search.c
@@ -1141,12 +1141,9 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
1141 unsigned char *patbuf; 1141 unsigned char *patbuf;
1142 int multibyte = !NILP (current_buffer->enable_multibyte_characters); 1142 int multibyte = !NILP (current_buffer->enable_multibyte_characters);
1143 unsigned char *base_pat = SDATA (string); 1143 unsigned char *base_pat = SDATA (string);
1144 /* High bits of char; 0 for ASCII characters, (CHAR & ~0x3F) 1144 /* Set to nozero if we find a non-ASCII char that need
1145 otherwise. Characters of the same high bits have the same 1145 translation. */
1146 sequence of bytes but last. To do the BM search, all 1146 int char_base = 0;
1147 characters in STRING must have the same high bits (including
1148 their case translations). */
1149 int char_high_bits = -1;
1150 int boyer_moore_ok = 1; 1147 int boyer_moore_ok = 1;
1151 1148
1152 /* MULTIBYTE says whether the text to be searched is multibyte. 1149 /* MULTIBYTE says whether the text to be searched is multibyte.
@@ -1192,10 +1189,19 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
1192 base_pat = raw_pattern; 1189 base_pat = raw_pattern;
1193 if (multibyte) 1190 if (multibyte)
1194 { 1191 {
1192 /* Fill patbuf by translated characters in STRING while
1193 checking if we can use boyer-moore search. If TRT is
1194 non-nil, we can use boyer-moore search only if TRT can be
1195 represented by the byte array of 256 elements. For that,
1196 all non-ASCII case-equivalents of all case-senstive
1197 characters in STRING must belong to the same charset and
1198 row. */
1199
1195 while (--len >= 0) 1200 while (--len >= 0)
1196 { 1201 {
1202 unsigned char str_base[MAX_MULTIBYTE_LENGTH], *str;
1197 int c, translated, inverse; 1203 int c, translated, inverse;
1198 int in_charlen; 1204 int in_charlen, charlen;
1199 1205
1200 /* If we got here and the RE flag is set, it's because we're 1206 /* If we got here and the RE flag is set, it's because we're
1201 dealing with a regexp known to be trivial, so the backslash 1207 dealing with a regexp known to be trivial, so the backslash
@@ -1209,32 +1215,60 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
1209 1215
1210 c = STRING_CHAR_AND_LENGTH (base_pat, len_byte, in_charlen); 1216 c = STRING_CHAR_AND_LENGTH (base_pat, len_byte, in_charlen);
1211 1217
1212 /* Translate the character, if requested. */ 1218 if (NILP (trt))
1213 TRANSLATE (translated, trt, c);
1214 TRANSLATE (inverse, inverse_trt, c);
1215
1216 /* Did this char actually get translated?
1217 Would any other char get translated into it? */
1218 if (translated != c || inverse != c)
1219 { 1219 {
1220 /* Keep track of which character set row 1220 str = base_pat;
1221 contains the characters that need translation. */ 1221 charlen = in_charlen;
1222 int this_high_bit = ASCII_CHAR_P (c) ? 0 : (c & ~0x3F); 1222 }
1223 int c1 = inverse != c ? inverse : translated; 1223 else
1224 int trt_high_bit = ASCII_CHAR_P (c1) ? 0 : (c1 & ~0x3F); 1224 {
1225 1225 /* Translate the character. */
1226 if (this_high_bit != trt_high_bit) 1226 TRANSLATE (translated, trt, c);
1227 boyer_moore_ok = 0; 1227 charlen = CHAR_STRING (translated, str_base);
1228 else if (char_high_bits == -1) 1228 str = str_base;
1229 char_high_bits = this_high_bit; 1229
1230 else if (char_high_bits != this_high_bit) 1230 /* Check if C has any other case-equivalents. */
1231 /* If two different rows appear, needing translation, 1231 TRANSLATE (inverse, inverse_trt, c);
1232 then we cannot use boyer_moore search. */ 1232 /* If so, check if we can use boyer-moore. */
1233 boyer_moore_ok = 0; 1233 if (c != inverse && boyer_moore_ok)
1234 {
1235 /* Check if all equivalents belong to the same
1236 group of characters. Note that the check of C
1237 itself is done by the last iteration. Note
1238 also that we don't have to check ASCII
1239 characters because boyer-moore search can
1240 always handle their translation. */
1241 while (1)
1242 {
1243 if (! ASCII_BYTE_P (inverse))
1244 {
1245 if (CHAR_BYTE8_P (inverse))
1246 {
1247 /* Boyer-moore search can't handle a
1248 translation of an eight-bit
1249 character. */
1250 boyer_moore_ok = 0;
1251 break;
1252 }
1253 else if (char_base == 0)
1254 char_base = inverse & ~0x3F;
1255 else if ((inverse & ~0x3F)
1256 != char_base)
1257 {
1258 boyer_moore_ok = 0;
1259 break;
1260 }
1261 }
1262 if (c == inverse)
1263 break;
1264 TRANSLATE (inverse, inverse_trt, inverse);
1265 }
1266 }
1234 } 1267 }
1235 1268
1236 /* Store this character into the translated pattern. */ 1269 /* Store this character into the translated pattern. */
1237 CHAR_STRING_ADVANCE (translated, pat); 1270 bcopy (str, pat, charlen);
1271 pat += charlen;
1238 base_pat += in_charlen; 1272 base_pat += in_charlen;
1239 len_byte -= in_charlen; 1273 len_byte -= in_charlen;
1240 } 1274 }
@@ -1242,7 +1276,7 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
1242 else 1276 else
1243 { 1277 {
1244 /* Unibyte buffer. */ 1278 /* Unibyte buffer. */
1245 char_high_bits = 0; 1279 char_base = 0;
1246 while (--len >= 0) 1280 while (--len >= 0)
1247 { 1281 {
1248 int c, translated; 1282 int c, translated;
@@ -1269,7 +1303,7 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
1269 if (boyer_moore_ok) 1303 if (boyer_moore_ok)
1270 return boyer_moore (n, pat, len, len_byte, trt, inverse_trt, 1304 return boyer_moore (n, pat, len, len_byte, trt, inverse_trt,
1271 pos, pos_byte, lim, lim_byte, 1305 pos, pos_byte, lim, lim_byte,
1272 char_high_bits); 1306 char_base);
1273 else 1307 else
1274 return simple_search (n, pat, len, len_byte, trt, 1308 return simple_search (n, pat, len, len_byte, trt,
1275 pos, pos_byte, lim, lim_byte); 1309 pos, pos_byte, lim, lim_byte);
@@ -1499,13 +1533,13 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte)
1499 have nontrivial translation are the same aside from the last byte. 1533 have nontrivial translation are the same aside from the last byte.
1500 This makes it possible to translate just the last byte of a 1534 This makes it possible to translate just the last byte of a
1501 character, and do so after just a simple test of the context. 1535 character, and do so after just a simple test of the context.
1502 CHARSET_BASE is nonzero iff there is such a non-ASCII character. 1536 CHAR_BASE is nonzero iff there is such a non-ASCII character.
1503 1537
1504 If that criterion is not satisfied, do not call this function. */ 1538 If that criterion is not satisfied, do not call this function. */
1505 1539
1506static int 1540static int
1507boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, 1541boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
1508 pos, pos_byte, lim, lim_byte, char_high_bits) 1542 pos, pos_byte, lim, lim_byte, char_base)
1509 int n; 1543 int n;
1510 unsigned char *base_pat; 1544 unsigned char *base_pat;
1511 int len, len_byte; 1545 int len, len_byte;
@@ -1513,7 +1547,7 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
1513 Lisp_Object inverse_trt; 1547 Lisp_Object inverse_trt;
1514 int pos, pos_byte; 1548 int pos, pos_byte;
1515 int lim, lim_byte; 1549 int lim, lim_byte;
1516 int char_high_bits; 1550 int char_base;
1517{ 1551{
1518 int direction = ((n > 0) ? 1 : -1); 1552 int direction = ((n > 0) ? 1 : -1);
1519 register int dirlen; 1553 register int dirlen;
@@ -1528,11 +1562,12 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
1528 unsigned char simple_translate[0400]; 1562 unsigned char simple_translate[0400];
1529 /* These are set to the preceding bytes of a byte to be translated 1563 /* These are set to the preceding bytes of a byte to be translated
1530 if charset_base is nonzero. As the maximum byte length of a 1564 if charset_base is nonzero. As the maximum byte length of a
1531 multibyte character is 4, we have to check at most three previous 1565 multibyte character is 5, we have to check at most four previous
1532 bytes. */ 1566 bytes. */
1533 int translate_prev_byte1 = 0; 1567 int translate_prev_byte1 = 0;
1534 int translate_prev_byte2 = 0; 1568 int translate_prev_byte2 = 0;
1535 int translate_prev_byte3 = 0; 1569 int translate_prev_byte3 = 0;
1570 int translate_prev_byte4 = 0;
1536 1571
1537#ifdef C_ALLOCA 1572#ifdef C_ALLOCA
1538 int BM_tab_space[0400]; 1573 int BM_tab_space[0400];
@@ -1598,20 +1633,23 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
1598 for (i = 0; i < 0400; i++) 1633 for (i = 0; i < 0400; i++)
1599 simple_translate[i] = i; 1634 simple_translate[i] = i;
1600 1635
1601 if (charset_base) 1636 if (char_base)
1602 { 1637 {
1603 /* Setup translate_prev_byte1/2/3 from CHARSET_BASE. Only a 1638 /* Setup translate_prev_byte1/2/3/4 from CHAR_BASE. Only a
1604 byte following them are the target of translation. */ 1639 byte following them are the target of translation. */
1605 int sample_char = charset_base | 0x20;
1606 unsigned char str[MAX_MULTIBYTE_LENGTH]; 1640 unsigned char str[MAX_MULTIBYTE_LENGTH];
1607 int len = CHAR_STRING (sample_char, str); 1641 int len = CHAR_STRING (char_base, str);
1608 1642
1609 translate_prev_byte1 = str[len - 2]; 1643 translate_prev_byte1 = str[len - 2];
1610 if (len > 2) 1644 if (len > 2)
1611 { 1645 {
1612 translate_prev_byte2 = str[len - 3]; 1646 translate_prev_byte2 = str[len - 3];
1613 if (len > 3) 1647 if (len > 3)
1614 translate_prev_byte3 = str[len - 4]; 1648 {
1649 translate_prev_byte3 = str[len - 4];
1650 if (len > 4)
1651 translate_prev_byte4 = str[len - 5];
1652 }
1615 } 1653 }
1616 } 1654 }
1617 1655
@@ -1624,66 +1662,44 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
1624 i = infinity; 1662 i = infinity;
1625 if (! NILP (trt)) 1663 if (! NILP (trt))
1626 { 1664 {
1627 int ch; 1665 /* If the byte currently looking at is a head of a character
1628 int untranslated; 1666 to check case-equivalents, set CH to that character. An
1629 int this_translated = 1; 1667 ASCII character and a non-ASCII character matching with
1630 1668 CHAR_BASE are to be checked. */
1631 if (multibyte 1669 int ch = -1;
1632 /* Is *PTR the last byte of a character? */ 1670
1633 && (pat_end - ptr == 1 || CHAR_HEAD_P (ptr[1]))) 1671 if (ASCII_BYTE_P (*ptr) || ! multibyte)
1634 { 1672 ch = *ptr;
1635 unsigned char *charstart = ptr; 1673 else if (char_base && CHAR_HEAD_P (*ptr))
1636 while (! CHAR_HEAD_P (*charstart))
1637 charstart--;
1638 untranslated = STRING_CHAR (charstart, ptr - charstart + 1);
1639 if (char_high_bits
1640 == (ASCII_CHAR_P (untranslated) ? 0 : untranslated & ~0x3F))
1641 {
1642 TRANSLATE (ch, trt, untranslated);
1643 if (! CHAR_HEAD_P (*ptr))
1644 {
1645 translate_prev_byte = ptr[-1];
1646 if (! CHAR_HEAD_P (translate_prev_byte))
1647 translate_anteprev_byte = ptr[-2];
1648 }
1649 }
1650 else
1651 {
1652 this_translated = 0;
1653 ch = *ptr;
1654 }
1655 }
1656 else if (!multibyte)
1657 TRANSLATE (ch, trt, *ptr);
1658 else
1659 { 1674 {
1660 ch = *ptr; 1675 ch = STRING_CHAR (ptr, pat_end - ptr);
1661 this_translated = 0; 1676 if (char_base != (ch & ~0x3F))
1677 ch = -1;
1662 } 1678 }
1663 1679
1664 if (this_translated 1680 j = *ptr;
1665 && ch >= 0200)
1666 j = (ch & 0x3F) | 0200;
1667 else
1668 j = (unsigned char) ch;
1669
1670 if (i == infinity) 1681 if (i == infinity)
1671 stride_for_teases = BM_tab[j]; 1682 stride_for_teases = BM_tab[j];
1672 1683
1673 BM_tab[j] = dirlen - i; 1684 BM_tab[j] = dirlen - i;
1674 /* A translation table is accompanied by its inverse -- see */ 1685 /* A translation table is accompanied by its inverse -- see */
1675 /* comment following downcase_table for details */ 1686 /* comment following downcase_table for details */
1676 if (this_translated) 1687 if (ch >= 0)
1677 { 1688 {
1678 int starting_ch = ch; 1689 int starting_ch = ch;
1679 int starting_j = j; 1690 int starting_j;
1691
1692 if (ch > 0400)
1693 starting_j = (ch & ~0x3F) | 0200;
1694 else
1695 starting_j = ch;
1680 while (1) 1696 while (1)
1681 { 1697 {
1682 TRANSLATE (ch, inverse_trt, ch); 1698 TRANSLATE (ch, inverse_trt, ch);
1683 if (ch > 0200) 1699 if (ch > 0400)
1684 j = (ch & 0x3F) | 0200; 1700 j = (ch & ~0x3F) | 0200;
1685 else 1701 else
1686 j = (unsigned char) ch; 1702 j = ch;
1687 1703
1688 /* For all the characters that map into CH, 1704 /* For all the characters that map into CH,
1689 set up simple_translate to map the last byte 1705 set up simple_translate to map the last byte