aboutsummaryrefslogtreecommitdiffstats
path: root/src/search.c
diff options
context:
space:
mode:
authorKaroly Lorentey2005-04-04 16:43:15 +0000
committerKaroly Lorentey2005-04-04 16:43:15 +0000
commitee00ea6e18c2aeae86e262fae703f67f4705032a (patch)
tree9e7fc6bfb33de2b2f861589a2f7674ad35de85a8 /src/search.c
parent4a932511428a2b61ec51deebd6e16ec1efbda800 (diff)
parent8c6ef2ca34c444c1dea1f11b999b4b2ec16cdea3 (diff)
downloademacs-ee00ea6e18c2aeae86e262fae703f67f4705032a.tar.gz
emacs-ee00ea6e18c2aeae86e262fae703f67f4705032a.zip
Merged from miles@gnu.org--gnu-2005 (patch 45-55, 214-231)
Patches applied: * miles@gnu.org--gnu-2005/emacs--cvs-trunk--0--patch-214 Update from CVS * miles@gnu.org--gnu-2005/emacs--cvs-trunk--0--patch-215 Update from CVS * miles@gnu.org--gnu-2005/emacs--cvs-trunk--0--patch-216 Update from CVS * miles@gnu.org--gnu-2005/emacs--cvs-trunk--0--patch-217 Update from CVS * miles@gnu.org--gnu-2005/emacs--cvs-trunk--0--patch-218 Update from CVS * miles@gnu.org--gnu-2005/emacs--cvs-trunk--0--patch-219 Update from CVS * miles@gnu.org--gnu-2005/emacs--cvs-trunk--0--patch-220 Merge from gnus--rel--5.10 * miles@gnu.org--gnu-2005/emacs--cvs-trunk--0--patch-221 Update from CVS * miles@gnu.org--gnu-2005/emacs--cvs-trunk--0--patch-222 Update from CVS * miles@gnu.org--gnu-2005/emacs--cvs-trunk--0--patch-223 Update from CVS * miles@gnu.org--gnu-2005/emacs--cvs-trunk--0--patch-224 Update from CVS * miles@gnu.org--gnu-2005/emacs--cvs-trunk--0--patch-225 Update from CVS * miles@gnu.org--gnu-2005/emacs--cvs-trunk--0--patch-226 Update from CVS * miles@gnu.org--gnu-2005/emacs--cvs-trunk--0--patch-227 Update from CVS * miles@gnu.org--gnu-2005/emacs--cvs-trunk--0--patch-228 Update from CVS * miles@gnu.org--gnu-2005/emacs--cvs-trunk--0--patch-229 Update from CVS * miles@gnu.org--gnu-2005/emacs--cvs-trunk--0--patch-230 Merge from gnus--rel--5.10 * miles@gnu.org--gnu-2005/emacs--cvs-trunk--0--patch-231 Update from CVS * miles@gnu.org--gnu-2005/gnus--rel--5.10--patch-45 Update from CVS * miles@gnu.org--gnu-2005/gnus--rel--5.10--patch-46 Update from CVS * miles@gnu.org--gnu-2005/gnus--rel--5.10--patch-47 Update from CVS * miles@gnu.org--gnu-2005/gnus--rel--5.10--patch-48 Update from CVS * miles@gnu.org--gnu-2005/gnus--rel--5.10--patch-49 Update from CVS * miles@gnu.org--gnu-2005/gnus--rel--5.10--patch-50 Update from CVS: texi Makefile.in CVS keyw cruft * miles@gnu.org--gnu-2005/gnus--rel--5.10--patch-51 Update from CVS: ChangeLog tweaks * miles@gnu.org--gnu-2005/gnus--rel--5.10--patch-52 Update from CVS * miles@gnu.org--gnu-2005/gnus--rel--5.10--patch-53 Merge from emacs--cvs-trunk--0 * miles@gnu.org--gnu-2005/gnus--rel--5.10--patch-54 Update from CVS * miles@gnu.org--gnu-2005/gnus--rel--5.10--patch-55 Update from CVS git-archimport-id: lorentey@elte.hu--2004/emacs--multi-tty--0--patch-324
Diffstat (limited to 'src/search.c')
-rw-r--r--src/search.c223
1 files changed, 129 insertions, 94 deletions
diff --git a/src/search.c b/src/search.c
index 7e990f2bfd7..d86a7cca7b2 100644
--- a/src/search.c
+++ b/src/search.c
@@ -293,7 +293,7 @@ looking_at_1 (string, posix)
293 CHECK_STRING (string); 293 CHECK_STRING (string);
294 bufp = compile_pattern (string, &search_regs, 294 bufp = compile_pattern (string, &search_regs,
295 (!NILP (current_buffer->case_fold_search) 295 (!NILP (current_buffer->case_fold_search)
296 ? DOWNCASE_TABLE : Qnil), 296 ? current_buffer->case_canon_table : Qnil),
297 posix, 297 posix,
298 !NILP (current_buffer->enable_multibyte_characters)); 298 !NILP (current_buffer->enable_multibyte_characters));
299 299
@@ -399,7 +399,7 @@ string_match_1 (regexp, string, start, posix)
399 399
400 bufp = compile_pattern (regexp, &search_regs, 400 bufp = compile_pattern (regexp, &search_regs,
401 (!NILP (current_buffer->case_fold_search) 401 (!NILP (current_buffer->case_fold_search)
402 ? DOWNCASE_TABLE : Qnil), 402 ? current_buffer->case_canon_table : Qnil),
403 posix, 403 posix,
404 STRING_MULTIBYTE (string)); 404 STRING_MULTIBYTE (string));
405 immediate_quit = 1; 405 immediate_quit = 1;
@@ -499,7 +499,7 @@ fast_c_string_match_ignore_case (regexp, string)
499 regexp = string_make_unibyte (regexp); 499 regexp = string_make_unibyte (regexp);
500 re_match_object = Qt; 500 re_match_object = Qt;
501 bufp = compile_pattern (regexp, 0, 501 bufp = compile_pattern (regexp, 0,
502 Vascii_downcase_table, 0, 502 Vascii_canon_table, 0,
503 0); 503 0);
504 immediate_quit = 1; 504 immediate_quit = 1;
505 val = re_search (bufp, string, len, 0, len, 0); 505 val = re_search (bufp, string, len, 0, len, 0);
@@ -516,7 +516,7 @@ fast_string_match_ignore_case (regexp, string)
516 int val; 516 int val;
517 struct re_pattern_buffer *bufp; 517 struct re_pattern_buffer *bufp;
518 518
519 bufp = compile_pattern (regexp, 0, Vascii_downcase_table, 519 bufp = compile_pattern (regexp, 0, Vascii_canon_table,
520 0, STRING_MULTIBYTE (string)); 520 0, STRING_MULTIBYTE (string));
521 immediate_quit = 1; 521 immediate_quit = 1;
522 re_match_object = string; 522 re_match_object = string;
@@ -1175,7 +1175,9 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
1175 unsigned char *patbuf; 1175 unsigned char *patbuf;
1176 int multibyte = !NILP (current_buffer->enable_multibyte_characters); 1176 int multibyte = !NILP (current_buffer->enable_multibyte_characters);
1177 unsigned char *base_pat = SDATA (string); 1177 unsigned char *base_pat = SDATA (string);
1178 int charset_base = -1; 1178 /* Set to nozero if we find a non-ASCII char that need
1179 translation. */
1180 int charset_base = 0;
1179 int boyer_moore_ok = 1; 1181 int boyer_moore_ok = 1;
1180 1182
1181 /* MULTIBYTE says whether the text to be searched is multibyte. 1183 /* MULTIBYTE says whether the text to be searched is multibyte.
@@ -1221,9 +1223,17 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
1221 base_pat = raw_pattern; 1223 base_pat = raw_pattern;
1222 if (multibyte) 1224 if (multibyte)
1223 { 1225 {
1226 /* Fill patbuf by translated characters in STRING while
1227 checking if we can use boyer-moore search. If TRT is
1228 non-nil, we can use boyer-moore search only if TRT can be
1229 represented by the byte array of 256 elements. For that,
1230 all non-ASCII case-equivalents of all case-senstive
1231 characters in STRING must belong to the same charset and
1232 row. */
1233
1224 while (--len >= 0) 1234 while (--len >= 0)
1225 { 1235 {
1226 unsigned char str[MAX_MULTIBYTE_LENGTH]; 1236 unsigned char str_base[MAX_MULTIBYTE_LENGTH], *str;
1227 int c, translated, inverse; 1237 int c, translated, inverse;
1228 int in_charlen, charlen; 1238 int in_charlen, charlen;
1229 1239
@@ -1233,50 +1243,62 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
1233 if (RE && *base_pat == '\\') 1243 if (RE && *base_pat == '\\')
1234 { 1244 {
1235 len--; 1245 len--;
1246 raw_pattern_size--;
1236 len_byte--; 1247 len_byte--;
1237 base_pat++; 1248 base_pat++;
1238 } 1249 }
1239 1250
1240 c = STRING_CHAR_AND_LENGTH (base_pat, len_byte, in_charlen); 1251 c = STRING_CHAR_AND_LENGTH (base_pat, len_byte, in_charlen);
1241 1252
1242 /* Translate the character, if requested. */ 1253 if (NILP (trt))
1243 TRANSLATE (translated, trt, c);
1244 /* If translation changed the byte-length, go back
1245 to the original character. */
1246 charlen = CHAR_STRING (translated, str);
1247 if (in_charlen != charlen)
1248 { 1254 {
1249 translated = c; 1255 str = base_pat;
1250 charlen = CHAR_STRING (c, str); 1256 charlen = in_charlen;
1251 } 1257 }
1252 1258 else
1253 /* If we are searching for something strange,
1254 an invalid multibyte code, don't use boyer-moore. */
1255 if (! ASCII_BYTE_P (translated)
1256 && (charlen == 1 /* 8bit code */
1257 || charlen != in_charlen /* invalid multibyte code */
1258 ))
1259 boyer_moore_ok = 0;
1260
1261 TRANSLATE (inverse, inverse_trt, c);
1262
1263 /* Did this char actually get translated?
1264 Would any other char get translated into it? */
1265 if (translated != c || inverse != c)
1266 { 1259 {
1267 /* Keep track of which character set row 1260 /* Translate the character. */
1268 contains the characters that need translation. */ 1261 TRANSLATE (translated, trt, c);
1269 int charset_base_code = c & ~CHAR_FIELD3_MASK; 1262 charlen = CHAR_STRING (translated, str_base);
1270 int inverse_charset_base = inverse & ~CHAR_FIELD3_MASK; 1263 str = str_base;
1271 1264
1272 if (charset_base_code != inverse_charset_base) 1265 /* Check if C has any other case-equivalents. */
1273 boyer_moore_ok = 0; 1266 TRANSLATE (inverse, inverse_trt, c);
1274 else if (charset_base == -1) 1267 /* If so, check if we can use boyer-moore. */
1275 charset_base = charset_base_code; 1268 if (c != inverse && boyer_moore_ok)
1276 else if (charset_base != charset_base_code) 1269 {
1277 /* If two different rows appear, needing translation, 1270 /* Check if all equivalents belong to the same
1278 then we cannot use boyer_moore search. */ 1271 charset & row. Note that the check of C
1279 boyer_moore_ok = 0; 1272 itself is done by the last iteration. Note
1273 also that we don't have to check ASCII
1274 characters because boyer-moore search can
1275 always handle their translation. */
1276 while (1)
1277 {
1278 if (! ASCII_BYTE_P (inverse))
1279 {
1280 if (SINGLE_BYTE_CHAR_P (inverse))
1281 {
1282 /* Boyer-moore search can't handle a
1283 translation of an eight-bit
1284 character. */
1285 boyer_moore_ok = 0;
1286 break;
1287 }
1288 else if (charset_base == 0)
1289 charset_base = inverse & ~CHAR_FIELD3_MASK;
1290 else if ((inverse & ~CHAR_FIELD3_MASK)
1291 != charset_base)
1292 {
1293 boyer_moore_ok = 0;
1294 break;
1295 }
1296 }
1297 if (c == inverse)
1298 break;
1299 TRANSLATE (inverse, inverse_trt, inverse);
1300 }
1301 }
1280 } 1302 }
1281 1303
1282 /* Store this character into the translated pattern. */ 1304 /* Store this character into the translated pattern. */
@@ -1300,6 +1322,7 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
1300 if (RE && *base_pat == '\\') 1322 if (RE && *base_pat == '\\')
1301 { 1323 {
1302 len--; 1324 len--;
1325 raw_pattern_size--;
1303 base_pat++; 1326 base_pat++;
1304 } 1327 }
1305 c = *base_pat++; 1328 c = *base_pat++;
@@ -1533,16 +1556,18 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte)
1533 return n; 1556 return n;
1534} 1557}
1535 1558
1536/* Do Boyer-Moore search N times for the string PAT, 1559/* Do Boyer-Moore search N times for the string BASE_PAT,
1537 whose length is LEN/LEN_BYTE, 1560 whose length is LEN/LEN_BYTE,
1538 from buffer position POS/POS_BYTE until LIM/LIM_BYTE. 1561 from buffer position POS/POS_BYTE until LIM/LIM_BYTE.
1539 DIRECTION says which direction we search in. 1562 DIRECTION says which direction we search in.
1540 TRT and INVERSE_TRT are translation tables. 1563 TRT and INVERSE_TRT are translation tables.
1564 Characters in PAT are already translated by TRT.
1541 1565
1542 This kind of search works if all the characters in PAT that have 1566 This kind of search works if all the characters in BASE_PAT that
1543 nontrivial translation are the same aside from the last byte. This 1567 have nontrivial translation are the same aside from the last byte.
1544 makes it possible to translate just the last byte of a character, 1568 This makes it possible to translate just the last byte of a
1545 and do so after just a simple test of the context. 1569 character, and do so after just a simple test of the context.
1570 CHARSET_BASE is nonzero iff there is such a non-ASCII character.
1546 1571
1547 If that criterion is not satisfied, do not call this function. */ 1572 If that criterion is not satisfied, do not call this function. */
1548 1573
@@ -1569,8 +1594,13 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
1569 int multibyte = ! NILP (current_buffer->enable_multibyte_characters); 1594 int multibyte = ! NILP (current_buffer->enable_multibyte_characters);
1570 1595
1571 unsigned char simple_translate[0400]; 1596 unsigned char simple_translate[0400];
1572 int translate_prev_byte = 0; 1597 /* These are set to the preceding bytes of a byte to be translated
1573 int translate_anteprev_byte = 0; 1598 if charset_base is nonzero. As the maximum byte length of a
1599 multibyte character is 4, we have to check at most three previous
1600 bytes. */
1601 int translate_prev_byte1 = 0;
1602 int translate_prev_byte2 = 0;
1603 int translate_prev_byte3 = 0;
1574 1604
1575#ifdef C_ALLOCA 1605#ifdef C_ALLOCA
1576 int BM_tab_space[0400]; 1606 int BM_tab_space[0400];
@@ -1636,6 +1666,23 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
1636 for (i = 0; i < 0400; i++) 1666 for (i = 0; i < 0400; i++)
1637 simple_translate[i] = i; 1667 simple_translate[i] = i;
1638 1668
1669 if (charset_base)
1670 {
1671 /* Setup translate_prev_byte1/2/3 from CHARSET_BASE. Only a
1672 byte following them are the target of translation. */
1673 int sample_char = charset_base | 0x20;
1674 unsigned char str[MAX_MULTIBYTE_LENGTH];
1675 int len = CHAR_STRING (sample_char, str);
1676
1677 translate_prev_byte1 = str[len - 2];
1678 if (len > 2)
1679 {
1680 translate_prev_byte2 = str[len - 3];
1681 if (len > 3)
1682 translate_prev_byte3 = str[len - 4];
1683 }
1684 }
1685
1639 i = 0; 1686 i = 0;
1640 while (i != infinity) 1687 while (i != infinity)
1641 { 1688 {
@@ -1645,57 +1692,37 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
1645 i = infinity; 1692 i = infinity;
1646 if (! NILP (trt)) 1693 if (! NILP (trt))
1647 { 1694 {
1648 int ch; 1695 /* If the byte currently looking at is a head of a character
1649 int untranslated; 1696 to check case-equivalents, set CH to that character. An
1650 int this_translated = 1; 1697 ASCII character and a non-ASCII character matching with
1651 1698 CHARSET_BASE are to be checked. */
1652 if (multibyte 1699 int ch = -1;
1653 /* Is *PTR the last byte of a character? */ 1700
1654 && (pat_end - ptr == 1 || CHAR_HEAD_P (ptr[1]))) 1701 if (ASCII_BYTE_P (*ptr) || ! multibyte)
1702 ch = *ptr;
1703 else if (charset_base && CHAR_HEAD_P (*ptr))
1655 { 1704 {
1656 unsigned char *charstart = ptr; 1705 ch = STRING_CHAR (ptr, pat_end - ptr);
1657 while (! CHAR_HEAD_P (*charstart)) 1706 if (charset_base != (ch & ~CHAR_FIELD3_MASK))
1658 charstart--; 1707 ch = -1;
1659 untranslated = STRING_CHAR (charstart, ptr - charstart + 1);
1660 if (charset_base == (untranslated & ~CHAR_FIELD3_MASK))
1661 {
1662 TRANSLATE (ch, trt, untranslated);
1663 if (! CHAR_HEAD_P (*ptr))
1664 {
1665 translate_prev_byte = ptr[-1];
1666 if (! CHAR_HEAD_P (translate_prev_byte))
1667 translate_anteprev_byte = ptr[-2];
1668 }
1669 }
1670 else
1671 {
1672 this_translated = 0;
1673 ch = *ptr;
1674 }
1675 } 1708 }
1676 else if (!multibyte)
1677 TRANSLATE (ch, trt, *ptr);
1678 else
1679 {
1680 ch = *ptr;
1681 this_translated = 0;
1682 }
1683
1684 if (ch > 0400)
1685 j = ((unsigned char) ch) | 0200;
1686 else
1687 j = (unsigned char) ch;
1688 1709
1710 j = *ptr;
1689 if (i == infinity) 1711 if (i == infinity)
1690 stride_for_teases = BM_tab[j]; 1712 stride_for_teases = BM_tab[j];
1691 1713
1692 BM_tab[j] = dirlen - i; 1714 BM_tab[j] = dirlen - i;
1693 /* A translation table is accompanied by its inverse -- see */ 1715 /* A translation table is accompanied by its inverse -- see */
1694 /* comment following downcase_table for details */ 1716 /* comment following downcase_table for details */
1695 if (this_translated) 1717 if (ch >= 0)
1696 { 1718 {
1697 int starting_ch = ch; 1719 int starting_ch = ch;
1698 int starting_j = j; 1720 int starting_j;
1721
1722 if (ch > 0400)
1723 starting_j = ((unsigned char) ch) | 0200;
1724 else
1725 starting_j = (unsigned char) ch;
1699 while (1) 1726 while (1)
1700 { 1727 {
1701 TRANSLATE (ch, inverse_trt, ch); 1728 TRANSLATE (ch, inverse_trt, ch);
@@ -1821,9 +1848,13 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
1821 || ((cursor == tail_end_ptr 1848 || ((cursor == tail_end_ptr
1822 || CHAR_HEAD_P (cursor[1])) 1849 || CHAR_HEAD_P (cursor[1]))
1823 && (CHAR_HEAD_P (cursor[0]) 1850 && (CHAR_HEAD_P (cursor[0])
1824 || (translate_prev_byte == cursor[-1] 1851 /* Check if this is the last byte of
1825 && (CHAR_HEAD_P (translate_prev_byte) 1852 a translable character. */
1826 || translate_anteprev_byte == cursor[-2]))))) 1853 || (translate_prev_byte1 == cursor[-1]
1854 && (CHAR_HEAD_P (translate_prev_byte1)
1855 || (translate_prev_byte2 == cursor[-2]
1856 && (CHAR_HEAD_P (translate_prev_byte2)
1857 || (translate_prev_byte3 == cursor[-3]))))))))
1827 ch = simple_translate[*cursor]; 1858 ch = simple_translate[*cursor];
1828 else 1859 else
1829 ch = *cursor; 1860 ch = *cursor;
@@ -1901,9 +1932,13 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
1901 || ((ptr == tail_end_ptr 1932 || ((ptr == tail_end_ptr
1902 || CHAR_HEAD_P (ptr[1])) 1933 || CHAR_HEAD_P (ptr[1]))
1903 && (CHAR_HEAD_P (ptr[0]) 1934 && (CHAR_HEAD_P (ptr[0])
1904 || (translate_prev_byte == ptr[-1] 1935 /* Check if this is the last byte of a
1905 && (CHAR_HEAD_P (translate_prev_byte) 1936 translable character. */
1906 || translate_anteprev_byte == ptr[-2]))))) 1937 || (translate_prev_byte1 == ptr[-1]
1938 && (CHAR_HEAD_P (translate_prev_byte1)
1939 || (translate_prev_byte2 == ptr[-2]
1940 && (CHAR_HEAD_P (translate_prev_byte2)
1941 || translate_prev_byte3 == ptr[-3])))))))
1907 ch = simple_translate[*ptr]; 1942 ch = simple_translate[*ptr];
1908 else 1943 else
1909 ch = *ptr; 1944 ch = *ptr;