aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorAndreas Schwab2009-04-16 09:30:45 +0000
committerAndreas Schwab2009-04-16 09:30:45 +0000
commitf4646fffc2839ef6d3d46f409da6a3ebf38bb45b (patch)
tree9d6dd292a2ec099312aa09a2bf41bc80060b36cd /src
parent05fcb8daf64fd80937b8e97a813a72b5f5c42a5f (diff)
downloademacs-f4646fffc2839ef6d3d46f409da6a3ebf38bb45b.tar.gz
emacs-f4646fffc2839ef6d3d46f409da6a3ebf38bb45b.zip
(boyer_moore): Use zero as marker value for a possible
match instead of depending on overflow behavior.
Diffstat (limited to 'src')
-rw-r--r--src/ChangeLog5
-rw-r--r--src/search.c167
2 files changed, 74 insertions, 98 deletions
diff --git a/src/ChangeLog b/src/ChangeLog
index 60fd0530617..c541efb966e 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,8 @@
12009-04-16 Andreas Schwab <schwab@linux-m68k.org>
2
3 * search.c (boyer_moore): Use zero as marker value for a possible
4 match instead of depending on overflow behavior.
5
12009-04-16 Chong Yidong <cyd@stupidchicken.com> 62009-04-16 Chong Yidong <cyd@stupidchicken.com>
2 7
3 * keyboard.c (adjust_point_for_property): Disable 2009-02-12 8 * keyboard.c (adjust_point_for_property): Disable 2009-02-12
diff --git a/src/search.c b/src/search.c
index 39c130ba70f..2eb435276a3 100644
--- a/src/search.c
+++ b/src/search.c
@@ -1729,9 +1729,8 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
1729{ 1729{
1730 int direction = ((n > 0) ? 1 : -1); 1730 int direction = ((n > 0) ? 1 : -1);
1731 register int dirlen; 1731 register int dirlen;
1732 int infinity, limit, stride_for_teases = 0; 1732 int limit, stride_for_teases = 0;
1733 register int *BM_tab; 1733 int BM_tab[0400];
1734 int *BM_tab_base;
1735 register unsigned char *cursor, *p_limit; 1734 register unsigned char *cursor, *p_limit;
1736 register int i, j; 1735 register int i, j;
1737 unsigned char *pat, *pat_end; 1736 unsigned char *pat, *pat_end;
@@ -1747,37 +1746,28 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
1747 int translate_prev_byte3 = 0; 1746 int translate_prev_byte3 = 0;
1748 int translate_prev_byte4 = 0; 1747 int translate_prev_byte4 = 0;
1749 1748
1750 BM_tab = (int *) alloca (0400 * sizeof (int)); 1749 /* The general approach is that we are going to maintain that we know
1751 1750 the first (closest to the present position, in whatever direction
1752 /* The general approach is that we are going to maintain that we know */ 1751 we're searching) character that could possibly be the last
1753 /* the first (closest to the present position, in whatever direction */ 1752 (furthest from present position) character of a valid match. We
1754 /* we're searching) character that could possibly be the last */ 1753 advance the state of our knowledge by looking at that character
1755 /* (furthest from present position) character of a valid match. We */ 1754 and seeing whether it indeed matches the last character of the
1756 /* advance the state of our knowledge by looking at that character */ 1755 pattern. If it does, we take a closer look. If it does not, we
1757 /* and seeing whether it indeed matches the last character of the */ 1756 move our pointer (to putative last characters) as far as is
1758 /* pattern. If it does, we take a closer look. If it does not, we */ 1757 logically possible. This amount of movement, which I call a
1759 /* move our pointer (to putative last characters) as far as is */ 1758 stride, will be the length of the pattern if the actual character
1760 /* logically possible. This amount of movement, which I call a */ 1759 appears nowhere in the pattern, otherwise it will be the distance
1761 /* stride, will be the length of the pattern if the actual character */ 1760 from the last occurrence of that character to the end of the
1762 /* appears nowhere in the pattern, otherwise it will be the distance */ 1761 pattern. If the amount is zero we have a possible match. */
1763 /* from the last occurrence of that character to the end of the */ 1762
1764 /* pattern. */ 1763 /* Here we make a "mickey mouse" BM table. The stride of the search
1765 /* As a coding trick, an enormous stride is coded into the table for */ 1764 is determined only by the last character of the putative match.
1766 /* characters that match the last character. This allows use of only */ 1765 If that character does not match, we will stride the proper
1767 /* a single test, a test for having gone past the end of the */ 1766 distance to propose a match that superimposes it on the last
1768 /* permissible match region, to test for both possible matches (when */ 1767 instance of a character that matches it (per trt), or misses
1769 /* the stride goes past the end immediately) and failure to */ 1768 it entirely if there is none. */
1770 /* match (where you get nudged past the end one stride at a time). */
1771
1772 /* Here we make a "mickey mouse" BM table. The stride of the search */
1773 /* is determined only by the last character of the putative match. */
1774 /* If that character does not match, we will stride the proper */
1775 /* distance to propose a match that superimposes it on the last */
1776 /* instance of a character that matches it (per trt), or misses */
1777 /* it entirely if there is none. */
1778 1769
1779 dirlen = len_byte * direction; 1770 dirlen = len_byte * direction;
1780 infinity = dirlen - (lim_byte + pos_byte + len_byte + len_byte) * direction;
1781 1771
1782 /* Record position after the end of the pattern. */ 1772 /* Record position after the end of the pattern. */
1783 pat_end = base_pat + len_byte; 1773 pat_end = base_pat + len_byte;
@@ -1787,23 +1777,14 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
1787 if (direction < 0) 1777 if (direction < 0)
1788 base_pat = pat_end - 1; 1778 base_pat = pat_end - 1;
1789 1779
1790 BM_tab_base = BM_tab; 1780 /* A character that does not appear in the pattern induces a
1791 BM_tab += 0400; 1781 stride equal to the pattern length. */
1792 j = dirlen; /* to get it in a register */ 1782 for (i = 0; i < 0400; i++)
1793 /* A character that does not appear in the pattern induces a */ 1783 BM_tab[i] = dirlen;
1794 /* stride equal to the pattern length. */
1795 while (BM_tab_base != BM_tab)
1796 {
1797 *--BM_tab = j;
1798 *--BM_tab = j;
1799 *--BM_tab = j;
1800 *--BM_tab = j;
1801 }
1802 1784
1803 /* We use this for translation, instead of TRT itself. 1785 /* We use this for translation, instead of TRT itself.
1804 We fill this in to handle the characters that actually 1786 We fill this in to handle the characters that actually
1805 occur in the pattern. Others don't matter anyway! */ 1787 occur in the pattern. Others don't matter anyway! */
1806 bzero (simple_translate, sizeof simple_translate);
1807 for (i = 0; i < 0400; i++) 1788 for (i = 0; i < 0400; i++)
1808 simple_translate[i] = i; 1789 simple_translate[i] = i;
1809 1790
@@ -1828,12 +1809,10 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
1828 } 1809 }
1829 1810
1830 i = 0; 1811 i = 0;
1831 while (i != infinity) 1812 while (i != dirlen)
1832 { 1813 {
1833 unsigned char *ptr = base_pat + i; 1814 unsigned char *ptr = base_pat + i;
1834 i += direction; 1815 i += direction;
1835 if (i == dirlen)
1836 i = infinity;
1837 if (! NILP (trt)) 1816 if (! NILP (trt))
1838 { 1817 {
1839 /* If the byte currently looking at is the last of a 1818 /* If the byte currently looking at is the last of a
@@ -1861,7 +1840,7 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
1861 else 1840 else
1862 j = *ptr; 1841 j = *ptr;
1863 1842
1864 if (i == infinity) 1843 if (i == dirlen)
1865 stride_for_teases = BM_tab[j]; 1844 stride_for_teases = BM_tab[j];
1866 1845
1867 BM_tab[j] = dirlen - i; 1846 BM_tab[j] = dirlen - i;
@@ -1894,17 +1873,16 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
1894 { 1873 {
1895 j = *ptr; 1874 j = *ptr;
1896 1875
1897 if (i == infinity) 1876 if (i == dirlen)
1898 stride_for_teases = BM_tab[j]; 1877 stride_for_teases = BM_tab[j];
1899 BM_tab[j] = dirlen - i; 1878 BM_tab[j] = dirlen - i;
1900 } 1879 }
1901 /* stride_for_teases tells how much to stride if we get a */ 1880 /* stride_for_teases tells how much to stride if we get a
1902 /* match on the far character but are subsequently */ 1881 match on the far character but are subsequently
1903 /* disappointed, by recording what the stride would have been */ 1882 disappointed, by recording what the stride would have been
1904 /* for that character if the last character had been */ 1883 for that character if the last character had been
1905 /* different. */ 1884 different. */
1906 } 1885 }
1907 infinity = dirlen - infinity;
1908 pos_byte += dirlen - ((direction > 0) ? direction : 0); 1886 pos_byte += dirlen - ((direction > 0) ? direction : 0);
1909 /* loop invariant - POS_BYTE points at where last char (first 1887 /* loop invariant - POS_BYTE points at where last char (first
1910 char if reverse) of pattern would align in a possible match. */ 1888 char if reverse) of pattern would align in a possible match. */
@@ -1948,43 +1926,34 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
1948 1926
1949 p_limit = BYTE_POS_ADDR (limit); 1927 p_limit = BYTE_POS_ADDR (limit);
1950 p2 = (cursor = BYTE_POS_ADDR (pos_byte)); 1928 p2 = (cursor = BYTE_POS_ADDR (pos_byte));
1951 /* In this loop, pos + cursor - p2 is the surrogate for pos */ 1929 /* In this loop, pos + cursor - p2 is the surrogate for pos. */
1952 while (1) /* use one cursor setting as long as i can */ 1930 while (1) /* use one cursor setting as long as i can */
1953 { 1931 {
1954 if (direction > 0) /* worth duplicating */ 1932 if (direction > 0) /* worth duplicating */
1955 { 1933 {
1956 /* Use signed comparison if appropriate 1934 while (cursor <= p_limit)
1957 to make cursor+infinity sure to be > p_limit. 1935 {
1958 Assuming that the buffer lies in a range of addresses 1936 if (BM_tab[*cursor] == 0)
1959 that are all "positive" (as ints) or all "negative", 1937 goto hit;
1960 either kind of comparison will work as long
1961 as we don't step by infinity. So pick the kind
1962 that works when we do step by infinity. */
1963 if ((EMACS_INT) (p_limit + infinity) > (EMACS_INT) p_limit)
1964 while ((EMACS_INT) cursor <= (EMACS_INT) p_limit)
1965 cursor += BM_tab[*cursor];
1966 else
1967 while ((EMACS_UINT) cursor <= (EMACS_UINT) p_limit)
1968 cursor += BM_tab[*cursor]; 1938 cursor += BM_tab[*cursor];
1939 }
1969 } 1940 }
1970 else 1941 else
1971 { 1942 {
1972 if ((EMACS_INT) (p_limit + infinity) < (EMACS_INT) p_limit) 1943 while (cursor >= p_limit)
1973 while ((EMACS_INT) cursor >= (EMACS_INT) p_limit) 1944 {
1974 cursor += BM_tab[*cursor]; 1945 if (BM_tab[*cursor] == 0)
1975 else 1946 goto hit;
1976 while ((EMACS_UINT) cursor >= (EMACS_UINT) p_limit)
1977 cursor += BM_tab[*cursor]; 1947 cursor += BM_tab[*cursor];
1948 }
1978 } 1949 }
1979/* If you are here, cursor is beyond the end of the searched region. */ 1950 /* If you are here, cursor is beyond the end of the
1980/* This can happen if you match on the far character of the pattern, */ 1951 searched region. You fail to match within the
1981/* because the "stride" of that character is infinity, a number able */ 1952 permitted region and would otherwise try a character
1982/* to throw you well beyond the end of the search. It can also */ 1953 beyond that region. */
1983/* happen if you fail to match within the permitted region and would */ 1954 break;
1984/* otherwise try a character beyond that region */ 1955
1985 if ((cursor - p_limit) * direction <= len_byte) 1956 hit:
1986 break; /* a small overrun is genuine */
1987 cursor -= infinity; /* large overrun = hit */
1988 i = dirlen - direction; 1957 i = dirlen - direction;
1989 if (! NILP (trt)) 1958 if (! NILP (trt))
1990 { 1959 {
@@ -2056,8 +2025,8 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
2056 pos_byte += cursor - p2; 2025 pos_byte += cursor - p2;
2057 } 2026 }
2058 else 2027 else
2059 /* Now we'll pick up a clump that has to be done the hard */ 2028 /* Now we'll pick up a clump that has to be done the hard
2060 /* way because it covers a discontinuity */ 2029 way because it covers a discontinuity. */
2061 { 2030 {
2062 limit = ((direction > 0) 2031 limit = ((direction > 0)
2063 ? BUFFER_CEILING_OF (pos_byte - dirlen + 1) 2032 ? BUFFER_CEILING_OF (pos_byte - dirlen + 1)
@@ -2069,19 +2038,21 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
2069 and still be valid for a possible match. */ 2038 and still be valid for a possible match. */
2070 while (1) 2039 while (1)
2071 { 2040 {
2072 /* This loop can be coded for space rather than */ 2041 /* This loop can be coded for space rather than
2073 /* speed because it will usually run only once. */ 2042 speed because it will usually run only once.
2074 /* (the reach is at most len + 21, and typically */ 2043 (the reach is at most len + 21, and typically
2075 /* does not exceed len) */ 2044 does not exceed len). */
2076 while ((limit - pos_byte) * direction >= 0) 2045 while ((limit - pos_byte) * direction >= 0)
2077 pos_byte += BM_tab[FETCH_BYTE (pos_byte)]; 2046 {
2078 /* now run the same tests to distinguish going off the */ 2047 int ch = FETCH_BYTE (pos_byte);
2079 /* end, a match or a phony match. */ 2048 if (BM_tab[ch] == 0)
2080 if ((pos_byte - limit) * direction <= len_byte) 2049 goto hit2;
2081 break; /* ran off the end */ 2050 pos_byte += BM_tab[ch];
2082 /* Found what might be a match. 2051 }
2083 Set POS_BYTE back to last (first if reverse) pos. */ 2052 break; /* ran off the end */
2084 pos_byte -= infinity; 2053
2054 hit2:
2055 /* Found what might be a match. */
2085 i = dirlen - direction; 2056 i = dirlen - direction;
2086 while ((i -= direction) + direction != 0) 2057 while ((i -= direction) + direction != 0)
2087 { 2058 {
@@ -2110,7 +2081,7 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
2110 /* Above loop has moved POS_BYTE part or all the way 2081 /* Above loop has moved POS_BYTE part or all the way
2111 back to the first pos (last pos if reverse). 2082 back to the first pos (last pos if reverse).
2112 Set it once again at the last (first if reverse) char. */ 2083 Set it once again at the last (first if reverse) char. */
2113 pos_byte += dirlen - i- direction; 2084 pos_byte += dirlen - i - direction;
2114 if (i + direction == 0) 2085 if (i + direction == 0)
2115 { 2086 {
2116 int position, start, end; 2087 int position, start, end;