diff options
| author | Andreas Schwab | 2009-04-16 09:30:45 +0000 |
|---|---|---|
| committer | Andreas Schwab | 2009-04-16 09:30:45 +0000 |
| commit | f4646fffc2839ef6d3d46f409da6a3ebf38bb45b (patch) | |
| tree | 9d6dd292a2ec099312aa09a2bf41bc80060b36cd /src | |
| parent | 05fcb8daf64fd80937b8e97a813a72b5f5c42a5f (diff) | |
| download | emacs-f4646fffc2839ef6d3d46f409da6a3ebf38bb45b.tar.gz emacs-f4646fffc2839ef6d3d46f409da6a3ebf38bb45b.zip | |
(boyer_moore): Use zero as marker value for a possible
match instead of depending on overflow behavior.
Diffstat (limited to 'src')
| -rw-r--r-- | src/ChangeLog | 5 | ||||
| -rw-r--r-- | src/search.c | 167 |
2 files changed, 74 insertions, 98 deletions
diff --git a/src/ChangeLog b/src/ChangeLog index 60fd0530617..c541efb966e 100644 --- a/src/ChangeLog +++ b/src/ChangeLog | |||
| @@ -1,3 +1,8 @@ | |||
| 1 | 2009-04-16 Andreas Schwab <schwab@linux-m68k.org> | ||
| 2 | |||
| 3 | * search.c (boyer_moore): Use zero as marker value for a possible | ||
| 4 | match instead of depending on overflow behavior. | ||
| 5 | |||
| 1 | 2009-04-16 Chong Yidong <cyd@stupidchicken.com> | 6 | 2009-04-16 Chong Yidong <cyd@stupidchicken.com> |
| 2 | 7 | ||
| 3 | * keyboard.c (adjust_point_for_property): Disable 2009-02-12 | 8 | * keyboard.c (adjust_point_for_property): Disable 2009-02-12 |
diff --git a/src/search.c b/src/search.c index 39c130ba70f..2eb435276a3 100644 --- a/src/search.c +++ b/src/search.c | |||
| @@ -1729,9 +1729,8 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, | |||
| 1729 | { | 1729 | { |
| 1730 | int direction = ((n > 0) ? 1 : -1); | 1730 | int direction = ((n > 0) ? 1 : -1); |
| 1731 | register int dirlen; | 1731 | register int dirlen; |
| 1732 | int infinity, limit, stride_for_teases = 0; | 1732 | int limit, stride_for_teases = 0; |
| 1733 | register int *BM_tab; | 1733 | int BM_tab[0400]; |
| 1734 | int *BM_tab_base; | ||
| 1735 | register unsigned char *cursor, *p_limit; | 1734 | register unsigned char *cursor, *p_limit; |
| 1736 | register int i, j; | 1735 | register int i, j; |
| 1737 | unsigned char *pat, *pat_end; | 1736 | unsigned char *pat, *pat_end; |
| @@ -1747,37 +1746,28 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, | |||
| 1747 | int translate_prev_byte3 = 0; | 1746 | int translate_prev_byte3 = 0; |
| 1748 | int translate_prev_byte4 = 0; | 1747 | int translate_prev_byte4 = 0; |
| 1749 | 1748 | ||
| 1750 | BM_tab = (int *) alloca (0400 * sizeof (int)); | 1749 | /* The general approach is that we are going to maintain that we know |
| 1751 | 1750 | the first (closest to the present position, in whatever direction | |
| 1752 | /* The general approach is that we are going to maintain that we know */ | 1751 | we're searching) character that could possibly be the last |
| 1753 | /* the first (closest to the present position, in whatever direction */ | 1752 | (furthest from present position) character of a valid match. We |
| 1754 | /* we're searching) character that could possibly be the last */ | 1753 | advance the state of our knowledge by looking at that character |
| 1755 | /* (furthest from present position) character of a valid match. We */ | 1754 | and seeing whether it indeed matches the last character of the |
| 1756 | /* advance the state of our knowledge by looking at that character */ | 1755 | pattern. If it does, we take a closer look. If it does not, we |
| 1757 | /* and seeing whether it indeed matches the last character of the */ | 1756 | move our pointer (to putative last characters) as far as is |
| 1758 | /* pattern. If it does, we take a closer look. If it does not, we */ | 1757 | logically possible. This amount of movement, which I call a |
| 1759 | /* move our pointer (to putative last characters) as far as is */ | 1758 | stride, will be the length of the pattern if the actual character |
| 1760 | /* logically possible. This amount of movement, which I call a */ | 1759 | appears nowhere in the pattern, otherwise it will be the distance |
| 1761 | /* stride, will be the length of the pattern if the actual character */ | 1760 | from the last occurrence of that character to the end of the |
| 1762 | /* appears nowhere in the pattern, otherwise it will be the distance */ | 1761 | pattern. If the amount is zero we have a possible match. */ |
| 1763 | /* from the last occurrence of that character to the end of the */ | 1762 | |
| 1764 | /* pattern. */ | 1763 | /* Here we make a "mickey mouse" BM table. The stride of the search |
| 1765 | /* As a coding trick, an enormous stride is coded into the table for */ | 1764 | is determined only by the last character of the putative match. |
| 1766 | /* characters that match the last character. This allows use of only */ | 1765 | If that character does not match, we will stride the proper |
| 1767 | /* a single test, a test for having gone past the end of the */ | 1766 | distance to propose a match that superimposes it on the last |
| 1768 | /* permissible match region, to test for both possible matches (when */ | 1767 | instance of a character that matches it (per trt), or misses |
| 1769 | /* the stride goes past the end immediately) and failure to */ | 1768 | it entirely if there is none. */ |
| 1770 | /* match (where you get nudged past the end one stride at a time). */ | ||
| 1771 | |||
| 1772 | /* Here we make a "mickey mouse" BM table. The stride of the search */ | ||
| 1773 | /* is determined only by the last character of the putative match. */ | ||
| 1774 | /* If that character does not match, we will stride the proper */ | ||
| 1775 | /* distance to propose a match that superimposes it on the last */ | ||
| 1776 | /* instance of a character that matches it (per trt), or misses */ | ||
| 1777 | /* it entirely if there is none. */ | ||
| 1778 | 1769 | ||
| 1779 | dirlen = len_byte * direction; | 1770 | dirlen = len_byte * direction; |
| 1780 | infinity = dirlen - (lim_byte + pos_byte + len_byte + len_byte) * direction; | ||
| 1781 | 1771 | ||
| 1782 | /* Record position after the end of the pattern. */ | 1772 | /* Record position after the end of the pattern. */ |
| 1783 | pat_end = base_pat + len_byte; | 1773 | pat_end = base_pat + len_byte; |
| @@ -1787,23 +1777,14 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, | |||
| 1787 | if (direction < 0) | 1777 | if (direction < 0) |
| 1788 | base_pat = pat_end - 1; | 1778 | base_pat = pat_end - 1; |
| 1789 | 1779 | ||
| 1790 | BM_tab_base = BM_tab; | 1780 | /* A character that does not appear in the pattern induces a |
| 1791 | BM_tab += 0400; | 1781 | stride equal to the pattern length. */ |
| 1792 | j = dirlen; /* to get it in a register */ | 1782 | for (i = 0; i < 0400; i++) |
| 1793 | /* A character that does not appear in the pattern induces a */ | 1783 | BM_tab[i] = dirlen; |
| 1794 | /* stride equal to the pattern length. */ | ||
| 1795 | while (BM_tab_base != BM_tab) | ||
| 1796 | { | ||
| 1797 | *--BM_tab = j; | ||
| 1798 | *--BM_tab = j; | ||
| 1799 | *--BM_tab = j; | ||
| 1800 | *--BM_tab = j; | ||
| 1801 | } | ||
| 1802 | 1784 | ||
| 1803 | /* We use this for translation, instead of TRT itself. | 1785 | /* We use this for translation, instead of TRT itself. |
| 1804 | We fill this in to handle the characters that actually | 1786 | We fill this in to handle the characters that actually |
| 1805 | occur in the pattern. Others don't matter anyway! */ | 1787 | occur in the pattern. Others don't matter anyway! */ |
| 1806 | bzero (simple_translate, sizeof simple_translate); | ||
| 1807 | for (i = 0; i < 0400; i++) | 1788 | for (i = 0; i < 0400; i++) |
| 1808 | simple_translate[i] = i; | 1789 | simple_translate[i] = i; |
| 1809 | 1790 | ||
| @@ -1828,12 +1809,10 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, | |||
| 1828 | } | 1809 | } |
| 1829 | 1810 | ||
| 1830 | i = 0; | 1811 | i = 0; |
| 1831 | while (i != infinity) | 1812 | while (i != dirlen) |
| 1832 | { | 1813 | { |
| 1833 | unsigned char *ptr = base_pat + i; | 1814 | unsigned char *ptr = base_pat + i; |
| 1834 | i += direction; | 1815 | i += direction; |
| 1835 | if (i == dirlen) | ||
| 1836 | i = infinity; | ||
| 1837 | if (! NILP (trt)) | 1816 | if (! NILP (trt)) |
| 1838 | { | 1817 | { |
| 1839 | /* If the byte currently looking at is the last of a | 1818 | /* If the byte currently looking at is the last of a |
| @@ -1861,7 +1840,7 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, | |||
| 1861 | else | 1840 | else |
| 1862 | j = *ptr; | 1841 | j = *ptr; |
| 1863 | 1842 | ||
| 1864 | if (i == infinity) | 1843 | if (i == dirlen) |
| 1865 | stride_for_teases = BM_tab[j]; | 1844 | stride_for_teases = BM_tab[j]; |
| 1866 | 1845 | ||
| 1867 | BM_tab[j] = dirlen - i; | 1846 | BM_tab[j] = dirlen - i; |
| @@ -1894,17 +1873,16 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, | |||
| 1894 | { | 1873 | { |
| 1895 | j = *ptr; | 1874 | j = *ptr; |
| 1896 | 1875 | ||
| 1897 | if (i == infinity) | 1876 | if (i == dirlen) |
| 1898 | stride_for_teases = BM_tab[j]; | 1877 | stride_for_teases = BM_tab[j]; |
| 1899 | BM_tab[j] = dirlen - i; | 1878 | BM_tab[j] = dirlen - i; |
| 1900 | } | 1879 | } |
| 1901 | /* stride_for_teases tells how much to stride if we get a */ | 1880 | /* stride_for_teases tells how much to stride if we get a |
| 1902 | /* match on the far character but are subsequently */ | 1881 | match on the far character but are subsequently |
| 1903 | /* disappointed, by recording what the stride would have been */ | 1882 | disappointed, by recording what the stride would have been |
| 1904 | /* for that character if the last character had been */ | 1883 | for that character if the last character had been |
| 1905 | /* different. */ | 1884 | different. */ |
| 1906 | } | 1885 | } |
| 1907 | infinity = dirlen - infinity; | ||
| 1908 | pos_byte += dirlen - ((direction > 0) ? direction : 0); | 1886 | pos_byte += dirlen - ((direction > 0) ? direction : 0); |
| 1909 | /* loop invariant - POS_BYTE points at where last char (first | 1887 | /* loop invariant - POS_BYTE points at where last char (first |
| 1910 | char if reverse) of pattern would align in a possible match. */ | 1888 | char if reverse) of pattern would align in a possible match. */ |
| @@ -1948,43 +1926,34 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, | |||
| 1948 | 1926 | ||
| 1949 | p_limit = BYTE_POS_ADDR (limit); | 1927 | p_limit = BYTE_POS_ADDR (limit); |
| 1950 | p2 = (cursor = BYTE_POS_ADDR (pos_byte)); | 1928 | p2 = (cursor = BYTE_POS_ADDR (pos_byte)); |
| 1951 | /* In this loop, pos + cursor - p2 is the surrogate for pos */ | 1929 | /* In this loop, pos + cursor - p2 is the surrogate for pos. */ |
| 1952 | while (1) /* use one cursor setting as long as i can */ | 1930 | while (1) /* use one cursor setting as long as i can */ |
| 1953 | { | 1931 | { |
| 1954 | if (direction > 0) /* worth duplicating */ | 1932 | if (direction > 0) /* worth duplicating */ |
| 1955 | { | 1933 | { |
| 1956 | /* Use signed comparison if appropriate | 1934 | while (cursor <= p_limit) |
| 1957 | to make cursor+infinity sure to be > p_limit. | 1935 | { |
| 1958 | Assuming that the buffer lies in a range of addresses | 1936 | if (BM_tab[*cursor] == 0) |
| 1959 | that are all "positive" (as ints) or all "negative", | 1937 | goto hit; |
| 1960 | either kind of comparison will work as long | ||
| 1961 | as we don't step by infinity. So pick the kind | ||
| 1962 | that works when we do step by infinity. */ | ||
| 1963 | if ((EMACS_INT) (p_limit + infinity) > (EMACS_INT) p_limit) | ||
| 1964 | while ((EMACS_INT) cursor <= (EMACS_INT) p_limit) | ||
| 1965 | cursor += BM_tab[*cursor]; | ||
| 1966 | else | ||
| 1967 | while ((EMACS_UINT) cursor <= (EMACS_UINT) p_limit) | ||
| 1968 | cursor += BM_tab[*cursor]; | 1938 | cursor += BM_tab[*cursor]; |
| 1939 | } | ||
| 1969 | } | 1940 | } |
| 1970 | else | 1941 | else |
| 1971 | { | 1942 | { |
| 1972 | if ((EMACS_INT) (p_limit + infinity) < (EMACS_INT) p_limit) | 1943 | while (cursor >= p_limit) |
| 1973 | while ((EMACS_INT) cursor >= (EMACS_INT) p_limit) | 1944 | { |
| 1974 | cursor += BM_tab[*cursor]; | 1945 | if (BM_tab[*cursor] == 0) |
| 1975 | else | 1946 | goto hit; |
| 1976 | while ((EMACS_UINT) cursor >= (EMACS_UINT) p_limit) | ||
| 1977 | cursor += BM_tab[*cursor]; | 1947 | cursor += BM_tab[*cursor]; |
| 1948 | } | ||
| 1978 | } | 1949 | } |
| 1979 | /* If you are here, cursor is beyond the end of the searched region. */ | 1950 | /* If you are here, cursor is beyond the end of the |
| 1980 | /* This can happen if you match on the far character of the pattern, */ | 1951 | searched region. You fail to match within the |
| 1981 | /* because the "stride" of that character is infinity, a number able */ | 1952 | permitted region and would otherwise try a character |
| 1982 | /* to throw you well beyond the end of the search. It can also */ | 1953 | beyond that region. */ |
| 1983 | /* happen if you fail to match within the permitted region and would */ | 1954 | break; |
| 1984 | /* otherwise try a character beyond that region */ | 1955 | |
| 1985 | if ((cursor - p_limit) * direction <= len_byte) | 1956 | hit: |
| 1986 | break; /* a small overrun is genuine */ | ||
| 1987 | cursor -= infinity; /* large overrun = hit */ | ||
| 1988 | i = dirlen - direction; | 1957 | i = dirlen - direction; |
| 1989 | if (! NILP (trt)) | 1958 | if (! NILP (trt)) |
| 1990 | { | 1959 | { |
| @@ -2056,8 +2025,8 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, | |||
| 2056 | pos_byte += cursor - p2; | 2025 | pos_byte += cursor - p2; |
| 2057 | } | 2026 | } |
| 2058 | else | 2027 | else |
| 2059 | /* Now we'll pick up a clump that has to be done the hard */ | 2028 | /* Now we'll pick up a clump that has to be done the hard |
| 2060 | /* way because it covers a discontinuity */ | 2029 | way because it covers a discontinuity. */ |
| 2061 | { | 2030 | { |
| 2062 | limit = ((direction > 0) | 2031 | limit = ((direction > 0) |
| 2063 | ? BUFFER_CEILING_OF (pos_byte - dirlen + 1) | 2032 | ? BUFFER_CEILING_OF (pos_byte - dirlen + 1) |
| @@ -2069,19 +2038,21 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, | |||
| 2069 | and still be valid for a possible match. */ | 2038 | and still be valid for a possible match. */ |
| 2070 | while (1) | 2039 | while (1) |
| 2071 | { | 2040 | { |
| 2072 | /* This loop can be coded for space rather than */ | 2041 | /* This loop can be coded for space rather than |
| 2073 | /* speed because it will usually run only once. */ | 2042 | speed because it will usually run only once. |
| 2074 | /* (the reach is at most len + 21, and typically */ | 2043 | (the reach is at most len + 21, and typically |
| 2075 | /* does not exceed len) */ | 2044 | does not exceed len). */ |
| 2076 | while ((limit - pos_byte) * direction >= 0) | 2045 | while ((limit - pos_byte) * direction >= 0) |
| 2077 | pos_byte += BM_tab[FETCH_BYTE (pos_byte)]; | 2046 | { |
| 2078 | /* now run the same tests to distinguish going off the */ | 2047 | int ch = FETCH_BYTE (pos_byte); |
| 2079 | /* end, a match or a phony match. */ | 2048 | if (BM_tab[ch] == 0) |
| 2080 | if ((pos_byte - limit) * direction <= len_byte) | 2049 | goto hit2; |
| 2081 | break; /* ran off the end */ | 2050 | pos_byte += BM_tab[ch]; |
| 2082 | /* Found what might be a match. | 2051 | } |
| 2083 | Set POS_BYTE back to last (first if reverse) pos. */ | 2052 | break; /* ran off the end */ |
| 2084 | pos_byte -= infinity; | 2053 | |
| 2054 | hit2: | ||
| 2055 | /* Found what might be a match. */ | ||
| 2085 | i = dirlen - direction; | 2056 | i = dirlen - direction; |
| 2086 | while ((i -= direction) + direction != 0) | 2057 | while ((i -= direction) + direction != 0) |
| 2087 | { | 2058 | { |
| @@ -2110,7 +2081,7 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, | |||
| 2110 | /* Above loop has moved POS_BYTE part or all the way | 2081 | /* Above loop has moved POS_BYTE part or all the way |
| 2111 | back to the first pos (last pos if reverse). | 2082 | back to the first pos (last pos if reverse). |
| 2112 | Set it once again at the last (first if reverse) char. */ | 2083 | Set it once again at the last (first if reverse) char. */ |
| 2113 | pos_byte += dirlen - i- direction; | 2084 | pos_byte += dirlen - i - direction; |
| 2114 | if (i + direction == 0) | 2085 | if (i + direction == 0) |
| 2115 | { | 2086 | { |
| 2116 | int position, start, end; | 2087 | int position, start, end; |