diff options
| author | Stefan Monnier | 2006-09-22 17:30:13 +0000 |
|---|---|---|
| committer | Stefan Monnier | 2006-09-22 17:30:13 +0000 |
| commit | 4560a582d623dbf040f4176bdebb8107c12c2bb8 (patch) | |
| tree | 4e9eed296a37bf4d9f85a6a8c96dddd3ff9469ed /src/regex.c | |
| parent | 3ffcda547185fe2950f0ffe108604a1a13dd7b8b (diff) | |
| download | emacs-4560a582d623dbf040f4176bdebb8107c12c2bb8.tar.gz emacs-4560a582d623dbf040f4176bdebb8107c12c2bb8.zip | |
(analyse_first): For eight-bit-control chars, mark both the
char's value and its leading byte in the fastmap.
(re_search_2): When fast-scanning without translation, be careful to
check that we only match the leading byte of a multibyte char.
Diffstat (limited to 'src/regex.c')
| -rw-r--r-- | src/regex.c | 50 |
1 file changed, 42 insertions, 8 deletions
diff --git a/src/regex.c b/src/regex.c
index 763b490c906..66e363e731c 100644
--- a/src/regex.c
+++ b/src/regex.c
| @@ -3877,11 +3877,13 @@ analyse_first (p, pend, fastmap, multibyte) | |||
| 3877 | if (fastmap) | 3877 | if (fastmap) |
| 3878 | { | 3878 | { |
| 3879 | int c = RE_STRING_CHAR (p + 1, pend - p); | 3879 | int c = RE_STRING_CHAR (p + 1, pend - p); |
| 3880 | 3880 | /* When fast-scanning, the fastmap can be indexed either with | |
| 3881 | a char (smaller than 256) or with the first byte of | ||
| 3882 | a char's byte sequence. So we have to conservatively add | ||
| 3883 | both to the table. */ | ||
| 3881 | if (SINGLE_BYTE_CHAR_P (c)) | 3884 | if (SINGLE_BYTE_CHAR_P (c)) |
| 3882 | fastmap[c] = 1; | 3885 | fastmap[c] = 1; |
| 3883 | else | 3886 | fastmap[p[1]] = 1; |
| 3884 | fastmap[p[1]] = 1; | ||
| 3885 | } | 3887 | } |
| 3886 | break; | 3888 | break; |
| 3887 | 3889 | ||
| @@ -3899,6 +3901,10 @@ analyse_first (p, pend, fastmap, multibyte) | |||
| 3899 | So any that are not listed in the charset | 3901 | So any that are not listed in the charset |
| 3900 | are possible matches, even in multibyte buffers. */ | 3902 | are possible matches, even in multibyte buffers. */ |
| 3901 | if (!fastmap) break; | 3903 | if (!fastmap) break; |
| 3904 | /* We don't need to mark LEADING_CODE_8_BIT_CONTROL specially | ||
| 3905 | because it will automatically be set when needed by virtue of | ||
| 3906 | being larger than the highest char of its charset (0xbf) but | ||
| 3907 | smaller than (1<<BYTEWIDTH). */ | ||
| 3902 | for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH; | 3908 | for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH; |
| 3903 | j < (1 << BYTEWIDTH); j++) | 3909 | j < (1 << BYTEWIDTH); j++) |
| 3904 | fastmap[j] = 1; | 3910 | fastmap[j] = 1; |
| @@ -3909,7 +3915,13 @@ analyse_first (p, pend, fastmap, multibyte) | |||
| 3909 | for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++; | 3915 | for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++; |
| 3910 | j >= 0; j--) | 3916 | j >= 0; j--) |
| 3911 | if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not) | 3917 | if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not) |
| 3912 | fastmap[j] = 1; | 3918 | { |
| 3919 | fastmap[j] = 1; | ||
| 3920 | #ifdef emacs | ||
| 3921 | if (j >= 0x80 && j < 0xa0) | ||
| 3922 | fastmap[LEADING_CODE_8_BIT_CONTROL] = 1; | ||
| 3923 | #endif | ||
| 3924 | } | ||
| 3913 | 3925 | ||
| 3914 | if ((not && multibyte) | 3926 | if ((not && multibyte) |
| 3915 | /* Any character set can possibly contain a character | 3927 | /* Any character set can possibly contain a character |
| @@ -4352,11 +4364,33 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop) | |||
| 4352 | } | 4364 | } |
| 4353 | } | 4365 | } |
| 4354 | else | 4366 | else |
| 4355 | while (range > lim && !fastmap[*d]) | 4367 | do |
| 4356 | { | 4368 | { |
| 4357 | d++; | 4369 | re_char *d_start = d; |
| 4358 | range--; | 4370 | while (range > lim && !fastmap[*d]) |
| 4359 | } | 4371 | { |
| 4372 | d++; | ||
| 4373 | range--; | ||
| 4374 | } | ||
| 4375 | #ifdef emacs | ||
| 4376 | if (multibyte && range > lim) | ||
| 4377 | { | ||
| 4378 | /* Check that we are at the beginning of a char. */ | ||
| 4379 | int at_boundary; | ||
| 4380 | AT_CHAR_BOUNDARY_P (at_boundary, d, d_start); | ||
| 4381 | if (at_boundary) | ||
| 4382 | break; | ||
| 4383 | else | ||
| 4384 | { /* We have matched an internal byte of a char | ||
| 4385 | rather than the leading byte, so it's a false | ||
| 4386 | positive: we should keep scanning. */ | ||
| 4387 | d++; range--; | ||
| 4388 | } | ||
| 4389 | } | ||
| 4390 | else | ||
| 4391 | #endif | ||
| 4392 | break; | ||
| 4393 | } while (1); | ||
| 4360 | 4394 | ||
| 4361 | startpos += irange - range; | 4395 | startpos += irange - range; |
| 4362 | } | 4396 | } |