diff options
| author | Richard M. Stallman | 1999-08-29 20:38:11 +0000 |
|---|---|---|
| committer | Richard M. Stallman | 1999-08-29 20:38:11 +0000 |
| commit | 96cc36cc58afebd9bfe6ea40129da800c8b487e5 (patch) | |
| tree | 19cfc395cbff439d05f6604f1d2f9277e6804a9a /src/regex.c | |
| parent | 6e66e4c6fc350fe8cb85d4cc35d9ca567df56548 (diff) | |
| download | emacs-96cc36cc58afebd9bfe6ea40129da800c8b487e5.tar.gz emacs-96cc36cc58afebd9bfe6ea40129da800c8b487e5.zip | |
[emacs]: Handle character classes for multibyte chars:
(ISBLANK, ISGRAPH, ISPRINT, ISALNUM, ISALPHA, ISLOWER)
(ISPUNCT, ISSPACE, ISUPPER): New definitions for emacs only.
(ISWORD): New macro.
(re_opcode_t): Add 2 bytes of flag bits to charset and charset_not.
(CHARSET_RANGE_TABLE): Update definition.
(CHARSET_RANGE_TABLE_BITS): New macro.
(print_partial_compiled_pattern): Skip charset's range table.
(struct range_table_work_area): New field `bits'.
(SET_RANGE_TABLE_WORK_AREA_BIT): New macro.
(BIT_ALNUM, BIT_ALPHA, BIT_WORD, BIT_GRAPH, BIT_LOWER, BIT_PRINT)
(BIT_PUNCT, BIT_SPACE, BIT_UPPER): New macros.
(CLEAR_RANGE_TABLE_WORK_USED): Clear field `bits'.
(RANGE_TABLE_WORK_BITS): New macro.
(IS_CHAR_CLASS): Check for "word".
(regex_compile): Set the `bits' field for some character classes.
Handle the `word' class. Store the `bits' field into the range table.
(re_compile_fastmap): Handle flag bits in range table.
(re_match_2_internal): For charset and charset_not,
handle flag bits in the range table.
Diffstat (limited to 'src/regex.c')
| -rw-r--r-- | src/regex.c | 236 |
1 files changed, 201 insertions, 35 deletions
diff --git a/src/regex.c b/src/regex.c index 3f951afe637..450850609a6 100644 --- a/src/regex.c +++ b/src/regex.c | |||
| @@ -191,9 +191,6 @@ init_syntax_once () | |||
| 191 | /* Get the interface, including the syntax bits. */ | 191 | /* Get the interface, including the syntax bits. */ |
| 192 | #include "regex.h" | 192 | #include "regex.h" |
| 193 | 193 | ||
| 194 | /* isalpha etc. are used for the character classes. */ | ||
| 195 | #include <ctype.h> | ||
| 196 | |||
| 197 | /* Jim Meyering writes: | 194 | /* Jim Meyering writes: |
| 198 | 195 | ||
| 199 | "... Some ctype macros are valid only for character codes that | 196 | "... Some ctype macros are valid only for character codes that |
| @@ -211,6 +208,51 @@ init_syntax_once () | |||
| 211 | #define ISASCII(c) isascii(c) | 208 | #define ISASCII(c) isascii(c) |
| 212 | #endif | 209 | #endif |
| 213 | 210 | ||
| 211 | /* isalpha etc. are used for the character classes. */ | ||
| 212 | #include <ctype.h> | ||
| 213 | |||
| 214 | /* In Emacs, these are only used for single-byte characters. */ | ||
| 215 | #define ISDIGIT(c) (ISASCII (c) && isdigit (c)) | ||
| 216 | #define ISCNTRL(c) (ISASCII (c) && iscntrl (c)) | ||
| 217 | #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c)) | ||
| 218 | |||
| 219 | #ifdef emacs | ||
| 220 | |||
| 221 | /* This is only used for single-byte characters. */ | ||
| 222 | #define ISBLANK(c) ((c) == ' ' || (c) == '\t') | ||
| 223 | |||
| 224 | /* The rest must handle multibyte characters. */ | ||
| 225 | |||
| 226 | #define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \ | ||
| 227 | ? ISASCII (c) && isprint (c) && !isspace (c) \ | ||
| 228 | : 1) | ||
| 229 | |||
| 230 | #define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \ | ||
| 231 | ? ISASCII (c) && isalnum (c) \ | ||
| 232 | : 1) | ||
| 233 | |||
| 234 | #define ISALNUM(c) (SINGLE_BYTE_CHAR_P (c) \ | ||
| 235 | ? ISASCII (c) && isalnum (c) \ | ||
| 236 | : SYNTAX (c) == Sword) | ||
| 237 | |||
| 238 | #define ISALPHA(c) (SINGLE_BYTE_CHAR_P (c) \ | ||
| 239 | ? ISASCII (c) && isalpha (c) \ | ||
| 240 | : SYNTAX (c) == Sword) | ||
| 241 | |||
| 242 | #define ISLOWER(c) (LOWERCASEP (c)) | ||
| 243 | |||
| 244 | #define ISPUNCT(c) (SINGLE_BYTE_CHAR_P (c) \ | ||
| 245 | ? ISASCII (c) && ispunct (c) \ | ||
| 246 | : SYNTAX (c) != Sword) | ||
| 247 | |||
| 248 | #define ISSPACE(c) (SYNTAX (c) == Swhitespace) | ||
| 249 | |||
| 250 | #define ISUPPER(c) (UPPERCASEP (c)) | ||
| 251 | |||
| 252 | #define ISWORD(c) (SYNTAX (c) == Sword) | ||
| 253 | |||
| 254 | #else /* not emacs */ | ||
| 255 | |||
| 214 | #ifdef isblank | 256 | #ifdef isblank |
| 215 | #define ISBLANK(c) (ISASCII (c) && isblank (c)) | 257 | #define ISBLANK(c) (ISASCII (c) && isblank (c)) |
| 216 | #else | 258 | #else |
| @@ -233,6 +275,10 @@ init_syntax_once () | |||
| 233 | #define ISUPPER(c) (ISASCII (c) && isupper (c)) | 275 | #define ISUPPER(c) (ISASCII (c) && isupper (c)) |
| 234 | #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c)) | 276 | #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c)) |
| 235 | 277 | ||
| 278 | #define ISWORD(c) ISALPHA(c) | ||
| 279 | |||
| 280 | #endif /* not emacs */ | ||
| 281 | |||
| 236 | #ifndef NULL | 282 | #ifndef NULL |
| 237 | #define NULL (void *)0 | 283 | #define NULL (void *)0 |
| 238 | #endif | 284 | #endif |
| @@ -383,7 +429,15 @@ typedef enum | |||
| 383 | for a bitmap saying which chars are in. Bits in each byte | 429 | for a bitmap saying which chars are in. Bits in each byte |
| 384 | are ordered low-bit-first. A character is in the set if its | 430 | are ordered low-bit-first. A character is in the set if its |
| 385 | bit is 1. A character too large to have a bit in the map is | 431 | bit is 1. A character too large to have a bit in the map is |
| 386 | automatically not in the set. */ | 432 | automatically not in the set. |
| 433 | |||
| 434 | If the length byte has the 0x80 bit set, then that stuff | ||
| 435 | is followed by a range table: | ||
| 436 | 2 bytes of flags for character sets (low 8 bits, high 8 bits) | ||
| 437 | See RANGE_TABLE_WORK_BITS below. | ||
| 438 | 2 bytes, the number of pairs that follow | ||
| 439 | pairs, each 2 multibyte characters, | ||
| 440 | each multibyte character represented as 3 bytes. */ | ||
| 387 | charset, | 441 | charset, |
| 388 | 442 | ||
| 389 | /* Same parameters as charset, but match any character that is | 443 | /* Same parameters as charset, but match any character that is |
| @@ -617,8 +671,14 @@ extract_number_and_incr (destination, source) | |||
| 617 | 671 | ||
| 618 | /* Return the address of range table of charset P. But not the start | 672 | /* Return the address of range table of charset P. But not the start |
| 619 | of table itself, but the before where the number of ranges is | 673 | of table itself, but the before where the number of ranges is |
| 620 | stored. `2 +' means to skip re_opcode_t and size of bitmap. */ | 674 | stored. `2 +' means to skip re_opcode_t and size of bitmap, |
| 621 | #define CHARSET_RANGE_TABLE(p) (&(p)[2 + CHARSET_BITMAP_SIZE (p)]) | 675 | and the 2 bytes of flags at the start of the range table. */ |
| 676 | #define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)]) | ||
| 677 | |||
| 678 | /* Extract the bit flags that start a range table. */ | ||
| 679 | #define CHARSET_RANGE_TABLE_BITS(p) \ | ||
| 680 | ((p)[2 + CHARSET_BITMAP_SIZE (p)] \ | ||
| 681 | + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100) | ||
| 622 | 682 | ||
| 623 | /* Test if C is listed in the bitmap of charset P. */ | 683 | /* Test if C is listed in the bitmap of charset P. */ |
| 624 | #define CHARSET_LOOKUP_BITMAP(p, c) \ | 684 | #define CHARSET_LOOKUP_BITMAP(p, c) \ |
| @@ -791,6 +851,9 @@ print_partial_compiled_pattern (start, end) | |||
| 791 | { | 851 | { |
| 792 | register int c, last = -100; | 852 | register int c, last = -100; |
| 793 | register int in_range = 0; | 853 | register int in_range = 0; |
| 854 | int length = *p & 0x7f; | ||
| 855 | int has_range_table = *p & 0x80; | ||
| 856 | int range_length = p[length + 2] + p[length + 3] * 0x100; | ||
| 794 | 857 | ||
| 795 | printf ("/charset [%s", | 858 | printf ("/charset [%s", |
| 796 | (re_opcode_t) *(p - 1) == charset_not ? "^" : ""); | 859 | (re_opcode_t) *(p - 1) == charset_not ? "^" : ""); |
| @@ -798,7 +861,7 @@ print_partial_compiled_pattern (start, end) | |||
| 798 | assert (p + *p < pend); | 861 | assert (p + *p < pend); |
| 799 | 862 | ||
| 800 | for (c = 0; c < 256; c++) | 863 | for (c = 0; c < 256; c++) |
| 801 | if (c / 8 < *p | 864 | if (c / 8 < length |
| 802 | && (p[1 + (c/8)] & (1 << (c % 8)))) | 865 | && (p[1 + (c/8)] & (1 << (c % 8)))) |
| 803 | { | 866 | { |
| 804 | /* Are we starting a range? */ | 867 | /* Are we starting a range? */ |
| @@ -809,7 +872,7 @@ print_partial_compiled_pattern (start, end) | |||
| 809 | } | 872 | } |
| 810 | /* Have we broken a range? */ | 873 | /* Have we broken a range? */ |
| 811 | else if (last + 1 != c && in_range) | 874 | else if (last + 1 != c && in_range) |
| 812 | { | 875 | { |
| 813 | putchar (last); | 876 | putchar (last); |
| 814 | in_range = 0; | 877 | in_range = 0; |
| 815 | } | 878 | } |
| @@ -820,12 +883,20 @@ print_partial_compiled_pattern (start, end) | |||
| 820 | last = c; | 883 | last = c; |
| 821 | } | 884 | } |
| 822 | 885 | ||
| 886 | p += 1 + length; | ||
| 887 | |||
| 823 | if (in_range) | 888 | if (in_range) |
| 824 | putchar (last); | 889 | putchar (last); |
| 825 | 890 | ||
| 826 | putchar (']'); | 891 | putchar (']'); |
| 827 | 892 | ||
| 828 | p += 1 + *p; | 893 | if (has_range_table) |
| 894 | printf ("has-range-table"); | ||
| 895 | |||
| 896 | /* ??? Should print the range table; for now, | ||
| 897 | just skip it. */ | ||
| 898 | if (has_range_table) | ||
| 899 | p += 4 + 6 * range_length; | ||
| 829 | } | 900 | } |
| 830 | break; | 901 | break; |
| 831 | 902 | ||
| @@ -1710,6 +1781,7 @@ struct range_table_work_area | |||
| 1710 | int *table; /* actual work area. */ | 1781 | int *table; /* actual work area. */ |
| 1711 | int allocated; /* allocated size for work area in bytes. */ | 1782 | int allocated; /* allocated size for work area in bytes. */ |
| 1712 | int used; /* actually used size in words. */ | 1783 | int used; /* actually used size in words. */ |
| 1784 | int bits; /* flag to record character classes */ | ||
| 1713 | }; | 1785 | }; |
| 1714 | 1786 | ||
| 1715 | /* Make sure that WORK_AREA can hold more N multibyte characters. */ | 1787 | /* Make sure that WORK_AREA can hold more N multibyte characters. */ |
| @@ -1729,6 +1801,21 @@ struct range_table_work_area | |||
| 1729 | } \ | 1801 | } \ |
| 1730 | } while (0) | 1802 | } while (0) |
| 1731 | 1803 | ||
| 1804 | #define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit) \ | ||
| 1805 | (work_area).bits |= (bit) | ||
| 1806 | |||
| 1807 | /* These bits represent the various character classes such as [:alnum:] | ||
| 1808 | in a charset's range table. */ | ||
| 1809 | #define BIT_ALNUM 0x1 | ||
| 1810 | #define BIT_ALPHA 0x2 | ||
| 1811 | #define BIT_WORD 0x4 | ||
| 1812 | #define BIT_GRAPH 0x20 | ||
| 1813 | #define BIT_LOWER 0x40 | ||
| 1814 | #define BIT_PRINT 0x80 | ||
| 1815 | #define BIT_PUNCT 0x100 | ||
| 1816 | #define BIT_SPACE 0x200 | ||
| 1817 | #define BIT_UPPER 0x400 | ||
| 1818 | |||
| 1732 | /* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */ | 1819 | /* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */ |
| 1733 | #define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \ | 1820 | #define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \ |
| 1734 | do { \ | 1821 | do { \ |
| @@ -1744,8 +1831,9 @@ struct range_table_work_area | |||
| 1744 | free ((work_area).table); \ | 1831 | free ((work_area).table); \ |
| 1745 | } while (0) | 1832 | } while (0) |
| 1746 | 1833 | ||
| 1747 | #define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0) | 1834 | #define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0, (work_area).bits = 0) |
| 1748 | #define RANGE_TABLE_WORK_USED(work_area) ((work_area).used) | 1835 | #define RANGE_TABLE_WORK_USED(work_area) ((work_area).used) |
| 1836 | #define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits) | ||
| 1749 | #define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i]) | 1837 | #define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i]) |
| 1750 | 1838 | ||
| 1751 | 1839 | ||
| @@ -1780,7 +1868,8 @@ struct range_table_work_area | |||
| 1780 | || STREQ (string, "alnum") || STREQ (string, "xdigit") \ | 1868 | || STREQ (string, "alnum") || STREQ (string, "xdigit") \ |
| 1781 | || STREQ (string, "space") || STREQ (string, "print") \ | 1869 | || STREQ (string, "space") || STREQ (string, "print") \ |
| 1782 | || STREQ (string, "punct") || STREQ (string, "graph") \ | 1870 | || STREQ (string, "punct") || STREQ (string, "graph") \ |
| 1783 | || STREQ (string, "cntrl") || STREQ (string, "blank")) | 1871 | || STREQ (string, "cntrl") || STREQ (string, "blank") \ |
| 1872 | || STREQ (string, "word")) | ||
| 1784 | 1873 | ||
| 1785 | #ifndef MATCH_MAY_ALLOCATE | 1874 | #ifndef MATCH_MAY_ALLOCATE |
| 1786 | 1875 | ||
| @@ -2281,6 +2370,7 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 2281 | boolean is_space = STREQ (str, "space"); | 2370 | boolean is_space = STREQ (str, "space"); |
| 2282 | boolean is_upper = STREQ (str, "upper"); | 2371 | boolean is_upper = STREQ (str, "upper"); |
| 2283 | boolean is_xdigit = STREQ (str, "xdigit"); | 2372 | boolean is_xdigit = STREQ (str, "xdigit"); |
| 2373 | boolean is_word = STREQ (str, "word"); | ||
| 2284 | 2374 | ||
| 2285 | if (!IS_CHAR_CLASS (str)) | 2375 | if (!IS_CHAR_CLASS (str)) |
| 2286 | FREE_STACK_RETURN (REG_ECTYPE); | 2376 | FREE_STACK_RETURN (REG_ECTYPE); |
| @@ -2291,6 +2381,31 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 2291 | 2381 | ||
| 2292 | if (p == pend) FREE_STACK_RETURN (REG_EBRACK); | 2382 | if (p == pend) FREE_STACK_RETURN (REG_EBRACK); |
| 2293 | 2383 | ||
| 2384 | /* Most character classes in a multibyte match | ||
| 2385 | just set a flag. Exceptions are is_blank, | ||
| 2386 | is_digit, is_cntrl, and is_xdigit, since | ||
| 2387 | they can only match ASCII characters. We | ||
| 2388 | don't need to handle them for multibyte. */ | ||
| 2389 | |||
| 2390 | if (bufp->multibyte) | ||
| 2391 | { | ||
| 2392 | int bit = 0; | ||
| 2393 | |||
| 2394 | if (is_alnum) bit = BIT_ALNUM; | ||
| 2395 | if (is_alpha) bit = BIT_ALPHA; | ||
| 2396 | if (is_graph) bit = BIT_GRAPH; | ||
| 2397 | if (is_lower) bit = BIT_LOWER; | ||
| 2398 | if (is_print) bit = BIT_PRINT; | ||
| 2399 | if (is_punct) bit = BIT_PUNCT; | ||
| 2400 | if (is_space) bit = BIT_SPACE; | ||
| 2401 | if (is_upper) bit = BIT_UPPER; | ||
| 2402 | if (is_word) bit = BIT_WORD; | ||
| 2403 | if (bit) | ||
| 2404 | SET_RANGE_TABLE_WORK_AREA_BIT (range_table_work, | ||
| 2405 | bit); | ||
| 2406 | } | ||
| 2407 | |||
| 2408 | /* Handle character classes for ASCII characters. */ | ||
| 2294 | for (ch = 0; ch < 1 << BYTEWIDTH; ch++) | 2409 | for (ch = 0; ch < 1 << BYTEWIDTH; ch++) |
| 2295 | { | 2410 | { |
| 2296 | int translated = TRANSLATE (ch); | 2411 | int translated = TRANSLATE (ch); |
| @@ -2311,6 +2426,8 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 2311 | || (is_upper && ISUPPER (ch)) | 2426 | || (is_upper && ISUPPER (ch)) |
| 2312 | || (is_xdigit && ISXDIGIT (ch))) | 2427 | || (is_xdigit && ISXDIGIT (ch))) |
| 2313 | SET_LIST_BIT (translated); | 2428 | SET_LIST_BIT (translated); |
| 2429 | if ( (is_word && ISWORD (ch))) | ||
| 2430 | SET_LIST_BIT (translated); | ||
| 2314 | } | 2431 | } |
| 2315 | 2432 | ||
| 2316 | /* Repeat the loop. */ | 2433 | /* Repeat the loop. */ |
| @@ -2395,19 +2512,26 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 2395 | b[-1]--; | 2512 | b[-1]--; |
| 2396 | b += b[-1]; | 2513 | b += b[-1]; |
| 2397 | 2514 | ||
| 2398 | /* Build real range table from work area. */ | 2515 | /* Build real range table from work area. */ |
| 2399 | if (RANGE_TABLE_WORK_USED (range_table_work)) | 2516 | if (RANGE_TABLE_WORK_USED (range_table_work) |
| 2517 | || RANGE_TABLE_WORK_BITS (range_table_work)) | ||
| 2400 | { | 2518 | { |
| 2401 | int i; | 2519 | int i; |
| 2402 | int used = RANGE_TABLE_WORK_USED (range_table_work); | 2520 | int used = RANGE_TABLE_WORK_USED (range_table_work); |
| 2403 | 2521 | ||
| 2404 | /* Allocate space for COUNT + RANGE_TABLE. Needs two | 2522 | /* Allocate space for COUNT + RANGE_TABLE. Needs two |
| 2405 | bytes for COUNT and three bytes for each character. */ | 2523 | bytes for flags, two for COUNT, and three bytes for |
| 2406 | GET_BUFFER_SPACE (2 + used * 3); | 2524 | each character. */ |
| 2525 | GET_BUFFER_SPACE (4 + used * 3); | ||
| 2407 | 2526 | ||
| 2408 | /* Indicate the existence of range table. */ | 2527 | /* Indicate the existence of range table. */ |
| 2409 | laststart[1] |= 0x80; | 2528 | laststart[1] |= 0x80; |
| 2410 | 2529 | ||
| 2530 | /* Store the character class flag bits into the range table. | ||
| 2531 | If not in emacs, these flag bits are always 0. */ | ||
| 2532 | *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff; | ||
| 2533 | *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8; | ||
| 2534 | |||
| 2411 | STORE_NUMBER_AND_INCR (b, used / 2); | 2535 | STORE_NUMBER_AND_INCR (b, used / 2); |
| 2412 | for (i = 0; i < used; i++) | 2536 | for (i = 0; i < used; i++) |
| 2413 | STORE_CHARACTER_AND_INCR | 2537 | STORE_CHARACTER_AND_INCR |
| @@ -3161,6 +3285,10 @@ group_in_compile_stack (compile_stack, regnum) | |||
| 3161 | characters can start a string that matches the pattern. This fastmap | 3285 | characters can start a string that matches the pattern. This fastmap |
| 3162 | is used by re_search to skip quickly over impossible starting points. | 3286 | is used by re_search to skip quickly over impossible starting points. |
| 3163 | 3287 | ||
| 3288 | Character codes above (1 << BYTEWIDTH) are not represented in the | ||
| 3289 | fastmap, but the leading codes are represented. Thus, the fastmap | ||
| 3290 | indicates which character sets could start a match. | ||
| 3291 | |||
| 3164 | The caller must supply the address of a (1 << BYTEWIDTH)-byte data | 3292 | The caller must supply the address of a (1 << BYTEWIDTH)-byte data |
| 3165 | area as BUFP->fastmap. | 3293 | area as BUFP->fastmap. |
| 3166 | 3294 | ||
| @@ -3262,22 +3390,30 @@ re_compile_fastmap (bufp) | |||
| 3262 | 3390 | ||
| 3263 | #ifndef emacs | 3391 | #ifndef emacs |
| 3264 | case charset: | 3392 | case charset: |
| 3265 | for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) | 3393 | { |
| 3266 | if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) | 3394 | int length = (*p & 0x7f);; |
| 3267 | fastmap[j] = 1; | 3395 | p++; |
| 3268 | break; | ||
| 3269 | 3396 | ||
| 3397 | for (j = length * BYTEWIDTH - 1; j >= 0; j--) | ||
| 3398 | if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) | ||
| 3399 | fastmap[j] = 1; | ||
| 3400 | } | ||
| 3401 | break; | ||
| 3270 | 3402 | ||
| 3271 | case charset_not: | 3403 | case charset_not: |
| 3272 | /* Chars beyond end of map must be allowed. */ | 3404 | /* Chars beyond end of map must be allowed. */ |
| 3273 | for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++) | 3405 | { |
| 3274 | fastmap[j] = 1; | 3406 | int length = (*p & 0x7f);; |
| 3407 | p++; | ||
| 3275 | 3408 | ||
| 3276 | for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) | 3409 | for (j = length * BYTEWIDTH; j < (1 << BYTEWIDTH); j++) |
| 3277 | if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))) | ||
| 3278 | fastmap[j] = 1; | 3410 | fastmap[j] = 1; |
| 3279 | break; | ||
| 3280 | 3411 | ||
| 3412 | for (j = length * BYTEWIDTH - 1; j >= 0; j--) | ||
| 3413 | if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))) | ||
| 3414 | fastmap[j] = 1; | ||
| 3415 | } | ||
| 3416 | break; | ||
| 3281 | 3417 | ||
| 3282 | case wordchar: | 3418 | case wordchar: |
| 3283 | for (j = 0; j < (1 << BYTEWIDTH); j++) | 3419 | for (j = 0; j < (1 << BYTEWIDTH); j++) |
| @@ -3298,6 +3434,12 @@ re_compile_fastmap (bufp) | |||
| 3298 | if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) | 3434 | if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) |
| 3299 | fastmap[j] = 1; | 3435 | fastmap[j] = 1; |
| 3300 | 3436 | ||
| 3437 | /* If we can match a syntax class, we can match | ||
| 3438 | any character set. */ | ||
| 3439 | if (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2]) | ||
| 3440 | && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0) | ||
| 3441 | goto set_fastmap_for_multibyte_characters; | ||
| 3442 | |||
| 3301 | if (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2]) | 3443 | if (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2]) |
| 3302 | && match_any_multibyte_characters == false) | 3444 | && match_any_multibyte_characters == false) |
| 3303 | { | 3445 | { |
| @@ -4617,26 +4759,30 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) | |||
| 4617 | range table. */ | 4759 | range table. */ |
| 4618 | unsigned char *range_table; | 4760 | unsigned char *range_table; |
| 4619 | 4761 | ||
| 4620 | /* Nonzero if there is range table. */ | 4762 | /* Nonzero if there is a range table. */ |
| 4621 | int range_table_exists; | 4763 | int range_table_exists; |
| 4622 | 4764 | ||
| 4623 | /* Number of ranges of range table. Not in bytes. */ | 4765 | /* Number of ranges of range table. This is not included |
| 4624 | int count; | 4766 | in the initial byte-length of the command. */ |
| 4767 | int count = 0; | ||
| 4625 | 4768 | ||
| 4626 | DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : ""); | 4769 | DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : ""); |
| 4627 | 4770 | ||
| 4628 | PREFETCH (); | 4771 | PREFETCH (); |
| 4629 | c = (unsigned char) *d; | 4772 | c = (unsigned char) *d; |
| 4630 | 4773 | ||
| 4631 | range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */ | ||
| 4632 | range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]); | 4774 | range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]); |
| 4775 | |||
| 4776 | #ifdef emacs | ||
| 4633 | if (range_table_exists) | 4777 | if (range_table_exists) |
| 4634 | EXTRACT_NUMBER_AND_INCR (count, range_table); | 4778 | { |
| 4635 | else | 4779 | range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */ |
| 4636 | count = 0; | 4780 | EXTRACT_NUMBER_AND_INCR (count, range_table); |
| 4781 | } | ||
| 4637 | 4782 | ||
| 4638 | if (multibyte && BASE_LEADING_CODE_P (c)) | 4783 | if (multibyte && BASE_LEADING_CODE_P (c)) |
| 4639 | c = STRING_CHAR_AND_LENGTH (d, dend - d, len); | 4784 | c = STRING_CHAR_AND_LENGTH (d, dend - d, len); |
| 4785 | #endif /* emacs */ | ||
| 4640 | 4786 | ||
| 4641 | if (SINGLE_BYTE_CHAR_P (c)) | 4787 | if (SINGLE_BYTE_CHAR_P (c)) |
| 4642 | { /* Lookup bitmap. */ | 4788 | { /* Lookup bitmap. */ |
| @@ -4646,13 +4792,33 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) | |||
| 4646 | /* Cast to `unsigned' instead of `unsigned char' in | 4792 | /* Cast to `unsigned' instead of `unsigned char' in |
| 4647 | case the bit list is a full 32 bytes long. */ | 4793 | case the bit list is a full 32 bytes long. */ |
| 4648 | if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH) | 4794 | if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH) |
| 4649 | && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) | 4795 | && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) |
| 4650 | not = !not; | 4796 | not = !not; |
| 4651 | } | 4797 | } |
| 4798 | #ifdef emacs | ||
| 4652 | else if (range_table_exists) | 4799 | else if (range_table_exists) |
| 4653 | CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count); | 4800 | { |
| 4801 | int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]); | ||
| 4802 | |||
| 4803 | if ( (class_bits & BIT_ALNUM && ISALNUM (c)) | ||
| 4804 | | (class_bits & BIT_ALPHA && ISALPHA (c)) | ||
| 4805 | | (class_bits & BIT_GRAPH && ISGRAPH (c)) | ||
| 4806 | | (class_bits & BIT_LOWER && ISLOWER (c)) | ||
| 4807 | | (class_bits & BIT_PRINT && ISPRINT (c)) | ||
| 4808 | | (class_bits & BIT_PUNCT && ISPUNCT (c)) | ||
| 4809 | | (class_bits & BIT_SPACE && ISSPACE (c)) | ||
| 4810 | | (class_bits & BIT_UPPER && ISUPPER (c)) | ||
| 4811 | | (class_bits & BIT_WORD && ISWORD (c))) | ||
| 4812 | not = !not; | ||
| 4813 | else | ||
| 4814 | CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count); | ||
| 4815 | } | ||
| 4816 | #endif /* emacs */ | ||
| 4654 | 4817 | ||
| 4655 | p = CHARSET_RANGE_TABLE_END (range_table, count); | 4818 | if (range_table_exists) |
| 4819 | p = CHARSET_RANGE_TABLE_END (range_table, count); | ||
| 4820 | else | ||
| 4821 | p += CHARSET_BITMAP_SIZE (&p[-1]) + 1; | ||
| 4656 | 4822 | ||
| 4657 | if (!not) goto fail; | 4823 | if (!not) goto fail; |
| 4658 | 4824 | ||