aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/regex.c236
1 files changed, 201 insertions, 35 deletions
diff --git a/src/regex.c b/src/regex.c
index 3f951afe637..450850609a6 100644
--- a/src/regex.c
+++ b/src/regex.c
@@ -191,9 +191,6 @@ init_syntax_once ()
191/* Get the interface, including the syntax bits. */ 191/* Get the interface, including the syntax bits. */
192#include "regex.h" 192#include "regex.h"
193 193
194/* isalpha etc. are used for the character classes. */
195#include <ctype.h>
196
197/* Jim Meyering writes: 194/* Jim Meyering writes:
198 195
199 "... Some ctype macros are valid only for character codes that 196 "... Some ctype macros are valid only for character codes that
@@ -211,6 +208,51 @@ init_syntax_once ()
211#define ISASCII(c) isascii(c) 208#define ISASCII(c) isascii(c)
212#endif 209#endif
213 210
211/* isalpha etc. are used for the character classes. */
212#include <ctype.h>
213
214/* In Emacs, these are only used for single-byte characters. */
215#define ISDIGIT(c) (ISASCII (c) && isdigit (c))
216#define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
217#define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
218
219#ifdef emacs
220
221/* This is only used for single-byte characters. */
222#define ISBLANK(c) ((c) == ' ' || (c) == '\t')
223
224/* The rest must handle multibyte characters. */
225
226#define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
227 ? ISASCII (c) && isprint (c) && !isspace (c) \
228 : 1)
229
230#define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
231 ? ISASCII (c) && isalnum (c) \
232 : 1)
233
234#define ISALNUM(c) (SINGLE_BYTE_CHAR_P (c) \
235 ? ISASCII (c) && isalnum (c) \
236 : SYNTAX (c) == Sword)
237
238#define ISALPHA(c) (SINGLE_BYTE_CHAR_P (c) \
239 ? ISASCII (c) && isalpha (c) \
240 : SYNTAX (c) == Sword)
241
242#define ISLOWER(c) (LOWERCASEP (c))
243
244#define ISPUNCT(c) (SINGLE_BYTE_CHAR_P (c) \
245 ? ISASCII (c) && ispunct (c) \
246 : SYNTAX (c) != Sword)
247
248#define ISSPACE(c) (SYNTAX (c) == Swhitespace)
249
250#define ISUPPER(c) (UPPERCASEP (c))
251
252#define ISWORD(c) (SYNTAX (c) == Sword)
253
254#else /* not emacs */
255
214#ifdef isblank 256#ifdef isblank
215#define ISBLANK(c) (ISASCII (c) && isblank (c)) 257#define ISBLANK(c) (ISASCII (c) && isblank (c))
216#else 258#else
@@ -233,6 +275,10 @@ init_syntax_once ()
233#define ISUPPER(c) (ISASCII (c) && isupper (c)) 275#define ISUPPER(c) (ISASCII (c) && isupper (c))
234#define ISXDIGIT(c) (ISASCII (c) && isxdigit (c)) 276#define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
235 277
278#define ISWORD(c) ISALPHA(c)
279
280#endif /* not emacs */
281
236#ifndef NULL 282#ifndef NULL
237#define NULL (void *)0 283#define NULL (void *)0
238#endif 284#endif
@@ -383,7 +429,15 @@ typedef enum
383 for a bitmap saying which chars are in. Bits in each byte 429 for a bitmap saying which chars are in. Bits in each byte
384 are ordered low-bit-first. A character is in the set if its 430 are ordered low-bit-first. A character is in the set if its
385 bit is 1. A character too large to have a bit in the map is 431 bit is 1. A character too large to have a bit in the map is
386 automatically not in the set. */ 432 automatically not in the set.
433
434 If the length byte has the 0x80 bit set, then that stuff
435 is followed by a range table:
436 2 bytes of flags for character sets (low 8 bits, high 8 bits)
437 See RANGE_TABLE_WORK_BITS below.
438 2 bytes, the number of pairs that follow
439 pairs, each 2 multibyte characters,
440 each multibyte character represented as 3 bytes. */
387 charset, 441 charset,
388 442
389 /* Same parameters as charset, but match any character that is 443 /* Same parameters as charset, but match any character that is
@@ -617,8 +671,14 @@ extract_number_and_incr (destination, source)
617 671
618/* Return the address of range table of charset P. But not the start 672/* Return the address of range table of charset P. But not the start
619 of table itself, but the before where the number of ranges is 673 of table itself, but the before where the number of ranges is
620 stored. `2 +' means to skip re_opcode_t and size of bitmap. */ 674 stored. `2 +' means to skip re_opcode_t and size of bitmap,
621#define CHARSET_RANGE_TABLE(p) (&(p)[2 + CHARSET_BITMAP_SIZE (p)]) 675 and the 2 bytes of flags at the start of the range table. */
676#define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
677
678/* Extract the bit flags that start a range table. */
679#define CHARSET_RANGE_TABLE_BITS(p) \
680 ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
681 + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
622 682
623/* Test if C is listed in the bitmap of charset P. */ 683/* Test if C is listed in the bitmap of charset P. */
624#define CHARSET_LOOKUP_BITMAP(p, c) \ 684#define CHARSET_LOOKUP_BITMAP(p, c) \
@@ -791,6 +851,9 @@ print_partial_compiled_pattern (start, end)
791 { 851 {
792 register int c, last = -100; 852 register int c, last = -100;
793 register int in_range = 0; 853 register int in_range = 0;
854 int length = *p & 0x7f;
855 int has_range_table = *p & 0x80;
856 int range_length = p[length + 2] + p[length + 3] * 0x100;
794 857
795 printf ("/charset [%s", 858 printf ("/charset [%s",
796 (re_opcode_t) *(p - 1) == charset_not ? "^" : ""); 859 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
@@ -798,7 +861,7 @@ print_partial_compiled_pattern (start, end)
798 assert (p + *p < pend); 861 assert (p + *p < pend);
799 862
800 for (c = 0; c < 256; c++) 863 for (c = 0; c < 256; c++)
801 if (c / 8 < *p 864 if (c / 8 < length
802 && (p[1 + (c/8)] & (1 << (c % 8)))) 865 && (p[1 + (c/8)] & (1 << (c % 8))))
803 { 866 {
804 /* Are we starting a range? */ 867 /* Are we starting a range? */
@@ -809,7 +872,7 @@ print_partial_compiled_pattern (start, end)
809 } 872 }
810 /* Have we broken a range? */ 873 /* Have we broken a range? */
811 else if (last + 1 != c && in_range) 874 else if (last + 1 != c && in_range)
812 { 875 {
813 putchar (last); 876 putchar (last);
814 in_range = 0; 877 in_range = 0;
815 } 878 }
@@ -820,12 +883,20 @@ print_partial_compiled_pattern (start, end)
820 last = c; 883 last = c;
821 } 884 }
822 885
886 p += 1 + length;
887
823 if (in_range) 888 if (in_range)
824 putchar (last); 889 putchar (last);
825 890
826 putchar (']'); 891 putchar (']');
827 892
828 p += 1 + *p; 893 if (has_range_table)
894 printf ("has-range-table");
895
896 /* ??? Should print the range table; for now,
897 just skip it. */
898 if (has_range_table)
899 p += 4 + 6 * range_length;
829 } 900 }
830 break; 901 break;
831 902
@@ -1710,6 +1781,7 @@ struct range_table_work_area
1710 int *table; /* actual work area. */ 1781 int *table; /* actual work area. */
1711 int allocated; /* allocated size for work area in bytes. */ 1782 int allocated; /* allocated size for work area in bytes. */
1712 int used; /* actually used size in words. */ 1783 int used; /* actually used size in words. */
1784 int bits; /* flag to record character classes */
1713}; 1785};
1714 1786
1715/* Make sure that WORK_AREA can hold more N multibyte characters. */ 1787/* Make sure that WORK_AREA can hold more N multibyte characters. */
@@ -1729,6 +1801,21 @@ struct range_table_work_area
1729 } \ 1801 } \
1730 } while (0) 1802 } while (0)
1731 1803
1804#define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit) \
1805 (work_area).bits |= (bit)
1806
1807/* These bits represent the various character classes such as [:alnum:]
1808 in a charset's range table. */
1809#define BIT_ALNUM 0x1
1810#define BIT_ALPHA 0x2
1811#define BIT_WORD 0x4
1812#define BIT_GRAPH 0x20
1813#define BIT_LOWER 0x40
1814#define BIT_PRINT 0x80
1815#define BIT_PUNCT 0x100
1816#define BIT_SPACE 0x200
1817#define BIT_UPPER 0x400
1818
1732/* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */ 1819/* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */
1733#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \ 1820#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \
1734 do { \ 1821 do { \
@@ -1744,8 +1831,9 @@ struct range_table_work_area
1744 free ((work_area).table); \ 1831 free ((work_area).table); \
1745 } while (0) 1832 } while (0)
1746 1833
1747#define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0) 1834#define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0, (work_area).bits = 0)
1748#define RANGE_TABLE_WORK_USED(work_area) ((work_area).used) 1835#define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
1836#define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
1749#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i]) 1837#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
1750 1838
1751 1839
@@ -1780,7 +1868,8 @@ struct range_table_work_area
1780 || STREQ (string, "alnum") || STREQ (string, "xdigit") \ 1868 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
1781 || STREQ (string, "space") || STREQ (string, "print") \ 1869 || STREQ (string, "space") || STREQ (string, "print") \
1782 || STREQ (string, "punct") || STREQ (string, "graph") \ 1870 || STREQ (string, "punct") || STREQ (string, "graph") \
1783 || STREQ (string, "cntrl") || STREQ (string, "blank")) 1871 || STREQ (string, "cntrl") || STREQ (string, "blank") \
1872 || STREQ (string, "word"))
1784 1873
1785#ifndef MATCH_MAY_ALLOCATE 1874#ifndef MATCH_MAY_ALLOCATE
1786 1875
@@ -2281,6 +2370,7 @@ regex_compile (pattern, size, syntax, bufp)
2281 boolean is_space = STREQ (str, "space"); 2370 boolean is_space = STREQ (str, "space");
2282 boolean is_upper = STREQ (str, "upper"); 2371 boolean is_upper = STREQ (str, "upper");
2283 boolean is_xdigit = STREQ (str, "xdigit"); 2372 boolean is_xdigit = STREQ (str, "xdigit");
2373 boolean is_word = STREQ (str, "word");
2284 2374
2285 if (!IS_CHAR_CLASS (str)) 2375 if (!IS_CHAR_CLASS (str))
2286 FREE_STACK_RETURN (REG_ECTYPE); 2376 FREE_STACK_RETURN (REG_ECTYPE);
@@ -2291,6 +2381,31 @@ regex_compile (pattern, size, syntax, bufp)
2291 2381
2292 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 2382 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2293 2383
2384 /* Most character classes in a multibyte match
2385 just set a flag. Exceptions are is_blank,
2386 is_digit, is_cntrl, and is_xdigit, since
2387 they can only match ASCII characters. We
2388 don't need to handle them for multibyte. */
2389
2390 if (bufp->multibyte)
2391 {
2392 int bit = 0;
2393
2394 if (is_alnum) bit = BIT_ALNUM;
2395 if (is_alpha) bit = BIT_ALPHA;
2396 if (is_graph) bit = BIT_GRAPH;
2397 if (is_lower) bit = BIT_LOWER;
2398 if (is_print) bit = BIT_PRINT;
2399 if (is_punct) bit = BIT_PUNCT;
2400 if (is_space) bit = BIT_SPACE;
2401 if (is_upper) bit = BIT_UPPER;
2402 if (is_word) bit = BIT_WORD;
2403 if (bit)
2404 SET_RANGE_TABLE_WORK_AREA_BIT (range_table_work,
2405 bit);
2406 }
2407
2408 /* Handle character classes for ASCII characters. */
2294 for (ch = 0; ch < 1 << BYTEWIDTH; ch++) 2409 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
2295 { 2410 {
2296 int translated = TRANSLATE (ch); 2411 int translated = TRANSLATE (ch);
@@ -2311,6 +2426,8 @@ regex_compile (pattern, size, syntax, bufp)
2311 || (is_upper && ISUPPER (ch)) 2426 || (is_upper && ISUPPER (ch))
2312 || (is_xdigit && ISXDIGIT (ch))) 2427 || (is_xdigit && ISXDIGIT (ch)))
2313 SET_LIST_BIT (translated); 2428 SET_LIST_BIT (translated);
2429 if ( (is_word && ISWORD (ch)))
2430 SET_LIST_BIT (translated);
2314 } 2431 }
2315 2432
2316 /* Repeat the loop. */ 2433 /* Repeat the loop. */
@@ -2395,19 +2512,26 @@ regex_compile (pattern, size, syntax, bufp)
2395 b[-1]--; 2512 b[-1]--;
2396 b += b[-1]; 2513 b += b[-1];
2397 2514
2398 /* Build real range table from work area. */ 2515 /* Build real range table from work area. */
2399 if (RANGE_TABLE_WORK_USED (range_table_work)) 2516 if (RANGE_TABLE_WORK_USED (range_table_work)
2517 || RANGE_TABLE_WORK_BITS (range_table_work))
2400 { 2518 {
2401 int i; 2519 int i;
2402 int used = RANGE_TABLE_WORK_USED (range_table_work); 2520 int used = RANGE_TABLE_WORK_USED (range_table_work);
2403 2521
2404 /* Allocate space for COUNT + RANGE_TABLE. Needs two 2522 /* Allocate space for COUNT + RANGE_TABLE. Needs two
2405 bytes for COUNT and three bytes for each character. */ 2523 bytes for flags, two for COUNT, and three bytes for
2406 GET_BUFFER_SPACE (2 + used * 3); 2524 each character. */
2525 GET_BUFFER_SPACE (4 + used * 3);
2407 2526
2408 /* Indicate the existence of range table. */ 2527 /* Indicate the existence of range table. */
2409 laststart[1] |= 0x80; 2528 laststart[1] |= 0x80;
2410 2529
2530 /* Store the character class flag bits into the range table.
2531 If not in emacs, these flag bits are always 0. */
2532 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
2533 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
2534
2411 STORE_NUMBER_AND_INCR (b, used / 2); 2535 STORE_NUMBER_AND_INCR (b, used / 2);
2412 for (i = 0; i < used; i++) 2536 for (i = 0; i < used; i++)
2413 STORE_CHARACTER_AND_INCR 2537 STORE_CHARACTER_AND_INCR
@@ -3161,6 +3285,10 @@ group_in_compile_stack (compile_stack, regnum)
3161 characters can start a string that matches the pattern. This fastmap 3285 characters can start a string that matches the pattern. This fastmap
3162 is used by re_search to skip quickly over impossible starting points. 3286 is used by re_search to skip quickly over impossible starting points.
3163 3287
3288 Character codes above (1 << BYTEWIDTH) are not represented in the
3289 fastmap, but the leading codes are represented. Thus, the fastmap
3290 indicates which character sets could start a match.
3291
3164 The caller must supply the address of a (1 << BYTEWIDTH)-byte data 3292 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
3165 area as BUFP->fastmap. 3293 area as BUFP->fastmap.
3166 3294
@@ -3262,22 +3390,30 @@ re_compile_fastmap (bufp)
3262 3390
3263#ifndef emacs 3391#ifndef emacs
3264 case charset: 3392 case charset:
3265 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) 3393 {
3266 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) 3394 int length = (*p & 0x7f);;
3267 fastmap[j] = 1; 3395 p++;
3268 break;
3269 3396
3397 for (j = length * BYTEWIDTH - 1; j >= 0; j--)
3398 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
3399 fastmap[j] = 1;
3400 }
3401 break;
3270 3402
3271 case charset_not: 3403 case charset_not:
3272 /* Chars beyond end of map must be allowed. */ 3404 /* Chars beyond end of map must be allowed. */
3273 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++) 3405 {
3274 fastmap[j] = 1; 3406 int length = (*p & 0x7f);;
3407 p++;
3275 3408
3276 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) 3409 for (j = length * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
3277 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
3278 fastmap[j] = 1; 3410 fastmap[j] = 1;
3279 break;
3280 3411
3412 for (j = length * BYTEWIDTH - 1; j >= 0; j--)
3413 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
3414 fastmap[j] = 1;
3415 }
3416 break;
3281 3417
3282 case wordchar: 3418 case wordchar:
3283 for (j = 0; j < (1 << BYTEWIDTH); j++) 3419 for (j = 0; j < (1 << BYTEWIDTH); j++)
@@ -3298,6 +3434,12 @@ re_compile_fastmap (bufp)
3298 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) 3434 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
3299 fastmap[j] = 1; 3435 fastmap[j] = 1;
3300 3436
3437 /* If we can match a syntax class, we can match
3438 any character set. */
3439 if (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3440 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0)
3441 goto set_fastmap_for_multibyte_characters;
3442
3301 if (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2]) 3443 if (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3302 && match_any_multibyte_characters == false) 3444 && match_any_multibyte_characters == false)
3303 { 3445 {
@@ -4617,26 +4759,30 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
4617 range table. */ 4759 range table. */
4618 unsigned char *range_table; 4760 unsigned char *range_table;
4619 4761
4620 /* Nonzero if there is range table. */ 4762 /* Nonzero if there is a range table. */
4621 int range_table_exists; 4763 int range_table_exists;
4622 4764
4623 /* Number of ranges of range table. Not in bytes. */ 4765 /* Number of ranges of range table. This is not included
4624 int count; 4766 in the initial byte-length of the command. */
4767 int count = 0;
4625 4768
4626 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : ""); 4769 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
4627 4770
4628 PREFETCH (); 4771 PREFETCH ();
4629 c = (unsigned char) *d; 4772 c = (unsigned char) *d;
4630 4773
4631 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
4632 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]); 4774 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
4775
4776#ifdef emacs
4633 if (range_table_exists) 4777 if (range_table_exists)
4634 EXTRACT_NUMBER_AND_INCR (count, range_table); 4778 {
4635 else 4779 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
4636 count = 0; 4780 EXTRACT_NUMBER_AND_INCR (count, range_table);
4781 }
4637 4782
4638 if (multibyte && BASE_LEADING_CODE_P (c)) 4783 if (multibyte && BASE_LEADING_CODE_P (c))
4639 c = STRING_CHAR_AND_LENGTH (d, dend - d, len); 4784 c = STRING_CHAR_AND_LENGTH (d, dend - d, len);
4785#endif /* emacs */
4640 4786
4641 if (SINGLE_BYTE_CHAR_P (c)) 4787 if (SINGLE_BYTE_CHAR_P (c))
4642 { /* Lookup bitmap. */ 4788 { /* Lookup bitmap. */
@@ -4646,13 +4792,33 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
4646 /* Cast to `unsigned' instead of `unsigned char' in 4792 /* Cast to `unsigned' instead of `unsigned char' in
4647 case the bit list is a full 32 bytes long. */ 4793 case the bit list is a full 32 bytes long. */
4648 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH) 4794 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
4649 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) 4795 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4650 not = !not; 4796 not = !not;
4651 } 4797 }
4798#ifdef emacs
4652 else if (range_table_exists) 4799 else if (range_table_exists)
4653 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count); 4800 {
4801 int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
4802
4803 if ( (class_bits & BIT_ALNUM && ISALNUM (c))
4804 | (class_bits & BIT_ALPHA && ISALPHA (c))
4805 | (class_bits & BIT_GRAPH && ISGRAPH (c))
4806 | (class_bits & BIT_LOWER && ISLOWER (c))
4807 | (class_bits & BIT_PRINT && ISPRINT (c))
4808 | (class_bits & BIT_PUNCT && ISPUNCT (c))
4809 | (class_bits & BIT_SPACE && ISSPACE (c))
4810 | (class_bits & BIT_UPPER && ISUPPER (c))
4811 | (class_bits & BIT_WORD && ISWORD (c)))
4812 not = !not;
4813 else
4814 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
4815 }
4816#endif /* emacs */
4654 4817
4655 p = CHARSET_RANGE_TABLE_END (range_table, count); 4818 if (range_table_exists)
4819 p = CHARSET_RANGE_TABLE_END (range_table, count);
4820 else
4821 p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
4656 4822
4657 if (!not) goto fail; 4823 if (!not) goto fail;
4658 4824