diff options
| author | Dave Love | 1999-10-06 18:25:22 +0000 |
|---|---|---|
| committer | Dave Love | 1999-10-06 18:25:22 +0000 |
| commit | f71b19b645a2e8a0090f4c97a4d3bc9e892043c5 (patch) | |
| tree | d8dd3243d03d4019910c4667f3e2c3944479c1ae /src | |
| parent | 93548d2e69a40bf8056b05af5ffb36a17d96a4b6 (diff) | |
| download | emacs-f71b19b645a2e8a0090f4c97a4d3bc9e892043c5.tar.gz emacs-f71b19b645a2e8a0090f4c97a4d3bc9e892043c5.zip | |
1999-09-04 Richard M. Stallman <rms@gnu.org>
* regex.c [emacs] (ISALNUM, ISALPHA, ISPUNCT): Don't depend on locale.
[emacs] (ISASCII): Don't define ISASCII in this case.
(IS_REAL_ASCII): New macro, 2 alternate definitions.
(ISUNIBYTE): Likewise.
[emacs] (ISDIGIT, ISCNTRL, ISXDIGIT, ISGRAPH, ISPRINT):
Don't use ISASCII.
* regex.c: Handle new class names `ascii', `nonascii',
`unibyte', `multibyte'.
(BIT_ASCII, BIT_NONASCII, BIT_UNIBYTE, BIT_MULTIBYTE): New macros.
(IS_CHAR_CLASS): Accept new class names.
(regex_compile, re_match_2_internal): Handle the new classes.
Diffstat (limited to 'src')
| -rw-r--r-- | src/regex.c | 117 |
1 files changed, 83 insertions, 34 deletions
diff --git a/src/regex.c b/src/regex.c index 450850609a6..bdd84ef483e 100644 --- a/src/regex.c +++ b/src/regex.c | |||
| @@ -191,32 +191,25 @@ init_syntax_once () | |||
| 191 | /* Get the interface, including the syntax bits. */ | 191 | /* Get the interface, including the syntax bits. */ |
| 192 | #include "regex.h" | 192 | #include "regex.h" |
| 193 | 193 | ||
| 194 | /* Jim Meyering writes: | 194 | /* isalpha etc. are used for the character classes. */ |
| 195 | #include <ctype.h> | ||
| 195 | 196 | ||
| 196 | "... Some ctype macros are valid only for character codes that | 197 | #ifdef emacs |
| 197 | isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when | ||
| 198 | using /bin/cc or gcc but without giving an ansi option). So, all | ||
| 199 | ctype uses should be through macros like ISPRINT... If | ||
| 200 | STDC_HEADERS is defined, then autoconf has verified that the ctype | ||
| 201 | macros don't need to be guarded with references to isascii. ... | ||
| 202 | Defining isascii to 1 should let any compiler worth its salt | ||
| 203 | eliminate the && through constant folding." */ | ||
| 204 | 198 | ||
| 205 | #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII)) | 199 | /* 1 if C is an ASCII character. */ |
| 206 | #define ISASCII(c) 1 | 200 | #define IS_REAL_ASCII(c) ((c) < 0200) |
| 207 | #else | ||
| 208 | #define ISASCII(c) isascii(c) | ||
| 209 | #endif | ||
| 210 | 201 | ||
| 211 | /* isalpha etc. are used for the character classes. */ | 202 | /* 1 if C is a unibyte character. */ |
| 212 | #include <ctype.h> | 203 | #define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c))) |
| 213 | 204 | ||
| 214 | /* In Emacs, these are only used for single-byte characters. */ | 205 | /* The Emacs definitions should not be directly affected by locales. */ |
| 215 | #define ISDIGIT(c) (ISASCII (c) && isdigit (c)) | ||
| 216 | #define ISCNTRL(c) (ISASCII (c) && iscntrl (c)) | ||
| 217 | #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c)) | ||
| 218 | 206 | ||
| 219 | #ifdef emacs | 207 | /* In Emacs, these are only used for single-byte characters. */ |
| 208 | #define ISDIGIT(c) ((c) >= '0' && (c) <= '9') | ||
| 209 | #define ISCNTRL(c) ((c) < ' ') | ||
| 210 | #define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \ | ||
| 211 | || ((c) >= 'a' && (c) <= 'f') \ | ||
| 212 | || ((c) >= 'A' && (c) <= 'F')) | ||
| 220 | 213 | ||
| 221 | /* This is only used for single-byte characters. */ | 214 | /* This is only used for single-byte characters. */ |
| 222 | #define ISBLANK(c) ((c) == ' ' || (c) == '\t') | 215 | #define ISBLANK(c) ((c) == ' ' || (c) == '\t') |
| @@ -224,25 +217,31 @@ init_syntax_once () | |||
| 224 | /* The rest must handle multibyte characters. */ | 217 | /* The rest must handle multibyte characters. */ |
| 225 | 218 | ||
| 226 | #define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \ | 219 | #define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \ |
| 227 | ? ISASCII (c) && isprint (c) && !isspace (c) \ | 220 | ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \ |
| 228 | : 1) | 221 | : 1) |
| 229 | 222 | ||
| 230 | #define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \ | 223 | #define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \ |
| 231 | ? ISASCII (c) && isalnum (c) \ | 224 | ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \ |
| 232 | : 1) | 225 | : 1) |
| 233 | 226 | ||
| 234 | #define ISALNUM(c) (SINGLE_BYTE_CHAR_P (c) \ | 227 | #define ISALNUM(c) (IS_REAL_ASCII (c) \ |
| 235 | ? ISASCII (c) && isalnum (c) \ | 228 | ? (((c) >= 'a' && (c) <= 'z') \ |
| 229 | || ((c) >= 'A' && (c) <= 'Z') \ | ||
| 230 | || ((c) >= '0' && (c) <= '9')) \ | ||
| 236 | : SYNTAX (c) == Sword) | 231 | : SYNTAX (c) == Sword) |
| 237 | 232 | ||
| 238 | #define ISALPHA(c) (SINGLE_BYTE_CHAR_P (c) \ | 233 | #define ISALPHA(c) (IS_REAL_ASCII (c) \ |
| 239 | ? ISASCII (c) && isalpha (c) \ | 234 | ? (((c) >= 'a' && (c) <= 'z') \ |
| 235 | || ((c) >= 'A' && (c) <= 'Z')) \ | ||
| 240 | : SYNTAX (c) == Sword) | 236 | : SYNTAX (c) == Sword) |
| 241 | 237 | ||
| 242 | #define ISLOWER(c) (LOWERCASEP (c)) | 238 | #define ISLOWER(c) (LOWERCASEP (c)) |
| 243 | 239 | ||
| 244 | #define ISPUNCT(c) (SINGLE_BYTE_CHAR_P (c) \ | 240 | #define ISPUNCT(c) (IS_REAL_ASCII (c) \ |
| 245 | ? ISASCII (c) && ispunct (c) \ | 241 | ? ((c) > ' ' && (c) < 0177 \ |
| 242 | && !(((c) >= 'a' && (c) <= 'z') \ | ||
| 243 | || ((c) >= 'A' && (c) <= 'Z') \ | ||
| 244 | || ((c) >= '0' && (c) <= '9'))) \ | ||
| 246 | : SYNTAX (c) != Sword) | 245 | : SYNTAX (c) != Sword) |
| 247 | 246 | ||
| 248 | #define ISSPACE(c) (SYNTAX (c) == Swhitespace) | 247 | #define ISSPACE(c) (SYNTAX (c) == Swhitespace) |
| @@ -253,6 +252,33 @@ init_syntax_once () | |||
| 253 | 252 | ||
| 254 | #else /* not emacs */ | 253 | #else /* not emacs */ |
| 255 | 254 | ||
| 255 | /* Jim Meyering writes: | ||
| 256 | |||
| 257 | "... Some ctype macros are valid only for character codes that | ||
| 258 | isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when | ||
| 259 | using /bin/cc or gcc but without giving an ansi option). So, all | ||
| 260 | ctype uses should be through macros like ISPRINT... If | ||
| 261 | STDC_HEADERS is defined, then autoconf has verified that the ctype | ||
| 262 | macros don't need to be guarded with references to isascii. ... | ||
| 263 | Defining isascii to 1 should let any compiler worth its salt | ||
| 264 | eliminate the && through constant folding." */ | ||
| 265 | |||
| 266 | #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII)) | ||
| 267 | #define ISASCII(c) 1 | ||
| 268 | #else | ||
| 269 | #define ISASCII(c) isascii(c) | ||
| 270 | #endif | ||
| 271 | |||
| 272 | /* 1 if C is an ASCII character. */ | ||
| 273 | #define IS_REAL_ASCII(c) ((c) < 0200) | ||
| 274 | |||
| 275 | /* This distinction is not meaningful, except in Emacs. */ | ||
| 276 | #define ISUNIBYTE(c) 1 | ||
| 277 | |||
| 278 | #define ISDIGIT(c) (ISASCII (c) && isdigit (c)) | ||
| 279 | #define ISCNTRL(c) (ISASCII (c) && iscntrl (c)) | ||
| 280 | #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c)) | ||
| 281 | |||
| 256 | #ifdef isblank | 282 | #ifdef isblank |
| 257 | #define ISBLANK(c) (ISASCII (c) && isblank (c)) | 283 | #define ISBLANK(c) (ISASCII (c) && isblank (c)) |
| 258 | #else | 284 | #else |
| @@ -1809,12 +1835,16 @@ struct range_table_work_area | |||
| 1809 | #define BIT_ALNUM 0x1 | 1835 | #define BIT_ALNUM 0x1 |
| 1810 | #define BIT_ALPHA 0x2 | 1836 | #define BIT_ALPHA 0x2 |
| 1811 | #define BIT_WORD 0x4 | 1837 | #define BIT_WORD 0x4 |
| 1838 | #define BIT_ASCII 0x8 | ||
| 1839 | #define BIT_NONASCII 0x10 | ||
| 1812 | #define BIT_GRAPH 0x20 | 1840 | #define BIT_GRAPH 0x20 |
| 1813 | #define BIT_LOWER 0x40 | 1841 | #define BIT_LOWER 0x40 |
| 1814 | #define BIT_PRINT 0x80 | 1842 | #define BIT_PRINT 0x80 |
| 1815 | #define BIT_PUNCT 0x100 | 1843 | #define BIT_PUNCT 0x100 |
| 1816 | #define BIT_SPACE 0x200 | 1844 | #define BIT_SPACE 0x200 |
| 1817 | #define BIT_UPPER 0x400 | 1845 | #define BIT_UPPER 0x400 |
| 1846 | #define BIT_UNIBYTE 0x800 | ||
| 1847 | #define BIT_MULTIBYTE 0x1000 | ||
| 1818 | 1848 | ||
| 1819 | /* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */ | 1849 | /* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */ |
| 1820 | #define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \ | 1850 | #define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \ |
| @@ -1869,7 +1899,9 @@ struct range_table_work_area | |||
| 1869 | || STREQ (string, "space") || STREQ (string, "print") \ | 1899 | || STREQ (string, "space") || STREQ (string, "print") \ |
| 1870 | || STREQ (string, "punct") || STREQ (string, "graph") \ | 1900 | || STREQ (string, "punct") || STREQ (string, "graph") \ |
| 1871 | || STREQ (string, "cntrl") || STREQ (string, "blank") \ | 1901 | || STREQ (string, "cntrl") || STREQ (string, "blank") \ |
| 1872 | || STREQ (string, "word")) | 1902 | || STREQ (string, "word") \ |
| 1903 | || STREQ (string, "ascii") || STREQ (string, "nonascii") \ | ||
| 1904 | || STREQ (string, "unibyte") || STREQ (string, "multibyte")) | ||
| 1873 | 1905 | ||
| 1874 | #ifndef MATCH_MAY_ALLOCATE | 1906 | #ifndef MATCH_MAY_ALLOCATE |
| 1875 | 1907 | ||
| @@ -2360,17 +2392,21 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 2360 | int ch; | 2392 | int ch; |
| 2361 | boolean is_alnum = STREQ (str, "alnum"); | 2393 | boolean is_alnum = STREQ (str, "alnum"); |
| 2362 | boolean is_alpha = STREQ (str, "alpha"); | 2394 | boolean is_alpha = STREQ (str, "alpha"); |
| 2395 | boolean is_ascii = STREQ (str, "ascii"); | ||
| 2363 | boolean is_blank = STREQ (str, "blank"); | 2396 | boolean is_blank = STREQ (str, "blank"); |
| 2364 | boolean is_cntrl = STREQ (str, "cntrl"); | 2397 | boolean is_cntrl = STREQ (str, "cntrl"); |
| 2365 | boolean is_digit = STREQ (str, "digit"); | 2398 | boolean is_digit = STREQ (str, "digit"); |
| 2366 | boolean is_graph = STREQ (str, "graph"); | 2399 | boolean is_graph = STREQ (str, "graph"); |
| 2367 | boolean is_lower = STREQ (str, "lower"); | 2400 | boolean is_lower = STREQ (str, "lower"); |
| 2401 | boolean is_multibyte = STREQ (str, "multibyte"); | ||
| 2402 | boolean is_nonascii = STREQ (str, "nonascii"); | ||
| 2368 | boolean is_print = STREQ (str, "print"); | 2403 | boolean is_print = STREQ (str, "print"); |
| 2369 | boolean is_punct = STREQ (str, "punct"); | 2404 | boolean is_punct = STREQ (str, "punct"); |
| 2370 | boolean is_space = STREQ (str, "space"); | 2405 | boolean is_space = STREQ (str, "space"); |
| 2406 | boolean is_unibyte = STREQ (str, "unibyte"); | ||
| 2371 | boolean is_upper = STREQ (str, "upper"); | 2407 | boolean is_upper = STREQ (str, "upper"); |
| 2372 | boolean is_xdigit = STREQ (str, "xdigit"); | ||
| 2373 | boolean is_word = STREQ (str, "word"); | 2408 | boolean is_word = STREQ (str, "word"); |
| 2409 | boolean is_xdigit = STREQ (str, "xdigit"); | ||
| 2374 | 2410 | ||
| 2375 | if (!IS_CHAR_CLASS (str)) | 2411 | if (!IS_CHAR_CLASS (str)) |
| 2376 | FREE_STACK_RETURN (REG_ECTYPE); | 2412 | FREE_STACK_RETURN (REG_ECTYPE); |
| @@ -2393,11 +2429,15 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 2393 | 2429 | ||
| 2394 | if (is_alnum) bit = BIT_ALNUM; | 2430 | if (is_alnum) bit = BIT_ALNUM; |
| 2395 | if (is_alpha) bit = BIT_ALPHA; | 2431 | if (is_alpha) bit = BIT_ALPHA; |
| 2432 | if (is_ascii) bit = BIT_ASCII; | ||
| 2396 | if (is_graph) bit = BIT_GRAPH; | 2433 | if (is_graph) bit = BIT_GRAPH; |
| 2397 | if (is_lower) bit = BIT_LOWER; | 2434 | if (is_lower) bit = BIT_LOWER; |
| 2435 | if (is_multibyte) bit = BIT_MULTIBYTE; | ||
| 2436 | if (is_nonascii) bit = BIT_NONASCII; | ||
| 2398 | if (is_print) bit = BIT_PRINT; | 2437 | if (is_print) bit = BIT_PRINT; |
| 2399 | if (is_punct) bit = BIT_PUNCT; | 2438 | if (is_punct) bit = BIT_PUNCT; |
| 2400 | if (is_space) bit = BIT_SPACE; | 2439 | if (is_space) bit = BIT_SPACE; |
| 2440 | if (is_unibyte) bit = BIT_UNIBYTE; | ||
| 2401 | if (is_upper) bit = BIT_UPPER; | 2441 | if (is_upper) bit = BIT_UPPER; |
| 2402 | if (is_word) bit = BIT_WORD; | 2442 | if (is_word) bit = BIT_WORD; |
| 2403 | if (bit) | 2443 | if (bit) |
| @@ -2426,6 +2466,12 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 2426 | || (is_upper && ISUPPER (ch)) | 2466 | || (is_upper && ISUPPER (ch)) |
| 2427 | || (is_xdigit && ISXDIGIT (ch))) | 2467 | || (is_xdigit && ISXDIGIT (ch))) |
| 2428 | SET_LIST_BIT (translated); | 2468 | SET_LIST_BIT (translated); |
| 2469 | if ( (is_ascii && IS_REAL_ASCII (ch)) | ||
| 2470 | || (is_nonascii && !IS_REAL_ASCII (ch)) | ||
| 2471 | || (is_unibyte && ISUNIBYTE (ch)) | ||
| 2472 | || (is_multibyte && !ISUNIBYTE (ch))) | ||
| 2473 | SET_LIST_BIT (translated); | ||
| 2474 | |||
| 2429 | if ( (is_word && ISWORD (ch))) | 2475 | if ( (is_word && ISWORD (ch))) |
| 2430 | SET_LIST_BIT (translated); | 2476 | SET_LIST_BIT (translated); |
| 2431 | } | 2477 | } |
| @@ -3434,7 +3480,7 @@ re_compile_fastmap (bufp) | |||
| 3434 | if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) | 3480 | if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) |
| 3435 | fastmap[j] = 1; | 3481 | fastmap[j] = 1; |
| 3436 | 3482 | ||
| 3437 | /* If we can match a syntax class, we can match | 3483 | /* If we can match a character class, we can match |
| 3438 | any character set. */ | 3484 | any character set. */ |
| 3439 | if (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2]) | 3485 | if (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2]) |
| 3440 | && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0) | 3486 | && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0) |
| @@ -3450,8 +3496,7 @@ re_compile_fastmap (bufp) | |||
| 3450 | /* Make P points the range table. */ | 3496 | /* Make P points the range table. */ |
| 3451 | p += CHARSET_BITMAP_SIZE (&p[-2]); | 3497 | p += CHARSET_BITMAP_SIZE (&p[-2]); |
| 3452 | 3498 | ||
| 3453 | /* Extract the number of ranges in range table into | 3499 | /* Extract the number of ranges in range table into COUNT. */ |
| 3454 | COUNT. */ | ||
| 3455 | EXTRACT_NUMBER_AND_INCR (count, p); | 3500 | EXTRACT_NUMBER_AND_INCR (count, p); |
| 3456 | for (; count > 0; count--, p += 2 * 3) /* XXX */ | 3501 | for (; count > 0; count--, p += 2 * 3) /* XXX */ |
| 3457 | { | 3502 | { |
| @@ -4802,11 +4847,15 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) | |||
| 4802 | 4847 | ||
| 4803 | if ( (class_bits & BIT_ALNUM && ISALNUM (c)) | 4848 | if ( (class_bits & BIT_ALNUM && ISALNUM (c)) |
| 4804 | | (class_bits & BIT_ALPHA && ISALPHA (c)) | 4849 | | (class_bits & BIT_ALPHA && ISALPHA (c)) |
| 4850 | | (class_bits & BIT_ASCII && IS_REAL_ASCII (c)) | ||
| 4805 | | (class_bits & BIT_GRAPH && ISGRAPH (c)) | 4851 | | (class_bits & BIT_GRAPH && ISGRAPH (c)) |
| 4806 | | (class_bits & BIT_LOWER && ISLOWER (c)) | 4852 | | (class_bits & BIT_LOWER && ISLOWER (c)) |
| 4853 | | (class_bits & BIT_MULTIBYTE && !ISUNIBYTE (c)) | ||
| 4854 | | (class_bits & BIT_NONASCII && !IS_REAL_ASCII (c)) | ||
| 4807 | | (class_bits & BIT_PRINT && ISPRINT (c)) | 4855 | | (class_bits & BIT_PRINT && ISPRINT (c)) |
| 4808 | | (class_bits & BIT_PUNCT && ISPUNCT (c)) | 4856 | | (class_bits & BIT_PUNCT && ISPUNCT (c)) |
| 4809 | | (class_bits & BIT_SPACE && ISSPACE (c)) | 4857 | | (class_bits & BIT_SPACE && ISSPACE (c)) |
| 4858 | | (class_bits & BIT_UNIBYTE && ISUNIBYTE (c)) | ||
| 4810 | | (class_bits & BIT_UPPER && ISUPPER (c)) | 4859 | | (class_bits & BIT_UPPER && ISUPPER (c)) |
| 4811 | | (class_bits & BIT_WORD && ISWORD (c))) | 4860 | | (class_bits & BIT_WORD && ISWORD (c))) |
| 4812 | not = !not; | 4861 | not = !not; |