aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorDave Love1999-10-06 18:25:22 +0000
committerDave Love1999-10-06 18:25:22 +0000
commitf71b19b645a2e8a0090f4c97a4d3bc9e892043c5 (patch)
treed8dd3243d03d4019910c4667f3e2c3944479c1ae /src
parent93548d2e69a40bf8056b05af5ffb36a17d96a4b6 (diff)
downloademacs-f71b19b645a2e8a0090f4c97a4d3bc9e892043c5.tar.gz
emacs-f71b19b645a2e8a0090f4c97a4d3bc9e892043c5.zip
1999-09-04 Richard M. Stallman <rms@gnu.org>
* regex.c [emacs] (ISALNUM, ISALPHA, ISPUNCT): Don't depend on locale. [emacs] (ISASCII): Don't define ISASCII in this case. (IS_REAL_ASCII): New macro, 2 alternate definitions. (ISUNIBYTE): Likewise. [emacs] (ISDIGIT, ISCNTRL, ISXDIGIT, ISGRAPH, ISPRINT): Don't use ISASCII. * regex.c: Handle new class names `ascii', `nonascii', `unibyte', `multibyte'. (BIT_ASCII, BIT_NONASCII, BIT_UNIBYTE, BIT_MULTIBYTE): New macros. (IS_CHAR_CLASS): Accept new class names. (regex_compile, re_match_2_internal): Handle the new classes.
Diffstat (limited to 'src')
-rw-r--r--src/regex.c117
1 files changed, 83 insertions, 34 deletions
diff --git a/src/regex.c b/src/regex.c
index 450850609a6..bdd84ef483e 100644
--- a/src/regex.c
+++ b/src/regex.c
@@ -191,32 +191,25 @@ init_syntax_once ()
191/* Get the interface, including the syntax bits. */ 191/* Get the interface, including the syntax bits. */
192#include "regex.h" 192#include "regex.h"
193 193
194/* Jim Meyering writes: 194/* isalpha etc. are used for the character classes. */
195#include <ctype.h>
195 196
196 "... Some ctype macros are valid only for character codes that 197#ifdef emacs
197 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
198 using /bin/cc or gcc but without giving an ansi option). So, all
199 ctype uses should be through macros like ISPRINT... If
200 STDC_HEADERS is defined, then autoconf has verified that the ctype
201 macros don't need to be guarded with references to isascii. ...
202 Defining isascii to 1 should let any compiler worth its salt
203 eliminate the && through constant folding." */
204 198
205#if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII)) 199/* 1 if C is an ASCII character. */
206#define ISASCII(c) 1 200#define IS_REAL_ASCII(c) ((c) < 0200)
207#else
208#define ISASCII(c) isascii(c)
209#endif
210 201
211/* isalpha etc. are used for the character classes. */ 202/* 1 if C is a unibyte character. */
212#include <ctype.h> 203#define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
213 204
214/* In Emacs, these are only used for single-byte characters. */ 205/* The Emacs definitions should not be directly affected by locales. */
215#define ISDIGIT(c) (ISASCII (c) && isdigit (c))
216#define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
217#define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
218 206
219#ifdef emacs 207/* In Emacs, these are only used for single-byte characters. */
208#define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
209#define ISCNTRL(c) ((c) < ' ')
210#define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \
211 || ((c) >= 'a' && (c) <= 'f') \
212 || ((c) >= 'A' && (c) <= 'F'))
220 213
221/* This is only used for single-byte characters. */ 214/* This is only used for single-byte characters. */
222#define ISBLANK(c) ((c) == ' ' || (c) == '\t') 215#define ISBLANK(c) ((c) == ' ' || (c) == '\t')
@@ -224,25 +217,31 @@ init_syntax_once ()
224/* The rest must handle multibyte characters. */ 217/* The rest must handle multibyte characters. */
225 218
226#define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \ 219#define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
227 ? ISASCII (c) && isprint (c) && !isspace (c) \ 220 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \
228 : 1) 221 : 1)
229 222
230#define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \ 223#define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
231 ? ISASCII (c) && isalnum (c) \ 224 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
232 : 1) 225 : 1)
233 226
234#define ISALNUM(c) (SINGLE_BYTE_CHAR_P (c) \ 227#define ISALNUM(c) (IS_REAL_ASCII (c) \
235 ? ISASCII (c) && isalnum (c) \ 228 ? (((c) >= 'a' && (c) <= 'z') \
229 || ((c) >= 'A' && (c) <= 'Z') \
230 || ((c) >= '0' && (c) <= '9')) \
236 : SYNTAX (c) == Sword) 231 : SYNTAX (c) == Sword)
237 232
238#define ISALPHA(c) (SINGLE_BYTE_CHAR_P (c) \ 233#define ISALPHA(c) (IS_REAL_ASCII (c) \
239 ? ISASCII (c) && isalpha (c) \ 234 ? (((c) >= 'a' && (c) <= 'z') \
235 || ((c) >= 'A' && (c) <= 'Z')) \
240 : SYNTAX (c) == Sword) 236 : SYNTAX (c) == Sword)
241 237
242#define ISLOWER(c) (LOWERCASEP (c)) 238#define ISLOWER(c) (LOWERCASEP (c))
243 239
244#define ISPUNCT(c) (SINGLE_BYTE_CHAR_P (c) \ 240#define ISPUNCT(c) (IS_REAL_ASCII (c) \
245 ? ISASCII (c) && ispunct (c) \ 241 ? ((c) > ' ' && (c) < 0177 \
242 && !(((c) >= 'a' && (c) <= 'z') \
243 || ((c) >= 'A' && (c) <= 'Z') \
244 || ((c) >= '0' && (c) <= '9'))) \
246 : SYNTAX (c) != Sword) 245 : SYNTAX (c) != Sword)
247 246
248#define ISSPACE(c) (SYNTAX (c) == Swhitespace) 247#define ISSPACE(c) (SYNTAX (c) == Swhitespace)
@@ -253,6 +252,33 @@ init_syntax_once ()
253 252
254#else /* not emacs */ 253#else /* not emacs */
255 254
255/* Jim Meyering writes:
256
257 "... Some ctype macros are valid only for character codes that
258 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
259 using /bin/cc or gcc but without giving an ansi option). So, all
260 ctype uses should be through macros like ISPRINT... If
261 STDC_HEADERS is defined, then autoconf has verified that the ctype
262 macros don't need to be guarded with references to isascii. ...
263 Defining isascii to 1 should let any compiler worth its salt
264 eliminate the && through constant folding." */
265
266#if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
267#define ISASCII(c) 1
268#else
269#define ISASCII(c) isascii(c)
270#endif
271
272/* 1 if C is an ASCII character. */
273#define IS_REAL_ASCII(c) ((c) < 0200)
274
275/* This distinction is not meaningful, except in Emacs. */
276#define ISUNIBYTE(c) 1
277
278#define ISDIGIT(c) (ISASCII (c) && isdigit (c))
279#define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
280#define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
281
256#ifdef isblank 282#ifdef isblank
257#define ISBLANK(c) (ISASCII (c) && isblank (c)) 283#define ISBLANK(c) (ISASCII (c) && isblank (c))
258#else 284#else
@@ -1809,12 +1835,16 @@ struct range_table_work_area
1809#define BIT_ALNUM 0x1 1835#define BIT_ALNUM 0x1
1810#define BIT_ALPHA 0x2 1836#define BIT_ALPHA 0x2
1811#define BIT_WORD 0x4 1837#define BIT_WORD 0x4
1838#define BIT_ASCII 0x8
1839#define BIT_NONASCII 0x10
1812#define BIT_GRAPH 0x20 1840#define BIT_GRAPH 0x20
1813#define BIT_LOWER 0x40 1841#define BIT_LOWER 0x40
1814#define BIT_PRINT 0x80 1842#define BIT_PRINT 0x80
1815#define BIT_PUNCT 0x100 1843#define BIT_PUNCT 0x100
1816#define BIT_SPACE 0x200 1844#define BIT_SPACE 0x200
1817#define BIT_UPPER 0x400 1845#define BIT_UPPER 0x400
1846#define BIT_UNIBYTE 0x800
1847#define BIT_MULTIBYTE 0x1000
1818 1848
1819/* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */ 1849/* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */
1820#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \ 1850#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \
@@ -1869,7 +1899,9 @@ struct range_table_work_area
1869 || STREQ (string, "space") || STREQ (string, "print") \ 1899 || STREQ (string, "space") || STREQ (string, "print") \
1870 || STREQ (string, "punct") || STREQ (string, "graph") \ 1900 || STREQ (string, "punct") || STREQ (string, "graph") \
1871 || STREQ (string, "cntrl") || STREQ (string, "blank") \ 1901 || STREQ (string, "cntrl") || STREQ (string, "blank") \
1872 || STREQ (string, "word")) 1902 || STREQ (string, "word") \
1903 || STREQ (string, "ascii") || STREQ (string, "nonascii") \
1904 || STREQ (string, "unibyte") || STREQ (string, "multibyte"))
1873 1905
1874#ifndef MATCH_MAY_ALLOCATE 1906#ifndef MATCH_MAY_ALLOCATE
1875 1907
@@ -2360,17 +2392,21 @@ regex_compile (pattern, size, syntax, bufp)
2360 int ch; 2392 int ch;
2361 boolean is_alnum = STREQ (str, "alnum"); 2393 boolean is_alnum = STREQ (str, "alnum");
2362 boolean is_alpha = STREQ (str, "alpha"); 2394 boolean is_alpha = STREQ (str, "alpha");
2395 boolean is_ascii = STREQ (str, "ascii");
2363 boolean is_blank = STREQ (str, "blank"); 2396 boolean is_blank = STREQ (str, "blank");
2364 boolean is_cntrl = STREQ (str, "cntrl"); 2397 boolean is_cntrl = STREQ (str, "cntrl");
2365 boolean is_digit = STREQ (str, "digit"); 2398 boolean is_digit = STREQ (str, "digit");
2366 boolean is_graph = STREQ (str, "graph"); 2399 boolean is_graph = STREQ (str, "graph");
2367 boolean is_lower = STREQ (str, "lower"); 2400 boolean is_lower = STREQ (str, "lower");
2401 boolean is_multibyte = STREQ (str, "multibyte");
2402 boolean is_nonascii = STREQ (str, "nonascii");
2368 boolean is_print = STREQ (str, "print"); 2403 boolean is_print = STREQ (str, "print");
2369 boolean is_punct = STREQ (str, "punct"); 2404 boolean is_punct = STREQ (str, "punct");
2370 boolean is_space = STREQ (str, "space"); 2405 boolean is_space = STREQ (str, "space");
2406 boolean is_unibyte = STREQ (str, "unibyte");
2371 boolean is_upper = STREQ (str, "upper"); 2407 boolean is_upper = STREQ (str, "upper");
2372 boolean is_xdigit = STREQ (str, "xdigit");
2373 boolean is_word = STREQ (str, "word"); 2408 boolean is_word = STREQ (str, "word");
2409 boolean is_xdigit = STREQ (str, "xdigit");
2374 2410
2375 if (!IS_CHAR_CLASS (str)) 2411 if (!IS_CHAR_CLASS (str))
2376 FREE_STACK_RETURN (REG_ECTYPE); 2412 FREE_STACK_RETURN (REG_ECTYPE);
@@ -2393,11 +2429,15 @@ regex_compile (pattern, size, syntax, bufp)
2393 2429
2394 if (is_alnum) bit = BIT_ALNUM; 2430 if (is_alnum) bit = BIT_ALNUM;
2395 if (is_alpha) bit = BIT_ALPHA; 2431 if (is_alpha) bit = BIT_ALPHA;
2432 if (is_ascii) bit = BIT_ASCII;
2396 if (is_graph) bit = BIT_GRAPH; 2433 if (is_graph) bit = BIT_GRAPH;
2397 if (is_lower) bit = BIT_LOWER; 2434 if (is_lower) bit = BIT_LOWER;
2435 if (is_multibyte) bit = BIT_MULTIBYTE;
2436 if (is_nonascii) bit = BIT_NONASCII;
2398 if (is_print) bit = BIT_PRINT; 2437 if (is_print) bit = BIT_PRINT;
2399 if (is_punct) bit = BIT_PUNCT; 2438 if (is_punct) bit = BIT_PUNCT;
2400 if (is_space) bit = BIT_SPACE; 2439 if (is_space) bit = BIT_SPACE;
2440 if (is_unibyte) bit = BIT_UNIBYTE;
2401 if (is_upper) bit = BIT_UPPER; 2441 if (is_upper) bit = BIT_UPPER;
2402 if (is_word) bit = BIT_WORD; 2442 if (is_word) bit = BIT_WORD;
2403 if (bit) 2443 if (bit)
@@ -2426,6 +2466,12 @@ regex_compile (pattern, size, syntax, bufp)
2426 || (is_upper && ISUPPER (ch)) 2466 || (is_upper && ISUPPER (ch))
2427 || (is_xdigit && ISXDIGIT (ch))) 2467 || (is_xdigit && ISXDIGIT (ch)))
2428 SET_LIST_BIT (translated); 2468 SET_LIST_BIT (translated);
2469 if ( (is_ascii && IS_REAL_ASCII (ch))
2470 || (is_nonascii && !IS_REAL_ASCII (ch))
2471 || (is_unibyte && ISUNIBYTE (ch))
2472 || (is_multibyte && !ISUNIBYTE (ch)))
2473 SET_LIST_BIT (translated);
2474
2429 if ( (is_word && ISWORD (ch))) 2475 if ( (is_word && ISWORD (ch)))
2430 SET_LIST_BIT (translated); 2476 SET_LIST_BIT (translated);
2431 } 2477 }
@@ -3434,7 +3480,7 @@ re_compile_fastmap (bufp)
3434 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) 3480 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
3435 fastmap[j] = 1; 3481 fastmap[j] = 1;
3436 3482
3437 /* If we can match a syntax class, we can match 3483 /* If we can match a character class, we can match
3438 any character set. */ 3484 any character set. */
3439 if (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2]) 3485 if (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3440 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0) 3486 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0)
@@ -3450,8 +3496,7 @@ re_compile_fastmap (bufp)
3450 /* Make P point to the range table. */ 3496 /* Make P point to the range table. */
3451 p += CHARSET_BITMAP_SIZE (&p[-2]); 3497 p += CHARSET_BITMAP_SIZE (&p[-2]);
3452 3498
3453 /* Extract the number of ranges in range table into 3499 /* Extract the number of ranges in range table into COUNT. */
3454 COUNT. */
3455 EXTRACT_NUMBER_AND_INCR (count, p); 3500 EXTRACT_NUMBER_AND_INCR (count, p);
3456 for (; count > 0; count--, p += 2 * 3) /* XXX */ 3501 for (; count > 0; count--, p += 2 * 3) /* XXX */
3457 { 3502 {
@@ -4802,11 +4847,15 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
4802 4847
4803 if ( (class_bits & BIT_ALNUM && ISALNUM (c)) 4848 if ( (class_bits & BIT_ALNUM && ISALNUM (c))
4804 | (class_bits & BIT_ALPHA && ISALPHA (c)) 4849 | (class_bits & BIT_ALPHA && ISALPHA (c))
4850 | (class_bits & BIT_ASCII && IS_REAL_ASCII (c))
4805 | (class_bits & BIT_GRAPH && ISGRAPH (c)) 4851 | (class_bits & BIT_GRAPH && ISGRAPH (c))
4806 | (class_bits & BIT_LOWER && ISLOWER (c)) 4852 | (class_bits & BIT_LOWER && ISLOWER (c))
4853 | (class_bits & BIT_MULTIBYTE && !ISUNIBYTE (c))
4854 | (class_bits & BIT_NONASCII && !IS_REAL_ASCII (c))
4807 | (class_bits & BIT_PRINT && ISPRINT (c)) 4855 | (class_bits & BIT_PRINT && ISPRINT (c))
4808 | (class_bits & BIT_PUNCT && ISPUNCT (c)) 4856 | (class_bits & BIT_PUNCT && ISPUNCT (c))
4809 | (class_bits & BIT_SPACE && ISSPACE (c)) 4857 | (class_bits & BIT_SPACE && ISSPACE (c))
4858 | (class_bits & BIT_UNIBYTE && ISUNIBYTE (c))
4810 | (class_bits & BIT_UPPER && ISUPPER (c)) 4859 | (class_bits & BIT_UPPER && ISUPPER (c))
4811 | (class_bits & BIT_WORD && ISWORD (c))) 4860 | (class_bits & BIT_WORD && ISWORD (c)))
4812 not = !not; 4861 not = !not;