aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorStefan Monnier2000-03-29 04:01:45 +0000
committerStefan Monnier2000-03-29 04:01:45 +0000
commitf6a3f532b0b6dae667514f217db3a62dfaa2d4ce (patch)
treeba2557e71310c92f2f36862e1ae5c756b28a1306
parentbb15bd9a51769b13a2e1adb3c504d193606c6482 (diff)
downloademacs-f6a3f532b0b6dae667514f217db3a62dfaa2d4ce.tar.gz
emacs-f6a3f532b0b6dae667514f217db3a62dfaa2d4ce.zip
(analyse_first): New function obtained by ripping out most
of re_compile_fastmap and generalizing it a little bit so that it can also just return whether a given (sub)pattern can match the empty string or not. (regex_compile): Use `analyse_first' to decide whether the loop-check needs to be done or not for *, +, *? and +? (the loop check is costly for non-greedy repetition). (re_compile_fastmap): Delegate the actual work to `analyse_first'.
-rw-r--r--src/ChangeLog11
-rw-r--r--src/regex.c147
2 files changed, 96 insertions, 62 deletions
diff --git a/src/ChangeLog b/src/ChangeLog
index a12f34a2a5b..3f33d4c59da 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,14 @@
12000-03-28 Stefan Monnier <monnier@cs.yale.edu>
2
3 * regex.c (analyse_first): New function obtained by ripping out most
4 of re_compile_fastmap and generalizing it a little bit so that it
5 can also just return whether a given (sub)pattern can match the empty
6 string or not.
7 (regex_compile): Use `analyse_first' to decide whether the loop-check
8 needs to be done or not for *, +, *? and +? (the loop check is costly
9 for non-greedy repetition).
10 (re_compile_fastmap): Delegate the actual work to `analyse_first'.
11
12000-03-28 Dave Love <fx@gnu.org> 122000-03-28 Dave Love <fx@gnu.org>
2 13
3 * s/gnu-linux.h (GC_SETJMP_WORKS): Define for i386, sparc, m68k, 14 * s/gnu-linux.h (GC_SETJMP_WORKS): Define for i386, sparc, m68k,
diff --git a/src/regex.c b/src/regex.c
index f7ca3a7b8b4..911daed209d 100644
--- a/src/regex.c
+++ b/src/regex.c
@@ -20,12 +20,10 @@
20 USA. */ 20 USA. */
21 21
22/* TODO: 22/* TODO:
23 - use analyze_first to optimize non-empty loops
24 - clean up multibyte issues 23 - clean up multibyte issues
25 - structure the opcode space into opcode+flag. 24 - structure the opcode space into opcode+flag.
26 - merge with glic's regex.[ch] 25 - merge with glibc's regex.[ch]
27 26 */
28 That's it for now -sm */
29 27
30/* AIX requires this to be the first thing in the file. */ 28/* AIX requires this to be the first thing in the file. */
31#if defined (_AIX) && !defined (REGEX_MALLOC) 29#if defined (_AIX) && !defined (REGEX_MALLOC)
@@ -1542,6 +1540,8 @@ static boolean at_endline_loc_p _RE_ARGS((const unsigned char *p,
1542 const unsigned char *pend, 1540 const unsigned char *pend,
1543 reg_syntax_t syntax)); 1541 reg_syntax_t syntax));
1544static unsigned char *skip_one_char _RE_ARGS((unsigned char *p)); 1542static unsigned char *skip_one_char _RE_ARGS((unsigned char *p));
1543static int analyse_first _RE_ARGS((unsigned char *p, unsigned char *pend,
1544 char *fastmap, const int multibyte));
1545 1545
1546/* Fetch the next character in the uncompiled pattern---translating it 1546/* Fetch the next character in the uncompiled pattern---translating it
1547 if necessary. Also cast from a signed character in the constant 1547 if necessary. Also cast from a signed character in the constant
@@ -2134,6 +2134,9 @@ regex_compile (pattern, size, syntax, bufp)
2134 { 2134 {
2135 boolean simple = skip_one_char (laststart) == b; 2135 boolean simple = skip_one_char (laststart) == b;
2136 unsigned int startoffset = 0; 2136 unsigned int startoffset = 0;
2137 re_opcode_t ofj =
2138 (simple || !analyse_first (laststart, b, NULL, 0)) ?
2139 on_failure_jump : on_failure_jump_loop;
2137 assert (skip_one_char (laststart) <= b); 2140 assert (skip_one_char (laststart) <= b);
2138 2141
2139 if (!zero_times_ok && simple) 2142 if (!zero_times_ok && simple)
@@ -2152,14 +2155,13 @@ regex_compile (pattern, size, syntax, bufp)
2152 GET_BUFFER_SPACE (6); 2155 GET_BUFFER_SPACE (6);
2153 if (!zero_times_ok) 2156 if (!zero_times_ok)
2154 /* A + loop. */ 2157 /* A + loop. */
2155 STORE_JUMP (on_failure_jump_loop, b, b + 6); 2158 STORE_JUMP (ofj, b, b + 6);
2156 else 2159 else
2157 /* Simple * loops can use on_failure_keep_string_jump 2160 /* Simple * loops can use on_failure_keep_string_jump
2158 depending on what follows. But since we don't know 2161 depending on what follows. But since we don't know
2159 that yet, we leave the decision up to 2162 that yet, we leave the decision up to
2160 on_failure_jump_smart. */ 2163 on_failure_jump_smart. */
2161 INSERT_JUMP (simple ? on_failure_jump_smart 2164 INSERT_JUMP (simple ? on_failure_jump_smart : ofj,
2162 : on_failure_jump_loop,
2163 laststart + startoffset, b + 6); 2165 laststart + startoffset, b + 6);
2164 b += 3; 2166 b += 3;
2165 STORE_JUMP (jump, b, laststart + startoffset); 2167 STORE_JUMP (jump, b, laststart + startoffset);
@@ -2180,10 +2182,13 @@ regex_compile (pattern, size, syntax, bufp)
2180 GET_BUFFER_SPACE (7); /* We might use less. */ 2182 GET_BUFFER_SPACE (7); /* We might use less. */
2181 if (many_times_ok) 2183 if (many_times_ok)
2182 { 2184 {
2185 boolean emptyp = analyse_first (laststart, b, NULL, 0);
2186
2183 /* The non-greedy multiple match looks like a repeat..until: 2187 /* The non-greedy multiple match looks like a repeat..until:
2184 we only need a conditional jump at the end of the loop */ 2188 we only need a conditional jump at the end of the loop */
2185 BUF_PUSH (no_op); 2189 if (emptyp) BUF_PUSH (no_op);
2186 STORE_JUMP (on_failure_jump_nastyloop, b, laststart); 2190 STORE_JUMP (emptyp ? on_failure_jump_nastyloop
2191 : on_failure_jump, b, laststart);
2187 b += 3; 2192 b += 3;
2188 if (zero_times_ok) 2193 if (zero_times_ok)
2189 { 2194 {
@@ -3268,26 +3273,23 @@ group_in_compile_stack (compile_stack, regnum)
3268 return false; 3273 return false;
3269} 3274}
3270 3275
3271/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in 3276/* analyse_first.
3272 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible 3277 If fastmap is non-NULL, go through the pattern and fill fastmap
3273 characters can start a string that matches the pattern. This fastmap 3278 with all the possible leading chars. If fastmap is NULL, don't
3274 is used by re_search to skip quickly over impossible starting points. 3279 bother filling it up (obviously) and only return whether the
3275 3280 pattern could potentially match the empty string.
3276 Character codes above (1 << BYTEWIDTH) are not represented in the 3281
3277 fastmap, but the leading codes are represented. Thus, the fastmap 3282 Return 1 if p..pend might match the empty string.
3278 indicates which character sets could start a match. 3283 Return 0 if p..pend matches at least one char.
3279 3284 Return -1 if p..pend matches at least one char, but fastmap was not
3280 The caller must supply the address of a (1 << BYTEWIDTH)-byte data 3285 updated accurately.
3281 area as BUFP->fastmap. 3286 Return -2 if an error occurred. */
3282
3283 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
3284 the pattern buffer.
3285
3286 Returns 0 if we succeed, -2 if an internal error. */
3287 3287
3288int 3288static int
3289re_compile_fastmap (bufp) 3289analyse_first (p, pend, fastmap, multibyte)
3290 struct re_pattern_buffer *bufp; 3290 unsigned char *p, *pend;
3291 char *fastmap;
3292 const int multibyte;
3291{ 3293{
3292 int j, k; 3294 int j, k;
3293 boolean not; 3295 boolean not;
@@ -3298,13 +3300,6 @@ re_compile_fastmap (bufp)
3298 char *destination; 3300 char *destination;
3299#endif 3301#endif
3300 3302
3301 register char *fastmap = bufp->fastmap;
3302 unsigned char *pattern = bufp->buffer;
3303 unsigned long size = bufp->used;
3304 unsigned char *p = pattern;
3305 register unsigned char *pend = pattern + size;
3306 const boolean multibyte = bufp->multibyte;
3307
3308#if defined (REL_ALLOC) && defined (REGEX_MALLOC) 3303#if defined (REL_ALLOC) && defined (REGEX_MALLOC)
3309 /* This holds the pointer to the failure stack, when 3304 /* This holds the pointer to the failure stack, when
3310 it is allocated relocatably. */ 3305 it is allocated relocatably. */
@@ -3321,12 +3316,9 @@ re_compile_fastmap (bufp)
3321 flag is set true. */ 3316 flag is set true. */
3322 boolean match_any_multibyte_characters = false; 3317 boolean match_any_multibyte_characters = false;
3323 3318
3324 assert (fastmap != NULL && p != NULL); 3319 assert (p);
3325 3320
3326 INIT_FAIL_STACK (); 3321 INIT_FAIL_STACK ();
3327 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
3328 bufp->fastmap_accurate = 1; /* It will be when we're done. */
3329 bufp->can_be_null = 0;
3330 3322
3331 /* The loop below works as follows: 3323 /* The loop below works as follows:
3332 - It has a working-list kept in the PATTERN_STACK and which basically 3324 - It has a working-list kept in the PATTERN_STACK and which basically
@@ -3344,7 +3336,7 @@ re_compile_fastmap (bufp)
3344 never set `p' (or push) anything `<= p1'. */ 3336 never set `p' (or push) anything `<= p1'. */
3345 3337
3346 /* If can_be_null is set, then the fastmap will not be used anyway. */ 3338 /* If can_be_null is set, then the fastmap will not be used anyway. */
3347 while (!bufp->can_be_null) 3339 while (1)
3348 { 3340 {
3349 /* `p1' is used as a marker of how far back a `on_failure_jump' 3341 /* `p1' is used as a marker of how far back a `on_failure_jump'
3350 can go without being ignored. It is normally equal to `p' 3342 can go without being ignored. It is normally equal to `p'
@@ -3356,22 +3348,18 @@ re_compile_fastmap (bufp)
3356 as used for the *? operator. */ 3348 as used for the *? operator. */
3357 unsigned char *p1 = p; 3349 unsigned char *p1 = p;
3358 3350
3359 if (p >= pend || *p == succeed) 3351 if (p >= pend)
3360 { 3352 {
3361 /* We have reached the (effective) end of pattern. */ 3353 if (path_can_be_null)
3362 if (!PATTERN_STACK_EMPTY ()) 3354 return (RESET_FAIL_STACK (), 1);
3363 {
3364 bufp->can_be_null |= path_can_be_null;
3365 3355
3366 /* Reset for next path. */ 3356 /* We have reached the (effective) end of pattern. */
3367 path_can_be_null = true; 3357 if (PATTERN_STACK_EMPTY ())
3368 3358 return (RESET_FAIL_STACK (), 0);
3369 p = (unsigned char*) POP_PATTERN_OP ();
3370 3359
3371 continue; 3360 p = (unsigned char*) POP_PATTERN_OP ();
3372 } 3361 path_can_be_null = true;
3373 else 3362 continue;
3374 break;
3375 } 3363 }
3376 3364
3377 /* We should never be about to go beyond the end of the pattern. */ 3365 /* We should never be about to go beyond the end of the pattern. */
@@ -3379,6 +3367,9 @@ re_compile_fastmap (bufp)
3379 3367
3380 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++)) 3368 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3381 { 3369 {
3370 case succeed:
3371 p = pend;
3372 continue;
3382 3373
3383 case duplicate: 3374 case duplicate:
3384 /* If the first character has to match a backreference, that means 3375 /* If the first character has to match a backreference, that means
@@ -3393,15 +3384,15 @@ re_compile_fastmap (bufp)
3393 with `break'. */ 3384 with `break'. */
3394 3385
3395 case exactn: 3386 case exactn:
3396 fastmap[p[1]] = 1; 3387 if (fastmap) fastmap[p[1]] = 1;
3397 break; 3388 break;
3398 3389
3399 3390
3400 case anychar: 3391 case anychar:
3401 /* We could put all the chars except for \n (and maybe \0) 3392 /* We could put all the chars except for \n (and maybe \0)
3402 but we don't bother since it is generally not worth it. */ 3393 but we don't bother since it is generally not worth it. */
3403 bufp->can_be_null = 1; 3394 if (!fastmap) break;
3404 continue; 3395 return (RESET_FAIL_STACK (), -1);
3405 3396
3406 3397
3407 case charset_not: 3398 case charset_not:
@@ -3476,8 +3467,7 @@ re_compile_fastmap (bufp)
3476#else /* emacs */ 3467#else /* emacs */
3477 /* This match depends on text properties. These end with 3468 /* This match depends on text properties. These end with
3478 aborting optimizations. */ 3469 aborting optimizations. */
3479 bufp->can_be_null = 1; 3470 return (RESET_FAIL_STACK (), -1);
3480 continue;
3481 3471
3482 case categoryspec: 3472 case categoryspec:
3483 case notcategoryspec: 3473 case notcategoryspec:
@@ -3593,10 +3583,43 @@ re_compile_fastmap (bufp)
3593 p = pend; 3583 p = pend;
3594 } /* while p */ 3584 } /* while p */
3595 3585
3596 /* Set `can_be_null' for the last path (also the first path, if the 3586 return (RESET_FAIL_STACK (), 0);
3597 pattern is empty). */ 3587} /* analyse_first */
3598 bufp->can_be_null |= path_can_be_null; 3588
3599 RESET_FAIL_STACK (); 3589/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
3590 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
3591 characters can start a string that matches the pattern. This fastmap
3592 is used by re_search to skip quickly over impossible starting points.
3593
3594 Character codes above (1 << BYTEWIDTH) are not represented in the
3595 fastmap, but the leading codes are represented. Thus, the fastmap
3596 indicates which character sets could start a match.
3597
3598 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
3599 area as BUFP->fastmap.
3600
3601 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
3602 the pattern buffer.
3603
3604 Returns 0 if we succeed, -2 if an internal error. */
3605
3606int
3607re_compile_fastmap (bufp)
3608 struct re_pattern_buffer *bufp;
3609{
3610 char *fastmap = bufp->fastmap;
3611 int analysis;
3612
3613 assert (fastmap && bufp->buffer);
3614
3615 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
3616 bufp->fastmap_accurate = 1; /* It will be when we're done. */
3617
3618 analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used,
3619 fastmap, bufp->multibyte);
3620 if (analysis < -1)
3621 return analysis;
3622 bufp->can_be_null = (analysis != 0);
3600 return 0; 3623 return 0;
3601} /* re_compile_fastmap */ 3624} /* re_compile_fastmap */
3602 3625