diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/ChangeLog | 11 | ||||
| -rw-r--r-- | src/regex.c | 147 |
2 files changed, 96 insertions, 62 deletions
diff --git a/src/ChangeLog b/src/ChangeLog index a12f34a2a5b..3f33d4c59da 100644 --- a/src/ChangeLog +++ b/src/ChangeLog | |||
| @@ -1,3 +1,14 @@ | |||
| 1 | 2000-03-28 Stefan Monnier <monnier@cs.yale.edu> | ||
| 2 | |||
| 3 | * regex.c (analyse_first): New function obtained by ripping out most | ||
| 4 | of re_compile_fastmap and generalizing it a little bit so that it | ||
| 5 | can also just return whether a given (sub)pattern can match the empty | ||
| 6 | string or not. | ||
| 7 | (regex_compile): Use `analyse_first' to decide whether the loop-check | ||
| 8 | needs to be done or not for *, +, *? and +? (the loop check is costly | ||
| 9 | for non-greedy repetition). | ||
| 10 | (re_compile_fastmap): Delegate the actual work to `analyse_first'. | ||
| 11 | |||
| 1 | 2000-03-28 Dave Love <fx@gnu.org> | 12 | 2000-03-28 Dave Love <fx@gnu.org> |
| 2 | 13 | ||
| 3 | * s/gnu-linux.h (GC_SETJMP_WORKS): Define for i386, sparc, m68k, | 14 | * s/gnu-linux.h (GC_SETJMP_WORKS): Define for i386, sparc, m68k, |
diff --git a/src/regex.c b/src/regex.c index f7ca3a7b8b4..911daed209d 100644 --- a/src/regex.c +++ b/src/regex.c | |||
| @@ -20,12 +20,10 @@ | |||
| 20 | USA. */ | 20 | USA. */ |
| 21 | 21 | ||
| 22 | /* TODO: | 22 | /* TODO: |
| 23 | - use analyze_first to optimize non-empty loops | ||
| 24 | - clean up multibyte issues | 23 | - clean up multibyte issues |
| 25 | - structure the opcode space into opcode+flag. | 24 | - structure the opcode space into opcode+flag. |
| 26 | - merge with glic's regex.[ch] | 25 | - merge with glibc's regex.[ch] |
| 27 | 26 | */ | |
| 28 | That's it for now -sm */ | ||
| 29 | 27 | ||
| 30 | /* AIX requires this to be the first thing in the file. */ | 28 | /* AIX requires this to be the first thing in the file. */ |
| 31 | #if defined (_AIX) && !defined (REGEX_MALLOC) | 29 | #if defined (_AIX) && !defined (REGEX_MALLOC) |
| @@ -1542,6 +1540,8 @@ static boolean at_endline_loc_p _RE_ARGS((const unsigned char *p, | |||
| 1542 | const unsigned char *pend, | 1540 | const unsigned char *pend, |
| 1543 | reg_syntax_t syntax)); | 1541 | reg_syntax_t syntax)); |
| 1544 | static unsigned char *skip_one_char _RE_ARGS((unsigned char *p)); | 1542 | static unsigned char *skip_one_char _RE_ARGS((unsigned char *p)); |
| 1543 | static int analyse_first _RE_ARGS((unsigned char *p, unsigned char *pend, | ||
| 1544 | char *fastmap, const int multibyte)); | ||
| 1545 | 1545 | ||
| 1546 | /* Fetch the next character in the uncompiled pattern---translating it | 1546 | /* Fetch the next character in the uncompiled pattern---translating it |
| 1547 | if necessary. Also cast from a signed character in the constant | 1547 | if necessary. Also cast from a signed character in the constant |
| @@ -2134,6 +2134,9 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 2134 | { | 2134 | { |
| 2135 | boolean simple = skip_one_char (laststart) == b; | 2135 | boolean simple = skip_one_char (laststart) == b; |
| 2136 | unsigned int startoffset = 0; | 2136 | unsigned int startoffset = 0; |
| 2137 | re_opcode_t ofj = | ||
| 2138 | (simple || !analyse_first (laststart, b, NULL, 0)) ? | ||
| 2139 | on_failure_jump : on_failure_jump_loop; | ||
| 2137 | assert (skip_one_char (laststart) <= b); | 2140 | assert (skip_one_char (laststart) <= b); |
| 2138 | 2141 | ||
| 2139 | if (!zero_times_ok && simple) | 2142 | if (!zero_times_ok && simple) |
| @@ -2152,14 +2155,13 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 2152 | GET_BUFFER_SPACE (6); | 2155 | GET_BUFFER_SPACE (6); |
| 2153 | if (!zero_times_ok) | 2156 | if (!zero_times_ok) |
| 2154 | /* A + loop. */ | 2157 | /* A + loop. */ |
| 2155 | STORE_JUMP (on_failure_jump_loop, b, b + 6); | 2158 | STORE_JUMP (ofj, b, b + 6); |
| 2156 | else | 2159 | else |
| 2157 | /* Simple * loops can use on_failure_keep_string_jump | 2160 | /* Simple * loops can use on_failure_keep_string_jump |
| 2158 | depending on what follows. But since we don't know | 2161 | depending on what follows. But since we don't know |
| 2159 | that yet, we leave the decision up to | 2162 | that yet, we leave the decision up to |
| 2160 | on_failure_jump_smart. */ | 2163 | on_failure_jump_smart. */ |
| 2161 | INSERT_JUMP (simple ? on_failure_jump_smart | 2164 | INSERT_JUMP (simple ? on_failure_jump_smart : ofj, |
| 2162 | : on_failure_jump_loop, | ||
| 2163 | laststart + startoffset, b + 6); | 2165 | laststart + startoffset, b + 6); |
| 2164 | b += 3; | 2166 | b += 3; |
| 2165 | STORE_JUMP (jump, b, laststart + startoffset); | 2167 | STORE_JUMP (jump, b, laststart + startoffset); |
| @@ -2180,10 +2182,13 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 2180 | GET_BUFFER_SPACE (7); /* We might use less. */ | 2182 | GET_BUFFER_SPACE (7); /* We might use less. */ |
| 2181 | if (many_times_ok) | 2183 | if (many_times_ok) |
| 2182 | { | 2184 | { |
| 2185 | boolean emptyp = analyse_first (laststart, b, NULL, 0); | ||
| 2186 | |||
| 2183 | /* The non-greedy multiple match looks like a repeat..until: | 2187 | /* The non-greedy multiple match looks like a repeat..until: |
| 2184 | we only need a conditional jump at the end of the loop */ | 2188 | we only need a conditional jump at the end of the loop */ |
| 2185 | BUF_PUSH (no_op); | 2189 | if (emptyp) BUF_PUSH (no_op); |
| 2186 | STORE_JUMP (on_failure_jump_nastyloop, b, laststart); | 2190 | STORE_JUMP (emptyp ? on_failure_jump_nastyloop |
| 2191 | : on_failure_jump, b, laststart); | ||
| 2187 | b += 3; | 2192 | b += 3; |
| 2188 | if (zero_times_ok) | 2193 | if (zero_times_ok) |
| 2189 | { | 2194 | { |
| @@ -3268,26 +3273,23 @@ group_in_compile_stack (compile_stack, regnum) | |||
| 3268 | return false; | 3273 | return false; |
| 3269 | } | 3274 | } |
| 3270 | 3275 | ||
| 3271 | /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in | 3276 | /* analyse_first. |
| 3272 | BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible | 3277 | If fastmap is non-NULL, go through the pattern and fill fastmap |
| 3273 | characters can start a string that matches the pattern. This fastmap | 3278 | with all the possible leading chars. If fastmap is NULL, don't |
| 3274 | is used by re_search to skip quickly over impossible starting points. | 3279 | bother filling it up (obviously) and only return whether the |
| 3275 | 3280 | pattern could potentially match the empty string. | |
| 3276 | Character codes above (1 << BYTEWIDTH) are not represented in the | 3281 | |
| 3277 | fastmap, but the leading codes are represented. Thus, the fastmap | 3282 | Return 1 if p..pend might match the empty string. |
| 3278 | indicates which character sets could start a match. | 3283 | Return 0 if p..pend matches at least one char. |
| 3279 | 3284 | Return -1 if p..pend matches at least one char, but fastmap was not | |
| 3280 | The caller must supply the address of a (1 << BYTEWIDTH)-byte data | 3285 | updated accurately. |
| 3281 | area as BUFP->fastmap. | 3286 | Return -2 if an error occurred. */ |
| 3282 | |||
| 3283 | We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in | ||
| 3284 | the pattern buffer. | ||
| 3285 | |||
| 3286 | Returns 0 if we succeed, -2 if an internal error. */ | ||
| 3287 | 3287 | ||
| 3288 | int | 3288 | static int |
| 3289 | re_compile_fastmap (bufp) | 3289 | analyse_first (p, pend, fastmap, multibyte) |
| 3290 | struct re_pattern_buffer *bufp; | 3290 | unsigned char *p, *pend; |
| 3291 | char *fastmap; | ||
| 3292 | const int multibyte; | ||
| 3291 | { | 3293 | { |
| 3292 | int j, k; | 3294 | int j, k; |
| 3293 | boolean not; | 3295 | boolean not; |
| @@ -3298,13 +3300,6 @@ re_compile_fastmap (bufp) | |||
| 3298 | char *destination; | 3300 | char *destination; |
| 3299 | #endif | 3301 | #endif |
| 3300 | 3302 | ||
| 3301 | register char *fastmap = bufp->fastmap; | ||
| 3302 | unsigned char *pattern = bufp->buffer; | ||
| 3303 | unsigned long size = bufp->used; | ||
| 3304 | unsigned char *p = pattern; | ||
| 3305 | register unsigned char *pend = pattern + size; | ||
| 3306 | const boolean multibyte = bufp->multibyte; | ||
| 3307 | |||
| 3308 | #if defined (REL_ALLOC) && defined (REGEX_MALLOC) | 3303 | #if defined (REL_ALLOC) && defined (REGEX_MALLOC) |
| 3309 | /* This holds the pointer to the failure stack, when | 3304 | /* This holds the pointer to the failure stack, when |
| 3310 | it is allocated relocatably. */ | 3305 | it is allocated relocatably. */ |
| @@ -3321,12 +3316,9 @@ re_compile_fastmap (bufp) | |||
| 3321 | flag is set true. */ | 3316 | flag is set true. */ |
| 3322 | boolean match_any_multibyte_characters = false; | 3317 | boolean match_any_multibyte_characters = false; |
| 3323 | 3318 | ||
| 3324 | assert (fastmap != NULL && p != NULL); | 3319 | assert (p); |
| 3325 | 3320 | ||
| 3326 | INIT_FAIL_STACK (); | 3321 | INIT_FAIL_STACK (); |
| 3327 | bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */ | ||
| 3328 | bufp->fastmap_accurate = 1; /* It will be when we're done. */ | ||
| 3329 | bufp->can_be_null = 0; | ||
| 3330 | 3322 | ||
| 3331 | /* The loop below works as follows: | 3323 | /* The loop below works as follows: |
| 3332 | - It has a working-list kept in the PATTERN_STACK and which basically | 3324 | - It has a working-list kept in the PATTERN_STACK and which basically |
| @@ -3344,7 +3336,7 @@ re_compile_fastmap (bufp) | |||
| 3344 | never set `p' (or push) anything `<= p1'. */ | 3336 | never set `p' (or push) anything `<= p1'. */ |
| 3345 | 3337 | ||
| 3346 | /* If can_be_null is set, then the fastmap will not be used anyway. */ | 3338 | /* If can_be_null is set, then the fastmap will not be used anyway. */ |
| 3347 | while (!bufp->can_be_null) | 3339 | while (1) |
| 3348 | { | 3340 | { |
| 3349 | /* `p1' is used as a marker of how far back a `on_failure_jump' | 3341 | /* `p1' is used as a marker of how far back a `on_failure_jump' |
| 3350 | can go without being ignored. It is normally equal to `p' | 3342 | can go without being ignored. It is normally equal to `p' |
| @@ -3356,22 +3348,18 @@ re_compile_fastmap (bufp) | |||
| 3356 | as used for the *? operator. */ | 3348 | as used for the *? operator. */ |
| 3357 | unsigned char *p1 = p; | 3349 | unsigned char *p1 = p; |
| 3358 | 3350 | ||
| 3359 | if (p >= pend || *p == succeed) | 3351 | if (p >= pend) |
| 3360 | { | 3352 | { |
| 3361 | /* We have reached the (effective) end of pattern. */ | 3353 | if (path_can_be_null) |
| 3362 | if (!PATTERN_STACK_EMPTY ()) | 3354 | return (RESET_FAIL_STACK (), 1); |
| 3363 | { | ||
| 3364 | bufp->can_be_null |= path_can_be_null; | ||
| 3365 | 3355 | ||
| 3366 | /* Reset for next path. */ | 3356 | /* We have reached the (effective) end of pattern. */ |
| 3367 | path_can_be_null = true; | 3357 | if (PATTERN_STACK_EMPTY ()) |
| 3368 | 3358 | return (RESET_FAIL_STACK (), 0); | |
| 3369 | p = (unsigned char*) POP_PATTERN_OP (); | ||
| 3370 | 3359 | ||
| 3371 | continue; | 3360 | p = (unsigned char*) POP_PATTERN_OP (); |
| 3372 | } | 3361 | path_can_be_null = true; |
| 3373 | else | 3362 | continue; |
| 3374 | break; | ||
| 3375 | } | 3363 | } |
| 3376 | 3364 | ||
| 3377 | /* We should never be about to go beyond the end of the pattern. */ | 3365 | /* We should never be about to go beyond the end of the pattern. */ |
| @@ -3379,6 +3367,9 @@ re_compile_fastmap (bufp) | |||
| 3379 | 3367 | ||
| 3380 | switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++)) | 3368 | switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++)) |
| 3381 | { | 3369 | { |
| 3370 | case succeed: | ||
| 3371 | p = pend; | ||
| 3372 | continue; | ||
| 3382 | 3373 | ||
| 3383 | case duplicate: | 3374 | case duplicate: |
| 3384 | /* If the first character has to match a backreference, that means | 3375 | /* If the first character has to match a backreference, that means |
| @@ -3393,15 +3384,15 @@ re_compile_fastmap (bufp) | |||
| 3393 | with `break'. */ | 3384 | with `break'. */ |
| 3394 | 3385 | ||
| 3395 | case exactn: | 3386 | case exactn: |
| 3396 | fastmap[p[1]] = 1; | 3387 | if (fastmap) fastmap[p[1]] = 1; |
| 3397 | break; | 3388 | break; |
| 3398 | 3389 | ||
| 3399 | 3390 | ||
| 3400 | case anychar: | 3391 | case anychar: |
| 3401 | /* We could put all the chars except for \n (and maybe \0) | 3392 | /* We could put all the chars except for \n (and maybe \0) |
| 3402 | but we don't bother since it is generally not worth it. */ | 3393 | but we don't bother since it is generally not worth it. */ |
| 3403 | bufp->can_be_null = 1; | 3394 | if (!fastmap) break; |
| 3404 | continue; | 3395 | return (RESET_FAIL_STACK (), -1); |
| 3405 | 3396 | ||
| 3406 | 3397 | ||
| 3407 | case charset_not: | 3398 | case charset_not: |
| @@ -3476,8 +3467,7 @@ re_compile_fastmap (bufp) | |||
| 3476 | #else /* emacs */ | 3467 | #else /* emacs */ |
| 3477 | /* This match depends on text properties. These end with | 3468 | /* This match depends on text properties. These end with |
| 3478 | aborting optimizations. */ | 3469 | aborting optimizations. */ |
| 3479 | bufp->can_be_null = 1; | 3470 | return (RESET_FAIL_STACK (), -1); |
| 3480 | continue; | ||
| 3481 | 3471 | ||
| 3482 | case categoryspec: | 3472 | case categoryspec: |
| 3483 | case notcategoryspec: | 3473 | case notcategoryspec: |
| @@ -3593,10 +3583,43 @@ re_compile_fastmap (bufp) | |||
| 3593 | p = pend; | 3583 | p = pend; |
| 3594 | } /* while p */ | 3584 | } /* while p */ |
| 3595 | 3585 | ||
| 3596 | /* Set `can_be_null' for the last path (also the first path, if the | 3586 | return (RESET_FAIL_STACK (), 0); |
| 3597 | pattern is empty). */ | 3587 | } /* analyse_first */ |
| 3598 | bufp->can_be_null |= path_can_be_null; | 3588 | |
| 3599 | RESET_FAIL_STACK (); | 3589 | /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in |
| 3590 | BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible | ||
| 3591 | characters can start a string that matches the pattern. This fastmap | ||
| 3592 | is used by re_search to skip quickly over impossible starting points. | ||
| 3593 | |||
| 3594 | Character codes above (1 << BYTEWIDTH) are not represented in the | ||
| 3595 | fastmap, but the leading codes are represented. Thus, the fastmap | ||
| 3596 | indicates which character sets could start a match. | ||
| 3597 | |||
| 3598 | The caller must supply the address of a (1 << BYTEWIDTH)-byte data | ||
| 3599 | area as BUFP->fastmap. | ||
| 3600 | |||
| 3601 | We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in | ||
| 3602 | the pattern buffer. | ||
| 3603 | |||
| 3604 | Returns 0 if we succeed, -2 if an internal error. */ | ||
| 3605 | |||
| 3606 | int | ||
| 3607 | re_compile_fastmap (bufp) | ||
| 3608 | struct re_pattern_buffer *bufp; | ||
| 3609 | { | ||
| 3610 | char *fastmap = bufp->fastmap; | ||
| 3611 | int analysis; | ||
| 3612 | |||
| 3613 | assert (fastmap && bufp->buffer); | ||
| 3614 | |||
| 3615 | bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */ | ||
| 3616 | bufp->fastmap_accurate = 1; /* It will be when we're done. */ | ||
| 3617 | |||
| 3618 | analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used, | ||
| 3619 | fastmap, bufp->multibyte); | ||
| 3620 | if (analysis < -1) | ||
| 3621 | return analysis; | ||
| 3622 | bufp->can_be_null = (analysis != 0); | ||
| 3600 | return 0; | 3623 | return 0; |
| 3601 | } /* re_compile_fastmap */ | 3624 | } /* re_compile_fastmap */ |
| 3602 | 3625 | ||