diff options
Diffstat (limited to 'src/regex.c')
| -rw-r--r-- | src/regex.c | 259 |
1 files changed, 127 insertions, 132 deletions
diff --git a/src/regex.c b/src/regex.c index 43351b380de..b4f2d8d8049 100644 --- a/src/regex.c +++ b/src/regex.c | |||
| @@ -19,9 +19,7 @@ | |||
| 19 | Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, | 19 | Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, |
| 20 | USA. */ | 20 | USA. */ |
| 21 | 21 | ||
| 22 | /* BUGS: | 22 | /* TODO: |
| 23 | - (x?)*y\1z should match both xxxxyxz and xxxyz. | ||
| 24 | TODO: | ||
| 25 | - structure the opcode space into opcode+flag. | 23 | - structure the opcode space into opcode+flag. |
| 26 | - merge with glibc's regex.[ch]. | 24 | - merge with glibc's regex.[ch]. |
| 27 | - replace (succeed_n + jump_n + set_number_at) with something that doesn't | 25 | - replace (succeed_n + jump_n + set_number_at) with something that doesn't |
| @@ -1520,26 +1518,6 @@ do { \ | |||
| 1520 | } \ | 1518 | } \ |
| 1521 | } while (0) | 1519 | } while (0) |
| 1522 | 1520 | ||
| 1523 | /* Discard a saved register off the stack. */ | ||
| 1524 | #define DISCARD_FAILURE_REG_OR_COUNT() \ | ||
| 1525 | do { \ | ||
| 1526 | int reg = POP_FAILURE_INT (); \ | ||
| 1527 | if (reg == -1) \ | ||
| 1528 | { \ | ||
| 1529 | /* It's a counter. */ \ | ||
| 1530 | POP_FAILURE_POINTER (); \ | ||
| 1531 | reg = POP_FAILURE_INT (); \ | ||
| 1532 | DEBUG_PRINT3 (" Discard counter %p = %d\n", ptr, reg); \ | ||
| 1533 | } \ | ||
| 1534 | else \ | ||
| 1535 | { \ | ||
| 1536 | POP_FAILURE_POINTER (); \ | ||
| 1537 | POP_FAILURE_POINTER (); \ | ||
| 1538 | DEBUG_PRINT4 (" Discard reg %d (spanning %p -> %p)\n", \ | ||
| 1539 | reg, regstart[reg], regend[reg]); \ | ||
| 1540 | } \ | ||
| 1541 | } while (0) | ||
| 1542 | |||
| 1543 | /* Check that we are not stuck in an infinite loop. */ | 1521 | /* Check that we are not stuck in an infinite loop. */ |
| 1544 | #define CHECK_INFINITE_LOOP(pat_cur, string_place) \ | 1522 | #define CHECK_INFINITE_LOOP(pat_cur, string_place) \ |
| 1545 | do { \ | 1523 | do { \ |
| @@ -1553,16 +1531,15 @@ do { \ | |||
| 1553 | && FAILURE_PAT (failure) <= bufp->buffer + bufp->used); \ | 1531 | && FAILURE_PAT (failure) <= bufp->buffer + bufp->used); \ |
| 1554 | if (FAILURE_PAT (failure) == pat_cur) \ | 1532 | if (FAILURE_PAT (failure) == pat_cur) \ |
| 1555 | { \ | 1533 | { \ |
| 1556 | while (fail_stack.frame < fail_stack.avail) \ | 1534 | cycle = 1; \ |
| 1557 | DISCARD_FAILURE_REG_OR_COUNT (); \ | 1535 | break; \ |
| 1558 | goto fail; \ | ||
| 1559 | } \ | 1536 | } \ |
| 1560 | DEBUG_PRINT2 (" Other pattern: %p\n", FAILURE_PAT (failure)); \ | 1537 | DEBUG_PRINT2 (" Other pattern: %p\n", FAILURE_PAT (failure)); \ |
| 1561 | failure = NEXT_FAILURE_HANDLE(failure); \ | 1538 | failure = NEXT_FAILURE_HANDLE(failure); \ |
| 1562 | } \ | 1539 | } \ |
| 1563 | DEBUG_PRINT2 (" Other string: %p\n", FAILURE_STR (failure)); \ | 1540 | DEBUG_PRINT2 (" Other string: %p\n", FAILURE_STR (failure)); \ |
| 1564 | } while (0) | 1541 | } while (0) |
| 1565 | 1542 | ||
| 1566 | /* Push the information about the state we will need | 1543 | /* Push the information about the state we will need |
| 1567 | if we ever fail back to it. | 1544 | if we ever fail back to it. |
| 1568 | 1545 | ||
| @@ -2645,8 +2622,8 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 2645 | unsigned int startoffset = 0; | 2622 | unsigned int startoffset = 0; |
| 2646 | re_opcode_t ofj = | 2623 | re_opcode_t ofj = |
| 2647 | /* Check if the loop can match the empty string. */ | 2624 | /* Check if the loop can match the empty string. */ |
| 2648 | (simple || !analyse_first (laststart, b, NULL, 0)) ? | 2625 | (simple || !analyse_first (laststart, b, NULL, 0)) |
| 2649 | on_failure_jump : on_failure_jump_loop; | 2626 | ? on_failure_jump : on_failure_jump_loop; |
| 2650 | assert (skip_one_char (laststart) <= b); | 2627 | assert (skip_one_char (laststart) <= b); |
| 2651 | 2628 | ||
| 2652 | if (!zero_times_ok && simple) | 2629 | if (!zero_times_ok && simple) |
| @@ -2694,8 +2671,9 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 2694 | { | 2671 | { |
| 2695 | boolean emptyp = analyse_first (laststart, b, NULL, 0); | 2672 | boolean emptyp = analyse_first (laststart, b, NULL, 0); |
| 2696 | 2673 | ||
| 2697 | /* The non-greedy multiple match looks like a repeat..until: | 2674 | /* The non-greedy multiple match looks like |
| 2698 | we only need a conditional jump at the end of the loop */ | 2675 | a repeat..until: we only need a conditional jump |
| 2676 | at the end of the loop. */ | ||
| 2699 | if (emptyp) BUF_PUSH (no_op); | 2677 | if (emptyp) BUF_PUSH (no_op); |
| 2700 | STORE_JUMP (emptyp ? on_failure_jump_nastyloop | 2678 | STORE_JUMP (emptyp ? on_failure_jump_nastyloop |
| 2701 | : on_failure_jump, b, laststart); | 2679 | : on_failure_jump, b, laststart); |
| @@ -2704,7 +2682,7 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 2704 | { | 2682 | { |
| 2705 | /* The repeat...until naturally matches one or more. | 2683 | /* The repeat...until naturally matches one or more. |
| 2706 | To also match zero times, we need to first jump to | 2684 | To also match zero times, we need to first jump to |
| 2707 | the end of the loop (its conditional jump). */ | 2685 | the end of the loop (its conditional jump). */ |
| 2708 | INSERT_JUMP (jump, laststart, b); | 2686 | INSERT_JUMP (jump, laststart, b); |
| 2709 | b += 3; | 2687 | b += 3; |
| 2710 | } | 2688 | } |
| @@ -3241,99 +3219,99 @@ regex_compile (pattern, size, syntax, bufp) | |||
| 3241 | goto unfetch_interval; | 3219 | goto unfetch_interval; |
| 3242 | } | 3220 | } |
| 3243 | 3221 | ||
| 3244 | if (upper_bound == 0) | 3222 | if (upper_bound == 0) |
| 3245 | /* If the upper bound is zero, just drop the sub pattern | 3223 | /* If the upper bound is zero, just drop the sub pattern |
| 3246 | altogether. */ | 3224 | altogether. */ |
| 3247 | b = laststart; | 3225 | b = laststart; |
| 3248 | else if (lower_bound == 1 && upper_bound == 1) | 3226 | else if (lower_bound == 1 && upper_bound == 1) |
| 3249 | /* Just match it once: nothing to do here. */ | 3227 | /* Just match it once: nothing to do here. */ |
| 3250 | ; | 3228 | ; |
| 3251 | 3229 | ||
| 3252 | /* Otherwise, we have a nontrivial interval. When | 3230 | /* Otherwise, we have a nontrivial interval. When |
| 3253 | we're all done, the pattern will look like: | 3231 | we're all done, the pattern will look like: |
| 3254 | set_number_at <jump count> <upper bound> | 3232 | set_number_at <jump count> <upper bound> |
| 3255 | set_number_at <succeed_n count> <lower bound> | 3233 | set_number_at <succeed_n count> <lower bound> |
| 3256 | succeed_n <after jump addr> <succeed_n count> | 3234 | succeed_n <after jump addr> <succeed_n count> |
| 3257 | <body of loop> | 3235 | <body of loop> |
| 3258 | jump_n <succeed_n addr> <jump count> | 3236 | jump_n <succeed_n addr> <jump count> |
| 3259 | (The upper bound and `jump_n' are omitted if | 3237 | (The upper bound and `jump_n' are omitted if |
| 3260 | `upper_bound' is 1, though.) */ | 3238 | `upper_bound' is 1, though.) */ |
| 3261 | else | 3239 | else |
| 3262 | { /* If the upper bound is > 1, we need to insert | 3240 | { /* If the upper bound is > 1, we need to insert |
| 3263 | more at the end of the loop. */ | 3241 | more at the end of the loop. */ |
| 3264 | unsigned int nbytes = (upper_bound < 0 ? 3 | 3242 | unsigned int nbytes = (upper_bound < 0 ? 3 |
| 3265 | : upper_bound > 1 ? 5 : 0); | 3243 | : upper_bound > 1 ? 5 : 0); |
| 3266 | unsigned int startoffset = 0; | 3244 | unsigned int startoffset = 0; |
| 3267 | 3245 | ||
| 3268 | GET_BUFFER_SPACE (20); /* We might use less. */ | 3246 | GET_BUFFER_SPACE (20); /* We might use less. */ |
| 3269 | 3247 | ||
| 3270 | if (lower_bound == 0) | 3248 | if (lower_bound == 0) |
| 3271 | { | 3249 | { |
| 3272 | /* A succeed_n that starts with 0 is really a | 3250 | /* A succeed_n that starts with 0 is really a |
| 3273 | a simple on_failure_jump_loop. */ | 3251 | a simple on_failure_jump_loop. */ |
| 3274 | INSERT_JUMP (on_failure_jump_loop, laststart, | 3252 | INSERT_JUMP (on_failure_jump_loop, laststart, |
| 3275 | b + 3 + nbytes); | 3253 | b + 3 + nbytes); |
| 3276 | b += 3; | 3254 | b += 3; |
| 3277 | } | 3255 | } |
| 3278 | else | 3256 | else |
| 3279 | { | 3257 | { |
| 3280 | /* Initialize lower bound of the `succeed_n', even | 3258 | /* Initialize lower bound of the `succeed_n', even |
| 3281 | though it will be set during matching by its | 3259 | though it will be set during matching by its |
| 3282 | attendant `set_number_at' (inserted next), | 3260 | attendant `set_number_at' (inserted next), |
| 3283 | because `re_compile_fastmap' needs to know. | 3261 | because `re_compile_fastmap' needs to know. |
| 3284 | Jump to the `jump_n' we might insert below. */ | 3262 | Jump to the `jump_n' we might insert below. */ |
| 3285 | INSERT_JUMP2 (succeed_n, laststart, | 3263 | INSERT_JUMP2 (succeed_n, laststart, |
| 3286 | b + 5 + nbytes, | 3264 | b + 5 + nbytes, |
| 3287 | lower_bound); | 3265 | lower_bound); |
| 3288 | b += 5; | 3266 | b += 5; |
| 3289 | 3267 | ||
| 3290 | /* Code to initialize the lower bound. Insert | 3268 | /* Code to initialize the lower bound. Insert |
| 3291 | before the `succeed_n'. The `5' is the last two | 3269 | before the `succeed_n'. The `5' is the last two |
| 3292 | bytes of this `set_number_at', plus 3 bytes of | 3270 | bytes of this `set_number_at', plus 3 bytes of |
| 3293 | the following `succeed_n'. */ | 3271 | the following `succeed_n'. */ |
| 3294 | insert_op2 (set_number_at, laststart, 5, lower_bound, b); | 3272 | insert_op2 (set_number_at, laststart, 5, lower_bound, b); |
| 3295 | b += 5; | 3273 | b += 5; |
| 3296 | startoffset += 5; | 3274 | startoffset += 5; |
| 3297 | } | 3275 | } |
| 3298 | 3276 | ||
| 3299 | if (upper_bound < 0) | 3277 | if (upper_bound < 0) |
| 3300 | { | 3278 | { |
| 3301 | /* A negative upper bound stands for infinity, | 3279 | /* A negative upper bound stands for infinity, |
| 3302 | in which case it degenerates to a plain jump. */ | 3280 | in which case it degenerates to a plain jump. */ |
| 3303 | STORE_JUMP (jump, b, laststart + startoffset); | 3281 | STORE_JUMP (jump, b, laststart + startoffset); |
| 3304 | b += 3; | 3282 | b += 3; |
| 3305 | } | 3283 | } |
| 3306 | else if (upper_bound > 1) | 3284 | else if (upper_bound > 1) |
| 3307 | { /* More than one repetition is allowed, so | 3285 | { /* More than one repetition is allowed, so |
| 3308 | append a backward jump to the `succeed_n' | 3286 | append a backward jump to the `succeed_n' |
| 3309 | that starts this interval. | 3287 | that starts this interval. |
| 3310 | 3288 | ||
| 3311 | When we've reached this during matching, | 3289 | When we've reached this during matching, |
| 3312 | we'll have matched the interval once, so | 3290 | we'll have matched the interval once, so |
| 3313 | jump back only `upper_bound - 1' times. */ | 3291 | jump back only `upper_bound - 1' times. */ |
| 3314 | STORE_JUMP2 (jump_n, b, laststart + startoffset, | 3292 | STORE_JUMP2 (jump_n, b, laststart + startoffset, |
| 3315 | upper_bound - 1); | 3293 | upper_bound - 1); |
| 3316 | b += 5; | 3294 | b += 5; |
| 3317 | 3295 | ||
| 3318 | /* The location we want to set is the second | 3296 | /* The location we want to set is the second |
| 3319 | parameter of the `jump_n'; that is `b-2' as | 3297 | parameter of the `jump_n'; that is `b-2' as |
| 3320 | an absolute address. `laststart' will be | 3298 | an absolute address. `laststart' will be |
| 3321 | the `set_number_at' we're about to insert; | 3299 | the `set_number_at' we're about to insert; |
| 3322 | `laststart+3' the number to set, the source | 3300 | `laststart+3' the number to set, the source |
| 3323 | for the relative address. But we are | 3301 | for the relative address. But we are |
| 3324 | inserting into the middle of the pattern -- | 3302 | inserting into the middle of the pattern -- |
| 3325 | so everything is getting moved up by 5. | 3303 | so everything is getting moved up by 5. |
| 3326 | Conclusion: (b - 2) - (laststart + 3) + 5, | 3304 | Conclusion: (b - 2) - (laststart + 3) + 5, |
| 3327 | i.e., b - laststart. | 3305 | i.e., b - laststart. |
| 3328 | 3306 | ||
| 3329 | We insert this at the beginning of the loop | 3307 | We insert this at the beginning of the loop |
| 3330 | so that if we fail during matching, we'll | 3308 | so that if we fail during matching, we'll |
| 3331 | reinitialize the bounds. */ | 3309 | reinitialize the bounds. */ |
| 3332 | insert_op2 (set_number_at, laststart, b - laststart, | 3310 | insert_op2 (set_number_at, laststart, b - laststart, |
| 3333 | upper_bound - 1, b); | 3311 | upper_bound - 1, b); |
| 3334 | b += 5; | 3312 | b += 5; |
| 3335 | } | 3313 | } |
| 3336 | } | 3314 | } |
| 3337 | pending_exact = 0; | 3315 | pending_exact = 0; |
| 3338 | beg_interval = NULL; | 3316 | beg_interval = NULL; |
| 3339 | } | 3317 | } |
| @@ -5518,7 +5496,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) | |||
| 5518 | cycle detection cannot work. Worse yet, such a detection | 5496 | cycle detection cannot work. Worse yet, such a detection |
| 5519 | can not only fail to detect a cycle, but it can also wrongly | 5497 | can not only fail to detect a cycle, but it can also wrongly |
| 5520 | detect a cycle (between different instantiations of the same | 5498 | detect a cycle (between different instantiations of the same |
| 5521 | loop. | 5499 | loop). |
| 5522 | So the method used for those nasty loops is a little different: | 5500 | So the method used for those nasty loops is a little different: |
| 5523 | We use a special cycle-detection-stack-frame which is pushed | 5501 | We use a special cycle-detection-stack-frame which is pushed |
| 5524 | when the on_failure_jump_nastyloop failure-point is *popped*. | 5502 | when the on_failure_jump_nastyloop failure-point is *popped*. |
| @@ -5532,11 +5510,18 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) | |||
| 5532 | mcnt, p + mcnt); | 5510 | mcnt, p + mcnt); |
| 5533 | 5511 | ||
| 5534 | assert ((re_opcode_t)p[-4] == no_op); | 5512 | assert ((re_opcode_t)p[-4] == no_op); |
| 5535 | CHECK_INFINITE_LOOP (p - 4, d); | 5513 | { |
| 5536 | PUSH_FAILURE_POINT (p - 3, d); | 5514 | int cycle = 0; |
| 5515 | CHECK_INFINITE_LOOP (p - 4, d); | ||
| 5516 | if (!cycle) | ||
| 5517 | /* If there's a cycle, just continue without pushing | ||
| 5518 | this failure point. The failure point is the "try again" | ||
| 5519 | option, which shouldn't be tried. | ||
| 5520 | We want (x?)*?y\1z to match both xxyz and xxyxz. */ | ||
| 5521 | PUSH_FAILURE_POINT (p - 3, d); | ||
| 5522 | } | ||
| 5537 | break; | 5523 | break; |
| 5538 | 5524 | ||
| 5539 | |||
| 5540 | /* Simple loop detecting on_failure_jump: just check on the | 5525 | /* Simple loop detecting on_failure_jump: just check on the |
| 5541 | failure stack if the same spot was already hit earlier. */ | 5526 | failure stack if the same spot was already hit earlier. */ |
| 5542 | case on_failure_jump_loop: | 5527 | case on_failure_jump_loop: |
| @@ -5544,9 +5529,19 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) | |||
| 5544 | EXTRACT_NUMBER_AND_INCR (mcnt, p); | 5529 | EXTRACT_NUMBER_AND_INCR (mcnt, p); |
| 5545 | DEBUG_PRINT3 ("EXECUTING on_failure_jump_loop %d (to %p):\n", | 5530 | DEBUG_PRINT3 ("EXECUTING on_failure_jump_loop %d (to %p):\n", |
| 5546 | mcnt, p + mcnt); | 5531 | mcnt, p + mcnt); |
| 5547 | 5532 | { | |
| 5548 | CHECK_INFINITE_LOOP (p - 3, d); | 5533 | int cycle = 0; |
| 5549 | PUSH_FAILURE_POINT (p - 3, d); | 5534 | CHECK_INFINITE_LOOP (p - 3, d); |
| 5535 | if (cycle) | ||
| 5536 | /* If there's a cycle, get out of the loop, as if the matching | ||
| 5537 | had failed. We used to just `goto fail' here, but that was | ||
| 5538 | aborting the search a bit too early: we want to keep the | ||
| 5539 | empty-loop-match and keep matching after the loop. | ||
| 5540 | We want (x?)*y\1z to match both xxyz and xxyxz. */ | ||
| 5541 | p += mcnt; | ||
| 5542 | else | ||
| 5543 | PUSH_FAILURE_POINT (p - 3, d); | ||
| 5544 | } | ||
| 5550 | break; | 5545 | break; |
| 5551 | 5546 | ||
| 5552 | 5547 | ||