diff options
| author | Stefan Monnier | 2023-09-29 17:39:10 -0400 |
|---|---|---|
| committer | Stefan Monnier | 2023-09-29 17:39:10 -0400 |
| commit | e61a03984335b4ffb164280b2df80668b2a92c23 (patch) | |
| tree | 561cddddaf41c7ea3f44466f7881b5f927f69173 | |
| parent | 6e44d6e18438ea2665ae6252a6ec090963dd7e42 (diff) | |
| download | emacs-e61a03984335b4ffb164280b2df80668b2a92c23.tar.gz emacs-e61a03984335b4ffb164280b2df80668b2a92c23.zip | |
regex.c: Consolidate the two analysis functions
We currently have two functions that analyze the bytecode
to try and apply optimizations: `analyze_first` and `mutually_exclusive_p`.
Extract the common code between them into a new function `forall_firstchar`,
and then rewrite the old ones on top of that one.
Along the way, we get slightly better analyses that reverts
the recent de-optimizations but without re-introducing the
corresponding bugs.
* src/regex-emacs.c (forall_firstchar_1, forall_firstchar): New functions.
(analyze_first_old): Rename from `analyze_first`.
(struct anafirst_data): New struct.
(analyze_first_fastmap, analyze_first_null): New functions.
(analyze_first): Rewrite to use `forall_firstchar` with those two functions.
Take a `bufp` rather than a `multibyte` arg.
(regex_compile, re_compile_fastmap): Adjust calls accordingly.
(struct mutexcl_data): New struct.
(mutually_exclusive_one): New function.
(mutually_exclusive_p): Rewrite to use `forall_firstchar` with that function.
| -rw-r--r-- | src/regex-emacs.c | 591 |
1 files changed, 580 insertions, 11 deletions
diff --git a/src/regex-emacs.c b/src/regex-emacs.c index 33bdad5c46b..dbd5bcb8953 100644 --- a/src/regex-emacs.c +++ b/src/regex-emacs.c | |||
| @@ -1179,8 +1179,8 @@ static void insert_op2 (re_opcode_t op, unsigned char *loc, | |||
| 1179 | static bool at_begline_loc_p (re_char *pattern, re_char *p); | 1179 | static bool at_begline_loc_p (re_char *pattern, re_char *p); |
| 1180 | static bool at_endline_loc_p (re_char *p, re_char *pend); | 1180 | static bool at_endline_loc_p (re_char *p, re_char *pend); |
| 1181 | static re_char *skip_one_char (re_char *p); | 1181 | static re_char *skip_one_char (re_char *p); |
| 1182 | static bool analyze_first (re_char *p, re_char *pend, | 1182 | static bool analyze_first (struct re_pattern_buffer *bufp, |
| 1183 | char *fastmap, bool multibyte); | 1183 | re_char *p, re_char *pend, char *fastmap); |
| 1184 | 1184 | ||
| 1185 | /* Fetch the next character in the uncompiled pattern, with no | 1185 | /* Fetch the next character in the uncompiled pattern, with no |
| 1186 | translation. */ | 1186 | translation. */ |
| @@ -1930,7 +1930,7 @@ regex_compile (re_char *pattern, ptrdiff_t size, | |||
| 1930 | ptrdiff_t startoffset = 0; | 1930 | ptrdiff_t startoffset = 0; |
| 1931 | re_opcode_t ofj = | 1931 | re_opcode_t ofj = |
| 1932 | /* Check if the loop can match the empty string. */ | 1932 | /* Check if the loop can match the empty string. */ |
| 1933 | (simple || !analyze_first (laststart, b, NULL, false)) | 1933 | (simple || !analyze_first (bufp, laststart, b, NULL)) |
| 1934 | ? on_failure_jump : on_failure_jump_loop; | 1934 | ? on_failure_jump : on_failure_jump_loop; |
| 1935 | eassert (skip_one_char (laststart) <= b); | 1935 | eassert (skip_one_char (laststart) <= b); |
| 1936 | 1936 | ||
| @@ -1987,7 +1987,7 @@ regex_compile (re_char *pattern, ptrdiff_t size, | |||
| 1987 | GET_BUFFER_SPACE (7); /* We might use less. */ | 1987 | GET_BUFFER_SPACE (7); /* We might use less. */ |
| 1988 | if (many_times_ok) | 1988 | if (many_times_ok) |
| 1989 | { | 1989 | { |
| 1990 | bool emptyp = analyze_first (laststart, b, NULL, false); | 1990 | bool emptyp = analyze_first (bufp, laststart, b, NULL); |
| 1991 | 1991 | ||
| 1992 | /* The non-greedy multiple match looks like | 1992 | /* The non-greedy multiple match looks like |
| 1993 | a repeat..until: we only need a conditional jump | 1993 | a repeat..until: we only need a conditional jump |
| @@ -2822,7 +2822,229 @@ group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum) | |||
| 2822 | return false; | 2822 | return false; |
| 2823 | } | 2823 | } |
| 2824 | 2824 | ||
| 2825 | /* analyze_first. | 2825 | /* Iterate through all the char-matching operations directly reachable from P. |
| 2826 | This is the inner loop of `forall_firstchar`, which see. | ||
| 2827 | LOOP_BEG..LOOP_END delimit the currentl "block" of code (we assume | ||
| 2828 | the code is made of syntactically nested loops). | ||
| 2829 | LOOP_END is blindly assumed to be "safe". | ||
| 2830 | To guarantee termination, at each iteration, either LOOP_BEG should | ||
| 2831 | get bigger, or it should stay the same and P should get bigger. */ | ||
| 2832 | static bool | ||
| 2833 | forall_firstchar_1 (re_char *p, re_char *pend, | ||
| 2834 | re_char *loop_beg, re_char *loop_end, | ||
| 2835 | bool f (const re_char *p, void *arg), void *arg) | ||
| 2836 | { | ||
| 2837 | eassert (p >= loop_beg); | ||
| 2838 | eassert (p <= loop_end); | ||
| 2839 | |||
| 2840 | while (true) | ||
| 2841 | { | ||
| 2842 | re_char *newp1, *newp2, *tmp; | ||
| 2843 | re_char *p_orig = p; | ||
| 2844 | |||
| 2845 | if (p == pend) | ||
| 2846 | return false; | ||
| 2847 | else if (p == loop_end) | ||
| 2848 | return true; | ||
| 2849 | else if (p > loop_end) | ||
| 2850 | { | ||
| 2851 | #if ENABLE_CHECKING | ||
| 2852 | fprintf (stderr, "FORALL_FIRSTCHAR: Broken assumption1!!\n"); | ||
| 2853 | #endif | ||
| 2854 | return false; /* FIXME: Broken assumption about the code shape. */ | ||
| 2855 | } | ||
| 2856 | else | ||
| 2857 | switch (*p) | ||
| 2858 | { | ||
| 2859 | /* Cases which stop the iteration. */ | ||
| 2860 | case succeed: | ||
| 2861 | case exactn: | ||
| 2862 | case charset: | ||
| 2863 | case charset_not: | ||
| 2864 | case anychar: | ||
| 2865 | case syntaxspec: | ||
| 2866 | case notsyntaxspec: | ||
| 2867 | case categoryspec: | ||
| 2868 | case notcategoryspec: | ||
| 2869 | return f (p, arg); | ||
| 2870 | |||
| 2871 | /* Cases which may match the empty string. */ | ||
| 2872 | case at_dot: | ||
| 2873 | case begbuf: | ||
| 2874 | case no_op: | ||
| 2875 | case wordbound: | ||
| 2876 | case notwordbound: | ||
| 2877 | case begline: | ||
| 2878 | p++; | ||
| 2879 | continue; | ||
| 2880 | |||
| 2881 | /* Cases which may match the empty string and may | ||
| 2882 | tell us something about the next char. */ | ||
| 2883 | case endline: | ||
| 2884 | case endbuf: | ||
| 2885 | case wordbeg: | ||
| 2886 | case wordend: | ||
| 2887 | case symbeg: | ||
| 2888 | case symend: | ||
| 2889 | if (f (p, arg)) | ||
| 2890 | return true; | ||
| 2891 | p++; | ||
| 2892 | continue; | ||
| 2893 | |||
| 2894 | case jump: | ||
| 2895 | case jump_n: | ||
| 2896 | newp1 = extract_address (p + 1); | ||
| 2897 | if (newp1 > p) | ||
| 2898 | { /* Forward jump, boring. */ | ||
| 2899 | p = newp1; | ||
| 2900 | continue; | ||
| 2901 | } | ||
| 2902 | switch (*newp1) | ||
| 2903 | { | ||
| 2904 | case on_failure_jump: | ||
| 2905 | case on_failure_keep_string_jump: | ||
| 2906 | case on_failure_jump_nastyloop: | ||
| 2907 | case on_failure_jump_loop: | ||
| 2908 | case on_failure_jump_smart: | ||
| 2909 | case succeed_n: | ||
| 2910 | newp2 = extract_address (newp1 + 1); | ||
| 2911 | goto do_twoway_jump; | ||
| 2912 | default: | ||
| 2913 | newp2 = loop_end; /* "Safe" choice. */ | ||
| 2914 | goto do_jump; | ||
| 2915 | } | ||
| 2916 | |||
| 2917 | case on_failure_jump: | ||
| 2918 | case on_failure_keep_string_jump: | ||
| 2919 | case on_failure_jump_nastyloop: | ||
| 2920 | case on_failure_jump_loop: | ||
| 2921 | case on_failure_jump_smart: | ||
| 2922 | newp1 = extract_address (p + 1); | ||
| 2923 | newp2 = p + 3; | ||
| 2924 | /* For `+` loops, we often have an `on_failure_jump` that skips | ||
| 2925 | forward over a subsequent `jump`. Recognize this pattern | ||
| 2926 | since that subsequent `jump` is the one that jumps to the | ||
| 2927 | loop-entry. */ | ||
| 2928 | newp2 = ((re_opcode_t) *newp2 == jump) | ||
| 2929 | ? extract_address (newp2 + 1) : newp2; | ||
| 2930 | |||
| 2931 | do_twoway_jump: | ||
| 2932 | /* We have to check that both destinations are safe. | ||
| 2933 | Arrange for `newp1` to be the smaller of the two. */ | ||
| 2934 | if (newp1 > newp2) | ||
| 2935 | (tmp = newp1, newp1 = newp2, newp2 = tmp); | ||
| 2936 | |||
| 2937 | if (newp2 <= p_orig) /* Both destinations go backward! */ | ||
| 2938 | { | ||
| 2939 | #if ENABLE_CHECKING | ||
| 2940 | fprintf (stderr, "FORALL_FIRSTCHAR: Broken assumption2!!\n"); | ||
| 2941 | #endif | ||
| 2942 | return false; | ||
| 2943 | } | ||
| 2944 | |||
| 2945 | if (!forall_firstchar_1 (newp2, pend, loop_beg, loop_end, f, arg)) | ||
| 2946 | return false; | ||
| 2947 | |||
| 2948 | do_jump: | ||
| 2949 | eassert (newp2 <= loop_end); | ||
| 2950 | if (newp1 <= p_orig) | ||
| 2951 | { | ||
| 2952 | if (newp1 < loop_beg) | ||
| 2953 | { | ||
| 2954 | #if ENABLE_CHECKING | ||
| 2955 | fprintf (stderr, "FORALL_FIRSTCHAR: Broken assumption3!!\n"); | ||
| 2956 | #endif | ||
| 2957 | return false; | ||
| 2958 | } | ||
| 2959 | else if (newp1 == loop_beg) | ||
| 2960 | /* If we jump backward to the entry point of the current loop | ||
| 2961 | it means it's a zero-length cycle through that loop; | ||
| 2962 | this cycle itself does not break safety. */ | ||
| 2963 | return true; | ||
| 2964 | else | ||
| 2965 | /* We jump backward to a new loop, nested within the current | ||
| 2966 | one. `newp1` is the entry point and `newp2` the exit of | ||
| 2967 | that inner loop. */ | ||
| 2968 | /* `p` gets smaller, but termination is still ensured because | ||
| 2969 | `loop_beg` gets bigger. */ | ||
| 2970 | (loop_beg = newp1, loop_end = newp2); | ||
| 2971 | } | ||
| 2972 | p = newp1; | ||
| 2973 | continue; | ||
| 2974 | |||
| 2975 | case succeed_n: | ||
| 2976 | newp1 = extract_address (p + 1); | ||
| 2977 | newp2 = p + 5; /* Skip the two bytes containing the count. */ | ||
| 2978 | goto do_twoway_jump; | ||
| 2979 | |||
| 2980 | case set_number_at: | ||
| 2981 | int offset = extract_number (p + 1); | ||
| 2982 | DEBUG_STATEMENT (eassert (extract_number (p + 3))); | ||
| 2983 | /* If we're setting the counter of an immediately following | ||
| 2984 | `succeed_n`, then this next execution of `succeed_n` will do | ||
| 2985 | nothing but decrement its counter and "fall through". | ||
| 2986 | So we do the fall through here to avoid considering the | ||
| 2987 | "on failure" part of the `succeed_n` which should only be | ||
| 2988 | considered when coming from the `jump(_n)` at the end of | ||
| 2989 | the loop. */ | ||
| 2990 | p += (offset == 5 && p[5] == succeed_n) ? 10 : 5; | ||
| 2991 | continue; | ||
| 2992 | |||
| 2993 | case start_memory: | ||
| 2994 | case stop_memory: | ||
| 2995 | p += 2; | ||
| 2996 | continue; | ||
| 2997 | |||
| 2998 | /* This could match the empty string, so we may need to continue, | ||
| 2999 | but in most cases, this can match "anything", so we should | ||
| 3000 | return `false` unless told otherwise. */ | ||
| 3001 | case duplicate: | ||
| 3002 | if (!f (p, arg)) | ||
| 3003 | return false; | ||
| 3004 | p += 2; | ||
| 3005 | continue; | ||
| 3006 | |||
| 3007 | default: | ||
| 3008 | abort (); /* We have listed all the cases. */ | ||
| 3009 | } | ||
| 3010 | } | ||
| 3011 | } | ||
| 3012 | |||
| 3013 | /* Iterate through all the char-matching operations directly reachable from P. | ||
| 3014 | Return true if P is "safe", meaning that PEND cannot be reached directly | ||
| 3015 | from P and all calls to F returned true. | ||
| 3016 | Return false if PEND *may* be directly reachable from P or if one of | ||
| 3017 | the calls to F returned false. | ||
| 3018 | PEND can be NULL (and hence never reachable). | ||
| 3019 | |||
| 3020 | Call `F (POS, ARG)` for every POS directly reachable from P, | ||
| 3021 | before reaching PEND, where POS is the position of a char-matching | ||
| 3022 | operation (`exactn`, `charset`, ...). | ||
| 3023 | |||
| 3024 | For operations that match the empty string (`wordbeg`, ...), if F | ||
| 3025 | returns true we stop going down that path immediately but if it returns | ||
| 3026 | false we don't consider it as a failure and we simply look for the | ||
| 3027 | next char-matching operations on that path. | ||
| 3028 | For `duplicate`, it is the reverse: a false is an immediate failure | ||
| 3029 | whereas a true just lets the analysis continue with the rest of the path. | ||
| 3030 | |||
| 3031 | This function can be used while building the bytecode (in which case | ||
| 3032 | you should pass NULL for bufp), but if so, P and PEND need to delimit | ||
| 3033 | a valid block such that there is not jump to a location outside | ||
| 3034 | of [P...PEND]. */ | ||
| 3035 | static bool | ||
| 3036 | forall_firstchar (struct re_pattern_buffer *bufp, re_char *p, re_char *pend, | ||
| 3037 | bool f (re_char *p, void *arg), void *arg) | ||
| 3038 | { | ||
| 3039 | eassert (!bufp || bufp->used); | ||
| 3040 | eassert (pend || bufp->used); | ||
| 3041 | return forall_firstchar_1 (p, pend, | ||
| 3042 | bufp ? bufp->buffer - 1 : p, | ||
| 3043 | bufp ? bufp->buffer + bufp->used + 1 : pend, | ||
| 3044 | f, arg); | ||
| 3045 | } | ||
| 3046 | |||
| 3047 | /* analyze_first_old. | ||
| 2826 | If fastmap is non-NULL, go through the pattern and fill fastmap | 3048 | If fastmap is non-NULL, go through the pattern and fill fastmap |
| 2827 | with all the possible leading chars. If fastmap is NULL, don't | 3049 | with all the possible leading chars. If fastmap is NULL, don't |
| 2828 | bother filling it up (obviously) and only return whether the | 3050 | bother filling it up (obviously) and only return whether the |
| @@ -2833,7 +3055,7 @@ group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum) | |||
| 2833 | or if fastmap was not updated accurately. */ | 3055 | or if fastmap was not updated accurately. */ |
| 2834 | 3056 | ||
| 2835 | static bool | 3057 | static bool |
| 2836 | analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte) | 3058 | analyze_first_old (re_char *p, re_char *pend, char *fastmap, bool multibyte) |
| 2837 | { | 3059 | { |
| 2838 | int j, k; | 3060 | int j, k; |
| 2839 | int nbits; | 3061 | int nbits; |
| @@ -3079,7 +3301,7 @@ analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte) | |||
| 3079 | { /* We have to look down both arms. | 3301 | { /* We have to look down both arms. |
| 3080 | We first go down the "straight" path so as to minimize | 3302 | We first go down the "straight" path so as to minimize |
| 3081 | stack usage when going through alternatives. */ | 3303 | stack usage when going through alternatives. */ |
| 3082 | bool r = analyze_first (p, pend, fastmap, multibyte); | 3304 | bool r = analyze_first_old (p, pend, fastmap, multibyte); |
| 3083 | if (r) return r; | 3305 | if (r) return r; |
| 3084 | p += j; | 3306 | p += j; |
| 3085 | } | 3307 | } |
| @@ -3133,7 +3355,263 @@ analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte) | |||
| 3133 | /* We reached the end without matching anything. */ | 3355 | /* We reached the end without matching anything. */ |
| 3134 | return true; | 3356 | return true; |
| 3135 | 3357 | ||
| 3136 | } /* analyze_first */ | 3358 | } /* analyze_first_old */ |
| 3359 | |||
| 3360 | struct anafirst_data { | ||
| 3361 | bool multibyte; | ||
| 3362 | char *fastmap; | ||
| 3363 | bool match_any_multibyte_characters; | ||
| 3364 | }; | ||
| 3365 | |||
| 3366 | static bool | ||
| 3367 | analyze_first_fastmap (const re_char *p, void *arg) | ||
| 3368 | { | ||
| 3369 | struct anafirst_data *data = arg; | ||
| 3370 | |||
| 3371 | int j, k; | ||
| 3372 | int nbits; | ||
| 3373 | bool not; | ||
| 3374 | |||
| 3375 | switch (*p) | ||
| 3376 | { | ||
| 3377 | case succeed: | ||
| 3378 | return false; | ||
| 3379 | |||
| 3380 | case duplicate: | ||
| 3381 | /* If the first character has to match a backreference, that means | ||
| 3382 | that the group was empty (since it already matched). Since this | ||
| 3383 | is the only case that interests us here, we can assume that the | ||
| 3384 | backreference must match the empty string and we need to | ||
| 3385 | build the fastmap from the rest of the path. */ | ||
| 3386 | return true; | ||
| 3387 | |||
| 3388 | /* Following are the cases which match a character. These end | ||
| 3389 | with 'break'. */ | ||
| 3390 | |||
| 3391 | case exactn: | ||
| 3392 | p++; | ||
| 3393 | /* If multibyte is nonzero, the first byte of each | ||
| 3394 | character is an ASCII or a leading code. Otherwise, | ||
| 3395 | each byte is a character. Thus, this works in both | ||
| 3396 | cases. */ | ||
| 3397 | data->fastmap[p[1]] = 1; | ||
| 3398 | if (data->multibyte) | ||
| 3399 | { | ||
| 3400 | /* Cover the case of matching a raw char in a | ||
| 3401 | multibyte regexp against unibyte. */ | ||
| 3402 | if (CHAR_BYTE8_HEAD_P (p[1])) | ||
| 3403 | data->fastmap[CHAR_TO_BYTE8 (STRING_CHAR (p + 1))] = 1; | ||
| 3404 | } | ||
| 3405 | else | ||
| 3406 | { | ||
| 3407 | /* For the case of matching this unibyte regex | ||
| 3408 | against multibyte, we must set a leading code of | ||
| 3409 | the corresponding multibyte character. */ | ||
| 3410 | int c = RE_CHAR_TO_MULTIBYTE (p[1]); | ||
| 3411 | |||
| 3412 | data->fastmap[CHAR_LEADING_CODE (c)] = 1; | ||
| 3413 | } | ||
| 3414 | return true; | ||
| 3415 | |||
| 3416 | case anychar: | ||
| 3417 | /* We could put all the chars except for \n (and maybe \0) | ||
| 3418 | but we don't bother since it is generally not worth it. */ | ||
| 3419 | return false; | ||
| 3420 | |||
| 3421 | case charset_not: | ||
| 3422 | { | ||
| 3423 | /* Chars beyond end of bitmap are possible matches. */ | ||
| 3424 | for (j = CHARSET_BITMAP_SIZE (p) * BYTEWIDTH; | ||
| 3425 | j < (1 << BYTEWIDTH); j++) | ||
| 3426 | data->fastmap[j] = 1; | ||
| 3427 | } | ||
| 3428 | FALLTHROUGH; | ||
| 3429 | case charset: | ||
| 3430 | not = (re_opcode_t) *(p) == charset_not; | ||
| 3431 | nbits = CHARSET_BITMAP_SIZE (p) * BYTEWIDTH; | ||
| 3432 | p += 2; | ||
| 3433 | for (j = 0; j < nbits; j++) | ||
| 3434 | if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not) | ||
| 3435 | data->fastmap[j] = 1; | ||
| 3436 | |||
| 3437 | /* To match raw bytes (in the 80..ff range) against multibyte | ||
| 3438 | strings, add their leading bytes to the fastmap. */ | ||
| 3439 | for (j = 0x80; j < nbits; j++) | ||
| 3440 | if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not) | ||
| 3441 | data->fastmap[CHAR_LEADING_CODE (BYTE8_TO_CHAR (j))] = 1; | ||
| 3442 | |||
| 3443 | if (/* Any leading code can possibly start a character | ||
| 3444 | which doesn't match the specified set of characters. */ | ||
| 3445 | not | ||
| 3446 | || | ||
| 3447 | /* If we can match a character class, we can match any | ||
| 3448 | multibyte characters. */ | ||
| 3449 | (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2]) | ||
| 3450 | && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0)) | ||
| 3451 | |||
| 3452 | { | ||
| 3453 | if (!data->match_any_multibyte_characters) | ||
| 3454 | { | ||
| 3455 | for (j = MIN_MULTIBYTE_LEADING_CODE; | ||
| 3456 | j <= MAX_MULTIBYTE_LEADING_CODE; j++) | ||
| 3457 | data->fastmap[j] = 1; | ||
| 3458 | data->match_any_multibyte_characters = true; | ||
| 3459 | } | ||
| 3460 | } | ||
| 3461 | |||
| 3462 | else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2]) | ||
| 3463 | && data->match_any_multibyte_characters == false) | ||
| 3464 | { | ||
| 3465 | /* Set fastmap[I] to 1 where I is a leading code of each | ||
| 3466 | multibyte character in the range table. */ | ||
| 3467 | int c, count; | ||
| 3468 | unsigned char lc1, lc2; | ||
| 3469 | |||
| 3470 | /* Make P points the range table. '+ 2' is to skip flag | ||
| 3471 | bits for a character class. */ | ||
| 3472 | p += CHARSET_BITMAP_SIZE (&p[-2]) + 2; | ||
| 3473 | |||
| 3474 | /* Extract the number of ranges in range table into COUNT. */ | ||
| 3475 | EXTRACT_NUMBER_AND_INCR (count, p); | ||
| 3476 | for (; count > 0; count--, p += 3) | ||
| 3477 | { | ||
| 3478 | /* Extract the start and end of each range. */ | ||
| 3479 | EXTRACT_CHARACTER (c, p); | ||
| 3480 | lc1 = CHAR_LEADING_CODE (c); | ||
| 3481 | p += 3; | ||
| 3482 | EXTRACT_CHARACTER (c, p); | ||
| 3483 | lc2 = CHAR_LEADING_CODE (c); | ||
| 3484 | for (j = lc1; j <= lc2; j++) | ||
| 3485 | data->fastmap[j] = 1; | ||
| 3486 | } | ||
| 3487 | } | ||
| 3488 | return true; | ||
| 3489 | |||
| 3490 | case syntaxspec: | ||
| 3491 | case notsyntaxspec: | ||
| 3492 | /* This match depends on text properties. These end with | ||
| 3493 | aborting optimizations. */ | ||
| 3494 | return false; | ||
| 3495 | |||
| 3496 | case categoryspec: | ||
| 3497 | case notcategoryspec: | ||
| 3498 | not = (re_opcode_t)p[0] == notcategoryspec; | ||
| 3499 | p++; | ||
| 3500 | k = *p++; | ||
| 3501 | for (j = (1 << BYTEWIDTH); j >= 0; j--) | ||
| 3502 | if ((CHAR_HAS_CATEGORY (j, k)) ^ not) | ||
| 3503 | data->fastmap[j] = 1; | ||
| 3504 | |||
| 3505 | /* Any leading code can possibly start a character which | ||
| 3506 | has or doesn't has the_malloc_fn specified category. */ | ||
| 3507 | if (!data->match_any_multibyte_characters) | ||
| 3508 | { | ||
| 3509 | for (j = MIN_MULTIBYTE_LEADING_CODE; | ||
| 3510 | j <= MAX_MULTIBYTE_LEADING_CODE; j++) | ||
| 3511 | data->fastmap[j] = 1; | ||
| 3512 | data->match_any_multibyte_characters = true; | ||
| 3513 | } | ||
| 3514 | return true; | ||
| 3515 | |||
| 3516 | case endline: | ||
| 3517 | case endbuf: | ||
| 3518 | case wordbeg: | ||
| 3519 | case wordend: | ||
| 3520 | case symbeg: | ||
| 3521 | case symend: | ||
| 3522 | /* This false doesn't mean failure but rather "not succeeded yet". */ | ||
| 3523 | return false; | ||
| 3524 | |||
| 3525 | default: | ||
| 3526 | #if ENABLE_CHECKING | ||
| 3527 | abort (); /* We have listed all the cases. */ | ||
| 3528 | #endif | ||
| 3529 | return false; | ||
| 3530 | } | ||
| 3531 | } | ||
| 3532 | |||
| 3533 | static bool | ||
| 3534 | analyze_first_null (const re_char *p, void *arg) | ||
| 3535 | { | ||
| 3536 | switch (*p) | ||
| 3537 | { | ||
| 3538 | case succeed: | ||
| 3539 | /* This is safe: we can't reach `pend` at all from here. */ | ||
| 3540 | return true; | ||
| 3541 | |||
| 3542 | case duplicate: | ||
| 3543 | /* Either `duplicate` ends up matching a non-empty string, in which | ||
| 3544 | case we're good, or it matches the empty string, in which case we | ||
| 3545 | need to continue checking the rest of this path, which is exactly | ||
| 3546 | what returning `true` does, here. */ | ||
| 3547 | return true; | ||
| 3548 | |||
| 3549 | case exactn: | ||
| 3550 | case anychar: | ||
| 3551 | case charset_not: | ||
| 3552 | case charset: | ||
| 3553 | case syntaxspec: | ||
| 3554 | case notsyntaxspec: | ||
| 3555 | case categoryspec: | ||
| 3556 | case notcategoryspec: | ||
| 3557 | return true; | ||
| 3558 | |||
| 3559 | case endline: | ||
| 3560 | case endbuf: | ||
| 3561 | case wordbeg: | ||
| 3562 | case wordend: | ||
| 3563 | case symbeg: | ||
| 3564 | case symend: | ||
| 3565 | /* This false doesn't mean failure but rather "not succeeded yet". */ | ||
| 3566 | return false; | ||
| 3567 | |||
| 3568 | default: | ||
| 3569 | #if ENABLE_CHECKING | ||
| 3570 | abort (); /* We have listed all the cases. */ | ||
| 3571 | #endif | ||
| 3572 | return false; | ||
| 3573 | } | ||
| 3574 | } | ||
| 3575 | |||
| 3576 | /* analyze_first. | ||
| 3577 | If fastmap is non-NULL, go through the pattern and fill fastmap | ||
| 3578 | with all the possible leading chars. If fastmap is NULL, don't | ||
| 3579 | bother filling it up (obviously) and only return whether the | ||
| 3580 | pattern could potentially match the empty string. | ||
| 3581 | |||
| 3582 | Return false if p matches at least one char before reaching pend. | ||
| 3583 | Return true if p..pend might match the empty string | ||
| 3584 | or if fastmap was not updated accurately. */ | ||
| 3585 | |||
| 3586 | static bool | ||
| 3587 | analyze_first (struct re_pattern_buffer *bufp, | ||
| 3588 | re_char *p, re_char *pend, char *fastmap) | ||
| 3589 | { | ||
| 3590 | eassert (pend); | ||
| 3591 | struct anafirst_data data = { bufp ? bufp->multibyte : false, | ||
| 3592 | fastmap, false }; | ||
| 3593 | bool safe = forall_firstchar (bufp->used ? bufp : NULL, p, pend, | ||
| 3594 | fastmap ? analyze_first_fastmap | ||
| 3595 | : analyze_first_null, | ||
| 3596 | &data); | ||
| 3597 | #if ENABLE_CHECKING | ||
| 3598 | bool old = !!analyze_first_old (p, pend, fastmap, data.multibyte); | ||
| 3599 | if (old && safe) | ||
| 3600 | { | ||
| 3601 | fprintf (stderr, "ANALYZE_FIRST: New optimization (fastmap=%d)!\n", | ||
| 3602 | fastmap ? 1 : 0); | ||
| 3603 | print_partial_compiled_pattern (stderr, p, pend); | ||
| 3604 | } | ||
| 3605 | else if (!old && !safe) | ||
| 3606 | { | ||
| 3607 | fprintf (stderr, "ANALYZE_FIRST: Lost an optimization (fastmap=%d)!\n", | ||
| 3608 | fastmap ? 1 : 0); | ||
| 3609 | print_partial_compiled_pattern (stderr, p, pend); | ||
| 3610 | } | ||
| 3611 | #endif | ||
| 3612 | return !safe; | ||
| 3613 | } | ||
| 3614 | |||
| 3137 | 3615 | ||
| 3138 | /* Compute a fastmap for the compiled pattern in BUFP. | 3616 | /* Compute a fastmap for the compiled pattern in BUFP. |
| 3139 | A fastmap records which of the (1 << BYTEWIDTH) possible | 3617 | A fastmap records which of the (1 << BYTEWIDTH) possible |
| @@ -3162,8 +3640,8 @@ re_compile_fastmap (struct re_pattern_buffer *bufp) | |||
| 3162 | /* FIXME: Is the following assignment correct even when ANALYSIS < 0? */ | 3640 | /* FIXME: Is the following assignment correct even when ANALYSIS < 0? */ |
| 3163 | bufp->fastmap_accurate = 1; /* It will be when we're done. */ | 3641 | bufp->fastmap_accurate = 1; /* It will be when we're done. */ |
| 3164 | 3642 | ||
| 3165 | bufp->can_be_null = analyze_first (bufp->buffer, bufp->buffer + bufp->used, | 3643 | bufp->can_be_null = analyze_first (bufp, bufp->buffer, |
| 3166 | fastmap, RE_MULTIBYTE_P (bufp)); | 3644 | bufp->buffer + bufp->used, fastmap); |
| 3167 | } /* re_compile_fastmap */ | 3645 | } /* re_compile_fastmap */ |
| 3168 | 3646 | ||
| 3169 | /* Set REGS to hold NUM_REGS registers, storing them in STARTS and | 3647 | /* Set REGS to hold NUM_REGS registers, storing them in STARTS and |
| @@ -3962,11 +4440,102 @@ mutually_exclusive_aux (struct re_pattern_buffer *bufp, re_char *p1, | |||
| 3962 | return false; | 4440 | return false; |
| 3963 | } | 4441 | } |
| 3964 | 4442 | ||
| 4443 | struct mutexcl_data { | ||
| 4444 | struct re_pattern_buffer *bufp; | ||
| 4445 | re_char *p1; | ||
| 4446 | }; | ||
| 4447 | |||
| 4448 | static bool | ||
| 4449 | mutually_exclusive_one (re_char *p2, void *arg) | ||
| 4450 | { | ||
| 4451 | struct mutexcl_data *data = arg; | ||
| 4452 | switch (*p2) | ||
| 4453 | { | ||
| 4454 | case endline: | ||
| 4455 | case exactn: | ||
| 4456 | return mutually_exclusive_exactn (data->bufp, data->p1, p2); | ||
| 4457 | case charset: | ||
| 4458 | { | ||
| 4459 | if (*data->p1 == exactn) | ||
| 4460 | return mutually_exclusive_exactn (data->bufp, p2, data->p1); | ||
| 4461 | else | ||
| 4462 | return mutually_exclusive_charset (data->bufp, data->p1, p2); | ||
| 4463 | } | ||
| 4464 | |||
| 4465 | case charset_not: | ||
| 4466 | switch (*data->p1) | ||
| 4467 | { | ||
| 4468 | case exactn: | ||
| 4469 | return mutually_exclusive_exactn (data->bufp, p2, data->p1); | ||
| 4470 | case charset: | ||
| 4471 | return mutually_exclusive_charset (data->bufp, p2, data->p1); | ||
| 4472 | case charset_not: | ||
| 4473 | /* When we have two charset_not, it's very unlikely that | ||
| 4474 | they don't overlap. The union of the two sets of excluded | ||
| 4475 | chars should cover all possible chars, which, as a matter of | ||
| 4476 | fact, is virtually impossible in multibyte buffers. */ | ||
| 4477 | return false; | ||
| 4478 | } | ||
| 4479 | return false; | ||
| 4480 | case anychar: | ||
| 4481 | return false; /* FIXME: exactn \n ? */ | ||
| 4482 | case syntaxspec: | ||
| 4483 | return (*data->p1 == notsyntaxspec && data->p1[1] == p2[1]); | ||
| 4484 | case notsyntaxspec: | ||
| 4485 | return (*data->p1 == syntaxspec && data->p1[1] == p2[1]); | ||
| 4486 | case categoryspec: | ||
| 4487 | return (*data->p1 == notcategoryspec && data->p1[1] == p2[1]); | ||
| 4488 | case notcategoryspec: | ||
| 4489 | return (*data->p1 == categoryspec && data->p1[1] == p2[1]); | ||
| 4490 | |||
| 4491 | case endbuf: | ||
| 4492 | case succeed: | ||
| 4493 | return true; | ||
| 4494 | case wordbeg: | ||
| 4495 | return (*data->p1 == notsyntaxspec && data->p1[1] == Sword); | ||
| 4496 | case wordend: | ||
| 4497 | return (*data->p1 == syntaxspec && data->p1[1] == Sword); | ||
| 4498 | case symbeg: | ||
| 4499 | return (*data->p1 == notsyntaxspec | ||
| 4500 | && (data->p1[1] == Ssymbol || data->p1[1] == Sword)); | ||
| 4501 | case symend: | ||
| 4502 | return (*data->p1 == syntaxspec | ||
| 4503 | && (data->p1[1] == Ssymbol || data->p1[1] == Sword)); | ||
| 4504 | |||
| 4505 | case duplicate: | ||
| 4506 | /* At this point, we know nothing about what this can match, sadly. */ | ||
| 4507 | return false; | ||
| 4508 | |||
| 4509 | default: | ||
| 4510 | #if ENABLE_CHECKING | ||
| 4511 | abort (); /* We have listed all the cases. */ | ||
| 4512 | #endif | ||
| 4513 | return false; | ||
| 4514 | } | ||
| 4515 | } | ||
| 4516 | |||
| 3965 | static bool | 4517 | static bool |
| 3966 | mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1, | 4518 | mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1, |
| 3967 | re_char *p2) | 4519 | re_char *p2) |
| 3968 | { | 4520 | { |
| 3969 | return mutually_exclusive_aux (bufp, p1, p2, bufp->buffer - 1, NULL); | 4521 | struct mutexcl_data data = { bufp, p1 }; |
| 4522 | bool new = forall_firstchar (bufp, p2, NULL, mutually_exclusive_one, &data); | ||
| 4523 | #if ENABLE_CHECKING | ||
| 4524 | bool old = mutually_exclusive_aux (bufp, p1, p2, bufp->buffer - 1, NULL); | ||
| 4525 | if (old && !new) | ||
| 4526 | { | ||
| 4527 | fprintf (stderr, "MUTUALLY_EXCLUSIVE_P: Lost an optimization between %d and %d!\n", | ||
| 4528 | (int)(p1 - bufp->buffer), (int)(p2 - bufp->buffer)); | ||
| 4529 | print_partial_compiled_pattern (stderr, bufp->buffer, bufp->buffer + bufp->used); | ||
| 4530 | } | ||
| 4531 | else if (!old && new) | ||
| 4532 | { | ||
| 4533 | fprintf (stderr, "MUTUALLY_EXCLUSIVE_P: New optimization between %d and %d!\n", | ||
| 4534 | (int)(p1 - bufp->buffer), (int)(p2 - bufp->buffer)); | ||
| 4535 | print_partial_compiled_pattern (stderr, bufp->buffer, bufp->buffer + bufp->used); | ||
| 4536 | } | ||
| 4537 | #endif | ||
| 4538 | return new; | ||
| 3970 | } | 4539 | } |
| 3971 | 4540 | ||
| 3972 | /* Matching routines. */ | 4541 | /* Matching routines. */ |