diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/regex-emacs.c | 565 |
1 files changed, 3 insertions, 562 deletions
diff --git a/src/regex-emacs.c b/src/regex-emacs.c index dbd5bcb8953..79e9f2c105a 100644 --- a/src/regex-emacs.c +++ b/src/regex-emacs.c | |||
| @@ -3044,319 +3044,6 @@ forall_firstchar (struct re_pattern_buffer *bufp, re_char *p, re_char *pend, | |||
| 3044 | f, arg); | 3044 | f, arg); |
| 3045 | } | 3045 | } |
| 3046 | 3046 | ||
| 3047 | /* analyze_first_old. | ||
| 3048 | If fastmap is non-NULL, go through the pattern and fill fastmap | ||
| 3049 | with all the possible leading chars. If fastmap is NULL, don't | ||
| 3050 | bother filling it up (obviously) and only return whether the | ||
| 3051 | pattern could potentially match the empty string. | ||
| 3052 | |||
| 3053 | Return false if p..pend matches at least one char. | ||
| 3054 | Return true if p..pend might match the empty string | ||
| 3055 | or if fastmap was not updated accurately. */ | ||
| 3056 | |||
| 3057 | static bool | ||
| 3058 | analyze_first_old (re_char *p, re_char *pend, char *fastmap, bool multibyte) | ||
| 3059 | { | ||
| 3060 | int j, k; | ||
| 3061 | int nbits; | ||
| 3062 | bool not; | ||
| 3063 | |||
| 3064 | /* If all elements for base leading-codes in fastmap is set, this | ||
| 3065 | flag is set true. */ | ||
| 3066 | bool match_any_multibyte_characters = false; | ||
| 3067 | |||
| 3068 | eassert (p); | ||
| 3069 | |||
| 3070 | /* The loop below works as follows: | ||
| 3071 | - It has a working-list kept in the PATTERN_STACK and which basically | ||
| 3072 | starts by only containing a pointer to the first operation. | ||
| 3073 | - If the opcode we're looking at is a match against some set of | ||
| 3074 | chars, then we add those chars to the fastmap and go on to the | ||
| 3075 | next work element from the worklist (done via 'break'). | ||
| 3076 | - If the opcode is a control operator on the other hand, we either | ||
| 3077 | ignore it (if it's meaningless at this point, such as 'start_memory') | ||
| 3078 | or execute it (if it's a jump). If the jump has several destinations | ||
| 3079 | (i.e. 'on_failure_jump'), then we push the other destination onto the | ||
| 3080 | worklist. | ||
| 3081 | We guarantee termination by ignoring backward jumps (more or less), | ||
| 3082 | so that P is monotonically increasing. More to the point, we | ||
| 3083 | never set P (or push) anything '<= p1'. */ | ||
| 3084 | |||
| 3085 | while (p < pend) | ||
| 3086 | { | ||
| 3087 | /* P1 is used as a marker of how far back a 'on_failure_jump' | ||
| 3088 | can go without being ignored. It is normally equal to P | ||
| 3089 | (which prevents any backward 'on_failure_jump') except right | ||
| 3090 | after a plain 'jump', to allow patterns such as: | ||
| 3091 | 0: jump 10 | ||
| 3092 | 3..9: <body> | ||
| 3093 | 10: on_failure_jump 3 | ||
| 3094 | as used for the *? operator. */ | ||
| 3095 | re_char *p1 = p; | ||
| 3096 | |||
| 3097 | switch (*p++) | ||
| 3098 | { | ||
| 3099 | case succeed: | ||
| 3100 | return true; | ||
| 3101 | |||
| 3102 | case duplicate: | ||
| 3103 | /* If the first character has to match a backreference, that means | ||
| 3104 | that the group was empty (since it already matched). Since this | ||
| 3105 | is the only case that interests us here, we can assume that the | ||
| 3106 | backreference must match the empty string. */ | ||
| 3107 | /* Note: Only true if we started from bufp->buffer (e.g. when | ||
| 3108 | fastmap is non-NULL), but when fastmap is NULL we're only | ||
| 3109 | trying to figure out if the body of a loop can possibly match | ||
| 3110 | the empty string so it's safe (tho sometimes pessimistic) | ||
| 3111 | to presume that `duplicate` can. */ | ||
| 3112 | p++; | ||
| 3113 | continue; | ||
| 3114 | |||
| 3115 | |||
| 3116 | /* Following are the cases which match a character. These end | ||
| 3117 | with 'break'. */ | ||
| 3118 | |||
| 3119 | case exactn: | ||
| 3120 | if (fastmap) | ||
| 3121 | { | ||
| 3122 | /* If multibyte is nonzero, the first byte of each | ||
| 3123 | character is an ASCII or a leading code. Otherwise, | ||
| 3124 | each byte is a character. Thus, this works in both | ||
| 3125 | cases. */ | ||
| 3126 | fastmap[p[1]] = 1; | ||
| 3127 | if (multibyte) | ||
| 3128 | { | ||
| 3129 | /* Cover the case of matching a raw char in a | ||
| 3130 | multibyte regexp against unibyte. */ | ||
| 3131 | if (CHAR_BYTE8_HEAD_P (p[1])) | ||
| 3132 | fastmap[CHAR_TO_BYTE8 (STRING_CHAR (p + 1))] = 1; | ||
| 3133 | } | ||
| 3134 | else | ||
| 3135 | { | ||
| 3136 | /* For the case of matching this unibyte regex | ||
| 3137 | against multibyte, we must set a leading code of | ||
| 3138 | the corresponding multibyte character. */ | ||
| 3139 | int c = RE_CHAR_TO_MULTIBYTE (p[1]); | ||
| 3140 | |||
| 3141 | fastmap[CHAR_LEADING_CODE (c)] = 1; | ||
| 3142 | } | ||
| 3143 | } | ||
| 3144 | break; | ||
| 3145 | |||
| 3146 | |||
| 3147 | case anychar: | ||
| 3148 | /* We could put all the chars except for \n (and maybe \0) | ||
| 3149 | but we don't bother since it is generally not worth it. */ | ||
| 3150 | if (!fastmap) break; | ||
| 3151 | return true; | ||
| 3152 | |||
| 3153 | |||
| 3154 | case charset_not: | ||
| 3155 | if (!fastmap) break; | ||
| 3156 | { | ||
| 3157 | /* Chars beyond end of bitmap are possible matches. */ | ||
| 3158 | for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH; | ||
| 3159 | j < (1 << BYTEWIDTH); j++) | ||
| 3160 | fastmap[j] = 1; | ||
| 3161 | } | ||
| 3162 | FALLTHROUGH; | ||
| 3163 | case charset: | ||
| 3164 | if (!fastmap) break; | ||
| 3165 | not = (re_opcode_t) *(p - 1) == charset_not; | ||
| 3166 | nbits = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH; | ||
| 3167 | p++; | ||
| 3168 | for (j = 0; j < nbits; j++) | ||
| 3169 | if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not) | ||
| 3170 | fastmap[j] = 1; | ||
| 3171 | |||
| 3172 | /* To match raw bytes (in the 80..ff range) against multibyte | ||
| 3173 | strings, add their leading bytes to the fastmap. */ | ||
| 3174 | for (j = 0x80; j < nbits; j++) | ||
| 3175 | if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not) | ||
| 3176 | fastmap[CHAR_LEADING_CODE (BYTE8_TO_CHAR (j))] = 1; | ||
| 3177 | |||
| 3178 | if (/* Any leading code can possibly start a character | ||
| 3179 | which doesn't match the specified set of characters. */ | ||
| 3180 | not | ||
| 3181 | || | ||
| 3182 | /* If we can match a character class, we can match any | ||
| 3183 | multibyte characters. */ | ||
| 3184 | (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2]) | ||
| 3185 | && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0)) | ||
| 3186 | |||
| 3187 | { | ||
| 3188 | if (match_any_multibyte_characters == false) | ||
| 3189 | { | ||
| 3190 | for (j = MIN_MULTIBYTE_LEADING_CODE; | ||
| 3191 | j <= MAX_MULTIBYTE_LEADING_CODE; j++) | ||
| 3192 | fastmap[j] = 1; | ||
| 3193 | match_any_multibyte_characters = true; | ||
| 3194 | } | ||
| 3195 | } | ||
| 3196 | |||
| 3197 | else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2]) | ||
| 3198 | && match_any_multibyte_characters == false) | ||
| 3199 | { | ||
| 3200 | /* Set fastmap[I] to 1 where I is a leading code of each | ||
| 3201 | multibyte character in the range table. */ | ||
| 3202 | int c, count; | ||
| 3203 | unsigned char lc1, lc2; | ||
| 3204 | |||
| 3205 | /* Make P points the range table. '+ 2' is to skip flag | ||
| 3206 | bits for a character class. */ | ||
| 3207 | p += CHARSET_BITMAP_SIZE (&p[-2]) + 2; | ||
| 3208 | |||
| 3209 | /* Extract the number of ranges in range table into COUNT. */ | ||
| 3210 | EXTRACT_NUMBER_AND_INCR (count, p); | ||
| 3211 | for (; count > 0; count--, p += 3) | ||
| 3212 | { | ||
| 3213 | /* Extract the start and end of each range. */ | ||
| 3214 | EXTRACT_CHARACTER (c, p); | ||
| 3215 | lc1 = CHAR_LEADING_CODE (c); | ||
| 3216 | p += 3; | ||
| 3217 | EXTRACT_CHARACTER (c, p); | ||
| 3218 | lc2 = CHAR_LEADING_CODE (c); | ||
| 3219 | for (j = lc1; j <= lc2; j++) | ||
| 3220 | fastmap[j] = 1; | ||
| 3221 | } | ||
| 3222 | } | ||
| 3223 | break; | ||
| 3224 | |||
| 3225 | case syntaxspec: | ||
| 3226 | case notsyntaxspec: | ||
| 3227 | if (!fastmap) break; | ||
| 3228 | /* This match depends on text properties. These end with | ||
| 3229 | aborting optimizations. */ | ||
| 3230 | return true; | ||
| 3231 | |||
| 3232 | case categoryspec: | ||
| 3233 | case notcategoryspec: | ||
| 3234 | if (!fastmap) break; | ||
| 3235 | not = (re_opcode_t)p[-1] == notcategoryspec; | ||
| 3236 | k = *p++; | ||
| 3237 | for (j = (1 << BYTEWIDTH); j >= 0; j--) | ||
| 3238 | if ((CHAR_HAS_CATEGORY (j, k)) ^ not) | ||
| 3239 | fastmap[j] = 1; | ||
| 3240 | |||
| 3241 | /* Any leading code can possibly start a character which | ||
| 3242 | has or doesn't has the specified category. */ | ||
| 3243 | if (match_any_multibyte_characters == false) | ||
| 3244 | { | ||
| 3245 | for (j = MIN_MULTIBYTE_LEADING_CODE; | ||
| 3246 | j <= MAX_MULTIBYTE_LEADING_CODE; j++) | ||
| 3247 | fastmap[j] = 1; | ||
| 3248 | match_any_multibyte_characters = true; | ||
| 3249 | } | ||
| 3250 | break; | ||
| 3251 | |||
| 3252 | /* All cases after this match the empty string. These end with | ||
| 3253 | 'continue'. */ | ||
| 3254 | |||
| 3255 | case at_dot: | ||
| 3256 | case no_op: | ||
| 3257 | case begline: | ||
| 3258 | case endline: | ||
| 3259 | case begbuf: | ||
| 3260 | case endbuf: | ||
| 3261 | case wordbound: | ||
| 3262 | case notwordbound: | ||
| 3263 | case wordbeg: | ||
| 3264 | case wordend: | ||
| 3265 | case symbeg: | ||
| 3266 | case symend: | ||
| 3267 | continue; | ||
| 3268 | |||
| 3269 | |||
| 3270 | case jump: | ||
| 3271 | EXTRACT_NUMBER_AND_INCR (j, p); | ||
| 3272 | if (j < 0) | ||
| 3273 | /* Backward jumps can only go back to code that we've already | ||
| 3274 | visited. 're_compile' should make sure this is true. */ | ||
| 3275 | break; | ||
| 3276 | p += j; | ||
| 3277 | switch (*p) | ||
| 3278 | { | ||
| 3279 | case on_failure_jump: | ||
| 3280 | case on_failure_keep_string_jump: | ||
| 3281 | case on_failure_jump_loop: | ||
| 3282 | case on_failure_jump_nastyloop: | ||
| 3283 | case on_failure_jump_smart: | ||
| 3284 | p++; | ||
| 3285 | break; | ||
| 3286 | default: | ||
| 3287 | continue; | ||
| 3288 | }; | ||
| 3289 | /* Keep P1 to allow the 'on_failure_jump' we are jumping to | ||
| 3290 | to jump back to "just after here". */ | ||
| 3291 | FALLTHROUGH; | ||
| 3292 | case on_failure_jump: | ||
| 3293 | case on_failure_keep_string_jump: | ||
| 3294 | case on_failure_jump_nastyloop: | ||
| 3295 | case on_failure_jump_loop: | ||
| 3296 | case on_failure_jump_smart: | ||
| 3297 | EXTRACT_NUMBER_AND_INCR (j, p); | ||
| 3298 | if (p + j <= p1) | ||
| 3299 | ; /* Backward jump to be ignored. */ | ||
| 3300 | else | ||
| 3301 | { /* We have to look down both arms. | ||
| 3302 | We first go down the "straight" path so as to minimize | ||
| 3303 | stack usage when going through alternatives. */ | ||
| 3304 | bool r = analyze_first_old (p, pend, fastmap, multibyte); | ||
| 3305 | if (r) return r; | ||
| 3306 | p += j; | ||
| 3307 | } | ||
| 3308 | continue; | ||
| 3309 | |||
| 3310 | |||
| 3311 | case jump_n: | ||
| 3312 | /* This code simply does not properly handle forward jump_n. */ | ||
| 3313 | DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); eassert (j < 0)); | ||
| 3314 | p += 4; | ||
| 3315 | /* jump_n can either jump or fall through. The (backward) jump | ||
| 3316 | case has already been handled, so we only need to look at the | ||
| 3317 | fallthrough case. */ | ||
| 3318 | continue; | ||
| 3319 | |||
| 3320 | case succeed_n: | ||
| 3321 | /* If N == 0, it should be an on_failure_jump_loop instead. | ||
| 3322 | `j` can be negative because `EXTRACT_NUMBER` extracts a | ||
| 3323 | signed number whereas `succeed_n` treats it as unsigned. */ | ||
| 3324 | DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); eassert (j != 0)); | ||
| 3325 | p += 4; | ||
| 3326 | /* We only care about one iteration of the loop, so we don't | ||
| 3327 | need to consider the case where this behaves like an | ||
| 3328 | on_failure_jump. */ | ||
| 3329 | /* FIXME: Sadly, the above is not true when the loop's body | ||
| 3330 | can match the empty string :-( */ | ||
| 3331 | /* continue; */ | ||
| 3332 | return true; | ||
| 3333 | |||
| 3334 | case set_number_at: | ||
| 3335 | p += 4; | ||
| 3336 | continue; | ||
| 3337 | |||
| 3338 | |||
| 3339 | case start_memory: | ||
| 3340 | case stop_memory: | ||
| 3341 | p += 1; | ||
| 3342 | continue; | ||
| 3343 | |||
| 3344 | |||
| 3345 | default: | ||
| 3346 | abort (); /* We have listed all the cases. */ | ||
| 3347 | } /* switch *p++ */ | ||
| 3348 | |||
| 3349 | /* Getting here means we have found the possible starting | ||
| 3350 | characters for one path of the pattern -- and that the empty | ||
| 3351 | string does not match. We need not follow this path further. */ | ||
| 3352 | return false; | ||
| 3353 | } /* while p */ | ||
| 3354 | |||
| 3355 | /* We reached the end without matching anything. */ | ||
| 3356 | return true; | ||
| 3357 | |||
| 3358 | } /* analyze_first_old */ | ||
| 3359 | |||
| 3360 | struct anafirst_data { | 3047 | struct anafirst_data { |
| 3361 | bool multibyte; | 3048 | bool multibyte; |
| 3362 | char *fastmap; | 3049 | char *fastmap; |
| @@ -3594,21 +3281,6 @@ analyze_first (struct re_pattern_buffer *bufp, | |||
| 3594 | fastmap ? analyze_first_fastmap | 3281 | fastmap ? analyze_first_fastmap |
| 3595 | : analyze_first_null, | 3282 | : analyze_first_null, |
| 3596 | &data); | 3283 | &data); |
| 3597 | #if ENABLE_CHECKING | ||
| 3598 | bool old = !!analyze_first_old (p, pend, fastmap, data.multibyte); | ||
| 3599 | if (old && safe) | ||
| 3600 | { | ||
| 3601 | fprintf (stderr, "ANALYZE_FIRST: New optimization (fastmap=%d)!\n", | ||
| 3602 | fastmap ? 1 : 0); | ||
| 3603 | print_partial_compiled_pattern (stderr, p, pend); | ||
| 3604 | } | ||
| 3605 | else if (!old && !safe) | ||
| 3606 | { | ||
| 3607 | fprintf (stderr, "ANALYZE_FIRST: Lost an optimization (fastmap=%d)!\n", | ||
| 3608 | fastmap ? 1 : 0); | ||
| 3609 | print_partial_compiled_pattern (stderr, p, pend); | ||
| 3610 | } | ||
| 3611 | #endif | ||
| 3612 | return !safe; | 3284 | return !safe; |
| 3613 | } | 3285 | } |
| 3614 | 3286 | ||
| @@ -4056,30 +3728,6 @@ skip_one_char (re_char *p) | |||
| 4056 | } | 3728 | } |
| 4057 | 3729 | ||
| 4058 | 3730 | ||
| 4059 | /* Jump over non-matching operations. */ | ||
| 4060 | static re_char * | ||
| 4061 | skip_noops (re_char *p, re_char *pend) | ||
| 4062 | { | ||
| 4063 | while (p < pend) | ||
| 4064 | { | ||
| 4065 | switch (*p) | ||
| 4066 | { | ||
| 4067 | case start_memory: | ||
| 4068 | case stop_memory: | ||
| 4069 | p += 2; break; | ||
| 4070 | case no_op: | ||
| 4071 | p += 1; break; | ||
| 4072 | case jump: | ||
| 4073 | p = extract_address (p + 1); | ||
| 4074 | break; | ||
| 4075 | default: | ||
| 4076 | return p; | ||
| 4077 | } | ||
| 4078 | } | ||
| 4079 | eassert (p == pend); | ||
| 4080 | return p; | ||
| 4081 | } | ||
| 4082 | |||
| 4083 | /* Test if C matches charset op. *PP points to the charset or charset_not | 3731 | /* Test if C matches charset op. *PP points to the charset or charset_not |
| 4084 | opcode. When the function finishes, *PP will be advanced past that opcode. | 3732 | opcode. When the function finishes, *PP will be advanced past that opcode. |
| 4085 | C is character to test (possibly after translations) and CORIG is original | 3733 | C is character to test (possibly after translations) and CORIG is original |
| @@ -4247,199 +3895,6 @@ mutually_exclusive_charset (struct re_pattern_buffer *bufp, re_char *p1, | |||
| 4247 | return false; | 3895 | return false; |
| 4248 | } | 3896 | } |
| 4249 | 3897 | ||
| 4250 | /* True if "p1 matches something" implies "p2 fails". */ | ||
| 4251 | /* We're trying to follow all paths reachable from `p2`, but since some | ||
| 4252 | loops can match the empty string, this can loop back to `p2`. | ||
| 4253 | |||
| 4254 | To avoid inf-looping, we take advantage of the fact that | ||
| 4255 | the bytecode we generate is made of syntactically nested loops, more | ||
| 4256 | specifically, every loop has a single entry point and single exit point. | ||
| 4257 | |||
| 4258 | The function takes 2 more arguments (`loop_entry` and `loop_exit`). | ||
| 4259 | `loop_entry` points to the sole entry point of the current loop and | ||
| 4260 | `loop_exit` points to its sole exit point (when non-NULL). | ||
| 4261 | |||
| 4262 | Jumps outside of `loop_entry..exit` should not occur. | ||
| 4263 | The function can assume that `loop_exit` is "mutually exclusive". | ||
| 4264 | The same holds for `loop_entry` except when `p2 == loop_entry`. | ||
| 4265 | |||
| 4266 | To guarantee termination, recursive calls should make sure that either | ||
| 4267 | `loop_entry` is larger, or it's unchanged but `p2` is larger. | ||
| 4268 | |||
| 4269 | FIXME: This is failsafe (can't return true when it shouldn't) | ||
| 4270 | but it could be too conservative if we start generating bytecode | ||
| 4271 | with a different shape, so maybe we should bite the bullet and | ||
| 4272 | replace done_beg/end with an actual list of positions we've | ||
| 4273 | already processed. */ | ||
| 4274 | static bool | ||
| 4275 | mutually_exclusive_aux (struct re_pattern_buffer *bufp, re_char *p1, | ||
| 4276 | re_char *p2, re_char *loop_entry, re_char *loop_exit) | ||
| 4277 | { | ||
| 4278 | re_opcode_t op2; | ||
| 4279 | unsigned char *pend = bufp->buffer + bufp->used; | ||
| 4280 | re_char *p2_orig = p2; | ||
| 4281 | |||
| 4282 | eassert (p1 >= bufp->buffer && p1 < pend | ||
| 4283 | && p2 >= bufp->buffer && p2 <= pend); | ||
| 4284 | |||
| 4285 | if (p2 == loop_exit) | ||
| 4286 | return true; /* Presumably already checked elsewhere. */ | ||
| 4287 | eassert (loop_entry && p2 >= loop_entry); | ||
| 4288 | if (p2 < loop_entry || (loop_exit && p2 > loop_exit)) | ||
| 4289 | { /* The assumptions about the shape of the code aren't true :-( */ | ||
| 4290 | #ifdef ENABLE_CHECKING | ||
| 4291 | error ("Broken assumption in regex.c:mutually_exclusive_aux"); | ||
| 4292 | #endif | ||
| 4293 | return false; | ||
| 4294 | } | ||
| 4295 | |||
| 4296 | /* Skip over open/close-group commands. | ||
| 4297 | If what follows this loop is a ...+ construct, | ||
| 4298 | look at what begins its body, since we will have to | ||
| 4299 | match at least one of that. */ | ||
| 4300 | p2 = skip_noops (p2, pend); | ||
| 4301 | /* The same skip can be done for p1, except that this function | ||
| 4302 | is only used in the case where p1 is a simple match operator. */ | ||
| 4303 | /* p1 = skip_noops (p1, pend); */ | ||
| 4304 | |||
| 4305 | eassert (p1 >= bufp->buffer && p1 < pend | ||
| 4306 | && p2 >= bufp->buffer && p2 <= pend); | ||
| 4307 | |||
| 4308 | op2 = p2 == pend ? succeed : *p2; | ||
| 4309 | |||
| 4310 | switch (op2) | ||
| 4311 | { | ||
| 4312 | case succeed: | ||
| 4313 | case endbuf: | ||
| 4314 | /* If we're at the end of the pattern, we can change. */ | ||
| 4315 | if (skip_one_char (p1)) | ||
| 4316 | { | ||
| 4317 | DEBUG_PRINT (" End of pattern: fast loop.\n"); | ||
| 4318 | return true; | ||
| 4319 | } | ||
| 4320 | break; | ||
| 4321 | |||
| 4322 | case endline: | ||
| 4323 | case exactn: | ||
| 4324 | return mutually_exclusive_exactn (bufp, p1, p2); | ||
| 4325 | |||
| 4326 | case charset: | ||
| 4327 | { | ||
| 4328 | if ((re_opcode_t) *p1 == exactn) | ||
| 4329 | return mutually_exclusive_exactn (bufp, p2, p1); | ||
| 4330 | else | ||
| 4331 | return mutually_exclusive_charset (bufp, p1, p2); | ||
| 4332 | } | ||
| 4333 | break; | ||
| 4334 | |||
| 4335 | case charset_not: | ||
| 4336 | switch (*p1) | ||
| 4337 | { | ||
| 4338 | case exactn: | ||
| 4339 | return mutually_exclusive_exactn (bufp, p2, p1); | ||
| 4340 | case charset: | ||
| 4341 | return mutually_exclusive_charset (bufp, p2, p1); | ||
| 4342 | case charset_not: | ||
| 4343 | /* When we have two charset_not, it's very unlikely that | ||
| 4344 | they don't overlap. The union of the two sets of excluded | ||
| 4345 | chars should cover all possible chars, which, as a matter of | ||
| 4346 | fact, is virtually impossible in multibyte buffers. */ | ||
| 4347 | break; | ||
| 4348 | } | ||
| 4349 | break; | ||
| 4350 | |||
| 4351 | case wordend: | ||
| 4352 | return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword); | ||
| 4353 | case symend: | ||
| 4354 | return ((re_opcode_t) *p1 == syntaxspec | ||
| 4355 | && (p1[1] == Ssymbol || p1[1] == Sword)); | ||
| 4356 | case notsyntaxspec: | ||
| 4357 | return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]); | ||
| 4358 | |||
| 4359 | case wordbeg: | ||
| 4360 | return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword); | ||
| 4361 | case symbeg: | ||
| 4362 | return ((re_opcode_t) *p1 == notsyntaxspec | ||
| 4363 | && (p1[1] == Ssymbol || p1[1] == Sword)); | ||
| 4364 | case syntaxspec: | ||
| 4365 | return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]); | ||
| 4366 | |||
| 4367 | case wordbound: | ||
| 4368 | /* FIXME: This optimization seems correct after the first iteration | ||
| 4369 | of the loop, but not for the very first :-( | ||
| 4370 | IOW we'd need to pull out the first iteration and do: | ||
| 4371 | |||
| 4372 | syntaxspec w | ||
| 4373 | on_failure_keep_string_jump end | ||
| 4374 | loop: | ||
| 4375 | syntaxspec w | ||
| 4376 | goto loop | ||
| 4377 | end: | ||
| 4378 | wordbound | ||
| 4379 | |||
| 4380 | return (((re_opcode_t) *p1 == notsyntaxspec | ||
| 4381 | || (re_opcode_t) *p1 == syntaxspec) | ||
| 4382 | && p1[1] == Sword); */ | ||
| 4383 | return false; | ||
| 4384 | |||
| 4385 | case categoryspec: | ||
| 4386 | return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]); | ||
| 4387 | case notcategoryspec: | ||
| 4388 | return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]); | ||
| 4389 | |||
| 4390 | case on_failure_jump_nastyloop: | ||
| 4391 | case on_failure_jump_smart: | ||
| 4392 | case on_failure_jump_loop: | ||
| 4393 | case on_failure_keep_string_jump: | ||
| 4394 | case on_failure_jump: | ||
| 4395 | { | ||
| 4396 | int mcnt; | ||
| 4397 | p2++; | ||
| 4398 | EXTRACT_NUMBER_AND_INCR (mcnt, p2); | ||
| 4399 | re_char *p2_other = p2 + mcnt, *tmp; | ||
| 4400 | /* For `+` loops, we often have an `on_failure_jump` that skips forward | ||
| 4401 | over a subsequent `jump` for lack of an `on_failure_dont_jump` | ||
| 4402 | kind of thing. Recognize this pattern since that subsequent | ||
| 4403 | `jump` is the one that jumps to the loop-entry. */ | ||
| 4404 | if ((re_opcode_t) p2[0] == jump && mcnt == 3) | ||
| 4405 | p2 = extract_address (p2 + 1); | ||
| 4406 | |||
| 4407 | /* We have to check that both destinations are safe. | ||
| 4408 | Arrange for `p2` to be the smaller of the two. */ | ||
| 4409 | if (p2 > p2_other) | ||
| 4410 | (tmp = p2, p2 = p2_other, p2_other = tmp); | ||
| 4411 | |||
| 4412 | if (p2_other <= p2_orig /* Both destinations go backward! */ | ||
| 4413 | || !mutually_exclusive_aux (bufp, p1, p2_other, | ||
| 4414 | loop_entry, loop_exit)) | ||
| 4415 | return false; | ||
| 4416 | |||
| 4417 | /* Now that we know that `p2_other` is a safe (i.e. mutually-exclusive) | ||
| 4418 | position, let's check `p2`. */ | ||
| 4419 | if (p2 == loop_entry) | ||
| 4420 | /* If we jump backward to the entry point of the current loop | ||
| 4421 | it means it's a zero-length cycle through that loop, so | ||
| 4422 | this cycle itself does not break mutual-exclusion. */ | ||
| 4423 | return true; | ||
| 4424 | else if (p2 > p2_orig) | ||
| 4425 | /* Boring forward jump. */ | ||
| 4426 | return mutually_exclusive_aux (bufp, p1, p2, loop_entry, loop_exit); | ||
| 4427 | else if (loop_entry < p2 && p2 < p2_orig) | ||
| 4428 | /* We jump backward to a new loop, nested within the current one. | ||
| 4429 | `p2` is the entry point and `p2_other` the exit of that inner. */ | ||
| 4430 | return mutually_exclusive_aux (bufp, p1, p2, p2, p2_other); | ||
| 4431 | else | ||
| 4432 | return false; | ||
| 4433 | } | ||
| 4434 | |||
| 4435 | default: | ||
| 4436 | ; | ||
| 4437 | } | ||
| 4438 | |||
| 4439 | /* Safe default. */ | ||
| 4440 | return false; | ||
| 4441 | } | ||
| 4442 | |||
| 4443 | struct mutexcl_data { | 3898 | struct mutexcl_data { |
| 4444 | struct re_pattern_buffer *bufp; | 3899 | struct re_pattern_buffer *bufp; |
| 4445 | re_char *p1; | 3900 | re_char *p1; |
| @@ -4514,28 +3969,14 @@ mutually_exclusive_one (re_char *p2, void *arg) | |||
| 4514 | } | 3969 | } |
| 4515 | } | 3970 | } |
| 4516 | 3971 | ||
| 3972 | /* True if "p1 matches something" implies "p2 fails". */ | ||
| 3973 | |||
| 4517 | static bool | 3974 | static bool |
| 4518 | mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1, | 3975 | mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1, |
| 4519 | re_char *p2) | 3976 | re_char *p2) |
| 4520 | { | 3977 | { |
| 4521 | struct mutexcl_data data = { bufp, p1 }; | 3978 | struct mutexcl_data data = { bufp, p1 }; |
| 4522 | bool new = forall_firstchar (bufp, p2, NULL, mutually_exclusive_one, &data); | 3979 | return forall_firstchar (bufp, p2, NULL, mutually_exclusive_one, &data); |
| 4523 | #if ENABLE_CHECKING | ||
| 4524 | bool old = mutually_exclusive_aux (bufp, p1, p2, bufp->buffer - 1, NULL); | ||
| 4525 | if (old && !new) | ||
| 4526 | { | ||
| 4527 | fprintf (stderr, "MUTUALLY_EXCLUSIVE_P: Lost an optimization between %d and %d!\n", | ||
| 4528 | (int)(p1 - bufp->buffer), (int)(p2 - bufp->buffer)); | ||
| 4529 | print_partial_compiled_pattern (stderr, bufp->buffer, bufp->buffer + bufp->used); | ||
| 4530 | } | ||
| 4531 | else if (!old && new) | ||
| 4532 | { | ||
| 4533 | fprintf (stderr, "MUTUALLY_EXCLUSIVE_P: New optimization between %d and %d!\n", | ||
| 4534 | (int)(p1 - bufp->buffer), (int)(p2 - bufp->buffer)); | ||
| 4535 | print_partial_compiled_pattern (stderr, bufp->buffer, bufp->buffer + bufp->used); | ||
| 4536 | } | ||
| 4537 | #endif | ||
| 4538 | return new; | ||
| 4539 | } | 3980 | } |
| 4540 | 3981 | ||
| 4541 | /* Matching routines. */ | 3982 | /* Matching routines. */ |