diff options
Diffstat (limited to 'src/minibuf.c')
| -rw-r--r-- | src/minibuf.c | 197 |
1 files changed, 197 insertions, 0 deletions
diff --git a/src/minibuf.c b/src/minibuf.c index 5dc2b230883..f7dffc24b94 100644 --- a/src/minibuf.c +++ b/src/minibuf.c | |||
| @@ -20,6 +20,7 @@ along with GNU Emacs. If not, see <https://www.gnu.org/licenses/>. */ | |||
| 20 | 20 | ||
| 21 | #include <config.h> | 21 | #include <config.h> |
| 22 | #include <errno.h> | 22 | #include <errno.h> |
| 23 | #include <math.h> | ||
| 23 | 24 | ||
| 24 | #include <binary-io.h> | 25 | #include <binary-io.h> |
| 25 | 26 | ||
| @@ -2279,6 +2280,201 @@ init_minibuf_once_for_pdumper (void) | |||
| 2279 | last_minibuf_string = Qnil; | 2280 | last_minibuf_string = Qnil; |
| 2280 | } | 2281 | } |
| 2281 | 2282 | ||
| 2283 | /* FLEX/GOTOH algorithm for the 'flex' completion-style. Adapted from | ||
| 2284 | GOTOH, Osamu. An improved algorithm for matching biological | ||
| 2285 | sequences. Journal of molecular biology, 1982, 162.3: 705-708. | ||
| 2286 | |||
| 2287 | This algorithm matches patterns to candidate strings, or needles to | ||
| 2288 | haystacks. It works with cost matrices: imagine rows of these | ||
| 2289 | matrices as pattern characters, and columns as the candidate string | ||
| 2290 | characters. There is a -1 row, and a -1 column. The values there | ||
| 2291 | hold real costs used for situations "before the first ever" match of | ||
| 2292 | a pattern character to a string character. | ||
| 2293 | |||
| 2294 | M and D are cost matrices. At the end of the algorithm, M will have | ||
| 2295 | non-infinite values only for the spots where a pattern character | ||
| 2296 | matches a string character. So a non-infinite M[i,j] means the i-th | ||
| 2297 | character of the pattern matches the j-th character of the string. | ||
| 2298 | The value stored is the lowest possible cost the algorithm had to | ||
| 2299 | "pay" to be able to make that match there, given everything that may | ||
| 2300 | have happened before/to the left. An infinite value simply means no | ||
| 2301 | match at this pattern/string position. Note that both row and column | ||
| 2302 | of M may have more than one match at multiple indices. But this | ||
| 2303 | particular implementation of the algorithm assumes they have at least | ||
| 2304 | one match. | ||
| 2305 | |||
| 2306 | D (originally stands for 'Deletion' in the Gotoh paper) has "running | ||
| 2307 | costs". Each value D[i,j] represents what the algorithm has to pay | ||
| 2308 | to make or extend a gap when a match is found at i+1, j+1. By that | ||
| 2309 | time, that cost may or may not be lower than continuing from a match | ||
| 2310 | that had also been found at i,j. We always pick the lowest cost, and | ||
| 2311 | by the time we reach the final column, we know we have picked the | ||
| 2312 | cheapest possible path choosing when to gap, and when to follow up. | ||
| 2313 | |||
| 2314 | Along the way, we construct P, a matrix used just for backtracking, | ||
| 2315 | to reconstruct that path. Maybe P isn't needed, and all the | ||
| 2316 | information can be cleverly derived from the final state of M and D. | ||
| 2317 | But I couldn't make it work. */ | ||
| 2318 | DEFUN ("completion--flex-score-gotoh", Fcompletion__flex_score_gotoh, | ||
| 2319 | Scompletion__flex_score_gotoh, 2, 2, 0, | ||
| 2320 | doc: /* Compute flex score of STR matching PAT using Gotoh | ||
| 2321 | algorithm. Return nil if no match found, else return (COST . MATCHES) | ||
| 2322 | where COST is a fixnum (lower is better) and MATCHES is a list of match | ||
| 2323 | positions in STR. */) | ||
| 2324 | (Lisp_Object pat, Lisp_Object str) | ||
| 2325 | { | ||
| 2326 | /* Pre-allocated matrices for flex completion scoring. */ | ||
| 2327 | #define FLEX_MAX_STR_SIZE 512 | ||
| 2328 | #define FLEX_MAX_PAT_SIZE 128 | ||
| 2329 | #define FLEX_MAX_MATRIX_SIZE FLEX_MAX_PAT_SIZE * FLEX_MAX_STR_SIZE | ||
| 2330 | /* Macro for 2D indexing into "flat" arrays. */ | ||
| 2331 | #define MAT(matrix, i, j) ((matrix)[((i) + 1) * width + ((j) + 1)]) | ||
| 2332 | |||
| 2333 | CHECK_STRING (pat); | ||
| 2334 | CHECK_STRING (str); | ||
| 2335 | |||
| 2336 | size_t patlen = SCHARS (pat); | ||
| 2337 | size_t strlen = SCHARS (str); | ||
| 2338 | size_t width = strlen + 1; | ||
| 2339 | size_t size = (patlen + 1) * width; | ||
| 2340 | |||
| 2341 | /* Bail if strings are empty or matrix too large. */ | ||
| 2342 | if (patlen == 0 || strlen == 0) | ||
| 2343 | return Qnil; | ||
| 2344 | |||
| 2345 | if (size > FLEX_MAX_MATRIX_SIZE) | ||
| 2346 | return Qnil; | ||
| 2347 | |||
| 2348 | /* Cost constants (lower is better). Maybe these could be | ||
| 2349 | defcustom's?*/ | ||
| 2350 | const int gap_open_cost = 10; | ||
| 2351 | const int gap_extend_cost = 1; | ||
| 2352 | const int pos_inf = INT_MAX / 2; | ||
| 2353 | |||
| 2354 | static int M[FLEX_MAX_MATRIX_SIZE]; | ||
| 2355 | static int D[FLEX_MAX_MATRIX_SIZE]; | ||
| 2356 | static size_t P[FLEX_MAX_MATRIX_SIZE]; | ||
| 2357 | |||
| 2358 | /* Initialize costs. Fill both matrices with positive infinity. */ | ||
| 2359 | for (int j = 0; j < size; j++) M[j] = pos_inf; | ||
| 2360 | for (int j = 0; j < size; j++) D[j] = pos_inf; | ||
| 2361 | /* Except for D[0,0], which is 0, for prioritizing matches at the | ||
| 2362 | beginning. Remaining elements on the first row are gap_open_cost/2 | ||
| 2363 | to represent cheaper leading gaps. */ | ||
| 2364 | for (int j = 0; j < width; j++) D[j] = gap_open_cost/2; | ||
| 2365 | D[0] = 0; | ||
| 2366 | |||
| 2367 | /* Index of last match before gap started, as computed in the previous | ||
| 2368 | row. Used to build P. */ | ||
| 2369 | int prev_gap_origin = -1; | ||
| 2370 | |||
| 2371 | /* Poor man's iterator type. */ | ||
| 2372 | typedef struct iter { int x; ptrdiff_t c; ptrdiff_t b; } iter_t; | ||
| 2373 | |||
| 2374 | /* Info about first match computed in the previous row. */ | ||
| 2375 | iter_t prev_match = {0,0,0}; | ||
| 2376 | /* Forward pass. */ | ||
| 2377 | for (iter_t i = {0,0,0}; i.x < patlen; i.x++) | ||
| 2378 | { | ||
| 2379 | int pat_char = fetch_string_char_advance(pat, &i.c, &i.b); | ||
| 2380 | int gap_origin = -1; | ||
| 2381 | bool match_seen = false; | ||
| 2382 | |||
| 2383 | for (iter_t j = prev_match; j.x < strlen; j.x++) | ||
| 2384 | { | ||
| 2385 | iter_t jcopy = j; /* else advance function destroys it... */ | ||
| 2386 | int str_char | ||
| 2387 | = fetch_string_char_advance (str, &j.c, &j.b); | ||
| 2388 | |||
| 2389 | /* Check if characters match (case-insensitive if needed). */ | ||
| 2390 | bool cmatch; | ||
| 2391 | if (completion_ignore_case) | ||
| 2392 | cmatch = (downcase (pat_char) == downcase (str_char)); | ||
| 2393 | else | ||
| 2394 | cmatch = (pat_char == str_char); | ||
| 2395 | |||
| 2396 | /* Compute match cost M[i][j], i.e. replace its infinite | ||
| 2397 | value with something finite. */ | ||
| 2398 | if (cmatch) | ||
| 2399 | { | ||
| 2400 | if (!match_seen) | ||
| 2401 | { | ||
| 2402 | match_seen = true; | ||
| 2403 | prev_match = jcopy; | ||
| 2404 | } | ||
| 2405 | int pmatch_cost = MAT (M, i.x - 1, j.x - 1); | ||
| 2406 | int pgap_cost = MAT (D, i.x - 1, j.x - 1); | ||
| 2407 | |||
| 2408 | if (pmatch_cost <= pgap_cost) | ||
| 2409 | { | ||
| 2410 | /* Not only did the previous char also match (else | ||
| 2411 | pmatch_cost would have been infinite) but following | ||
| 2412 | it up with this match is best overall. */ | ||
| 2413 | MAT (M, i.x, j.x) = pmatch_cost; | ||
| 2414 | MAT (P, i.x, j.x) = j.x - 1; | ||
| 2415 | } | ||
| 2416 | else | ||
| 2417 | { | ||
| 2418 | /* Gapping is best, regardless of whether the previous | ||
| 2419 | char also matched. That is, it's better to arrive at | ||
| 2420 | this match from a gap. */ | ||
| 2421 | MAT (M, i.x, j.x) = pgap_cost; | ||
| 2422 | MAT (P, i.x, j.x) = prev_gap_origin; | ||
| 2423 | } | ||
| 2424 | } | ||
| 2425 | |||
| 2426 | /* Regardless of a match here, compute D[i,j], the best | ||
| 2427 | accumulated gapping cost at this point, considering whether | ||
| 2428 | it's more advantageous to open from a previous match on | ||
| 2429 | this row (a cost which may well be infinite if no such | ||
| 2430 | match ever existed) or extend a gap started sometime | ||
| 2431 | before. The next iteration will take this into account, | ||
| 2432 | and so will the next row when analyzing a possible match | ||
| 2433 | for the j+1-th string character. */ | ||
| 2434 | int open_cost = MAT (M, i.x, j.x - 1) + gap_open_cost; | ||
| 2435 | int extend_cost = MAT (D, i.x, j.x - 1) + gap_extend_cost; | ||
| 2436 | |||
| 2437 | if (open_cost < extend_cost) | ||
| 2438 | { | ||
| 2439 | MAT (D, i.x, j.x) = open_cost; | ||
| 2440 | gap_origin = j.x - 1; /* New gap. */ | ||
| 2441 | } | ||
| 2442 | else | ||
| 2443 | MAT (D, i.x, j.x) = extend_cost; /* Extend gap. */ | ||
| 2444 | } | ||
| 2445 | prev_gap_origin = gap_origin; | ||
| 2446 | } | ||
| 2447 | |||
| 2448 | /* Find best (lowest) cost in last row. */ | ||
| 2449 | int best_cost = pos_inf; | ||
| 2450 | int lastcol = -1; | ||
| 2451 | |||
| 2452 | for (int j = 0; j < strlen; j++) | ||
| 2453 | { | ||
| 2454 | int cost = MAT (M, patlen - 1, j); | ||
| 2455 | if (cost < best_cost) | ||
| 2456 | { | ||
| 2457 | best_cost = cost; | ||
| 2458 | lastcol = j; | ||
| 2459 | } | ||
| 2460 | } | ||
| 2461 | |||
| 2462 | if (lastcol < 0 || best_cost >= pos_inf) | ||
| 2463 | return Qnil; | ||
| 2464 | |||
| 2465 | /* Build match positions list by tracing back through P matrix. */ | ||
| 2466 | Lisp_Object matches = Qnil; | ||
| 2467 | for (int i = patlen - 1, l = lastcol; i >= 0 && l >= 0; i--) | ||
| 2468 | { | ||
| 2469 | matches = Fcons (make_fixnum (l), matches); | ||
| 2470 | l = MAT (P, i, l); | ||
| 2471 | } | ||
| 2472 | |||
| 2473 | return Fcons (make_fixnum (best_cost), matches); | ||
| 2474 | #undef MAT | ||
| 2475 | |||
| 2476 | } | ||
| 2477 | |||
| 2282 | void | 2478 | void |
| 2283 | syms_of_minibuf (void) | 2479 | syms_of_minibuf (void) |
| 2284 | { | 2480 | { |
| @@ -2541,6 +2737,7 @@ showing the *Completions* buffer, if any. */); | |||
| 2541 | defsubr (&Stest_completion); | 2737 | defsubr (&Stest_completion); |
| 2542 | defsubr (&Sassoc_string); | 2738 | defsubr (&Sassoc_string); |
| 2543 | defsubr (&Scompleting_read); | 2739 | defsubr (&Scompleting_read); |
| 2740 | defsubr (&Scompletion__flex_score_gotoh); | ||
| 2544 | DEFSYM (Qminibuffer_quit_recursive_edit, "minibuffer-quit-recursive-edit"); | 2741 | DEFSYM (Qminibuffer_quit_recursive_edit, "minibuffer-quit-recursive-edit"); |
| 2545 | DEFSYM (Qinternal_complete_buffer, "internal-complete-buffer"); | 2742 | DEFSYM (Qinternal_complete_buffer, "internal-complete-buffer"); |
| 2546 | DEFSYM (Qcompleting_read_function, "completing-read-function"); | 2743 | DEFSYM (Qcompleting_read_function, "completing-read-function"); |