aboutsummaryrefslogtreecommitdiffstats
path: root/src/minibuf.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/minibuf.c')
-rw-r--r--src/minibuf.c197
1 files changed, 197 insertions, 0 deletions
diff --git a/src/minibuf.c b/src/minibuf.c
index 5dc2b230883..f7dffc24b94 100644
--- a/src/minibuf.c
+++ b/src/minibuf.c
@@ -20,6 +20,7 @@ along with GNU Emacs. If not, see <https://www.gnu.org/licenses/>. */
20 20
21#include <config.h> 21#include <config.h>
22#include <errno.h> 22#include <errno.h>
23#include <math.h>
23 24
24#include <binary-io.h> 25#include <binary-io.h>
25 26
@@ -2279,6 +2280,201 @@ init_minibuf_once_for_pdumper (void)
2279 last_minibuf_string = Qnil; 2280 last_minibuf_string = Qnil;
2280} 2281}
2281 2282
2283/* FLEX/GOTOH algorithm for the 'flex' completion-style. Adapted from
2284 GOTOH, Osamu. An improved algorithm for matching biological
2285 sequences. Journal of molecular biology, 1982, 162.3: 705-708.
2286
2287 This algorithm matches patterns to candidate strings, or needles to
2288 haystacks. It works with cost matrices: imagine rows of these
2289 matrices as pattern characters, and columns as the candidate string
2290 characters. There is a -1 row, and a -1 column. The values there
2291 hold real costs used for situations "before the first ever" match of
2292 a pattern character to a string character.
2293
2294 M and D are cost matrices. At the end of the algorithm, M will have
2295 non-infinite values only for the spots where a pattern character
2296 matches a string character. So a non-infinite M[i,j] means the i-th
2297 character of the pattern matches the j-th character of the string.
2298 The value stored is the lowest possible cost the algorithm had to
2299 "pay" to be able to make that match there, given everything that may
2300 have happened before/to the left. An infinite value simply means no
2301 match at this pattern/string position. Note that both row and column
2302 of M may have more than one match at multiple indices. But this
2303 particular implementation of the algorithm assumes they have at least
2304 one match.
2305
2306 D (originally stands for 'Deletion' in the Gotoh paper) has "running
2307 costs". Each value D[i,j] represents what the algorithm has to pay
2308 to make or extend a gap when a match is found at i+1, j+1. By that
2309 time, that cost may or may not be lower than continuing from a match
2310 that had also been found at i,j. We always pick the lowest cost, and
2311 by the time we reach the final column, we know we have picked the
2312 cheapest possible path choosing when to gap, and when to follow up.
2313
2314 Along the way, we construct P, a matrix used just for backtracking,
2315 to reconstruct that path. Maybe P isn't needed, and all the
2316 information can be cleverly derived from the final state of M and D.
2317 But I couldn't make it work. */
2318DEFUN ("completion--flex-score-gotoh", Fcompletion__flex_score_gotoh,
2319 Scompletion__flex_score_gotoh, 2, 2, 0,
2320 doc: /* Compute flex score of STR matching PAT using Gotoh
2321algorithm. Return nil if no match found, else return (COST . MATCHES)
2322where COST is a fixnum (lower is better) and MATCHES is a list of match
2323positions in STR. */)
2324 (Lisp_Object pat, Lisp_Object str)
2325{
2326 /* Pre-allocated matrices for flex completion scoring. */
2327#define FLEX_MAX_STR_SIZE 512
2328#define FLEX_MAX_PAT_SIZE 128
2329#define FLEX_MAX_MATRIX_SIZE FLEX_MAX_PAT_SIZE * FLEX_MAX_STR_SIZE
2330 /* Macro for 2D indexing into "flat" arrays. */
2331#define MAT(matrix, i, j) ((matrix)[((i) + 1) * width + ((j) + 1)])
2332
2333 CHECK_STRING (pat);
2334 CHECK_STRING (str);
2335
2336 size_t patlen = SCHARS (pat);
2337 size_t strlen = SCHARS (str);
2338 size_t width = strlen + 1;
2339 size_t size = (patlen + 1) * width;
2340
2341 /* Bail if strings are empty or matrix too large. */
2342 if (patlen == 0 || strlen == 0)
2343 return Qnil;
2344
2345 if (size > FLEX_MAX_MATRIX_SIZE)
2346 return Qnil;
2347
2348 /* Cost constants (lower is better). Maybe these could be
2349 defcustom's?*/
2350 const int gap_open_cost = 10;
2351 const int gap_extend_cost = 1;
2352 const int pos_inf = INT_MAX / 2;
2353
2354 static int M[FLEX_MAX_MATRIX_SIZE];
2355 static int D[FLEX_MAX_MATRIX_SIZE];
2356 static size_t P[FLEX_MAX_MATRIX_SIZE];
2357
2358 /* Initialize costs. Fill both matrices with positive infinity. */
2359 for (int j = 0; j < size; j++) M[j] = pos_inf;
2360 for (int j = 0; j < size; j++) D[j] = pos_inf;
2361 /* Except for D[0,0], which is 0, for prioritizing matches at the
2362 beginning. Remaining elements on the first row are gap_open_cost/2
2363 to represent cheaper leading gaps. */
2364 for (int j = 0; j < width; j++) D[j] = gap_open_cost/2;
2365 D[0] = 0;
2366
2367 /* Index of last match before gap started, as computed in the previous
2368 row. Used to build P. */
2369 int prev_gap_origin = -1;
2370
2371 /* Poor man's iterator type. */
2372 typedef struct iter { int x; ptrdiff_t c; ptrdiff_t b; } iter_t;
2373
2374 /* Info about first match computed in the previous row. */
2375 iter_t prev_match = {0,0,0};
2376 /* Forward pass. */
2377 for (iter_t i = {0,0,0}; i.x < patlen; i.x++)
2378 {
2379 int pat_char = fetch_string_char_advance(pat, &i.c, &i.b);
2380 int gap_origin = -1;
2381 bool match_seen = false;
2382
2383 for (iter_t j = prev_match; j.x < strlen; j.x++)
2384 {
2385 iter_t jcopy = j; /* else advance function destroys it... */
2386 int str_char
2387 = fetch_string_char_advance (str, &j.c, &j.b);
2388
2389 /* Check if characters match (case-insensitive if needed). */
2390 bool cmatch;
2391 if (completion_ignore_case)
2392 cmatch = (downcase (pat_char) == downcase (str_char));
2393 else
2394 cmatch = (pat_char == str_char);
2395
2396 /* Compute match cost M[i][j], i.e. replace its infinite
2397 value with something finite. */
2398 if (cmatch)
2399 {
2400 if (!match_seen)
2401 {
2402 match_seen = true;
2403 prev_match = jcopy;
2404 }
2405 int pmatch_cost = MAT (M, i.x - 1, j.x - 1);
2406 int pgap_cost = MAT (D, i.x - 1, j.x - 1);
2407
2408 if (pmatch_cost <= pgap_cost)
2409 {
2410 /* Not only did the previous char also match (else
2411 pmatch_cost would have been infinite) but following
2412 it up with this match is best overall. */
2413 MAT (M, i.x, j.x) = pmatch_cost;
2414 MAT (P, i.x, j.x) = j.x - 1;
2415 }
2416 else
2417 {
2418 /* Gapping is best, regardless of whether the previous
2419 char also matched. That is, it's better to arrive at
2420 this match from a gap. */
2421 MAT (M, i.x, j.x) = pgap_cost;
2422 MAT (P, i.x, j.x) = prev_gap_origin;
2423 }
2424 }
2425
2426 /* Regardless of a match here, compute D[i,j], the best
2427 accumulated gapping cost at this point, considering whether
2428 it's more advantageous to open from a previous match on
2429 this row (a cost which may well be infinite if no such
2430 match ever existed) or extend a gap started sometime
2431 before. The next iteration will take this into account,
2432 and so will the next row when analyzing a possible match
2433 for the j+1-th string character. */
2434 int open_cost = MAT (M, i.x, j.x - 1) + gap_open_cost;
2435 int extend_cost = MAT (D, i.x, j.x - 1) + gap_extend_cost;
2436
2437 if (open_cost < extend_cost)
2438 {
2439 MAT (D, i.x, j.x) = open_cost;
2440 gap_origin = j.x - 1; /* New gap. */
2441 }
2442 else
2443 MAT (D, i.x, j.x) = extend_cost; /* Extend gap. */
2444 }
2445 prev_gap_origin = gap_origin;
2446 }
2447
2448 /* Find best (lowest) cost in last row. */
2449 int best_cost = pos_inf;
2450 int lastcol = -1;
2451
2452 for (int j = 0; j < strlen; j++)
2453 {
2454 int cost = MAT (M, patlen - 1, j);
2455 if (cost < best_cost)
2456 {
2457 best_cost = cost;
2458 lastcol = j;
2459 }
2460 }
2461
2462 if (lastcol < 0 || best_cost >= pos_inf)
2463 return Qnil;
2464
2465 /* Build match positions list by tracing back through P matrix. */
2466 Lisp_Object matches = Qnil;
2467 for (int i = patlen - 1, l = lastcol; i >= 0 && l >= 0; i--)
2468 {
2469 matches = Fcons (make_fixnum (l), matches);
2470 l = MAT (P, i, l);
2471 }
2472
2473 return Fcons (make_fixnum (best_cost), matches);
2474#undef MAT
2475
2476}
2477
2282void 2478void
2283syms_of_minibuf (void) 2479syms_of_minibuf (void)
2284{ 2480{
@@ -2541,6 +2737,7 @@ showing the *Completions* buffer, if any. */);
2541 defsubr (&Stest_completion); 2737 defsubr (&Stest_completion);
2542 defsubr (&Sassoc_string); 2738 defsubr (&Sassoc_string);
2543 defsubr (&Scompleting_read); 2739 defsubr (&Scompleting_read);
2740 defsubr (&Scompletion__flex_score_gotoh);
2544 DEFSYM (Qminibuffer_quit_recursive_edit, "minibuffer-quit-recursive-edit"); 2741 DEFSYM (Qminibuffer_quit_recursive_edit, "minibuffer-quit-recursive-edit");
2545 DEFSYM (Qinternal_complete_buffer, "internal-complete-buffer"); 2742 DEFSYM (Qinternal_complete_buffer, "internal-complete-buffer");
2546 DEFSYM (Qcompleting_read_function, "completing-read-function"); 2743 DEFSYM (Qcompleting_read_function, "completing-read-function");