aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorJoão Távora2026-02-03 12:14:03 +0000
committerJoão Távora2026-02-13 23:46:11 +0000
commitaa181cd35220242a23fe98932386facaba18c4c5 (patch)
tree0ba9c1d7b47bfa434baf62512a67249f9d4c422a /src
parent52b24ed7aacb2e879c68e0f51f3b32c1700c9324 (diff)
downloademacs-aa181cd35220242a23fe98932386facaba18c4c5.tar.gz
emacs-aa181cd35220242a23fe98932386facaba18c4c5.zip
Rewrite flex completion with Gotoh algorithm
The greedy regexp matching, broken scoring and broken highlight were sources of frequent complaints about the 'flex' matching style. This commit fixes that. Inspired by the 'hotfuzz' style (available at https://github.com/axelf4/hotfuzz) it uses a modified version of Gotoh's 1982 dynamic programming algorithm. It is strictly more correct than the "old" flex. For example, when matching the pattern 'goto' to no longer will 'eglot-format' be sorted before some hypothetical much better 'goobarbaz-goto'. And of course the highlighting is also correctly placed on the 'goto', not scattered across the candidate. Regarding performance, it is faster than the naive 'flex', primarily because of the Elisp rewrite in minibuffer.el. The matching and costing algorithm matters but is not the bottleneck. The Elisp parts of the style were almost completely decoupled from the pcm/substring styles in lisp/minibuffer.el. Only 'completion-flex-try-completion' uses some of pcm's code for pattern augmentation. * src/minibuf.c (completion--flex-cost-gotoh): New function. * lisp/minibuffer.el (completion-flex--pattern-str): New variable. (flex-score-match-tightness): Make obsolete. (completion--flex-all-completions-1): New helper function. (completion-flex-try-completion, completion-flex-all-completions): Rewrite. (completion-substring--all-completions): No longer take transform-pattern-fn. (completion--flex-adjust-metadata): Tweak. (completion--flex-score, completion--flex-score-1) (completion--flex-score-last-md, completion-flex--make-flex-pattern): Delete. * test/lisp/minibuffer-tests.el (completion--sorted-flex-completions): New helper function. (completion-flex-test-non-ascii): New test. (completion--pcm-score): Delete. (completion-pcm-test-3, completion-pcm-test-4) (completion-substring-test-1, completion-substring-test-2) (completion-flex-test-2, completion-flex-test-3): Tweak. * etc/NEWS: Describe change.
Diffstat (limited to 'src')
-rw-r--r--src/minibuf.c183
1 files changed, 183 insertions, 0 deletions
diff --git a/src/minibuf.c b/src/minibuf.c
index 5dc2b230883..e49663e2f86 100644
--- a/src/minibuf.c
+++ b/src/minibuf.c
@@ -2279,6 +2279,188 @@ init_minibuf_once_for_pdumper (void)
2279 last_minibuf_string = Qnil; 2279 last_minibuf_string = Qnil;
2280} 2280}
2281 2281
2282/* FLEX/GOTOH algorithm for the 'flex' completion-style. Adapted from
2283 GOTOH, Osamu. An improved algorithm for matching biological
2284 sequences. Journal of molecular biology, 1982, 162.3: 705-708.
2285
2286 This algorithm matches patterns to candidate strings, or needles to
2287 haystacks. It works with cost matrices: imagine rows of these
2288 matrices as pattern characters, and columns as the candidate string
2289 characters. There is a -1 row, and a -1 column. The values there
2290 hold real costs used for situations "before the first ever" match of
2291 a pattern character to a string character.
2292
2293 M and D are cost matrices. At the end of the algorithm, M will have
2294 non-infinite values only for the spots where a pattern character
2295 matches a string character. So a non-infinite M[i,j] means the i-th
2296 character of the pattern matches the j-th character of the string.
2297 The value stored is the lowest possible cost the algorithm had to
2298 "pay" to be able to make that match there, given everything that may
2299 have happened before/to the left. An infinite value simply means no
2300 match at this pattern/string position. Note that both rows and
2301 columns of M may have more than one match at multiple indices. If
2302 some row or column of M has no match at all, the final cost will be
2303 infinite, and the funtion will return nil.
2304
2305 D (originally stands for 'Distance' in the Gotoh paper) has "running
2306 costs". Each value D[i,j] represents what the algorithm has to pay
2307 to make or extend a gap when a match is found at i+1, j+1. By that
2308 time, that cost may or may not be lower than continuing from a match
2309 that had also been found at i,j. We always pick the lowest cost, and
2310 by the time we reach the final column, we know we have picked the
2311 cheapest possible path choosing when to gap, and when to follow up.
2312
2313 Here's an illustration of the final state of M and D when matching
2314 "goto" to "eglot--goto". Notice how the algorithm detects but
2315 ultimately discards the scattered match at indexes 1,3,4,8, which
2316 would have total cost 27, and ultimately settles on the match at
2317 positions 7,8,9,10 which has cost 5:
2318
2319 -1 0 1 2 3 4 5 6 7 8 9 10
2320 - e g l o t - - g o t o
2321 M: -1 - ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
2322 0 g ∞ ∞ 5 ∞ ∞ ∞ ∞ ∞ 5 ∞ ∞ ∞
2323 1 o ∞ ∞ ∞ ∞ 15 ∞ ∞ ∞ ∞ 5 ∞ 16
2324 2 t ∞ ∞ ∞ ∞ ∞ 15 ∞ ∞ ∞ ∞ 5 ∞
2325 3 o ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ 27 ∞ 5
2326
2327 D: -1 - 0 5 5 5 5 5 5 5 5 5 5 5
2328 0 g ∞ ∞ ∞ 15 16 17 18 19 20 15 16 17
2329 1 o ∞ ∞ ∞ ∞ ∞ 25 26 27 28 29 15 16
2330 2 t ∞ ∞ ∞ ∞ ∞ ∞ 25 26 27 28 29 15
2331 3 o ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ 37 38
2332 **/
2333DEFUN ("completion--flex-cost-gotoh", Fcompletion__flex_cost_gotoh,
2334 Scompletion__flex_cost_gotoh, 2, 2, 0,
2335 doc: /* Compute cost of PAT matching STR using modified Gotoh
2336algorithm. Return nil if no match found, else return (COST . MATCHES)
2337where COST is a fixnum (lower is better) and MATCHES is a list of the
2338same length as PAT. Each i-th element is a FIXNUM indicating where in
2339STR the i-th character of PAT matched. */)
2340 (Lisp_Object pat, Lisp_Object str)
2341 {
2342 /* Pre-allocated matrices for flex completion scoring. */
2343#define FLEX_MAX_STR_SIZE 512
2344#define FLEX_MAX_PAT_SIZE 128
2345#define FLEX_MAX_MATRIX_SIZE FLEX_MAX_PAT_SIZE * FLEX_MAX_STR_SIZE
2346 /* Macro for 2D indexing into "flat" arrays. */
2347#define MAT(matrix, i, j) ((matrix)[((i) + 1) * width + ((j) + 1)])
2348
2349 CHECK_STRING (pat);
2350 CHECK_STRING (str);
2351
2352 ptrdiff_t patlen = SCHARS (pat);
2353 ptrdiff_t strlen = SCHARS (str);
2354 ptrdiff_t width = strlen + 1;
2355 ptrdiff_t size = (patlen + 1) * width;
2356 const int gap_open_cost = 10;
2357 const int gap_extend_cost = 1;
2358 const int pos_inf = INT_MAX / 2;
2359 static int M[FLEX_MAX_MATRIX_SIZE];
2360 static int D[FLEX_MAX_MATRIX_SIZE];
2361
2362 /* Bail if strings are empty or matrix too large. */
2363 if (patlen == 0 || strlen == 0 || size > FLEX_MAX_MATRIX_SIZE)
2364 return Qnil;
2365
2366 /* Initialize M and D with positive infinity... */
2367 for (int j = 0; j < size; j++)
2368 M[j] = D[j] = pos_inf;
2369 /* ...except for D[-1,-1], which is 0 to promote matches at the
2370 beginning. Rest of first row has gap_open_cost/2 for cheaper
2371 leading gaps. */
2372 for (int j = 0; j < width; j++)
2373 D[j] = gap_open_cost / 2;
2374 D[0] = 0;
2375
2376 /* Poor man's iterator type: cur char idx, next char idx, next byte idx. */
2377 typedef struct iter { int x; ptrdiff_t n; ptrdiff_t b; } iter_t;
2378
2379 /* Position of first match found in the previous row, to save
2380 iterations. */
2381 iter_t prev_match = { 0, 0, 0 };
2382
2383 /* Forward pass. */
2384 for (iter_t i = { 0, 0, 0 }; i.x < patlen; i.x++)
2385 {
2386 int pat_char = fetch_string_char_advance (pat, &i.n, &i.b);
2387 bool match_seen = false;
2388
2389 for (iter_t j = prev_match; j.x < strlen; j.x++)
2390 {
2391 iter_t jcopy = j; /* else advance function destroys it... */
2392 int str_char = fetch_string_char_advance (str, &j.n, &j.b);
2393
2394 /* Check if characters match (case-insensitive if needed). */
2395 bool cmatch;
2396 if (completion_ignore_case)
2397 cmatch = (downcase (pat_char) == downcase (str_char));
2398 else
2399 cmatch = (pat_char == str_char);
2400
2401 if (cmatch)
2402 {
2403 /* There is a match here, so compute match cost
2404 M[i][j], i.e. replace its infinite value with
2405 something finite. */
2406 if (!match_seen)
2407 {
2408 match_seen = true;
2409 prev_match = jcopy;
2410 }
2411 /* Compute M[i,j]. If we pick M[i-1,j-1] it means that
2412 not only did the previous char also match (else
2413 M[i-1,j-1] would have been infinite) but following
2414 it up with this match is best overall. If we pick
2415 D[i-1, j-1] it means that gapping is best,
2416 regardless of whether the previous char also
2417 matched. That is, it's better to arrive at this
2418 match from a gap. */
2419 MAT (M, i.x, j.x) = min (MAT (M, i.x - 1, j.x - 1),
2420 MAT (D, i.x - 1, j.x - 1));
2421 }
2422 /* Regardless of a match here, compute D[i,j], the best
2423 accumulated gapping cost at this point, considering
2424 whether it's more advantageous to open from a previous
2425 match on this row (a cost which may well be infinite if
2426 no such match ever existed) or extend a gap started
2427 sometime before. The next iteration will take this
2428 into account, and so will the next row when analyzing a
2429 possible match for the j+1-th string character. */
2430 MAT (D, i.x, j.x)
2431 = min (MAT (M, i.x, j.x - 1) + gap_open_cost,
2432 MAT (D, i.x, j.x - 1) + gap_extend_cost);
2433 }
2434 }
2435 /* Find lowest cost in last row. */
2436 int best_cost = pos_inf;
2437 int lastcol = -1;
2438 for (int j = 0; j < strlen; j++)
2439 {
2440 int cost = MAT (M, patlen - 1, j);
2441 if (cost < best_cost)
2442 {
2443 best_cost = cost;
2444 lastcol = j;
2445 }
2446 }
2447
2448 /* Return early if no match. */
2449 if (lastcol < 0 || best_cost >= pos_inf)
2450 return Qnil;
2451
2452 /* Go backwards to build match positions list. */
2453 Lisp_Object matches = Fcons (make_fixnum (lastcol), Qnil);
2454 for (int i = patlen - 2, l = lastcol; i >= 0; --i)
2455 {
2456 do --l; while (l >= 0 && MAT (M, i, l) >= MAT (D, i, l));
2457 matches = Fcons (make_fixnum (l), matches);
2458 }
2459
2460 return Fcons (make_fixnum (best_cost), matches);
2461#undef MAT
2462 }
2463
2282void 2464void
2283syms_of_minibuf (void) 2465syms_of_minibuf (void)
2284{ 2466{
@@ -2541,6 +2723,7 @@ showing the *Completions* buffer, if any. */);
2541 defsubr (&Stest_completion); 2723 defsubr (&Stest_completion);
2542 defsubr (&Sassoc_string); 2724 defsubr (&Sassoc_string);
2543 defsubr (&Scompleting_read); 2725 defsubr (&Scompleting_read);
2726 defsubr (&Scompletion__flex_cost_gotoh);
2544 DEFSYM (Qminibuffer_quit_recursive_edit, "minibuffer-quit-recursive-edit"); 2727 DEFSYM (Qminibuffer_quit_recursive_edit, "minibuffer-quit-recursive-edit");
2545 DEFSYM (Qinternal_complete_buffer, "internal-complete-buffer"); 2728 DEFSYM (Qinternal_complete_buffer, "internal-complete-buffer");
2546 DEFSYM (Qcompleting_read_function, "completing-read-function"); 2729 DEFSYM (Qcompleting_read_function, "completing-read-function");