diff options
| author | João Távora | 2026-02-03 12:14:03 +0000 |
|---|---|---|
| committer | João Távora | 2026-02-13 23:46:11 +0000 |
| commit | aa181cd35220242a23fe98932386facaba18c4c5 (patch) | |
| tree | 0ba9c1d7b47bfa434baf62512a67249f9d4c422a /src | |
| parent | 52b24ed7aacb2e879c68e0f51f3b32c1700c9324 (diff) | |
| download | emacs-aa181cd35220242a23fe98932386facaba18c4c5.tar.gz emacs-aa181cd35220242a23fe98932386facaba18c4c5.zip | |
Rewrite flex completion with Gotoh algorithm
The greedy regexp matching, broken scoring and broken highlight were
sources of frequent complaints about the 'flex' matching style. This
commit fixes that.
Inspired by the 'hotfuzz' style (available at
https://github.com/axelf4/hotfuzz) it uses a modified version of Gotoh's
1982 dynamic programming algorithm. It is strictly more correct than the
"old" flex. For example, when matching the pattern 'goto' to no longer
will 'eglot-format' be sorted before some hypothetical much better
'goobarbaz-goto'. And of course the highlighting is also correctly
placed on the 'goto', not scattered across the candidate.
Regarding performance, it is faster than the naive 'flex', primarily
because of the Elisp rewrite in minibuffer.el. The matching and costing
algorithm matters but is not the bottleneck.
The Elisp parts of the style were almost completely decoupled from the
pcm/substring styles in lisp/minibuffer.el. Only
'completion-flex-try-completion' uses some of pcm's code for pattern
augmentation.
* src/minibuf.c (completion--flex-cost-gotoh): New function.
* lisp/minibuffer.el (completion-flex--pattern-str): New variable.
(flex-score-match-tightness): Make obsolete.
(completion--flex-all-completions-1): New helper function.
(completion-flex-try-completion, completion-flex-all-completions): Rewrite.
(completion-substring--all-completions): No longer take transform-pattern-fn.
(completion--flex-adjust-metadata): Tweak.
(completion--flex-score, completion--flex-score-1)
(completion--flex-score-last-md, completion-flex--make-flex-pattern): Delete.
* test/lisp/minibuffer-tests.el (completion--sorted-flex-completions):
New helper function.
(completion-flex-test-non-ascii): New test.
(completion--pcm-score): Delete.
(completion-pcm-test-3, completion-pcm-test-4)
(completion-substring-test-1, completion-substring-test-2)
(completion-flex-test-2, completion-flex-test-3): Tweak.
* etc/NEWS: Describe change.
Diffstat (limited to 'src')
| -rw-r--r-- | src/minibuf.c | 183 |
1 files changed, 183 insertions, 0 deletions
diff --git a/src/minibuf.c b/src/minibuf.c index 5dc2b230883..e49663e2f86 100644 --- a/src/minibuf.c +++ b/src/minibuf.c | |||
| @@ -2279,6 +2279,188 @@ init_minibuf_once_for_pdumper (void) | |||
| 2279 | last_minibuf_string = Qnil; | 2279 | last_minibuf_string = Qnil; |
| 2280 | } | 2280 | } |
| 2281 | 2281 | ||
| 2282 | /* FLEX/GOTOH algorithm for the 'flex' completion-style. Adapted from | ||
| 2283 | GOTOH, Osamu. An improved algorithm for matching biological | ||
| 2284 | sequences. Journal of molecular biology, 1982, 162.3: 705-708. | ||
| 2285 | |||
| 2286 | This algorithm matches patterns to candidate strings, or needles to | ||
| 2287 | haystacks. It works with cost matrices: imagine rows of these | ||
| 2288 | matrices as pattern characters, and columns as the candidate string | ||
| 2289 | characters. There is a -1 row, and a -1 column. The values there | ||
| 2290 | hold real costs used for situations "before the first ever" match of | ||
| 2291 | a pattern character to a string character. | ||
| 2292 | |||
| 2293 | M and D are cost matrices. At the end of the algorithm, M will have | ||
| 2294 | non-infinite values only for the spots where a pattern character | ||
| 2295 | matches a string character. So a non-infinite M[i,j] means the i-th | ||
| 2296 | character of the pattern matches the j-th character of the string. | ||
| 2297 | The value stored is the lowest possible cost the algorithm had to | ||
| 2298 | "pay" to be able to make that match there, given everything that may | ||
| 2299 | have happened before/to the left. An infinite value simply means no | ||
| 2300 | match at this pattern/string position. Note that both rows and | ||
| 2301 | columns of M may have more than one match at multiple indices. If | ||
| 2302 | some row or column of M has no match at all, the final cost will be | ||
| 2303 | infinite, and the funtion will return nil. | ||
| 2304 | |||
| 2305 | D (originally stands for 'Distance' in the Gotoh paper) has "running | ||
| 2306 | costs". Each value D[i,j] represents what the algorithm has to pay | ||
| 2307 | to make or extend a gap when a match is found at i+1, j+1. By that | ||
| 2308 | time, that cost may or may not be lower than continuing from a match | ||
| 2309 | that had also been found at i,j. We always pick the lowest cost, and | ||
| 2310 | by the time we reach the final column, we know we have picked the | ||
| 2311 | cheapest possible path choosing when to gap, and when to follow up. | ||
| 2312 | |||
| 2313 | Here's an illustration of the final state of M and D when matching | ||
| 2314 | "goto" to "eglot--goto". Notice how the algorithm detects but | ||
| 2315 | ultimately discards the scattered match at indexes 1,3,4,8, which | ||
| 2316 | would have total cost 27, and ultimately settles on the match at | ||
| 2317 | positions 7,8,9,10 which has cost 5: | ||
| 2318 | |||
| 2319 | -1 0 1 2 3 4 5 6 7 8 9 10 | ||
| 2320 | - e g l o t - - g o t o | ||
| 2321 | M: -1 - ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ | ||
| 2322 | 0 g ∞ ∞ 5 ∞ ∞ ∞ ∞ ∞ 5 ∞ ∞ ∞ | ||
| 2323 | 1 o ∞ ∞ ∞ ∞ 15 ∞ ∞ ∞ ∞ 5 ∞ 16 | ||
| 2324 | 2 t ∞ ∞ ∞ ∞ ∞ 15 ∞ ∞ ∞ ∞ 5 ∞ | ||
| 2325 | 3 o ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ 27 ∞ 5 | ||
| 2326 | |||
| 2327 | D: -1 - 0 5 5 5 5 5 5 5 5 5 5 5 | ||
| 2328 | 0 g ∞ ∞ ∞ 15 16 17 18 19 20 15 16 17 | ||
| 2329 | 1 o ∞ ∞ ∞ ∞ ∞ 25 26 27 28 29 15 16 | ||
| 2330 | 2 t ∞ ∞ ∞ ∞ ∞ ∞ 25 26 27 28 29 15 | ||
| 2331 | 3 o ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ 37 38 | ||
| 2332 | **/ | ||
| 2333 | DEFUN ("completion--flex-cost-gotoh", Fcompletion__flex_cost_gotoh, | ||
| 2334 | Scompletion__flex_cost_gotoh, 2, 2, 0, | ||
| 2335 | doc: /* Compute cost of PAT matching STR using modified Gotoh | ||
| 2336 | algorithm. Return nil if no match found, else return (COST . MATCHES) | ||
| 2337 | where COST is a fixnum (lower is better) and MATCHES is a list of the | ||
| 2338 | same length as PAT. Each i-th element is a FIXNUM indicating where in | ||
| 2339 | STR the i-th character of PAT matched. */) | ||
| 2340 | (Lisp_Object pat, Lisp_Object str) | ||
| 2341 | { | ||
| 2342 | /* Pre-allocated matrices for flex completion scoring. */ | ||
| 2343 | #define FLEX_MAX_STR_SIZE 512 | ||
| 2344 | #define FLEX_MAX_PAT_SIZE 128 | ||
| 2345 | #define FLEX_MAX_MATRIX_SIZE FLEX_MAX_PAT_SIZE * FLEX_MAX_STR_SIZE | ||
| 2346 | /* Macro for 2D indexing into "flat" arrays. */ | ||
| 2347 | #define MAT(matrix, i, j) ((matrix)[((i) + 1) * width + ((j) + 1)]) | ||
| 2348 | |||
| 2349 | CHECK_STRING (pat); | ||
| 2350 | CHECK_STRING (str); | ||
| 2351 | |||
| 2352 | ptrdiff_t patlen = SCHARS (pat); | ||
| 2353 | ptrdiff_t strlen = SCHARS (str); | ||
| 2354 | ptrdiff_t width = strlen + 1; | ||
| 2355 | ptrdiff_t size = (patlen + 1) * width; | ||
| 2356 | const int gap_open_cost = 10; | ||
| 2357 | const int gap_extend_cost = 1; | ||
| 2358 | const int pos_inf = INT_MAX / 2; | ||
| 2359 | static int M[FLEX_MAX_MATRIX_SIZE]; | ||
| 2360 | static int D[FLEX_MAX_MATRIX_SIZE]; | ||
| 2361 | |||
| 2362 | /* Bail if strings are empty or matrix too large. */ | ||
| 2363 | if (patlen == 0 || strlen == 0 || size > FLEX_MAX_MATRIX_SIZE) | ||
| 2364 | return Qnil; | ||
| 2365 | |||
| 2366 | /* Initialize M and D with positive infinity... */ | ||
| 2367 | for (int j = 0; j < size; j++) | ||
| 2368 | M[j] = D[j] = pos_inf; | ||
| 2369 | /* ...except for D[-1,-1], which is 0 to promote matches at the | ||
| 2370 | beginning. Rest of first row has gap_open_cost/2 for cheaper | ||
| 2371 | leading gaps. */ | ||
| 2372 | for (int j = 0; j < width; j++) | ||
| 2373 | D[j] = gap_open_cost / 2; | ||
| 2374 | D[0] = 0; | ||
| 2375 | |||
| 2376 | /* Poor man's iterator type: cur char idx, next char idx, next byte idx. */ | ||
| 2377 | typedef struct iter { int x; ptrdiff_t n; ptrdiff_t b; } iter_t; | ||
| 2378 | |||
| 2379 | /* Position of first match found in the previous row, to save | ||
| 2380 | iterations. */ | ||
| 2381 | iter_t prev_match = { 0, 0, 0 }; | ||
| 2382 | |||
| 2383 | /* Forward pass. */ | ||
| 2384 | for (iter_t i = { 0, 0, 0 }; i.x < patlen; i.x++) | ||
| 2385 | { | ||
| 2386 | int pat_char = fetch_string_char_advance (pat, &i.n, &i.b); | ||
| 2387 | bool match_seen = false; | ||
| 2388 | |||
| 2389 | for (iter_t j = prev_match; j.x < strlen; j.x++) | ||
| 2390 | { | ||
| 2391 | iter_t jcopy = j; /* else advance function destroys it... */ | ||
| 2392 | int str_char = fetch_string_char_advance (str, &j.n, &j.b); | ||
| 2393 | |||
| 2394 | /* Check if characters match (case-insensitive if needed). */ | ||
| 2395 | bool cmatch; | ||
| 2396 | if (completion_ignore_case) | ||
| 2397 | cmatch = (downcase (pat_char) == downcase (str_char)); | ||
| 2398 | else | ||
| 2399 | cmatch = (pat_char == str_char); | ||
| 2400 | |||
| 2401 | if (cmatch) | ||
| 2402 | { | ||
| 2403 | /* There is a match here, so compute match cost | ||
| 2404 | M[i][j], i.e. replace its infinite value with | ||
| 2405 | something finite. */ | ||
| 2406 | if (!match_seen) | ||
| 2407 | { | ||
| 2408 | match_seen = true; | ||
| 2409 | prev_match = jcopy; | ||
| 2410 | } | ||
| 2411 | /* Compute M[i,j]. If we pick M[i-1,j-1] it means that | ||
| 2412 | not only did the previous char also match (else | ||
| 2413 | M[i-1,j-1] would have been infinite) but following | ||
| 2414 | it up with this match is best overall. If we pick | ||
| 2415 | D[i-1, j-1] it means that gapping is best, | ||
| 2416 | regardless of whether the previous char also | ||
| 2417 | matched. That is, it's better to arrive at this | ||
| 2418 | match from a gap. */ | ||
| 2419 | MAT (M, i.x, j.x) = min (MAT (M, i.x - 1, j.x - 1), | ||
| 2420 | MAT (D, i.x - 1, j.x - 1)); | ||
| 2421 | } | ||
| 2422 | /* Regardless of a match here, compute D[i,j], the best | ||
| 2423 | accumulated gapping cost at this point, considering | ||
| 2424 | whether it's more advantageous to open from a previous | ||
| 2425 | match on this row (a cost which may well be infinite if | ||
| 2426 | no such match ever existed) or extend a gap started | ||
| 2427 | sometime before. The next iteration will take this | ||
| 2428 | into account, and so will the next row when analyzing a | ||
| 2429 | possible match for the j+1-th string character. */ | ||
| 2430 | MAT (D, i.x, j.x) | ||
| 2431 | = min (MAT (M, i.x, j.x - 1) + gap_open_cost, | ||
| 2432 | MAT (D, i.x, j.x - 1) + gap_extend_cost); | ||
| 2433 | } | ||
| 2434 | } | ||
| 2435 | /* Find lowest cost in last row. */ | ||
| 2436 | int best_cost = pos_inf; | ||
| 2437 | int lastcol = -1; | ||
| 2438 | for (int j = 0; j < strlen; j++) | ||
| 2439 | { | ||
| 2440 | int cost = MAT (M, patlen - 1, j); | ||
| 2441 | if (cost < best_cost) | ||
| 2442 | { | ||
| 2443 | best_cost = cost; | ||
| 2444 | lastcol = j; | ||
| 2445 | } | ||
| 2446 | } | ||
| 2447 | |||
| 2448 | /* Return early if no match. */ | ||
| 2449 | if (lastcol < 0 || best_cost >= pos_inf) | ||
| 2450 | return Qnil; | ||
| 2451 | |||
| 2452 | /* Go backwards to build match positions list. */ | ||
| 2453 | Lisp_Object matches = Fcons (make_fixnum (lastcol), Qnil); | ||
| 2454 | for (int i = patlen - 2, l = lastcol; i >= 0; --i) | ||
| 2455 | { | ||
| 2456 | do --l; while (l >= 0 && MAT (M, i, l) >= MAT (D, i, l)); | ||
| 2457 | matches = Fcons (make_fixnum (l), matches); | ||
| 2458 | } | ||
| 2459 | |||
| 2460 | return Fcons (make_fixnum (best_cost), matches); | ||
| 2461 | #undef MAT | ||
| 2462 | } | ||
| 2463 | |||
| 2282 | void | 2464 | void |
| 2283 | syms_of_minibuf (void) | 2465 | syms_of_minibuf (void) |
| 2284 | { | 2466 | { |
| @@ -2541,6 +2723,7 @@ showing the *Completions* buffer, if any. */); | |||
| 2541 | defsubr (&Stest_completion); | 2723 | defsubr (&Stest_completion); |
| 2542 | defsubr (&Sassoc_string); | 2724 | defsubr (&Sassoc_string); |
| 2543 | defsubr (&Scompleting_read); | 2725 | defsubr (&Scompleting_read); |
| 2726 | defsubr (&Scompletion__flex_cost_gotoh); | ||
| 2544 | DEFSYM (Qminibuffer_quit_recursive_edit, "minibuffer-quit-recursive-edit"); | 2727 | DEFSYM (Qminibuffer_quit_recursive_edit, "minibuffer-quit-recursive-edit"); |
| 2545 | DEFSYM (Qinternal_complete_buffer, "internal-complete-buffer"); | 2728 | DEFSYM (Qinternal_complete_buffer, "internal-complete-buffer"); |
| 2546 | DEFSYM (Qcompleting_read_function, "completing-read-function"); | 2729 | DEFSYM (Qcompleting_read_function, "completing-read-function"); |