Rewrite flex completion with Gotoh algorithm

The greedy regexp matching, broken scoring and broken highlight were sources of frequent complaints about the 'flex' matching style. This commit fixes that. Inspired by the 'hotfuzz' style (available at https://github.com/axelf4/hotfuzz) it uses a modified version of Gotoh's 1982 dynamic programming algorithm. It is strictly more correct than the "old" flex. For example, when matching the pattern 'goto' to no longer will 'eglot-format' be sorted before some hypothetical much better 'goobarbaz-goto'. And of course the highlighting is also correctly placed on the 'goto', not scattered across the candidate. Regarding performance, it is faster than the naive 'flex', primarily because of the Elisp rewrite in minibuffer.el. The matching and costing algorithm matters but is not the bottleneck. The Elisp parts of the style were almost completely decoupled from the pcm/substring styles in lisp/minibuffer.el. Only 'completion-flex-try-completion' uses some of pcm's code for pattern augmentation. * src/minibuf.c (completion--flex-cost-gotoh): New function. * lisp/minibuffer.el (completion-flex--pattern-str): New variable. (flex-score-match-tightness): Make obsolete. (completion--flex-all-completions-1): New helper function. (completion-flex-try-completion, completion-flex-all-completions): Rewrite. (completion-substring--all-completions): No longer take transform-pattern-fn. (completion--flex-adjust-metadata): Tweak. (completion--flex-score, completion--flex-score-1) (completion--flex-score-last-md, completion-flex--make-flex-pattern): Delete. * test/lisp/minibuffer-tests.el (completion--sorted-flex-completions): New helper function. (completion-flex-test-non-ascii): New test. (completion--pcm-score): Delete. (completion-pcm-test-3, completion-pcm-test-4) (completion-substring-test-1, completion-substring-test-2) (completion-flex-test-2, completion-flex-test-3): Tweak. * etc/NEWS: Describe change.
author: João Távora 2026-02-03 12:14:03 +0000
committer: João Távora 2026-02-13 23:46:11 +0000
commit: aa181cd35220242a23fe98932386facaba18c4c5 (patch)
tree: 0ba9c1d7b47bfa434baf62512a67249f9d4c422a /src
parent: 52b24ed7aacb2e879c68e0f51f3b32c1700c9324 (diff)
download: emacs-aa181cd35220242a23fe98932386facaba18c4c5.tar.gz
emacs-aa181cd35220242a23fe98932386facaba18c4c5.zip
1 files changed, 183 insertions, 0 deletions
diff --git a/src/minibuf.c b/src/minibuf.c
index 5dc2b230883..e49663e2f86 100644
--- a/src/minibuf.c
+++ b/src/minibuf.c
@@ -2279,6 +2279,188 @@ init_minibuf_once_for_pdumper (void)
  last_minibuf_string = Qnil;
 }
+/* FLEX/GOTOH algorithm for the 'flex' completion-style.  Adapted from
+   GOTOH, Osamu.  An improved algorithm for matching biological
+   sequences. Journal of molecular biology, 1982, 162.3: 705-708.
+   This algorithm matches patterns to candidate strings, or needles to
+   haystacks.  It works with cost matrices: imagine rows of these
+   matrices as pattern characters, and columns as the candidate string
+   characters.  There is a -1 row, and a -1 column.  The values there
+   hold real costs used for situations "before the first ever" match of
+   a pattern character to a string character.
+   M and D are cost matrices.  At the end of the algorithm, M will have
+   non-infinite values only for the spots where a pattern character
+   matches a string character.  So a non-infinite M[i,j] means the i-th
+   character of the pattern matches the j-th character of the string.
+   The value stored is the lowest possible cost the algorithm had to
+   "pay" to be able to make that match there, given everything that may
+   have happened before/to the left.  An infinite value simply means no
+   match at this pattern/string position.  Note that both rows and
+   columns of M may have more than one match at multiple indices.  If
+   some row or column of M has no match at all, the final cost will be
+   infinite, and the funtion will return nil.
+   D (originally stands for 'Distance' in the Gotoh paper) has "running
+   costs".  Each value D[i,j] represents what the algorithm has to pay
+   to make or extend a gap when a match is found at i+1, j+1.  By that
+   time, that cost may or may not be lower than continuing from a match
+   that had also been found at i,j.  We always pick the lowest cost, and
+   by the time we reach the final column, we know we have picked the
+   cheapest possible path choosing when to gap, and when to follow up.
+   Here's an illustration of the final state of M and D when matching
+   "goto" to "eglot--goto".  Notice how the algorithm detects but
+   ultimately discards the scattered match at indexes 1,3,4,8, which
+   would have total cost 27, and ultimately settles on the match at
+   positions 7,8,9,10 which has cost 5:
+             -1  0  1  2  3  4  5  6  7  8  9 10
+              -  e  g  l  o  t  -  -  g  o  t  o
+   M:  -1  -  ∞  ∞  ∞  ∞  ∞  ∞  ∞  ∞  ∞  ∞  ∞  ∞
+        0  g  ∞  ∞  5  ∞  ∞  ∞  ∞  ∞  5  ∞  ∞  ∞
+        1  o  ∞  ∞  ∞  ∞ 15  ∞  ∞  ∞  ∞  5  ∞ 16
+        2  t  ∞  ∞  ∞  ∞  ∞ 15  ∞  ∞  ∞  ∞  5  ∞
+        3  o  ∞  ∞  ∞  ∞  ∞  ∞  ∞  ∞  ∞ 27  ∞  5
+   D:  -1  -  0  5  5  5  5  5  5  5  5  5  5  5
+        0  g  ∞  ∞  ∞ 15 16 17 18 19 20 15 16 17
+        1  o  ∞  ∞  ∞  ∞  ∞ 25 26 27 28 29 15 16
+        2  t  ∞  ∞  ∞  ∞  ∞  ∞ 25 26 27 28 29 15
+        3  o  ∞  ∞  ∞  ∞  ∞  ∞  ∞  ∞  ∞  ∞ 37 38
+ **/
+DEFUN ("completion--flex-cost-gotoh", Fcompletion__flex_cost_gotoh,
+       Scompletion__flex_cost_gotoh, 2, 2, 0,
+       doc: /* Compute cost of PAT matching STR using modified Gotoh
+algorithm.  Return nil if no match found, else return (COST . MATCHES)
+where COST is a fixnum (lower is better) and MATCHES is a list of the
+same length as PAT.  Each i-th element is a FIXNUM indicating where in
+STR the i-th character of PAT matched.  */)
+  (Lisp_Object pat, Lisp_Object str)
+  {
+    /* Pre-allocated matrices for flex completion scoring.  */
+#define FLEX_MAX_STR_SIZE 512
+#define FLEX_MAX_PAT_SIZE 128
+#define FLEX_MAX_MATRIX_SIZE FLEX_MAX_PAT_SIZE * FLEX_MAX_STR_SIZE
+    /* Macro for 2D indexing into "flat" arrays.  */
+#define MAT(matrix, i, j) ((matrix)[((i) + 1) * width + ((j) + 1)])
+    CHECK_STRING (pat);
+    CHECK_STRING (str);
+    ptrdiff_t patlen = SCHARS (pat);
+    ptrdiff_t strlen = SCHARS (str);
+    ptrdiff_t width = strlen + 1;
+    ptrdiff_t size = (patlen + 1) * width;
+    const int gap_open_cost = 10;
+    const int gap_extend_cost = 1;
+    const int pos_inf = INT_MAX / 2;
+    static int M[FLEX_MAX_MATRIX_SIZE];
+    static int D[FLEX_MAX_MATRIX_SIZE];
+    /* Bail if strings are empty or matrix too large.  */
+    if (patlen == 0 || strlen == 0 || size > FLEX_MAX_MATRIX_SIZE)
+      return Qnil;
+    /* Initialize M and D with positive infinity...  */
+    for (int j = 0; j < size; j++)
+      M[j] = D[j] = pos_inf;
+    /* ...except for D[-1,-1], which is 0 to promote matches at the
+       beginning.  Rest of first row has gap_open_cost/2 for cheaper
+       leading gaps.  */
+    for (int j = 0; j < width; j++)
+      D[j] = gap_open_cost / 2;
+    D[0] = 0;
+    /* Poor man's iterator type: cur char idx, next char idx, next byte idx.  */
+    typedef struct iter { int x; ptrdiff_t n; ptrdiff_t b; } iter_t;
+    /* Position of first match found in the previous row, to save
+       iterations.  */
+    iter_t prev_match = { 0, 0, 0 };
+    /* Forward pass.  */
+    for (iter_t i = { 0, 0, 0 }; i.x < patlen; i.x++)
+      {
+        int pat_char = fetch_string_char_advance (pat, &i.n, &i.b);
+        bool match_seen = false;
+        for (iter_t j = prev_match; j.x < strlen; j.x++)
+          {
+            iter_t jcopy = j; /* else advance function destroys it...  */
+            int str_char = fetch_string_char_advance (str, &j.n, &j.b);
+            /* Check if characters match (case-insensitive if needed).  */
+            bool cmatch;
+            if (completion_ignore_case)
+              cmatch = (downcase (pat_char) == downcase (str_char));
+            else
+              cmatch = (pat_char == str_char);
+            if (cmatch)
+              {
+                /* There is a match here, so compute match cost
+                   M[i][j], i.e. replace its infinite value with
+                   something finite.  */
+                if (!match_seen)
+                  {
+                    match_seen = true;
+                    prev_match = jcopy;
+                  }
+                /* Compute M[i,j]. If we pick M[i-1,j-1] it means that
+                   not only did the previous char also match (else
+                   M[i-1,j-1] would have been infinite) but following
+                   it up with this match is best overall.  If we pick
+                   D[i-1, j-1] it means that gapping is best,
+                   regardless of whether the previous char also
+                   matched.  That is, it's better to arrive at this
+                   match from a gap.  */
+                MAT (M, i.x, j.x) = min (MAT (M, i.x - 1, j.x - 1),
+                                         MAT (D, i.x - 1, j.x - 1));
+              }
+            /* Regardless of a match here, compute D[i,j], the best
+               accumulated gapping cost at this point, considering
+               whether it's more advantageous to open from a previous
+               match on this row (a cost which may well be infinite if
+               no such match ever existed) or extend a gap started
+               sometime before.  The next iteration will take this
+               into account, and so will the next row when analyzing a
+               possible match for the j+1-th string character.  */
+            MAT (D, i.x, j.x)
+              = min (MAT (M, i.x, j.x - 1) + gap_open_cost,
+                     MAT (D, i.x, j.x - 1) + gap_extend_cost);
+          }
+      }
+    /* Find lowest cost in last row.  */
+    int best_cost = pos_inf;
+    int lastcol = -1;
+    for (int j = 0; j < strlen; j++)
+      {
+        int cost = MAT (M, patlen - 1, j);
+        if (cost < best_cost)
+          {
+            best_cost = cost;
+            lastcol = j;
+          }
+      }
+    /* Return early if no match.  */
+    if (lastcol < 0 || best_cost >= pos_inf)
+      return Qnil;
+    /* Go backwards to build match positions list.  */
+    Lisp_Object matches = Fcons (make_fixnum (lastcol), Qnil);
+    for (int i = patlen - 2, l = lastcol; i >= 0; --i)
+      {
+        do --l; while (l >= 0 && MAT (M, i, l) >= MAT (D, i, l));
+        matches = Fcons (make_fixnum (l), matches);
+      }
+    return Fcons (make_fixnum (best_cost), matches);
+#undef MAT
+  }
 void
 syms_of_minibuf (void)
 {
@@ -2541,6 +2723,7 @@ showing the *Completions* buffer, if any.  */);
  defsubr (&Stest_completion);
  defsubr (&Sassoc_string);
  defsubr (&Scompleting_read);
+  defsubr (&Scompletion__flex_cost_gotoh);
  DEFSYM (Qminibuffer_quit_recursive_edit, "minibuffer-quit-recursive-edit");
  DEFSYM (Qinternal_complete_buffer, "internal-complete-buffer");
  DEFSYM (Qcompleting_read_function, "completing-read-function");
author	João Távora	2026-02-03 12:14:03 +0000
committer	João Távora	2026-02-13 23:46:11 +0000
commit	aa181cd35220242a23fe98932386facaba18c4c5 (patch)
tree	0ba9c1d7b47bfa434baf62512a67249f9d4c422a /src
parent	52b24ed7aacb2e879c68e0f51f3b32c1700c9324 (diff)
download	emacs-aa181cd35220242a23fe98932386facaba18c4c5.tar.gz emacs-aa181cd35220242a23fe98932386facaba18c4c5.zip

diff --git a/src/minibuf.c b/src/minibuf.c index 5dc2b230883..e49663e2f86 100644 --- a/src/minibuf.c +++ b/src/minibuf.c
@@ -2279,6 +2279,188 @@ init_minibuf_once_for_pdumper (void)
2279	last_minibuf_string = Qnil;	2279	last_minibuf_string = Qnil;
2280	}	2280	}
2281		2281
		2282	/* FLEX/GOTOH algorithm for the 'flex' completion-style. Adapted from
		2283	GOTOH, Osamu. An improved algorithm for matching biological
		2284	sequences. Journal of molecular biology, 1982, 162.3: 705-708.
		2285
		2286	This algorithm matches patterns to candidate strings, or needles to
		2287	haystacks. It works with cost matrices: imagine rows of these
		2288	matrices as pattern characters, and columns as the candidate string
		2289	characters. There is a -1 row, and a -1 column. The values there
		2290	hold real costs used for situations "before the first ever" match of
		2291	a pattern character to a string character.
		2292
		2293	M and D are cost matrices. At the end of the algorithm, M will have
		2294	non-infinite values only for the spots where a pattern character
		2295	matches a string character. So a non-infinite M[i,j] means the i-th
		2296	character of the pattern matches the j-th character of the string.
		2297	The value stored is the lowest possible cost the algorithm had to
		2298	"pay" to be able to make that match there, given everything that may
		2299	have happened before/to the left. An infinite value simply means no
		2300	match at this pattern/string position. Note that both rows and
		2301	columns of M may have more than one match at multiple indices. If
		2302	some row or column of M has no match at all, the final cost will be
		2303	infinite, and the funtion will return nil.
		2304
		2305	D (originally stands for 'Distance' in the Gotoh paper) has "running
		2306	costs". Each value D[i,j] represents what the algorithm has to pay
		2307	to make or extend a gap when a match is found at i+1, j+1. By that
		2308	time, that cost may or may not be lower than continuing from a match
		2309	that had also been found at i,j. We always pick the lowest cost, and
		2310	by the time we reach the final column, we know we have picked the
		2311	cheapest possible path choosing when to gap, and when to follow up.
		2312
		2313	Here's an illustration of the final state of M and D when matching
		2314	"goto" to "eglot--goto". Notice how the algorithm detects but
		2315	ultimately discards the scattered match at indexes 1,3,4,8, which
		2316	would have total cost 27, and ultimately settles on the match at
		2317	positions 7,8,9,10 which has cost 5:
		2318
		2319	-1 0 1 2 3 4 5 6 7 8 9 10
		2320	- e g l o t - - g o t o
		2321	M: -1 - ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
		2322	0 g ∞ ∞ 5 ∞ ∞ ∞ ∞ ∞ 5 ∞ ∞ ∞
		2323	1 o ∞ ∞ ∞ ∞ 15 ∞ ∞ ∞ ∞ 5 ∞ 16
		2324	2 t ∞ ∞ ∞ ∞ ∞ 15 ∞ ∞ ∞ ∞ 5 ∞
		2325	3 o ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ 27 ∞ 5
		2326
		2327	D: -1 - 0 5 5 5 5 5 5 5 5 5 5 5
		2328	0 g ∞ ∞ ∞ 15 16 17 18 19 20 15 16 17
		2329	1 o ∞ ∞ ∞ ∞ ∞ 25 26 27 28 29 15 16
		2330	2 t ∞ ∞ ∞ ∞ ∞ ∞ 25 26 27 28 29 15
		2331	3 o ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ 37 38
		2332	**/
		2333	DEFUN ("completion--flex-cost-gotoh", Fcompletion__flex_cost_gotoh,
		2334	Scompletion__flex_cost_gotoh, 2, 2, 0,
		2335	doc: /* Compute cost of PAT matching STR using modified Gotoh
		2336	algorithm. Return nil if no match found, else return (COST . MATCHES)
		2337	where COST is a fixnum (lower is better) and MATCHES is a list of the
		2338	same length as PAT. Each i-th element is a FIXNUM indicating where in
		2339	STR the i-th character of PAT matched. */)
		2340	(Lisp_Object pat, Lisp_Object str)
		2341	{
		2342	/* Pre-allocated matrices for flex completion scoring. */
		2343	#define FLEX_MAX_STR_SIZE 512
		2344	#define FLEX_MAX_PAT_SIZE 128
		2345	#define FLEX_MAX_MATRIX_SIZE FLEX_MAX_PAT_SIZE * FLEX_MAX_STR_SIZE
		2346	/* Macro for 2D indexing into "flat" arrays. */
		2347	#define MAT(matrix, i, j) ((matrix)[((i) + 1) * width + ((j) + 1)])
		2348
		2349	CHECK_STRING (pat);
		2350	CHECK_STRING (str);
		2351
		2352	ptrdiff_t patlen = SCHARS (pat);
		2353	ptrdiff_t strlen = SCHARS (str);
		2354	ptrdiff_t width = strlen + 1;
		2355	ptrdiff_t size = (patlen + 1) * width;
		2356	const int gap_open_cost = 10;
		2357	const int gap_extend_cost = 1;
		2358	const int pos_inf = INT_MAX / 2;
		2359	static int M[FLEX_MAX_MATRIX_SIZE];
		2360	static int D[FLEX_MAX_MATRIX_SIZE];
		2361
		2362	/* Bail if strings are empty or matrix too large. */
		2363	if (patlen == 0 \|\| strlen == 0 \|\| size > FLEX_MAX_MATRIX_SIZE)
		2364	return Qnil;
		2365
		2366	/* Initialize M and D with positive infinity... */
		2367	for (int j = 0; j < size; j++)
		2368	M[j] = D[j] = pos_inf;
		2369	/* ...except for D[-1,-1], which is 0 to promote matches at the
		2370	beginning. Rest of first row has gap_open_cost/2 for cheaper
		2371	leading gaps. */
		2372	for (int j = 0; j < width; j++)
		2373	D[j] = gap_open_cost / 2;
		2374	D[0] = 0;
		2375
		2376	/* Poor man's iterator type: cur char idx, next char idx, next byte idx. */
		2377	typedef struct iter { int x; ptrdiff_t n; ptrdiff_t b; } iter_t;
		2378
		2379	/* Position of first match found in the previous row, to save
		2380	iterations. */
		2381	iter_t prev_match = { 0, 0, 0 };
		2382
		2383	/* Forward pass. */
		2384	for (iter_t i = { 0, 0, 0 }; i.x < patlen; i.x++)
		2385	{
		2386	int pat_char = fetch_string_char_advance (pat, &i.n, &i.b);
		2387	bool match_seen = false;
		2388
		2389	for (iter_t j = prev_match; j.x < strlen; j.x++)
		2390	{
		2391	iter_t jcopy = j; /* else advance function destroys it... */
		2392	int str_char = fetch_string_char_advance (str, &j.n, &j.b);
		2393
		2394	/* Check if characters match (case-insensitive if needed). */
		2395	bool cmatch;
		2396	if (completion_ignore_case)
		2397	cmatch = (downcase (pat_char) == downcase (str_char));
		2398	else
		2399	cmatch = (pat_char == str_char);
		2400
		2401	if (cmatch)
		2402	{
		2403	/* There is a match here, so compute match cost
		2404	M[i][j], i.e. replace its infinite value with
		2405	something finite. */
		2406	if (!match_seen)
		2407	{
		2408	match_seen = true;
		2409	prev_match = jcopy;
		2410	}
		2411	/* Compute M[i,j]. If we pick M[i-1,j-1] it means that
		2412	not only did the previous char also match (else
		2413	M[i-1,j-1] would have been infinite) but following
		2414	it up with this match is best overall. If we pick
		2415	D[i-1, j-1] it means that gapping is best,
		2416	regardless of whether the previous char also
		2417	matched. That is, it's better to arrive at this
		2418	match from a gap. */
		2419	MAT (M, i.x, j.x) = min (MAT (M, i.x - 1, j.x - 1),
		2420	MAT (D, i.x - 1, j.x - 1));
		2421	}
		2422	/* Regardless of a match here, compute D[i,j], the best
		2423	accumulated gapping cost at this point, considering
		2424	whether it's more advantageous to open from a previous
		2425	match on this row (a cost which may well be infinite if
		2426	no such match ever existed) or extend a gap started
		2427	sometime before. The next iteration will take this
		2428	into account, and so will the next row when analyzing a
		2429	possible match for the j+1-th string character. */
		2430	MAT (D, i.x, j.x)
		2431	= min (MAT (M, i.x, j.x - 1) + gap_open_cost,
		2432	MAT (D, i.x, j.x - 1) + gap_extend_cost);
		2433	}
		2434	}
		2435	/* Find lowest cost in last row. */
		2436	int best_cost = pos_inf;
		2437	int lastcol = -1;
		2438	for (int j = 0; j < strlen; j++)
		2439	{
		2440	int cost = MAT (M, patlen - 1, j);
		2441	if (cost < best_cost)
		2442	{
		2443	best_cost = cost;
		2444	lastcol = j;
		2445	}
		2446	}
		2447
		2448	/* Return early if no match. */
		2449	if (lastcol < 0 \|\| best_cost >= pos_inf)
		2450	return Qnil;
		2451
		2452	/* Go backwards to build match positions list. */
		2453	Lisp_Object matches = Fcons (make_fixnum (lastcol), Qnil);
		2454	for (int i = patlen - 2, l = lastcol; i >= 0; --i)
		2455	{
		2456	do --l; while (l >= 0 && MAT (M, i, l) >= MAT (D, i, l));
		2457	matches = Fcons (make_fixnum (l), matches);
		2458	}
		2459
		2460	return Fcons (make_fixnum (best_cost), matches);
		2461	#undef MAT
		2462	}
		2463
2282	void	2464	void
2283	syms_of_minibuf (void)	2465	syms_of_minibuf (void)
2284	{	2466	{
@@ -2541,6 +2723,7 @@ showing the Completions buffer, if any. */);
2541	defsubr (&Stest_completion);	2723	defsubr (&Stest_completion);
2542	defsubr (&Sassoc_string);	2724	defsubr (&Sassoc_string);
2543	defsubr (&Scompleting_read);	2725	defsubr (&Scompleting_read);
		2726	defsubr (&Scompletion__flex_cost_gotoh);
2544	DEFSYM (Qminibuffer_quit_recursive_edit, "minibuffer-quit-recursive-edit");	2727	DEFSYM (Qminibuffer_quit_recursive_edit, "minibuffer-quit-recursive-edit");
2545	DEFSYM (Qinternal_complete_buffer, "internal-complete-buffer");	2728	DEFSYM (Qinternal_complete_buffer, "internal-complete-buffer");
2546	DEFSYM (Qcompleting_read_function, "completing-read-function");	2729	DEFSYM (Qcompleting_read_function, "completing-read-function");