aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorMattias Engdegård2022-10-06 17:46:02 +0200
committerMattias Engdegård2022-10-07 13:57:54 +0200
commitdef6fa4246502befa174aa6409166b0967621f7b (patch)
tree42ec7fa50fc85593ac2357e1da445099b3b691b0 /src
parent6b4c17dec06b7cac4025317daef68c302c61d4e6 (diff)
downloademacs-def6fa4246502befa174aa6409166b0967621f7b.tar.gz
emacs-def6fa4246502befa174aa6409166b0967621f7b.zip
Speed up string-lessp for multibyte strings
Improve comparison speed when both arguments are multibyte strings, at least one of them containing a non-ASCII character. (All-ASCII multibyte strings are already fast.) The speed-up is about 2× for strings of 10 chars, 10× for strings of 100 chars. * src/fns.c (Fstring_lessp): Quickly skip the common prefix by comparing words.
Diffstat (limited to 'src')
-rw-r--r--src/fns.c51
1 files changed, 39 insertions, 12 deletions
diff --git a/src/fns.c b/src/fns.c
index 22e66d3653d..bc4915eb25b 100644
--- a/src/fns.c
+++ b/src/fns.c
@@ -454,23 +454,50 @@ Symbols are also allowed; their print names are used instead. */)
454 && (!STRING_MULTIBYTE (string2) || SCHARS (string2) == SBYTES (string2))) 454 && (!STRING_MULTIBYTE (string2) || SCHARS (string2) == SBYTES (string2)))
455 { 455 {
456 /* Each argument is either unibyte or all-ASCII multibyte: 456 /* Each argument is either unibyte or all-ASCII multibyte:
457 we can compare bytewise. 457 we can compare bytewise. */
458 (Arbitrary multibyte strings cannot be compared bytewise because
459 that would give a different order for raw bytes 80..FF.) */
460 int d = memcmp (SSDATA (string1), SSDATA (string2), n); 458 int d = memcmp (SSDATA (string1), SSDATA (string2), n);
461 return d < 0 || (d == 0 && n < SCHARS (string2)) ? Qt : Qnil; 459 return d < 0 || (d == 0 && n < SCHARS (string2)) ? Qt : Qnil;
462 } 460 }
463 else if (STRING_MULTIBYTE (string1) && STRING_MULTIBYTE (string2)) 461 else if (STRING_MULTIBYTE (string1) && STRING_MULTIBYTE (string2))
464 { 462 {
465 ptrdiff_t i1 = 0, i1_byte = 0, i2 = 0, i2_byte = 0; 463 /* Two arbitrary multibyte strings: we cannot use memcmp because
466 while (i1 < n) 464 the encoding for raw bytes would sort those between U+007F and U+0080
467 { 465 which isn't where we want them.
468 int c1 = fetch_string_char_advance_no_check (string1, &i1, &i1_byte); 466 Instead, we skip the longest common prefix and look at
469 int c2 = fetch_string_char_advance_no_check (string2, &i2, &i2_byte); 467 what follows. */
470 if (c1 != c2) 468 ptrdiff_t nb1 = SBYTES (string1);
471 return c1 < c2 ? Qt : Qnil; 469 ptrdiff_t nb2 = SBYTES (string2);
472 } 470 ptrdiff_t nb = min (nb1, nb2);
473 return i1 < SCHARS (string2) ? Qt : Qnil; 471
472 /* First compare entire machine words. (String data is allocated
473 with word alignment.) */
474 typedef size_t word_t;
475 int ws = sizeof (word_t);
476 const word_t *w1 = (const word_t *) SDATA (string1);
477 const word_t *w2 = (const word_t *) SDATA (string2);
478 ptrdiff_t b = 0;
479 while (b < nb - ws + 1 && w1[b / ws] == w2[b / ws])
480 b += ws;
481
482 /* Scan forward to the differing byte (at most ws-1 bytes). */
483 while (b < nb && SREF (string1, b) == SREF (string2, b))
484 b++;
485
486 if (b >= nb)
487 /* One string is a prefix of the other. */
488 return b < nb2 ? Qt : Qnil;
489
490 /* Now back up to the start of the differing characters:
491 it's the last byte not having the bit pattern 10xxxxxx. */
492 while ((SREF (string1, b) & 0xc0) == 0x80)
493 b--;
494
495 /* Compare the differing characters. */
496 ptrdiff_t i1 = 0, i2 = 0;
497 ptrdiff_t i1_byte = b, i2_byte = b;
498 int c1 = fetch_string_char_advance_no_check (string1, &i1, &i1_byte);
499 int c2 = fetch_string_char_advance_no_check (string2, &i2, &i2_byte);
500 return c1 < c2 ? Qt : Qnil;
474 } 501 }
475 else if (STRING_MULTIBYTE (string1)) 502 else if (STRING_MULTIBYTE (string1))
476 { 503 {