diff options
| author | Mattias Engdegård | 2022-10-06 17:46:02 +0200 |
|---|---|---|
| committer | Mattias Engdegård | 2022-10-07 13:57:54 +0200 |
| commit | def6fa4246502befa174aa6409166b0967621f7b (patch) | |
| tree | 42ec7fa50fc85593ac2357e1da445099b3b691b0 /src | |
| parent | 6b4c17dec06b7cac4025317daef68c302c61d4e6 (diff) | |
| download | emacs-def6fa4246502befa174aa6409166b0967621f7b.tar.gz emacs-def6fa4246502befa174aa6409166b0967621f7b.zip | |
Speed up string-lessp for multibyte strings
Improve comparison speed when both arguments are multibyte strings,
at least one of them containing a non-ASCII character. (All-ASCII
multibyte strings are already fast.)
The speed-up is about 2× for strings of 10 chars, 10× for strings of
100 chars.
* src/fns.c (Fstring_lessp): Quickly skip the common prefix by
comparing words.
Diffstat (limited to 'src')
| -rw-r--r-- | src/fns.c | 51 |
1 files changed, 39 insertions, 12 deletions
| @@ -454,23 +454,50 @@ Symbols are also allowed; their print names are used instead. */) | |||
| 454 | && (!STRING_MULTIBYTE (string2) || SCHARS (string2) == SBYTES (string2))) | 454 | && (!STRING_MULTIBYTE (string2) || SCHARS (string2) == SBYTES (string2))) |
| 455 | { | 455 | { |
| 456 | /* Each argument is either unibyte or all-ASCII multibyte: | 456 | /* Each argument is either unibyte or all-ASCII multibyte: |
| 457 | we can compare bytewise. | 457 | we can compare bytewise. */ |
| 458 | (Arbitrary multibyte strings cannot be compared bytewise because | ||
| 459 | that would give a different order for raw bytes 80..FF.) */ | ||
| 460 | int d = memcmp (SSDATA (string1), SSDATA (string2), n); | 458 | int d = memcmp (SSDATA (string1), SSDATA (string2), n); |
| 461 | return d < 0 || (d == 0 && n < SCHARS (string2)) ? Qt : Qnil; | 459 | return d < 0 || (d == 0 && n < SCHARS (string2)) ? Qt : Qnil; |
| 462 | } | 460 | } |
| 463 | else if (STRING_MULTIBYTE (string1) && STRING_MULTIBYTE (string2)) | 461 | else if (STRING_MULTIBYTE (string1) && STRING_MULTIBYTE (string2)) |
| 464 | { | 462 | { |
| 465 | ptrdiff_t i1 = 0, i1_byte = 0, i2 = 0, i2_byte = 0; | 463 | /* Two arbitrary multibyte strings: we cannot use memcmp because |
| 466 | while (i1 < n) | 464 | the encoding for raw bytes would sort those between U+007F and U+0080 |
| 467 | { | 465 | which isn't where we want them. |
| 468 | int c1 = fetch_string_char_advance_no_check (string1, &i1, &i1_byte); | 466 | Instead, we skip the longest common prefix and look at |
| 469 | int c2 = fetch_string_char_advance_no_check (string2, &i2, &i2_byte); | 467 | what follows. */ |
| 470 | if (c1 != c2) | 468 | ptrdiff_t nb1 = SBYTES (string1); |
| 471 | return c1 < c2 ? Qt : Qnil; | 469 | ptrdiff_t nb2 = SBYTES (string2); |
| 472 | } | 470 | ptrdiff_t nb = min (nb1, nb2); |
| 473 | return i1 < SCHARS (string2) ? Qt : Qnil; | 471 | |
| 472 | /* First compare entire machine words. (String data is allocated | ||
| 473 | with word alignment.) */ | ||
| 474 | typedef size_t word_t; | ||
| 475 | int ws = sizeof (word_t); | ||
| 476 | const word_t *w1 = (const word_t *) SDATA (string1); | ||
| 477 | const word_t *w2 = (const word_t *) SDATA (string2); | ||
| 478 | ptrdiff_t b = 0; | ||
| 479 | while (b < nb - ws + 1 && w1[b / ws] == w2[b / ws]) | ||
| 480 | b += ws; | ||
| 481 | |||
| 482 | /* Scan forward to the differing byte (at most ws-1 bytes). */ | ||
| 483 | while (b < nb && SREF (string1, b) == SREF (string2, b)) | ||
| 484 | b++; | ||
| 485 | |||
| 486 | if (b >= nb) | ||
| 487 | /* One string is a prefix of the other. */ | ||
| 488 | return b < nb2 ? Qt : Qnil; | ||
| 489 | |||
| 490 | /* Now back up to the start of the differing characters: | ||
| 491 | it's the last byte not having the bit pattern 10xxxxxx. */ | ||
| 492 | while ((SREF (string1, b) & 0xc0) == 0x80) | ||
| 493 | b--; | ||
| 494 | |||
| 495 | /* Compare the differing characters. */ | ||
| 496 | ptrdiff_t i1 = 0, i2 = 0; | ||
| 497 | ptrdiff_t i1_byte = b, i2_byte = b; | ||
| 498 | int c1 = fetch_string_char_advance_no_check (string1, &i1, &i1_byte); | ||
| 499 | int c2 = fetch_string_char_advance_no_check (string2, &i2, &i2_byte); | ||
| 500 | return c1 < c2 ? Qt : Qnil; | ||
| 474 | } | 501 | } |
| 475 | else if (STRING_MULTIBYTE (string1)) | 502 | else if (STRING_MULTIBYTE (string1)) |
| 476 | { | 503 | { |