diff options
| author | Mattias EngdegÄrd | 2024-02-13 14:52:39 +0100 |
|---|---|---|
| committer | Mattias EngdegÄrd | 2024-02-14 14:29:54 +0100 |
| commit | 3a93e301ddc913758abe05c876aa3016e8b23af8 (patch) | |
| tree | 60e17da4c915a15af930efe1c7fc0bc8532d58fb /src | |
| parent | decfdd4f1a1e3b1539eafdaaf11191e8477f0636 (diff) | |
| download | emacs-3a93e301ddc913758abe05c876aa3016e8b23af8.tar.gz emacs-3a93e301ddc913758abe05c876aa3016e8b23af8.zip | |
String hashing improvements (spread and performance)
Fix gaps in hashing coverage in the middle and end of even fairly short
strings. E.g., `outline-1`, `outline-2` etc all hashed to the exact
same value but with the patch, there are no collisions among the ~160000
symbols in the Emacs tree.
This change also improves average hashing speed by using fewer mixing
operations.
* src/fns.c (hash_string):
Use unit stride for fairly short strings, while retaining the cap of 8
samples for long ones.
Always hash the last word to ensure that the end of the string is
covered. For strings shorter than a word, use fewer loads and a single
reduction step.
Diffstat (limited to 'src')
| -rw-r--r-- | src/fns.c | 49 |
1 files changed, 37 insertions, 12 deletions
| @@ -5069,24 +5069,49 @@ hash_string (char const *ptr, ptrdiff_t len) | |||
| 5069 | EMACS_UINT hash = len; | 5069 | EMACS_UINT hash = len; |
| 5070 | /* At most 8 steps. We could reuse SXHASH_MAX_LEN, of course, | 5070 | /* At most 8 steps. We could reuse SXHASH_MAX_LEN, of course, |
| 5071 | * but dividing by 8 is cheaper. */ | 5071 | * but dividing by 8 is cheaper. */ |
| 5072 | ptrdiff_t step = sizeof hash + ((end - p) >> 3); | 5072 | ptrdiff_t step = max (sizeof hash, ((end - p) >> 3)); |
| 5073 | 5073 | ||
| 5074 | while (p + sizeof hash <= end) | 5074 | if (p + sizeof hash <= end) |
| 5075 | { | 5075 | { |
| 5076 | do | ||
| 5077 | { | ||
| 5078 | EMACS_UINT c; | ||
| 5079 | /* We presume that the compiler will replace this `memcpy` with | ||
| 5080 | a single load/move instruction when applicable. */ | ||
| 5081 | memcpy (&c, p, sizeof hash); | ||
| 5082 | p += step; | ||
| 5083 | hash = sxhash_combine (hash, c); | ||
| 5084 | } | ||
| 5085 | while (p + sizeof hash <= end); | ||
| 5086 | /* Hash the last wordful of bytes in the string, because that is | ||
| 5087 | is often the part where strings differ. This may cause some | ||
| 5088 | bytes to be hashed twice but we assume that's not a big problem. */ | ||
| 5076 | EMACS_UINT c; | 5089 | EMACS_UINT c; |
| 5077 | /* We presume that the compiler will replace this `memcpy` with | 5090 | memcpy (&c, end - sizeof c, sizeof c); |
| 5078 | a single load/move instruction when applicable. */ | ||
| 5079 | memcpy (&c, p, sizeof hash); | ||
| 5080 | p += step; | ||
| 5081 | hash = sxhash_combine (hash, c); | 5091 | hash = sxhash_combine (hash, c); |
| 5082 | } | 5092 | } |
| 5083 | /* A few last bytes may remain (smaller than an EMACS_UINT). */ | 5093 | else |
| 5084 | /* FIXME: We could do this without a loop, but it'd require | ||
| 5085 | endian-dependent code :-( */ | ||
| 5086 | while (p < end) | ||
| 5087 | { | 5094 | { |
| 5088 | unsigned char c = *p++; | 5095 | /* String is shorter than an EMACS_UINT. Use smaller loads. */ |
| 5089 | hash = sxhash_combine (hash, c); | 5096 | eassume (p <= end && end - p < sizeof (EMACS_UINT)); |
| 5097 | EMACS_UINT tail = 0; | ||
| 5098 | if (end - p >= 4) | ||
| 5099 | { | ||
| 5100 | uint32_t c; | ||
| 5101 | memcpy (&c, p, sizeof c); | ||
| 5102 | tail = (tail << (8 * sizeof c)) + c; | ||
| 5103 | p += sizeof c; | ||
| 5104 | } | ||
| 5105 | if (end - p >= 2) | ||
| 5106 | { | ||
| 5107 | uint16_t c; | ||
| 5108 | memcpy (&c, p, sizeof c); | ||
| 5109 | tail = (tail << (8 * sizeof c)) + c; | ||
| 5110 | p += sizeof c; | ||
| 5111 | } | ||
| 5112 | if (p < end) | ||
| 5113 | tail = (tail << 8) + (unsigned char)*p; | ||
| 5114 | hash = sxhash_combine (hash, tail); | ||
| 5090 | } | 5115 | } |
| 5091 | 5116 | ||
| 5092 | return hash; | 5117 | return hash; |