aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorMattias EngdegÄrd2024-02-13 14:52:39 +0100
committerMattias EngdegÄrd2024-02-14 14:29:54 +0100
commit3a93e301ddc913758abe05c876aa3016e8b23af8 (patch)
tree60e17da4c915a15af930efe1c7fc0bc8532d58fb /src
parentdecfdd4f1a1e3b1539eafdaaf11191e8477f0636 (diff)
downloademacs-3a93e301ddc913758abe05c876aa3016e8b23af8.tar.gz
emacs-3a93e301ddc913758abe05c876aa3016e8b23af8.zip
String hashing improvements (spread and performance)
Fix gaps in hashing coverage in the middle and end of even fairly short strings. E.g., `outline-1`, `outline-2` etc all hashed to the exact same value but with the patch, there are no collisions among the ~160000 symbols in the Emacs tree. This change also improves average hashing speed by using fewer mixing operations. * src/fns.c (hash_string): Use unit stride for fairly short strings, while retaining the cap of 8 samples for long ones. Always hash the last word to ensure that the end of the string is covered. For strings shorter than a word, use fewer loads and a single reduction step.
Diffstat (limited to 'src')
-rw-r--r--src/fns.c49
1 files changed, 37 insertions, 12 deletions
diff --git a/src/fns.c b/src/fns.c
index 918ba0370e8..f94e8519957 100644
--- a/src/fns.c
+++ b/src/fns.c
@@ -5069,24 +5069,49 @@ hash_string (char const *ptr, ptrdiff_t len)
5069 EMACS_UINT hash = len; 5069 EMACS_UINT hash = len;
5070 /* At most 8 steps. We could reuse SXHASH_MAX_LEN, of course, 5070 /* At most 8 steps. We could reuse SXHASH_MAX_LEN, of course,
5071 * but dividing by 8 is cheaper. */ 5071 * but dividing by 8 is cheaper. */
5072 ptrdiff_t step = sizeof hash + ((end - p) >> 3); 5072 ptrdiff_t step = max (sizeof hash, ((end - p) >> 3));
5073 5073
5074 while (p + sizeof hash <= end) 5074 if (p + sizeof hash <= end)
5075 { 5075 {
5076 do
5077 {
5078 EMACS_UINT c;
5079 /* We presume that the compiler will replace this `memcpy` with
5080 a single load/move instruction when applicable. */
5081 memcpy (&c, p, sizeof hash);
5082 p += step;
5083 hash = sxhash_combine (hash, c);
5084 }
5085 while (p + sizeof hash <= end);
5086 /* Hash the last wordful of bytes in the string, because that is
5087 is often the part where strings differ. This may cause some
5088 bytes to be hashed twice but we assume that's not a big problem. */
5076 EMACS_UINT c; 5089 EMACS_UINT c;
5077 /* We presume that the compiler will replace this `memcpy` with 5090 memcpy (&c, end - sizeof c, sizeof c);
5078 a single load/move instruction when applicable. */
5079 memcpy (&c, p, sizeof hash);
5080 p += step;
5081 hash = sxhash_combine (hash, c); 5091 hash = sxhash_combine (hash, c);
5082 } 5092 }
5083 /* A few last bytes may remain (smaller than an EMACS_UINT). */ 5093 else
5084 /* FIXME: We could do this without a loop, but it'd require
5085 endian-dependent code :-( */
5086 while (p < end)
5087 { 5094 {
5088 unsigned char c = *p++; 5095 /* String is shorter than an EMACS_UINT. Use smaller loads. */
5089 hash = sxhash_combine (hash, c); 5096 eassume (p <= end && end - p < sizeof (EMACS_UINT));
5097 EMACS_UINT tail = 0;
5098 if (end - p >= 4)
5099 {
5100 uint32_t c;
5101 memcpy (&c, p, sizeof c);
5102 tail = (tail << (8 * sizeof c)) + c;
5103 p += sizeof c;
5104 }
5105 if (end - p >= 2)
5106 {
5107 uint16_t c;
5108 memcpy (&c, p, sizeof c);
5109 tail = (tail << (8 * sizeof c)) + c;
5110 p += sizeof c;
5111 }
5112 if (p < end)
5113 tail = (tail << 8) + (unsigned char)*p;
5114 hash = sxhash_combine (hash, tail);
5090 } 5115 }
5091 5116
5092 return hash; 5117 return hash;