String hashing improvements (spread and performance)

Fix gaps in hashing coverage in the middle and end of even fairly short strings. E.g., `outline-1`, `outline-2` etc all hashed to the exact same value but with the patch, there are no collisions among the ~160000 symbols in the Emacs tree. This change also improves average hashing speed by using fewer mixing operations. * src/fns.c (hash_string): Use unit stride for fairly short strings, while retaining the cap of 8 samples for long ones. Always hash the last word to ensure that the end of the string is covered. For strings shorter than a word, use fewer loads and a single reduction step.
author: Mattias Engdegård 2024-02-13 14:52:39 +0100
committer: Mattias Engdegård 2024-02-14 14:29:54 +0100
commit: 3a93e301ddc913758abe05c876aa3016e8b23af8 (patch)
tree: 60e17da4c915a15af930efe1c7fc0bc8532d58fb /src
parent: decfdd4f1a1e3b1539eafdaaf11191e8477f0636 (diff)
download: emacs-3a93e301ddc913758abe05c876aa3016e8b23af8.tar.gz
emacs-3a93e301ddc913758abe05c876aa3016e8b23af8.zip
1 files changed, 37 insertions, 12 deletions
diff --git a/src/fns.c b/src/fns.c
index 918ba0370e8..f94e8519957 100644
--- a/src/fns.c
+++ b/src/fns.c
@@ -5069,24 +5069,49 @@ hash_string (char const *ptr, ptrdiff_t len)
  EMACS_UINT hash = len;
  /* At most 8 steps.  We could reuse SXHASH_MAX_LEN, of course,
   * but dividing by 8 is cheaper.  */
-  ptrdiff_t step = sizeof hash + ((end - p) >> 3);
+  ptrdiff_t step = max (sizeof hash, ((end - p) >> 3));
-  while (p + sizeof hash <= end)
+  if (p + sizeof hash <= end)
    {
+      do
+        {
+          EMACS_UINT c;
+          /* We presume that the compiler will replace this `memcpy` with
+             a single load/move instruction when applicable.  */
+          memcpy (&c, p, sizeof hash);
+          p += step;
+          hash = sxhash_combine (hash, c);
+        }
+      while (p + sizeof hash <= end);
+      /* Hash the last wordful of bytes in the string, because that is
+         is often the part where strings differ.  This may cause some
+         bytes to be hashed twice but we assume that's not a big problem.  */
      EMACS_UINT c;
-      /* We presume that the compiler will replace this `memcpy` with
+      memcpy (&c, end - sizeof c, sizeof c);
-         a single load/move instruction when applicable.  */
-      memcpy (&c, p, sizeof hash);
-      p += step;
      hash = sxhash_combine (hash, c);
    }
-  /* A few last bytes may remain (smaller than an EMACS_UINT).  */
+  else
-  /* FIXME: We could do this without a loop, but it'd require
-     endian-dependent code :-(  */
-  while (p < end)
    {
-      unsigned char c = *p++;
+      /* String is shorter than an EMACS_UINT.  Use smaller loads.  */
-      hash = sxhash_combine (hash, c);
+      eassume (p <= end && end - p < sizeof (EMACS_UINT));
+      EMACS_UINT tail = 0;
+      if (end - p >= 4)
+        {
+          uint32_t c;
+          memcpy (&c, p, sizeof c);
+          tail = (tail << (8 * sizeof c)) + c;
+          p += sizeof c;
+        }
+      if (end - p >= 2)
+        {
+          uint16_t c;
+          memcpy (&c, p, sizeof c);
+          tail = (tail << (8 * sizeof c)) + c;
+          p += sizeof c;
+        }
+      if (p < end)
+        tail = (tail << 8) + (unsigned char)*p;
+      hash = sxhash_combine (hash, tail);
    }
  return hash;
author	Mattias Engdegård	2024-02-13 14:52:39 +0100
committer	Mattias Engdegård	2024-02-14 14:29:54 +0100
commit	3a93e301ddc913758abe05c876aa3016e8b23af8 (patch)
tree	60e17da4c915a15af930efe1c7fc0bc8532d58fb /src
parent	decfdd4f1a1e3b1539eafdaaf11191e8477f0636 (diff)
download	emacs-3a93e301ddc913758abe05c876aa3016e8b23af8.tar.gz emacs-3a93e301ddc913758abe05c876aa3016e8b23af8.zip

diff --git a/src/fns.c b/src/fns.c index 918ba0370e8..f94e8519957 100644 --- a/src/fns.c +++ b/src/fns.c
@@ -5069,24 +5069,49 @@ hash_string (char const *ptr, ptrdiff_t len)
5069	EMACS_UINT hash = len;	5069	EMACS_UINT hash = len;
5070	/* At most 8 steps. We could reuse SXHASH_MAX_LEN, of course,	5070	/* At most 8 steps. We could reuse SXHASH_MAX_LEN, of course,
5071	* but dividing by 8 is cheaper. */	5071	* but dividing by 8 is cheaper. */
5072	ptrdiff_t step = sizeof hash + ((end - p) >> 3);	5072	ptrdiff_t step = max (sizeof hash, ((end - p) >> 3));
5073		5073
5074	while (p + sizeof hash <= end)	5074	if (p + sizeof hash <= end)
5075	{	5075	{
		5076	do
		5077	{
		5078	EMACS_UINT c;
		5079	/* We presume that the compiler will replace this `memcpy` with
		5080	a single load/move instruction when applicable. */
		5081	memcpy (&c, p, sizeof hash);
		5082	p += step;
		5083	hash = sxhash_combine (hash, c);
		5084	}
		5085	while (p + sizeof hash <= end);
		5086	/* Hash the last wordful of bytes in the string, because that is
		5087	is often the part where strings differ. This may cause some
		5088	bytes to be hashed twice but we assume that's not a big problem. */
5076	EMACS_UINT c;	5089	EMACS_UINT c;
5077	/* We presume that the compiler will replace this `memcpy` with	5090	memcpy (&c, end - sizeof c, sizeof c);
5078	a single load/move instruction when applicable. */
5079	memcpy (&c, p, sizeof hash);
5080	p += step;
5081	hash = sxhash_combine (hash, c);	5091	hash = sxhash_combine (hash, c);
5082	}	5092	}
5083	/* A few last bytes may remain (smaller than an EMACS_UINT). */	5093	else
5084	/* FIXME: We could do this without a loop, but it'd require
5085	endian-dependent code :-( */
5086	while (p < end)
5087	{	5094	{
5088	unsigned char c = *p++;	5095	/* String is shorter than an EMACS_UINT. Use smaller loads. */
5089	hash = sxhash_combine (hash, c);	5096	eassume (p <= end && end - p < sizeof (EMACS_UINT));
		5097	EMACS_UINT tail = 0;
		5098	if (end - p >= 4)
		5099	{
		5100	uint32_t c;
		5101	memcpy (&c, p, sizeof c);
		5102	tail = (tail << (8 * sizeof c)) + c;
		5103	p += sizeof c;
		5104	}
		5105	if (end - p >= 2)
		5106	{
		5107	uint16_t c;
		5108	memcpy (&c, p, sizeof c);
		5109	tail = (tail << (8 * sizeof c)) + c;
		5110	p += sizeof c;
		5111	}
		5112	if (p < end)
		5113	tail = (tail << 8) + (unsigned char)*p;
		5114	hash = sxhash_combine (hash, tail);
5090	}	5115	}
5091		5116
5092	return hash;	5117	return hash;