Improve string_char_and_length speed

This tweak improved the CPU time performance of ‘make compile-always’ by about 1.7% on my platform.
* src/character.c (string_char): Remove; no longer used.
* src/character.h (string_char_and_length): Redo so that it needn’t call string_char. This helps the caller, which can now become a leaf function.
author: Paul Eggert 2020-04-26 15:18:49 -0700
committer: Paul Eggert 2020-04-26 19:31:54 -0700
commit: ed2def7d5e423388ca75c6e10fd7b42e0c4789c7 (patch)
tree: a488de7c0a4729937cfa8fca01093433a609374f /src
parent: 895a18eafb84bca68045e552437dbb00a15a9f56 (diff)
download: emacs-ed2def7d5e423388ca75c6e10fd7b42e0c4789c7.tar.gz
emacs-ed2def7d5e423388ca75c6e10fd7b42e0c4789c7.zip
2 files changed, 27 insertions, 65 deletions
diff --git a/src/character.c b/src/character.c
index edcec5f1c79..4902e564b1d 100644
--- a/src/character.c
+++ b/src/character.c
@@ -141,51 +141,6 @@ char_string (unsigned int c, unsigned char *p)
 }
-/* Return a character whose multibyte form is at P.  Set *LEN to the
-   byte length of the multibyte form.  */
-int
-string_char (const unsigned char *p, int *len)
-{
-  int c;
-  const unsigned char *saved_p = p;
-  if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
-    {
-      /* 1-, 2-, and 3-byte sequences can be handled by the macro.  */
-      c = string_char_advance (&p);
-    }
-  else if (! (*p & 0x08))
-    {
-      /* A 4-byte sequence of this form:
-         11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  */
-      c = ((((p)[0] & 0x7) << 18)
-           | (((p)[1] & 0x3F) << 12)
-           | (((p)[2] & 0x3F) << 6)
-           | ((p)[3] & 0x3F));
-      p += 4;
-    }
-  else
-    {
-      /* A 5-byte sequence of this form:
-         111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
-         Note that the top 4 `x's are always 0, so shifting p[1] can
-         never exceed the maximum valid character codepoint. */
-      c = (/* (((p)[0] & 0x3) << 24) ... always 0, so no need to shift. */
-           (((p)[1] & 0x3F) << 18)
-           | (((p)[2] & 0x3F) << 12)
-           | (((p)[3] & 0x3F) << 6)
-           | ((p)[4] & 0x3F));
-      p += 5;
-    }
-  *len = p - saved_p;
-  return c;
-}
 /* Translate character C by translation table TABLE.  If no translation is
   found in TABLE, return the untranslated character.  If TABLE is a list,
   elements are char tables.  In that case, recursively translate C by all the
diff --git a/src/character.h b/src/character.h
index 4887473b27e..d4d77504426 100644
--- a/src/character.h
+++ b/src/character.h
@@ -85,7 +85,6 @@ enum
 };
 extern int char_string (unsigned, unsigned char *);
-extern int string_char (const unsigned char *, int *);
 /* UTF-8 encodings.  Use \x escapes, so they are portable to pre-C11
   compilers and can be concatenated with ordinary string literals.  */
@@ -371,33 +370,41 @@ raw_prev_char_len (unsigned char const *p)
 INLINE int
 string_char_and_length (unsigned char const *p, int *length)
 {
-  int c, len;
+  int c = p[0];
+  if (! (c & 0x80))
+    {
+      *length = 1;
+      return c;
+    }
+  eassume (0xC0 <= c);
-  if (! (p[0] & 0x80))
+  int d = (c << 6) + p[1] - ((0xC0 << 6) + 0x80);
+  if (! (c & 0x20))
    {
-      len = 1;
+      *length = 2;
-      c = p[0];
+      return d + (c < 0xC2 ? 0x3FFF80 : 0);
    }
-  else if (! (p[0] & 0x20))
+  d = (d << 6) + p[2] - ((0x20 << 12) + 0x80);
+  if (! (c & 0x10))
    {
-      len = 2;
+      *length = 3;
-      c = ((((p[0] & 0x1F) << 6)
+      eassume (MAX_2_BYTE_CHAR < d && d <= MAX_3_BYTE_CHAR);
-            | (p[1] & 0x3F))
+      return d;
-           + (p[0] < 0xC2 ? 0x3FFF80 : 0));
    }
-  else if (! (p[0] & 0x10))
+  d = (d << 6) + p[3] - ((0x10 << 18) + 0x80);
+  if (! (c & 0x08))
    {
-      len = 3;
+      *length = 4;
-      c = (((p[0] & 0x0F) << 12)
+      eassume (MAX_3_BYTE_CHAR < d && d <= MAX_4_BYTE_CHAR);
-           | ((p[1] & 0x3F) << 6)
+      return d;
-           | (p[2] & 0x3F));
    }
-  else
-    c = string_char (p, &len);
-  eassume (0 < len && len <= MAX_MULTIBYTE_LENGTH);
+  d = (d << 6) + p[4] - ((0x08 << 24) + 0x80);
-  *length = len;
+  *length = 5;
-  return c;
+  eassume (MAX_4_BYTE_CHAR < d && d <= MAX_5_BYTE_CHAR);
+  return d;
 }
 /* Return the character code of character whose multibyte form is at P.  */
author	Paul Eggert	2020-04-26 15:18:49 -0700
committer	Paul Eggert	2020-04-26 19:31:54 -0700
commit	ed2def7d5e423388ca75c6e10fd7b42e0c4789c7 (patch)
tree	a488de7c0a4729937cfa8fca01093433a609374f /src
parent	895a18eafb84bca68045e552437dbb00a15a9f56 (diff)
download	emacs-ed2def7d5e423388ca75c6e10fd7b42e0c4789c7.tar.gz emacs-ed2def7d5e423388ca75c6e10fd7b42e0c4789c7.zip

diff --git a/src/character.c b/src/character.c index edcec5f1c79..4902e564b1d 100644 --- a/src/character.c +++ b/src/character.c
@@ -141,51 +141,6 @@ char_string (unsigned int c, unsigned char *p)
141	}	141	}
142		142
143		143
144	/* Return a character whose multibyte form is at P. Set *LEN to the
145	byte length of the multibyte form. */
146
147	int
148	string_char (const unsigned char *p, int *len)
149	{
150	int c;
151	const unsigned char *saved_p = p;
152
153	if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
154	{
155	/* 1-, 2-, and 3-byte sequences can be handled by the macro. */
156	c = string_char_advance (&p);
157	}
158	else if (! (*p & 0x08))
159	{
160	/* A 4-byte sequence of this form:
161	11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
162	c = ((((p)[0] & 0x7) << 18)
163	| (((p)[1] & 0x3F) << 12)
164	| (((p)[2] & 0x3F) << 6)
165	| ((p)[3] & 0x3F));
166	p += 4;
167	}
168	else
169	{
170	/* A 5-byte sequence of this form:
171
172	111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
173
174	Note that the top 4 `x's are always 0, so shifting p[1] can
175	never exceed the maximum valid character codepoint. */
176	c = (/* (((p)[0] & 0x3) << 24) ... always 0, so no need to shift. */
177	(((p)[1] & 0x3F) << 18)
178	| (((p)[2] & 0x3F) << 12)
179	| (((p)[3] & 0x3F) << 6)
180	| ((p)[4] & 0x3F));
181	p += 5;
182	}
183
184	*len = p - saved_p;
185	return c;
186	}
187
188
189	/* Translate character C by translation table TABLE. If no translation is	144	/* Translate character C by translation table TABLE. If no translation is
190	found in TABLE, return the untranslated character. If TABLE is a list,	145	found in TABLE, return the untranslated character. If TABLE is a list,
191	elements are char tables. In that case, recursively translate C by all the	146	elements are char tables. In that case, recursively translate C by all the


diff --git a/src/character.h b/src/character.h index 4887473b27e..d4d77504426 100644 --- a/src/character.h +++ b/src/character.h
@@ -85,7 +85,6 @@ enum
85	};	85	};
86		86
87	extern int char_string (unsigned, unsigned char *);	87	extern int char_string (unsigned, unsigned char *);
88	extern int string_char (const unsigned char *, int *);
89		88
90	/* UTF-8 encodings. Use \x escapes, so they are portable to pre-C11	89	/* UTF-8 encodings. Use \x escapes, so they are portable to pre-C11
91	compilers and can be concatenated with ordinary string literals. */	90	compilers and can be concatenated with ordinary string literals. */
@@ -371,33 +370,41 @@ raw_prev_char_len (unsigned char const *p)
371	INLINE int	370	INLINE int
372	string_char_and_length (unsigned char const *p, int *length)	371	string_char_and_length (unsigned char const *p, int *length)
373	{	372	{
374	int c, len;	373	int c = p[0];
		374	if (! (c & 0x80))
		375	{
		376	*length = 1;
		377	return c;
		378	}
		379	eassume (0xC0 <= c);
375		380
376	if (! (p[0] & 0x80))	381	int d = (c << 6) + p[1] - ((0xC0 << 6) + 0x80);
		382	if (! (c & 0x20))
377	{	383	{
378	len = 1;	384	*length = 2;
379	c = p[0];	385	return d + (c < 0xC2 ? 0x3FFF80 : 0);
380	}	386	}
381	else if (! (p[0] & 0x20))	387
		388	d = (d << 6) + p[2] - ((0x20 << 12) + 0x80);
		389	if (! (c & 0x10))
382	{	390	{
383	len = 2;	391	*length = 3;
384	c = ((((p[0] & 0x1F) << 6)	392	eassume (MAX_2_BYTE_CHAR < d && d <= MAX_3_BYTE_CHAR);
385	| (p[1] & 0x3F))	393	return d;
386	+ (p[0] < 0xC2 ? 0x3FFF80 : 0));
387	}	394	}
388	else if (! (p[0] & 0x10))	395
		396	d = (d << 6) + p[3] - ((0x10 << 18) + 0x80);
		397	if (! (c & 0x08))
389	{	398	{
390	len = 3;	399	*length = 4;
391	c = (((p[0] & 0x0F) << 12)	400	eassume (MAX_3_BYTE_CHAR < d && d <= MAX_4_BYTE_CHAR);
392	| ((p[1] & 0x3F) << 6)	401	return d;
393	| (p[2] & 0x3F));
394	}	402	}
395	else
396	c = string_char (p, &len);
397		403
398	eassume (0 < len && len <= MAX_MULTIBYTE_LENGTH);	404	d = (d << 6) + p[4] - ((0x08 << 24) + 0x80);
399	*length = len;	405	*length = 5;
400	return c;	406	eassume (MAX_4_BYTE_CHAR < d && d <= MAX_5_BYTE_CHAR);
		407	return d;
401	}	408	}
402		409
403	/* Return the character code of character whose multibyte form is at P. */	410	/* Return the character code of character whose multibyte form is at P. */