Use 'ucs-names' for character name escapes

* lread.c (invalid_character_name, check_scalar_value) (parse_code_after_prefix, character_name_to_code): New helper functions that use 'ucs-names' and parsing for CJK ideographs. (read_escape): Use helper functions. (syms_of_lread): New symbol 'ucs-names'. * test/src/lread-tests.el: New tests; fix a couple of bugs in existing tests.
author: Philipp Stephani 2016-04-21 14:51:30 -0700
committer: Paul Eggert 2016-04-21 19:29:40 -0700
commit: e7cb38edc946ff60c1c878b30b068376d6ef56d2 (patch)
tree: 8fc810e91650a328dd6f6d95dd2a5f1f52b86c4a /src
parent: 753c875714f708c0257a2d352635c5616be66fdc (diff)
download: emacs-e7cb38edc946ff60c1c878b30b068376d6ef56d2.tar.gz
emacs-e7cb38edc946ff60c1c878b30b068376d6ef56d2.zip
1 files changed, 88 insertions, 49 deletions
diff --git a/src/lread.c b/src/lread.c
index dbe51bb06c8..c3b6bd79e42 100644
--- a/src/lread.c
+++ b/src/lread.c
@@ -44,6 +44,7 @@ along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
 #include "termhooks.h"
 #include "blockinput.h"
 #include <c-ctype.h>
+#include <string.h>
 #ifdef MSDOS
 #include "msdos.h"
@@ -2150,36 +2151,90 @@ grow_read_buffer (void)
                         MAX_MULTIBYTE_LENGTH, -1, 1);
 }
-/* Hash table that maps Unicode character names to code points.  */
+/* Signal an invalid-read-syntax error indicating that the character
-static Lisp_Object character_names;
+   name in an \N{…} literal is invalid.  */
+static _Noreturn void
+invalid_character_name (Lisp_Object name)
+{
+  AUTO_STRING (format, "\\N{%s}");
+  xsignal1 (Qinvalid_read_syntax, CALLN (Fformat, format, name));
+}
-/* Length of the longest Unicode character name, in bytes. */
+/* Check that CODE is a valid Unicode scalar value, and return its
-static ptrdiff_t max_character_name_length;
+   value.  CODE should be parsed from the character name given by
+   NAME.  NAME is used for error messages.  */
+static int
+check_scalar_value (Lisp_Object code, Lisp_Object name)
+{
+  if (! NUMBERP (code))
+    invalid_character_name (name);
+  EMACS_INT i = XINT (code);
+  if (! (0 <= i && i <= MAX_UNICODE_CHAR)
+      /* Don't allow surrogates.  */
+      || (0xD800 <= code && code <= 0xDFFF))
+    invalid_character_name (name);
+  return i;
+}
-/* Initializes `character_names' and `max_character_name_length'.
+/* If NAME starts with PREFIX, interpret the rest as a hexadecimal
-   Called by `read_escape'.  */
+   number and return its value.  Raise invalid-read-syntax if the
-void init_character_names (void)
+   number is not a valid scalar value.  Return −1 if NAME doesn’t
+   start with PREFIX.  */
+static int
+parse_code_after_prefix (Lisp_Object name, const char *prefix)
 {
-  character_names = CALLN (Fmake_hash_table,
+  ptrdiff_t name_len = SBYTES (name);
-                           QCtest, Qequal,
+  ptrdiff_t prefix_len = strlen (prefix);
-                           /* Currently around 100,000 Unicode
+  /* Allow between one and eight hexadecimal digits after the
-                              characters are defined.  */
+     prefix.  */
-                           QCsize, make_natnum (100000));
+  if (prefix_len < name_len && name_len <= prefix_len + 8
-  Lisp_Object get_property =
+      && memcmp (SDATA (name), prefix, prefix_len) == 0)
-    Fsymbol_function (intern_c_string ("get-char-code-property"));
-  ptrdiff_t length = 0;
-  for (int i = 0; i <= MAX_UNICODE_CHAR; ++i)
    {
-      Lisp_Object code = make_natnum (i);
+      Lisp_Object code = string_to_number (SDATA (name) + prefix_len, 16, false);
-      Lisp_Object name = call2 (get_property, code, Qname);
+      if (NUMBERP (code))
-      if (NILP (name)) continue;
+        return check_scalar_value (code, name);
-      CHECK_STRING (name);
+    }
-      length = max (length, SBYTES (name));
+  return -1;
-      Fputhash (name, code, character_names);
+}
+/* Returns the scalar value that has the Unicode character name NAME.
+   Raises `invalid-read-syntax' if there is no such character.  */
+static int
+character_name_to_code (Lisp_Object name)
+{
+  /* Code point as U+N, where N is between 1 and 8 hexadecimal
+     digits.  */
+  int code = parse_code_after_prefix (name, "U+");
+  if (code >= 0)
+    return code;
+  /* CJK ideographs are not contained in the association list returned
+     by `ucs-names'.  But they follow a predictable naming pattern: a
+     fixed prefix plus the hexadecimal codepoint value.  */
+  code = parse_code_after_prefix (name, "CJK IDEOGRAPH-");
+  if (code >= 0)
+    {
+      /* Various ranges of CJK characters; see UnicodeData.txt.  */
+      if ((0x3400 <= code && code <= 0x4DB5)
+          || (0x4E00 <= code && code <= 0x9FD5)
+          || (0x20000 <= code && code <= 0x2A6D6)
+          || (0x2A700 <= code && code <= 0x2B734)
+          || (0x2B740 <= code && code <= 0x2B81D)
+          || (0x2B820 <= code && code <= 0x2CEA1))
+        return code;
+      else
+        invalid_character_name (name);
    }
-  max_character_name_length = length;
+  /* Look up the name in the table returned by `ucs-names'.  */
+  Lisp_Object names = call0 (Qucs_names);
+  return check_scalar_value (CDR (Fassoc (name, names)), name);
 }
+/* Bound on the length of a Unicode character name.  As of
+   Unicode 9.0.0 the maximum is 83, so this should be safe. */
+enum { UNICODE_CHARACTER_NAME_LENGTH_BOUND = 200 };
 /* Read a \-escape sequence, assuming we already read the `\'.
   If the escape sequence forces unibyte, return eight-bit char.  */
@@ -2393,10 +2448,7 @@ read_escape (Lisp_Object readcharfun, bool stringp)
        c = READCHAR;
        if (c != '{')
          invalid_syntax ("Expected opening brace after \\N");
-        if (NILP (character_names))
+        char name[UNICODE_CHARACTER_NAME_LENGTH_BOUND + 1];
-          init_character_names ();
-        USE_SAFE_ALLOCA;
-        char *name = SAFE_ALLOCA (max_character_name_length + 1);
        bool whitespace = false;
        ptrdiff_t length = 0;
        while (true)
@@ -2407,11 +2459,12 @@ read_escape (Lisp_Object readcharfun, bool stringp)
            if (c == '}')
              break;
            if (! c_isascii (c))
-              xsignal1 (Qinvalid_read_syntax,
+              {
-                        CALLN (Fformat,
+                AUTO_STRING (format,
-                               build_pure_c_string ("Non-ASCII character U+%04X"
+                             "Non-ASCII character U+%04X in character name");
-                                                    " in character name"),
+                xsignal1 (Qinvalid_read_syntax,
-                               make_natnum (c)));
+                          CALLN (Fformat, format, make_natnum (c)));
+              }
            /* We treat multiple adjacent whitespace characters as a
               single space character.  This makes it easier to use
               character names in e.g. multi-line strings.  */
@@ -2425,25 +2478,12 @@ read_escape (Lisp_Object readcharfun, bool stringp)
            else
              whitespace = false;
            name[length++] = c;
-            if (length >= max_character_name_length)
+            if (length >= sizeof name)
              invalid_syntax ("Character name too long");
          }
        if (length == 0)
          invalid_syntax ("Empty character name");
-        name[length] = 0;
+        return character_name_to_code (make_unibyte_string (name, length));
-        Lisp_Object lisp_name = make_unibyte_string (name, length);
-        Lisp_Object code =
-          (length >= 3 && length <= 10 && name[0] == 'U' && name[1] == '+') ?
-          /* Code point as U+N, where N is between 1 and 8 hexadecimal
-             digits.  */
-          string_to_number (name + 2, 16, false) :
-          Fgethash (lisp_name, character_names, Qnil);
-        SAFE_FREE ();
-        if (! RANGED_INTEGERP (0, code, MAX_UNICODE_CHAR))
-          xsignal1 (Qinvalid_read_syntax,
-                    CALLN (Fformat,
-                           build_pure_c_string ("\\N{%s}"), lisp_name));
-        return XINT (code);
      }
    default:
@@ -4835,6 +4875,5 @@ that are loaded before your customizations are read!  */);
  DEFSYM (Qrehash_size, "rehash-size");
  DEFSYM (Qrehash_threshold, "rehash-threshold");
-  character_names = Qnil;
+  DEFSYM (Qucs_names, "ucs-names");
-  staticpro (&character_names);
 }
author	Philipp Stephani	2016-04-21 14:51:30 -0700
committer	Paul Eggert	2016-04-21 19:29:40 -0700
commit	e7cb38edc946ff60c1c878b30b068376d6ef56d2 (patch)
tree	8fc810e91650a328dd6f6d95dd2a5f1f52b86c4a /src
parent	753c875714f708c0257a2d352635c5616be66fdc (diff)
download	emacs-e7cb38edc946ff60c1c878b30b068376d6ef56d2.tar.gz emacs-e7cb38edc946ff60c1c878b30b068376d6ef56d2.zip

diff --git a/src/lread.c b/src/lread.c index dbe51bb06c8..c3b6bd79e42 100644 --- a/src/lread.c +++ b/src/lread.c
@@ -44,6 +44,7 @@ along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
44	#include "termhooks.h"	44	#include "termhooks.h"
45	#include "blockinput.h"	45	#include "blockinput.h"
46	#include <c-ctype.h>	46	#include <c-ctype.h>
		47	#include <string.h>
47		48
48	#ifdef MSDOS	49	#ifdef MSDOS
49	#include "msdos.h"	50	#include "msdos.h"
@@ -2150,36 +2151,90 @@ grow_read_buffer (void)
2150	MAX_MULTIBYTE_LENGTH, -1, 1);	2151	MAX_MULTIBYTE_LENGTH, -1, 1);
2151	}	2152	}
2152		2153
2153	/* Hash table that maps Unicode character names to code points. */	2154	/* Signal an invalid-read-syntax error indicating that the character
2154	static Lisp_Object character_names;	2155	name in an \N{…} literal is invalid. */
		2156	static _Noreturn void
		2157	invalid_character_name (Lisp_Object name)
		2158	{
		2159	AUTO_STRING (format, "\\N{%s}");
		2160	xsignal1 (Qinvalid_read_syntax, CALLN (Fformat, format, name));
		2161	}
2155		2162
2156	/* Length of the longest Unicode character name, in bytes. */	2163	/* Check that CODE is a valid Unicode scalar value, and return its
2157	static ptrdiff_t max_character_name_length;	2164	value. CODE should be parsed from the character name given by
		2165	NAME. NAME is used for error messages. */
		2166	static int
		2167	check_scalar_value (Lisp_Object code, Lisp_Object name)
		2168	{
		2169	if (! NUMBERP (code))
		2170	invalid_character_name (name);
		2171	EMACS_INT i = XINT (code);
		2172	if (! (0 <= i && i <= MAX_UNICODE_CHAR)
		2173	/* Don't allow surrogates. */
		2174	\|\| (0xD800 <= code && code <= 0xDFFF))
		2175	invalid_character_name (name);
		2176	return i;
		2177	}
2158		2178
2159	/* Initializes `character_names' and `max_character_name_length'.	2179	/* If NAME starts with PREFIX, interpret the rest as a hexadecimal
2160	Called by `read_escape'. */	2180	number and return its value. Raise invalid-read-syntax if the
2161	void init_character_names (void)	2181	number is not a valid scalar value. Return −1 if NAME doesn’t
		2182	start with PREFIX. */
		2183	static int
		2184	parse_code_after_prefix (Lisp_Object name, const char *prefix)
2162	{	2185	{
2163	character_names = CALLN (Fmake_hash_table,	2186	ptrdiff_t name_len = SBYTES (name);
2164	QCtest, Qequal,	2187	ptrdiff_t prefix_len = strlen (prefix);
2165	/* Currently around 100,000 Unicode	2188	/* Allow between one and eight hexadecimal digits after the
2166	characters are defined. */	2189	prefix. */
2167	QCsize, make_natnum (100000));	2190	if (prefix_len < name_len && name_len <= prefix_len + 8
2168	Lisp_Object get_property =	2191	&& memcmp (SDATA (name), prefix, prefix_len) == 0)
2169	Fsymbol_function (intern_c_string ("get-char-code-property"));
2170	ptrdiff_t length = 0;
2171	for (int i = 0; i <= MAX_UNICODE_CHAR; ++i)
2172	{	2192	{
2173	Lisp_Object code = make_natnum (i);	2193	Lisp_Object code = string_to_number (SDATA (name) + prefix_len, 16, false);
2174	Lisp_Object name = call2 (get_property, code, Qname);	2194	if (NUMBERP (code))
2175	if (NILP (name)) continue;	2195	return check_scalar_value (code, name);
2176	CHECK_STRING (name);	2196	}
2177	length = max (length, SBYTES (name));	2197	return -1;
2178	Fputhash (name, code, character_names);	2198	}
		2199
		2200	/* Returns the scalar value that has the Unicode character name NAME.
		2201	Raises `invalid-read-syntax' if there is no such character. */
		2202	static int
		2203	character_name_to_code (Lisp_Object name)
		2204	{
		2205	/* Code point as U+N, where N is between 1 and 8 hexadecimal
		2206	digits. */
		2207	int code = parse_code_after_prefix (name, "U+");
		2208	if (code >= 0)
		2209	return code;
		2210
		2211	/* CJK ideographs are not contained in the association list returned
		2212	by `ucs-names'. But they follow a predictable naming pattern: a
		2213	fixed prefix plus the hexadecimal codepoint value. */
		2214	code = parse_code_after_prefix (name, "CJK IDEOGRAPH-");
		2215	if (code >= 0)
		2216	{
		2217	/* Various ranges of CJK characters; see UnicodeData.txt. */
		2218	if ((0x3400 <= code && code <= 0x4DB5)
		2219	\|\| (0x4E00 <= code && code <= 0x9FD5)
		2220	\|\| (0x20000 <= code && code <= 0x2A6D6)
		2221	\|\| (0x2A700 <= code && code <= 0x2B734)
		2222	\|\| (0x2B740 <= code && code <= 0x2B81D)
		2223	\|\| (0x2B820 <= code && code <= 0x2CEA1))
		2224	return code;
		2225	else
		2226	invalid_character_name (name);
2179	}	2227	}
2180	max_character_name_length = length;	2228
		2229	/* Look up the name in the table returned by `ucs-names'. */
		2230	Lisp_Object names = call0 (Qucs_names);
		2231	return check_scalar_value (CDR (Fassoc (name, names)), name);
2181	}	2232	}
2182		2233
		2234	/* Bound on the length of a Unicode character name. As of
		2235	Unicode 9.0.0 the maximum is 83, so this should be safe. */
		2236	enum { UNICODE_CHARACTER_NAME_LENGTH_BOUND = 200 };
		2237
2183	/* Read a \-escape sequence, assuming we already read the `\'.	2238	/* Read a \-escape sequence, assuming we already read the `\'.
2184	If the escape sequence forces unibyte, return eight-bit char. */	2239	If the escape sequence forces unibyte, return eight-bit char. */
2185		2240
@@ -2393,10 +2448,7 @@ read_escape (Lisp_Object readcharfun, bool stringp)
2393	c = READCHAR;	2448	c = READCHAR;
2394	if (c != '{')	2449	if (c != '{')
2395	invalid_syntax ("Expected opening brace after \\N");	2450	invalid_syntax ("Expected opening brace after \\N");
2396	if (NILP (character_names))	2451	char name[UNICODE_CHARACTER_NAME_LENGTH_BOUND + 1];
2397	init_character_names ();
2398	USE_SAFE_ALLOCA;
2399	char *name = SAFE_ALLOCA (max_character_name_length + 1);
2400	bool whitespace = false;	2452	bool whitespace = false;
2401	ptrdiff_t length = 0;	2453	ptrdiff_t length = 0;
2402	while (true)	2454	while (true)
@@ -2407,11 +2459,12 @@ read_escape (Lisp_Object readcharfun, bool stringp)
2407	if (c == '}')	2459	if (c == '}')
2408	break;	2460	break;
2409	if (! c_isascii (c))	2461	if (! c_isascii (c))
2410	xsignal1 (Qinvalid_read_syntax,	2462	{
2411	CALLN (Fformat,	2463	AUTO_STRING (format,
2412	build_pure_c_string ("Non-ASCII character U+%04X"	2464	"Non-ASCII character U+%04X in character name");
2413	" in character name"),	2465	xsignal1 (Qinvalid_read_syntax,
2414	make_natnum (c)));	2466	CALLN (Fformat, format, make_natnum (c)));
		2467	}
2415	/* We treat multiple adjacent whitespace characters as a	2468	/* We treat multiple adjacent whitespace characters as a
2416	single space character. This makes it easier to use	2469	single space character. This makes it easier to use
2417	character names in e.g. multi-line strings. */	2470	character names in e.g. multi-line strings. */
@@ -2425,25 +2478,12 @@ read_escape (Lisp_Object readcharfun, bool stringp)
2425	else	2478	else
2426	whitespace = false;	2479	whitespace = false;
2427	name[length++] = c;	2480	name[length++] = c;
2428	if (length >= max_character_name_length)	2481	if (length >= sizeof name)
2429	invalid_syntax ("Character name too long");	2482	invalid_syntax ("Character name too long");
2430	}	2483	}
2431	if (length == 0)	2484	if (length == 0)
2432	invalid_syntax ("Empty character name");	2485	invalid_syntax ("Empty character name");
2433	name[length] = 0;	2486	return character_name_to_code (make_unibyte_string (name, length));
2434	Lisp_Object lisp_name = make_unibyte_string (name, length);
2435	Lisp_Object code =
2436	(length >= 3 && length <= 10 && name[0] == 'U' && name[1] == '+') ?
2437	/* Code point as U+N, where N is between 1 and 8 hexadecimal
2438	digits. */
2439	string_to_number (name + 2, 16, false) :
2440	Fgethash (lisp_name, character_names, Qnil);
2441	SAFE_FREE ();
2442	if (! RANGED_INTEGERP (0, code, MAX_UNICODE_CHAR))
2443	xsignal1 (Qinvalid_read_syntax,
2444	CALLN (Fformat,
2445	build_pure_c_string ("\\N{%s}"), lisp_name));
2446	return XINT (code);
2447	}	2487	}
2448		2488
2449	default:	2489	default:
@@ -4835,6 +4875,5 @@ that are loaded before your customizations are read! */);
4835	DEFSYM (Qrehash_size, "rehash-size");	4875	DEFSYM (Qrehash_size, "rehash-size");
4836	DEFSYM (Qrehash_threshold, "rehash-threshold");	4876	DEFSYM (Qrehash_threshold, "rehash-threshold");
4837		4877
4838	character_names = Qnil;	4878	DEFSYM (Qucs_names, "ucs-names");
4839	staticpro (&character_names);
4840	}	4879	}