Implement named character escapes, similar to Perl

* lread.c (init_character_names): New function. (read_escape): Read Perl-style named character escape sequences. (syms_of_lread): Initialize new variable 'character_names'. * test/src/lread-tests.el (lread-char-empty-name): Add test file for src/lread.c.
author: Philipp Stephani 2016-04-21 14:45:22 -0700
committer: Paul Eggert 2016-04-21 19:29:40 -0700
commit: de7d5f36e0f3261a7300fa3a3d87ae3b758b8a73 (patch)
tree: d929612c6d9ae099782dfe67b2d03041b9395a4f
parent: 7621a521452d988b27e761c76ad8e667e932192e (diff)
download: emacs-de7d5f36e0f3261a7300fa3a3d87ae3b758b8a73.tar.gz
emacs-de7d5f36e0f3261a7300fa3a3d87ae3b758b8a73.zip
2 files changed, 150 insertions, 0 deletions
diff --git a/src/lread.c b/src/lread.c
index fedfcb807c8..9fa46a875be 100644
--- a/src/lread.c
+++ b/src/lread.c
@@ -43,6 +43,7 @@ along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
 #include "systime.h"
 #include "termhooks.h"
 #include "blockinput.h"
+#include <c-ctype.h>
 #ifdef MSDOS
 #include "msdos.h"
@@ -2149,6 +2150,36 @@ grow_read_buffer (void)
                         MAX_MULTIBYTE_LENGTH, -1, 1);
 }
+/* Hash table that maps Unicode character names to code points.  */
+static Lisp_Object character_names;
+/* Length of the longest Unicode character name, in bytes. */
+static ptrdiff_t max_character_name_length;
+/* Initializes `character_names' and `max_character_name_length'.
+   Called by `read_escape'.  */
+void init_character_names ()
+{
+  character_names = CALLN (Fmake_hash_table,
+                           QCtest, Qequal,
+                           /* Currently around 100,000 Unicode
+                              characters are defined.  */
+                           QCsize, make_natnum (100000));
+  const Lisp_Object get_property =
+    Fsymbol_function (intern_c_string ("get-char-code-property"));
+  ptrdiff_t length = 0;
+  for (int i = 0; i <= MAX_UNICODE_CHAR; ++i)
+    {
+      const Lisp_Object code = make_natnum (i);
+      const Lisp_Object name = call2 (get_property, code, Qname);
+      if (NILP (name)) continue;
+      CHECK_STRING (name);
+      length = max (length, SBYTES (name));
+      Fputhash (name, code, character_names);
+    }
+  max_character_name_length = length;
+}
 /* Read a \-escape sequence, assuming we already read the `\'.
   If the escape sequence forces unibyte, return eight-bit char.  */
@@ -2356,6 +2387,68 @@ read_escape (Lisp_Object readcharfun, bool stringp)
        return i;
      }
+    case 'N':
+      /* Named character.  */
+      {
+        c = READCHAR;
+        if (c != '{')
+          invalid_syntax ("Expected opening brace after \\N");
+        if (NILP (character_names))
+          init_character_names ();
+        USE_SAFE_ALLOCA;
+        char *name = SAFE_ALLOCA (max_character_name_length + 1);
+        bool whitespace = false;
+        ptrdiff_t length = 0;
+        while (true)
+          {
+            c = READCHAR;
+            if (c < 0)
+              end_of_file_error ();
+            if (c == '}')
+              break;
+            if (! c_isascii (c))
+              xsignal1 (Qinvalid_read_syntax,
+                        CALLN (Fformat,
+                               build_pure_c_string ("Non-ASCII character U+%04X"
+                                                    " in character name"),
+                               make_natnum (c)));
+            /* We treat multiple adjacent whitespace characters as a
+               single space character.  This makes it easier to use
+               character names in e.g. multi-line strings.  */
+            if (c_isspace (c))
+              {
+                if (! whitespace)
+                  {
+                    whitespace = true;
+                    name[length++] = ' ';
+                  }
+              }
+            else
+              {
+                whitespace = false;
+                name[length++] = c;
+              }
+            if (length >= max_character_name_length)
+              invalid_syntax ("Character name too long");
+          }
+        if (length == 0)
+          invalid_syntax ("Empty character name");
+        name[length] = 0;
+        const Lisp_Object lisp_name = make_unibyte_string (name, length);
+        const Lisp_Object code =
+          (length >= 3 && length <= 10 && name[0] == 'U' && name[1] == '+') ?
+          /* Code point as U+N, where N is between 1 and 8 hexadecimal
+             digits.  */
+          string_to_number (name + 2, 16, false) :
+          Fgethash (lisp_name, character_names, Qnil);
+        SAFE_FREE ();
+        if (! RANGED_INTEGERP (0, code, MAX_UNICODE_CHAR))
+          xsignal1 (Qinvalid_read_syntax,
+                    CALLN (Fformat,
+                           build_pure_c_string ("\\N{%s}"), lisp_name));
+        return XINT (code);
+      }
    default:
      return c;
    }
@@ -4744,4 +4837,7 @@ that are loaded before your customizations are read!  */);
  DEFSYM (Qweakness, "weakness");
  DEFSYM (Qrehash_size, "rehash-size");
  DEFSYM (Qrehash_threshold, "rehash-threshold");
+  character_names = Qnil;
+  staticpro (&character_names);
 }
diff --git a/test/src/lread-tests.el b/test/src/lread-tests.el
new file mode 100644
index 00000000000..1f873340c56
--- /dev/null
+++ b/test/src/lread-tests.el
@@ -0,0 +1,54 @@
+;;; lread-tests.el --- tests for lread.c -*- lexical-binding: t; -*-
+;; Copyright (C) 2016  Google Inc.
+;; Author: Philipp Stephani <phst@google.com>
+;; This file is part of GNU Emacs.
+;; This program is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation, either version 3 of the License, or
+;; (at your option) any later version.
+;; This program is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+;; You should have received a copy of the GNU General Public License
+;; along with this program.  If not, see <http://www.gnu.org/licenses/>.
+;;; Commentary:
+;; Unit tests for code in src/lread.c.
+;;; Code:
+(ert-deftest lread-char-number ()
+  (should (equal ?\N{U+A817} #xA817)))
+(ert-deftest lread-char-name ()
+  (should (equal ?\N{SYLOTI  NAGRI LETTER
+                 DHO}
+                 #xA817)))
+(ert-deftest lread-char-invalid-number ()
+  (should-error (read "?\\N{U+110000}") :type 'invalid-read-syntax))
+(ert-deftest lread-char-invalid-name ()
+  (should-error (read "?\\N{DOES NOT EXIST}")) :type 'invalid-read-syntax)
+(ert-deftest lread-char-non-ascii-name ()
+  (should-error (read "?\\N{LATIN CAPITAL LETTER Ø}")) 'invalid-read-syntax)
+(ert-deftest lread-char-empty-name ()
+  (should-error (read "?\\N{}")) 'invalid-read-syntax)
+(ert-deftest lread-string-char-number ()
+  (should (equal "a\N{U+A817}b" "a\uA817b")))
+(ert-deftest lread-string-char-name ()
+  (should (equal "a\N{SYLOTI NAGRI  LETTER DHO}b" "a\uA817b")))
+;;; lread-tests.el ends here
author	Philipp Stephani	2016-04-21 14:45:22 -0700
committer	Paul Eggert	2016-04-21 19:29:40 -0700
commit	de7d5f36e0f3261a7300fa3a3d87ae3b758b8a73 (patch)
tree	d929612c6d9ae099782dfe67b2d03041b9395a4f
parent	7621a521452d988b27e761c76ad8e667e932192e (diff)
download	emacs-de7d5f36e0f3261a7300fa3a3d87ae3b758b8a73.tar.gz emacs-de7d5f36e0f3261a7300fa3a3d87ae3b758b8a73.zip

diff --git a/src/lread.c b/src/lread.c index fedfcb807c8..9fa46a875be 100644 --- a/src/lread.c +++ b/src/lread.c
@@ -43,6 +43,7 @@ along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
43	#include "systime.h"	43	#include "systime.h"
44	#include "termhooks.h"	44	#include "termhooks.h"
45	#include "blockinput.h"	45	#include "blockinput.h"
		46	#include <c-ctype.h>
46		47
47	#ifdef MSDOS	48	#ifdef MSDOS
48	#include "msdos.h"	49	#include "msdos.h"
@@ -2149,6 +2150,36 @@ grow_read_buffer (void)
2149	MAX_MULTIBYTE_LENGTH, -1, 1);	2150	MAX_MULTIBYTE_LENGTH, -1, 1);
2150	}	2151	}
2151		2152
		2153	/* Hash table that maps Unicode character names to code points. */
		2154	static Lisp_Object character_names;
		2155
		2156	/* Length of the longest Unicode character name, in bytes. */
		2157	static ptrdiff_t max_character_name_length;
		2158
		2159	/* Initializes `character_names' and `max_character_name_length'.
		2160	Called by `read_escape'. */
		2161	void init_character_names ()
		2162	{
		2163	character_names = CALLN (Fmake_hash_table,
		2164	QCtest, Qequal,
		2165	/* Currently around 100,000 Unicode
		2166	characters are defined. */
		2167	QCsize, make_natnum (100000));
		2168	const Lisp_Object get_property =
		2169	Fsymbol_function (intern_c_string ("get-char-code-property"));
		2170	ptrdiff_t length = 0;
		2171	for (int i = 0; i <= MAX_UNICODE_CHAR; ++i)
		2172	{
		2173	const Lisp_Object code = make_natnum (i);
		2174	const Lisp_Object name = call2 (get_property, code, Qname);
		2175	if (NILP (name)) continue;
		2176	CHECK_STRING (name);
		2177	length = max (length, SBYTES (name));
		2178	Fputhash (name, code, character_names);
		2179	}
		2180	max_character_name_length = length;
		2181	}
		2182
2152	/* Read a \-escape sequence, assuming we already read the `\'.	2183	/* Read a \-escape sequence, assuming we already read the `\'.
2153	If the escape sequence forces unibyte, return eight-bit char. */	2184	If the escape sequence forces unibyte, return eight-bit char. */
2154		2185
@@ -2356,6 +2387,68 @@ read_escape (Lisp_Object readcharfun, bool stringp)
2356	return i;	2387	return i;
2357	}	2388	}
2358		2389
		2390	case 'N':
		2391	/* Named character. */
		2392	{
		2393	c = READCHAR;
		2394	if (c != '{')
		2395	invalid_syntax ("Expected opening brace after \\N");
		2396	if (NILP (character_names))
		2397	init_character_names ();
		2398	USE_SAFE_ALLOCA;
		2399	char *name = SAFE_ALLOCA (max_character_name_length + 1);
		2400	bool whitespace = false;
		2401	ptrdiff_t length = 0;
		2402	while (true)
		2403	{
		2404	c = READCHAR;
		2405	if (c < 0)
		2406	end_of_file_error ();
		2407	if (c == '}')
		2408	break;
		2409	if (! c_isascii (c))
		2410	xsignal1 (Qinvalid_read_syntax,
		2411	CALLN (Fformat,
		2412	build_pure_c_string ("Non-ASCII character U+%04X"
		2413	" in character name"),
		2414	make_natnum (c)));
		2415	/* We treat multiple adjacent whitespace characters as a
		2416	single space character. This makes it easier to use
		2417	character names in e.g. multi-line strings. */
		2418	if (c_isspace (c))
		2419	{
		2420	if (! whitespace)
		2421	{
		2422	whitespace = true;
		2423	name[length++] = ' ';
		2424	}
		2425	}
		2426	else
		2427	{
		2428	whitespace = false;
		2429	name[length++] = c;
		2430	}
		2431	if (length >= max_character_name_length)
		2432	invalid_syntax ("Character name too long");
		2433	}
		2434	if (length == 0)
		2435	invalid_syntax ("Empty character name");
		2436	name[length] = 0;
		2437	const Lisp_Object lisp_name = make_unibyte_string (name, length);
		2438	const Lisp_Object code =
		2439	(length >= 3 && length <= 10 && name[0] == 'U' && name[1] == '+') ?
		2440	/* Code point as U+N, where N is between 1 and 8 hexadecimal
		2441	digits. */
		2442	string_to_number (name + 2, 16, false) :
		2443	Fgethash (lisp_name, character_names, Qnil);
		2444	SAFE_FREE ();
		2445	if (! RANGED_INTEGERP (0, code, MAX_UNICODE_CHAR))
		2446	xsignal1 (Qinvalid_read_syntax,
		2447	CALLN (Fformat,
		2448	build_pure_c_string ("\\N{%s}"), lisp_name));
		2449	return XINT (code);
		2450	}
		2451
2359	default:	2452	default:
2360	return c;	2453	return c;
2361	}	2454	}
@@ -4744,4 +4837,7 @@ that are loaded before your customizations are read! */);
4744	DEFSYM (Qweakness, "weakness");	4837	DEFSYM (Qweakness, "weakness");
4745	DEFSYM (Qrehash_size, "rehash-size");	4838	DEFSYM (Qrehash_size, "rehash-size");
4746	DEFSYM (Qrehash_threshold, "rehash-threshold");	4839	DEFSYM (Qrehash_threshold, "rehash-threshold");
		4840
		4841	character_names = Qnil;
		4842	staticpro (&character_names);
4747	}	4843	}


diff --git a/test/src/lread-tests.el b/test/src/lread-tests.el new file mode 100644 index 00000000000..1f873340c56 --- /dev/null +++ b/test/src/lread-tests.el
@@ -0,0 +1,54 @@
		1	;;; lread-tests.el --- tests for lread.c -- lexical-binding: t; --
		2
		3	;; Copyright (C) 2016 Google Inc.
		4
		5	;; Author: Philipp Stephani <phst@google.com>
		6
		7	;; This file is part of GNU Emacs.
		8
		9	;; This program is free software; you can redistribute it and/or modify
		10	;; it under the terms of the GNU General Public License as published by
		11	;; the Free Software Foundation, either version 3 of the License, or
		12	;; (at your option) any later version.
		13
		14	;; This program is distributed in the hope that it will be useful,
		15	;; but WITHOUT ANY WARRANTY; without even the implied warranty of
		16	;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
		17	;; GNU General Public License for more details.
		18
		19	;; You should have received a copy of the GNU General Public License
		20	;; along with this program. If not, see <http://www.gnu.org/licenses/>.
		21
		22	;;; Commentary:
		23
		24	;; Unit tests for code in src/lread.c.
		25
		26	;;; Code:
		27
		28	(ert-deftest lread-char-number ()
		29	(should (equal ?\N{U+A817} #xA817)))
		30
		31	(ert-deftest lread-char-name ()
		32	(should (equal ?\N{SYLOTI NAGRI LETTER
		33	DHO}
		34	#xA817)))
		35
		36	(ert-deftest lread-char-invalid-number ()
		37	(should-error (read "?\\N{U+110000}") :type 'invalid-read-syntax))
		38
		39	(ert-deftest lread-char-invalid-name ()
		40	(should-error (read "?\\N{DOES NOT EXIST}")) :type 'invalid-read-syntax)
		41
		42	(ert-deftest lread-char-non-ascii-name ()
		43	(should-error (read "?\\N{LATIN CAPITAL LETTER Ø}")) 'invalid-read-syntax)
		44
		45	(ert-deftest lread-char-empty-name ()
		46	(should-error (read "?\\N{}")) 'invalid-read-syntax)
		47
		48	(ert-deftest lread-string-char-number ()
		49	(should (equal "a\N{U+A817}b" "a\uA817b")))
		50
		51	(ert-deftest lread-string-char-name ()
		52	(should (equal "a\N{SYLOTI NAGRI LETTER DHO}b" "a\uA817b")))
		53
		54	;;; lread-tests.el ends here