Improve character name escapes

* doc/lispref/nonascii.texi (Character Properties): Avoid duplication
of Unicode names. Reformat examples to fit in narrow pages.
* doc/lispref/objects.texi (General Escape Syntax): Simplify and
better-organize explanation of \N{...} escapes.
* src/character.h (CHAR_SURROGATE_PAIR_P): Remove; unused.
(char_surrogate_p): New inline function.
* src/lread.c: Do not include string.h; no longer needed.
(invalid_character_name, check_scalar_value): Remove; the ideas
behind these functions are now bundled into character_name_to_code.
(character_name_to_code): Remove undocumented support for
"CJK IDEOGRAPH-XXXX" names, as "U+XXXX" suffices.
Reject monstrosities like "\N{U+-0}" and null bytes in \N escapes.
Reject floating point in \N escapes instead of returning garbage.
Use AUTO_STRING_WITH_LEN to lessen pressure on the garbage collector.
* test/src/lread-tests.el (lread-char-number, lread-char-name)
(lread-string-char-number, lread-string-char-name): Test runtime
behavior, not compile-time, as the test framework is not set up to
test compile-time.
(lread-char-surrogate-1, lread-char-surrogate-2)
(lread-char-surrogate-3, lread-char-surrogate-4)
(lread-string-char-number-2, lread-string-char-number-3): New tests.
(lread-string-char-number-1): Rename from lread-string-char-number.
author: Paul Eggert 2016-04-21 19:26:34 -0700
committer: Paul Eggert 2016-04-21 19:29:41 -0700
commit: bd1c7ca67e7429e07f78d4ff49163fd7a67a6765 (patch)
tree: 941d5cf573be2a4588468b3a315c0c6cb47e2c97 /src
parent: e7cb38edc946ff60c1c878b30b068376d6ef56d2 (diff)
download: emacs-bd1c7ca67e7429e07f78d4ff49163fd7a67a6765.tar.gz
emacs-bd1c7ca67e7429e07f78d4ff49163fd7a67a6765.zip
2 files changed, 35 insertions, 82 deletions
diff --git a/src/character.h b/src/character.h
index bc3e1557844..586f330fba9 100644
--- a/src/character.h
+++ b/src/character.h
@@ -612,14 +612,13 @@ sanitize_char_width (EMACS_INT width)
   : (c) <= 0xE01EF ? (c) - 0xE0100 + 17        \
   : 0)
-/* If C is a high surrogate, return 1.  If C is a low surrogate,
+/* Return true if C is a surrogate.  */
-   return 2.  Otherwise, return 0.  */
-#define CHAR_SURROGATE_PAIR_P(c)        \
+INLINE bool
-  ((c) < 0xD800 ? 0                     \
+char_surrogate_p (int c)
-   : (c) <= 0xDBFF ? 1                  \
+{
-   : (c) <= 0xDFFF ? 2                  \
+  return 0xD800 <= c && c <= 0xDFFF;
-   : 0)
+}
 /* Data type for Unicode general category.
diff --git a/src/lread.c b/src/lread.c
index c3b6bd79e42..a42c1f60c95 100644
--- a/src/lread.c
+++ b/src/lread.c
@@ -44,7 +44,6 @@ along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
 #include "termhooks.h"
 #include "blockinput.h"
 #include <c-ctype.h>
-#include <string.h>
 #ifdef MSDOS
 #include "msdos.h"
@@ -2151,88 +2150,42 @@ grow_read_buffer (void)
                         MAX_MULTIBYTE_LENGTH, -1, 1);
 }
-/* Signal an invalid-read-syntax error indicating that the character
+/* Return the scalar value that has the Unicode character name NAME.
-   name in an \N{…} literal is invalid.  */
+   Raise 'invalid-read-syntax' if there is no such character.  */
-static _Noreturn void
-invalid_character_name (Lisp_Object name)
-{
-  AUTO_STRING (format, "\\N{%s}");
-  xsignal1 (Qinvalid_read_syntax, CALLN (Fformat, format, name));
-}
-/* Check that CODE is a valid Unicode scalar value, and return its
-   value.  CODE should be parsed from the character name given by
-   NAME.  NAME is used for error messages.  */
 static int
-check_scalar_value (Lisp_Object code, Lisp_Object name)
+character_name_to_code (char const *name, ptrdiff_t name_len)
 {
-  if (! NUMBERP (code))
+  Lisp_Object code;
-    invalid_character_name (name);
-  EMACS_INT i = XINT (code);
-  if (! (0 <= i && i <= MAX_UNICODE_CHAR)
-      /* Don't allow surrogates.  */
-      || (0xD800 <= code && code <= 0xDFFF))
-    invalid_character_name (name);
-  return i;
-}
-/* If NAME starts with PREFIX, interpret the rest as a hexadecimal
+  /* Code point as U+XXXX....  */
-   number and return its value.  Raise invalid-read-syntax if the
+  if (name[0] == 'U' && name[1] == '+')
-   number is not a valid scalar value.  Return −1 if NAME doesn’t
-   start with PREFIX.  */
-static int
-parse_code_after_prefix (Lisp_Object name, const char *prefix)
-{
-  ptrdiff_t name_len = SBYTES (name);
-  ptrdiff_t prefix_len = strlen (prefix);
-  /* Allow between one and eight hexadecimal digits after the
-     prefix.  */
-  if (prefix_len < name_len && name_len <= prefix_len + 8
-      && memcmp (SDATA (name), prefix, prefix_len) == 0)
    {
-      Lisp_Object code = string_to_number (SDATA (name) + prefix_len, 16, false);
+      /* Pass the leading '+' to string_to_number, so that it
-      if (NUMBERP (code))
+         rejects monstrosities such as negative values.  */
-        return check_scalar_value (code, name);
+      code = string_to_number (name + 1, 16, false);
+    }
+  else
+    {
+      /* Look up the name in the table returned by 'ucs-names'.  */
+      AUTO_STRING_WITH_LEN (namestr, name, name_len);
+      Lisp_Object names = call0 (Qucs_names);
+      code = CDR (Fassoc (namestr, names));
    }
-  return -1;
-}
-/* Returns the scalar value that has the Unicode character name NAME.
+  if (! (INTEGERP (code)
-   Raises `invalid-read-syntax' if there is no such character.  */
+         && 0 <= XINT (code) && XINT (code) <= MAX_UNICODE_CHAR
-static int
+         && ! char_surrogate_p (XINT (code))))
-character_name_to_code (Lisp_Object name)
-{
-  /* Code point as U+N, where N is between 1 and 8 hexadecimal
-     digits.  */
-  int code = parse_code_after_prefix (name, "U+");
-  if (code >= 0)
-    return code;
-  /* CJK ideographs are not contained in the association list returned
-     by `ucs-names'.  But they follow a predictable naming pattern: a
-     fixed prefix plus the hexadecimal codepoint value.  */
-  code = parse_code_after_prefix (name, "CJK IDEOGRAPH-");
-  if (code >= 0)
    {
-      /* Various ranges of CJK characters; see UnicodeData.txt.  */
+      AUTO_STRING (format, "\\N{%s}");
-      if ((0x3400 <= code && code <= 0x4DB5)
+      AUTO_STRING_WITH_LEN (namestr, name, name_len);
-          || (0x4E00 <= code && code <= 0x9FD5)
+      xsignal1 (Qinvalid_read_syntax, CALLN (Fformat, format, namestr));
-          || (0x20000 <= code && code <= 0x2A6D6)
-          || (0x2A700 <= code && code <= 0x2B734)
-          || (0x2B740 <= code && code <= 0x2B81D)
-          || (0x2B820 <= code && code <= 0x2CEA1))
-        return code;
-      else
-        invalid_character_name (name);
    }
-  /* Look up the name in the table returned by `ucs-names'.  */
+  return XINT (code);
-  Lisp_Object names = call0 (Qucs_names);
-  return check_scalar_value (CDR (Fassoc (name, names)), name);
 }
 /* Bound on the length of a Unicode character name.  As of
-   Unicode 9.0.0 the maximum is 83, so this should be safe. */
+   Unicode 9.0.0 the maximum is 83, so this should be safe.  */
 enum { UNICODE_CHARACTER_NAME_LENGTH_BOUND = 200 };
 /* Read a \-escape sequence, assuming we already read the `\'.
@@ -2458,14 +2411,14 @@ read_escape (Lisp_Object readcharfun, bool stringp)
              end_of_file_error ();
            if (c == '}')
              break;
-            if (! c_isascii (c))
+            if (! (0 < c && c < 0x80))
              {
                AUTO_STRING (format,
-                             "Non-ASCII character U+%04X in character name");
+                             "Invalid character U+%04X in character name");
                xsignal1 (Qinvalid_read_syntax,
                          CALLN (Fformat, format, make_natnum (c)));
              }
-            /* We treat multiple adjacent whitespace characters as a
+            /* Treat multiple adjacent whitespace characters as a
               single space character.  This makes it easier to use
               character names in e.g. multi-line strings.  */
            if (c_isspace (c))
@@ -2483,7 +2436,8 @@ read_escape (Lisp_Object readcharfun, bool stringp)
          }
        if (length == 0)
          invalid_syntax ("Empty character name");
-        return character_name_to_code (make_unibyte_string (name, length));
+        name[length] = '\0';
+        return character_name_to_code (name, length);
      }
    default:
author	Paul Eggert	2016-04-21 19:26:34 -0700
committer	Paul Eggert	2016-04-21 19:29:41 -0700
commit	bd1c7ca67e7429e07f78d4ff49163fd7a67a6765 (patch)
tree	941d5cf573be2a4588468b3a315c0c6cb47e2c97 /src
parent	e7cb38edc946ff60c1c878b30b068376d6ef56d2 (diff)
download	emacs-bd1c7ca67e7429e07f78d4ff49163fd7a67a6765.tar.gz emacs-bd1c7ca67e7429e07f78d4ff49163fd7a67a6765.zip