aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
author Paul Eggert 2016-04-21 19:26:34 -0700
committer Paul Eggert 2016-04-21 19:29:41 -0700
commit bd1c7ca67e7429e07f78d4ff49163fd7a67a6765 (patch)
tree 941d5cf573be2a4588468b3a315c0c6cb47e2c97 /src
parent e7cb38edc946ff60c1c878b30b068376d6ef56d2 (diff)
download emacs-bd1c7ca67e7429e07f78d4ff49163fd7a67a6765.tar.gz
emacs-bd1c7ca67e7429e07f78d4ff49163fd7a67a6765.zip
Improve character name escapes
* doc/lispref/nonascii.texi (Character Properties): Avoid duplication of Unicode names. Reformat examples to fit in narrow pages.
* doc/lispref/objects.texi (General Escape Syntax): Simplify and better-organize explanation of \N{...} escapes.
* src/character.h (CHAR_SURROGATE_PAIR_P): Remove; unused.
(char_surrogate_p): New inline function.
* src/lread.c: Do not include string.h; no longer needed.
(invalid_character_name, check_scalar_value): Remove; the ideas behind these functions are now bundled into character_name_to_code.
(character_name_to_code): Remove undocumented support for "CJK IDEOGRAPH-XXXX" names, as "U+XXXX" suffices. Reject monstrosities like "\N{U+-0}" and null bytes in \N escapes. Reject floating point in \N escapes instead of returning garbage. Use AUTO_STRING_WITH_LEN to lessen pressure on the garbage collector.
* test/src/lread-tests.el (lread-char-number, lread-char-name)
(lread-string-char-number, lread-string-char-name): Test runtime behavior, not compile-time, as the test framework is not set up to test compile-time.
(lread-char-surrogate-1, lread-char-surrogate-2)
(lread-char-surrogate-3, lread-char-surrogate-4)
(lread-string-char-number-2, lread-string-char-number-3): New tests.
(lread-string-char-number-1): Rename from lread-string-char-number.
Diffstat (limited to 'src')
-rw-r--r-- src/character.h 13
-rw-r--r-- src/lread.c 104
2 files changed, 35 insertions, 82 deletions
diff --git a/src/character.h b/src/character.h
index bc3e1557844..586f330fba9 100644
--- a/src/character.h
+++ b/src/character.h
@@ -612,14 +612,13 @@ sanitize_char_width (EMACS_INT width)
612 : (c) <= 0xE01EF ? (c) - 0xE0100 + 17 \ 612 : (c) <= 0xE01EF ? (c) - 0xE0100 + 17 \
613 : 0) 613 : 0)
614 614
615/* If C is a high surrogate, return 1. If C is a low surrogate, 615/* Return true if C is a surrogate. */
616 return 2. Otherwise, return 0. */
617 616
618#define CHAR_SURROGATE_PAIR_P(c) \ 617INLINE bool
619 ((c) < 0xD800 ? 0 \ 618char_surrogate_p (int c)
620 : (c) <= 0xDBFF ? 1 \ 619{
621 : (c) <= 0xDFFF ? 2 \ 620 return 0xD800 <= c && c <= 0xDFFF;
622 : 0) 621}
623 622
624/* Data type for Unicode general category. 623/* Data type for Unicode general category.
625 624
diff --git a/src/lread.c b/src/lread.c
index c3b6bd79e42..a42c1f60c95 100644
--- a/src/lread.c
+++ b/src/lread.c
@@ -44,7 +44,6 @@ along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
44#include "termhooks.h" 44#include "termhooks.h"
45#include "blockinput.h" 45#include "blockinput.h"
46#include <c-ctype.h> 46#include <c-ctype.h>
47#include <string.h>
48 47
49#ifdef MSDOS 48#ifdef MSDOS
50#include "msdos.h" 49#include "msdos.h"
@@ -2151,88 +2150,42 @@ grow_read_buffer (void)
2151 MAX_MULTIBYTE_LENGTH, -1, 1); 2150 MAX_MULTIBYTE_LENGTH, -1, 1);
2152} 2151}
2153 2152
2154/* Signal an invalid-read-syntax error indicating that the character 2153/* Return the scalar value that has the Unicode character name NAME.
2155 name in an \N{…} literal is invalid. */ 2154 Raise 'invalid-read-syntax' if there is no such character. */
2156static _Noreturn void
2157invalid_character_name (Lisp_Object name)
2158{
2159 AUTO_STRING (format, "\\N{%s}");
2160 xsignal1 (Qinvalid_read_syntax, CALLN (Fformat, format, name));
2161}
2162
2163/* Check that CODE is a valid Unicode scalar value, and return its
2164 value. CODE should be parsed from the character name given by
2165 NAME. NAME is used for error messages. */
2166static int 2155static int
2167check_scalar_value (Lisp_Object code, Lisp_Object name) 2156character_name_to_code (char const *name, ptrdiff_t name_len)
2168{ 2157{
2169 if (! NUMBERP (code)) 2158 Lisp_Object code;
2170 invalid_character_name (name);
2171 EMACS_INT i = XINT (code);
2172 if (! (0 <= i && i <= MAX_UNICODE_CHAR)
2173 /* Don't allow surrogates. */
2174 || (0xD800 <= code && code <= 0xDFFF))
2175 invalid_character_name (name);
2176 return i;
2177}
2178 2159
2179/* If NAME starts with PREFIX, interpret the rest as a hexadecimal 2160 /* Code point as U+XXXX.... */
2180 number and return its value. Raise invalid-read-syntax if the 2161 if (name[0] == 'U' && name[1] == '+')
2181 number is not a valid scalar value. Return −1 if NAME doesn’t
2182 start with PREFIX. */
2183static int
2184parse_code_after_prefix (Lisp_Object name, const char *prefix)
2185{
2186 ptrdiff_t name_len = SBYTES (name);
2187 ptrdiff_t prefix_len = strlen (prefix);
2188 /* Allow between one and eight hexadecimal digits after the
2189 prefix. */
2190 if (prefix_len < name_len && name_len <= prefix_len + 8
2191 && memcmp (SDATA (name), prefix, prefix_len) == 0)
2192 { 2162 {
2193 Lisp_Object code = string_to_number (SDATA (name) + prefix_len, 16, false); 2163 /* Pass the leading '+' to string_to_number, so that it
2194 if (NUMBERP (code)) 2164 rejects monstrosities such as negative values. */
2195 return check_scalar_value (code, name); 2165 code = string_to_number (name + 1, 16, false);
2166 }
2167 else
2168 {
2169 /* Look up the name in the table returned by 'ucs-names'. */
2170 AUTO_STRING_WITH_LEN (namestr, name, name_len);
2171 Lisp_Object names = call0 (Qucs_names);
2172 code = CDR (Fassoc (namestr, names));
2196 } 2173 }
2197 return -1;
2198}
2199 2174
2200/* Returns the scalar value that has the Unicode character name NAME. 2175 if (! (INTEGERP (code)
2201 Raises `invalid-read-syntax' if there is no such character. */ 2176 && 0 <= XINT (code) && XINT (code) <= MAX_UNICODE_CHAR
2202static int 2177 && ! char_surrogate_p (XINT (code))))
2203character_name_to_code (Lisp_Object name)
2204{
2205 /* Code point as U+N, where N is between 1 and 8 hexadecimal
2206 digits. */
2207 int code = parse_code_after_prefix (name, "U+");
2208 if (code >= 0)
2209 return code;
2210
2211 /* CJK ideographs are not contained in the association list returned
2212 by `ucs-names'. But they follow a predictable naming pattern: a
2213 fixed prefix plus the hexadecimal codepoint value. */
2214 code = parse_code_after_prefix (name, "CJK IDEOGRAPH-");
2215 if (code >= 0)
2216 { 2178 {
2217 /* Various ranges of CJK characters; see UnicodeData.txt. */ 2179 AUTO_STRING (format, "\\N{%s}");
2218 if ((0x3400 <= code && code <= 0x4DB5) 2180 AUTO_STRING_WITH_LEN (namestr, name, name_len);
2219 || (0x4E00 <= code && code <= 0x9FD5) 2181 xsignal1 (Qinvalid_read_syntax, CALLN (Fformat, format, namestr));
2220 || (0x20000 <= code && code <= 0x2A6D6)
2221 || (0x2A700 <= code && code <= 0x2B734)
2222 || (0x2B740 <= code && code <= 0x2B81D)
2223 || (0x2B820 <= code && code <= 0x2CEA1))
2224 return code;
2225 else
2226 invalid_character_name (name);
2227 } 2182 }
2228 2183
2229 /* Look up the name in the table returned by `ucs-names'. */ 2184 return XINT (code);
2230 Lisp_Object names = call0 (Qucs_names);
2231 return check_scalar_value (CDR (Fassoc (name, names)), name);
2232} 2185}
2233 2186
2234/* Bound on the length of a Unicode character name. As of 2187/* Bound on the length of a Unicode character name. As of
2235 Unicode 9.0.0 the maximum is 83, so this should be safe. */ 2188 Unicode 9.0.0 the maximum is 83, so this should be safe. */
2236enum { UNICODE_CHARACTER_NAME_LENGTH_BOUND = 200 }; 2189enum { UNICODE_CHARACTER_NAME_LENGTH_BOUND = 200 };
2237 2190
2238/* Read a \-escape sequence, assuming we already read the `\'. 2191/* Read a \-escape sequence, assuming we already read the `\'.
@@ -2458,14 +2411,14 @@ read_escape (Lisp_Object readcharfun, bool stringp)
2458 end_of_file_error (); 2411 end_of_file_error ();
2459 if (c == '}') 2412 if (c == '}')
2460 break; 2413 break;
2461 if (! c_isascii (c)) 2414 if (! (0 < c && c < 0x80))
2462 { 2415 {
2463 AUTO_STRING (format, 2416 AUTO_STRING (format,
2464 "Non-ASCII character U+%04X in character name"); 2417 "Invalid character U+%04X in character name");
2465 xsignal1 (Qinvalid_read_syntax, 2418 xsignal1 (Qinvalid_read_syntax,
2466 CALLN (Fformat, format, make_natnum (c))); 2419 CALLN (Fformat, format, make_natnum (c)));
2467 } 2420 }
2468 /* We treat multiple adjacent whitespace characters as a 2421 /* Treat multiple adjacent whitespace characters as a
2469 single space character. This makes it easier to use 2422 single space character. This makes it easier to use
2470 character names in e.g. multi-line strings. */ 2423 character names in e.g. multi-line strings. */
2471 if (c_isspace (c)) 2424 if (c_isspace (c))
@@ -2483,7 +2436,8 @@ read_escape (Lisp_Object readcharfun, bool stringp)
2483 } 2436 }
2484 if (length == 0) 2437 if (length == 0)
2485 invalid_syntax ("Empty character name"); 2438 invalid_syntax ("Empty character name");
2486 return character_name_to_code (make_unibyte_string (name, length)); 2439 name[length] = '\0';
2440 return character_name_to_code (name, length);
2487 } 2441 }
2488 2442
2489 default: 2443 default: