aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorPhilipp Stephani2016-04-21 14:51:30 -0700
committerPaul Eggert2016-04-21 19:29:40 -0700
commite7cb38edc946ff60c1c878b30b068376d6ef56d2 (patch)
tree8fc810e91650a328dd6f6d95dd2a5f1f52b86c4a /src
parent753c875714f708c0257a2d352635c5616be66fdc (diff)
downloademacs-e7cb38edc946ff60c1c878b30b068376d6ef56d2.tar.gz
emacs-e7cb38edc946ff60c1c878b30b068376d6ef56d2.zip
Use 'ucs-names' for character name escapes
* lread.c (invalid_character_name, check_scalar_value) (parse_code_after_prefix, character_name_to_code): New helper functions that use 'ucs-names' and parsing for CJK ideographs. (read_escape): Use helper functions. (syms_of_lread): New symbol 'ucs-names'. * test/src/lread-tests.el: New tests; fix a couple of bugs in existing tests.
Diffstat (limited to 'src')
-rw-r--r--src/lread.c137
1 files changed, 88 insertions, 49 deletions
diff --git a/src/lread.c b/src/lread.c
index dbe51bb06c8..c3b6bd79e42 100644
--- a/src/lread.c
+++ b/src/lread.c
@@ -44,6 +44,7 @@ along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
44#include "termhooks.h" 44#include "termhooks.h"
45#include "blockinput.h" 45#include "blockinput.h"
46#include <c-ctype.h> 46#include <c-ctype.h>
47#include <string.h>
47 48
48#ifdef MSDOS 49#ifdef MSDOS
49#include "msdos.h" 50#include "msdos.h"
@@ -2150,36 +2151,90 @@ grow_read_buffer (void)
2150 MAX_MULTIBYTE_LENGTH, -1, 1); 2151 MAX_MULTIBYTE_LENGTH, -1, 1);
2151} 2152}
2152 2153
2153/* Hash table that maps Unicode character names to code points. */ 2154/* Signal an invalid-read-syntax error indicating that the character
2154static Lisp_Object character_names; 2155 name in an \N{…} literal is invalid. */
2156static _Noreturn void
2157invalid_character_name (Lisp_Object name)
2158{
2159 AUTO_STRING (format, "\\N{%s}");
2160 xsignal1 (Qinvalid_read_syntax, CALLN (Fformat, format, name));
2161}
2155 2162
2156/* Length of the longest Unicode character name, in bytes. */ 2163/* Check that CODE is a valid Unicode scalar value, and return its
2157static ptrdiff_t max_character_name_length; 2164 value. CODE should be parsed from the character name given by
2165 NAME. NAME is used for error messages. */
2166static int
2167check_scalar_value (Lisp_Object code, Lisp_Object name)
2168{
2169 if (! NUMBERP (code))
2170 invalid_character_name (name);
2171 EMACS_INT i = XINT (code);
2172 if (! (0 <= i && i <= MAX_UNICODE_CHAR)
2173 /* Don't allow surrogates. */
2174 || (0xD800 <= code && code <= 0xDFFF))
2175 invalid_character_name (name);
2176 return i;
2177}
2158 2178
2159/* Initializes `character_names' and `max_character_name_length'. 2179/* If NAME starts with PREFIX, interpret the rest as a hexadecimal
2160 Called by `read_escape'. */ 2180 number and return its value. Raise invalid-read-syntax if the
2161void init_character_names (void) 2181 number is not a valid scalar value. Return −1 if NAME doesn’t
2182 start with PREFIX. */
2183static int
2184parse_code_after_prefix (Lisp_Object name, const char *prefix)
2162{ 2185{
2163 character_names = CALLN (Fmake_hash_table, 2186 ptrdiff_t name_len = SBYTES (name);
2164 QCtest, Qequal, 2187 ptrdiff_t prefix_len = strlen (prefix);
2165 /* Currently around 100,000 Unicode 2188 /* Allow between one and eight hexadecimal digits after the
2166 characters are defined. */ 2189 prefix. */
2167 QCsize, make_natnum (100000)); 2190 if (prefix_len < name_len && name_len <= prefix_len + 8
2168 Lisp_Object get_property = 2191 && memcmp (SDATA (name), prefix, prefix_len) == 0)
2169 Fsymbol_function (intern_c_string ("get-char-code-property"));
2170 ptrdiff_t length = 0;
2171 for (int i = 0; i <= MAX_UNICODE_CHAR; ++i)
2172 { 2192 {
2173 Lisp_Object code = make_natnum (i); 2193 Lisp_Object code = string_to_number (SDATA (name) + prefix_len, 16, false);
2174 Lisp_Object name = call2 (get_property, code, Qname); 2194 if (NUMBERP (code))
2175 if (NILP (name)) continue; 2195 return check_scalar_value (code, name);
2176 CHECK_STRING (name); 2196 }
2177 length = max (length, SBYTES (name)); 2197 return -1;
2178 Fputhash (name, code, character_names); 2198}
2199
2200/* Returns the scalar value that has the Unicode character name NAME.
2201 Raises `invalid-read-syntax' if there is no such character. */
2202static int
2203character_name_to_code (Lisp_Object name)
2204{
2205 /* Code point as U+N, where N is between 1 and 8 hexadecimal
2206 digits. */
2207 int code = parse_code_after_prefix (name, "U+");
2208 if (code >= 0)
2209 return code;
2210
2211 /* CJK ideographs are not contained in the association list returned
2212 by `ucs-names'. But they follow a predictable naming pattern: a
2213 fixed prefix plus the hexadecimal codepoint value. */
2214 code = parse_code_after_prefix (name, "CJK IDEOGRAPH-");
2215 if (code >= 0)
2216 {
2217 /* Various ranges of CJK characters; see UnicodeData.txt. */
2218 if ((0x3400 <= code && code <= 0x4DB5)
2219 || (0x4E00 <= code && code <= 0x9FD5)
2220 || (0x20000 <= code && code <= 0x2A6D6)
2221 || (0x2A700 <= code && code <= 0x2B734)
2222 || (0x2B740 <= code && code <= 0x2B81D)
2223 || (0x2B820 <= code && code <= 0x2CEA1))
2224 return code;
2225 else
2226 invalid_character_name (name);
2179 } 2227 }
2180 max_character_name_length = length; 2228
2229 /* Look up the name in the table returned by `ucs-names'. */
2230 Lisp_Object names = call0 (Qucs_names);
2231 return check_scalar_value (CDR (Fassoc (name, names)), name);
2181} 2232}
2182 2233
2234/* Bound on the length of a Unicode character name. As of
2235 Unicode 9.0.0 the maximum is 83, so this should be safe. */
2236enum { UNICODE_CHARACTER_NAME_LENGTH_BOUND = 200 };
2237
2183/* Read a \-escape sequence, assuming we already read the `\'. 2238/* Read a \-escape sequence, assuming we already read the `\'.
2184 If the escape sequence forces unibyte, return eight-bit char. */ 2239 If the escape sequence forces unibyte, return eight-bit char. */
2185 2240
@@ -2393,10 +2448,7 @@ read_escape (Lisp_Object readcharfun, bool stringp)
2393 c = READCHAR; 2448 c = READCHAR;
2394 if (c != '{') 2449 if (c != '{')
2395 invalid_syntax ("Expected opening brace after \\N"); 2450 invalid_syntax ("Expected opening brace after \\N");
2396 if (NILP (character_names)) 2451 char name[UNICODE_CHARACTER_NAME_LENGTH_BOUND + 1];
2397 init_character_names ();
2398 USE_SAFE_ALLOCA;
2399 char *name = SAFE_ALLOCA (max_character_name_length + 1);
2400 bool whitespace = false; 2452 bool whitespace = false;
2401 ptrdiff_t length = 0; 2453 ptrdiff_t length = 0;
2402 while (true) 2454 while (true)
@@ -2407,11 +2459,12 @@ read_escape (Lisp_Object readcharfun, bool stringp)
2407 if (c == '}') 2459 if (c == '}')
2408 break; 2460 break;
2409 if (! c_isascii (c)) 2461 if (! c_isascii (c))
2410 xsignal1 (Qinvalid_read_syntax, 2462 {
2411 CALLN (Fformat, 2463 AUTO_STRING (format,
2412 build_pure_c_string ("Non-ASCII character U+%04X" 2464 "Non-ASCII character U+%04X in character name");
2413 " in character name"), 2465 xsignal1 (Qinvalid_read_syntax,
2414 make_natnum (c))); 2466 CALLN (Fformat, format, make_natnum (c)));
2467 }
2415 /* We treat multiple adjacent whitespace characters as a 2468 /* We treat multiple adjacent whitespace characters as a
2416 single space character. This makes it easier to use 2469 single space character. This makes it easier to use
2417 character names in e.g. multi-line strings. */ 2470 character names in e.g. multi-line strings. */
@@ -2425,25 +2478,12 @@ read_escape (Lisp_Object readcharfun, bool stringp)
2425 else 2478 else
2426 whitespace = false; 2479 whitespace = false;
2427 name[length++] = c; 2480 name[length++] = c;
2428 if (length >= max_character_name_length) 2481 if (length >= sizeof name)
2429 invalid_syntax ("Character name too long"); 2482 invalid_syntax ("Character name too long");
2430 } 2483 }
2431 if (length == 0) 2484 if (length == 0)
2432 invalid_syntax ("Empty character name"); 2485 invalid_syntax ("Empty character name");
2433 name[length] = 0; 2486 return character_name_to_code (make_unibyte_string (name, length));
2434 Lisp_Object lisp_name = make_unibyte_string (name, length);
2435 Lisp_Object code =
2436 (length >= 3 && length <= 10 && name[0] == 'U' && name[1] == '+') ?
2437 /* Code point as U+N, where N is between 1 and 8 hexadecimal
2438 digits. */
2439 string_to_number (name + 2, 16, false) :
2440 Fgethash (lisp_name, character_names, Qnil);
2441 SAFE_FREE ();
2442 if (! RANGED_INTEGERP (0, code, MAX_UNICODE_CHAR))
2443 xsignal1 (Qinvalid_read_syntax,
2444 CALLN (Fformat,
2445 build_pure_c_string ("\\N{%s}"), lisp_name));
2446 return XINT (code);
2447 } 2487 }
2448 2488
2449 default: 2489 default:
@@ -4835,6 +4875,5 @@ that are loaded before your customizations are read! */);
4835 DEFSYM (Qrehash_size, "rehash-size"); 4875 DEFSYM (Qrehash_size, "rehash-size");
4836 DEFSYM (Qrehash_threshold, "rehash-threshold"); 4876 DEFSYM (Qrehash_threshold, "rehash-threshold");
4837 4877
4838 character_names = Qnil; 4878 DEFSYM (Qucs_names, "ucs-names");
4839 staticpro (&character_names);
4840} 4879}