diff options
| author | Philipp Stephani | 2016-04-21 14:51:30 -0700 |
|---|---|---|
| committer | Paul Eggert | 2016-04-21 19:29:40 -0700 |
| commit | e7cb38edc946ff60c1c878b30b068376d6ef56d2 (patch) | |
| tree | 8fc810e91650a328dd6f6d95dd2a5f1f52b86c4a /src | |
| parent | 753c875714f708c0257a2d352635c5616be66fdc (diff) | |
| download | emacs-e7cb38edc946ff60c1c878b30b068376d6ef56d2.tar.gz emacs-e7cb38edc946ff60c1c878b30b068376d6ef56d2.zip | |
Use 'ucs-names' for character name escapes
* lread.c (invalid_character_name, check_scalar_value)
(parse_code_after_prefix, character_name_to_code): New helper
functions that use 'ucs-names' and parsing for CJK ideographs.
(read_escape): Use helper functions.
(syms_of_lread): New symbol 'ucs-names'.
* test/src/lread-tests.el: New tests; fix a couple of bugs in
existing tests.
Diffstat (limited to 'src')
| -rw-r--r-- | src/lread.c | 137 |
1 files changed, 88 insertions, 49 deletions
diff --git a/src/lread.c b/src/lread.c index dbe51bb06c8..c3b6bd79e42 100644 --- a/src/lread.c +++ b/src/lread.c | |||
| @@ -44,6 +44,7 @@ along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */ | |||
| 44 | #include "termhooks.h" | 44 | #include "termhooks.h" |
| 45 | #include "blockinput.h" | 45 | #include "blockinput.h" |
| 46 | #include <c-ctype.h> | 46 | #include <c-ctype.h> |
| 47 | #include <string.h> | ||
| 47 | 48 | ||
| 48 | #ifdef MSDOS | 49 | #ifdef MSDOS |
| 49 | #include "msdos.h" | 50 | #include "msdos.h" |
| @@ -2150,36 +2151,90 @@ grow_read_buffer (void) | |||
| 2150 | MAX_MULTIBYTE_LENGTH, -1, 1); | 2151 | MAX_MULTIBYTE_LENGTH, -1, 1); |
| 2151 | } | 2152 | } |
| 2152 | 2153 | ||
| 2153 | /* Hash table that maps Unicode character names to code points. */ | 2154 | /* Signal an invalid-read-syntax error indicating that the character |
| 2154 | static Lisp_Object character_names; | 2155 | name in an \N{…} literal is invalid. */ |
| 2156 | static _Noreturn void | ||
| 2157 | invalid_character_name (Lisp_Object name) | ||
| 2158 | { | ||
| 2159 | AUTO_STRING (format, "\\N{%s}"); | ||
| 2160 | xsignal1 (Qinvalid_read_syntax, CALLN (Fformat, format, name)); | ||
| 2161 | } | ||
| 2155 | 2162 | ||
| 2156 | /* Length of the longest Unicode character name, in bytes. */ | 2163 | /* Check that CODE is a valid Unicode scalar value, and return its |
| 2157 | static ptrdiff_t max_character_name_length; | 2164 | value. CODE should be parsed from the character name given by |
| 2165 | NAME. NAME is used for error messages. */ | ||
| 2166 | static int | ||
| 2167 | check_scalar_value (Lisp_Object code, Lisp_Object name) | ||
| 2168 | { | ||
| 2169 | if (! NUMBERP (code)) | ||
| 2170 | invalid_character_name (name); | ||
| 2171 | EMACS_INT i = XINT (code); | ||
| 2172 | if (! (0 <= i && i <= MAX_UNICODE_CHAR) | ||
| 2173 | /* Don't allow surrogates. */ | ||
| 2174 | || (0xD800 <= code && code <= 0xDFFF)) | ||
| 2175 | invalid_character_name (name); | ||
| 2176 | return i; | ||
| 2177 | } | ||
| 2158 | 2178 | ||
| 2159 | /* Initializes `character_names' and `max_character_name_length'. | 2179 | /* If NAME starts with PREFIX, interpret the rest as a hexadecimal |
| 2160 | Called by `read_escape'. */ | 2180 | number and return its value. Raise invalid-read-syntax if the |
| 2161 | void init_character_names (void) | 2181 | number is not a valid scalar value. Return −1 if NAME doesn’t |
| 2182 | start with PREFIX. */ | ||
| 2183 | static int | ||
| 2184 | parse_code_after_prefix (Lisp_Object name, const char *prefix) | ||
| 2162 | { | 2185 | { |
| 2163 | character_names = CALLN (Fmake_hash_table, | 2186 | ptrdiff_t name_len = SBYTES (name); |
| 2164 | QCtest, Qequal, | 2187 | ptrdiff_t prefix_len = strlen (prefix); |
| 2165 | /* Currently around 100,000 Unicode | 2188 | /* Allow between one and eight hexadecimal digits after the |
| 2166 | characters are defined. */ | 2189 | prefix. */ |
| 2167 | QCsize, make_natnum (100000)); | 2190 | if (prefix_len < name_len && name_len <= prefix_len + 8 |
| 2168 | Lisp_Object get_property = | 2191 | && memcmp (SDATA (name), prefix, prefix_len) == 0) |
| 2169 | Fsymbol_function (intern_c_string ("get-char-code-property")); | ||
| 2170 | ptrdiff_t length = 0; | ||
| 2171 | for (int i = 0; i <= MAX_UNICODE_CHAR; ++i) | ||
| 2172 | { | 2192 | { |
| 2173 | Lisp_Object code = make_natnum (i); | 2193 | Lisp_Object code = string_to_number (SDATA (name) + prefix_len, 16, false); |
| 2174 | Lisp_Object name = call2 (get_property, code, Qname); | 2194 | if (NUMBERP (code)) |
| 2175 | if (NILP (name)) continue; | 2195 | return check_scalar_value (code, name); |
| 2176 | CHECK_STRING (name); | 2196 | } |
| 2177 | length = max (length, SBYTES (name)); | 2197 | return -1; |
| 2178 | Fputhash (name, code, character_names); | 2198 | } |
| 2199 | |||
| 2200 | /* Returns the scalar value that has the Unicode character name NAME. | ||
| 2201 | Raises `invalid-read-syntax' if there is no such character. */ | ||
| 2202 | static int | ||
| 2203 | character_name_to_code (Lisp_Object name) | ||
| 2204 | { | ||
| 2205 | /* Code point as U+N, where N is between 1 and 8 hexadecimal | ||
| 2206 | digits. */ | ||
| 2207 | int code = parse_code_after_prefix (name, "U+"); | ||
| 2208 | if (code >= 0) | ||
| 2209 | return code; | ||
| 2210 | |||
| 2211 | /* CJK ideographs are not contained in the association list returned | ||
| 2212 | by `ucs-names'. But they follow a predictable naming pattern: a | ||
| 2213 | fixed prefix plus the hexadecimal codepoint value. */ | ||
| 2214 | code = parse_code_after_prefix (name, "CJK IDEOGRAPH-"); | ||
| 2215 | if (code >= 0) | ||
| 2216 | { | ||
| 2217 | /* Various ranges of CJK characters; see UnicodeData.txt. */ | ||
| 2218 | if ((0x3400 <= code && code <= 0x4DB5) | ||
| 2219 | || (0x4E00 <= code && code <= 0x9FD5) | ||
| 2220 | || (0x20000 <= code && code <= 0x2A6D6) | ||
| 2221 | || (0x2A700 <= code && code <= 0x2B734) | ||
| 2222 | || (0x2B740 <= code && code <= 0x2B81D) | ||
| 2223 | || (0x2B820 <= code && code <= 0x2CEA1)) | ||
| 2224 | return code; | ||
| 2225 | else | ||
| 2226 | invalid_character_name (name); | ||
| 2179 | } | 2227 | } |
| 2180 | max_character_name_length = length; | 2228 | |
| 2229 | /* Look up the name in the table returned by `ucs-names'. */ | ||
| 2230 | Lisp_Object names = call0 (Qucs_names); | ||
| 2231 | return check_scalar_value (CDR (Fassoc (name, names)), name); | ||
| 2181 | } | 2232 | } |
| 2182 | 2233 | ||
| 2234 | /* Bound on the length of a Unicode character name. As of | ||
| 2235 | Unicode 9.0.0 the maximum is 83, so this should be safe. */ | ||
| 2236 | enum { UNICODE_CHARACTER_NAME_LENGTH_BOUND = 200 }; | ||
| 2237 | |||
| 2183 | /* Read a \-escape sequence, assuming we already read the `\'. | 2238 | /* Read a \-escape sequence, assuming we already read the `\'. |
| 2184 | If the escape sequence forces unibyte, return eight-bit char. */ | 2239 | If the escape sequence forces unibyte, return eight-bit char. */ |
| 2185 | 2240 | ||
| @@ -2393,10 +2448,7 @@ read_escape (Lisp_Object readcharfun, bool stringp) | |||
| 2393 | c = READCHAR; | 2448 | c = READCHAR; |
| 2394 | if (c != '{') | 2449 | if (c != '{') |
| 2395 | invalid_syntax ("Expected opening brace after \\N"); | 2450 | invalid_syntax ("Expected opening brace after \\N"); |
| 2396 | if (NILP (character_names)) | 2451 | char name[UNICODE_CHARACTER_NAME_LENGTH_BOUND + 1]; |
| 2397 | init_character_names (); | ||
| 2398 | USE_SAFE_ALLOCA; | ||
| 2399 | char *name = SAFE_ALLOCA (max_character_name_length + 1); | ||
| 2400 | bool whitespace = false; | 2452 | bool whitespace = false; |
| 2401 | ptrdiff_t length = 0; | 2453 | ptrdiff_t length = 0; |
| 2402 | while (true) | 2454 | while (true) |
| @@ -2407,11 +2459,12 @@ read_escape (Lisp_Object readcharfun, bool stringp) | |||
| 2407 | if (c == '}') | 2459 | if (c == '}') |
| 2408 | break; | 2460 | break; |
| 2409 | if (! c_isascii (c)) | 2461 | if (! c_isascii (c)) |
| 2410 | xsignal1 (Qinvalid_read_syntax, | 2462 | { |
| 2411 | CALLN (Fformat, | 2463 | AUTO_STRING (format, |
| 2412 | build_pure_c_string ("Non-ASCII character U+%04X" | 2464 | "Non-ASCII character U+%04X in character name"); |
| 2413 | " in character name"), | 2465 | xsignal1 (Qinvalid_read_syntax, |
| 2414 | make_natnum (c))); | 2466 | CALLN (Fformat, format, make_natnum (c))); |
| 2467 | } | ||
| 2415 | /* We treat multiple adjacent whitespace characters as a | 2468 | /* We treat multiple adjacent whitespace characters as a |
| 2416 | single space character. This makes it easier to use | 2469 | single space character. This makes it easier to use |
| 2417 | character names in e.g. multi-line strings. */ | 2470 | character names in e.g. multi-line strings. */ |
| @@ -2425,25 +2478,12 @@ read_escape (Lisp_Object readcharfun, bool stringp) | |||
| 2425 | else | 2478 | else |
| 2426 | whitespace = false; | 2479 | whitespace = false; |
| 2427 | name[length++] = c; | 2480 | name[length++] = c; |
| 2428 | if (length >= max_character_name_length) | 2481 | if (length >= sizeof name) |
| 2429 | invalid_syntax ("Character name too long"); | 2482 | invalid_syntax ("Character name too long"); |
| 2430 | } | 2483 | } |
| 2431 | if (length == 0) | 2484 | if (length == 0) |
| 2432 | invalid_syntax ("Empty character name"); | 2485 | invalid_syntax ("Empty character name"); |
| 2433 | name[length] = 0; | 2486 | return character_name_to_code (make_unibyte_string (name, length)); |
| 2434 | Lisp_Object lisp_name = make_unibyte_string (name, length); | ||
| 2435 | Lisp_Object code = | ||
| 2436 | (length >= 3 && length <= 10 && name[0] == 'U' && name[1] == '+') ? | ||
| 2437 | /* Code point as U+N, where N is between 1 and 8 hexadecimal | ||
| 2438 | digits. */ | ||
| 2439 | string_to_number (name + 2, 16, false) : | ||
| 2440 | Fgethash (lisp_name, character_names, Qnil); | ||
| 2441 | SAFE_FREE (); | ||
| 2442 | if (! RANGED_INTEGERP (0, code, MAX_UNICODE_CHAR)) | ||
| 2443 | xsignal1 (Qinvalid_read_syntax, | ||
| 2444 | CALLN (Fformat, | ||
| 2445 | build_pure_c_string ("\\N{%s}"), lisp_name)); | ||
| 2446 | return XINT (code); | ||
| 2447 | } | 2487 | } |
| 2448 | 2488 | ||
| 2449 | default: | 2489 | default: |
| @@ -4835,6 +4875,5 @@ that are loaded before your customizations are read! */); | |||
| 4835 | DEFSYM (Qrehash_size, "rehash-size"); | 4875 | DEFSYM (Qrehash_size, "rehash-size"); |
| 4836 | DEFSYM (Qrehash_threshold, "rehash-threshold"); | 4876 | DEFSYM (Qrehash_threshold, "rehash-threshold"); |
| 4837 | 4877 | ||
| 4838 | character_names = Qnil; | 4878 | DEFSYM (Qucs_names, "ucs-names"); |
| 4839 | staticpro (&character_names); | ||
| 4840 | } | 4879 | } |