diff options
| author | Philipp Stephani | 2016-04-21 14:45:22 -0700 |
|---|---|---|
| committer | Paul Eggert | 2016-04-21 19:29:40 -0700 |
| commit | de7d5f36e0f3261a7300fa3a3d87ae3b758b8a73 (patch) | |
| tree | d929612c6d9ae099782dfe67b2d03041b9395a4f | |
| parent | 7621a521452d988b27e761c76ad8e667e932192e (diff) | |
| download | emacs-de7d5f36e0f3261a7300fa3a3d87ae3b758b8a73.tar.gz emacs-de7d5f36e0f3261a7300fa3a3d87ae3b758b8a73.zip | |
Implement named character escapes, similar to Perl
* lread.c (init_character_names): New function.
(read_escape): Read Perl-style named character escape sequences.
(syms_of_lread): Initialize new variable 'character_names'.
* test/src/lread-tests.el (lread-char-empty-name): Add test file
for src/lread.c.
| -rw-r--r-- | src/lread.c | 96 | ||||
| -rw-r--r-- | test/src/lread-tests.el | 54 |
2 files changed, 150 insertions, 0 deletions
diff --git a/src/lread.c b/src/lread.c index fedfcb807c8..9fa46a875be 100644 --- a/src/lread.c +++ b/src/lread.c | |||
| @@ -43,6 +43,7 @@ along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */ | |||
| 43 | #include "systime.h" | 43 | #include "systime.h" |
| 44 | #include "termhooks.h" | 44 | #include "termhooks.h" |
| 45 | #include "blockinput.h" | 45 | #include "blockinput.h" |
| 46 | #include <c-ctype.h> | ||
| 46 | 47 | ||
| 47 | #ifdef MSDOS | 48 | #ifdef MSDOS |
| 48 | #include "msdos.h" | 49 | #include "msdos.h" |
| @@ -2149,6 +2150,36 @@ grow_read_buffer (void) | |||
| 2149 | MAX_MULTIBYTE_LENGTH, -1, 1); | 2150 | MAX_MULTIBYTE_LENGTH, -1, 1); |
| 2150 | } | 2151 | } |
| 2151 | 2152 | ||
| 2153 | /* Hash table that maps Unicode character names to code points. */ | ||
| 2154 | static Lisp_Object character_names; | ||
| 2155 | |||
| 2156 | /* Length of the longest Unicode character name, in bytes. */ | ||
| 2157 | static ptrdiff_t max_character_name_length; | ||
| 2158 | |||
| 2159 | /* Initializes `character_names' and `max_character_name_length'. | ||
| 2160 | Called by `read_escape'. */ | ||
| 2161 | void init_character_names () | ||
| 2162 | { | ||
| 2163 | character_names = CALLN (Fmake_hash_table, | ||
| 2164 | QCtest, Qequal, | ||
| 2165 | /* Currently around 100,000 Unicode | ||
| 2166 | characters are defined. */ | ||
| 2167 | QCsize, make_natnum (100000)); | ||
| 2168 | const Lisp_Object get_property = | ||
| 2169 | Fsymbol_function (intern_c_string ("get-char-code-property")); | ||
| 2170 | ptrdiff_t length = 0; | ||
| 2171 | for (int i = 0; i <= MAX_UNICODE_CHAR; ++i) | ||
| 2172 | { | ||
| 2173 | const Lisp_Object code = make_natnum (i); | ||
| 2174 | const Lisp_Object name = call2 (get_property, code, Qname); | ||
| 2175 | if (NILP (name)) continue; | ||
| 2176 | CHECK_STRING (name); | ||
| 2177 | length = max (length, SBYTES (name)); | ||
| 2178 | Fputhash (name, code, character_names); | ||
| 2179 | } | ||
| 2180 | max_character_name_length = length; | ||
| 2181 | } | ||
| 2182 | |||
| 2152 | /* Read a \-escape sequence, assuming we already read the `\'. | 2183 | /* Read a \-escape sequence, assuming we already read the `\'. |
| 2153 | If the escape sequence forces unibyte, return eight-bit char. */ | 2184 | If the escape sequence forces unibyte, return eight-bit char. */ |
| 2154 | 2185 | ||
| @@ -2356,6 +2387,68 @@ read_escape (Lisp_Object readcharfun, bool stringp) | |||
| 2356 | return i; | 2387 | return i; |
| 2357 | } | 2388 | } |
| 2358 | 2389 | ||
| 2390 | case 'N': | ||
| 2391 | /* Named character. */ | ||
| 2392 | { | ||
| 2393 | c = READCHAR; | ||
| 2394 | if (c != '{') | ||
| 2395 | invalid_syntax ("Expected opening brace after \\N"); | ||
| 2396 | if (NILP (character_names)) | ||
| 2397 | init_character_names (); | ||
| 2398 | USE_SAFE_ALLOCA; | ||
| 2399 | char *name = SAFE_ALLOCA (max_character_name_length + 1); | ||
| 2400 | bool whitespace = false; | ||
| 2401 | ptrdiff_t length = 0; | ||
| 2402 | while (true) | ||
| 2403 | { | ||
| 2404 | c = READCHAR; | ||
| 2405 | if (c < 0) | ||
| 2406 | end_of_file_error (); | ||
| 2407 | if (c == '}') | ||
| 2408 | break; | ||
| 2409 | if (! c_isascii (c)) | ||
| 2410 | xsignal1 (Qinvalid_read_syntax, | ||
| 2411 | CALLN (Fformat, | ||
| 2412 | build_pure_c_string ("Non-ASCII character U+%04X" | ||
| 2413 | " in character name"), | ||
| 2414 | make_natnum (c))); | ||
| 2415 | /* We treat multiple adjacent whitespace characters as a | ||
| 2416 | single space character. This makes it easier to use | ||
| 2417 | character names in e.g. multi-line strings. */ | ||
| 2418 | if (c_isspace (c)) | ||
| 2419 | { | ||
| 2420 | if (! whitespace) | ||
| 2421 | { | ||
| 2422 | whitespace = true; | ||
| 2423 | name[length++] = ' '; | ||
| 2424 | } | ||
| 2425 | } | ||
| 2426 | else | ||
| 2427 | { | ||
| 2428 | whitespace = false; | ||
| 2429 | name[length++] = c; | ||
| 2430 | } | ||
| 2431 | if (length >= max_character_name_length) | ||
| 2432 | invalid_syntax ("Character name too long"); | ||
| 2433 | } | ||
| 2434 | if (length == 0) | ||
| 2435 | invalid_syntax ("Empty character name"); | ||
| 2436 | name[length] = 0; | ||
| 2437 | const Lisp_Object lisp_name = make_unibyte_string (name, length); | ||
| 2438 | const Lisp_Object code = | ||
| 2439 | (length >= 3 && length <= 10 && name[0] == 'U' && name[1] == '+') ? | ||
| 2440 | /* Code point as U+N, where N is between 1 and 8 hexadecimal | ||
| 2441 | digits. */ | ||
| 2442 | string_to_number (name + 2, 16, false) : | ||
| 2443 | Fgethash (lisp_name, character_names, Qnil); | ||
| 2444 | SAFE_FREE (); | ||
| 2445 | if (! RANGED_INTEGERP (0, code, MAX_UNICODE_CHAR)) | ||
| 2446 | xsignal1 (Qinvalid_read_syntax, | ||
| 2447 | CALLN (Fformat, | ||
| 2448 | build_pure_c_string ("\\N{%s}"), lisp_name)); | ||
| 2449 | return XINT (code); | ||
| 2450 | } | ||
| 2451 | |||
| 2359 | default: | 2452 | default: |
| 2360 | return c; | 2453 | return c; |
| 2361 | } | 2454 | } |
| @@ -4744,4 +4837,7 @@ that are loaded before your customizations are read! */); | |||
| 4744 | DEFSYM (Qweakness, "weakness"); | 4837 | DEFSYM (Qweakness, "weakness"); |
| 4745 | DEFSYM (Qrehash_size, "rehash-size"); | 4838 | DEFSYM (Qrehash_size, "rehash-size"); |
| 4746 | DEFSYM (Qrehash_threshold, "rehash-threshold"); | 4839 | DEFSYM (Qrehash_threshold, "rehash-threshold"); |
| 4840 | |||
| 4841 | character_names = Qnil; | ||
| 4842 | staticpro (&character_names); | ||
| 4747 | } | 4843 | } |
diff --git a/test/src/lread-tests.el b/test/src/lread-tests.el new file mode 100644 index 00000000000..1f873340c56 --- /dev/null +++ b/test/src/lread-tests.el | |||
| @@ -0,0 +1,54 @@ | |||
| 1 | ;;; lread-tests.el --- tests for lread.c -*- lexical-binding: t; -*- | ||
| 2 | |||
| 3 | ;; Copyright (C) 2016 Google Inc. | ||
| 4 | |||
| 5 | ;; Author: Philipp Stephani <phst@google.com> | ||
| 6 | |||
| 7 | ;; This file is part of GNU Emacs. | ||
| 8 | |||
| 9 | ;; This program is free software; you can redistribute it and/or modify | ||
| 10 | ;; it under the terms of the GNU General Public License as published by | ||
| 11 | ;; the Free Software Foundation, either version 3 of the License, or | ||
| 12 | ;; (at your option) any later version. | ||
| 13 | |||
| 14 | ;; This program is distributed in the hope that it will be useful, | ||
| 15 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 16 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 17 | ;; GNU General Public License for more details. | ||
| 18 | |||
| 19 | ;; You should have received a copy of the GNU General Public License | ||
| 20 | ;; along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
| 21 | |||
| 22 | ;;; Commentary: | ||
| 23 | |||
| 24 | ;; Unit tests for code in src/lread.c. | ||
| 25 | |||
| 26 | ;;; Code: | ||
| 27 | |||
| 28 | (ert-deftest lread-char-number () | ||
| 29 | (should (equal ?\N{U+A817} #xA817))) | ||
| 30 | |||
| 31 | (ert-deftest lread-char-name () | ||
| 32 | (should (equal ?\N{SYLOTI NAGRI LETTER | ||
| 33 | DHO} | ||
| 34 | #xA817))) | ||
| 35 | |||
| 36 | (ert-deftest lread-char-invalid-number () | ||
| 37 | (should-error (read "?\\N{U+110000}") :type 'invalid-read-syntax)) | ||
| 38 | |||
| 39 | (ert-deftest lread-char-invalid-name () | ||
| 40 | (should-error (read "?\\N{DOES NOT EXIST}")) :type 'invalid-read-syntax) | ||
| 41 | |||
| 42 | (ert-deftest lread-char-non-ascii-name () | ||
| 43 | (should-error (read "?\\N{LATIN CAPITAL LETTER Ø}")) 'invalid-read-syntax) | ||
| 44 | |||
| 45 | (ert-deftest lread-char-empty-name () | ||
| 46 | (should-error (read "?\\N{}")) 'invalid-read-syntax) | ||
| 47 | |||
| 48 | (ert-deftest lread-string-char-number () | ||
| 49 | (should (equal "a\N{U+A817}b" "a\uA817b"))) | ||
| 50 | |||
| 51 | (ert-deftest lread-string-char-name () | ||
| 52 | (should (equal "a\N{SYLOTI NAGRI LETTER DHO}b" "a\uA817b"))) | ||
| 53 | |||
| 54 | ;;; lread-tests.el ends here | ||