diff options
| author | Paul Eggert | 2016-04-25 10:41:29 -0700 |
|---|---|---|
| committer | Paul Eggert | 2016-04-25 10:42:48 -0700 |
| commit | 86d083438dba60dc00e9e96414bf7e832720c05a (patch) | |
| tree | 9ca5fac163acf4b1a3bca0e1e8b5c87af26e5a89 | |
| parent | f069d854508946bcc03e4c77ceb430748e3ab6d7 (diff) | |
| download | emacs-86d083438dba60dc00e9e96414bf7e832720c05a.tar.gz emacs-86d083438dba60dc00e9e96414bf7e832720c05a.zip | |
New function ‘char-from-name’
This also fixes the mishandling of "\N{CJK COMPATIBILITY
IDEOGRAPH-F900}", "\N{VARIATION SELECTOR-1}", etc.
Problem reported by Eli Zaretskii in:
http://lists.gnu.org/archive/html/emacs-devel/2016-04/msg00614.html
* doc/lispref/nonascii.texi (Character Codes), etc/NEWS: Document this.
* lisp/international/mule-cmds.el (char-from-name): New function.
(read-char-by-name): Use it. Document that "BED" is treated as
a name, not as a hexadecimal number. Reject out-of-range integers,
floating-point numbers, and strings with trailing junk.
* src/lread.c (character_name_to_code): Call char-from-name
instead of inspecting ucs-names directly, so that we handle
computed names like "VARIATION SELECTOR-1". Do not use an auto
string, since char-from-name might GC.
* test/src/lread-tests.el: Add tests for new behavior, and
fix some old tests that were wrong.
| -rw-r--r-- | doc/lispref/nonascii.texi | 12 | ||||
| -rw-r--r-- | etc/NEWS | 4 | ||||
| -rw-r--r-- | lisp/international/mule-cmds.el | 43 | ||||
| -rw-r--r-- | src/lread.c | 31 | ||||
| -rw-r--r-- | test/src/lread-tests.el | 48 |
5 files changed, 103 insertions, 35 deletions
diff --git a/doc/lispref/nonascii.texi b/doc/lispref/nonascii.texi index 0e4aa86e48b..fd2ce3248fd 100644 --- a/doc/lispref/nonascii.texi +++ b/doc/lispref/nonascii.texi | |||
| @@ -420,6 +420,18 @@ codepoint can have. | |||
| 420 | @end example | 420 | @end example |
| 421 | @end defun | 421 | @end defun |
| 422 | 422 | ||
| 423 | @defun char-from-name string &optional ignore-case | ||
| 424 | This function returns the character whose Unicode name is @var{string}. | ||
| 425 | If @var{ignore-case} is non-@code{nil}, case is ignored in @var{string}. | ||
| 426 | This function returns @code{nil} if @var{string} does not name a character. | ||
| 427 | |||
| 428 | @example | ||
| 429 | ;; U+03A3 | ||
| 430 | (= (char-from-name "GREEK CAPITAL LETTER SIGMA") #x03A3) | ||
| 431 | @result{} t | ||
| 432 | @end example | ||
| 433 | @end defun | ||
| 434 | |||
| 423 | @defun get-byte &optional pos string | 435 | @defun get-byte &optional pos string |
| 424 | This function returns the byte at character position @var{pos} in the | 436 | This function returns the byte at character position @var{pos} in the |
| 425 | current buffer. If the current buffer is unibyte, this is literally | 437 | current buffer. If the current buffer is unibyte, this is literally |
| @@ -391,6 +391,10 @@ compares their numerical values. According to this predicate, | |||
| 391 | "foo2.png" is smaller than "foo12.png". | 391 | "foo2.png" is smaller than "foo12.png". |
| 392 | 392 | ||
| 393 | +++ | 393 | +++ |
| 394 | ** The new function 'char-from-name' converts a Unicode name string | ||
| 395 | to the corresponding character code. | ||
| 396 | |||
| 397 | +++ | ||
| 394 | ** New functions 'sxhash-eq' and 'sxhash-eql' return hash codes of a | 398 | ** New functions 'sxhash-eq' and 'sxhash-eql' return hash codes of a |
| 395 | Lisp object suitable for use with 'eq' and 'eql' correspondingly. If | 399 | Lisp object suitable for use with 'eq' and 'eql' correspondingly. If |
| 396 | two objects are 'eq' ('eql'), then the result of 'sxhash-eq' | 400 | two objects are 'eq' ('eql'), then the result of 'sxhash-eq' |
diff --git a/lisp/international/mule-cmds.el b/lisp/international/mule-cmds.el index 8eb320acea5..2ce21a88731 100644 --- a/lisp/international/mule-cmds.el +++ b/lisp/international/mule-cmds.el | |||
| @@ -2978,6 +2978,27 @@ on encoding." | |||
| 2978 | (let ((char (assoc name ucs-names))) | 2978 | (let ((char (assoc name ucs-names))) |
| 2979 | (when char (format " (%c)" (cdr char))))) | 2979 | (when char (format " (%c)" (cdr char))))) |
| 2980 | 2980 | ||
| 2981 | (defun char-from-name (string &optional ignore-case) | ||
| 2982 | "Return a character as a number from its Unicode name STRING. | ||
| 2983 | If optional IGNORE-CASE is non-nil, ignore case in STRING. | ||
| 2984 | Return nil if STRING does not name a character." | ||
| 2985 | (or (cdr (assoc-string string (ucs-names) ignore-case)) | ||
| 2986 | (let ((minus (string-match-p "-[0-9A-F]+\\'" string))) | ||
| 2987 | (when minus | ||
| 2988 | ;; Parse names like "VARIATION SELECTOR-17" and "CJK | ||
| 2989 | ;; COMPATIBILITY IDEOGRAPH-F900" that are not in ucs-names. | ||
| 2990 | (ignore-errors | ||
| 2991 | (let* ((case-fold-search ignore-case) | ||
| 2992 | (vs (string-match-p "\\`VARIATION SELECTOR-" string)) | ||
| 2993 | (minus-num (string-to-number (substring string minus) | ||
| 2994 | (if vs 10 16))) | ||
| 2995 | (vs-offset (if vs (if (< minus-num -16) #xE00EF #xFDFF) 0)) | ||
| 2996 | (code (- vs-offset minus-num)) | ||
| 2997 | (name (get-char-code-property code 'name))) | ||
| 2998 | (when (eq t (compare-strings string nil nil name nil nil | ||
| 2999 | ignore-case)) | ||
| 3000 | code))))))) | ||
| 3001 | |||
| 2981 | (defun read-char-by-name (prompt) | 3002 | (defun read-char-by-name (prompt) |
| 2982 | "Read a character by its Unicode name or hex number string. | 3003 | "Read a character by its Unicode name or hex number string. |
| 2983 | Display PROMPT and read a string that represents a character by its | 3004 | Display PROMPT and read a string that represents a character by its |
| @@ -2991,9 +3012,11 @@ preceded by an asterisk `*' and use completion, it will show all | |||
| 2991 | the characters whose names include that substring, not necessarily | 3012 | the characters whose names include that substring, not necessarily |
| 2992 | at the beginning of the name. | 3013 | at the beginning of the name. |
| 2993 | 3014 | ||
| 2994 | This function also accepts a hexadecimal number of Unicode code | 3015 | Accept a name like \"CIRCULATION FUNCTION\", a hexadecimal |
| 2995 | point or a number in hash notation, e.g. #o21430 for octal, | 3016 | number like \"2A10\", or a number in hash notation (e.g., |
| 2996 | #x2318 for hex, or #10r8984 for decimal." | 3017 | \"#x2a10\" for hex, \"10r10768\" for decimal, or \"#o25020\" for |
| 3018 | octal). Treat otherwise-ambiguous strings like \"BED\" (U+1F6CF) | ||
| 3019 | as names, not numbers." | ||
| 2997 | (let* ((enable-recursive-minibuffers t) | 3020 | (let* ((enable-recursive-minibuffers t) |
| 2998 | (completion-ignore-case t) | 3021 | (completion-ignore-case t) |
| 2999 | (input | 3022 | (input |
| @@ -3006,13 +3029,13 @@ point or a number in hash notation, e.g. #o21430 for octal, | |||
| 3006 | (category . unicode-name)) | 3029 | (category . unicode-name)) |
| 3007 | (complete-with-action action (ucs-names) string pred))))) | 3030 | (complete-with-action action (ucs-names) string pred))))) |
| 3008 | (char | 3031 | (char |
| 3009 | (cond | 3032 | (cond |
| 3010 | ((string-match-p "\\`[0-9a-fA-F]+\\'" input) | 3033 | ((char-from-name input t)) |
| 3011 | (string-to-number input 16)) | 3034 | ((string-match-p "\\`[0-9a-fA-F]+\\'" input) |
| 3012 | ((string-match-p "\\`#" input) | 3035 | (ignore-errors (string-to-number input 16))) |
| 3013 | (read input)) | 3036 | ((string-match-p "\\`#\\([bBoOxX]\\|[0-9]+[rR]\\)[0-9a-zA-Z]+\\'" |
| 3014 | (t | 3037 | input) |
| 3015 | (cdr (assoc-string input (ucs-names) t)))))) | 3038 | (ignore-errors (read input)))))) |
| 3016 | (unless (characterp char) | 3039 | (unless (characterp char) |
| 3017 | (error "Invalid character")) | 3040 | (error "Invalid character")) |
| 3018 | char)) | 3041 | char)) |
diff --git a/src/lread.c b/src/lread.c index a42c1f60c95..6e97e079650 100644 --- a/src/lread.c +++ b/src/lread.c | |||
| @@ -2155,26 +2155,15 @@ grow_read_buffer (void) | |||
| 2155 | static int | 2155 | static int |
| 2156 | character_name_to_code (char const *name, ptrdiff_t name_len) | 2156 | character_name_to_code (char const *name, ptrdiff_t name_len) |
| 2157 | { | 2157 | { |
| 2158 | Lisp_Object code; | 2158 | /* For "U+XXXX", pass the leading '+' to string_to_number to reject |
| 2159 | 2159 | monstrosities like "U+-0000". */ | |
| 2160 | /* Code point as U+XXXX.... */ | 2160 | Lisp_Object code |
| 2161 | if (name[0] == 'U' && name[1] == '+') | 2161 | = (name[0] == 'U' && name[1] == '+' |
| 2162 | { | 2162 | ? string_to_number (name + 1, 16, false) |
| 2163 | /* Pass the leading '+' to string_to_number, so that it | 2163 | : call2 (Qchar_from_name, make_unibyte_string (name, name_len), Qt)); |
| 2164 | rejects monstrosities such as negative values. */ | 2164 | |
| 2165 | code = string_to_number (name + 1, 16, false); | 2165 | if (! RANGED_INTEGERP (0, code, MAX_UNICODE_CHAR) |
| 2166 | } | 2166 | || char_surrogate_p (XINT (code))) |
| 2167 | else | ||
| 2168 | { | ||
| 2169 | /* Look up the name in the table returned by 'ucs-names'. */ | ||
| 2170 | AUTO_STRING_WITH_LEN (namestr, name, name_len); | ||
| 2171 | Lisp_Object names = call0 (Qucs_names); | ||
| 2172 | code = CDR (Fassoc (namestr, names)); | ||
| 2173 | } | ||
| 2174 | |||
| 2175 | if (! (INTEGERP (code) | ||
| 2176 | && 0 <= XINT (code) && XINT (code) <= MAX_UNICODE_CHAR | ||
| 2177 | && ! char_surrogate_p (XINT (code)))) | ||
| 2178 | { | 2167 | { |
| 2179 | AUTO_STRING (format, "\\N{%s}"); | 2168 | AUTO_STRING (format, "\\N{%s}"); |
| 2180 | AUTO_STRING_WITH_LEN (namestr, name, name_len); | 2169 | AUTO_STRING_WITH_LEN (namestr, name, name_len); |
| @@ -4829,5 +4818,5 @@ that are loaded before your customizations are read! */); | |||
| 4829 | DEFSYM (Qrehash_size, "rehash-size"); | 4818 | DEFSYM (Qrehash_size, "rehash-size"); |
| 4830 | DEFSYM (Qrehash_threshold, "rehash-threshold"); | 4819 | DEFSYM (Qrehash_threshold, "rehash-threshold"); |
| 4831 | 4820 | ||
| 4832 | DEFSYM (Qucs_names, "ucs-names"); | 4821 | DEFSYM (Qchar_from_name, "char-from-name"); |
| 4833 | } | 4822 | } |
diff --git a/test/src/lread-tests.el b/test/src/lread-tests.el index 2ebaf491120..1a82d133a44 100644 --- a/test/src/lread-tests.el +++ b/test/src/lread-tests.el | |||
| @@ -28,15 +28,55 @@ | |||
| 28 | (ert-deftest lread-char-number () | 28 | (ert-deftest lread-char-number () |
| 29 | (should (equal (read "?\\N{U+A817}") #xA817))) | 29 | (should (equal (read "?\\N{U+A817}") #xA817))) |
| 30 | 30 | ||
| 31 | (ert-deftest lread-char-name () | 31 | (ert-deftest lread-char-name-1 () |
| 32 | (should (equal (read "?\\N{SYLOTI NAGRI LETTER \n DHO}") | 32 | (should (equal (read "?\\N{SYLOTI NAGRI LETTER \n DHO}") |
| 33 | #xA817))) | 33 | #xA817))) |
| 34 | (ert-deftest lread-char-name-2 () | ||
| 35 | (should (equal (read "?\\N{BED}") #x1F6CF))) | ||
| 36 | (ert-deftest lread-char-name-3 () | ||
| 37 | (should (equal (read "?\\N{U+BED}") #xBED))) | ||
| 38 | (ert-deftest lread-char-name-4 () | ||
| 39 | (should (equal (read "?\\N{VARIATION SELECTOR-1}") #xFE00))) | ||
| 40 | (ert-deftest lread-char-name-5 () | ||
| 41 | (should (equal (read "?\\N{VARIATION SELECTOR-16}") #xFE0F))) | ||
| 42 | (ert-deftest lread-char-name-6 () | ||
| 43 | (should (equal (read "?\\N{VARIATION SELECTOR-17}") #xE0100))) | ||
| 44 | (ert-deftest lread-char-name-7 () | ||
| 45 | (should (equal (read "?\\N{VARIATION SELECTOR-256}") #xE01EF))) | ||
| 46 | (ert-deftest lread-char-name-8 () | ||
| 47 | (should (equal (read "?\\N{CJK COMPATIBILITY IDEOGRAPH-F900}") #xF900))) | ||
| 48 | (ert-deftest lread-char-name-9 () | ||
| 49 | (should (equal (read "?\\N{CJK COMPATIBILITY IDEOGRAPH-FAD9}") #xFAD9))) | ||
| 50 | (ert-deftest lread-char-name-10 () | ||
| 51 | (should (equal (read "?\\N{CJK COMPATIBILITY IDEOGRAPH-2F800}") #x2F800))) | ||
| 52 | (ert-deftest lread-char-name-11 () | ||
| 53 | (should (equal (read "?\\N{CJK COMPATIBILITY IDEOGRAPH-2FA1D}") #x2FA1D))) | ||
| 34 | 54 | ||
| 35 | (ert-deftest lread-char-invalid-number () | 55 | (ert-deftest lread-char-invalid-number () |
| 36 | (should-error (read "?\\N{U+110000}") :type 'invalid-read-syntax)) | 56 | (should-error (read "?\\N{U+110000}") :type 'invalid-read-syntax)) |
| 37 | 57 | ||
| 38 | (ert-deftest lread-char-invalid-name () | 58 | (ert-deftest lread-char-invalid-name-1 () |
| 39 | (should-error (read "?\\N{DOES NOT EXIST}")) :type 'invalid-read-syntax) | 59 | (should-error (read "?\\N{DOES NOT EXIST}")) :type 'invalid-read-syntax) |
| 60 | (ert-deftest lread-char-invalid-name-2 () | ||
| 61 | (should-error (read "?\\N{VARIATION SELECTOR-0}")) :type 'invalid-read-syntax) | ||
| 62 | (ert-deftest lread-char-invalid-name-3 () | ||
| 63 | (should-error (read "?\\N{VARIATION SELECTOR-257}")) | ||
| 64 | :type 'invalid-read-syntax) | ||
| 65 | (ert-deftest lread-char-invalid-name-4 () | ||
| 66 | (should-error (read "?\\N{VARIATION SELECTOR--0}")) | ||
| 67 | :type 'invalid-read-syntax) | ||
| 68 | (ert-deftest lread-char-invalid-name-5 () | ||
| 69 | (should-error (read "?\\N{CJK COMPATIBILITY IDEOGRAPH-F8FF}")) | ||
| 70 | :type 'invalid-read-syntax) | ||
| 71 | (ert-deftest lread-char-invalid-name-6 () | ||
| 72 | (should-error (read "?\\N{CJK COMPATIBILITY IDEOGRAPH-FADA}")) | ||
| 73 | :type 'invalid-read-syntax) | ||
| 74 | (ert-deftest lread-char-invalid-name-7 () | ||
| 75 | (should-error (read "?\\N{CJK COMPATIBILITY IDEOGRAPH-2F7FF}")) | ||
| 76 | :type 'invalid-read-syntax) | ||
| 77 | (ert-deftest lread-char-invalid-name-8 () | ||
| 78 | (should-error (read "?\\N{CJK COMPATIBILITY IDEOGRAPH-2FA1E}")) | ||
| 79 | :type 'invalid-read-syntax) | ||
| 40 | 80 | ||
| 41 | (ert-deftest lread-char-non-ascii-name () | 81 | (ert-deftest lread-char-non-ascii-name () |
| 42 | (should-error (read "?\\N{LATIN CAPITAL LETTER Ø}") | 82 | (should-error (read "?\\N{LATIN CAPITAL LETTER Ø}") |
| @@ -55,13 +95,13 @@ | |||
| 55 | (should-error (read "?\\N{U+DFFF}") :type 'invalid-read-syntax)) | 95 | (should-error (read "?\\N{U+DFFF}") :type 'invalid-read-syntax)) |
| 56 | 96 | ||
| 57 | (ert-deftest lread-string-char-number-1 () | 97 | (ert-deftest lread-string-char-number-1 () |
| 58 | (should (equal (read "a\\N{U+A817}b") "a\uA817bx"))) | 98 | (should (equal (read "\"a\\N{U+A817}b\"") "a\uA817b"))) |
| 59 | (ert-deftest lread-string-char-number-2 () | 99 | (ert-deftest lread-string-char-number-2 () |
| 60 | (should-error (read "?\\N{0.5}") :type 'invalid-read-syntax)) | 100 | (should-error (read "?\\N{0.5}") :type 'invalid-read-syntax)) |
| 61 | (ert-deftest lread-string-char-number-3 () | 101 | (ert-deftest lread-string-char-number-3 () |
| 62 | (should-error (read "?\\N{U+-0}") :type 'invalid-read-syntax)) | 102 | (should-error (read "?\\N{U+-0}") :type 'invalid-read-syntax)) |
| 63 | 103 | ||
| 64 | (ert-deftest lread-string-char-name () | 104 | (ert-deftest lread-string-char-name () |
| 65 | (should (equal (read "a\\N{SYLOTI NAGRI LETTER DHO}b") "a\uA817b"))) | 105 | (should (equal (read "\"a\\N{SYLOTI NAGRI LETTER DHO}b\"") "a\uA817b"))) |
| 66 | 106 | ||
| 67 | ;;; lread-tests.el ends here | 107 | ;;; lread-tests.el ends here |