aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPaul Eggert2016-04-25 10:41:29 -0700
committerPaul Eggert2016-04-25 10:42:48 -0700
commit86d083438dba60dc00e9e96414bf7e832720c05a (patch)
tree9ca5fac163acf4b1a3bca0e1e8b5c87af26e5a89
parentf069d854508946bcc03e4c77ceb430748e3ab6d7 (diff)
downloademacs-86d083438dba60dc00e9e96414bf7e832720c05a.tar.gz
emacs-86d083438dba60dc00e9e96414bf7e832720c05a.zip
New function ‘char-from-name’
This also fixes the mishandling of "\N{CJK COMPATIBILITY IDEOGRAPH-F900}", "\N{VARIATION SELECTOR-1}", etc. Problem reported by Eli Zaretskii in: http://lists.gnu.org/archive/html/emacs-devel/2016-04/msg00614.html * doc/lispref/nonascii.texi (Character Codes), etc/NEWS: Document this. * lisp/international/mule-cmds.el (char-from-name): New function. (read-char-by-name): Use it. Document that "BED" is treated as a name, not as a hexadecimal number. Reject out-of-range integers, floating-point numbers, and strings with trailing junk. * src/lread.c (character_name_to_code): Call char-from-name instead of inspecting ucs-names directly, so that we handle computed names like "VARIATION SELECTOR-1". Do not use an auto string, since char-from-name might GC. * test/src/lread-tests.el: Add tests for new behavior, and fix some old tests that were wrong.
-rw-r--r--doc/lispref/nonascii.texi12
-rw-r--r--etc/NEWS4
-rw-r--r--lisp/international/mule-cmds.el43
-rw-r--r--src/lread.c31
-rw-r--r--test/src/lread-tests.el48
5 files changed, 103 insertions, 35 deletions
diff --git a/doc/lispref/nonascii.texi b/doc/lispref/nonascii.texi
index 0e4aa86e48b..fd2ce3248fd 100644
--- a/doc/lispref/nonascii.texi
+++ b/doc/lispref/nonascii.texi
@@ -420,6 +420,18 @@ codepoint can have.
420@end example 420@end example
421@end defun 421@end defun
422 422
423@defun char-from-name string &optional ignore-case
424This function returns the character whose Unicode name is @var{string}.
425If @var{ignore-case} is non-@code{nil}, case is ignored in @var{string}.
426This function returns @code{nil} if @var{string} does not name a character.
427
428@example
429;; U+03A3
430(= (char-from-name "GREEK CAPITAL LETTER SIGMA") #x03A3)
431 @result{} t
432@end example
433@end defun
434
423@defun get-byte &optional pos string 435@defun get-byte &optional pos string
424This function returns the byte at character position @var{pos} in the 436This function returns the byte at character position @var{pos} in the
425current buffer. If the current buffer is unibyte, this is literally 437current buffer. If the current buffer is unibyte, this is literally
diff --git a/etc/NEWS b/etc/NEWS
index 6bdb648a7b0..e401d2db3a9 100644
--- a/etc/NEWS
+++ b/etc/NEWS
@@ -391,6 +391,10 @@ compares their numerical values. According to this predicate,
391"foo2.png" is smaller than "foo12.png". 391"foo2.png" is smaller than "foo12.png".
392 392
393+++ 393+++
394** The new function 'char-from-name' converts a Unicode name string
395to the corresponding character code.
396
397+++
394** New functions 'sxhash-eq' and 'sxhash-eql' return hash codes of a 398** New functions 'sxhash-eq' and 'sxhash-eql' return hash codes of a
395Lisp object suitable for use with 'eq' and 'eql' correspondingly. If 399Lisp object suitable for use with 'eq' and 'eql' correspondingly. If
396two objects are 'eq' ('eql'), then the result of 'sxhash-eq' 400two objects are 'eq' ('eql'), then the result of 'sxhash-eq'
diff --git a/lisp/international/mule-cmds.el b/lisp/international/mule-cmds.el
index 8eb320acea5..2ce21a88731 100644
--- a/lisp/international/mule-cmds.el
+++ b/lisp/international/mule-cmds.el
@@ -2978,6 +2978,27 @@ on encoding."
2978 (let ((char (assoc name ucs-names))) 2978 (let ((char (assoc name ucs-names)))
2979 (when char (format " (%c)" (cdr char))))) 2979 (when char (format " (%c)" (cdr char)))))
2980 2980
2981(defun char-from-name (string &optional ignore-case)
2982 "Return a character as a number from its Unicode name STRING.
2983If optional IGNORE-CASE is non-nil, ignore case in STRING.
2984Return nil if STRING does not name a character."
2985 (or (cdr (assoc-string string (ucs-names) ignore-case))
2986 (let ((minus (string-match-p "-[0-9A-F]+\\'" string)))
2987 (when minus
2988 ;; Parse names like "VARIATION SELECTOR-17" and "CJK
2989 ;; COMPATIBILITY IDEOGRAPH-F900" that are not in ucs-names.
2990 (ignore-errors
2991 (let* ((case-fold-search ignore-case)
2992 (vs (string-match-p "\\`VARIATION SELECTOR-" string))
2993 (minus-num (string-to-number (substring string minus)
2994 (if vs 10 16)))
2995 (vs-offset (if vs (if (< minus-num -16) #xE00EF #xFDFF) 0))
2996 (code (- vs-offset minus-num))
2997 (name (get-char-code-property code 'name)))
2998 (when (eq t (compare-strings string nil nil name nil nil
2999 ignore-case))
3000 code)))))))
3001
2981(defun read-char-by-name (prompt) 3002(defun read-char-by-name (prompt)
2982 "Read a character by its Unicode name or hex number string. 3003 "Read a character by its Unicode name or hex number string.
2983Display PROMPT and read a string that represents a character by its 3004Display PROMPT and read a string that represents a character by its
@@ -2991,9 +3012,11 @@ preceded by an asterisk `*' and use completion, it will show all
2991the characters whose names include that substring, not necessarily 3012the characters whose names include that substring, not necessarily
2992at the beginning of the name. 3013at the beginning of the name.
2993 3014
2994This function also accepts a hexadecimal number of Unicode code 3015Accept a name like \"CIRCULATION FUNCTION\", a hexadecimal
2995point or a number in hash notation, e.g. #o21430 for octal, 3016number like \"2A10\", or a number in hash notation (e.g.,
2996#x2318 for hex, or #10r8984 for decimal." 3017\"#x2a10\" for hex, \"10r10768\" for decimal, or \"#o25020\" for
3018octal). Treat otherwise-ambiguous strings like \"BED\" (U+1F6CF)
3019as names, not numbers."
2997 (let* ((enable-recursive-minibuffers t) 3020 (let* ((enable-recursive-minibuffers t)
2998 (completion-ignore-case t) 3021 (completion-ignore-case t)
2999 (input 3022 (input
@@ -3006,13 +3029,13 @@ point or a number in hash notation, e.g. #o21430 for octal,
3006 (category . unicode-name)) 3029 (category . unicode-name))
3007 (complete-with-action action (ucs-names) string pred))))) 3030 (complete-with-action action (ucs-names) string pred)))))
3008 (char 3031 (char
3009 (cond 3032 (cond
3010 ((string-match-p "\\`[0-9a-fA-F]+\\'" input) 3033 ((char-from-name input t))
3011 (string-to-number input 16)) 3034 ((string-match-p "\\`[0-9a-fA-F]+\\'" input)
3012 ((string-match-p "\\`#" input) 3035 (ignore-errors (string-to-number input 16)))
3013 (read input)) 3036 ((string-match-p "\\`#\\([bBoOxX]\\|[0-9]+[rR]\\)[0-9a-zA-Z]+\\'"
3014 (t 3037 input)
3015 (cdr (assoc-string input (ucs-names) t)))))) 3038 (ignore-errors (read input))))))
3016 (unless (characterp char) 3039 (unless (characterp char)
3017 (error "Invalid character")) 3040 (error "Invalid character"))
3018 char)) 3041 char))
diff --git a/src/lread.c b/src/lread.c
index a42c1f60c95..6e97e079650 100644
--- a/src/lread.c
+++ b/src/lread.c
@@ -2155,26 +2155,15 @@ grow_read_buffer (void)
2155static int 2155static int
2156character_name_to_code (char const *name, ptrdiff_t name_len) 2156character_name_to_code (char const *name, ptrdiff_t name_len)
2157{ 2157{
2158 Lisp_Object code; 2158 /* For "U+XXXX", pass the leading '+' to string_to_number to reject
2159 2159 monstrosities like "U+-0000". */
2160 /* Code point as U+XXXX.... */ 2160 Lisp_Object code
2161 if (name[0] == 'U' && name[1] == '+') 2161 = (name[0] == 'U' && name[1] == '+'
2162 { 2162 ? string_to_number (name + 1, 16, false)
2163 /* Pass the leading '+' to string_to_number, so that it 2163 : call2 (Qchar_from_name, make_unibyte_string (name, name_len), Qt));
2164 rejects monstrosities such as negative values. */ 2164
2165 code = string_to_number (name + 1, 16, false); 2165 if (! RANGED_INTEGERP (0, code, MAX_UNICODE_CHAR)
2166 } 2166 || char_surrogate_p (XINT (code)))
2167 else
2168 {
2169 /* Look up the name in the table returned by 'ucs-names'. */
2170 AUTO_STRING_WITH_LEN (namestr, name, name_len);
2171 Lisp_Object names = call0 (Qucs_names);
2172 code = CDR (Fassoc (namestr, names));
2173 }
2174
2175 if (! (INTEGERP (code)
2176 && 0 <= XINT (code) && XINT (code) <= MAX_UNICODE_CHAR
2177 && ! char_surrogate_p (XINT (code))))
2178 { 2167 {
2179 AUTO_STRING (format, "\\N{%s}"); 2168 AUTO_STRING (format, "\\N{%s}");
2180 AUTO_STRING_WITH_LEN (namestr, name, name_len); 2169 AUTO_STRING_WITH_LEN (namestr, name, name_len);
@@ -4829,5 +4818,5 @@ that are loaded before your customizations are read! */);
4829 DEFSYM (Qrehash_size, "rehash-size"); 4818 DEFSYM (Qrehash_size, "rehash-size");
4830 DEFSYM (Qrehash_threshold, "rehash-threshold"); 4819 DEFSYM (Qrehash_threshold, "rehash-threshold");
4831 4820
4832 DEFSYM (Qucs_names, "ucs-names"); 4821 DEFSYM (Qchar_from_name, "char-from-name");
4833} 4822}
diff --git a/test/src/lread-tests.el b/test/src/lread-tests.el
index 2ebaf491120..1a82d133a44 100644
--- a/test/src/lread-tests.el
+++ b/test/src/lread-tests.el
@@ -28,15 +28,55 @@
28(ert-deftest lread-char-number () 28(ert-deftest lread-char-number ()
29 (should (equal (read "?\\N{U+A817}") #xA817))) 29 (should (equal (read "?\\N{U+A817}") #xA817)))
30 30
31(ert-deftest lread-char-name () 31(ert-deftest lread-char-name-1 ()
32 (should (equal (read "?\\N{SYLOTI NAGRI LETTER \n DHO}") 32 (should (equal (read "?\\N{SYLOTI NAGRI LETTER \n DHO}")
33 #xA817))) 33 #xA817)))
34(ert-deftest lread-char-name-2 ()
35 (should (equal (read "?\\N{BED}") #x1F6CF)))
36(ert-deftest lread-char-name-3 ()
37 (should (equal (read "?\\N{U+BED}") #xBED)))
38(ert-deftest lread-char-name-4 ()
39 (should (equal (read "?\\N{VARIATION SELECTOR-1}") #xFE00)))
40(ert-deftest lread-char-name-5 ()
41 (should (equal (read "?\\N{VARIATION SELECTOR-16}") #xFE0F)))
42(ert-deftest lread-char-name-6 ()
43 (should (equal (read "?\\N{VARIATION SELECTOR-17}") #xE0100)))
44(ert-deftest lread-char-name-7 ()
45 (should (equal (read "?\\N{VARIATION SELECTOR-256}") #xE01EF)))
46(ert-deftest lread-char-name-8 ()
47 (should (equal (read "?\\N{CJK COMPATIBILITY IDEOGRAPH-F900}") #xF900)))
48(ert-deftest lread-char-name-9 ()
49 (should (equal (read "?\\N{CJK COMPATIBILITY IDEOGRAPH-FAD9}") #xFAD9)))
50(ert-deftest lread-char-name-10 ()
51 (should (equal (read "?\\N{CJK COMPATIBILITY IDEOGRAPH-2F800}") #x2F800)))
52(ert-deftest lread-char-name-11 ()
53 (should (equal (read "?\\N{CJK COMPATIBILITY IDEOGRAPH-2FA1D}") #x2FA1D)))
34 54
35(ert-deftest lread-char-invalid-number () 55(ert-deftest lread-char-invalid-number ()
36 (should-error (read "?\\N{U+110000}") :type 'invalid-read-syntax)) 56 (should-error (read "?\\N{U+110000}") :type 'invalid-read-syntax))
37 57
38(ert-deftest lread-char-invalid-name () 58(ert-deftest lread-char-invalid-name-1 ()
39 (should-error (read "?\\N{DOES NOT EXIST}")) :type 'invalid-read-syntax) 59 (should-error (read "?\\N{DOES NOT EXIST}")) :type 'invalid-read-syntax)
60(ert-deftest lread-char-invalid-name-2 ()
61 (should-error (read "?\\N{VARIATION SELECTOR-0}")) :type 'invalid-read-syntax)
62(ert-deftest lread-char-invalid-name-3 ()
63 (should-error (read "?\\N{VARIATION SELECTOR-257}"))
64 :type 'invalid-read-syntax)
65(ert-deftest lread-char-invalid-name-4 ()
66 (should-error (read "?\\N{VARIATION SELECTOR--0}"))
67 :type 'invalid-read-syntax)
68(ert-deftest lread-char-invalid-name-5 ()
69 (should-error (read "?\\N{CJK COMPATIBILITY IDEOGRAPH-F8FF}"))
70 :type 'invalid-read-syntax)
71(ert-deftest lread-char-invalid-name-6 ()
72 (should-error (read "?\\N{CJK COMPATIBILITY IDEOGRAPH-FADA}"))
73 :type 'invalid-read-syntax)
74(ert-deftest lread-char-invalid-name-7 ()
75 (should-error (read "?\\N{CJK COMPATIBILITY IDEOGRAPH-2F7FF}"))
76 :type 'invalid-read-syntax)
77(ert-deftest lread-char-invalid-name-8 ()
78 (should-error (read "?\\N{CJK COMPATIBILITY IDEOGRAPH-2FA1E}"))
79 :type 'invalid-read-syntax)
40 80
41(ert-deftest lread-char-non-ascii-name () 81(ert-deftest lread-char-non-ascii-name ()
42 (should-error (read "?\\N{LATIN CAPITAL LETTER Ø}") 82 (should-error (read "?\\N{LATIN CAPITAL LETTER Ø}")
@@ -55,13 +95,13 @@
55 (should-error (read "?\\N{U+DFFF}") :type 'invalid-read-syntax)) 95 (should-error (read "?\\N{U+DFFF}") :type 'invalid-read-syntax))
56 96
57(ert-deftest lread-string-char-number-1 () 97(ert-deftest lread-string-char-number-1 ()
58 (should (equal (read "a\\N{U+A817}b") "a\uA817bx"))) 98 (should (equal (read "\"a\\N{U+A817}b\"") "a\uA817b")))
59(ert-deftest lread-string-char-number-2 () 99(ert-deftest lread-string-char-number-2 ()
60 (should-error (read "?\\N{0.5}") :type 'invalid-read-syntax)) 100 (should-error (read "?\\N{0.5}") :type 'invalid-read-syntax))
61(ert-deftest lread-string-char-number-3 () 101(ert-deftest lread-string-char-number-3 ()
62 (should-error (read "?\\N{U+-0}") :type 'invalid-read-syntax)) 102 (should-error (read "?\\N{U+-0}") :type 'invalid-read-syntax))
63 103
64(ert-deftest lread-string-char-name () 104(ert-deftest lread-string-char-name ()
65 (should (equal (read "a\\N{SYLOTI NAGRI LETTER DHO}b") "a\uA817b"))) 105 (should (equal (read "\"a\\N{SYLOTI NAGRI LETTER DHO}b\"") "a\uA817b")))
66 106
67;;; lread-tests.el ends here 107;;; lread-tests.el ends here