diff options
| -rw-r--r-- | doc/lispref/nonascii.texi | 15 | ||||
| -rw-r--r-- | doc/lispref/objects.texi | 52 | ||||
| -rw-r--r-- | src/character.h | 13 | ||||
| -rw-r--r-- | src/lread.c | 104 | ||||
| -rw-r--r-- | test/src/lread-tests.el | 32 |
5 files changed, 89 insertions, 127 deletions
diff --git a/doc/lispref/nonascii.texi b/doc/lispref/nonascii.texi index 66ad9aca71e..0e4aa86e48b 100644 --- a/doc/lispref/nonascii.texi +++ b/doc/lispref/nonascii.texi | |||
| @@ -622,18 +622,21 @@ This function returns the value of @var{char}'s @var{propname} property. | |||
| 622 | @result{} Nd | 622 | @result{} Nd |
| 623 | @end group | 623 | @end group |
| 624 | @group | 624 | @group |
| 625 | ;; U+2084 SUBSCRIPT FOUR | 625 | ;; U+2084 |
| 626 | (get-char-code-property ?\u2084 'digit-value) | 626 | (get-char-code-property ?\N@{SUBSCRIPT FOUR@} |
| 627 | 'digit-value) | ||
| 627 | @result{} 4 | 628 | @result{} 4 |
| 628 | @end group | 629 | @end group |
| 629 | @group | 630 | @group |
| 630 | ;; U+2155 VULGAR FRACTION ONE FIFTH | 631 | ;; U+2155 |
| 631 | (get-char-code-property ?\u2155 'numeric-value) | 632 | (get-char-code-property ?\N@{VULGAR FRACTION ONE FIFTH@} |
| 633 | 'numeric-value) | ||
| 632 | @result{} 0.2 | 634 | @result{} 0.2 |
| 633 | @end group | 635 | @end group |
| 634 | @group | 636 | @group |
| 635 | ;; U+2163 ROMAN NUMERAL FOUR | 637 | ;; U+2163 |
| 636 | (get-char-code-property ?\N@{ROMAN NUMERAL FOUR@} 'numeric-value) | 638 | (get-char-code-property ?\N@{ROMAN NUMERAL FOUR@} |
| 639 | 'numeric-value) | ||
| 637 | @result{} 4 | 640 | @result{} 4 |
| 638 | @end group | 641 | @end group |
| 639 | @group | 642 | @group |
diff --git a/doc/lispref/objects.texi b/doc/lispref/objects.texi index 96b334d2b81..54894b8e24e 100644 --- a/doc/lispref/objects.texi +++ b/doc/lispref/objects.texi | |||
| @@ -353,25 +353,32 @@ following text.) | |||
| 353 | control characters, Emacs provides several types of escape syntax that | 353 | control characters, Emacs provides several types of escape syntax that |
| 354 | you can use to specify non-@acronym{ASCII} text characters. | 354 | you can use to specify non-@acronym{ASCII} text characters. |
| 355 | 355 | ||
| 356 | @enumerate | ||
| 357 | @item | ||
| 356 | @cindex @samp{\} in character constant | 358 | @cindex @samp{\} in character constant |
| 357 | @cindex backslash in character constants | 359 | @cindex backslash in character constants |
| 358 | @cindex unicode character escape | 360 | @cindex unicode character escape |
| 359 | Firstly, you can specify characters by their Unicode values. | 361 | You can specify characters by their Unicode names, if any. |
| 360 | @code{?\u@var{nnnn}} represents a character with Unicode code point | 362 | @code{?\N@{@var{NAME}@}} represents the Unicode character named |
| 361 | @samp{U+@var{nnnn}}, where @var{nnnn} is (by convention) a hexadecimal | 363 | @var{NAME}. Thus, @samp{?\N@{LATIN SMALL LETTER A WITH GRAVE@}} is |
| 362 | number with exactly four digits. The backslash indicates that the | 364 | equivalent to @code{?à} and denotes the Unicode character U+00E0. To |
| 363 | subsequent characters form an escape sequence, and the @samp{u} | 365 | simplify entering multi-line strings, you can replace spaces in the |
| 364 | specifies a Unicode escape sequence. | 366 | names by non-empty sequences of whitespace (e.g., newlines). |
| 365 | 367 | ||
| 366 | There is a slightly different syntax for specifying Unicode | 368 | @item |
| 367 | characters with code points higher than @code{U+@var{ffff}}: | 369 | You can specify characters by their Unicode values. |
| 368 | @code{?\U00@var{nnnnnn}} represents the character with code point | 370 | @code{?\N@{U+@var{X}@}} represents a character with Unicode code point |
| 369 | @samp{U+@var{nnnnnn}}, where @var{nnnnnn} is a six-digit hexadecimal | 371 | @var{X}, where @var{X} is a hexadecimal number. Also, |
| 370 | number. The Unicode Standard only defines code points up to | 372 | @code{?\u@var{xxxx}} and @code{?\U@var{xxxxxxxx}} represent code |
| 371 | @samp{U+@var{10ffff}}, so if you specify a code point higher than | 373 | points @var{xxxx} and @var{xxxxxxxx}, respectively, where each @var{x} |
| 372 | that, Emacs signals an error. | 374 | is a single hexadecimal digit. For example, @code{?\N@{U+E0@}}, |
| 373 | 375 | @code{?\u00e0} and @code{?\U000000E0} are all equivalent to @code{?à} | |
| 374 | Secondly, you can specify characters by their hexadecimal character | 376 | and to @samp{?\N@{LATIN SMALL LETTER A WITH GRAVE@}}. The Unicode |
| 377 | Standard defines code points only up to @samp{U+@var{10ffff}}, so if | ||
| 378 | you specify a code point higher than that, Emacs signals an error. | ||
| 379 | |||
| 380 | @item | ||
| 381 | You can specify characters by their hexadecimal character | ||
| 375 | codes. A hexadecimal escape sequence consists of a backslash, | 382 | codes. A hexadecimal escape sequence consists of a backslash, |
| 376 | @samp{x}, and the hexadecimal character code. Thus, @samp{?\x41} is | 383 | @samp{x}, and the hexadecimal character code. Thus, @samp{?\x41} is |
| 377 | the character @kbd{A}, @samp{?\x1} is the character @kbd{C-a}, and | 384 | the character @kbd{A}, @samp{?\x1} is the character @kbd{C-a}, and |
| @@ -379,23 +386,16 @@ the character @kbd{A}, @samp{?\x1} is the character @kbd{C-a}, and | |||
| 379 | You can use any number of hex digits, so you can represent any | 386 | You can use any number of hex digits, so you can represent any |
| 380 | character code in this way. | 387 | character code in this way. |
| 381 | 388 | ||
| 389 | @item | ||
| 382 | @cindex octal character code | 390 | @cindex octal character code |
| 383 | Thirdly, you can specify characters by their character code in | 391 | You can specify characters by their character code in |
| 384 | octal. An octal escape sequence consists of a backslash followed by | 392 | octal. An octal escape sequence consists of a backslash followed by |
| 385 | up to three octal digits; thus, @samp{?\101} for the character | 393 | up to three octal digits; thus, @samp{?\101} for the character |
| 386 | @kbd{A}, @samp{?\001} for the character @kbd{C-a}, and @code{?\002} | 394 | @kbd{A}, @samp{?\001} for the character @kbd{C-a}, and @code{?\002} |
| 387 | for the character @kbd{C-b}. Only characters up to octal code 777 can | 395 | for the character @kbd{C-b}. Only characters up to octal code 777 can |
| 388 | be specified this way. | 396 | be specified this way. |
| 389 | 397 | ||
| 390 | Fourthly, you can specify characters by their name. A character | 398 | @end enumerate |
| 391 | name escape sequence consists of a backslash, @samp{N@{}, the Unicode | ||
| 392 | character name, and @samp{@}}. Alternatively, you can also put the | ||
| 393 | numeric code point value between the braces, using the syntax | ||
| 394 | @samp{\N@{U+nnnn@}}, where @samp{nnnn} denotes between one and eight | ||
| 395 | hexadecimal digits. Thus, @samp{?\N@{LATIN CAPITAL LETTER A@}} and | ||
| 396 | @samp{?\N@{U+41@}} both denote the character @kbd{A}. To simplify | ||
| 397 | entering multi-line strings, you can replace spaces in the character | ||
| 398 | names by arbitrary non-empty sequence of whitespace (e.g., newlines). | ||
| 399 | 399 | ||
| 400 | These escape sequences may also be used in strings. @xref{Non-ASCII | 400 | These escape sequences may also be used in strings. @xref{Non-ASCII |
| 401 | in Strings}. | 401 | in Strings}. |
diff --git a/src/character.h b/src/character.h index bc3e1557844..586f330fba9 100644 --- a/src/character.h +++ b/src/character.h | |||
| @@ -612,14 +612,13 @@ sanitize_char_width (EMACS_INT width) | |||
| 612 | : (c) <= 0xE01EF ? (c) - 0xE0100 + 17 \ | 612 | : (c) <= 0xE01EF ? (c) - 0xE0100 + 17 \ |
| 613 | : 0) | 613 | : 0) |
| 614 | 614 | ||
| 615 | /* If C is a high surrogate, return 1. If C is a low surrogate, | 615 | /* Return true if C is a surrogate. */ |
| 616 | return 2. Otherwise, return 0. */ | ||
| 617 | 616 | ||
| 618 | #define CHAR_SURROGATE_PAIR_P(c) \ | 617 | INLINE bool |
| 619 | ((c) < 0xD800 ? 0 \ | 618 | char_surrogate_p (int c) |
| 620 | : (c) <= 0xDBFF ? 1 \ | 619 | { |
| 621 | : (c) <= 0xDFFF ? 2 \ | 620 | return 0xD800 <= c && c <= 0xDFFF; |
| 622 | : 0) | 621 | } |
| 623 | 622 | ||
| 624 | /* Data type for Unicode general category. | 623 | /* Data type for Unicode general category. |
| 625 | 624 | ||
diff --git a/src/lread.c b/src/lread.c index c3b6bd79e42..a42c1f60c95 100644 --- a/src/lread.c +++ b/src/lread.c | |||
| @@ -44,7 +44,6 @@ along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */ | |||
| 44 | #include "termhooks.h" | 44 | #include "termhooks.h" |
| 45 | #include "blockinput.h" | 45 | #include "blockinput.h" |
| 46 | #include <c-ctype.h> | 46 | #include <c-ctype.h> |
| 47 | #include <string.h> | ||
| 48 | 47 | ||
| 49 | #ifdef MSDOS | 48 | #ifdef MSDOS |
| 50 | #include "msdos.h" | 49 | #include "msdos.h" |
| @@ -2151,88 +2150,42 @@ grow_read_buffer (void) | |||
| 2151 | MAX_MULTIBYTE_LENGTH, -1, 1); | 2150 | MAX_MULTIBYTE_LENGTH, -1, 1); |
| 2152 | } | 2151 | } |
| 2153 | 2152 | ||
| 2154 | /* Signal an invalid-read-syntax error indicating that the character | 2153 | /* Return the scalar value that has the Unicode character name NAME. |
| 2155 | name in an \N{…} literal is invalid. */ | 2154 | Raise 'invalid-read-syntax' if there is no such character. */ |
| 2156 | static _Noreturn void | ||
| 2157 | invalid_character_name (Lisp_Object name) | ||
| 2158 | { | ||
| 2159 | AUTO_STRING (format, "\\N{%s}"); | ||
| 2160 | xsignal1 (Qinvalid_read_syntax, CALLN (Fformat, format, name)); | ||
| 2161 | } | ||
| 2162 | |||
| 2163 | /* Check that CODE is a valid Unicode scalar value, and return its | ||
| 2164 | value. CODE should be parsed from the character name given by | ||
| 2165 | NAME. NAME is used for error messages. */ | ||
| 2166 | static int | 2155 | static int |
| 2167 | check_scalar_value (Lisp_Object code, Lisp_Object name) | 2156 | character_name_to_code (char const *name, ptrdiff_t name_len) |
| 2168 | { | 2157 | { |
| 2169 | if (! NUMBERP (code)) | 2158 | Lisp_Object code; |
| 2170 | invalid_character_name (name); | ||
| 2171 | EMACS_INT i = XINT (code); | ||
| 2172 | if (! (0 <= i && i <= MAX_UNICODE_CHAR) | ||
| 2173 | /* Don't allow surrogates. */ | ||
| 2174 | || (0xD800 <= code && code <= 0xDFFF)) | ||
| 2175 | invalid_character_name (name); | ||
| 2176 | return i; | ||
| 2177 | } | ||
| 2178 | 2159 | ||
| 2179 | /* If NAME starts with PREFIX, interpret the rest as a hexadecimal | 2160 | /* Code point as U+XXXX.... */ |
| 2180 | number and return its value. Raise invalid-read-syntax if the | 2161 | if (name[0] == 'U' && name[1] == '+') |
| 2181 | number is not a valid scalar value. Return −1 if NAME doesn’t | ||
| 2182 | start with PREFIX. */ | ||
| 2183 | static int | ||
| 2184 | parse_code_after_prefix (Lisp_Object name, const char *prefix) | ||
| 2185 | { | ||
| 2186 | ptrdiff_t name_len = SBYTES (name); | ||
| 2187 | ptrdiff_t prefix_len = strlen (prefix); | ||
| 2188 | /* Allow between one and eight hexadecimal digits after the | ||
| 2189 | prefix. */ | ||
| 2190 | if (prefix_len < name_len && name_len <= prefix_len + 8 | ||
| 2191 | && memcmp (SDATA (name), prefix, prefix_len) == 0) | ||
| 2192 | { | 2162 | { |
| 2193 | Lisp_Object code = string_to_number (SDATA (name) + prefix_len, 16, false); | 2163 | /* Pass the leading '+' to string_to_number, so that it |
| 2194 | if (NUMBERP (code)) | 2164 | rejects monstrosities such as negative values. */ |
| 2195 | return check_scalar_value (code, name); | 2165 | code = string_to_number (name + 1, 16, false); |
| 2166 | } | ||
| 2167 | else | ||
| 2168 | { | ||
| 2169 | /* Look up the name in the table returned by 'ucs-names'. */ | ||
| 2170 | AUTO_STRING_WITH_LEN (namestr, name, name_len); | ||
| 2171 | Lisp_Object names = call0 (Qucs_names); | ||
| 2172 | code = CDR (Fassoc (namestr, names)); | ||
| 2196 | } | 2173 | } |
| 2197 | return -1; | ||
| 2198 | } | ||
| 2199 | 2174 | ||
| 2200 | /* Returns the scalar value that has the Unicode character name NAME. | 2175 | if (! (INTEGERP (code) |
| 2201 | Raises `invalid-read-syntax' if there is no such character. */ | 2176 | && 0 <= XINT (code) && XINT (code) <= MAX_UNICODE_CHAR |
| 2202 | static int | 2177 | && ! char_surrogate_p (XINT (code)))) |
| 2203 | character_name_to_code (Lisp_Object name) | ||
| 2204 | { | ||
| 2205 | /* Code point as U+N, where N is between 1 and 8 hexadecimal | ||
| 2206 | digits. */ | ||
| 2207 | int code = parse_code_after_prefix (name, "U+"); | ||
| 2208 | if (code >= 0) | ||
| 2209 | return code; | ||
| 2210 | |||
| 2211 | /* CJK ideographs are not contained in the association list returned | ||
| 2212 | by `ucs-names'. But they follow a predictable naming pattern: a | ||
| 2213 | fixed prefix plus the hexadecimal codepoint value. */ | ||
| 2214 | code = parse_code_after_prefix (name, "CJK IDEOGRAPH-"); | ||
| 2215 | if (code >= 0) | ||
| 2216 | { | 2178 | { |
| 2217 | /* Various ranges of CJK characters; see UnicodeData.txt. */ | 2179 | AUTO_STRING (format, "\\N{%s}"); |
| 2218 | if ((0x3400 <= code && code <= 0x4DB5) | 2180 | AUTO_STRING_WITH_LEN (namestr, name, name_len); |
| 2219 | || (0x4E00 <= code && code <= 0x9FD5) | 2181 | xsignal1 (Qinvalid_read_syntax, CALLN (Fformat, format, namestr)); |
| 2220 | || (0x20000 <= code && code <= 0x2A6D6) | ||
| 2221 | || (0x2A700 <= code && code <= 0x2B734) | ||
| 2222 | || (0x2B740 <= code && code <= 0x2B81D) | ||
| 2223 | || (0x2B820 <= code && code <= 0x2CEA1)) | ||
| 2224 | return code; | ||
| 2225 | else | ||
| 2226 | invalid_character_name (name); | ||
| 2227 | } | 2182 | } |
| 2228 | 2183 | ||
| 2229 | /* Look up the name in the table returned by `ucs-names'. */ | 2184 | return XINT (code); |
| 2230 | Lisp_Object names = call0 (Qucs_names); | ||
| 2231 | return check_scalar_value (CDR (Fassoc (name, names)), name); | ||
| 2232 | } | 2185 | } |
| 2233 | 2186 | ||
| 2234 | /* Bound on the length of a Unicode character name. As of | 2187 | /* Bound on the length of a Unicode character name. As of |
| 2235 | Unicode 9.0.0 the maximum is 83, so this should be safe. */ | 2188 | Unicode 9.0.0 the maximum is 83, so this should be safe. */ |
| 2236 | enum { UNICODE_CHARACTER_NAME_LENGTH_BOUND = 200 }; | 2189 | enum { UNICODE_CHARACTER_NAME_LENGTH_BOUND = 200 }; |
| 2237 | 2190 | ||
| 2238 | /* Read a \-escape sequence, assuming we already read the `\'. | 2191 | /* Read a \-escape sequence, assuming we already read the `\'. |
| @@ -2458,14 +2411,14 @@ read_escape (Lisp_Object readcharfun, bool stringp) | |||
| 2458 | end_of_file_error (); | 2411 | end_of_file_error (); |
| 2459 | if (c == '}') | 2412 | if (c == '}') |
| 2460 | break; | 2413 | break; |
| 2461 | if (! c_isascii (c)) | 2414 | if (! (0 < c && c < 0x80)) |
| 2462 | { | 2415 | { |
| 2463 | AUTO_STRING (format, | 2416 | AUTO_STRING (format, |
| 2464 | "Non-ASCII character U+%04X in character name"); | 2417 | "Invalid character U+%04X in character name"); |
| 2465 | xsignal1 (Qinvalid_read_syntax, | 2418 | xsignal1 (Qinvalid_read_syntax, |
| 2466 | CALLN (Fformat, format, make_natnum (c))); | 2419 | CALLN (Fformat, format, make_natnum (c))); |
| 2467 | } | 2420 | } |
| 2468 | /* We treat multiple adjacent whitespace characters as a | 2421 | /* Treat multiple adjacent whitespace characters as a |
| 2469 | single space character. This makes it easier to use | 2422 | single space character. This makes it easier to use |
| 2470 | character names in e.g. multi-line strings. */ | 2423 | character names in e.g. multi-line strings. */ |
| 2471 | if (c_isspace (c)) | 2424 | if (c_isspace (c)) |
| @@ -2483,7 +2436,8 @@ read_escape (Lisp_Object readcharfun, bool stringp) | |||
| 2483 | } | 2436 | } |
| 2484 | if (length == 0) | 2437 | if (length == 0) |
| 2485 | invalid_syntax ("Empty character name"); | 2438 | invalid_syntax ("Empty character name"); |
| 2486 | return character_name_to_code (make_unibyte_string (name, length)); | 2439 | name[length] = '\0'; |
| 2440 | return character_name_to_code (name, length); | ||
| 2487 | } | 2441 | } |
| 2488 | 2442 | ||
| 2489 | default: | 2443 | default: |
diff --git a/test/src/lread-tests.el b/test/src/lread-tests.el index ff5d0f655f3..2ebaf491120 100644 --- a/test/src/lread-tests.el +++ b/test/src/lread-tests.el | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | ;;; lread-tests.el --- tests for lread.c -*- lexical-binding: t; -*- | 1 | ;;; lread-tests.el --- tests for lread.c -*- lexical-binding: t; -*- |
| 2 | 2 | ||
| 3 | ;; Copyright (C) 2016 Google Inc. | 3 | ;; Copyright (C) 2016 Free Software Foundation, Inc. |
| 4 | 4 | ||
| 5 | ;; Author: Philipp Stephani <phst@google.com> | 5 | ;; Author: Philipp Stephani <phst@google.com> |
| 6 | 6 | ||
| @@ -26,11 +26,10 @@ | |||
| 26 | ;;; Code: | 26 | ;;; Code: |
| 27 | 27 | ||
| 28 | (ert-deftest lread-char-number () | 28 | (ert-deftest lread-char-number () |
| 29 | (should (equal ?\N{U+A817} #xA817))) | 29 | (should (equal (read "?\\N{U+A817}") #xA817))) |
| 30 | 30 | ||
| 31 | (ert-deftest lread-char-name () | 31 | (ert-deftest lread-char-name () |
| 32 | (should (equal ?\N{SYLOTI NAGRI LETTER | 32 | (should (equal (read "?\\N{SYLOTI NAGRI LETTER \n DHO}") |
| 33 | DHO} | ||
| 34 | #xA817))) | 33 | #xA817))) |
| 35 | 34 | ||
| 36 | (ert-deftest lread-char-invalid-number () | 35 | (ert-deftest lread-char-invalid-number () |
| @@ -46,16 +45,23 @@ | |||
| 46 | (ert-deftest lread-char-empty-name () | 45 | (ert-deftest lread-char-empty-name () |
| 47 | (should-error (read "?\\N{}") :type 'invalid-read-syntax)) | 46 | (should-error (read "?\\N{}") :type 'invalid-read-syntax)) |
| 48 | 47 | ||
| 49 | (ert-deftest lread-char-cjk-name () | 48 | (ert-deftest lread-char-surrogate-1 () |
| 50 | (should (equal ?\N{CJK IDEOGRAPH-2B734} #x2B734))) | 49 | (should-error (read "?\\N{U+D800}") :type 'invalid-read-syntax)) |
| 51 | 50 | (ert-deftest lread-char-surrogate-2 () | |
| 52 | (ert-deftest lread-char-invalid-cjk-name () | 51 | (should-error (read "?\\N{U+D801}") :type 'invalid-read-syntax)) |
| 53 | (should-error (read "?\\N{CJK IDEOGRAPH-2B735}") :type 'invalid-read-syntax)) | 52 | (ert-deftest lread-char-surrogate-3 () |
| 54 | 53 | (should-error (read "?\\N{U+Dffe}") :type 'invalid-read-syntax)) | |
| 55 | (ert-deftest lread-string-char-number () | 54 | (ert-deftest lread-char-surrogate-4 () |
| 56 | (should (equal "a\N{U+A817}b" "a\uA817b"))) | 55 | (should-error (read "?\\N{U+DFFF}") :type 'invalid-read-syntax)) |
| 56 | |||
| 57 | (ert-deftest lread-string-char-number-1 () | ||
| 58 | (should (equal (read "a\\N{U+A817}b") "a\uA817bx"))) | ||
| 59 | (ert-deftest lread-string-char-number-2 () | ||
| 60 | (should-error (read "?\\N{0.5}") :type 'invalid-read-syntax)) | ||
| 61 | (ert-deftest lread-string-char-number-3 () | ||
| 62 | (should-error (read "?\\N{U+-0}") :type 'invalid-read-syntax)) | ||
| 57 | 63 | ||
| 58 | (ert-deftest lread-string-char-name () | 64 | (ert-deftest lread-string-char-name () |
| 59 | (should (equal "a\N{SYLOTI NAGRI LETTER DHO}b" "a\uA817b"))) | 65 | (should (equal (read "a\\N{SYLOTI NAGRI LETTER DHO}b") "a\uA817b"))) |
| 60 | 66 | ||
| 61 | ;;; lread-tests.el ends here | 67 | ;;; lread-tests.el ends here |