diff options
| author | Mattias Engdegård | 2024-03-31 15:00:00 +0200 |
|---|---|---|
| committer | Mattias Engdegård | 2024-04-01 10:41:46 +0200 |
| commit | 734bd005aa0fa955cf1a46d3a60a4d6ef5e7e3d1 (patch) | |
| tree | 28e48c30e5916247065e8e66d9a80d9991ec1c2b | |
| parent | f178a6d8006f1e8afe06bb71d0a413622d73f131 (diff) | |
| download | emacs-734bd005aa0fa955cf1a46d3a60a4d6ef5e7e3d1.tar.gz emacs-734bd005aa0fa955cf1a46d3a60a4d6ef5e7e3d1.zip | |
Faster JSON parsing
Speed up JSON parsing substantially by only UTF-8-parsing string
literals and only exactly once. Previously, json-parse-string always
first parsed the entire input and copied it to a new string, and then
validated each string literal twice.
We no longer create an extra new string when interning an alist key,
nor do we garble plist keys with Unicode characters.
* src/lread.c (intern_c_multibyte): New.
* src/json.c (json_encode): Remove.
(utf8_error): New.
(json_parse_string): Faster and more careful UTF-8 decoding.
Create and return a new multibyte string or symbol without extra
decoding. All callers adapted.
(Fjson_parse_string): Skip expensive input pre-decoding.
* test/src/json-tests.el (json-parse-string/object-unicode-keys)
(json-parse-string/short): New.
(json-parse-string/string, json-parse-string/invalid-unicode):
Adapt tests.
* etc/NEWS: Mentioned change in errors.
| -rw-r--r-- | etc/NEWS | 5 | ||||
| -rw-r--r-- | src/json.c | 187 | ||||
| -rw-r--r-- | src/lisp.h | 2 | ||||
| -rw-r--r-- | src/lread.c | 12 | ||||
| -rw-r--r-- | test/src/json-tests.el | 69 |
5 files changed, 137 insertions, 138 deletions
| @@ -1721,6 +1721,11 @@ Use a float value for the first argument instead. | |||
| 1721 | Instead, use 'eshell-process-wait-time', which supports floating-point | 1721 | Instead, use 'eshell-process-wait-time', which supports floating-point |
| 1722 | values. | 1722 | values. |
| 1723 | 1723 | ||
| 1724 | --- | ||
| 1725 | ** The JSON parser sometimes signals different types of errors. | ||
| 1726 | It will now signal 'json-utf8-decode-error' for inputs that are not | ||
| 1727 | correctly UTF-8 encoded. | ||
| 1728 | |||
| 1724 | 1729 | ||
| 1725 | * Lisp Changes in Emacs 30.1 | 1730 | * Lisp Changes in Emacs 30.1 |
| 1726 | 1731 | ||
diff --git a/src/json.c b/src/json.c index 908db022c50..8749009a24b 100644 --- a/src/json.c +++ b/src/json.c | |||
| @@ -699,24 +699,6 @@ usage: (json-insert OBJECT &rest ARGS) */) | |||
| 699 | } | 699 | } |
| 700 | 700 | ||
| 701 | 701 | ||
| 702 | /* Note that all callers of make_string_from_utf8 and build_string_from_utf8 | ||
| 703 | below either pass only value UTF-8 strings or use the function for | ||
| 704 | formatting error messages; in the latter case correctness isn't | ||
| 705 | critical. */ | ||
| 706 | |||
| 707 | /* Return a unibyte string containing the sequence of UTF-8 encoding | ||
| 708 | units of the UTF-8 representation of STRING. If STRING does not | ||
| 709 | represent a sequence of Unicode scalar values, return a string with | ||
| 710 | unspecified contents. */ | ||
| 711 | |||
| 712 | static Lisp_Object | ||
| 713 | json_encode (Lisp_Object string) | ||
| 714 | { | ||
| 715 | /* FIXME: Raise an error if STRING is not a scalar value | ||
| 716 | sequence. */ | ||
| 717 | return encode_string_utf_8 (string, Qnil, false, Qt, Qt); | ||
| 718 | } | ||
| 719 | |||
| 720 | #define JSON_PARSER_INTERNAL_OBJECT_WORKSPACE_SIZE 64 | 702 | #define JSON_PARSER_INTERNAL_OBJECT_WORKSPACE_SIZE 64 |
| 721 | #define JSON_PARSER_INTERNAL_BYTE_WORKSPACE_SIZE 512 | 703 | #define JSON_PARSER_INTERNAL_BYTE_WORKSPACE_SIZE 512 |
| 722 | 704 | ||
| @@ -1081,52 +1063,21 @@ json_parse_unicode (struct json_parser *parser) | |||
| 1081 | return v[0] << 12 | v[1] << 8 | v[2] << 4 | v[3]; | 1063 | return v[0] << 12 | v[1] << 8 | v[2] << 4 | v[3]; |
| 1082 | } | 1064 | } |
| 1083 | 1065 | ||
| 1084 | /* Parses an utf-8 code-point encoding (except the first byte), and | 1066 | static AVOID |
| 1085 | returns the numeric value of the code-point (without considering | 1067 | utf8_error (struct json_parser *parser) |
| 1086 | the first byte) */ | ||
| 1087 | static int | ||
| 1088 | json_handle_utf8_tail_bytes (struct json_parser *parser, int n) | ||
| 1089 | { | 1068 | { |
| 1090 | int v = 0; | 1069 | json_signal_error (parser, Qjson_utf8_decode_error); |
| 1091 | for (int i = 0; i < n; i++) | ||
| 1092 | { | ||
| 1093 | int c = json_input_get (parser); | ||
| 1094 | json_byte_workspace_put (parser, c); | ||
| 1095 | if ((c & 0xc0) != 0x80) | ||
| 1096 | json_signal_error (parser, Qjson_utf8_decode_error); | ||
| 1097 | v = (v << 6) | (c & 0x3f); | ||
| 1098 | } | ||
| 1099 | return v; | ||
| 1100 | } | 1070 | } |
| 1101 | 1071 | ||
| 1102 | /* Reads a JSON string, and puts the result into the byte workspace */ | 1072 | /* Parse a string literal. Optionally prepend a ':'. |
| 1103 | static void | 1073 | Return the string or an interned symbol. */ |
| 1104 | json_parse_string (struct json_parser *parser) | 1074 | static Lisp_Object |
| 1105 | { | 1075 | json_parse_string (struct json_parser *parser, bool intern, bool leading_colon) |
| 1106 | /* a single_uninteresting byte can be simply copied from the input | 1076 | { |
| 1107 | to output, it doesn't need any extra care. This means all the | 1077 | json_byte_workspace_reset (parser); |
| 1108 | characters between [0x20;0x7f], except the double quote and | 1078 | if (leading_colon) |
| 1109 | the backslash */ | 1079 | json_byte_workspace_put (parser, ':'); |
| 1110 | static const char is_single_uninteresting[256] = { | 1080 | ptrdiff_t chars_delta = 0; /* nchars - nbytes */ |
| 1111 | /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ | ||
| 1112 | /* 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 1113 | /* 1 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 1114 | /* 2 */ 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | ||
| 1115 | /* 3 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | ||
| 1116 | /* 4 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | ||
| 1117 | /* 5 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, | ||
| 1118 | /* 6 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | ||
| 1119 | /* 7 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | ||
| 1120 | /* 8 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 1121 | /* 9 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 1122 | /* a */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 1123 | /* b */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 1124 | /* c */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 1125 | /* d */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 1126 | /* e */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 1127 | /* f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 1128 | }; | ||
| 1129 | |||
| 1130 | for (;;) | 1081 | for (;;) |
| 1131 | { | 1082 | { |
| 1132 | /* This if is only here for a possible speedup. If there are 4 | 1083 | /* This if is only here for a possible speedup. If there are 4 |
| @@ -1138,10 +1089,10 @@ json_parse_string (struct json_parser *parser) | |||
| 1138 | int c1 = parser->input_current[1]; | 1089 | int c1 = parser->input_current[1]; |
| 1139 | int c2 = parser->input_current[2]; | 1090 | int c2 = parser->input_current[2]; |
| 1140 | int c3 = parser->input_current[3]; | 1091 | int c3 = parser->input_current[3]; |
| 1141 | bool v0 = is_single_uninteresting[c0]; | 1092 | bool v0 = json_plain_char[c0]; |
| 1142 | bool v1 = is_single_uninteresting[c1]; | 1093 | bool v1 = json_plain_char[c1]; |
| 1143 | bool v2 = is_single_uninteresting[c2]; | 1094 | bool v2 = json_plain_char[c2]; |
| 1144 | bool v3 = is_single_uninteresting[c3]; | 1095 | bool v3 = json_plain_char[c3]; |
| 1145 | if (v0 && v1 && v2 && v3) | 1096 | if (v0 && v1 && v2 && v3) |
| 1146 | { | 1097 | { |
| 1147 | json_byte_workspace_put (parser, c0); | 1098 | json_byte_workspace_put (parser, c0); |
| @@ -1156,43 +1107,62 @@ json_parse_string (struct json_parser *parser) | |||
| 1156 | 1107 | ||
| 1157 | int c = json_input_get (parser); | 1108 | int c = json_input_get (parser); |
| 1158 | parser->current_column++; | 1109 | parser->current_column++; |
| 1159 | if (is_single_uninteresting[c]) | 1110 | if (json_plain_char[c]) |
| 1160 | { | 1111 | { |
| 1161 | json_byte_workspace_put (parser, c); | 1112 | json_byte_workspace_put (parser, c); |
| 1162 | continue; | 1113 | continue; |
| 1163 | } | 1114 | } |
| 1164 | 1115 | ||
| 1165 | if (c == '"') | 1116 | if (c == '"') |
| 1166 | return; | ||
| 1167 | else if (c & 0x80) | ||
| 1168 | { | 1117 | { |
| 1169 | /* Handle utf-8 encoding */ | 1118 | ptrdiff_t nbytes |
| 1119 | = parser->byte_workspace_current - parser->byte_workspace; | ||
| 1120 | ptrdiff_t nchars = nbytes - chars_delta; | ||
| 1121 | const char *str = (const char *)parser->byte_workspace; | ||
| 1122 | return intern ? intern_c_multibyte (str, nchars, nbytes) | ||
| 1123 | : make_multibyte_string (str, nchars, nbytes); | ||
| 1124 | } | ||
| 1125 | |||
| 1126 | if (c & 0x80) | ||
| 1127 | { | ||
| 1128 | /* Parse UTF-8, strictly. This is the correct thing to do | ||
| 1129 | whether or not the input is a unibyte or multibyte string. */ | ||
| 1170 | json_byte_workspace_put (parser, c); | 1130 | json_byte_workspace_put (parser, c); |
| 1171 | if (c < 0xc0) | 1131 | unsigned char c1 = json_input_get (parser); |
| 1172 | json_signal_error (parser, Qjson_utf8_decode_error); | 1132 | if ((c1 & 0xc0) != 0x80) |
| 1173 | else if (c < 0xe0) | 1133 | utf8_error (parser); |
| 1134 | json_byte_workspace_put (parser, c1); | ||
| 1135 | if (c <= 0xc1) | ||
| 1136 | utf8_error (parser); | ||
| 1137 | else if (c <= 0xdf) | ||
| 1138 | chars_delta += 1; | ||
| 1139 | else if (c <= 0xef) | ||
| 1174 | { | 1140 | { |
| 1175 | int n = ((c & 0x1f) << 6 | 1141 | unsigned char c2 = json_input_get (parser); |
| 1176 | | json_handle_utf8_tail_bytes (parser, 1)); | 1142 | if ((c2 & 0xc0) != 0x80) |
| 1177 | if (n < 0x80) | 1143 | utf8_error (parser); |
| 1178 | json_signal_error (parser, Qjson_utf8_decode_error); | 1144 | int v = ((c & 0x0f) << 12) + ((c1 & 0x3f) << 6) + (c2 & 0x3f); |
| 1179 | } | 1145 | if (v < 0x800 || (v >= 0xd800 && v <= 0xdfff)) |
| 1180 | else if (c < 0xf0) | 1146 | utf8_error (parser); |
| 1181 | { | 1147 | json_byte_workspace_put (parser, c2); |
| 1182 | int n = ((c & 0xf) << 12 | 1148 | chars_delta += 2; |
| 1183 | | json_handle_utf8_tail_bytes (parser, 2)); | ||
| 1184 | if (n < 0x800 || (n >= 0xd800 && n < 0xe000)) | ||
| 1185 | json_signal_error (parser, Qjson_utf8_decode_error); | ||
| 1186 | } | 1149 | } |
| 1187 | else if (c < 0xf8) | 1150 | else if (c <= 0xf7) |
| 1188 | { | 1151 | { |
| 1189 | int n = ((c & 0x7) << 18 | 1152 | unsigned char c2 = json_input_get (parser); |
| 1190 | | json_handle_utf8_tail_bytes (parser, 3)); | 1153 | unsigned char c3 = json_input_get (parser); |
| 1191 | if (n < 0x10000 || n > 0x10ffff) | 1154 | if ((c2 & 0xc0) != 0x80 || (c3 & 0xc0) != 0x80) |
| 1192 | json_signal_error (parser, Qjson_utf8_decode_error); | 1155 | utf8_error (parser); |
| 1156 | int v = (((c & 0x07) << 18) + ((c1 & 0x3f) << 12) | ||
| 1157 | + ((c2 & 0x3f) << 6) + (c3 & 0x3f)); | ||
| 1158 | if (v < 0x10000 || v > 0x10ffff) | ||
| 1159 | utf8_error (parser); | ||
| 1160 | json_byte_workspace_put (parser, c2); | ||
| 1161 | json_byte_workspace_put (parser, c3); | ||
| 1162 | chars_delta += 3; | ||
| 1193 | } | 1163 | } |
| 1194 | else | 1164 | else |
| 1195 | json_signal_error (parser, Qjson_utf8_decode_error); | 1165 | utf8_error (parser); |
| 1196 | } | 1166 | } |
| 1197 | else if (c == '\\') | 1167 | else if (c == '\\') |
| 1198 | { | 1168 | { |
| @@ -1249,6 +1219,7 @@ json_parse_string (struct json_parser *parser) | |||
| 1249 | json_byte_workspace_put (parser, 0xc0 | num >> 6); | 1219 | json_byte_workspace_put (parser, 0xc0 | num >> 6); |
| 1250 | json_byte_workspace_put (parser, | 1220 | json_byte_workspace_put (parser, |
| 1251 | 0x80 | (num & 0x3f)); | 1221 | 0x80 | (num & 0x3f)); |
| 1222 | chars_delta += 1; | ||
| 1252 | } | 1223 | } |
| 1253 | else if (num < 0x10000) | 1224 | else if (num < 0x10000) |
| 1254 | { | 1225 | { |
| @@ -1258,6 +1229,7 @@ json_parse_string (struct json_parser *parser) | |||
| 1258 | | ((num >> 6) & 0x3f))); | 1229 | | ((num >> 6) & 0x3f))); |
| 1259 | json_byte_workspace_put (parser, | 1230 | json_byte_workspace_put (parser, |
| 1260 | 0x80 | (num & 0x3f)); | 1231 | 0x80 | (num & 0x3f)); |
| 1232 | chars_delta += 2; | ||
| 1261 | } | 1233 | } |
| 1262 | else | 1234 | else |
| 1263 | { | 1235 | { |
| @@ -1270,6 +1242,7 @@ json_parse_string (struct json_parser *parser) | |||
| 1270 | | ((num >> 6) & 0x3f))); | 1242 | | ((num >> 6) & 0x3f))); |
| 1271 | json_byte_workspace_put (parser, | 1243 | json_byte_workspace_put (parser, |
| 1272 | 0x80 | (num & 0x3f)); | 1244 | 0x80 | (num & 0x3f)); |
| 1245 | chars_delta += 3; | ||
| 1273 | } | 1246 | } |
| 1274 | } | 1247 | } |
| 1275 | else | 1248 | else |
| @@ -1566,16 +1539,11 @@ json_parse_object (struct json_parser *parser) | |||
| 1566 | if (c != '"') | 1539 | if (c != '"') |
| 1567 | json_signal_error (parser, Qjson_parse_error); | 1540 | json_signal_error (parser, Qjson_parse_error); |
| 1568 | 1541 | ||
| 1569 | json_byte_workspace_reset (parser); | ||
| 1570 | switch (parser->conf.object_type) | 1542 | switch (parser->conf.object_type) |
| 1571 | { | 1543 | { |
| 1572 | case json_object_hashtable: | 1544 | case json_object_hashtable: |
| 1573 | { | 1545 | { |
| 1574 | json_parse_string (parser); | 1546 | Lisp_Object key = json_parse_string (parser, false, false); |
| 1575 | Lisp_Object key | ||
| 1576 | = make_string_from_utf8 ((char *) parser->byte_workspace, | ||
| 1577 | (parser->byte_workspace_current | ||
| 1578 | - parser->byte_workspace)); | ||
| 1579 | Lisp_Object value = json_parse_object_member_value (parser); | 1547 | Lisp_Object value = json_parse_object_member_value (parser); |
| 1580 | json_make_object_workspace_for (parser, 2); | 1548 | json_make_object_workspace_for (parser, 2); |
| 1581 | parser->object_workspace[parser->object_workspace_current] = key; | 1549 | parser->object_workspace[parser->object_workspace_current] = key; |
| @@ -1586,13 +1554,7 @@ json_parse_object (struct json_parser *parser) | |||
| 1586 | } | 1554 | } |
| 1587 | case json_object_alist: | 1555 | case json_object_alist: |
| 1588 | { | 1556 | { |
| 1589 | json_parse_string (parser); | 1557 | Lisp_Object key = json_parse_string (parser, true, false); |
| 1590 | char *workspace = (char *) parser->byte_workspace; | ||
| 1591 | ptrdiff_t nbytes | ||
| 1592 | = parser->byte_workspace_current - parser->byte_workspace; | ||
| 1593 | Lisp_Object key = Fintern (make_string_from_utf8 (workspace, | ||
| 1594 | nbytes), | ||
| 1595 | Qnil); | ||
| 1596 | Lisp_Object value = json_parse_object_member_value (parser); | 1558 | Lisp_Object value = json_parse_object_member_value (parser); |
| 1597 | Lisp_Object nc = Fcons (Fcons (key, value), Qnil); | 1559 | Lisp_Object nc = Fcons (Fcons (key, value), Qnil); |
| 1598 | *cdr = nc; | 1560 | *cdr = nc; |
| @@ -1601,11 +1563,7 @@ json_parse_object (struct json_parser *parser) | |||
| 1601 | } | 1563 | } |
| 1602 | case json_object_plist: | 1564 | case json_object_plist: |
| 1603 | { | 1565 | { |
| 1604 | json_byte_workspace_put (parser, ':'); | 1566 | Lisp_Object key = json_parse_string (parser, true, true); |
| 1605 | json_parse_string (parser); | ||
| 1606 | Lisp_Object key = intern_1 ((char *) parser->byte_workspace, | ||
| 1607 | (parser->byte_workspace_current | ||
| 1608 | - parser->byte_workspace)); | ||
| 1609 | Lisp_Object value = json_parse_object_member_value (parser); | 1567 | Lisp_Object value = json_parse_object_member_value (parser); |
| 1610 | Lisp_Object nc = Fcons (key, Qnil); | 1568 | Lisp_Object nc = Fcons (key, Qnil); |
| 1611 | *cdr = nc; | 1569 | *cdr = nc; |
| @@ -1692,15 +1650,7 @@ json_parse_value (struct json_parser *parser, int c) | |||
| 1692 | else if (c == '[') | 1650 | else if (c == '[') |
| 1693 | return json_parse_array (parser); | 1651 | return json_parse_array (parser); |
| 1694 | else if (c == '"') | 1652 | else if (c == '"') |
| 1695 | { | 1653 | return json_parse_string (parser, false, false); |
| 1696 | json_byte_workspace_reset (parser); | ||
| 1697 | json_parse_string (parser); | ||
| 1698 | Lisp_Object result | ||
| 1699 | = make_string_from_utf8 ((const char *) parser->byte_workspace, | ||
| 1700 | (parser->byte_workspace_current | ||
| 1701 | - parser->byte_workspace)); | ||
| 1702 | return result; | ||
| 1703 | } | ||
| 1704 | else if ((c >= '0' && c <= '9') || (c == '-')) | 1654 | else if ((c >= '0' && c <= '9') || (c == '-')) |
| 1705 | return json_parse_number (parser, c); | 1655 | return json_parse_number (parser, c); |
| 1706 | else | 1656 | else |
| @@ -1816,14 +1766,13 @@ usage: (json-parse-string STRING &rest ARGS) */) | |||
| 1816 | 1766 | ||
| 1817 | Lisp_Object string = args[0]; | 1767 | Lisp_Object string = args[0]; |
| 1818 | CHECK_STRING (string); | 1768 | CHECK_STRING (string); |
| 1819 | Lisp_Object encoded = json_encode (string); | ||
| 1820 | struct json_configuration conf | 1769 | struct json_configuration conf |
| 1821 | = { json_object_hashtable, json_array_array, QCnull, QCfalse }; | 1770 | = { json_object_hashtable, json_array_array, QCnull, QCfalse }; |
| 1822 | json_parse_args (nargs - 1, args + 1, &conf, true); | 1771 | json_parse_args (nargs - 1, args + 1, &conf, true); |
| 1823 | 1772 | ||
| 1824 | struct json_parser p; | 1773 | struct json_parser p; |
| 1825 | const unsigned char *begin = (const unsigned char *) SSDATA (encoded); | 1774 | const unsigned char *begin = SDATA (string); |
| 1826 | json_parser_init (&p, conf, begin, begin + SBYTES (encoded), NULL, NULL); | 1775 | json_parser_init (&p, conf, begin, begin + SBYTES (string), NULL, NULL); |
| 1827 | record_unwind_protect_ptr (json_parser_done, &p); | 1776 | record_unwind_protect_ptr (json_parser_done, &p); |
| 1828 | 1777 | ||
| 1829 | return unbind_to (count, | 1778 | return unbind_to (count, |
diff --git a/src/lisp.h b/src/lisp.h index 43a29489a25..3cb4361e75e 100644 --- a/src/lisp.h +++ b/src/lisp.h | |||
| @@ -4744,6 +4744,8 @@ extern ptrdiff_t evxprintf (char **, ptrdiff_t *, char *, ptrdiff_t, | |||
| 4744 | extern Lisp_Object intern_1 (const char *, ptrdiff_t); | 4744 | extern Lisp_Object intern_1 (const char *, ptrdiff_t); |
| 4745 | extern Lisp_Object intern_c_string_1 (const char *, ptrdiff_t); | 4745 | extern Lisp_Object intern_c_string_1 (const char *, ptrdiff_t); |
| 4746 | extern Lisp_Object intern_driver (Lisp_Object, Lisp_Object, Lisp_Object); | 4746 | extern Lisp_Object intern_driver (Lisp_Object, Lisp_Object, Lisp_Object); |
| 4747 | extern Lisp_Object intern_c_multibyte (const char *str, | ||
| 4748 | ptrdiff_t nchars, ptrdiff_t nbytes); | ||
| 4747 | extern void init_symbol (Lisp_Object, Lisp_Object); | 4749 | extern void init_symbol (Lisp_Object, Lisp_Object); |
| 4748 | extern Lisp_Object oblookup (Lisp_Object, const char *, ptrdiff_t, ptrdiff_t); | 4750 | extern Lisp_Object oblookup (Lisp_Object, const char *, ptrdiff_t, ptrdiff_t); |
| 4749 | INLINE void | 4751 | INLINE void |
diff --git a/src/lread.c b/src/lread.c index 1cb941e84fc..09a5589fd0c 100644 --- a/src/lread.c +++ b/src/lread.c | |||
| @@ -4993,6 +4993,18 @@ intern_c_string_1 (const char *str, ptrdiff_t len) | |||
| 4993 | return tem; | 4993 | return tem; |
| 4994 | } | 4994 | } |
| 4995 | 4995 | ||
| 4996 | /* Intern STR of NBYTES bytes and NCHARS characters in the default obarray. */ | ||
| 4997 | Lisp_Object | ||
| 4998 | intern_c_multibyte (const char *str, ptrdiff_t nchars, ptrdiff_t nbytes) | ||
| 4999 | { | ||
| 5000 | Lisp_Object obarray = check_obarray (Vobarray); | ||
| 5001 | Lisp_Object sym = oblookup (obarray, str, nchars, nbytes); | ||
| 5002 | if (BARE_SYMBOL_P (sym)) | ||
| 5003 | return sym; | ||
| 5004 | return intern_driver (make_multibyte_string (str, nchars, nbytes), | ||
| 5005 | obarray, sym); | ||
| 5006 | } | ||
| 5007 | |||
| 4996 | static void | 5008 | static void |
| 4997 | define_symbol (Lisp_Object sym, char const *str) | 5009 | define_symbol (Lisp_Object sym, char const *str) |
| 4998 | { | 5010 | { |
diff --git a/test/src/json-tests.el b/test/src/json-tests.el index fb2384d4a8d..a1bafadaa87 100644 --- a/test/src/json-tests.el +++ b/test/src/json-tests.el | |||
| @@ -25,6 +25,7 @@ | |||
| 25 | 25 | ||
| 26 | (require 'cl-lib) | 26 | (require 'cl-lib) |
| 27 | (require 'map) | 27 | (require 'map) |
| 28 | (require 'subr-x) | ||
| 28 | 29 | ||
| 29 | (declare-function json-serialize "json.c" (object &rest args)) | 30 | (declare-function json-serialize "json.c" (object &rest args)) |
| 30 | (declare-function json-insert "json.c" (object &rest args)) | 31 | (declare-function json-insert "json.c" (object &rest args)) |
| @@ -155,6 +156,9 @@ | |||
| 155 | ) | 156 | ) |
| 156 | 157 | ||
| 157 | (ert-deftest json-parse-string/object () | 158 | (ert-deftest json-parse-string/object () |
| 159 | :expected-result :failed | ||
| 160 | ;; FIXME: This currently fails. Should the parser deduplicate keys? | ||
| 161 | ;; Never, always, or for alist and plist only? | ||
| 158 | (let ((input | 162 | (let ((input |
| 159 | "{ \"abc\" : [1, 2, true], \"def\" : null, \"abc\" : [9, false] }\n")) | 163 | "{ \"abc\" : [1, 2, true], \"def\" : null, \"abc\" : [9, false] }\n")) |
| 160 | (let ((actual (json-parse-string input))) | 164 | (let ((actual (json-parse-string input))) |
| @@ -167,6 +171,15 @@ | |||
| 167 | (should (equal (json-parse-string input :object-type 'plist) | 171 | (should (equal (json-parse-string input :object-type 'plist) |
| 168 | '(:abc [9 :false] :def :null))))) | 172 | '(:abc [9 :false] :def :null))))) |
| 169 | 173 | ||
| 174 | (ert-deftest json-parse-string/object-unicode-keys () | ||
| 175 | (let ((input "{\"é\":1,\"☃\":2,\"𐌐\":3}")) | ||
| 176 | (let ((actual (json-parse-string input))) | ||
| 177 | (should (equal (sort (hash-table-keys actual)) '("é" "☃" "𐌐")))) | ||
| 178 | (should (equal (json-parse-string input :object-type 'alist) | ||
| 179 | '((é . 1) (☃ . 2) (𐌐 . 3)))) | ||
| 180 | (should (equal (json-parse-string input :object-type 'plist) | ||
| 181 | '(:é 1 :☃ 2 :𐌐 3))))) | ||
| 182 | |||
| 170 | (ert-deftest json-parse-string/array () | 183 | (ert-deftest json-parse-string/array () |
| 171 | (let ((input "[\"a\", 1, [\"b\", 2]]")) | 184 | (let ((input "[\"a\", 1, [\"b\", 2]]")) |
| 172 | (should (equal (json-parse-string input) | 185 | (should (equal (json-parse-string input) |
| @@ -182,8 +195,8 @@ | |||
| 182 | ["\nasdфывfgh\t"])) | 195 | ["\nasdфывfgh\t"])) |
| 183 | (should (equal (json-parse-string "[\"\\uD834\\uDD1E\"]") ["\U0001D11E"])) | 196 | (should (equal (json-parse-string "[\"\\uD834\\uDD1E\"]") ["\U0001D11E"])) |
| 184 | (should-error (json-parse-string "foo") :type 'json-parse-error) | 197 | (should-error (json-parse-string "foo") :type 'json-parse-error) |
| 185 | ;; FIXME: Is this the right behavior? | 198 | (should-error (json-parse-string "[\"\u00C4\xC3\x84\"]") |
| 186 | (should (equal (json-parse-string "[\"\u00C4\xC3\x84\"]") ["\u00C4\u00C4"]))) | 199 | :type 'json-utf8-decode-error)) |
| 187 | 200 | ||
| 188 | (ert-deftest json-serialize/string () | 201 | (ert-deftest json-serialize/string () |
| 189 | (should (equal (json-serialize ["foo"]) "[\"foo\"]")) | 202 | (should (equal (json-serialize ["foo"]) "[\"foo\"]")) |
| @@ -201,9 +214,23 @@ | |||
| 201 | (should-error (json-serialize ["u\xCCv"]) :type 'wrong-type-argument) | 214 | (should-error (json-serialize ["u\xCCv"]) :type 'wrong-type-argument) |
| 202 | (should-error (json-serialize ["u\u00C4\xCCv"]) :type 'wrong-type-argument)) | 215 | (should-error (json-serialize ["u\u00C4\xCCv"]) :type 'wrong-type-argument)) |
| 203 | 216 | ||
| 217 | (ert-deftest json-parse-string/short () | ||
| 218 | :expected-result :failed | ||
| 219 | (should-error (json-parse-string "") :type 'json-end-of-file) | ||
| 220 | (should-error (json-parse-string " ") :type 'json-end-of-file) | ||
| 221 | ;; BUG: currently results in `json-end-of-file' for short non-empty inputs. | ||
| 222 | (dolist (s '("a" "ab" "abc" "abcd" | ||
| 223 | "t" "tr" "tru" "truE" "truee" | ||
| 224 | "n" "nu" "nul" "nulL" "nulll" | ||
| 225 | "f" "fa" "fal" "fals" "falsE" "falsee")) | ||
| 226 | (condition-case err | ||
| 227 | (json-parse-string s) | ||
| 228 | (error | ||
| 229 | (should (eq (car err) 'json-parse-error))) | ||
| 230 | (:success (error "parsing %S should fail" s))))) | ||
| 231 | |||
| 204 | (ert-deftest json-parse-string/null () | 232 | (ert-deftest json-parse-string/null () |
| 205 | (should-error (json-parse-string "\x00") :type 'wrong-type-argument) | 233 | (should (equal (json-parse-string "[\"a\\u0000b\"]") ["a\0b"])) |
| 206 | (should (json-parse-string "[\"a\\u0000b\"]")) | ||
| 207 | (let* ((string "{\"foo\":\"this is a string including a literal \\u0000\"}") | 234 | (let* ((string "{\"foo\":\"this is a string including a literal \\u0000\"}") |
| 208 | (data (json-parse-string string))) | 235 | (data (json-parse-string string))) |
| 209 | (should (hash-table-p data)) | 236 | (should (hash-table-p data)) |
| @@ -214,30 +241,34 @@ | |||
| 214 | https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt. | 241 | https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt. |
| 215 | Test with both unibyte and multibyte strings." | 242 | Test with both unibyte and multibyte strings." |
| 216 | ;; Invalid UTF-8 code unit sequences. | 243 | ;; Invalid UTF-8 code unit sequences. |
| 217 | (should-error (json-parse-string "[\"\x80\"]") :type 'json-parse-error) | 244 | (should-error (json-parse-string "[\"\x80\"]") :type 'json-utf8-decode-error) |
| 218 | (should-error (json-parse-string "[\"\u00C4\x80\"]") :type 'json-parse-error) | 245 | (should-error (json-parse-string "[\"\u00C4\x80\"]") |
| 219 | (should-error (json-parse-string "[\"\xBF\"]") :type 'json-parse-error) | 246 | :type 'json-utf8-decode-error) |
| 220 | (should-error (json-parse-string "[\"\u00C4\xBF\"]") :type 'json-parse-error) | 247 | (should-error (json-parse-string "[\"\xBF\"]") :type 'json-utf8-decode-error) |
| 221 | (should-error (json-parse-string "[\"\xFE\"]") :type 'json-parse-error) | 248 | (should-error (json-parse-string "[\"\u00C4\xBF\"]") |
| 222 | (should-error (json-parse-string "[\"\u00C4\xFE\"]") :type 'json-parse-error) | 249 | :type 'json-utf8-decode-error) |
| 223 | (should-error (json-parse-string "[\"\xC0\xAF\"]") :type 'json-parse-error) | 250 | (should-error (json-parse-string "[\"\xFE\"]") :type 'json-utf8-decode-error) |
| 251 | (should-error (json-parse-string "[\"\u00C4\xFE\"]") | ||
| 252 | :type 'json-utf8-decode-error) | ||
| 253 | (should-error (json-parse-string "[\"\xC0\xAF\"]") | ||
| 254 | :type 'json-utf8-decode-error) | ||
| 224 | (should-error (json-parse-string "[\"\u00C4\xC0\xAF\"]") | 255 | (should-error (json-parse-string "[\"\u00C4\xC0\xAF\"]") |
| 225 | :type 'json-parse-error) | 256 | :type 'json-utf8-decode-error) |
| 226 | (should-error (json-parse-string "[\"\u00C4\xC0\x80\"]") | 257 | (should-error (json-parse-string "[\"\u00C4\xC0\x80\"]") |
| 227 | :type 'json-parse-error) | 258 | :type 'json-utf8-decode-error) |
| 228 | ;; Surrogates. | 259 | ;; Surrogates. |
| 229 | (should-error (json-parse-string "[\"\uDB7F\"]") | 260 | (should-error (json-parse-string "[\"\uDB7F\"]") |
| 230 | :type 'json-parse-error) | 261 | :type 'json-utf8-decode-error) |
| 231 | (should-error (json-parse-string "[\"\xED\xAD\xBF\"]") | 262 | (should-error (json-parse-string "[\"\xED\xAD\xBF\"]") |
| 232 | :type 'json-parse-error) | 263 | :type 'json-utf8-decode-error) |
| 233 | (should-error (json-parse-string "[\"\u00C4\xED\xAD\xBF\"]") | 264 | (should-error (json-parse-string "[\"\u00C4\xED\xAD\xBF\"]") |
| 234 | :type 'json-parse-error) | 265 | :type 'json-utf8-decode-error) |
| 235 | (should-error (json-parse-string "[\"\uDB7F\uDFFF\"]") | 266 | (should-error (json-parse-string "[\"\uDB7F\uDFFF\"]") |
| 236 | :type 'json-parse-error) | 267 | :type 'json-utf8-decode-error) |
| 237 | (should-error (json-parse-string "[\"\xED\xAD\xBF\xED\xBF\xBF\"]") | 268 | (should-error (json-parse-string "[\"\xED\xAD\xBF\xED\xBF\xBF\"]") |
| 238 | :type 'json-parse-error) | 269 | :type 'json-utf8-decode-error) |
| 239 | (should-error (json-parse-string "[\"\u00C4\xED\xAD\xBF\xED\xBF\xBF\"]") | 270 | (should-error (json-parse-string "[\"\u00C4\xED\xAD\xBF\xED\xBF\xBF\"]") |
| 240 | :type 'json-parse-error)) | 271 | :type 'json-utf8-decode-error)) |
| 241 | 272 | ||
| 242 | (ert-deftest json-parse-string/incomplete () | 273 | (ert-deftest json-parse-string/incomplete () |
| 243 | (should-error (json-parse-string "[123") :type 'json-end-of-file)) | 274 | (should-error (json-parse-string "[123") :type 'json-end-of-file)) |