diff options
| author | Kenichi Handa | 2013-04-05 23:08:56 +0900 |
|---|---|---|
| committer | Kenichi Handa | 2013-04-05 23:08:56 +0900 |
| commit | 251e91474c91e16b101502c2ed7c05fc13e4ecea (patch) | |
| tree | 63963998fb448a6f5a1ef82bdf964737ef256a6d /src | |
| parent | 022039da8ea1166498c507dda4944afd9c49c9fe (diff) | |
| download | emacs-251e91474c91e16b101502c2ed7c05fc13e4ecea.tar.gz emacs-251e91474c91e16b101502c2ed7c05fc13e4ecea.zip | |
Optimize the code for reading UTF-8 files.
Diffstat (limited to 'src')
| -rw-r--r-- | src/ChangeLog | 16 | ||||
| -rw-r--r-- | src/coding.c | 206 |
2 files changed, 178 insertions, 44 deletions
diff --git a/src/ChangeLog b/src/ChangeLog index 0634ec7cc1d..6b3ca9d3ff3 100644 --- a/src/ChangeLog +++ b/src/ChangeLog | |||
| @@ -1,3 +1,17 @@ | |||
| 1 | 2013-04-03 Kenichi Handa <handa@gnu.org> | ||
| 2 | |||
| 3 | The following changes are to optimize the code for reading UTF-8 | ||
| 4 | files. | ||
| 5 | |||
| 6 | * coding.c (check_ascii): Renamed from detect_ascii. Return value | ||
| 7 | changed. Check EOL format. Do not call adjust_coding_eol_type | ||
| 8 | here. | ||
| 9 | (check_utf_8): New function. | ||
| 10 | (adjust_coding_eol_type): Do nothing if already adjusted. | ||
| 11 | (detect_coding): Compare the return value of check_ascii with | ||
| 12 | coding->src_bytes. Call adjust_coding_eol_type if necessary. | ||
| 13 | (decode_coding_gap): Optimize for valid UTF-8. | ||
| 14 | |||
| 1 | 2013-03-21 Kenichi Handa <handa@gnu.org> | 15 | 2013-03-21 Kenichi Handa <handa@gnu.org> |
| 2 | 16 | ||
| 3 | * coding.c (syms_of_coding): Cancel previous change. | 17 | * coding.c (syms_of_coding): Cancel previous change. |
| @@ -89,7 +103,7 @@ | |||
| 89 | 103 | ||
| 90 | * coding.c (decode_coding_gap): Fix typo caught by static checking. | 104 | * coding.c (decode_coding_gap): Fix typo caught by static checking. |
| 91 | 105 | ||
| 92 | 2013-03-15 handa <handa@gnu.org> | 106 | 2013-03-15 Kenichi Handa <handa@gnu.org> |
| 93 | 107 | ||
| 94 | * insdel.c (insert_from_gap): New arg text_at_gap_tail. | 108 | * insdel.c (insert_from_gap): New arg text_at_gap_tail. |
| 95 | (adjust_after_replace): Make it back to static. Delete the third | 109 | (adjust_after_replace): Make it back to static. Delete the third |
diff --git a/src/coding.c b/src/coding.c index 8a09cd67859..735af25502d 100644 --- a/src/coding.c +++ b/src/coding.c | |||
| @@ -6072,17 +6072,18 @@ complement_process_encoding_system (Lisp_Object coding_system) | |||
| 6072 | #define EOL_SEEN_CRLF 4 | 6072 | #define EOL_SEEN_CRLF 4 |
| 6073 | 6073 | ||
| 6074 | 6074 | ||
| 6075 | static Lisp_Object adjust_coding_eol_type (struct coding_system *coding, int eol_seen); | 6075 | static Lisp_Object adjust_coding_eol_type (struct coding_system *coding, |
| 6076 | int eol_seen); | ||
| 6076 | 6077 | ||
| 6077 | 6078 | ||
| 6078 | /* Return true iff all the source bytes are ASCII. | 6079 | /* Return the number of ASCII characters at the head of the source. |
| 6079 | By side effects, set coding->head_ascii and coding->eol_seen. The | 6080 | By side effects, set coding->head_ascii and coding->eol_seen. The |
| 6080 | value of coding->eol_seen is "logical or" of EOL_SEEN_LF, | 6081 | value of coding->eol_seen is "logical or" of EOL_SEEN_LF, |
| 6081 | EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is reliable only when | 6082 | EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is reliable only when |
| 6082 | all the source bytes are ASCII. */ | 6083 | all the source bytes are ASCII. */ |
| 6083 | 6084 | ||
| 6084 | static bool | 6085 | static int |
| 6085 | detect_ascii (struct coding_system *coding) | 6086 | check_ascii (struct coding_system *coding) |
| 6086 | { | 6087 | { |
| 6087 | const unsigned char *src, *end; | 6088 | const unsigned char *src, *end; |
| 6088 | Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id); | 6089 | Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id); |
| @@ -6096,21 +6097,20 @@ detect_ascii (struct coding_system *coding) | |||
| 6096 | src = coding->source; | 6097 | src = coding->source; |
| 6097 | end = src + coding->src_bytes; | 6098 | end = src + coding->src_bytes; |
| 6098 | 6099 | ||
| 6099 | if (inhibit_eol_conversion) | 6100 | if (inhibit_eol_conversion |
| 6101 | || eol_seen != EOL_SEEN_NONE) | ||
| 6100 | { | 6102 | { |
| 6101 | /* We don't have to check EOL format. */ | 6103 | /* We don't have to check EOL format. */ |
| 6102 | while (src < end && !( *src & 0x80)) src++; | 6104 | while (src < end && !( *src & 0x80)) src++; |
| 6103 | eol_seen = EOL_SEEN_LF; | 6105 | if (inhibit_eol_conversion) |
| 6104 | adjust_coding_eol_type (coding, eol_seen); | 6106 | { |
| 6105 | } | 6107 | eol_seen = EOL_SEEN_LF; |
| 6106 | else if (eol_seen != EOL_SEEN_NONE) | 6108 | adjust_coding_eol_type (coding, eol_seen); |
| 6107 | { | 6109 | } |
| 6108 | /* We don't have to check EOL format either. */ | ||
| 6109 | while (src < end && !(*src & 0x80)) src++; | ||
| 6110 | } | 6110 | } |
| 6111 | else | 6111 | else |
| 6112 | { | 6112 | { |
| 6113 | end--; /* We look ahead one byte. */ | 6113 | end--; /* We look ahead one byte for "CR LF". */ |
| 6114 | while (src < end) | 6114 | while (src < end) |
| 6115 | { | 6115 | { |
| 6116 | int c = *src; | 6116 | int c = *src; |
| @@ -6118,6 +6118,69 @@ detect_ascii (struct coding_system *coding) | |||
| 6118 | if (c & 0x80) | 6118 | if (c & 0x80) |
| 6119 | break; | 6119 | break; |
| 6120 | src++; | 6120 | src++; |
| 6121 | if (c == '\r') | ||
| 6122 | { | ||
| 6123 | if (*src == '\n') | ||
| 6124 | { | ||
| 6125 | eol_seen |= EOL_SEEN_CRLF; | ||
| 6126 | src++; | ||
| 6127 | } | ||
| 6128 | else | ||
| 6129 | eol_seen |= EOL_SEEN_CR; | ||
| 6130 | } | ||
| 6131 | else if (c == '\n') | ||
| 6132 | eol_seen |= EOL_SEEN_LF; | ||
| 6133 | } | ||
| 6134 | if (src == end) | ||
| 6135 | { | ||
| 6136 | int c = *src; | ||
| 6137 | |||
| 6138 | /* All bytes but the last one C are ASCII. */ | ||
| 6139 | if (! (c & 0x80)) | ||
| 6140 | { | ||
| 6141 | if (c == '\r') | ||
| 6142 | eol_seen |= EOL_SEEN_CR; | ||
| 6143 | else if (c == '\n') | ||
| 6144 | eol_seen |= EOL_SEEN_LF; | ||
| 6145 | src++; | ||
| 6146 | } | ||
| 6147 | } | ||
| 6148 | } | ||
| 6149 | coding->head_ascii = src - coding->source; | ||
| 6150 | coding->eol_seen = eol_seen; | ||
| 6151 | return (coding->head_ascii); | ||
| 6152 | } | ||
| 6153 | |||
| 6154 | |||
| 6155 | /* Return the number of characters at the source if all the bytes are | ||
| 6156 | valid UTF-8 (of Unicode range). Otherwise, return -1. By side | ||
| 6157 | effects, update coding->eol_seen. The value of coding->eol_seen is | ||
| 6158 | "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but | ||
| 6159 | the value is reliable only when all the source bytes are valid | ||
| 6160 | UTF-8. */ | ||
| 6161 | |||
| 6162 | static int | ||
| 6163 | check_utf_8 (struct coding_system *coding) | ||
| 6164 | { | ||
| 6165 | const unsigned char *src, *end; | ||
| 6166 | int eol_seen = coding->eol_seen; | ||
| 6167 | int nchars = coding->head_ascii; | ||
| 6168 | |||
| 6169 | if (coding->head_ascii < 0) | ||
| 6170 | check_ascii (coding); | ||
| 6171 | else | ||
| 6172 | coding_set_source (coding); | ||
| 6173 | src = coding->source + coding->head_ascii; | ||
| 6174 | /* We look ahead one byte for CR LF. */ | ||
| 6175 | end = coding->source + coding->src_bytes - 1; | ||
| 6176 | |||
| 6177 | while (src < end) | ||
| 6178 | { | ||
| 6179 | int c = *src; | ||
| 6180 | |||
| 6181 | if (UTF_8_1_OCTET_P (*src)) | ||
| 6182 | { | ||
| 6183 | src++; | ||
| 6121 | if (c < 0x20) | 6184 | if (c < 0x20) |
| 6122 | { | 6185 | { |
| 6123 | if (c == '\r') | 6186 | if (c == '\r') |
| @@ -6126,6 +6189,7 @@ detect_ascii (struct coding_system *coding) | |||
| 6126 | { | 6189 | { |
| 6127 | eol_seen |= EOL_SEEN_CRLF; | 6190 | eol_seen |= EOL_SEEN_CRLF; |
| 6128 | src++; | 6191 | src++; |
| 6192 | nchars++; | ||
| 6129 | } | 6193 | } |
| 6130 | else | 6194 | else |
| 6131 | eol_seen |= EOL_SEEN_CR; | 6195 | eol_seen |= EOL_SEEN_CR; |
| @@ -6134,27 +6198,58 @@ detect_ascii (struct coding_system *coding) | |||
| 6134 | eol_seen |= EOL_SEEN_LF; | 6198 | eol_seen |= EOL_SEEN_LF; |
| 6135 | } | 6199 | } |
| 6136 | } | 6200 | } |
| 6137 | if (src > end) | 6201 | else if (UTF_8_2_OCTET_LEADING_P (c)) |
| 6138 | /* The last two bytes are CR LF, which means that we have | ||
| 6139 | scanned all bytes. */ | ||
| 6140 | end++; | ||
| 6141 | else if (src == end) | ||
| 6142 | { | 6202 | { |
| 6143 | end++; | 6203 | if (c < 0xC2 /* overlong sequence */ |
| 6144 | if (! (*src & 0x80)) | 6204 | || src + 1 >= end |
| 6145 | { | 6205 | || ! UTF_8_EXTRA_OCTET_P (src[1])) |
| 6146 | if (*src == '\r') | 6206 | return -1; |
| 6147 | eol_seen |= EOL_SEEN_CR; | 6207 | src += 2; |
| 6148 | else if (*src == '\n') | ||
| 6149 | eol_seen |= EOL_SEEN_LF; | ||
| 6150 | src++; | ||
| 6151 | } | ||
| 6152 | } | 6208 | } |
| 6153 | adjust_coding_eol_type (coding, eol_seen); | 6209 | else if (UTF_8_3_OCTET_LEADING_P (c)) |
| 6210 | { | ||
| 6211 | if (src + 2 >= end | ||
| 6212 | || ! (UTF_8_EXTRA_OCTET_P (src[1]) | ||
| 6213 | && UTF_8_EXTRA_OCTET_P (src[2]))) | ||
| 6214 | return -1; | ||
| 6215 | c = (((c & 0xF) << 12) | ||
| 6216 | | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F)); | ||
| 6217 | if (c < 0x800 /* overlong sequence */ | ||
| 6218 | || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */ | ||
| 6219 | return -1; | ||
| 6220 | src += 3; | ||
| 6221 | } | ||
| 6222 | else if (UTF_8_4_OCTET_LEADING_P (c)) | ||
| 6223 | { | ||
| 6224 | if (src + 3 >= end | ||
| 6225 | || ! (UTF_8_EXTRA_OCTET_P (src[1]) | ||
| 6226 | && UTF_8_EXTRA_OCTET_P (src[2]) | ||
| 6227 | && UTF_8_EXTRA_OCTET_P (src[3]))) | ||
| 6228 | return -1; | ||
| 6229 | c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12) | ||
| 6230 | | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)); | ||
| 6231 | if (c < 0x10000 /* overlong sequence */ | ||
| 6232 | || c >= 0x110000) /* non-Unicode character */ | ||
| 6233 | return -1; | ||
| 6234 | src += 4; | ||
| 6235 | } | ||
| 6236 | else | ||
| 6237 | return -1; | ||
| 6238 | nchars++; | ||
| 6239 | } | ||
| 6240 | |||
| 6241 | if (src == end) | ||
| 6242 | { | ||
| 6243 | if (! UTF_8_1_OCTET_P (*src)) | ||
| 6244 | return -1; | ||
| 6245 | nchars++; | ||
| 6246 | if (*src == '\r') | ||
| 6247 | eol_seen |= EOL_SEEN_CR; | ||
| 6248 | else if (*src == '\n') | ||
| 6249 | eol_seen |= EOL_SEEN_LF; | ||
| 6154 | } | 6250 | } |
| 6155 | coding->head_ascii = src - coding->source; | ||
| 6156 | coding->eol_seen = eol_seen; | 6251 | coding->eol_seen = eol_seen; |
| 6157 | return (src == end); | 6252 | return nchars; |
| 6158 | } | 6253 | } |
| 6159 | 6254 | ||
| 6160 | 6255 | ||
| @@ -6269,6 +6364,9 @@ adjust_coding_eol_type (struct coding_system *coding, int eol_seen) | |||
| 6269 | Lisp_Object eol_type; | 6364 | Lisp_Object eol_type; |
| 6270 | 6365 | ||
| 6271 | eol_type = CODING_ID_EOL_TYPE (coding->id); | 6366 | eol_type = CODING_ID_EOL_TYPE (coding->id); |
| 6367 | if (! VECTORP (eol_type)) | ||
| 6368 | /* Already adjusted. */ | ||
| 6369 | return eol_type; | ||
| 6272 | if (eol_seen & EOL_SEEN_LF) | 6370 | if (eol_seen & EOL_SEEN_LF) |
| 6273 | { | 6371 | { |
| 6274 | coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0)); | 6372 | coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0)); |
| @@ -6360,7 +6458,8 @@ detect_coding (struct coding_system *coding) | |||
| 6360 | { | 6458 | { |
| 6361 | coding->eol_seen |= EOL_SEEN_CRLF; | 6459 | coding->eol_seen |= EOL_SEEN_CRLF; |
| 6362 | src++; | 6460 | src++; |
| 6363 | coding->head_ascii++; | 6461 | if (! eight_bit_found) |
| 6462 | coding->head_ascii++; | ||
| 6364 | } | 6463 | } |
| 6365 | else | 6464 | else |
| 6366 | coding->eol_seen |= EOL_SEEN_CR; | 6465 | coding->eol_seen |= EOL_SEEN_CR; |
| @@ -6461,9 +6560,14 @@ detect_coding (struct coding_system *coding) | |||
| 6461 | coding_systems | 6560 | coding_systems |
| 6462 | = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom); | 6561 | = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom); |
| 6463 | detect_info.found = detect_info.rejected = 0; | 6562 | detect_info.found = detect_info.rejected = 0; |
| 6464 | if (detect_ascii (coding)) | 6563 | if (check_ascii (coding) == coding->src_bytes) |
| 6465 | { | 6564 | { |
| 6565 | int head_ascii = coding->head_ascii; | ||
| 6566 | |||
| 6567 | if (coding->eol_seen != EOL_SEEN_NONE) | ||
| 6568 | adjust_coding_eol_type (coding, coding->eol_seen); | ||
| 6466 | setup_coding_system (XCDR (coding_systems), coding); | 6569 | setup_coding_system (XCDR (coding_systems), coding); |
| 6570 | coding->head_ascii = head_ascii; | ||
| 6467 | } | 6571 | } |
| 6468 | else | 6572 | else |
| 6469 | { | 6573 | { |
| @@ -7620,15 +7724,27 @@ decode_coding_gap (struct coding_system *coding, | |||
| 7620 | if (CODING_REQUIRE_DETECTION (coding)) | 7724 | if (CODING_REQUIRE_DETECTION (coding)) |
| 7621 | detect_coding (coding); | 7725 | detect_coding (coding); |
| 7622 | attrs = CODING_ID_ATTRS (coding->id); | 7726 | attrs = CODING_ID_ATTRS (coding->id); |
| 7623 | if (! disable_ascii_optimization) | 7727 | if (! disable_ascii_optimization |
| 7624 | { | 7728 | && ! coding->src_multibyte |
| 7625 | if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)) | 7729 | && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)) |
| 7626 | && NILP (CODING_ATTR_POST_READ (attrs)) | 7730 | && NILP (CODING_ATTR_POST_READ (attrs)) |
| 7627 | && NILP (get_translation_table (attrs, 0, NULL)) | 7731 | && NILP (get_translation_table (attrs, 0, NULL))) |
| 7628 | && (coding->head_ascii >= 0 /* We've already called detect_coding */ | 7732 | { |
| 7629 | ? coding->head_ascii == bytes | 7733 | chars = coding->head_ascii; |
| 7630 | : detect_ascii (coding))) | 7734 | if (chars < 0) |
| 7735 | chars = check_ascii (coding); | ||
| 7736 | if (chars != bytes) | ||
| 7737 | { | ||
| 7738 | if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8)) | ||
| 7739 | chars = check_utf_8 (coding); | ||
| 7740 | else | ||
| 7741 | chars = -1; | ||
| 7742 | } | ||
| 7743 | if (chars >= 0) | ||
| 7631 | { | 7744 | { |
| 7745 | if (coding->eol_seen != EOL_SEEN_NONE) | ||
| 7746 | adjust_coding_eol_type (coding, coding->eol_seen); | ||
| 7747 | |||
| 7632 | if (coding->eol_seen == EOL_SEEN_CR) | 7748 | if (coding->eol_seen == EOL_SEEN_CR) |
| 7633 | { | 7749 | { |
| 7634 | unsigned char *src_end = GAP_END_ADDR; | 7750 | unsigned char *src_end = GAP_END_ADDR; |
| @@ -7645,6 +7761,7 @@ decode_coding_gap (struct coding_system *coding, | |||
| 7645 | unsigned char *src = GAP_END_ADDR; | 7761 | unsigned char *src = GAP_END_ADDR; |
| 7646 | unsigned char *src_beg = src - coding->src_bytes; | 7762 | unsigned char *src_beg = src - coding->src_bytes; |
| 7647 | unsigned char *dst = src; | 7763 | unsigned char *dst = src; |
| 7764 | ptrdiff_t diff; | ||
| 7648 | 7765 | ||
| 7649 | while (src_beg < src) | 7766 | while (src_beg < src) |
| 7650 | { | 7767 | { |
| @@ -7652,10 +7769,13 @@ decode_coding_gap (struct coding_system *coding, | |||
| 7652 | if (*src == '\n') | 7769 | if (*src == '\n') |
| 7653 | src--; | 7770 | src--; |
| 7654 | } | 7771 | } |
| 7655 | bytes -= dst - src; | 7772 | diff = dst - src; |
| 7773 | bytes -= diff; | ||
| 7774 | chars -= diff; | ||
| 7656 | } | 7775 | } |
| 7657 | coding->produced_char = coding->produced = bytes; | 7776 | coding->produced = bytes; |
| 7658 | insert_from_gap (bytes, bytes, 1); | 7777 | coding->produced_char = chars; |
| 7778 | insert_from_gap (chars, bytes, 1); | ||
| 7659 | return; | 7779 | return; |
| 7660 | } | 7780 | } |
| 7661 | } | 7781 | } |