about | summary | refs | log | tree | commit | diff | stats
path: root/src/coding.c
diff options
context:
space:
mode:
author: Kenichi Handa, 2013-04-05 23:08:56 +0900
committer: Kenichi Handa, 2013-04-05 23:08:56 +0900
commit: 251e91474c91e16b101502c2ed7c05fc13e4ecea (patch)
tree: 63963998fb448a6f5a1ef82bdf964737ef256a6d /src/coding.c
parent: 022039da8ea1166498c507dda4944afd9c49c9fe (diff)
download: emacs-251e91474c91e16b101502c2ed7c05fc13e4ecea.tar.gz
          emacs-251e91474c91e16b101502c2ed7c05fc13e4ecea.zip
Optimize the code for reading UTF-8 files.
Diffstat (limited to 'src/coding.c')
-rw-r--r-- src/coding.c 206
1 files changed, 163 insertions, 43 deletions
diff --git a/src/coding.c b/src/coding.c
index 8a09cd67859..735af25502d 100644
--- a/src/coding.c
+++ b/src/coding.c
@@ -6072,17 +6072,18 @@ complement_process_encoding_system (Lisp_Object coding_system)
6072#define EOL_SEEN_CRLF 4 6072#define EOL_SEEN_CRLF 4
6073 6073
6074 6074
6075static Lisp_Object adjust_coding_eol_type (struct coding_system *coding, int eol_seen); 6075static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6076 int eol_seen);
6076 6077
6077 6078
6078/* Return true iff all the source bytes are ASCII. 6079/* Return the number of ASCII characters at the head of the source.
6079 By side effects, set coding->head_ascii and coding->eol_seen. The 6080 By side effects, set coding->head_ascii and coding->eol_seen. The
6080 value of coding->eol_seen is "logical or" of EOL_SEEN_LF, 6081 value of coding->eol_seen is "logical or" of EOL_SEEN_LF,
6081 EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is reliable only when 6082 EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is reliable only when
6082 all the source bytes are ASCII. */ 6083 all the source bytes are ASCII. */
6083 6084
6084static bool 6085static int
6085detect_ascii (struct coding_system *coding) 6086check_ascii (struct coding_system *coding)
6086{ 6087{
6087 const unsigned char *src, *end; 6088 const unsigned char *src, *end;
6088 Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id); 6089 Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
@@ -6096,21 +6097,20 @@ detect_ascii (struct coding_system *coding)
6096 src = coding->source; 6097 src = coding->source;
6097 end = src + coding->src_bytes; 6098 end = src + coding->src_bytes;
6098 6099
6099 if (inhibit_eol_conversion) 6100 if (inhibit_eol_conversion
6101 || eol_seen != EOL_SEEN_NONE)
6100 { 6102 {
6101 /* We don't have to check EOL format. */ 6103 /* We don't have to check EOL format. */
6102 while (src < end && !( *src & 0x80)) src++; 6104 while (src < end && !( *src & 0x80)) src++;
6103 eol_seen = EOL_SEEN_LF; 6105 if (inhibit_eol_conversion)
6104 adjust_coding_eol_type (coding, eol_seen); 6106 {
6105 } 6107 eol_seen = EOL_SEEN_LF;
6106 else if (eol_seen != EOL_SEEN_NONE) 6108 adjust_coding_eol_type (coding, eol_seen);
6107 { 6109 }
6108 /* We don't have to check EOL format either. */
6109 while (src < end && !(*src & 0x80)) src++;
6110 } 6110 }
6111 else 6111 else
6112 { 6112 {
6113 end--; /* We look ahead one byte. */ 6113 end--; /* We look ahead one byte for "CR LF". */
6114 while (src < end) 6114 while (src < end)
6115 { 6115 {
6116 int c = *src; 6116 int c = *src;
@@ -6118,6 +6118,69 @@ detect_ascii (struct coding_system *coding)
6118 if (c & 0x80) 6118 if (c & 0x80)
6119 break; 6119 break;
6120 src++; 6120 src++;
6121 if (c == '\r')
6122 {
6123 if (*src == '\n')
6124 {
6125 eol_seen |= EOL_SEEN_CRLF;
6126 src++;
6127 }
6128 else
6129 eol_seen |= EOL_SEEN_CR;
6130 }
6131 else if (c == '\n')
6132 eol_seen |= EOL_SEEN_LF;
6133 }
6134 if (src == end)
6135 {
6136 int c = *src;
6137
6138 /* All bytes but the last one C are ASCII. */
6139 if (! (c & 0x80))
6140 {
6141 if (c == '\r')
6142 eol_seen |= EOL_SEEN_CR;
6143 else if (c == '\n')
6144 eol_seen |= EOL_SEEN_LF;
6145 src++;
6146 }
6147 }
6148 }
6149 coding->head_ascii = src - coding->source;
6150 coding->eol_seen = eol_seen;
6151 return (coding->head_ascii);
6152}
6153
6154
6155/* Return the number of characters at the source if all the bytes are
6156 valid UTF-8 (of Unicode range). Otherwise, return -1. By side
6157 effects, update coding->eol_seen. The value of coding->eol_seen is
6158 "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6159 the value is reliable only when all the source bytes are valid
6160 UTF-8. */
6161
6162static int
6163check_utf_8 (struct coding_system *coding)
6164{
6165 const unsigned char *src, *end;
6166 int eol_seen = coding->eol_seen;
6167 int nchars = coding->head_ascii;
6168
6169 if (coding->head_ascii < 0)
6170 check_ascii (coding);
6171 else
6172 coding_set_source (coding);
6173 src = coding->source + coding->head_ascii;
6174 /* We look ahead one byte for CR LF. */
6175 end = coding->source + coding->src_bytes - 1;
6176
6177 while (src < end)
6178 {
6179 int c = *src;
6180
6181 if (UTF_8_1_OCTET_P (*src))
6182 {
6183 src++;
6121 if (c < 0x20) 6184 if (c < 0x20)
6122 { 6185 {
6123 if (c == '\r') 6186 if (c == '\r')
@@ -6126,6 +6189,7 @@ detect_ascii (struct coding_system *coding)
6126 { 6189 {
6127 eol_seen |= EOL_SEEN_CRLF; 6190 eol_seen |= EOL_SEEN_CRLF;
6128 src++; 6191 src++;
6192 nchars++;
6129 } 6193 }
6130 else 6194 else
6131 eol_seen |= EOL_SEEN_CR; 6195 eol_seen |= EOL_SEEN_CR;
@@ -6134,27 +6198,58 @@ detect_ascii (struct coding_system *coding)
6134 eol_seen |= EOL_SEEN_LF; 6198 eol_seen |= EOL_SEEN_LF;
6135 } 6199 }
6136 } 6200 }
6137 if (src > end) 6201 else if (UTF_8_2_OCTET_LEADING_P (c))
6138 /* The last two bytes are CR LF, which means that we have
6139 scanned all bytes. */
6140 end++;
6141 else if (src == end)
6142 { 6202 {
6143 end++; 6203 if (c < 0xC2 /* overlong sequence */
6144 if (! (*src & 0x80)) 6204 || src + 1 >= end
6145 { 6205 || ! UTF_8_EXTRA_OCTET_P (src[1]))
6146 if (*src == '\r') 6206 return -1;
6147 eol_seen |= EOL_SEEN_CR; 6207 src += 2;
6148 else if (*src == '\n')
6149 eol_seen |= EOL_SEEN_LF;
6150 src++;
6151 }
6152 } 6208 }
6153 adjust_coding_eol_type (coding, eol_seen); 6209 else if (UTF_8_3_OCTET_LEADING_P (c))
6210 {
6211 if (src + 2 >= end
6212 || ! (UTF_8_EXTRA_OCTET_P (src[1])
6213 && UTF_8_EXTRA_OCTET_P (src[2])))
6214 return -1;
6215 c = (((c & 0xF) << 12)
6216 | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6217 if (c < 0x800 /* overlong sequence */
6218 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6219 return -1;
6220 src += 3;
6221 }
6222 else if (UTF_8_4_OCTET_LEADING_P (c))
6223 {
6224 if (src + 3 >= end
6225 || ! (UTF_8_EXTRA_OCTET_P (src[1])
6226 && UTF_8_EXTRA_OCTET_P (src[2])
6227 && UTF_8_EXTRA_OCTET_P (src[3])))
6228 return -1;
6229 c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6230 | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6231 if (c < 0x10000 /* overlong sequence */
6232 || c >= 0x110000) /* non-Unicode character */
6233 return -1;
6234 src += 4;
6235 }
6236 else
6237 return -1;
6238 nchars++;
6239 }
6240
6241 if (src == end)
6242 {
6243 if (! UTF_8_1_OCTET_P (*src))
6244 return -1;
6245 nchars++;
6246 if (*src == '\r')
6247 eol_seen |= EOL_SEEN_CR;
6248 else if (*src == '\n')
6249 eol_seen |= EOL_SEEN_LF;
6154 } 6250 }
6155 coding->head_ascii = src - coding->source;
6156 coding->eol_seen = eol_seen; 6251 coding->eol_seen = eol_seen;
6157 return (src == end); 6252 return nchars;
6158} 6253}
6159 6254
6160 6255
@@ -6269,6 +6364,9 @@ adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6269 Lisp_Object eol_type; 6364 Lisp_Object eol_type;
6270 6365
6271 eol_type = CODING_ID_EOL_TYPE (coding->id); 6366 eol_type = CODING_ID_EOL_TYPE (coding->id);
6367 if (! VECTORP (eol_type))
6368 /* Already adjusted. */
6369 return eol_type;
6272 if (eol_seen & EOL_SEEN_LF) 6370 if (eol_seen & EOL_SEEN_LF)
6273 { 6371 {
6274 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0)); 6372 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
@@ -6360,7 +6458,8 @@ detect_coding (struct coding_system *coding)
6360 { 6458 {
6361 coding->eol_seen |= EOL_SEEN_CRLF; 6459 coding->eol_seen |= EOL_SEEN_CRLF;
6362 src++; 6460 src++;
6363 coding->head_ascii++; 6461 if (! eight_bit_found)
6462 coding->head_ascii++;
6364 } 6463 }
6365 else 6464 else
6366 coding->eol_seen |= EOL_SEEN_CR; 6465 coding->eol_seen |= EOL_SEEN_CR;
@@ -6461,9 +6560,14 @@ detect_coding (struct coding_system *coding)
6461 coding_systems 6560 coding_systems
6462 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom); 6561 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6463 detect_info.found = detect_info.rejected = 0; 6562 detect_info.found = detect_info.rejected = 0;
6464 if (detect_ascii (coding)) 6563 if (check_ascii (coding) == coding->src_bytes)
6465 { 6564 {
6565 int head_ascii = coding->head_ascii;
6566
6567 if (coding->eol_seen != EOL_SEEN_NONE)
6568 adjust_coding_eol_type (coding, coding->eol_seen);
6466 setup_coding_system (XCDR (coding_systems), coding); 6569 setup_coding_system (XCDR (coding_systems), coding);
6570 coding->head_ascii = head_ascii;
6467 } 6571 }
6468 else 6572 else
6469 { 6573 {
@@ -7620,15 +7724,27 @@ decode_coding_gap (struct coding_system *coding,
7620 if (CODING_REQUIRE_DETECTION (coding)) 7724 if (CODING_REQUIRE_DETECTION (coding))
7621 detect_coding (coding); 7725 detect_coding (coding);
7622 attrs = CODING_ID_ATTRS (coding->id); 7726 attrs = CODING_ID_ATTRS (coding->id);
7623 if (! disable_ascii_optimization) 7727 if (! disable_ascii_optimization
7624 { 7728 && ! coding->src_multibyte
7625 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)) 7729 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7626 && NILP (CODING_ATTR_POST_READ (attrs)) 7730 && NILP (CODING_ATTR_POST_READ (attrs))
7627 && NILP (get_translation_table (attrs, 0, NULL)) 7731 && NILP (get_translation_table (attrs, 0, NULL)))
7628 && (coding->head_ascii >= 0 /* We've already called detect_coding */ 7732 {
7629 ? coding->head_ascii == bytes 7733 chars = coding->head_ascii;
7630 : detect_ascii (coding))) 7734 if (chars < 0)
7735 chars = check_ascii (coding);
7736 if (chars != bytes)
7737 {
7738 if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8))
7739 chars = check_utf_8 (coding);
7740 else
7741 chars = -1;
7742 }
7743 if (chars >= 0)
7631 { 7744 {
7745 if (coding->eol_seen != EOL_SEEN_NONE)
7746 adjust_coding_eol_type (coding, coding->eol_seen);
7747
7632 if (coding->eol_seen == EOL_SEEN_CR) 7748 if (coding->eol_seen == EOL_SEEN_CR)
7633 { 7749 {
7634 unsigned char *src_end = GAP_END_ADDR; 7750 unsigned char *src_end = GAP_END_ADDR;
@@ -7645,6 +7761,7 @@ decode_coding_gap (struct coding_system *coding,
7645 unsigned char *src = GAP_END_ADDR; 7761 unsigned char *src = GAP_END_ADDR;
7646 unsigned char *src_beg = src - coding->src_bytes; 7762 unsigned char *src_beg = src - coding->src_bytes;
7647 unsigned char *dst = src; 7763 unsigned char *dst = src;
7764 ptrdiff_t diff;
7648 7765
7649 while (src_beg < src) 7766 while (src_beg < src)
7650 { 7767 {
@@ -7652,10 +7769,13 @@ decode_coding_gap (struct coding_system *coding,
7652 if (*src == '\n') 7769 if (*src == '\n')
7653 src--; 7770 src--;
7654 } 7771 }
7655 bytes -= dst - src; 7772 diff = dst - src;
7773 bytes -= diff;
7774 chars -= diff;
7656 } 7775 }
7657 coding->produced_char = coding->produced = bytes; 7776 coding->produced = bytes;
7658 insert_from_gap (bytes, bytes, 1); 7777 coding->produced_char = chars;
7778 insert_from_gap (chars, bytes, 1);
7659 return; 7779 return;
7660 } 7780 }
7661 } 7781 }