aboutsummaryrefslogtreecommitdiffstats
path: root/src/coding.c
diff options
context:
space:
mode:
authorKenichi Handa2013-03-16 01:03:54 +0900
committerKenichi Handa2013-03-16 01:03:54 +0900
commit8a44e6d176989d8eef140314098c76a70248ba61 (patch)
tree096ee4a0f9a15f2f300ba68d2dd1dd28b88e18a0 /src/coding.c
parent9b5939800615a4e08ac389813a70faf4b9e57bba (diff)
downloademacs-8a44e6d176989d8eef140314098c76a70248ba61.tar.gz
emacs-8a44e6d176989d8eef140314098c76a70248ba61.zip
Optimize ASCII file reading with EOL format detection and decoding.
Diffstat (limited to 'src/coding.c')
-rw-r--r--src/coding.c197
1 files changed, 159 insertions, 38 deletions
diff --git a/src/coding.c b/src/coding.c
index c18632f301b..5047e1149bc 100644
--- a/src/coding.c
+++ b/src/coding.c
@@ -6071,6 +6071,93 @@ complement_process_encoding_system (Lisp_Object coding_system)
6071#define EOL_SEEN_CR 2 6071#define EOL_SEEN_CR 2
6072#define EOL_SEEN_CRLF 4 6072#define EOL_SEEN_CRLF 4
6073 6073
6074
6075static Lisp_Object adjust_coding_eol_type (struct coding_system *coding, int eol_seen);
6076
6077
6078/* Return 1 if all the source bytes are ASCII, and return 0 otherwise.
6079 As a side effect, set coding->head_ascii and coding->eol_seen. The
6080 value of coding->eol_seen is the bitwise OR of EOL_SEEN_LF,
6081 EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is reliable only when
6082 all the source bytes are ASCII. */
6083
/* Scan the source bytes of CODING from the start and stop at the first
   byte that has the 0x80 bit set, or at the end of the source.  Store
   the number of bytes scanned in coding->head_ascii and the collected
   EOL_SEEN_* flags in coding->eol_seen.  Return true when the scan
   reached the end of the source, false when it stopped early.  */
6084static bool
6085detect_ascii (struct coding_system *coding)
6086{
6087 const unsigned char *src, *end;
6088 Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6089 int eol_seen;
6090
/* Seed eol_seen from the coding system's eol_type: EOL_SEEN_NONE for a
   vector, EOL_SEEN_LF for Qunix, EOL_SEEN_CRLF for Qdos, and
   EOL_SEEN_CR for anything else.
   NOTE(review): a vector eol_type presumably means the EOL format is
   still undecided -- confirm against CODING_ID_EOL_TYPE.  */
6091 eol_seen = (VECTORP (eol_type) ? EOL_SEEN_NONE
6092 : EQ (eol_type, Qunix) ? EOL_SEEN_LF
6093 : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6094 : EOL_SEEN_CR);
6095 coding_set_source (coding);
6096 src = coding->source;
6097 end = src + coding->src_bytes;
6098
6099 if (inhibit_eol_conversion)
6100 {
6101 /* We don't have to check EOL format. */
/* Only locate the first non-ASCII byte, then force the result to
   EOL_SEEN_LF and hand it to adjust_coding_eol_type.  */
6102 while (src < end && !( *src & 0x80)) src++;
6103 eol_seen = EOL_SEEN_LF;
6104 adjust_coding_eol_type (coding, eol_seen);
6105 }
6106 else if (eol_seen != EOL_SEEN_NONE)
6107 {
6108 /* We don't have to check EOL format either. */
/* eol_seen keeps the value seeded above; adjust_coding_eol_type is
   not called on this path.  */
6109 while (src < end && !(*src & 0x80)) src++;
6110 }
6111 else
6112 {
/* END is pulled back by one so that the `*src' read after `src++' in
   the loop below never goes past the last source byte.
   NOTE(review): when coding->src_bytes is 0 this makes END point one
   byte before the source; the `src > end' branch below then restores
   it, but forming such a pointer is formally undefined in C --
   confirm callers never pass an empty source or that this is
   acceptable.  */
6113 end--; /* We look ahead one byte. */
6114 while (src < end)
6115 {
6116 int c = *src;
6117
6118 if (c & 0x80)
6119 break;
6120 src++;
/* Both '\r' and '\n' are below 0x20, so this test skips the EOL
   checks for ordinary printable bytes.  */
6121 if (c < 0x20)
6122 {
6123 if (c == '\r')
6124 {
/* SRC already points at the byte after the CR.  */
6125 if (*src == '\n')
6126 {
6127 eol_seen |= EOL_SEEN_CRLF;
6128 src++;
6129 }
6130 else
6131 eol_seen |= EOL_SEEN_CR;
6132 }
6133 else if (c == '\n')
6134 eol_seen |= EOL_SEEN_LF;
6135 }
6136 }
/* Three exits from the loop are possible:
   - src > end:  a CR LF pair consumed the final byte as well;
   - src == end: the final byte has not been examined yet;
   - src < end:  the loop broke on a non-ASCII byte.  END is left
     decremented in that case, and `src == end' below is false.  */
6137 if (src > end)
6138 /* The last two bytes are CR LF, which means that we have
6139 scanned all bytes. */
6140 end++;
6141 else if (src == end)
6142 {
6143 end++;
/* Classify the final byte on its own, since no look-ahead is possible.
   If it is non-ASCII, SRC is not advanced, so SRC != END and the
   function returns false.  */
6144 if (! (*src & 0x80))
6145 {
6146 if (*src == '\r')
6147 eol_seen |= EOL_SEEN_CR;
6148 else if (*src == '\n')
6149 eol_seen |= EOL_SEEN_LF;
6150 src++;
6151 }
6152 }
6153 adjust_coding_eol_type (coding, eol_seen);
6154 }
/* Publish the length of the scanned ASCII prefix and the EOL flags.  */
6155 coding->head_ascii = src - coding->source;
6156 coding->eol_seen = eol_seen;
6157 return (src == end);
6158}
6159
6160
6074/* Detect how end-of-line of a text of length SRC_BYTES pointed by 6161/* Detect how end-of-line of a text of length SRC_BYTES pointed by
6075 SOURCE is encoded. If CATEGORY is one of 6162 SOURCE is encoded. If CATEGORY is one of
6076 coding_category_utf_16_XXXX, assume that CR and LF are encoded by 6163 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
@@ -6215,7 +6302,6 @@ detect_coding (struct coding_system *coding)
6215 coding_set_source (coding); 6302 coding_set_source (coding);
6216 6303
6217 src_end = coding->source + coding->src_bytes; 6304 src_end = coding->source + coding->src_bytes;
6218 coding->head_ascii = 0;
6219 6305
6220 /* If we have not yet decided the text encoding type, detect it 6306 /* If we have not yet decided the text encoding type, detect it
6221 now. */ 6307 now. */
@@ -6225,6 +6311,8 @@ detect_coding (struct coding_system *coding)
6225 struct coding_detection_info detect_info; 6311 struct coding_detection_info detect_info;
6226 bool null_byte_found = 0, eight_bit_found = 0; 6312 bool null_byte_found = 0, eight_bit_found = 0;
6227 6313
6314 coding->head_ascii = 0;
6315 coding->eol_seen = EOL_SEEN_NONE;
6228 detect_info.checked = detect_info.found = detect_info.rejected = 0; 6316 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6229 for (src = coding->source; src < src_end; src++) 6317 for (src = coding->source; src < src_end; src++)
6230 { 6318 {
@@ -6263,6 +6351,26 @@ detect_coding (struct coding_system *coding)
6263 if (eight_bit_found) 6351 if (eight_bit_found)
6264 break; 6352 break;
6265 } 6353 }
6354 else if (! disable_ascii_optimization
6355 && ! inhibit_eol_conversion)
6356 {
6357 if (c == '\r')
6358 {
6359 if (src < src_end && src[1] == '\n')
6360 {
6361 coding->eol_seen |= EOL_SEEN_CRLF;
6362 src++;
6363 coding->head_ascii++;
6364 }
6365 else
6366 coding->eol_seen |= EOL_SEEN_CR;
6367 }
6368 else if (c == '\n')
6369 {
6370 coding->eol_seen |= EOL_SEEN_LF;
6371 }
6372 }
6373
6266 if (! eight_bit_found) 6374 if (! eight_bit_found)
6267 coding->head_ascii++; 6375 coding->head_ascii++;
6268 } 6376 }
@@ -6353,19 +6461,20 @@ detect_coding (struct coding_system *coding)
6353 coding_systems 6461 coding_systems
6354 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom); 6462 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6355 detect_info.found = detect_info.rejected = 0; 6463 detect_info.found = detect_info.rejected = 0;
6356 for (src = coding->source; src < src_end; src++) 6464 if (detect_ascii (coding))
6357 { 6465 {
6358 if (*src & 0x80) 6466 setup_coding_system (XCDR (coding_systems), coding);
6359 break;
6360 } 6467 }
6361 coding->head_ascii = src - coding->source; 6468 else
6362 if (CONSP (coding_systems)
6363 && detect_coding_utf_8 (coding, &detect_info))
6364 { 6469 {
6365 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG) 6470 if (CONSP (coding_systems)
6366 setup_coding_system (XCAR (coding_systems), coding); 6471 && detect_coding_utf_8 (coding, &detect_info))
6367 else 6472 {
6368 setup_coding_system (XCDR (coding_systems), coding); 6473 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6474 setup_coding_system (XCAR (coding_systems), coding);
6475 else
6476 setup_coding_system (XCDR (coding_systems), coding);
6477 }
6369 } 6478 }
6370 } 6479 }
6371 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id))) 6480 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
@@ -6378,6 +6487,7 @@ detect_coding (struct coding_system *coding)
6378 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom); 6487 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6379 detect_info.found = detect_info.rejected = 0; 6488 detect_info.found = detect_info.rejected = 0;
6380 coding->head_ascii = 0; 6489 coding->head_ascii = 0;
6490 coding->eol_seen = EOL_SEEN_NONE;
6381 if (CONSP (coding_systems) 6491 if (CONSP (coding_systems)
6382 && detect_coding_utf_16 (coding, &detect_info)) 6492 && detect_coding_utf_16 (coding, &detect_info))
6383 { 6493 {
@@ -6815,7 +6925,7 @@ produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6815 6925
6816 produced = dst - (coding->destination + coding->produced); 6926 produced = dst - (coding->destination + coding->produced);
6817 if (BUFFERP (coding->dst_object) && produced_chars > 0) 6927 if (BUFFERP (coding->dst_object) && produced_chars > 0)
6818 insert_from_gap (produced_chars, produced); 6928 insert_from_gap (produced_chars, produced, 0);
6819 coding->produced += produced; 6929 coding->produced += produced;
6820 coding->produced_char += produced_chars; 6930 coding->produced_char += produced_chars;
6821 return carryover; 6931 return carryover;
@@ -7400,7 +7510,7 @@ encode_coding (struct coding_system *coding)
7400 } while (coding->consumed_char < coding->src_chars); 7510 } while (coding->consumed_char < coding->src_chars);
7401 7511
7402 if (BUFFERP (coding->dst_object) && coding->produced_char > 0) 7512 if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7403 insert_from_gap (coding->produced_char, coding->produced); 7513 insert_from_gap (coding->produced_char, coding->produced, 0);
7404 7514
7405 SAFE_FREE (); 7515 SAFE_FREE ();
7406} 7516}
@@ -7510,39 +7620,45 @@ decode_coding_gap (struct coding_system *coding,
7510 if (CODING_REQUIRE_DETECTION (coding)) 7620 if (CODING_REQUIRE_DETECTION (coding))
7511 detect_coding (coding); 7621 detect_coding (coding);
7512 attrs = CODING_ID_ATTRS (coding->id); 7622 attrs = CODING_ID_ATTRS (coding->id);
7513#ifndef CODING_DISABLE_ASCII_OPTIMIZATION 7623 if (! disable_ascii_optimization)
7514 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7515 && NILP (CODING_ATTR_POST_READ (attrs))
7516 && NILP (get_translation_table (attrs, 0, NULL))
7517 && (inhibit_eol_conversion
7518 || EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)))
7519 { 7624 {
7520 /* We can skip the conversion if all source bytes are ASCII. */ 7625 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7521 if (coding->head_ascii < 0) 7626 && NILP (CODING_ATTR_POST_READ (attrs))
7627 && NILP (get_translation_table (attrs, 0, NULL))
7628 && (coding->head_ascii >= 0 /* We've already called detect_coding */
7629 ? coding->head_ascii == bytes
7630 : detect_ascii (coding)))
7522 { 7631 {
7523 /* We have not yet counted the number of ASCII bytes at the 7632 if (coding->eol_seen == EOL_SEEN_CR)
7524 head of the source. Do it now. */ 7633 {
7525 const unsigned char *src, *src_end; 7634 unsigned char *src_end = GAP_END_ADDR;
7635 unsigned char *src = src - coding->src_bytes;
7526 7636
7527 coding_set_source (coding); 7637 while (src < src_end)
7528 src_end = coding->source + coding->src_bytes; 7638 {
7529 for (src = coding->source; src < src_end; src++) 7639 if (*src++ == '\r')
7640 src[-1] = '\n';
7641 }
7642 }
7643 else if (coding->eol_seen == EOL_SEEN_CRLF)
7530 { 7644 {
7531 if (*src & 0x80) 7645 unsigned char *src = GAP_END_ADDR;
7532 break; 7646 unsigned char *src_beg = src - coding->src_bytes;
7647 unsigned char *dst = src;
7648
7649 while (src_beg < src)
7650 {
7651 *--dst = *--src;
7652 if (*src == '\n')
7653 src--;
7654 }
7655 bytes -= dst - src;
7533 } 7656 }
7534 coding->head_ascii = src - coding->source; 7657 coding->produced_char = coding->produced = bytes;
7535 } 7658 insert_from_gap (bytes, bytes, 1);
7536 if (coding->src_bytes == coding->head_ascii)
7537 {
7538 /* No need of conversion. Use the data in the gap as is. */
7539 coding->produced_char = chars;
7540 coding->produced = bytes;
7541 adjust_after_replace (PT, PT_BYTE, Qnil, chars, bytes, 1);
7542 return; 7659 return;
7543 } 7660 }
7544 } 7661 }
7545#endif /* not CODING_DISABLE_ASCII_OPTIMIZATION */
7546 code_conversion_save (0, 0); 7662 code_conversion_save (0, 0);
7547 7663
7548 coding->mode |= CODING_MODE_LAST_BLOCK; 7664 coding->mode |= CODING_MODE_LAST_BLOCK;
@@ -10758,6 +10874,11 @@ from GNU Find and GNU Grep. Emacs will then ignore the null bytes and
10758decode text as usual. */); 10874decode text as usual. */);
10759 inhibit_null_byte_detection = 0; 10875 inhibit_null_byte_detection = 0;
10760 10876
10877 DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
10878 doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
10879Internal use only. Removed after the experimental optimizer gets stable. */);
10880 disable_ascii_optimization = 0;
10881
10761 DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input, 10882 DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
10762 doc: /* Char table for translating self-inserting characters. 10883 doc: /* Char table for translating self-inserting characters.
10763This is applied to the result of input methods, not their input. 10884This is applied to the result of input methods, not their input.