diff options
| author | Kenichi Handa | 2013-03-16 01:03:54 +0900 |
|---|---|---|
| committer | Kenichi Handa | 2013-03-16 01:03:54 +0900 |
| commit | 8a44e6d176989d8eef140314098c76a70248ba61 (patch) | |
| tree | 096ee4a0f9a15f2f300ba68d2dd1dd28b88e18a0 /src/coding.c | |
| parent | 9b5939800615a4e08ac389813a70faf4b9e57bba (diff) | |
| download | emacs-8a44e6d176989d8eef140314098c76a70248ba61.tar.gz emacs-8a44e6d176989d8eef140314098c76a70248ba61.zip | |
Optimize ASCII file reading with EOL format detection and decoding.
Diffstat (limited to 'src/coding.c')
| -rw-r--r-- | src/coding.c | 197 |
1 files changed, 159 insertions, 38 deletions
diff --git a/src/coding.c b/src/coding.c index c18632f301b..5047e1149bc 100644 --- a/src/coding.c +++ b/src/coding.c | |||
| @@ -6071,6 +6071,93 @@ complement_process_encoding_system (Lisp_Object coding_system) | |||
| 6071 | #define EOL_SEEN_CR 2 | 6071 | #define EOL_SEEN_CR 2 |
| 6072 | #define EOL_SEEN_CRLF 4 | 6072 | #define EOL_SEEN_CRLF 4 |
| 6073 | 6073 | ||
| 6074 | |||
| 6075 | static Lisp_Object adjust_coding_eol_type (struct coding_system *coding, int eol_seen); | ||
| 6076 | |||
| 6077 | |||
| 6078 | /* Return true if all the source bytes are ASCII, and false otherwise. | ||
| 6079 | By side effects, set coding->head_ascii (the number of leading ASCII bytes) and coding->eol_seen. The | ||
| 6080 | value of coding->eol_seen is "logical or" of EOL_SEEN_LF, | ||
| 6081 | EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is reliable only when | ||
| 6082 | all the source bytes are ASCII. When inhibit_eol_conversion is set, or the EOL type of CODING is not a vector, the bytes are not scanned for EOLs and eol_seen is derived from that setting instead. */ | ||
| 6083 | |||
| 6084 | static bool | ||
| 6085 | detect_ascii (struct coding_system *coding) | ||
| 6086 | { | ||
| 6087 | const unsigned char *src, *end; | ||
| 6088 | Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id); | ||
| 6089 | int eol_seen; | ||
| 6090 | |||
| 6091 | eol_seen = (VECTORP (eol_type) ? EOL_SEEN_NONE | ||
| 6092 | : EQ (eol_type, Qunix) ? EOL_SEEN_LF | ||
| 6093 | : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF | ||
| 6094 | : EOL_SEEN_CR); | ||
| 6095 | coding_set_source (coding); | ||
| 6096 | src = coding->source; | ||
| 6097 | end = src + coding->src_bytes; | ||
| 6098 | |||
| 6099 | if (inhibit_eol_conversion) | ||
| 6100 | { | ||
| 6101 | /* EOL conversion is inhibited, so we don't have to check EOL format: just find the first non-ASCII byte and report EOL_SEEN_LF. */ | ||
| 6102 | while (src < end && !( *src & 0x80)) src++; | ||
| 6103 | eol_seen = EOL_SEEN_LF; | ||
| 6104 | adjust_coding_eol_type (coding, eol_seen); | ||
| 6105 | } | ||
| 6106 | else if (eol_seen != EOL_SEEN_NONE) | ||
| 6107 | { | ||
| 6108 | /* The EOL type was already taken from the coding system above, so we don't have to check EOL format here either: just find the first non-ASCII byte. */ | ||
| 6109 | while (src < end && !(*src & 0x80)) src++; | ||
| 6110 | } | ||
| 6111 | else | ||
| 6112 | { | ||
| 6113 | end--; /* Stop one byte early, because the CR check below looks ahead one byte. */ | ||
| 6114 | while (src < end) | ||
| 6115 | { | ||
| 6116 | int c = *src; | ||
| 6117 | |||
| 6118 | if (c & 0x80) | ||
| 6119 | break; | ||
| 6120 | src++; | ||
| 6121 | if (c < 0x20) | ||
| 6122 | { | ||
| 6123 | if (c == '\r') | ||
| 6124 | { | ||
| 6125 | if (*src == '\n') | ||
| 6126 | { | ||
| 6127 | eol_seen |= EOL_SEEN_CRLF; | ||
| 6128 | src++; | ||
| 6129 | } | ||
| 6130 | else | ||
| 6131 | eol_seen |= EOL_SEEN_CR; | ||
| 6132 | } | ||
| 6133 | else if (c == '\n') | ||
| 6134 | eol_seen |= EOL_SEEN_LF; | ||
| 6135 | } | ||
| 6136 | } | ||
| 6137 | if (src > end) | ||
| 6138 | /* The last two bytes are CR LF, which means that we have | ||
| 6139 | scanned all bytes. (This branch is also taken when coding->src_bytes is 0.) Restore END to the real end. */ | ||
| 6140 | end++; | ||
| 6141 | else if (src == end) | ||
| 6142 | { | ||
| 6143 | end++; | ||
| 6144 | if (! (*src & 0x80)) | ||
| 6145 | { | ||
| 6146 | if (*src == '\r') | ||
| 6147 | eol_seen |= EOL_SEEN_CR; | ||
| 6148 | else if (*src == '\n') | ||
| 6149 | eol_seen |= EOL_SEEN_LF; | ||
| 6150 | src++; | ||
| 6151 | } | ||
| 6152 | } | ||
| 6153 | adjust_coding_eol_type (coding, eol_seen); | ||
| 6154 | } | ||
| 6155 | coding->head_ascii = src - coding->source; | ||
| 6156 | coding->eol_seen = eol_seen; | ||
| 6157 | return (src == end); | ||
| 6158 | } | ||
| 6159 | |||
| 6160 | |||
| 6074 | /* Detect how end-of-line of a text of length SRC_BYTES pointed by | 6161 | /* Detect how end-of-line of a text of length SRC_BYTES pointed by |
| 6075 | SOURCE is encoded. If CATEGORY is one of | 6162 | SOURCE is encoded. If CATEGORY is one of |
| 6076 | coding_category_utf_16_XXXX, assume that CR and LF are encoded by | 6163 | coding_category_utf_16_XXXX, assume that CR and LF are encoded by |
| @@ -6215,7 +6302,6 @@ detect_coding (struct coding_system *coding) | |||
| 6215 | coding_set_source (coding); | 6302 | coding_set_source (coding); |
| 6216 | 6303 | ||
| 6217 | src_end = coding->source + coding->src_bytes; | 6304 | src_end = coding->source + coding->src_bytes; |
| 6218 | coding->head_ascii = 0; | ||
| 6219 | 6305 | ||
| 6220 | /* If we have not yet decided the text encoding type, detect it | 6306 | /* If we have not yet decided the text encoding type, detect it |
| 6221 | now. */ | 6307 | now. */ |
| @@ -6225,6 +6311,8 @@ detect_coding (struct coding_system *coding) | |||
| 6225 | struct coding_detection_info detect_info; | 6311 | struct coding_detection_info detect_info; |
| 6226 | bool null_byte_found = 0, eight_bit_found = 0; | 6312 | bool null_byte_found = 0, eight_bit_found = 0; |
| 6227 | 6313 | ||
| 6314 | coding->head_ascii = 0; | ||
| 6315 | coding->eol_seen = EOL_SEEN_NONE; | ||
| 6228 | detect_info.checked = detect_info.found = detect_info.rejected = 0; | 6316 | detect_info.checked = detect_info.found = detect_info.rejected = 0; |
| 6229 | for (src = coding->source; src < src_end; src++) | 6317 | for (src = coding->source; src < src_end; src++) |
| 6230 | { | 6318 | { |
| @@ -6263,6 +6351,26 @@ detect_coding (struct coding_system *coding) | |||
| 6263 | if (eight_bit_found) | 6351 | if (eight_bit_found) |
| 6264 | break; | 6352 | break; |
| 6265 | } | 6353 | } |
| 6354 | else if (! disable_ascii_optimization | ||
| 6355 | && ! inhibit_eol_conversion) | ||
| 6356 | { | ||
| 6357 | if (c == '\r') | ||
| 6358 | { | ||
| 6359 | if (src < src_end && src[1] == '\n') | ||
| 6360 | { | ||
| 6361 | coding->eol_seen |= EOL_SEEN_CRLF; | ||
| 6362 | src++; | ||
| 6363 | coding->head_ascii++; | ||
| 6364 | } | ||
| 6365 | else | ||
| 6366 | coding->eol_seen |= EOL_SEEN_CR; | ||
| 6367 | } | ||
| 6368 | else if (c == '\n') | ||
| 6369 | { | ||
| 6370 | coding->eol_seen |= EOL_SEEN_LF; | ||
| 6371 | } | ||
| 6372 | } | ||
| 6373 | |||
| 6266 | if (! eight_bit_found) | 6374 | if (! eight_bit_found) |
| 6267 | coding->head_ascii++; | 6375 | coding->head_ascii++; |
| 6268 | } | 6376 | } |
| @@ -6353,19 +6461,20 @@ detect_coding (struct coding_system *coding) | |||
| 6353 | coding_systems | 6461 | coding_systems |
| 6354 | = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom); | 6462 | = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom); |
| 6355 | detect_info.found = detect_info.rejected = 0; | 6463 | detect_info.found = detect_info.rejected = 0; |
| 6356 | for (src = coding->source; src < src_end; src++) | 6464 | if (detect_ascii (coding)) |
| 6357 | { | 6465 | { |
| 6358 | if (*src & 0x80) | 6466 | setup_coding_system (XCDR (coding_systems), coding); |
| 6359 | break; | ||
| 6360 | } | 6467 | } |
| 6361 | coding->head_ascii = src - coding->source; | 6468 | else |
| 6362 | if (CONSP (coding_systems) | ||
| 6363 | && detect_coding_utf_8 (coding, &detect_info)) | ||
| 6364 | { | 6469 | { |
| 6365 | if (detect_info.found & CATEGORY_MASK_UTF_8_SIG) | 6470 | if (CONSP (coding_systems) |
| 6366 | setup_coding_system (XCAR (coding_systems), coding); | 6471 | && detect_coding_utf_8 (coding, &detect_info)) |
| 6367 | else | 6472 | { |
| 6368 | setup_coding_system (XCDR (coding_systems), coding); | 6473 | if (detect_info.found & CATEGORY_MASK_UTF_8_SIG) |
| 6474 | setup_coding_system (XCAR (coding_systems), coding); | ||
| 6475 | else | ||
| 6476 | setup_coding_system (XCDR (coding_systems), coding); | ||
| 6477 | } | ||
| 6369 | } | 6478 | } |
| 6370 | } | 6479 | } |
| 6371 | else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id))) | 6480 | else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id))) |
| @@ -6378,6 +6487,7 @@ detect_coding (struct coding_system *coding) | |||
| 6378 | = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom); | 6487 | = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom); |
| 6379 | detect_info.found = detect_info.rejected = 0; | 6488 | detect_info.found = detect_info.rejected = 0; |
| 6380 | coding->head_ascii = 0; | 6489 | coding->head_ascii = 0; |
| 6490 | coding->eol_seen = EOL_SEEN_NONE; | ||
| 6381 | if (CONSP (coding_systems) | 6491 | if (CONSP (coding_systems) |
| 6382 | && detect_coding_utf_16 (coding, &detect_info)) | 6492 | && detect_coding_utf_16 (coding, &detect_info)) |
| 6383 | { | 6493 | { |
| @@ -6815,7 +6925,7 @@ produce_chars (struct coding_system *coding, Lisp_Object translation_table, | |||
| 6815 | 6925 | ||
| 6816 | produced = dst - (coding->destination + coding->produced); | 6926 | produced = dst - (coding->destination + coding->produced); |
| 6817 | if (BUFFERP (coding->dst_object) && produced_chars > 0) | 6927 | if (BUFFERP (coding->dst_object) && produced_chars > 0) |
| 6818 | insert_from_gap (produced_chars, produced); | 6928 | insert_from_gap (produced_chars, produced, 0); |
| 6819 | coding->produced += produced; | 6929 | coding->produced += produced; |
| 6820 | coding->produced_char += produced_chars; | 6930 | coding->produced_char += produced_chars; |
| 6821 | return carryover; | 6931 | return carryover; |
| @@ -7400,7 +7510,7 @@ encode_coding (struct coding_system *coding) | |||
| 7400 | } while (coding->consumed_char < coding->src_chars); | 7510 | } while (coding->consumed_char < coding->src_chars); |
| 7401 | 7511 | ||
| 7402 | if (BUFFERP (coding->dst_object) && coding->produced_char > 0) | 7512 | if (BUFFERP (coding->dst_object) && coding->produced_char > 0) |
| 7403 | insert_from_gap (coding->produced_char, coding->produced); | 7513 | insert_from_gap (coding->produced_char, coding->produced, 0); |
| 7404 | 7514 | ||
| 7405 | SAFE_FREE (); | 7515 | SAFE_FREE (); |
| 7406 | } | 7516 | } |
| @@ -7510,39 +7620,45 @@ decode_coding_gap (struct coding_system *coding, | |||
| 7510 | if (CODING_REQUIRE_DETECTION (coding)) | 7620 | if (CODING_REQUIRE_DETECTION (coding)) |
| 7511 | detect_coding (coding); | 7621 | detect_coding (coding); |
| 7512 | attrs = CODING_ID_ATTRS (coding->id); | 7622 | attrs = CODING_ID_ATTRS (coding->id); |
| 7513 | #ifndef CODING_DISABLE_ASCII_OPTIMIZATION | 7623 | if (! disable_ascii_optimization) |
| 7514 | if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)) | ||
| 7515 | && NILP (CODING_ATTR_POST_READ (attrs)) | ||
| 7516 | && NILP (get_translation_table (attrs, 0, NULL)) | ||
| 7517 | && (inhibit_eol_conversion | ||
| 7518 | || EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))) | ||
| 7519 | { | 7624 | { |
| 7520 | /* We can skip the conversion if all source bytes are ASCII. */ | 7625 | if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)) |
| 7521 | if (coding->head_ascii < 0) | 7626 | && NILP (CODING_ATTR_POST_READ (attrs)) |
| 7627 | && NILP (get_translation_table (attrs, 0, NULL)) | ||
| 7628 | && (coding->head_ascii >= 0 /* We've already called detect_coding */ | ||
| 7629 | ? coding->head_ascii == bytes | ||
| 7630 | : detect_ascii (coding))) | ||
| 7522 | { | 7631 | { |
| 7523 | /* We have not yet counted the number of ASCII bytes at the | 7632 | if (coding->eol_seen == EOL_SEEN_CR) |
| 7524 | head of the source. Do it now. */ | 7633 | { |
| 7525 | const unsigned char *src, *src_end; | 7634 | unsigned char *src_end = GAP_END_ADDR; |
| 7635 | unsigned char *src = src - coding->src_bytes; | ||
| 7526 | 7636 | ||
| 7527 | coding_set_source (coding); | 7637 | while (src < src_end) |
| 7528 | src_end = coding->source + coding->src_bytes; | 7638 | { |
| 7529 | for (src = coding->source; src < src_end; src++) | 7639 | if (*src++ == '\r') |
| 7640 | src[-1] = '\n'; | ||
| 7641 | } | ||
| 7642 | } | ||
| 7643 | else if (coding->eol_seen == EOL_SEEN_CRLF) | ||
| 7530 | { | 7644 | { |
| 7531 | if (*src & 0x80) | 7645 | unsigned char *src = GAP_END_ADDR; |
| 7532 | break; | 7646 | unsigned char *src_beg = src - coding->src_bytes; |
| 7647 | unsigned char *dst = src; | ||
| 7648 | |||
| 7649 | while (src_beg < src) | ||
| 7650 | { | ||
| 7651 | *--dst = *--src; | ||
| 7652 | if (*src == '\n') | ||
| 7653 | src--; | ||
| 7654 | } | ||
| 7655 | bytes -= dst - src; | ||
| 7533 | } | 7656 | } |
| 7534 | coding->head_ascii = src - coding->source; | 7657 | coding->produced_char = coding->produced = bytes; |
| 7535 | } | 7658 | insert_from_gap (bytes, bytes, 1); |
| 7536 | if (coding->src_bytes == coding->head_ascii) | ||
| 7537 | { | ||
| 7538 | /* No need of conversion. Use the data in the gap as is. */ | ||
| 7539 | coding->produced_char = chars; | ||
| 7540 | coding->produced = bytes; | ||
| 7541 | adjust_after_replace (PT, PT_BYTE, Qnil, chars, bytes, 1); | ||
| 7542 | return; | 7659 | return; |
| 7543 | } | 7660 | } |
| 7544 | } | 7661 | } |
| 7545 | #endif /* not CODING_DISABLE_ASCII_OPTIMIZATION */ | ||
| 7546 | code_conversion_save (0, 0); | 7662 | code_conversion_save (0, 0); |
| 7547 | 7663 | ||
| 7548 | coding->mode |= CODING_MODE_LAST_BLOCK; | 7664 | coding->mode |= CODING_MODE_LAST_BLOCK; |
| @@ -10758,6 +10874,11 @@ from GNU Find and GNU Grep. Emacs will then ignore the null bytes and | |||
| 10758 | decode text as usual. */); | 10874 | decode text as usual. */); |
| 10759 | inhibit_null_byte_detection = 0; | 10875 | inhibit_null_byte_detection = 0; |
| 10760 | 10876 | ||
| 10877 | DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization, | ||
| 10878 | doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files. | ||
| 10879 | Internal use only. Removed after the experimental optimizer gets stable. */); | ||
| 10880 | disable_ascii_optimization = 0; | ||
| 10881 | |||
| 10761 | DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input, | 10882 | DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input, |
| 10762 | doc: /* Char table for translating self-inserting characters. | 10883 | doc: /* Char table for translating self-inserting characters. |
| 10763 | This is applied to the result of input methods, not their input. | 10884 | This is applied to the result of input methods, not their input. |