diff options
| author | Tom Tromey | 2013-03-17 05:17:24 -0600 |
|---|---|---|
| committer | Tom Tromey | 2013-03-17 05:17:24 -0600 |
| commit | 6bd488cd8d05aa3983ca55f70ee384732d8c0085 (patch) | |
| tree | 5645fc7b882638d6c0eb3f61fd55bde1a63fc190 /src/coding.c | |
| parent | 71f91792e3013b397996905224f387da5cc539a9 (diff) | |
| parent | 9c44569ea2a18099307e0571d523d8637000a153 (diff) | |
| download | emacs-6bd488cd8d05aa3983ca55f70ee384732d8c0085.tar.gz emacs-6bd488cd8d05aa3983ca55f70ee384732d8c0085.zip | |
merge from trunk
Diffstat (limited to 'src/coding.c')
| -rw-r--r-- | src/coding.c | 200 |
1 files changed, 186 insertions, 14 deletions
diff --git a/src/coding.c b/src/coding.c index 32da72ab626..6cfcec905a1 100644 --- a/src/coding.c +++ b/src/coding.c | |||
| @@ -286,6 +286,10 @@ encode_coding_XXX (struct coding_system *coding) | |||
| 286 | #include <config.h> | 286 | #include <config.h> |
| 287 | #include <stdio.h> | 287 | #include <stdio.h> |
| 288 | 288 | ||
| 289 | #ifdef HAVE_WCHAR_H | ||
| 290 | #include <wchar.h> | ||
| 291 | #endif /* HAVE_WCHAR_H */ | ||
| 292 | |||
| 289 | #include "lisp.h" | 293 | #include "lisp.h" |
| 290 | #include "character.h" | 294 | #include "character.h" |
| 291 | #include "buffer.h" | 295 | #include "buffer.h" |
| @@ -6067,6 +6071,93 @@ complement_process_encoding_system (Lisp_Object coding_system) | |||
| 6067 | #define EOL_SEEN_CR 2 | 6071 | #define EOL_SEEN_CR 2 |
| 6068 | #define EOL_SEEN_CRLF 4 | 6072 | #define EOL_SEEN_CRLF 4 |
| 6069 | 6073 | ||
| 6074 | |||
| 6075 | static Lisp_Object adjust_coding_eol_type (struct coding_system *coding, int eol_seen); | ||
| 6076 | |||
| 6077 | |||
| 6078 | /* Return 1 if all the source bytes are ASCII, and return 0 otherwise. | ||
| 6079 | As a side effect, set coding->head_ascii and coding->eol_seen. The | ||
| 6080 | value of coding->eol_seen is the "logical or" of EOL_SEEN_LF, | ||
| 6081 | EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is reliable only when | ||
| 6082 | all the source bytes are ASCII. */ | ||
| 6083 | |||
| 6084 | static bool | ||
| 6085 | detect_ascii (struct coding_system *coding) | ||
| 6086 | { | ||
| 6087 | const unsigned char *src, *end; | ||
| 6088 | Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id); | ||
| 6089 | int eol_seen; | ||
| 6090 | |||
| 6091 | eol_seen = (VECTORP (eol_type) ? EOL_SEEN_NONE | ||
| 6092 | : EQ (eol_type, Qunix) ? EOL_SEEN_LF | ||
| 6093 | : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF | ||
| 6094 | : EOL_SEEN_CR); | ||
| 6095 | coding_set_source (coding); | ||
| 6096 | src = coding->source; | ||
| 6097 | end = src + coding->src_bytes; | ||
| 6098 | |||
| 6099 | if (inhibit_eol_conversion) | ||
| 6100 | { | ||
| 6101 | /* We don't have to check EOL format. */ | ||
| 6102 | while (src < end && !( *src & 0x80)) src++; | ||
| 6103 | eol_seen = EOL_SEEN_LF; | ||
| 6104 | adjust_coding_eol_type (coding, eol_seen); | ||
| 6105 | } | ||
| 6106 | else if (eol_seen != EOL_SEEN_NONE) | ||
| 6107 | { | ||
| 6108 | /* We don't have to check EOL format either. */ | ||
| 6109 | while (src < end && !(*src & 0x80)) src++; | ||
| 6110 | } | ||
| 6111 | else | ||
| 6112 | { | ||
| 6113 | end--; /* We look ahead one byte. */ | ||
| 6114 | while (src < end) | ||
| 6115 | { | ||
| 6116 | int c = *src; | ||
| 6117 | |||
| 6118 | if (c & 0x80) | ||
| 6119 | break; | ||
| 6120 | src++; | ||
| 6121 | if (c < 0x20) | ||
| 6122 | { | ||
| 6123 | if (c == '\r') | ||
| 6124 | { | ||
| 6125 | if (*src == '\n') | ||
| 6126 | { | ||
| 6127 | eol_seen |= EOL_SEEN_CRLF; | ||
| 6128 | src++; | ||
| 6129 | } | ||
| 6130 | else | ||
| 6131 | eol_seen |= EOL_SEEN_CR; | ||
| 6132 | } | ||
| 6133 | else if (c == '\n') | ||
| 6134 | eol_seen |= EOL_SEEN_LF; | ||
| 6135 | } | ||
| 6136 | } | ||
| 6137 | if (src > end) | ||
| 6138 | /* The last two bytes are CR LF, which means that we have | ||
| 6139 | scanned all bytes. */ | ||
| 6140 | end++; | ||
| 6141 | else if (src == end) | ||
| 6142 | { | ||
| 6143 | end++; | ||
| 6144 | if (! (*src & 0x80)) | ||
| 6145 | { | ||
| 6146 | if (*src == '\r') | ||
| 6147 | eol_seen |= EOL_SEEN_CR; | ||
| 6148 | else if (*src == '\n') | ||
| 6149 | eol_seen |= EOL_SEEN_LF; | ||
| 6150 | src++; | ||
| 6151 | } | ||
| 6152 | } | ||
| 6153 | adjust_coding_eol_type (coding, eol_seen); | ||
| 6154 | } | ||
| 6155 | coding->head_ascii = src - coding->source; | ||
| 6156 | coding->eol_seen = eol_seen; | ||
| 6157 | return (src == end); | ||
| 6158 | } | ||
| 6159 | |||
| 6160 | |||
| 6070 | /* Detect how end-of-line of a text of length SRC_BYTES pointed by | 6161 | /* Detect how end-of-line of a text of length SRC_BYTES pointed by |
| 6071 | SOURCE is encoded. If CATEGORY is one of | 6162 | SOURCE is encoded. If CATEGORY is one of |
| 6072 | coding_category_utf_16_XXXX, assume that CR and LF are encoded by | 6163 | coding_category_utf_16_XXXX, assume that CR and LF are encoded by |
| @@ -6211,7 +6302,6 @@ detect_coding (struct coding_system *coding) | |||
| 6211 | coding_set_source (coding); | 6302 | coding_set_source (coding); |
| 6212 | 6303 | ||
| 6213 | src_end = coding->source + coding->src_bytes; | 6304 | src_end = coding->source + coding->src_bytes; |
| 6214 | coding->head_ascii = 0; | ||
| 6215 | 6305 | ||
| 6216 | /* If we have not yet decided the text encoding type, detect it | 6306 | /* If we have not yet decided the text encoding type, detect it |
| 6217 | now. */ | 6307 | now. */ |
| @@ -6221,6 +6311,8 @@ detect_coding (struct coding_system *coding) | |||
| 6221 | struct coding_detection_info detect_info; | 6311 | struct coding_detection_info detect_info; |
| 6222 | bool null_byte_found = 0, eight_bit_found = 0; | 6312 | bool null_byte_found = 0, eight_bit_found = 0; |
| 6223 | 6313 | ||
| 6314 | coding->head_ascii = 0; | ||
| 6315 | coding->eol_seen = EOL_SEEN_NONE; | ||
| 6224 | detect_info.checked = detect_info.found = detect_info.rejected = 0; | 6316 | detect_info.checked = detect_info.found = detect_info.rejected = 0; |
| 6225 | for (src = coding->source; src < src_end; src++) | 6317 | for (src = coding->source; src < src_end; src++) |
| 6226 | { | 6318 | { |
| @@ -6259,6 +6351,26 @@ detect_coding (struct coding_system *coding) | |||
| 6259 | if (eight_bit_found) | 6351 | if (eight_bit_found) |
| 6260 | break; | 6352 | break; |
| 6261 | } | 6353 | } |
| 6354 | else if (! disable_ascii_optimization | ||
| 6355 | && ! inhibit_eol_conversion) | ||
| 6356 | { | ||
| 6357 | if (c == '\r') | ||
| 6358 | { | ||
| 6359 | if (src < src_end && src[1] == '\n') | ||
| 6360 | { | ||
| 6361 | coding->eol_seen |= EOL_SEEN_CRLF; | ||
| 6362 | src++; | ||
| 6363 | coding->head_ascii++; | ||
| 6364 | } | ||
| 6365 | else | ||
| 6366 | coding->eol_seen |= EOL_SEEN_CR; | ||
| 6367 | } | ||
| 6368 | else if (c == '\n') | ||
| 6369 | { | ||
| 6370 | coding->eol_seen |= EOL_SEEN_LF; | ||
| 6371 | } | ||
| 6372 | } | ||
| 6373 | |||
| 6262 | if (! eight_bit_found) | 6374 | if (! eight_bit_found) |
| 6263 | coding->head_ascii++; | 6375 | coding->head_ascii++; |
| 6264 | } | 6376 | } |
| @@ -6349,14 +6461,20 @@ detect_coding (struct coding_system *coding) | |||
| 6349 | coding_systems | 6461 | coding_systems |
| 6350 | = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom); | 6462 | = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom); |
| 6351 | detect_info.found = detect_info.rejected = 0; | 6463 | detect_info.found = detect_info.rejected = 0; |
| 6352 | coding->head_ascii = 0; | 6464 | if (detect_ascii (coding)) |
| 6353 | if (CONSP (coding_systems) | ||
| 6354 | && detect_coding_utf_8 (coding, &detect_info)) | ||
| 6355 | { | 6465 | { |
| 6356 | if (detect_info.found & CATEGORY_MASK_UTF_8_SIG) | 6466 | setup_coding_system (XCDR (coding_systems), coding); |
| 6357 | setup_coding_system (XCAR (coding_systems), coding); | 6467 | } |
| 6358 | else | 6468 | else |
| 6359 | setup_coding_system (XCDR (coding_systems), coding); | 6469 | { |
| 6470 | if (CONSP (coding_systems) | ||
| 6471 | && detect_coding_utf_8 (coding, &detect_info)) | ||
| 6472 | { | ||
| 6473 | if (detect_info.found & CATEGORY_MASK_UTF_8_SIG) | ||
| 6474 | setup_coding_system (XCAR (coding_systems), coding); | ||
| 6475 | else | ||
| 6476 | setup_coding_system (XCDR (coding_systems), coding); | ||
| 6477 | } | ||
| 6360 | } | 6478 | } |
| 6361 | } | 6479 | } |
| 6362 | else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id))) | 6480 | else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id))) |
| @@ -6369,6 +6487,7 @@ detect_coding (struct coding_system *coding) | |||
| 6369 | = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom); | 6487 | = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom); |
| 6370 | detect_info.found = detect_info.rejected = 0; | 6488 | detect_info.found = detect_info.rejected = 0; |
| 6371 | coding->head_ascii = 0; | 6489 | coding->head_ascii = 0; |
| 6490 | coding->eol_seen = EOL_SEEN_NONE; | ||
| 6372 | if (CONSP (coding_systems) | 6491 | if (CONSP (coding_systems) |
| 6373 | && detect_coding_utf_16 (coding, &detect_info)) | 6492 | && detect_coding_utf_16 (coding, &detect_info)) |
| 6374 | { | 6493 | { |
| @@ -6806,7 +6925,7 @@ produce_chars (struct coding_system *coding, Lisp_Object translation_table, | |||
| 6806 | 6925 | ||
| 6807 | produced = dst - (coding->destination + coding->produced); | 6926 | produced = dst - (coding->destination + coding->produced); |
| 6808 | if (BUFFERP (coding->dst_object) && produced_chars > 0) | 6927 | if (BUFFERP (coding->dst_object) && produced_chars > 0) |
| 6809 | insert_from_gap (produced_chars, produced); | 6928 | insert_from_gap (produced_chars, produced, 0); |
| 6810 | coding->produced += produced; | 6929 | coding->produced += produced; |
| 6811 | coding->produced_char += produced_chars; | 6930 | coding->produced_char += produced_chars; |
| 6812 | return carryover; | 6931 | return carryover; |
| @@ -7391,7 +7510,7 @@ encode_coding (struct coding_system *coding) | |||
| 7391 | } while (coding->consumed_char < coding->src_chars); | 7510 | } while (coding->consumed_char < coding->src_chars); |
| 7392 | 7511 | ||
| 7393 | if (BUFFERP (coding->dst_object) && coding->produced_char > 0) | 7512 | if (BUFFERP (coding->dst_object) && coding->produced_char > 0) |
| 7394 | insert_from_gap (coding->produced_char, coding->produced); | 7513 | insert_from_gap (coding->produced_char, coding->produced, 0); |
| 7395 | 7514 | ||
| 7396 | SAFE_FREE (); | 7515 | SAFE_FREE (); |
| 7397 | } | 7516 | } |
| @@ -7487,8 +7606,6 @@ decode_coding_gap (struct coding_system *coding, | |||
| 7487 | ptrdiff_t count = SPECPDL_INDEX (); | 7606 | ptrdiff_t count = SPECPDL_INDEX (); |
| 7488 | Lisp_Object attrs; | 7607 | Lisp_Object attrs; |
| 7489 | 7608 | ||
| 7490 | code_conversion_save (0, 0); | ||
| 7491 | |||
| 7492 | coding->src_object = Fcurrent_buffer (); | 7609 | coding->src_object = Fcurrent_buffer (); |
| 7493 | coding->src_chars = chars; | 7610 | coding->src_chars = chars; |
| 7494 | coding->src_bytes = bytes; | 7611 | coding->src_bytes = bytes; |
| @@ -7502,13 +7619,53 @@ decode_coding_gap (struct coding_system *coding, | |||
| 7502 | 7619 | ||
| 7503 | if (CODING_REQUIRE_DETECTION (coding)) | 7620 | if (CODING_REQUIRE_DETECTION (coding)) |
| 7504 | detect_coding (coding); | 7621 | detect_coding (coding); |
| 7622 | attrs = CODING_ID_ATTRS (coding->id); | ||
| 7623 | if (! disable_ascii_optimization) | ||
| 7624 | { | ||
| 7625 | if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)) | ||
| 7626 | && NILP (CODING_ATTR_POST_READ (attrs)) | ||
| 7627 | && NILP (get_translation_table (attrs, 0, NULL)) | ||
| 7628 | && (coding->head_ascii >= 0 /* We've already called detect_coding */ | ||
| 7629 | ? coding->head_ascii == bytes | ||
| 7630 | : detect_ascii (coding))) | ||
| 7631 | { | ||
| 7632 | if (coding->eol_seen == EOL_SEEN_CR) | ||
| 7633 | { | ||
| 7634 | unsigned char *src_end = GAP_END_ADDR; | ||
| 7635 | unsigned char *src = src_end - coding->src_bytes; | ||
| 7636 | |||
| 7637 | while (src < src_end) | ||
| 7638 | { | ||
| 7639 | if (*src++ == '\r') | ||
| 7640 | src[-1] = '\n'; | ||
| 7641 | } | ||
| 7642 | } | ||
| 7643 | else if (coding->eol_seen == EOL_SEEN_CRLF) | ||
| 7644 | { | ||
| 7645 | unsigned char *src = GAP_END_ADDR; | ||
| 7646 | unsigned char *src_beg = src - coding->src_bytes; | ||
| 7647 | unsigned char *dst = src; | ||
| 7648 | |||
| 7649 | while (src_beg < src) | ||
| 7650 | { | ||
| 7651 | *--dst = *--src; | ||
| 7652 | if (*src == '\n') | ||
| 7653 | src--; | ||
| 7654 | } | ||
| 7655 | bytes -= dst - src; | ||
| 7656 | } | ||
| 7657 | coding->produced_char = coding->produced = bytes; | ||
| 7658 | insert_from_gap (bytes, bytes, 1); | ||
| 7659 | return; | ||
| 7660 | } | ||
| 7661 | } | ||
| 7662 | code_conversion_save (0, 0); | ||
| 7505 | 7663 | ||
| 7506 | coding->mode |= CODING_MODE_LAST_BLOCK; | 7664 | coding->mode |= CODING_MODE_LAST_BLOCK; |
| 7507 | current_buffer->text->inhibit_shrinking = 1; | 7665 | current_buffer->text->inhibit_shrinking = 1; |
| 7508 | decode_coding (coding); | 7666 | decode_coding (coding); |
| 7509 | current_buffer->text->inhibit_shrinking = 0; | 7667 | current_buffer->text->inhibit_shrinking = 0; |
| 7510 | 7668 | ||
| 7511 | attrs = CODING_ID_ATTRS (coding->id); | ||
| 7512 | if (! NILP (CODING_ATTR_POST_READ (attrs))) | 7669 | if (! NILP (CODING_ATTR_POST_READ (attrs))) |
| 7513 | { | 7670 | { |
| 7514 | ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE; | 7671 | ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE; |
| @@ -7966,11 +8123,21 @@ from_unicode (Lisp_Object str) | |||
| 7966 | return code_convert_string_norecord (str, Qutf_16le, 0); | 8123 | return code_convert_string_norecord (str, Qutf_16le, 0); |
| 7967 | } | 8124 | } |
| 7968 | 8125 | ||
| 8126 | Lisp_Object | ||
| 8127 | from_unicode_buffer (const wchar_t* wstr) | ||
| 8128 | { | ||
| 8129 | return from_unicode ( | ||
| 8130 | make_unibyte_string ( | ||
| 8131 | (char*) wstr, | ||
| 8132 | /* we get one of the two final 0 bytes for free. */ | ||
| 8133 | 1 + sizeof (wchar_t) * wcslen (wstr))); | ||
| 8134 | } | ||
| 8135 | |||
| 7969 | wchar_t * | 8136 | wchar_t * |
| 7970 | to_unicode (Lisp_Object str, Lisp_Object *buf) | 8137 | to_unicode (Lisp_Object str, Lisp_Object *buf) |
| 7971 | { | 8138 | { |
| 7972 | *buf = code_convert_string_norecord (str, Qutf_16le, 1); | 8139 | *buf = code_convert_string_norecord (str, Qutf_16le, 1); |
| 7973 | /* We need to make a another copy (in addition to the one made by | 8140 | /* We need to make another copy (in addition to the one made by |
| 7974 | code_convert_string_norecord) to ensure that the final string is | 8141 | code_convert_string_norecord) to ensure that the final string is |
| 7975 | _doubly_ zero terminated --- that is, that the string is | 8142 | _doubly_ zero terminated --- that is, that the string is |
| 7976 | terminated by two zero bytes and one utf-16le null character. | 8143 | terminated by two zero bytes and one utf-16le null character. |
| @@ -10707,6 +10874,11 @@ from GNU Find and GNU Grep. Emacs will then ignore the null bytes and | |||
| 10707 | decode text as usual. */); | 10874 | decode text as usual. */); |
| 10708 | inhibit_null_byte_detection = 0; | 10875 | inhibit_null_byte_detection = 0; |
| 10709 | 10876 | ||
| 10877 | DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization, | ||
| 10878 | doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files. | ||
| 10879 | Internal use only. Removed after the experimental optimizer gets stable. */); | ||
| 10880 | disable_ascii_optimization = 0; | ||
| 10881 | |||
| 10710 | DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input, | 10882 | DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input, |
| 10711 | doc: /* Char table for translating self-inserting characters. | 10883 | doc: /* Char table for translating self-inserting characters. |
| 10712 | This is applied to the result of input methods, not their input. | 10884 | This is applied to the result of input methods, not their input. |