aboutsummaryrefslogtreecommitdiffstats
path: root/src/coding.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/coding.c')
-rw-r--r--src/coding.c200
1 files changed, 186 insertions, 14 deletions
diff --git a/src/coding.c b/src/coding.c
index 32da72ab626..6cfcec905a1 100644
--- a/src/coding.c
+++ b/src/coding.c
@@ -286,6 +286,10 @@ encode_coding_XXX (struct coding_system *coding)
286#include <config.h> 286#include <config.h>
287#include <stdio.h> 287#include <stdio.h>
288 288
289#ifdef HAVE_WCHAR_H
290#include <wchar.h>
291#endif /* HAVE_WCHAR_H */
292
289#include "lisp.h" 293#include "lisp.h"
290#include "character.h" 294#include "character.h"
291#include "buffer.h" 295#include "buffer.h"
@@ -6067,6 +6071,93 @@ complement_process_encoding_system (Lisp_Object coding_system)
6067#define EOL_SEEN_CR 2 6071#define EOL_SEEN_CR 2
6068#define EOL_SEEN_CRLF 4 6072#define EOL_SEEN_CRLF 4
6069 6073
6074
6075static Lisp_Object adjust_coding_eol_type (struct coding_system *coding, int eol_seen);
6076
6077
6078/* Return 1 if all the source bytes are ASCII, and return 0 otherwize.
6079 By side effects, set coding->head_ascii and coding->eol_seen. The
6080 value of coding->eol_seen is "logical or" of EOL_SEEN_LF,
6081 EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is reliable only when
6082 all the source bytes are ASCII. */
6083
6084static bool
6085detect_ascii (struct coding_system *coding)
6086{
6087 const unsigned char *src, *end;
6088 Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6089 int eol_seen;
6090
6091 eol_seen = (VECTORP (eol_type) ? EOL_SEEN_NONE
6092 : EQ (eol_type, Qunix) ? EOL_SEEN_LF
6093 : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6094 : EOL_SEEN_CR);
6095 coding_set_source (coding);
6096 src = coding->source;
6097 end = src + coding->src_bytes;
6098
6099 if (inhibit_eol_conversion)
6100 {
6101 /* We don't have to check EOL format. */
6102 while (src < end && !( *src & 0x80)) src++;
6103 eol_seen = EOL_SEEN_LF;
6104 adjust_coding_eol_type (coding, eol_seen);
6105 }
6106 else if (eol_seen != EOL_SEEN_NONE)
6107 {
6108 /* We don't have to check EOL format either. */
6109 while (src < end && !(*src & 0x80)) src++;
6110 }
6111 else
6112 {
6113 end--; /* We look ahead one byte. */
6114 while (src < end)
6115 {
6116 int c = *src;
6117
6118 if (c & 0x80)
6119 break;
6120 src++;
6121 if (c < 0x20)
6122 {
6123 if (c == '\r')
6124 {
6125 if (*src == '\n')
6126 {
6127 eol_seen |= EOL_SEEN_CRLF;
6128 src++;
6129 }
6130 else
6131 eol_seen |= EOL_SEEN_CR;
6132 }
6133 else if (c == '\n')
6134 eol_seen |= EOL_SEEN_LF;
6135 }
6136 }
6137 if (src > end)
6138 /* The last two bytes are CR LF, which means that we have
6139 scanned all bytes. */
6140 end++;
6141 else if (src == end)
6142 {
6143 end++;
6144 if (! (*src & 0x80))
6145 {
6146 if (*src == '\r')
6147 eol_seen |= EOL_SEEN_CR;
6148 else if (*src == '\n')
6149 eol_seen |= EOL_SEEN_LF;
6150 src++;
6151 }
6152 }
6153 adjust_coding_eol_type (coding, eol_seen);
6154 }
6155 coding->head_ascii = src - coding->source;
6156 coding->eol_seen = eol_seen;
6157 return (src == end);
6158}
6159
6160
6070/* Detect how end-of-line of a text of length SRC_BYTES pointed by 6161/* Detect how end-of-line of a text of length SRC_BYTES pointed by
6071 SOURCE is encoded. If CATEGORY is one of 6162 SOURCE is encoded. If CATEGORY is one of
6072 coding_category_utf_16_XXXX, assume that CR and LF are encoded by 6163 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
@@ -6211,7 +6302,6 @@ detect_coding (struct coding_system *coding)
6211 coding_set_source (coding); 6302 coding_set_source (coding);
6212 6303
6213 src_end = coding->source + coding->src_bytes; 6304 src_end = coding->source + coding->src_bytes;
6214 coding->head_ascii = 0;
6215 6305
6216 /* If we have not yet decided the text encoding type, detect it 6306 /* If we have not yet decided the text encoding type, detect it
6217 now. */ 6307 now. */
@@ -6221,6 +6311,8 @@ detect_coding (struct coding_system *coding)
6221 struct coding_detection_info detect_info; 6311 struct coding_detection_info detect_info;
6222 bool null_byte_found = 0, eight_bit_found = 0; 6312 bool null_byte_found = 0, eight_bit_found = 0;
6223 6313
6314 coding->head_ascii = 0;
6315 coding->eol_seen = EOL_SEEN_NONE;
6224 detect_info.checked = detect_info.found = detect_info.rejected = 0; 6316 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6225 for (src = coding->source; src < src_end; src++) 6317 for (src = coding->source; src < src_end; src++)
6226 { 6318 {
@@ -6259,6 +6351,26 @@ detect_coding (struct coding_system *coding)
6259 if (eight_bit_found) 6351 if (eight_bit_found)
6260 break; 6352 break;
6261 } 6353 }
6354 else if (! disable_ascii_optimization
6355 && ! inhibit_eol_conversion)
6356 {
6357 if (c == '\r')
6358 {
6359 if (src < src_end && src[1] == '\n')
6360 {
6361 coding->eol_seen |= EOL_SEEN_CRLF;
6362 src++;
6363 coding->head_ascii++;
6364 }
6365 else
6366 coding->eol_seen |= EOL_SEEN_CR;
6367 }
6368 else if (c == '\n')
6369 {
6370 coding->eol_seen |= EOL_SEEN_LF;
6371 }
6372 }
6373
6262 if (! eight_bit_found) 6374 if (! eight_bit_found)
6263 coding->head_ascii++; 6375 coding->head_ascii++;
6264 } 6376 }
@@ -6349,14 +6461,20 @@ detect_coding (struct coding_system *coding)
6349 coding_systems 6461 coding_systems
6350 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom); 6462 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6351 detect_info.found = detect_info.rejected = 0; 6463 detect_info.found = detect_info.rejected = 0;
6352 coding->head_ascii = 0; 6464 if (detect_ascii (coding))
6353 if (CONSP (coding_systems)
6354 && detect_coding_utf_8 (coding, &detect_info))
6355 { 6465 {
6356 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG) 6466 setup_coding_system (XCDR (coding_systems), coding);
6357 setup_coding_system (XCAR (coding_systems), coding); 6467 }
6358 else 6468 else
6359 setup_coding_system (XCDR (coding_systems), coding); 6469 {
6470 if (CONSP (coding_systems)
6471 && detect_coding_utf_8 (coding, &detect_info))
6472 {
6473 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6474 setup_coding_system (XCAR (coding_systems), coding);
6475 else
6476 setup_coding_system (XCDR (coding_systems), coding);
6477 }
6360 } 6478 }
6361 } 6479 }
6362 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id))) 6480 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
@@ -6369,6 +6487,7 @@ detect_coding (struct coding_system *coding)
6369 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom); 6487 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6370 detect_info.found = detect_info.rejected = 0; 6488 detect_info.found = detect_info.rejected = 0;
6371 coding->head_ascii = 0; 6489 coding->head_ascii = 0;
6490 coding->eol_seen = EOL_SEEN_NONE;
6372 if (CONSP (coding_systems) 6491 if (CONSP (coding_systems)
6373 && detect_coding_utf_16 (coding, &detect_info)) 6492 && detect_coding_utf_16 (coding, &detect_info))
6374 { 6493 {
@@ -6806,7 +6925,7 @@ produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6806 6925
6807 produced = dst - (coding->destination + coding->produced); 6926 produced = dst - (coding->destination + coding->produced);
6808 if (BUFFERP (coding->dst_object) && produced_chars > 0) 6927 if (BUFFERP (coding->dst_object) && produced_chars > 0)
6809 insert_from_gap (produced_chars, produced); 6928 insert_from_gap (produced_chars, produced, 0);
6810 coding->produced += produced; 6929 coding->produced += produced;
6811 coding->produced_char += produced_chars; 6930 coding->produced_char += produced_chars;
6812 return carryover; 6931 return carryover;
@@ -7391,7 +7510,7 @@ encode_coding (struct coding_system *coding)
7391 } while (coding->consumed_char < coding->src_chars); 7510 } while (coding->consumed_char < coding->src_chars);
7392 7511
7393 if (BUFFERP (coding->dst_object) && coding->produced_char > 0) 7512 if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7394 insert_from_gap (coding->produced_char, coding->produced); 7513 insert_from_gap (coding->produced_char, coding->produced, 0);
7395 7514
7396 SAFE_FREE (); 7515 SAFE_FREE ();
7397} 7516}
@@ -7487,8 +7606,6 @@ decode_coding_gap (struct coding_system *coding,
7487 ptrdiff_t count = SPECPDL_INDEX (); 7606 ptrdiff_t count = SPECPDL_INDEX ();
7488 Lisp_Object attrs; 7607 Lisp_Object attrs;
7489 7608
7490 code_conversion_save (0, 0);
7491
7492 coding->src_object = Fcurrent_buffer (); 7609 coding->src_object = Fcurrent_buffer ();
7493 coding->src_chars = chars; 7610 coding->src_chars = chars;
7494 coding->src_bytes = bytes; 7611 coding->src_bytes = bytes;
@@ -7502,13 +7619,53 @@ decode_coding_gap (struct coding_system *coding,
7502 7619
7503 if (CODING_REQUIRE_DETECTION (coding)) 7620 if (CODING_REQUIRE_DETECTION (coding))
7504 detect_coding (coding); 7621 detect_coding (coding);
7622 attrs = CODING_ID_ATTRS (coding->id);
7623 if (! disable_ascii_optimization)
7624 {
7625 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7626 && NILP (CODING_ATTR_POST_READ (attrs))
7627 && NILP (get_translation_table (attrs, 0, NULL))
7628 && (coding->head_ascii >= 0 /* We've already called detect_coding */
7629 ? coding->head_ascii == bytes
7630 : detect_ascii (coding)))
7631 {
7632 if (coding->eol_seen == EOL_SEEN_CR)
7633 {
7634 unsigned char *src_end = GAP_END_ADDR;
7635 unsigned char *src = src_end - coding->src_bytes;
7636
7637 while (src < src_end)
7638 {
7639 if (*src++ == '\r')
7640 src[-1] = '\n';
7641 }
7642 }
7643 else if (coding->eol_seen == EOL_SEEN_CRLF)
7644 {
7645 unsigned char *src = GAP_END_ADDR;
7646 unsigned char *src_beg = src - coding->src_bytes;
7647 unsigned char *dst = src;
7648
7649 while (src_beg < src)
7650 {
7651 *--dst = *--src;
7652 if (*src == '\n')
7653 src--;
7654 }
7655 bytes -= dst - src;
7656 }
7657 coding->produced_char = coding->produced = bytes;
7658 insert_from_gap (bytes, bytes, 1);
7659 return;
7660 }
7661 }
7662 code_conversion_save (0, 0);
7505 7663
7506 coding->mode |= CODING_MODE_LAST_BLOCK; 7664 coding->mode |= CODING_MODE_LAST_BLOCK;
7507 current_buffer->text->inhibit_shrinking = 1; 7665 current_buffer->text->inhibit_shrinking = 1;
7508 decode_coding (coding); 7666 decode_coding (coding);
7509 current_buffer->text->inhibit_shrinking = 0; 7667 current_buffer->text->inhibit_shrinking = 0;
7510 7668
7511 attrs = CODING_ID_ATTRS (coding->id);
7512 if (! NILP (CODING_ATTR_POST_READ (attrs))) 7669 if (! NILP (CODING_ATTR_POST_READ (attrs)))
7513 { 7670 {
7514 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE; 7671 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
@@ -7966,11 +8123,21 @@ from_unicode (Lisp_Object str)
7966 return code_convert_string_norecord (str, Qutf_16le, 0); 8123 return code_convert_string_norecord (str, Qutf_16le, 0);
7967} 8124}
7968 8125
8126Lisp_Object
8127from_unicode_buffer (const wchar_t* wstr)
8128{
8129 return from_unicode (
8130 make_unibyte_string (
8131 (char*) wstr,
8132 /* we get one of the two final 0 bytes for free. */
8133 1 + sizeof (wchar_t) * wcslen (wstr)));
8134}
8135
7969wchar_t * 8136wchar_t *
7970to_unicode (Lisp_Object str, Lisp_Object *buf) 8137to_unicode (Lisp_Object str, Lisp_Object *buf)
7971{ 8138{
7972 *buf = code_convert_string_norecord (str, Qutf_16le, 1); 8139 *buf = code_convert_string_norecord (str, Qutf_16le, 1);
7973 /* We need to make a another copy (in addition to the one made by 8140 /* We need to make another copy (in addition to the one made by
7974 code_convert_string_norecord) to ensure that the final string is 8141 code_convert_string_norecord) to ensure that the final string is
7975 _doubly_ zero terminated --- that is, that the string is 8142 _doubly_ zero terminated --- that is, that the string is
7976 terminated by two zero bytes and one utf-16le null character. 8143 terminated by two zero bytes and one utf-16le null character.
@@ -10707,6 +10874,11 @@ from GNU Find and GNU Grep. Emacs will then ignore the null bytes and
10707decode text as usual. */); 10874decode text as usual. */);
10708 inhibit_null_byte_detection = 0; 10875 inhibit_null_byte_detection = 0;
10709 10876
10877 DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
10878 doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
10879Internal use only. Removed after the experimental optimizer gets stable. */);
10880 disable_ascii_optimization = 0;
10881
10710 DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input, 10882 DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
10711 doc: /* Char table for translating self-inserting characters. 10883 doc: /* Char table for translating self-inserting characters.
10712This is applied to the result of input methods, not their input. 10884This is applied to the result of input methods, not their input.