From bc989a58e2412c152c2aef9d35ca103979edebd5 Mon Sep 17 00:00:00 2001 From: Eli Zaretskii Date: Sat, 9 Mar 2013 20:09:33 +0200 Subject: coding.c (to_unicode): Fix a typo in a comment. --- src/coding.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/coding.c') diff --git a/src/coding.c b/src/coding.c index 32da72ab626..78e6cff7078 100644 --- a/src/coding.c +++ b/src/coding.c @@ -7970,7 +7970,7 @@ wchar_t * to_unicode (Lisp_Object str, Lisp_Object *buf) { *buf = code_convert_string_norecord (str, Qutf_16le, 1); - /* We need to make a another copy (in addition to the one made by + /* We need to make another copy (in addition to the one made by code_convert_string_norecord) to ensure that the final string is _doubly_ zero terminated --- that is, that the string is terminated by two zero bytes and one utf-16le null character. -- cgit v1.2.1 From c230dd7d89730f565df77046d0666d2082e386ee Mon Sep 17 00:00:00 2001 From: Kenichi Handa Date: Sun, 10 Mar 2013 23:36:35 +0900 Subject: On file insertion, skip decoding if all bytes are ASCII. 
--- src/coding.c | 43 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 4 deletions(-) (limited to 'src/coding.c') diff --git a/src/coding.c b/src/coding.c index 32da72ab626..f33b5e7c7d5 100644 --- a/src/coding.c +++ b/src/coding.c @@ -6349,7 +6349,12 @@ detect_coding (struct coding_system *coding) coding_systems = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom); detect_info.found = detect_info.rejected = 0; - coding->head_ascii = 0; + for (src = coding->source; src < src_end; src++) + { + if (*src & 0x80) + break; + } + coding->head_ascii = src - coding->source; if (CONSP (coding_systems) && detect_coding_utf_8 (coding, &detect_info)) { @@ -7487,8 +7492,6 @@ decode_coding_gap (struct coding_system *coding, ptrdiff_t count = SPECPDL_INDEX (); Lisp_Object attrs; - code_conversion_save (0, 0); - coding->src_object = Fcurrent_buffer (); coding->src_chars = chars; coding->src_bytes = bytes; @@ -7502,13 +7505,45 @@ decode_coding_gap (struct coding_system *coding, if (CODING_REQUIRE_DETECTION (coding)) detect_coding (coding); + attrs = CODING_ID_ATTRS (coding->id); +#ifndef CODING_DISABLE_ASCII_OPTIMIZATION + if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)) + && NILP (CODING_ATTR_POST_READ (attrs)) + && NILP (get_translation_table (attrs, 0, NULL))) + { + /* We can skip the conversion if all source bytes are ASCII. */ + if (coding->head_ascii < 0) + { + /* We have not yet counted the number of ASCII bytes at the + head of the source. Do it now. */ + const unsigned char *src, *src_end; + + coding_set_source (coding); + src_end = coding->source + coding->src_bytes; + for (src = coding->source; src < src_end; src++) + { + if (*src & 0x80) + break; + } + coding->head_ascii = src - coding->source; + } + if (coding->src_bytes == coding->head_ascii) + { + /* No need of conversion. Use the data in the gap as is. 
*/ + coding->produced_char = chars; + coding->produced = bytes; + adjust_after_replace (PT, PT_BYTE, Qnil, chars, bytes, 1); + return; + } + } +#endif /* not CODING_DISABLE_ASCII_OPTIMIZATION */ + code_conversion_save (0, 0); coding->mode |= CODING_MODE_LAST_BLOCK; current_buffer->text->inhibit_shrinking = 1; decode_coding (coding); current_buffer->text->inhibit_shrinking = 0; - attrs = CODING_ID_ATTRS (coding->id); if (! NILP (CODING_ATTR_POST_READ (attrs))) { ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE; -- cgit v1.2.1 From 7d051e215477753b813864caa23c1009c7692bda Mon Sep 17 00:00:00 2001 From: Kenichi Handa Date: Mon, 11 Mar 2013 00:06:04 +0900 Subject: Fix previous change. --- src/coding.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'src/coding.c') diff --git a/src/coding.c b/src/coding.c index 98af4ddcef7..d6560a92b70 100644 --- a/src/coding.c +++ b/src/coding.c @@ -7509,7 +7509,9 @@ decode_coding_gap (struct coding_system *coding, #ifndef CODING_DISABLE_ASCII_OPTIMIZATION if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)) && NILP (CODING_ATTR_POST_READ (attrs)) - && NILP (get_translation_table (attrs, 0, NULL))) + && NILP (get_translation_table (attrs, 0, NULL)) + && (inhibit_eol_conversion + || EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))) { /* We can skip the conversion if all source bytes are ASCII. */ if (coding->head_ascii < 0) -- cgit v1.2.1 From 819e2da92a18d7af03ccd9cf0a2e5b940eb7b54f Mon Sep 17 00:00:00 2001 From: Daniel Colascione Date: Sun, 10 Mar 2013 14:55:25 -0800 Subject: 2013-03-10 Daniel Colascione * w32term.h (GUISTR, GUI_ENCODE_FILE, GUI_ENCODE_SYSTEM, GUI_FN) (GUI_SDATA, guichar_t): Macros to abstract out differences between NTGUI_UNICODE and !NTGUI_UNICODE builds, some moved out of w32fns.c. * w32term.c (construct_drag_n_drop): Use the above macros to make drag-and-drop work for non-ASCII filenames in cygw32 builds. 
* w32fns.c (x_set_name, x_set_title): Use the above macros to properly display non-ASCII frame titles in cygw32 builds. * w32fns.c (Fw32_shell_execute): Use the above macros to properly call ShellExecute in cygw32 builds. * w32fns.c (Fx_file_dialog): Use the above macros to simplify the common file dialog code. * w32fns.c (Ffile_system_info): Remove from cygw32 builds, which can just use du like other systems. * coding.c (from_unicode_buffer): Declare. * coding.c (from_unicode_buffer): Implement. --- src/coding.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'src/coding.c') diff --git a/src/coding.c b/src/coding.c index d6560a92b70..c18632f301b 100644 --- a/src/coding.c +++ b/src/coding.c @@ -286,6 +286,10 @@ encode_coding_XXX (struct coding_system *coding) #include #include +#ifdef HAVE_WCHAR_H +#include <wchar.h> +#endif /* HAVE_WCHAR_H */ + #include "lisp.h" #include "character.h" #include "buffer.h" @@ -8003,6 +8007,16 @@ from_unicode (Lisp_Object str) return code_convert_string_norecord (str, Qutf_16le, 0); } +Lisp_Object +from_unicode_buffer (const wchar_t* wstr) +{ + return from_unicode ( + make_unibyte_string ( + (char*) wstr, + /* we get one of the two final 0 bytes for free. */ + 1 + sizeof (wchar_t) * wcslen (wstr))); +} + wchar_t * to_unicode (Lisp_Object str, Lisp_Object *buf) { -- cgit v1.2.1 From 8a44e6d176989d8eef140314098c76a70248ba61 Mon Sep 17 00:00:00 2001 From: Kenichi Handa Date: Sat, 16 Mar 2013 01:03:54 +0900 Subject: Optimize ASCII file reading with EOL format detection and decoding. 
--- src/coding.c | 197 +++++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 159 insertions(+), 38 deletions(-) (limited to 'src/coding.c') diff --git a/src/coding.c b/src/coding.c index c18632f301b..5047e1149bc 100644 --- a/src/coding.c +++ b/src/coding.c @@ -6071,6 +6071,93 @@ complement_process_encoding_system (Lisp_Object coding_system) #define EOL_SEEN_CR 2 #define EOL_SEEN_CRLF 4 + +static Lisp_Object adjust_coding_eol_type (struct coding_system *coding, int eol_seen); + + +/* Return 1 if all the source bytes are ASCII, and return 0 otherwize. + By side effects, set coding->head_ascii and coding->eol_seen. The + value of coding->eol_seen is "logical or" of EOL_SEEN_LF, + EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is reliable only when + all the source bytes are ASCII. */ + +static bool +detect_ascii (struct coding_system *coding) +{ + const unsigned char *src, *end; + Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id); + int eol_seen; + + eol_seen = (VECTORP (eol_type) ? EOL_SEEN_NONE + : EQ (eol_type, Qunix) ? EOL_SEEN_LF + : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF + : EOL_SEEN_CR); + coding_set_source (coding); + src = coding->source; + end = src + coding->src_bytes; + + if (inhibit_eol_conversion) + { + /* We don't have to check EOL format. */ + while (src < end && !( *src & 0x80)) src++; + eol_seen = EOL_SEEN_LF; + adjust_coding_eol_type (coding, eol_seen); + } + else if (eol_seen != EOL_SEEN_NONE) + { + /* We don't have to check EOL format either. */ + while (src < end && !(*src & 0x80)) src++; + } + else + { + end--; /* We look ahead one byte. */ + while (src < end) + { + int c = *src; + + if (c & 0x80) + break; + src++; + if (c < 0x20) + { + if (c == '\r') + { + if (*src == '\n') + { + eol_seen |= EOL_SEEN_CRLF; + src++; + } + else + eol_seen |= EOL_SEEN_CR; + } + else if (c == '\n') + eol_seen |= EOL_SEEN_LF; + } + } + if (src > end) + /* The last two bytes are CR LF, which means that we have + scanned all bytes. 
*/ + end++; + else if (src == end) + { + end++; + if (! (*src & 0x80)) + { + if (*src == '\r') + eol_seen |= EOL_SEEN_CR; + else if (*src == '\n') + eol_seen |= EOL_SEEN_LF; + src++; + } + } + adjust_coding_eol_type (coding, eol_seen); + } + coding->head_ascii = src - coding->source; + coding->eol_seen = eol_seen; + return (src == end); +} + + /* Detect how end-of-line of a text of length SRC_BYTES pointed by SOURCE is encoded. If CATEGORY is one of coding_category_utf_16_XXXX, assume that CR and LF are encoded by @@ -6215,7 +6302,6 @@ detect_coding (struct coding_system *coding) coding_set_source (coding); src_end = coding->source + coding->src_bytes; - coding->head_ascii = 0; /* If we have not yet decided the text encoding type, detect it now. */ @@ -6225,6 +6311,8 @@ detect_coding (struct coding_system *coding) struct coding_detection_info detect_info; bool null_byte_found = 0, eight_bit_found = 0; + coding->head_ascii = 0; + coding->eol_seen = EOL_SEEN_NONE; detect_info.checked = detect_info.found = detect_info.rejected = 0; for (src = coding->source; src < src_end; src++) { @@ -6263,6 +6351,26 @@ detect_coding (struct coding_system *coding) if (eight_bit_found) break; } + else if (! disable_ascii_optimization + && ! inhibit_eol_conversion) + { + if (c == '\r') + { + if (src < src_end && src[1] == '\n') + { + coding->eol_seen |= EOL_SEEN_CRLF; + src++; + coding->head_ascii++; + } + else + coding->eol_seen |= EOL_SEEN_CR; + } + else if (c == '\n') + { + coding->eol_seen |= EOL_SEEN_LF; + } + } + if (! 
eight_bit_found) coding->head_ascii++; } @@ -6353,19 +6461,20 @@ detect_coding (struct coding_system *coding) coding_systems = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom); detect_info.found = detect_info.rejected = 0; - for (src = coding->source; src < src_end; src++) + if (detect_ascii (coding)) { - if (*src & 0x80) - break; + setup_coding_system (XCDR (coding_systems), coding); } - coding->head_ascii = src - coding->source; - if (CONSP (coding_systems) - && detect_coding_utf_8 (coding, &detect_info)) + else { - if (detect_info.found & CATEGORY_MASK_UTF_8_SIG) - setup_coding_system (XCAR (coding_systems), coding); - else - setup_coding_system (XCDR (coding_systems), coding); + if (CONSP (coding_systems) + && detect_coding_utf_8 (coding, &detect_info)) + { + if (detect_info.found & CATEGORY_MASK_UTF_8_SIG) + setup_coding_system (XCAR (coding_systems), coding); + else + setup_coding_system (XCDR (coding_systems), coding); + } } } else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id))) @@ -6378,6 +6487,7 @@ detect_coding (struct coding_system *coding) = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom); detect_info.found = detect_info.rejected = 0; coding->head_ascii = 0; + coding->eol_seen = EOL_SEEN_NONE; if (CONSP (coding_systems) && detect_coding_utf_16 (coding, &detect_info)) { @@ -6815,7 +6925,7 @@ produce_chars (struct coding_system *coding, Lisp_Object translation_table, produced = dst - (coding->destination + coding->produced); if (BUFFERP (coding->dst_object) && produced_chars > 0) - insert_from_gap (produced_chars, produced); + insert_from_gap (produced_chars, produced, 0); coding->produced += produced; coding->produced_char += produced_chars; return carryover; @@ -7400,7 +7510,7 @@ encode_coding (struct coding_system *coding) } while (coding->consumed_char < coding->src_chars); if (BUFFERP (coding->dst_object) && coding->produced_char > 0) - insert_from_gap (coding->produced_char, coding->produced); + insert_from_gap 
(coding->produced_char, coding->produced, 0); SAFE_FREE (); } @@ -7510,39 +7620,45 @@ decode_coding_gap (struct coding_system *coding, if (CODING_REQUIRE_DETECTION (coding)) detect_coding (coding); attrs = CODING_ID_ATTRS (coding->id); -#ifndef CODING_DISABLE_ASCII_OPTIMIZATION - if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)) - && NILP (CODING_ATTR_POST_READ (attrs)) - && NILP (get_translation_table (attrs, 0, NULL)) - && (inhibit_eol_conversion - || EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))) + if (! disable_ascii_optimization) { - /* We can skip the conversion if all source bytes are ASCII. */ - if (coding->head_ascii < 0) + if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)) + && NILP (CODING_ATTR_POST_READ (attrs)) + && NILP (get_translation_table (attrs, 0, NULL)) + && (coding->head_ascii >= 0 /* We've already called detect_coding */ + ? coding->head_ascii == bytes + : detect_ascii (coding))) { - /* We have not yet counted the number of ASCII bytes at the - head of the source. Do it now. */ - const unsigned char *src, *src_end; + if (coding->eol_seen == EOL_SEEN_CR) + { + unsigned char *src_end = GAP_END_ADDR; + unsigned char *src = src - coding->src_bytes; - coding_set_source (coding); - src_end = coding->source + coding->src_bytes; - for (src = coding->source; src < src_end; src++) + while (src < src_end) + { + if (*src++ == '\r') + src[-1] = '\n'; + } + } + else if (coding->eol_seen == EOL_SEEN_CRLF) { - if (*src & 0x80) - break; + unsigned char *src = GAP_END_ADDR; + unsigned char *src_beg = src - coding->src_bytes; + unsigned char *dst = src; + + while (src_beg < src) + { + *--dst = *--src; + if (*src == '\n') + src--; + } + bytes -= dst - src; } - coding->head_ascii = src - coding->source; - } - if (coding->src_bytes == coding->head_ascii) - { - /* No need of conversion. Use the data in the gap as is. 
*/ - coding->produced_char = chars; - coding->produced = bytes; - adjust_after_replace (PT, PT_BYTE, Qnil, chars, bytes, 1); + coding->produced_char = coding->produced = bytes; + insert_from_gap (bytes, bytes, 1); return; } } -#endif /* not CODING_DISABLE_ASCII_OPTIMIZATION */ code_conversion_save (0, 0); coding->mode |= CODING_MODE_LAST_BLOCK; @@ -10758,6 +10874,11 @@ from GNU Find and GNU Grep. Emacs will then ignore the null bytes and decode text as usual. */); inhibit_null_byte_detection = 0; + DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization, + doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files. +Internal use only. Removed after the experimental optimizer gets stable. */); + disable_ascii_optimization = 0; + DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input, doc: /* Char table for translating self-inserting characters. This is applied to the result of input methods, not their input. -- cgit v1.2.1 From cded56c19b30e038537398b5213438c339428ed9 Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Fri, 15 Mar 2013 13:03:31 -0700 Subject: * coding.c (decode_coding_gap): Fix typo caught by static checking. --- src/coding.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/coding.c') diff --git a/src/coding.c b/src/coding.c index 5047e1149bc..6cfcec905a1 100644 --- a/src/coding.c +++ b/src/coding.c @@ -7632,7 +7632,7 @@ decode_coding_gap (struct coding_system *coding, if (coding->eol_seen == EOL_SEEN_CR) { unsigned char *src_end = GAP_END_ADDR; - unsigned char *src = src - coding->src_bytes; + unsigned char *src = src_end - coding->src_bytes; while (src < src_end) { -- cgit v1.2.1