Optimize ASCII file reading with EOL format detection and decoding.

author: Kenichi Handa 2013-03-16 01:03:54 +0900
committer: Kenichi Handa 2013-03-16 01:03:54 +0900
commit: 8a44e6d176989d8eef140314098c76a70248ba61 (patch)
tree: 096ee4a0f9a15f2f300ba68d2dd1dd28b88e18a0 /src
parent: 9b5939800615a4e08ac389813a70faf4b9e57bba (diff)
download: emacs-8a44e6d176989d8eef140314098c76a70248ba61.tar.gz
emacs-8a44e6d176989d8eef140314098c76a70248ba61.zip
5 files changed, 209 insertions, 59 deletions
diff --git a/src/ChangeLog b/src/ChangeLog
index 8ae25e6e612..44e2ff1a1f1 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,31 @@
+2013-03-15  handa  <handa@gnu.org>
+        * insdel.c (insert_from_gap): New arg text_at_gap_tail.
+        (adjust_after_replace): Make it static again.  Delete the last
+        arg text_at_gap_tail.  Remove the code that handled it.
+        * coding.h (struct coding_system): New member eol_seen.
+        * coding.c (detect_ascii): New function.
+        (detect_coding): Set coding->head_ascii and coding->eol_seen only
+        when the source bytes are actually scanned.  On detecting for
+        coding_category_utf_8_auto, call detect_ascii instead of scanning
+        source bytes directly.
+        (produce_chars): Call insert_from_gap with the new arg 0.
+        (encode_coding): Likewise.
+        (decode_coding_gap): Control ASCII optimization by the variable
+        disable_ascii_optimization instead of #ifndef .. #endif.
+        Decode EOL format according to coding->eol_seen.
+        (syms_of_coding): Declare disable-ascii-optimization as a Lisp
+        variable.
+        * global.h (struct emacs_globals): New member
+        f_disable_ascii_optimization.
+        (disable_ascii_optimization): New macro.
+        * lisp.h (adjust_after_replace): Cancel externing it.
+        (insert_from_gap): Adjust prototype.
 2013-03-11  Paul Eggert  <eggert@cs.ucla.edu>
        * insdel.c (adjust_after_replace): Use bool for boolean.
diff --git a/src/coding.c b/src/coding.c
index c18632f301b..5047e1149bc 100644
--- a/src/coding.c
+++ b/src/coding.c
@@ -6071,6 +6071,93 @@ complement_process_encoding_system (Lisp_Object coding_system)
 #define EOL_SEEN_CR     2
 #define EOL_SEEN_CRLF   4
+static Lisp_Object adjust_coding_eol_type (struct coding_system *coding, int eol_seen);
+/* Return 1 if all the source bytes are ASCII, and return 0 otherwise.
+   As a side effect, set coding->head_ascii to the count of leading
+   ASCII bytes and coding->eol_seen to the bitwise OR of EOL_SEEN_LF,
+   EOL_SEEN_CR, and EOL_SEEN_CRLF; eol_seen is reliable only when
+   all the source bytes are ASCII.  */
+static bool
+detect_ascii (struct coding_system *coding)
+{
+  const unsigned char *src, *end;
+  Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
+  int eol_seen;
+  eol_seen = (VECTORP (eol_type) ? EOL_SEEN_NONE
+              : EQ (eol_type, Qunix) ? EOL_SEEN_LF
+              : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
+              : EOL_SEEN_CR);
+  coding_set_source (coding);
+  src = coding->source;
+  end = src + coding->src_bytes;
+  if (inhibit_eol_conversion)
+    {
+      /* EOL conversion is inhibited; just find the first 8-bit byte.  */
+      while (src < end && !( *src & 0x80)) src++;
+      eol_seen = EOL_SEEN_LF;
+      adjust_coding_eol_type (coding, eol_seen);
+    }
+  else if (eol_seen != EOL_SEEN_NONE)
+    {
+      /* EOL_TYPE is not a vector, so eol_seen is already set; skip EOL checks.  */
+      while (src < end && !(*src & 0x80)) src++;
+    }
+  else
+    {
+      end--;                    /* Stop one byte early: CR looks ahead at *src.  */
+      while (src < end)
+        {
+          int c = *src;
+          if (c & 0x80)
+            break;
+          src++;
+          if (c < 0x20)
+            {
+              if (c == '\r')
+                {
+                  if (*src == '\n')
+                    {
+                      eol_seen |= EOL_SEEN_CRLF;
+                      src++;
+                    }
+                  else
+                    eol_seen |= EOL_SEEN_CR;
+                }
+              else if (c == '\n')
+                eol_seen |= EOL_SEEN_LF;
+            }
+        }
+      if (src > end)
+        /* The last two bytes are CR LF, which means that we have
+           scanned all bytes.  Restore END to the real end of source.  */
+        end++;
+      else if (src == end)
+        {
+          end++;
+          if (! (*src & 0x80))
+            {
+              if (*src == '\r')
+                eol_seen |= EOL_SEEN_CR;
+              else if (*src  == '\n')
+                eol_seen |= EOL_SEEN_LF;
+              src++;
+            }
+        }
+      adjust_coding_eol_type (coding, eol_seen);
+    }
+  coding->head_ascii = src - coding->source;
+  coding->eol_seen = eol_seen;
+  return (src == end);
+}
 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
   SOURCE is encoded.  If CATEGORY is one of
   coding_category_utf_16_XXXX, assume that CR and LF are encoded by
@@ -6215,7 +6302,6 @@ detect_coding (struct coding_system *coding)
  coding_set_source (coding);
  src_end = coding->source + coding->src_bytes;
-  coding->head_ascii = 0;
  /* If we have not yet decided the text encoding type, detect it
     now.  */
@@ -6225,6 +6311,8 @@ detect_coding (struct coding_system *coding)
      struct coding_detection_info detect_info;
      bool null_byte_found = 0, eight_bit_found = 0;
+      coding->head_ascii = 0;
+      coding->eol_seen = EOL_SEEN_NONE;
      detect_info.checked = detect_info.found = detect_info.rejected = 0;
      for (src = coding->source; src < src_end; src++)
        {
@@ -6263,6 +6351,26 @@ detect_coding (struct coding_system *coding)
                  if (eight_bit_found)
                    break;
                }
+              else if (! disable_ascii_optimization
+                       && ! inhibit_eol_conversion)
+                {
+                  if (c == '\r')
+                    {
+                      if (src < src_end && src[1] == '\n')
+                        {
+                          coding->eol_seen |= EOL_SEEN_CRLF;
+                          src++;
+                          coding->head_ascii++;
+                        }
+                      else
+                        coding->eol_seen |= EOL_SEEN_CR;
+                    }
+                  else if (c == '\n')
+                    {
+                      coding->eol_seen |= EOL_SEEN_LF;
+                    }
+                }
              if (! eight_bit_found)
                coding->head_ascii++;
            }
@@ -6353,19 +6461,20 @@ detect_coding (struct coding_system *coding)
      coding_systems
        = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
      detect_info.found = detect_info.rejected = 0;
-      for (src = coding->source; src < src_end; src++)
+      if (detect_ascii (coding))
        {
-          if (*src & 0x80)
+          setup_coding_system (XCDR (coding_systems), coding);
-            break;
        }
-      coding->head_ascii = src - coding->source;
+      else
-      if (CONSP (coding_systems)
-          && detect_coding_utf_8 (coding, &detect_info))
        {
-          if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
+          if (CONSP (coding_systems)
-            setup_coding_system (XCAR (coding_systems), coding);
+              && detect_coding_utf_8 (coding, &detect_info))
-          else
+            {
-            setup_coding_system (XCDR (coding_systems), coding);
+              if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
+                setup_coding_system (XCAR (coding_systems), coding);
+              else
+                setup_coding_system (XCDR (coding_systems), coding);
+            }
        }
    }
  else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
@@ -6378,6 +6487,7 @@ detect_coding (struct coding_system *coding)
        = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
      detect_info.found = detect_info.rejected = 0;
      coding->head_ascii = 0;
+      coding->eol_seen = EOL_SEEN_NONE;
      if (CONSP (coding_systems)
          && detect_coding_utf_16 (coding, &detect_info))
        {
@@ -6815,7 +6925,7 @@ produce_chars (struct coding_system *coding, Lisp_Object translation_table,
  produced = dst - (coding->destination + coding->produced);
  if (BUFFERP (coding->dst_object) && produced_chars > 0)
-    insert_from_gap (produced_chars, produced);
+    insert_from_gap (produced_chars, produced, 0);
  coding->produced += produced;
  coding->produced_char += produced_chars;
  return carryover;
@@ -7400,7 +7510,7 @@ encode_coding (struct coding_system *coding)
  } while (coding->consumed_char < coding->src_chars);
  if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
-    insert_from_gap (coding->produced_char, coding->produced);
+    insert_from_gap (coding->produced_char, coding->produced, 0);
  SAFE_FREE ();
 }
@@ -7510,39 +7620,45 @@ decode_coding_gap (struct coding_system *coding,
  if (CODING_REQUIRE_DETECTION (coding))
    detect_coding (coding);
  attrs = CODING_ID_ATTRS (coding->id);
-#ifndef CODING_DISABLE_ASCII_OPTIMIZATION
+  if (! disable_ascii_optimization)
-  if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
-      && NILP (CODING_ATTR_POST_READ (attrs))
-      && NILP (get_translation_table (attrs, 0, NULL))
-      && (inhibit_eol_conversion
-          || EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)))
    {
-      /* We can skip the conversion if all source bytes are ASCII.  */
+      if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
-      if (coding->head_ascii < 0)
+          && NILP (CODING_ATTR_POST_READ (attrs))
+          && NILP (get_translation_table (attrs, 0, NULL))
+          && (coding->head_ascii >= 0 /* We've already called detect_coding */
+              ? coding->head_ascii == bytes
+              : detect_ascii (coding)))
        {
-          /* We have not yet counted the number of ASCII bytes at the
+          if (coding->eol_seen == EOL_SEEN_CR) /* Turn each CR into LF in place.  */
-             head of the source.  Do it now.  */
+            {
-          const unsigned char *src, *src_end;
+              unsigned char *src_end = GAP_END_ADDR;
+              unsigned char *src = src_end - coding->src_bytes;
-          coding_set_source (coding);
+              while (src < src_end)
-          src_end = coding->source + coding->src_bytes;
+                {
-          for (src = coding->source; src < src_end; src++)
+                  if (*src++ == '\r')
+                    src[-1] = '\n';
+                }
+            }
+          else if (coding->eol_seen == EOL_SEEN_CRLF) /* Drop the CR of each CR LF.  */
            {
-              if (*src & 0x80)
+              unsigned char *src = GAP_END_ADDR;
-                break;
+              unsigned char *src_beg = src - coding->src_bytes;
+              unsigned char *dst = src;
+              while (src_beg < src)
+                {
+                  *--dst = *--src;
+                  if (*src == '\n')
+                    src--;
+                }
+              bytes -= dst - src;
            }
-          coding->head_ascii = src - coding->source;
+          coding->produced_char = coding->produced = bytes;
-        }
+          insert_from_gap (bytes, bytes, 1);
-      if (coding->src_bytes == coding->head_ascii)
-        {
-          /* No need of conversion.  Use the data in the gap as is.  */
-          coding->produced_char = chars;
-          coding->produced = bytes;
-          adjust_after_replace (PT, PT_BYTE, Qnil, chars, bytes, 1);
          return;
        }
    }
-#endif  /* not CODING_DISABLE_ASCII_OPTIMIZATION */
  code_conversion_save (0, 0);
  coding->mode |= CODING_MODE_LAST_BLOCK;
@@ -10758,6 +10874,11 @@ from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
 decode text as usual.  */);
  inhibit_null_byte_detection = 0;
+  DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
+               doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
+Internal use only.  Removed after the experimental optimizer gets stable. */);
+  disable_ascii_optimization = 0;
  DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
               doc: /* Char table for translating self-inserting characters.
 This is applied to the result of input methods, not their input.
diff --git a/src/coding.h b/src/coding.h
index c13567c3d53..d40209be68f 100644
--- a/src/coding.h
+++ b/src/coding.h
@@ -440,9 +440,13 @@ struct coding_system
  /* How may heading bytes we can skip for decoding.  This is set to
     -1 in setup_coding_system, and updated by detect_coding.  So,
     when this is equal to the byte length of the text being
-     converted, we can skip the actual conversion process.  */
+     converted, we can skip the actual conversion process except for
+     the eol format.  */
  ptrdiff_t head_ascii;
+  /* Used internally in coding.c.  See the comment of detect_ascii.  */
+  int eol_seen;
  /* The following members are set by encoding/decoding routine.  */
  ptrdiff_t produced, produced_char, consumed, consumed_char;
diff --git a/src/insdel.c b/src/insdel.c
index c0afa80d5e8..a60fed0c32e 100644
--- a/src/insdel.c
+++ b/src/insdel.c
@@ -977,10 +977,11 @@ insert_from_string_1 (Lisp_Object string, ptrdiff_t pos, ptrdiff_t pos_byte,
 }
 /* Insert a sequence of NCHARS chars which occupy NBYTES bytes
-   starting at GPT_ADDR.  */
+   starting at GAP_END_ADDR - NBYTES (if text_at_gap_tail) and at
+   GPT_ADDR (if not text_at_gap_tail).  */
 void
-insert_from_gap (ptrdiff_t nchars, ptrdiff_t nbytes)
+insert_from_gap (ptrdiff_t nchars, ptrdiff_t nbytes, bool text_at_gap_tail)
 {
  if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
    nchars = nbytes;
@@ -989,10 +990,13 @@ insert_from_gap (ptrdiff_t nchars, ptrdiff_t nbytes)
  MODIFF++;
  GAP_SIZE -= nbytes;
-  GPT += nchars;
+  if (! text_at_gap_tail)
+    {
+      GPT += nchars;
+      GPT_BYTE += nbytes;
+    }
  ZV += nchars;
  Z += nchars;
-  GPT_BYTE += nbytes;
  ZV_BYTE += nbytes;
  Z_BYTE += nbytes;
  if (GAP_SIZE > 0) *(GPT_ADDR) = 0; /* Put an anchor.  */
@@ -1010,7 +1014,7 @@ insert_from_gap (ptrdiff_t nchars, ptrdiff_t nbytes)
                                   current_buffer, 0);
    }
-  if (GPT - nchars < PT)
+  if (! text_at_gap_tail && GPT - nchars < PT)
    adjust_point (nchars, nbytes);
  check_markers ();
@@ -1162,16 +1166,14 @@ insert_from_buffer_1 (struct buffer *buf,
 /* Record undo information and adjust markers and position keepers for
   a replacement of a text PREV_TEXT at FROM to a new text of LEN
-   chars (LEN_BYTE bytes).  If TEXT_AT_GAP_TAIL, the new text
+   chars (LEN_BYTE bytes) which resides in the gap just after
-   resides at the gap tail; i.e. at (GAP_END_ADDR - LEN_BYTE)
+   GPT_ADDR.
-   Otherwise, the text resides in the gap just after GPT_BYTE.
   PREV_TEXT nil means the new text was just inserted.  */
-void
+static void
 adjust_after_replace (ptrdiff_t from, ptrdiff_t from_byte,
-                      Lisp_Object prev_text, ptrdiff_t len, ptrdiff_t len_byte,
+                      Lisp_Object prev_text, ptrdiff_t len, ptrdiff_t len_byte)
-                      bool text_at_gap_tail)
 {
  ptrdiff_t nchars_del = 0, nbytes_del = 0;
@@ -1191,11 +1193,8 @@ adjust_after_replace (ptrdiff_t from, ptrdiff_t from_byte,
  GAP_SIZE -= len_byte;
  ZV += len; Z+= len;
  ZV_BYTE += len_byte; Z_BYTE += len_byte;
-  if (! text_at_gap_tail)
+  GPT += len; GPT_BYTE += len_byte;
-    {
+  if (GAP_SIZE > 0) *(GPT_ADDR) = 0; /* Put an anchor. */
-      GPT += len; GPT_BYTE += len_byte;
-      if (GAP_SIZE > 0) *(GPT_ADDR) = 0; /* Put an anchor. */
-    }
  if (nchars_del > 0)
    adjust_markers_for_replace (from, from_byte, nchars_del, nbytes_del,
@@ -1250,7 +1249,7 @@ adjust_after_insert (ptrdiff_t from, ptrdiff_t from_byte,
  GPT -= len; GPT_BYTE -= len_byte;
  ZV -= len; ZV_BYTE -= len_byte;
  Z -= len; Z_BYTE -= len_byte;
-  adjust_after_replace (from, from_byte, Qnil, newlen, len_byte, 0);
+  adjust_after_replace (from, from_byte, Qnil, newlen, len_byte);
 }
 /* Replace the text from character positions FROM to TO with NEW,
diff --git a/src/lisp.h b/src/lisp.h
index bb9f60b29f9..b2ab5684d4d 100644
--- a/src/lisp.h
+++ b/src/lisp.h
@@ -2880,7 +2880,7 @@ extern void insert (const char *, ptrdiff_t);
 extern void insert_and_inherit (const char *, ptrdiff_t);
 extern void insert_1_both (const char *, ptrdiff_t, ptrdiff_t,
                           bool, bool, bool);
-extern void insert_from_gap (ptrdiff_t, ptrdiff_t);
+extern void insert_from_gap (ptrdiff_t, ptrdiff_t, bool text_at_gap_tail);
 extern void insert_from_string (Lisp_Object, ptrdiff_t, ptrdiff_t,
                                ptrdiff_t, ptrdiff_t, bool);
 extern void insert_from_buffer (struct buffer *, ptrdiff_t, ptrdiff_t, bool);
@@ -2900,8 +2900,6 @@ extern Lisp_Object del_range_2 (ptrdiff_t, ptrdiff_t,
 extern void modify_region_1 (ptrdiff_t, ptrdiff_t, bool);
 extern void prepare_to_modify_buffer (ptrdiff_t, ptrdiff_t, ptrdiff_t *);
 extern void signal_after_change (ptrdiff_t, ptrdiff_t, ptrdiff_t);
-extern void adjust_after_replace (ptrdiff_t, ptrdiff_t, Lisp_Object,
-                                  ptrdiff_t, ptrdiff_t, bool);
 extern void adjust_after_insert (ptrdiff_t, ptrdiff_t, ptrdiff_t,
                                 ptrdiff_t, ptrdiff_t);
 extern void adjust_markers_for_delete (ptrdiff_t, ptrdiff_t,
author	Kenichi Handa	2013-03-16 01:03:54 +0900
committer	Kenichi Handa	2013-03-16 01:03:54 +0900
commit	8a44e6d176989d8eef140314098c76a70248ba61 (patch)
tree	096ee4a0f9a15f2f300ba68d2dd1dd28b88e18a0 /src
parent	9b5939800615a4e08ac389813a70faf4b9e57bba (diff)
download	emacs-8a44e6d176989d8eef140314098c76a70248ba61.tar.gz emacs-8a44e6d176989d8eef140314098c76a70248ba61.zip