Fix and speed up en/decoding of UTF-8 strings

* src/coding.c (get_char_bytes, encode_string_utf_8) (decode_string_utf_8): Fix commentary. (encode_string_utf_8): Return the original ASCII string only if NOCOPY is non-zero. (decode_string_utf_8): Accept 2 additional arguments STR and STR_LEN, which allow passing the input text as a C string. (make_string_from_utf8): Delegate the job to decode_string_utf_8. * src/coding.h: Update the prototype of decode_string_utf_8. * src/json.c (json_encode): Call encode_string_utf_8.
author: Eli Zaretskii 2019-11-23 11:27:43 +0200
committer: Eli Zaretskii 2019-11-23 11:27:43 +0200
commit: c26556bd18f8ca1e891bd1750c9f95b21ea457b0 (patch)
tree: 6d13489bbc75c0b0eef4d38b8df9ee290cf7e5ef /src/coding.c
parent: 6d4d00c63417e3479e978a373f252b9f2709ce39 (diff)
download: emacs-c26556bd18f8ca1e891bd1750c9f95b21ea457b0.tar.gz
emacs-c26556bd18f8ca1e891bd1750c9f95b21ea457b0.zip
1 file changed, 122 insertions, 80 deletions
diff --git a/src/coding.c b/src/coding.c
index 560ec0883ff..5f477cf9473 100644
--- a/src/coding.c
+++ b/src/coding.c
@@ -6353,11 +6353,15 @@ utf8_string_p (Lisp_Object string)
 }
 /* Like make_string, but always returns a multibyte Lisp string, and
-   avoids decoding if TEXT encoded in UTF-8.  */
+   avoids decoding if TEXT is encoded in UTF-8.  */
 Lisp_Object
 make_string_from_utf8 (const char *text, ptrdiff_t nbytes)
 {
+#if 0
+  /* This method is on average 2 times slower than if we use
+     decode_string_utf_8.  However, please leave the slower
+     implementation in the code for now, in case it needs to be reused
+     in some situations.  */
  ptrdiff_t chars, bytes;
  parse_str_as_multibyte ((const unsigned char *) text, nbytes,
                          &chars, &bytes);
@@ -6374,6 +6378,9 @@ make_string_from_utf8 (const char *text, ptrdiff_t nbytes)
      decode_coding_object (&coding, Qnil, 0, 0, nbytes, nbytes, Qt);
      return coding.dst_object;
    }
+#else
+  return decode_string_utf_8 (Qnil, text, nbytes, Qnil, false, Qt, Qt);
+#endif
 }
 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
@@ -9537,7 +9544,7 @@ get_buffer_gap_address (Lisp_Object buffer, ptrdiff_t nbytes)
  return BUF_GPT_ADDR (buf);
 }
-/* Return a pointer to the byte sequence for C, and set the length in
+/* Return a pointer to the byte sequence for C, and its byte length in
   LEN.  This function is used to get a byte sequence for HANDLE_8_BIT
   and HANDLE_OVER_UNI arguments of encode_string_utf_8 and
   decode_string_utf_8 when those arguments are given by
@@ -9572,11 +9579,16 @@ get_char_bytes (int c, int *len)
 /* Encode STRING by the coding system utf-8-unix.
+   This function is optimized for speed when the input string is
+   already a valid sequence of Unicode codepoints in the internal
+   representation, i.e. there are neither 8-bit raw bytes nor
+   characters beyond the Unicode range in the string's contents.
   Ignore any :pre-write-conversion and :encode-translation-table
-   properties of that coding system.
+   properties.
   Assume that arguments have values as described below.
-   The validity must be assured by callers.
+   The validity must be enforced and ensured by the caller.
   STRING is a multibyte string or an ASCII-only unibyte string.
@@ -9587,17 +9599,24 @@ get_char_bytes (int c, int *len)
   inserted characters.  The caller should have made BUFFER ready for
   modifying in advance (e.g., by calling invalidate_buffer_caches).
-   If BUFFER is Qnil, return a unibyte string from the encoded result.
+   If BUFFER is nil, return a unibyte string from the encoded result.
-   If NOCOPY, and if STRING contains only Unicode characters (i.e.,
-   the encoding does not change the byte sequence), return STRING even
+   If NOCOPY is non-zero, and if STRING contains only Unicode
-   if it is multibyte.
+   characters (i.e., the encoding does not change the byte sequence),
+   return STRING even if it is multibyte.  WARNING: This will return a
+   _multibyte_ string, something that callers might not expect, especially
+   if STRING is not pure-ASCII; only use NOCOPY non-zero if the caller
+   will only use the byte sequence of the encoded result accessed by
+   SDATA or SSDATA, and the original STRING will _not_ be modified after
+   the encoding.  When in doubt, always pass NOCOPY as zero.  You _have_
+   been warned!
   HANDLE-8-BIT and HANDLE-OVER-UNI specify how to handle a non-Unicode
-   character.  The former is for an eight-bit character (represented
+   character in STRING.  The former is for an eight-bit character (represented
   by a 2-byte overlong sequence in a multibyte STRING).  The latter is
-   for an over-Unicode character (a character whose code is greater
+   for a codepoint beyond the end of the Unicode range (a character whose
-   than the maximum Unicode character 0x10FFFF, represented by a 4 or
+   code is greater than the maximum Unicode character 0x10FFFF, represented
-   5-byte sequence in a multibyte STRING).
+   by a 4 or 5-byte sequence in a multibyte STRING).
   If these two arguments are unibyte strings (typically
   "\357\277\275", the UTF-8 sequence for the Unicode REPLACEMENT
@@ -9605,18 +9624,20 @@ get_char_bytes (int c, int *len)
   unibyte sequence.
   If the two arguments are characters, encode a non-Unicode
-   character as if it was the argument.
+   character as the respective argument characters.
   If they are Qignored, skip a non-Unicode character.
-   If HANDLE-8-BIT is Qt, encode an eight-bit character into one
+   If HANDLE-8-BIT is Qt, encode eight-bit characters into single bytes
-   byte of the same value.
+   of the same value, like the usual Emacs encoding does.
-   If HANDLE-OVER-UNI is Qt, encode an over-unicode character
+   If HANDLE-OVER-UNI is Qt, encode characters beyond the Unicode
-   into the same 4 or 5-byte sequence.
+   range into the same 4 or 5-byte sequence as used by Emacs
+   internally, like the usual Emacs encoding does.
   If the two arguments are Qnil, return Qnil if STRING has a
-   non-Unicode character.  */
+   non-Unicode character.  This allows the caller to signal an error
+   if such input strings are not allowed.  */
 Lisp_Object
 encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
@@ -9624,15 +9645,15 @@ encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
                     Lisp_Object handle_over_uni)
 {
  ptrdiff_t nchars = SCHARS (string), nbytes = SBYTES (string);
-  if (NILP (buffer) && nchars == nbytes)
+  if (NILP (buffer) && nchars == nbytes && nocopy)
-    /* STRING contains only ASCII characters. */
+    /* STRING contains only ASCII characters.  */
    return string;
  ptrdiff_t num_8_bit = 0;   /* number of eight-bit chars in STRING */
  /* The following two vars are counted only if handle_over_uni is not Qt.  */
  ptrdiff_t num_over_4 = 0; /* number of 4-byte non-Unicode chars in STRING */
  ptrdiff_t num_over_5 = 0; /* number of 5-byte non-Unicode chars in STRING */
-  ptrdiff_t outbytes;        /* number of bytes of decoding result. */
+  ptrdiff_t outbytes;        /* number of bytes of decoding result */
  unsigned char *p = SDATA (string);
  unsigned char *pend = p + nbytes;
  unsigned char *src = NULL, *dst = NULL;
@@ -9668,10 +9689,10 @@ encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
            }
          /* A character to change the byte sequence on encoding was
-             found.  A rare case. */
+             found.  A rare case.  */
          if (len == 2)
            {
-              /* Handle an eight-bit character by handle_8_bit. */
+              /* Handle an eight-bit character by handle_8_bit.  */
              if (scan_count == 0)
                {
                  if (NILP (handle_8_bit))
@@ -9699,7 +9720,7 @@ encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
            }
          else                  /* len == 4 or 5 */
            {
-              /* Handle an over-unicode character by handle_over_uni. */
+              /* Handle an over-unicode character by handle_over_uni.  */
              if (scan_count == 0)
                {
                  if (NILP (handle_over_uni))
@@ -9729,19 +9750,20 @@ encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
      if (scan_count == 0)
        {
-          /* End of the first scane */
+          /* End of the first scan.  */
          outbytes = nbytes;
          if (num_8_bit == 0
              && (num_over_4 + num_over_5 == 0 || EQ (handle_over_uni, Qt)))
            {
              /* We can break the loop because there is no need of
                 changing the byte sequence.  This is the typical
-                 case. */
+                 case.  */
              scan_count = 1;
            }
          else
            {
-              /* Prepare for the next scan to handle non-Unicode characters. */
+              /* Prepare for handling non-Unicode characters during
+                 the next scan.  */
              if (num_8_bit > 0)
                {
                  if (CHARACTERP (handle_8_bit))
@@ -9792,7 +9814,7 @@ encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
                }
            }
-          /* Prepare a return value and a space to store the encoded bytes. */
+          /* Prepare return value and space to store the encoded bytes.  */
          if (BUFFERP (buffer))
            {
              val = make_fixnum (outbytes);
@@ -9822,38 +9844,51 @@ encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
  return val;
 }
-/* Decode STRING by the coding system utf-8-unix.
+/* Decode input string by the coding system utf-8-unix.
-   Ignore any :pre-write-conversion and :encode-translation-table
+   This function is optimized for speed when the input string is
-   properties of that coding system.
+   already a valid UTF-8 sequence, i.e. there are neither 8-bit raw
+   bytes nor any UTF-8 sequences longer than 4 bytes in the string's
+   contents.
-   Assumes that arguments have values as described below.
+   Ignore any :post-read-conversion and :decode-translation-table
-   The validity must be assured by callers.
+   properties.
-   STRING is a unibyte string or an ASCII-only multibyte string.
+   Assume that arguments have values as described below.
+   The validity must be enforced and ensured by the caller.
-   BUFFER is a multibyte buffer or Qnil.
+   STRING is a unibyte string, an ASCII-only multibyte string, or Qnil.
+   If STRING is Qnil, the input is a C string pointed to by STR whose
+   length in bytes is in STR_LEN.
+   BUFFER is a multibyte buffer or Qnil.
   If BUFFER is a multibyte buffer, insert the decoding result of
   Unicode characters after point of the buffer, and return the number
   of inserted characters.  The caller should have made BUFFER ready
   for modifying in advance (e.g., by calling invalidate_buffer_caches).
   If BUFFER is Qnil, return a multibyte string from the decoded result.
-   As a special case, return STRING itself in the following cases:
-   1. STRING contains only ASCII characters.
-   2. NOCOPY is true, and STRING contains only valid UTF-8 sequences.
-   For maximum speed, always specify NOCOPY true when STRING is
+   NOCOPY non-zero means it is OK to return the input STRING if it
-   guaranteed to contain only valid UTF-8 sequences.
+   contains only ASCII characters or only valid UTF-8 sequences of 2
+   to 4 bytes.  WARNING: This will return a _unibyte_ string, something
+   that callers might not expect, especially if STRING is not
+   pure-ASCII; only use NOCOPY non-zero if the caller will only use
+   the byte sequence of the decoded result accessed via SDATA or
+   SSDATA, and if the original STRING will _not_ be modified after the
+   decoding.  When in doubt, always pass NOCOPY as zero.  You _have_
+   been warned!
+   If STRING is Qnil, and the original string is passed via STR, NOCOPY
+   is ignored.
   HANDLE-8-BIT and HANDLE-OVER-UNI specify how to handle a invalid
-   byte sequence.  The former is for an 1-byte invalid sequence that
+   byte sequence.  The former is for a 1-byte invalid sequence that
-   violates the fundamental UTF-8 encoding rule.  The latter is for a
+   violates the fundamental UTF-8 encoding rules.  The latter is for a
-   4 or 5-byte invalid sequence that Emacs internally uses to
+   4 or 5-byte overlong sequence that Emacs internally uses to
-   represent an over-unicode character (a character of code greater
+   represent characters beyond the Unicode range (characters whose
-   than #x10FFFF).  Note that this function does not treat an overlong
+   codepoints are greater than #x10FFFF).  Note that this function does
-   UTF-8 sequence as invalid.
+   not in general treat such overlong UTF-8 sequences as invalid.
   If these two arguments are strings (typically a 1-char string of
   the Unicode REPLACEMENT CHARACTER #xFFFD), decode an invalid byte
@@ -9862,24 +9897,28 @@ encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
   If the two arguments are characters, decode an invalid byte
   sequence into the corresponding multibyte representation of the
-   characters.
+   respective character.
-   If they are Qignored, skip an invalid byte sequence.
+   If they are Qignored, skip an invalid byte sequence without
+   producing anything in the decoded string.
-   If HANDLE-8-BIT is Qt, decode a 1-byte invalid sequence into
+   If HANDLE-8-BIT is Qt, decode a 1-byte invalid sequence into the
-   the corresponding eight-bit character.
+   corresponding eight-bit multibyte representation, like the usual
+   Emacs decoding does.
-   If HANDLE-OVER-UNI is Qt, decode a 4 or 5-byte invalid sequence
+   If HANDLE-OVER-UNI is Qt, decode a 4 or 5-byte overlong sequence
-   that follows Emacs' representation for an over-unicode character
+   that follows Emacs' internal representation for a character beyond
-   into the corresponding character.
+   Unicode range into the corresponding character, like the usual
+   Emacs decoding does.
-   If the two arguments are Qnil, return Qnil if STRING has an invalid
+   If the two arguments are Qnil, return Qnil if the input string has
-   sequence.  */
+   raw bytes or overlong sequences.  This allows the caller to signal
+   an error if such inputs are not allowed.  */
 Lisp_Object
-decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
+decode_string_utf_8 (Lisp_Object string, const char *str, ptrdiff_t str_len,
-                     bool nocopy, Lisp_Object handle_8_bit,
+                     Lisp_Object buffer, bool nocopy,
-                     Lisp_Object handle_over_uni)
+                     Lisp_Object handle_8_bit, Lisp_Object handle_over_uni)
 {
  /* This is like BYTES_BY_CHAR_HEAD, but it is assured that C >= 0x80
     and it returns 0 for an invalid sequence.  */
@@ -9891,24 +9930,26 @@ decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
   : (c) == 0xF8 ? 5                    \
   : 0)
-  ptrdiff_t nbytes = SBYTES (string);
+  ptrdiff_t nbytes = STRINGP (string) ? SBYTES (string) : str_len;
-  unsigned char *p = SDATA (string), *pend = p + nbytes;
+  unsigned char *p = STRINGP (string) ? SDATA (string) : (unsigned char *) str;
-  ptrdiff_t num_8_bit = 0;   /* number of invalid 1-byte sequences. */
+  unsigned char *str_orig = p;
-  ptrdiff_t num_over_4 = 0;  /* number of invalid 4-byte sequences. */
+  unsigned char *pend = p + nbytes;
-  ptrdiff_t num_over_5 = 0;  /* number of invalid 5-byte sequences. */
+  ptrdiff_t num_8_bit = 0;   /* number of invalid 1-byte sequences */
-  ptrdiff_t outbytes = nbytes;  /* number of decoded bytes. */
+  ptrdiff_t num_over_4 = 0;  /* number of invalid 4-byte sequences */
-  ptrdiff_t outchars = 0;    /* number of decoded characters. */
+  ptrdiff_t num_over_5 = 0;  /* number of invalid 5-byte sequences */
+  ptrdiff_t outbytes = nbytes;  /* number of decoded bytes */
+  ptrdiff_t outchars = 0;    /* number of decoded characters */
  unsigned char *src = NULL, *dst = NULL;
  bool change_byte_sequence = false;
-  /* Scan bytes in STRING twice.  The first scan is to count invalid
+  /* Scan input bytes twice.  The first scan is to count invalid
-     sequences, and the second scan is to decode STRING.  If the
+     sequences, and the second scan is to decode input.  If the
     decoding is trivial (no need of changing the byte sequence),
     the second scan is avoided.  */
  while (p < pend)
    {
      src = p;
-      /* Try short cut for an ASCII-only case. */
+      /* Try short cut for an ASCII-only case.  */
      while (p < pend && *p < 0x80) p++;
      outchars += (p - src);
      if (p == pend)
@@ -9916,7 +9957,7 @@ decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
      int c = *p;
      outchars++;
      int len = UTF_8_SEQUENCE_LENGTH (c);
-      /* len == 0, 2, 3, 4, 5 */
+      /* len == 0, 2, 3, 4, 5.  */
      if (UTF_8_EXTRA_OCTET_P (p[1])
          && (len == 2
              || (UTF_8_EXTRA_OCTET_P (p[2])
@@ -9930,7 +9971,7 @@ decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
          continue;
        }
-      /* A sequence to change on decoding was found.  A rare case. */
+      /* A sequence to change on decoding was found.  A rare case.  */
      if (len == 0)
        {
          if (NILP (handle_8_bit))
@@ -9951,19 +9992,19 @@ decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
      p += len;
    }
-  Lisp_Object val;           /* the return value. */
+  Lisp_Object val;           /* the return value */
  if (! change_byte_sequence
      && NILP (buffer))
    {
-      if (nocopy)
+      if (nocopy && STRINGP (string))
        return string;
      val = make_uninit_multibyte_string (outchars, outbytes);
-      memcpy (SDATA (val), SDATA (string), pend - SDATA (string));
+      memcpy (SDATA (val), str_orig, pend - str_orig);
      return val;
    }
-  /* Count the number of resulting chars and bytes. */
+  /* Count the number of resulting chars and bytes.  */
  unsigned char *replace_8_bit = NULL, *replace_over_uni = NULL;
  int replace_8_bit_len = 0, replace_over_uni_len = 0;
@@ -10022,7 +10063,7 @@ decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
        }
    }
-  /* Prepare a return value and a space to store the decoded bytes. */
+  /* Prepare return value and space to store the decoded bytes.  */
  if (BUFFERP (buffer))
    {
      val = make_fixnum (outchars);
@@ -10030,19 +10071,20 @@ decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
    }
  else
    {
-      if (nocopy && (num_8_bit + num_over_4 + num_over_5) == 0)
+      if (nocopy && (num_8_bit + num_over_4 + num_over_5) == 0
+          && STRINGP (string))
        return string;
      val = make_uninit_multibyte_string (outchars, outbytes);
      dst = SDATA (val);
    }
-  src = SDATA (string);
+  src = str_orig;
  if (change_byte_sequence)
    {
      p = src;
      while (p < pend)
        {
-          /* Try short cut for an ASCII-only case. */
+          /* Try short cut for an ASCII-only case.  */
          /* while (p < pend && *p < 0x80) p++; */
          /* if (p == pend) */
          /*   break; */
@@ -10089,7 +10131,7 @@ decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
            }
          else                  /* len == 4 or 5 */
            {
-              /* Handle p[0]... by handle_over_uni */
+              /* Handle p[0]... by handle_over_uni.  */
              if (replace_over_uni)
                {
                  memcpy (dst, replace_over_uni, replace_over_uni_len);
author	Eli Zaretskii	2019-11-23 11:27:43 +0200
committer	Eli Zaretskii	2019-11-23 11:27:43 +0200
commit	c26556bd18f8ca1e891bd1750c9f95b21ea457b0 (patch)
tree	6d13489bbc75c0b0eef4d38b8df9ee290cf7e5ef /src/coding.c
parent	6d4d00c63417e3479e978a373f252b9f2709ce39 (diff)
download	emacs-c26556bd18f8ca1e891bd1750c9f95b21ea457b0.tar.gz emacs-c26556bd18f8ca1e891bd1750c9f95b21ea457b0.zip