aboutsummaryrefslogtreecommitdiffstats
path: root/src/coding.c
diff options
context:
space:
mode:
authorEli Zaretskii2019-11-23 11:27:43 +0200
committerEli Zaretskii2019-11-23 11:27:43 +0200
commitc26556bd18f8ca1e891bd1750c9f95b21ea457b0 (patch)
tree6d13489bbc75c0b0eef4d38b8df9ee290cf7e5ef /src/coding.c
parent6d4d00c63417e3479e978a373f252b9f2709ce39 (diff)
downloademacs-c26556bd18f8ca1e891bd1750c9f95b21ea457b0.tar.gz
emacs-c26556bd18f8ca1e891bd1750c9f95b21ea457b0.zip
Fix and speed up en/decoding of UTF-8 strings
* src/coding.c (get_char_bytes, encode_string_utf_8) (decode_string_utf_8): Fix commentary. (encode_string_utf_8): Return the original ASCII string only if NOCOPY is non-zero. (decode_string_utf_8): Accept 2 additional arguments STR and STR_LEN, which allow passing the input text as a C string. (make_string_from_utf8): Delegate the job to decode_string_utf_8. * src/coding.h: Update the prototype of decode_string_utf_8. * src/json.c (json_encode): Call encode_string_utf_8.
Diffstat (limited to 'src/coding.c')
-rw-r--r--src/coding.c202
1 files changed, 122 insertions, 80 deletions
diff --git a/src/coding.c b/src/coding.c
index 560ec0883ff..5f477cf9473 100644
--- a/src/coding.c
+++ b/src/coding.c
@@ -6353,11 +6353,15 @@ utf8_string_p (Lisp_Object string)
6353} 6353}
6354 6354
6355/* Like make_string, but always returns a multibyte Lisp string, and 6355/* Like make_string, but always returns a multibyte Lisp string, and
6356 avoids decoding if TEXT encoded in UTF-8. */ 6356 avoids decoding if TEXT is encoded in UTF-8. */
6357
6358Lisp_Object 6357Lisp_Object
6359make_string_from_utf8 (const char *text, ptrdiff_t nbytes) 6358make_string_from_utf8 (const char *text, ptrdiff_t nbytes)
6360{ 6359{
6360#if 0
6361 /* This method is on average 2 times slower than if we use
6362 decode_string_utf_8. However, please leave the slower
6363 implementation in the code for now, in case it needs to be reused
6364 in some situations. */
6361 ptrdiff_t chars, bytes; 6365 ptrdiff_t chars, bytes;
6362 parse_str_as_multibyte ((const unsigned char *) text, nbytes, 6366 parse_str_as_multibyte ((const unsigned char *) text, nbytes,
6363 &chars, &bytes); 6367 &chars, &bytes);
@@ -6374,6 +6378,9 @@ make_string_from_utf8 (const char *text, ptrdiff_t nbytes)
6374 decode_coding_object (&coding, Qnil, 0, 0, nbytes, nbytes, Qt); 6378 decode_coding_object (&coding, Qnil, 0, 0, nbytes, nbytes, Qt);
6375 return coding.dst_object; 6379 return coding.dst_object;
6376 } 6380 }
6381#else
6382 return decode_string_utf_8 (Qnil, text, nbytes, Qnil, false, Qt, Qt);
6383#endif
6377} 6384}
6378 6385
6379/* Detect how end-of-line of a text of length SRC_BYTES pointed by 6386/* Detect how end-of-line of a text of length SRC_BYTES pointed by
@@ -9537,7 +9544,7 @@ get_buffer_gap_address (Lisp_Object buffer, ptrdiff_t nbytes)
9537 return BUF_GPT_ADDR (buf); 9544 return BUF_GPT_ADDR (buf);
9538} 9545}
9539 9546
9540/* Return a pointer to the byte sequence for C, and set the length in 9547/* Return a pointer to the byte sequence for C, and its byte length in
9541 LEN. This function is used to get a byte sequence for HANDLE_8_BIT 9548 LEN. This function is used to get a byte sequence for HANDLE_8_BIT
9542 and HANDLE_OVER_UNI arguments of encode_string_utf_8 and 9549 and HANDLE_OVER_UNI arguments of encode_string_utf_8 and
9543 decode_string_utf_8 when those arguments are given by 9550 decode_string_utf_8 when those arguments are given by
@@ -9572,11 +9579,16 @@ get_char_bytes (int c, int *len)
9572 9579
9573/* Encode STRING by the coding system utf-8-unix. 9580/* Encode STRING by the coding system utf-8-unix.
9574 9581
9582 This function is optimized for speed when the input string is
9583 already a valid sequence of Unicode codepoints in the internal
9584 representation, i.e. there are neither 8-bit raw bytes nor
9585 characters beyond the Unicode range in the string's contents.
9586
9575 Ignore any :pre-write-conversion and :encode-translation-table 9587 Ignore any :pre-write-conversion and :encode-translation-table
9576 properties of that coding system. 9588 properties.
9577 9589
9578 Assume that arguments have values as described below. 9590 Assume that arguments have values as described below.
9579 The validity must be assured by callers. 9591 The validity must be enforced and ensured by the caller.
9580 9592
9581 STRING is a multibyte string or an ASCII-only unibyte string. 9593 STRING is a multibyte string or an ASCII-only unibyte string.
9582 9594
@@ -9587,17 +9599,24 @@ get_char_bytes (int c, int *len)
9587 inserted characters. The caller should have made BUFFER ready for 9599 inserted characters. The caller should have made BUFFER ready for
9588 modifying in advance (e.g., by calling invalidate_buffer_caches). 9600 modifying in advance (e.g., by calling invalidate_buffer_caches).
9589 9601
9590 If BUFFER is Qnil, return a unibyte string from the encoded result. 9602 If BUFFER is nil, return a unibyte string from the encoded result.
9591 If NOCOPY, and if STRING contains only Unicode characters (i.e., 9603
9592 the encoding does not change the byte sequence), return STRING even 9604 If NOCOPY is non-zero, and if STRING contains only Unicode
9593 if it is multibyte. 9605 characters (i.e., the encoding does not change the byte sequence),
9606 return STRING even if it is multibyte. WARNING: This will return a
9607 _multibyte_ string, something that callers might not expect, especially
9608 if STRING is not pure-ASCII; only use NOCOPY non-zero if the caller
9609 will only use the byte sequence of the encoded result accessed by
9610 SDATA or SSDATA, and the original STRING will _not_ be modified after
9611 the encoding. When in doubt, always pass NOCOPY as zero. You _have_
9612 been warned!
9594 9613
9595 HANDLE-8-BIT and HANDLE-OVER-UNI specify how to handle a non-Unicode 9614 HANDLE-8-BIT and HANDLE-OVER-UNI specify how to handle a non-Unicode
9596 character. The former is for an eight-bit character (represented 9615 character in STRING. The former is for an eight-bit character (represented
9597 by a 2-byte overlong sequence in a multibyte STRING). The latter is 9616 by a 2-byte overlong sequence in a multibyte STRING). The latter is
9598 for an over-Unicode character (a character whose code is greater 9617 for a codepoint beyond the end of the Unicode range (a character whose
9599 than the maximum Unicode character 0x10FFFF, represented by a 4 or 9618 code is greater than the maximum Unicode character 0x10FFFF, represented
9600 5-byte sequence in a multibyte STRING). 9619 by a 4 or 5-byte sequence in a multibyte STRING).
9601 9620
9602 If these two arguments are unibyte strings (typically 9621 If these two arguments are unibyte strings (typically
9603 "\357\277\275", the UTF-8 sequence for the Unicode REPLACEMENT 9622 "\357\277\275", the UTF-8 sequence for the Unicode REPLACEMENT
@@ -9605,18 +9624,20 @@ get_char_bytes (int c, int *len)
9605 unibyte sequence. 9624 unibyte sequence.
9606 9625
9607 If the two arguments are characters, encode a non-Unicode 9626 If the two arguments are characters, encode a non-Unicode
9608 character as if it was the argument. 9627 character as the respective argument characters.
9609 9628
9610 If they are Qignored, skip a non-Unicode character. 9629 If they are Qignored, skip a non-Unicode character.
9611 9630
9612 If HANDLE-8-BIT is Qt, encode an eight-bit character into one 9631 If HANDLE-8-BIT is Qt, encode eight-bit characters into single bytes
9613 byte of the same value. 9632 of the same value, like the usual Emacs encoding does.
9614 9633
9615 If HANDLE-OVER-UNI is Qt, encode an over-unicode character 9634 If HANDLE-OVER-UNI is Qt, encode characters beyond the Unicode
9616 into the same 4 or 5-byte sequence. 9635 range into the same 4 or 5-byte sequence as used by Emacs
9636 internally, like the usual Emacs encoding does.
9617 9637
9618 If the two arguments are Qnil, return Qnil if STRING has a 9638 If the two arguments are Qnil, return Qnil if STRING has a
9619 non-Unicode character. */ 9639 non-Unicode character. This allows the caller to signal an error
9640 if such input strings are not allowed. */
9620 9641
9621Lisp_Object 9642Lisp_Object
9622encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer, 9643encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
@@ -9624,15 +9645,15 @@ encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
9624 Lisp_Object handle_over_uni) 9645 Lisp_Object handle_over_uni)
9625{ 9646{
9626 ptrdiff_t nchars = SCHARS (string), nbytes = SBYTES (string); 9647 ptrdiff_t nchars = SCHARS (string), nbytes = SBYTES (string);
9627 if (NILP (buffer) && nchars == nbytes) 9648 if (NILP (buffer) && nchars == nbytes && nocopy)
9628 /* STRING contains only ASCII characters. */ 9649 /* STRING contains only ASCII characters. */
9629 return string; 9650 return string;
9630 9651
9631 ptrdiff_t num_8_bit = 0; /* number of eight-bit chars in STRING */ 9652 ptrdiff_t num_8_bit = 0; /* number of eight-bit chars in STRING */
9632 /* The following two vars are counted only if handle_over_uni is not Qt. */ 9653 /* The following two vars are counted only if handle_over_uni is not Qt. */
9633 ptrdiff_t num_over_4 = 0; /* number of 4-byte non-Unicode chars in STRING */ 9654 ptrdiff_t num_over_4 = 0; /* number of 4-byte non-Unicode chars in STRING */
9634 ptrdiff_t num_over_5 = 0; /* number of 5-byte non-Unicode chars in STRING */ 9655 ptrdiff_t num_over_5 = 0; /* number of 5-byte non-Unicode chars in STRING */
9635 ptrdiff_t outbytes; /* number of bytes of encoding result. */ 9656 ptrdiff_t outbytes; /* number of bytes of encoding result */
9636 unsigned char *p = SDATA (string); 9657 unsigned char *p = SDATA (string);
9637 unsigned char *pend = p + nbytes; 9658 unsigned char *pend = p + nbytes;
9638 unsigned char *src = NULL, *dst = NULL; 9659 unsigned char *src = NULL, *dst = NULL;
@@ -9668,10 +9689,10 @@ encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
9668 } 9689 }
9669 9690
9670 /* A character to change the byte sequence on encoding was 9691 /* A character to change the byte sequence on encoding was
9671 found. A rare case. */ 9692 found. A rare case. */
9672 if (len == 2) 9693 if (len == 2)
9673 { 9694 {
9674 /* Handle an eight-bit character by handle_8_bit. */ 9695 /* Handle an eight-bit character by handle_8_bit. */
9675 if (scan_count == 0) 9696 if (scan_count == 0)
9676 { 9697 {
9677 if (NILP (handle_8_bit)) 9698 if (NILP (handle_8_bit))
@@ -9699,7 +9720,7 @@ encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
9699 } 9720 }
9700 else /* len == 4 or 5 */ 9721 else /* len == 4 or 5 */
9701 { 9722 {
9702 /* Handle an over-unicode character by handle_over_uni. */ 9723 /* Handle an over-unicode character by handle_over_uni. */
9703 if (scan_count == 0) 9724 if (scan_count == 0)
9704 { 9725 {
9705 if (NILP (handle_over_uni)) 9726 if (NILP (handle_over_uni))
@@ -9729,19 +9750,20 @@ encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
9729 9750
9730 if (scan_count == 0) 9751 if (scan_count == 0)
9731 { 9752 {
9732 /* End of the first scane */ 9753 /* End of the first scan. */
9733 outbytes = nbytes; 9754 outbytes = nbytes;
9734 if (num_8_bit == 0 9755 if (num_8_bit == 0
9735 && (num_over_4 + num_over_5 == 0 || EQ (handle_over_uni, Qt))) 9756 && (num_over_4 + num_over_5 == 0 || EQ (handle_over_uni, Qt)))
9736 { 9757 {
9737 /* We can break the loop because there is no need of 9758 /* We can break the loop because there is no need of
9738 changing the byte sequence. This is the typical 9759 changing the byte sequence. This is the typical
9739 case. */ 9760 case. */
9740 scan_count = 1; 9761 scan_count = 1;
9741 } 9762 }
9742 else 9763 else
9743 { 9764 {
9744 /* Prepare for the next scan to handle non-Unicode characters. */ 9765 /* Prepare for handling non-Unicode characters during
9766 the next scan. */
9745 if (num_8_bit > 0) 9767 if (num_8_bit > 0)
9746 { 9768 {
9747 if (CHARACTERP (handle_8_bit)) 9769 if (CHARACTERP (handle_8_bit))
@@ -9792,7 +9814,7 @@ encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
9792 } 9814 }
9793 } 9815 }
9794 9816
9795 /* Prepare a return value and a space to store the encoded bytes. */ 9817 /* Prepare return value and space to store the encoded bytes. */
9796 if (BUFFERP (buffer)) 9818 if (BUFFERP (buffer))
9797 { 9819 {
9798 val = make_fixnum (outbytes); 9820 val = make_fixnum (outbytes);
@@ -9822,38 +9844,51 @@ encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
9822 return val; 9844 return val;
9823} 9845}
9824 9846
9825/* Decode STRING by the coding system utf-8-unix. 9847/* Decode input string by the coding system utf-8-unix.
9826 9848
9827 Ignore any :pre-write-conversion and :encode-translation-table 9849 This function is optimized for speed when the input string is
9828 properties of that coding system. 9850 already a valid UTF-8 sequence, i.e. there are neither 8-bit raw
9851 bytes nor any UTF-8 sequences longer than 4 bytes in the string's
9852 contents.
9829 9853
9830 Assumes that arguments have values as described below. 9854 Ignore any :post-read-conversion and :decode-translation-table
9831 The validity must be assured by callers. 9855 properties.
9832 9856
9833 STRING is a unibyte string or an ASCII-only multibyte string. 9857 Assume that arguments have values as described below.
9858 The validity must be enforced and ensured by the caller.
9834 9859
9835 BUFFER is a multibyte buffer or Qnil. 9860 STRING is a unibyte string, an ASCII-only multibyte string, or Qnil.
9861 If STRING is Qnil, the input is a C string pointed by STR whose
9862 length in bytes is in STR_LEN.
9836 9863
9864 BUFFER is a multibyte buffer or Qnil.
9837 If BUFFER is a multibyte buffer, insert the decoding result of 9865 If BUFFER is a multibyte buffer, insert the decoding result of
9838 Unicode characters after point of the buffer, and return the number 9866 Unicode characters after point of the buffer, and return the number
9839 of inserted characters. The caller should have made BUFFER ready 9867 of inserted characters. The caller should have made BUFFER ready
9840 for modifying in advance (e.g., by calling invalidate_buffer_caches). 9868 for modifying in advance (e.g., by calling invalidate_buffer_caches).
9841 9869
9842 If BUFFER is Qnil, return a multibyte string from the decoded result. 9870 If BUFFER is Qnil, return a multibyte string from the decoded result.
9843 As a special case, return STRING itself in the following cases:
9844 1. STRING contains only ASCII characters.
9845 2. NOCOPY is true, and STRING contains only valid UTF-8 sequences.
9846 9871
9847 For maximum speed, always specify NOCOPY true when STRING is 9872 NOCOPY non-zero means it is OK to return the input STRING if it
9848 guaranteed to contain only valid UTF-8 sequences. 9873 contains only ASCII characters or only valid UTF-8 sequences of 2
9874 to 4 bytes. WARNING: This will return a _unibyte_ string, something
9875 that callers might not expect, especially if STRING is not
9876 pure-ASCII; only use NOCOPY non-zero if the caller will only use
9877 the byte sequence of the decoded result accessed via SDATA or
9878 SSDATA, and if the original STRING will _not_ be modified after the
9879 decoding. When in doubt, always pass NOCOPY as zero. You _have_
9880 been warned!
9881
9882 If STRING is Qnil, and the original string is passed via STR, NOCOPY
9883 is ignored.
9849 9884
9850 HANDLE-8-BIT and HANDLE-OVER-UNI specify how to handle an invalid 9885 HANDLE-8-BIT and HANDLE-OVER-UNI specify how to handle an invalid
9851 byte sequence. The former is for an 1-byte invalid sequence that 9886 byte sequence. The former is for a 1-byte invalid sequence that
9852 violates the fundamental UTF-8 encoding rule. The latter is for a 9887 violates the fundamental UTF-8 encoding rules. The latter is for a
9853 4 or 5-byte invalid sequence that Emacs internally uses to 9888 4 or 5-byte overlong sequence that Emacs internally uses to
9854 represent an over-unicode character (a character of code greater 9889 represent characters beyond the Unicode range (characters whose
9855 than #x10FFFF). Note that this function does not treat an overlong 9890 codepoints are greater than #x10FFFF). Note that this function does
9856 UTF-8 sequence as invalid. 9891 not in general treat such overlong UTF-8 sequences as invalid.
9857 9892
9858 If these two arguments are strings (typically a 1-char string of 9893 If these two arguments are strings (typically a 1-char string of
9859 the Unicode REPLACEMENT CHARACTER #xFFFD), decode an invalid byte 9894 the Unicode REPLACEMENT CHARACTER #xFFFD), decode an invalid byte
@@ -9862,24 +9897,28 @@ encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
9862 9897
9863 If the two arguments are characters, decode an invalid byte 9898 If the two arguments are characters, decode an invalid byte
9864 sequence into the corresponding multibyte representation of the 9899 sequence into the corresponding multibyte representation of the
9865 characters. 9900 respective character.
9866 9901
9867 If they are Qignored, skip an invalid byte sequence. 9902 If they are Qignored, skip an invalid byte sequence without
9903 producing anything in the decoded string.
9868 9904
9869 If HANDLE-8-BIT is Qt, decode a 1-byte invalid sequence into 9905 If HANDLE-8-BIT is Qt, decode a 1-byte invalid sequence into the
9870 the corresponding eight-bit character. 9906 corresponding eight-bit multibyte representation, like the usual
9907 Emacs decoding does.
9871 9908
9872 If HANDLE-OVER-UNI is Qt, decode a 4 or 5-byte invalid sequence 9909 If HANDLE-OVER-UNI is Qt, decode a 4 or 5-byte overlong sequence
9873 that follows Emacs' representation for an over-unicode character 9910 that follows Emacs' internal representation for a character beyond
9874 into the corresponding character. 9911 Unicode range into the corresponding character, like the usual
9912 Emacs decoding does.
9875 9913
9876 If the two arguments are Qnil, return Qnil if STRING has an invalid 9914 If the two arguments are Qnil, return Qnil if the input string has
9877 sequence. */ 9915 raw bytes or overlong sequences. This allows the caller to signal
9916 an error if such inputs are not allowed. */
9878 9917
9879Lisp_Object 9918Lisp_Object
9880decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer, 9919decode_string_utf_8 (Lisp_Object string, const char *str, ptrdiff_t str_len,
9881 bool nocopy, Lisp_Object handle_8_bit, 9920 Lisp_Object buffer, bool nocopy,
9882 Lisp_Object handle_over_uni) 9921 Lisp_Object handle_8_bit, Lisp_Object handle_over_uni)
9883{ 9922{
9884 /* This is like BYTES_BY_CHAR_HEAD, but it is assured that C >= 0x80 9923 /* This is like BYTES_BY_CHAR_HEAD, but it is assured that C >= 0x80
9885 and it returns 0 for an invalid sequence. */ 9924 and it returns 0 for an invalid sequence. */
@@ -9891,24 +9930,26 @@ decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
9891 : (c) == 0xF8 ? 5 \ 9930 : (c) == 0xF8 ? 5 \
9892 : 0) 9931 : 0)
9893 9932
9894 ptrdiff_t nbytes = SBYTES (string); 9933 ptrdiff_t nbytes = STRINGP (string) ? SBYTES (string) : str_len;
9895 unsigned char *p = SDATA (string), *pend = p + nbytes; 9934 unsigned char *p = STRINGP (string) ? SDATA (string) : (unsigned char *) str;
9896 ptrdiff_t num_8_bit = 0; /* number of invalid 1-byte sequences. */ 9935 unsigned char *str_orig = p;
9897 ptrdiff_t num_over_4 = 0; /* number of invalid 4-byte sequences. */ 9936 unsigned char *pend = p + nbytes;
9898 ptrdiff_t num_over_5 = 0; /* number of invalid 5-byte sequences. */ 9937 ptrdiff_t num_8_bit = 0; /* number of invalid 1-byte sequences */
9899 ptrdiff_t outbytes = nbytes; /* number of decoded bytes. */ 9938 ptrdiff_t num_over_4 = 0; /* number of invalid 4-byte sequences */
9900 ptrdiff_t outchars = 0; /* number of decoded characters. */ 9939 ptrdiff_t num_over_5 = 0; /* number of invalid 5-byte sequences */
9940 ptrdiff_t outbytes = nbytes; /* number of decoded bytes */
9941 ptrdiff_t outchars = 0; /* number of decoded characters */
9901 unsigned char *src = NULL, *dst = NULL; 9942 unsigned char *src = NULL, *dst = NULL;
9902 bool change_byte_sequence = false; 9943 bool change_byte_sequence = false;
9903 9944
9904 /* Scan bytes in STRING twice. The first scan is to count invalid 9945 /* Scan input bytes twice. The first scan is to count invalid
9905 sequences, and the second scan is to decode STRING. If the 9946 sequences, and the second scan is to decode input. If the
9906 decoding is trivial (no need of changing the byte sequence), 9947 decoding is trivial (no need of changing the byte sequence),
9907 the second scan is avoided. */ 9948 the second scan is avoided. */
9908 while (p < pend) 9949 while (p < pend)
9909 { 9950 {
9910 src = p; 9951 src = p;
9911 /* Try short cut for an ASCII-only case. */ 9952 /* Try short cut for an ASCII-only case. */
9912 while (p < pend && *p < 0x80) p++; 9953 while (p < pend && *p < 0x80) p++;
9913 outchars += (p - src); 9954 outchars += (p - src);
9914 if (p == pend) 9955 if (p == pend)
@@ -9916,7 +9957,7 @@ decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
9916 int c = *p; 9957 int c = *p;
9917 outchars++; 9958 outchars++;
9918 int len = UTF_8_SEQUENCE_LENGTH (c); 9959 int len = UTF_8_SEQUENCE_LENGTH (c);
9919 /* len == 0, 2, 3, 4, 5 */ 9960 /* len == 0, 2, 3, 4, 5. */
9920 if (UTF_8_EXTRA_OCTET_P (p[1]) 9961 if (UTF_8_EXTRA_OCTET_P (p[1])
9921 && (len == 2 9962 && (len == 2
9922 || (UTF_8_EXTRA_OCTET_P (p[2]) 9963 || (UTF_8_EXTRA_OCTET_P (p[2])
@@ -9930,7 +9971,7 @@ decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
9930 continue; 9971 continue;
9931 } 9972 }
9932 9973
9933 /* A sequence to change on decoding was found. A rare case. */ 9974 /* A sequence to change on decoding was found. A rare case. */
9934 if (len == 0) 9975 if (len == 0)
9935 { 9976 {
9936 if (NILP (handle_8_bit)) 9977 if (NILP (handle_8_bit))
@@ -9951,19 +9992,19 @@ decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
9951 p += len; 9992 p += len;
9952 } 9993 }
9953 9994
9954 Lisp_Object val; /* the return value. */ 9995 Lisp_Object val; /* the return value */
9955 9996
9956 if (! change_byte_sequence 9997 if (! change_byte_sequence
9957 && NILP (buffer)) 9998 && NILP (buffer))
9958 { 9999 {
9959 if (nocopy) 10000 if (nocopy && STRINGP (string))
9960 return string; 10001 return string;
9961 val = make_uninit_multibyte_string (outchars, outbytes); 10002 val = make_uninit_multibyte_string (outchars, outbytes);
9962 memcpy (SDATA (val), SDATA (string), pend - SDATA (string)); 10003 memcpy (SDATA (val), str_orig, pend - str_orig);
9963 return val; 10004 return val;
9964 } 10005 }
9965 10006
9966 /* Count the number of resulting chars and bytes. */ 10007 /* Count the number of resulting chars and bytes. */
9967 unsigned char *replace_8_bit = NULL, *replace_over_uni = NULL; 10008 unsigned char *replace_8_bit = NULL, *replace_over_uni = NULL;
9968 int replace_8_bit_len = 0, replace_over_uni_len = 0; 10009 int replace_8_bit_len = 0, replace_over_uni_len = 0;
9969 10010
@@ -10022,7 +10063,7 @@ decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
10022 } 10063 }
10023 } 10064 }
10024 10065
10025 /* Prepare a return value and a space to store the decoded bytes. */ 10066 /* Prepare return value and space to store the decoded bytes. */
10026 if (BUFFERP (buffer)) 10067 if (BUFFERP (buffer))
10027 { 10068 {
10028 val = make_fixnum (outchars); 10069 val = make_fixnum (outchars);
@@ -10030,19 +10071,20 @@ decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
10030 } 10071 }
10031 else 10072 else
10032 { 10073 {
10033 if (nocopy && (num_8_bit + num_over_4 + num_over_5) == 0) 10074 if (nocopy && (num_8_bit + num_over_4 + num_over_5) == 0
10075 && STRINGP (string))
10034 return string; 10076 return string;
10035 val = make_uninit_multibyte_string (outchars, outbytes); 10077 val = make_uninit_multibyte_string (outchars, outbytes);
10036 dst = SDATA (val); 10078 dst = SDATA (val);
10037 } 10079 }
10038 10080
10039 src = SDATA (string); 10081 src = str_orig;
10040 if (change_byte_sequence) 10082 if (change_byte_sequence)
10041 { 10083 {
10042 p = src; 10084 p = src;
10043 while (p < pend) 10085 while (p < pend)
10044 { 10086 {
10045 /* Try short cut for an ASCII-only case. */ 10087 /* Try short cut for an ASCII-only case. */
10046 /* while (p < pend && *p < 0x80) p++; */ 10088 /* while (p < pend && *p < 0x80) p++; */
10047 /* if (p == pend) */ 10089 /* if (p == pend) */
10048 /* break; */ 10090 /* break; */
@@ -10089,7 +10131,7 @@ decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
10089 } 10131 }
10090 else /* len == 4 or 5 */ 10132 else /* len == 4 or 5 */
10091 { 10133 {
10092 /* Handle p[0]... by handle_over_uni */ 10134 /* Handle p[0]... by handle_over_uni. */
10093 if (replace_over_uni) 10135 if (replace_over_uni)
10094 { 10136 {
10095 memcpy (dst, replace_over_uni, replace_over_uni_len); 10137 memcpy (dst, replace_over_uni, replace_over_uni_len);