diff options
| author | Eli Zaretskii | 2019-11-23 11:27:43 +0200 |
|---|---|---|
| committer | Eli Zaretskii | 2019-11-23 11:27:43 +0200 |
| commit | c26556bd18f8ca1e891bd1750c9f95b21ea457b0 (patch) | |
| tree | 6d13489bbc75c0b0eef4d38b8df9ee290cf7e5ef /src/coding.c | |
| parent | 6d4d00c63417e3479e978a373f252b9f2709ce39 (diff) | |
| download | emacs-c26556bd18f8ca1e891bd1750c9f95b21ea457b0.tar.gz emacs-c26556bd18f8ca1e891bd1750c9f95b21ea457b0.zip | |
Fix and speed up en/decoding of UTF-8 strings
* src/coding.c (get_char_bytes, encode_string_utf_8)
(decode_string_utf_8): Fix commentary.
(encode_string_utf_8): Return the original ASCII string only
if NOCOPY is non-zero.
(decode_string_utf_8): Accept 2 additional arguments STR and
STR_LEN, which allow passing the input text as a C string.
(make_string_from_utf8): Delegate the job to decode_string_utf_8.
* src/coding.h: Update the prototype of decode_string_utf_8.
* src/json.c (json_encode): Call encode_string_utf_8.
Diffstat (limited to 'src/coding.c')
| -rw-r--r-- | src/coding.c | 202 |
1 files changed, 122 insertions, 80 deletions
diff --git a/src/coding.c b/src/coding.c index 560ec0883ff..5f477cf9473 100644 --- a/src/coding.c +++ b/src/coding.c | |||
| @@ -6353,11 +6353,15 @@ utf8_string_p (Lisp_Object string) | |||
| 6353 | } | 6353 | } |
| 6354 | 6354 | ||
| 6355 | /* Like make_string, but always returns a multibyte Lisp string, and | 6355 | /* Like make_string, but always returns a multibyte Lisp string, and |
| 6356 | avoids decoding if TEXT encoded in UTF-8. */ | 6356 | avoids decoding if TEXT is encoded in UTF-8. */ |
| 6357 | |||
| 6358 | Lisp_Object | 6357 | Lisp_Object |
| 6359 | make_string_from_utf8 (const char *text, ptrdiff_t nbytes) | 6358 | make_string_from_utf8 (const char *text, ptrdiff_t nbytes) |
| 6360 | { | 6359 | { |
| 6360 | #if 0 | ||
| 6361 | /* This method is on average 2 times slower than if we use | ||
| 6362 | decode_string_utf_8. However, please leave the slower | ||
| 6363 | implementation in the code for now, in case it needs to be reused | ||
| 6364 | in some situations. */ | ||
| 6361 | ptrdiff_t chars, bytes; | 6365 | ptrdiff_t chars, bytes; |
| 6362 | parse_str_as_multibyte ((const unsigned char *) text, nbytes, | 6366 | parse_str_as_multibyte ((const unsigned char *) text, nbytes, |
| 6363 | &chars, &bytes); | 6367 | &chars, &bytes); |
| @@ -6374,6 +6378,9 @@ make_string_from_utf8 (const char *text, ptrdiff_t nbytes) | |||
| 6374 | decode_coding_object (&coding, Qnil, 0, 0, nbytes, nbytes, Qt); | 6378 | decode_coding_object (&coding, Qnil, 0, 0, nbytes, nbytes, Qt); |
| 6375 | return coding.dst_object; | 6379 | return coding.dst_object; |
| 6376 | } | 6380 | } |
| 6381 | #else | ||
| 6382 | return decode_string_utf_8 (Qnil, text, nbytes, Qnil, false, Qt, Qt); | ||
| 6383 | #endif | ||
| 6377 | } | 6384 | } |
| 6378 | 6385 | ||
| 6379 | /* Detect how end-of-line of a text of length SRC_BYTES pointed by | 6386 | /* Detect how end-of-line of a text of length SRC_BYTES pointed by |
| @@ -9537,7 +9544,7 @@ get_buffer_gap_address (Lisp_Object buffer, ptrdiff_t nbytes) | |||
| 9537 | return BUF_GPT_ADDR (buf); | 9544 | return BUF_GPT_ADDR (buf); |
| 9538 | } | 9545 | } |
| 9539 | 9546 | ||
| 9540 | /* Return a pointer to the byte sequence for C, and set the length in | 9547 | /* Return a pointer to the byte sequence for C, and its byte length in |
| 9541 | LEN. This function is used to get a byte sequence for HANDLE_8_BIT | 9548 | LEN. This function is used to get a byte sequence for HANDLE_8_BIT |
| 9542 | and HANDLE_OVER_UNI arguments of encode_string_utf_8 and | 9549 | and HANDLE_OVER_UNI arguments of encode_string_utf_8 and |
| 9543 | decode_string_utf_8 when those arguments are given by | 9550 | decode_string_utf_8 when those arguments are given by |
| @@ -9572,11 +9579,16 @@ get_char_bytes (int c, int *len) | |||
| 9572 | 9579 | ||
| 9573 | /* Encode STRING by the coding system utf-8-unix. | 9580 | /* Encode STRING by the coding system utf-8-unix. |
| 9574 | 9581 | ||
| 9582 | This function is optimized for speed when the input string is | ||
| 9583 | already a valid sequence of Unicode codepoints in the internal | ||
| 9584 | representation, i.e. there are neither 8-bit raw bytes nor | ||
| 9585 | characters beyond the Unicode range in the string's contents. | ||
| 9586 | |||
| 9575 | Ignore any :pre-write-conversion and :encode-translation-table | 9587 | Ignore any :pre-write-conversion and :encode-translation-table |
| 9576 | properties of that coding system. | 9588 | properties. |
| 9577 | 9589 | ||
| 9578 | Assume that arguments have values as described below. | 9590 | Assume that arguments have values as described below. |
| 9579 | The validity must be assured by callers. | 9591 | The validity must be enforced and ensured by the caller. |
| 9580 | 9592 | ||
| 9581 | STRING is a multibyte string or an ASCII-only unibyte string. | 9593 | STRING is a multibyte string or an ASCII-only unibyte string. |
| 9582 | 9594 | ||
| @@ -9587,17 +9599,24 @@ get_char_bytes (int c, int *len) | |||
| 9587 | inserted characters. The caller should have made BUFFER ready for | 9599 | inserted characters. The caller should have made BUFFER ready for |
| 9588 | modifying in advance (e.g., by calling invalidate_buffer_caches). | 9600 | modifying in advance (e.g., by calling invalidate_buffer_caches). |
| 9589 | 9601 | ||
| 9590 | If BUFFER is Qnil, return a unibyte string from the encoded result. | 9602 | If BUFFER is nil, return a unibyte string from the encoded result. |
| 9591 | If NOCOPY, and if STRING contains only Unicode characters (i.e., | 9603 | |
| 9592 | the encoding does not change the byte sequence), return STRING even | 9604 | If NOCOPY is non-zero, and if STRING contains only Unicode |
| 9593 | if it is multibyte. | 9605 | characters (i.e., the encoding does not change the byte sequence), |
| 9606 | return STRING even if it is multibyte. WARNING: This will return a | ||
| 9607 | _multibyte_ string, something that callers might not expect, especially | ||
| 9608 | if STRING is not pure-ASCII; only use NOCOPY non-zero if the caller | ||
| 9609 | will only use the byte sequence of the encoded result accessed by | ||
| 9610 | SDATA or SSDATA, and the original STRING will _not_ be modified after | ||
| 9611 | the encoding. When in doubt, always pass NOCOPY as zero. You _have_ | ||
| 9612 | been warned! | ||
| 9594 | 9613 | ||
| 9595 | HANDLE-8-BIT and HANDLE-OVER-UNI specify how to handle a non-Unicode | 9614 | HANDLE-8-BIT and HANDLE-OVER-UNI specify how to handle a non-Unicode |
| 9596 | character. The former is for an eight-bit character (represented | 9615 | character in STRING. The former is for an eight-bit character (represented |
| 9597 | by a 2-byte overlong sequence in a multibyte STRING). The latter is | 9616 | by a 2-byte overlong sequence in a multibyte STRING). The latter is |
| 9598 | for an over-Unicode character (a character whose code is greater | 9617 | for a codepoint beyond the end of the Unicode range (a character whose |
| 9599 | than the maximum Unicode character 0x10FFFF, represented by a 4 or | 9618 | code is greater than the maximum Unicode character 0x10FFFF, represented |
| 9600 | 5-byte sequence in a multibyte STRING). | 9619 | by a 4 or 5-byte sequence in a multibyte STRING). |
| 9601 | 9620 | ||
| 9602 | If these two arguments are unibyte strings (typically | 9621 | If these two arguments are unibyte strings (typically |
| 9603 | "\357\277\275", the UTF-8 sequence for the Unicode REPLACEMENT | 9622 | "\357\277\275", the UTF-8 sequence for the Unicode REPLACEMENT |
| @@ -9605,18 +9624,20 @@ get_char_bytes (int c, int *len) | |||
| 9605 | unibyte sequence. | 9624 | unibyte sequence. |
| 9606 | 9625 | ||
| 9607 | If the two arguments are characters, encode a non-Unicode | 9626 | If the two arguments are characters, encode a non-Unicode |
| 9608 | character as if it was the argument. | 9627 | character as the respective argument characters. |
| 9609 | 9628 | ||
| 9610 | If they are Qignored, skip a non-Unicode character. | 9629 | If they are Qignored, skip a non-Unicode character. |
| 9611 | 9630 | ||
| 9612 | If HANDLE-8-BIT is Qt, encode an eight-bit character into one | 9631 | If HANDLE-8-BIT is Qt, encode eight-bit characters into single bytes |
| 9613 | byte of the same value. | 9632 | of the same value, like the usual Emacs encoding does. |
| 9614 | 9633 | ||
| 9615 | If HANDLE-OVER-UNI is Qt, encode an over-unicode character | 9634 | If HANDLE-OVER-UNI is Qt, encode characters beyond the Unicode |
| 9616 | into the same 4 or 5-byte sequence. | 9635 | range into the same 4 or 5-byte sequence as used by Emacs |
| 9636 | internally, like the usual Emacs encoding does. | ||
| 9617 | 9637 | ||
| 9618 | If the two arguments are Qnil, return Qnil if STRING has a | 9638 | If the two arguments are Qnil, return Qnil if STRING has a |
| 9619 | non-Unicode character. */ | 9639 | non-Unicode character. This allows the caller to signal an error |
| 9640 | if such input strings are not allowed. */ | ||
| 9620 | 9641 | ||
| 9621 | Lisp_Object | 9642 | Lisp_Object |
| 9622 | encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer, | 9643 | encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer, |
| @@ -9624,15 +9645,15 @@ encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer, | |||
| 9624 | Lisp_Object handle_over_uni) | 9645 | Lisp_Object handle_over_uni) |
| 9625 | { | 9646 | { |
| 9626 | ptrdiff_t nchars = SCHARS (string), nbytes = SBYTES (string); | 9647 | ptrdiff_t nchars = SCHARS (string), nbytes = SBYTES (string); |
| 9627 | if (NILP (buffer) && nchars == nbytes) | 9648 | if (NILP (buffer) && nchars == nbytes && nocopy) |
| 9628 | /* STRING contains only ASCII characters. */ | 9649 | /* STRING contains only ASCII characters. */ |
| 9629 | return string; | 9650 | return string; |
| 9630 | 9651 | ||
| 9631 | ptrdiff_t num_8_bit = 0; /* number of eight-bit chars in STRING */ | 9652 | ptrdiff_t num_8_bit = 0; /* number of eight-bit chars in STRING */ |
| 9632 | /* The following two vars are counted only if handle_over_uni is not Qt. */ | 9653 | /* The following two vars are counted only if handle_over_uni is not Qt. */ |
| 9633 | ptrdiff_t num_over_4 = 0; /* number of 4-byte non-Unicode chars in STRING */ | 9654 | ptrdiff_t num_over_4 = 0; /* number of 4-byte non-Unicode chars in STRING */ |
| 9634 | ptrdiff_t num_over_5 = 0; /* number of 5-byte non-Unicode chars in STRING */ | 9655 | ptrdiff_t num_over_5 = 0; /* number of 5-byte non-Unicode chars in STRING */ |
| 9635 | ptrdiff_t outbytes; /* number of bytes of decoding result. */ | 9656 | ptrdiff_t outbytes; /* number of bytes of decoding result */ |
| 9636 | unsigned char *p = SDATA (string); | 9657 | unsigned char *p = SDATA (string); |
| 9637 | unsigned char *pend = p + nbytes; | 9658 | unsigned char *pend = p + nbytes; |
| 9638 | unsigned char *src = NULL, *dst = NULL; | 9659 | unsigned char *src = NULL, *dst = NULL; |
| @@ -9668,10 +9689,10 @@ encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer, | |||
| 9668 | } | 9689 | } |
| 9669 | 9690 | ||
| 9670 | /* A character to change the byte sequence on encoding was | 9691 | /* A character to change the byte sequence on encoding was |
| 9671 | found. A rare case. */ | 9692 | found. A rare case. */ |
| 9672 | if (len == 2) | 9693 | if (len == 2) |
| 9673 | { | 9694 | { |
| 9674 | /* Handle an eight-bit character by handle_8_bit. */ | 9695 | /* Handle an eight-bit character by handle_8_bit. */ |
| 9675 | if (scan_count == 0) | 9696 | if (scan_count == 0) |
| 9676 | { | 9697 | { |
| 9677 | if (NILP (handle_8_bit)) | 9698 | if (NILP (handle_8_bit)) |
| @@ -9699,7 +9720,7 @@ encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer, | |||
| 9699 | } | 9720 | } |
| 9700 | else /* len == 4 or 5 */ | 9721 | else /* len == 4 or 5 */ |
| 9701 | { | 9722 | { |
| 9702 | /* Handle an over-unicode character by handle_over_uni. */ | 9723 | /* Handle an over-unicode character by handle_over_uni. */ |
| 9703 | if (scan_count == 0) | 9724 | if (scan_count == 0) |
| 9704 | { | 9725 | { |
| 9705 | if (NILP (handle_over_uni)) | 9726 | if (NILP (handle_over_uni)) |
| @@ -9729,19 +9750,20 @@ encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer, | |||
| 9729 | 9750 | ||
| 9730 | if (scan_count == 0) | 9751 | if (scan_count == 0) |
| 9731 | { | 9752 | { |
| 9732 | /* End of the first scane */ | 9753 | /* End of the first scan. */ |
| 9733 | outbytes = nbytes; | 9754 | outbytes = nbytes; |
| 9734 | if (num_8_bit == 0 | 9755 | if (num_8_bit == 0 |
| 9735 | && (num_over_4 + num_over_5 == 0 || EQ (handle_over_uni, Qt))) | 9756 | && (num_over_4 + num_over_5 == 0 || EQ (handle_over_uni, Qt))) |
| 9736 | { | 9757 | { |
| 9737 | /* We can break the loop because there is no need of | 9758 | /* We can break the loop because there is no need of |
| 9738 | changing the byte sequence. This is the typical | 9759 | changing the byte sequence. This is the typical |
| 9739 | case. */ | 9760 | case. */ |
| 9740 | scan_count = 1; | 9761 | scan_count = 1; |
| 9741 | } | 9762 | } |
| 9742 | else | 9763 | else |
| 9743 | { | 9764 | { |
| 9744 | /* Prepare for the next scan to handle non-Unicode characters. */ | 9765 | /* Prepare for handling non-Unicode characters during |
| 9766 | the next scan. */ | ||
| 9745 | if (num_8_bit > 0) | 9767 | if (num_8_bit > 0) |
| 9746 | { | 9768 | { |
| 9747 | if (CHARACTERP (handle_8_bit)) | 9769 | if (CHARACTERP (handle_8_bit)) |
| @@ -9792,7 +9814,7 @@ encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer, | |||
| 9792 | } | 9814 | } |
| 9793 | } | 9815 | } |
| 9794 | 9816 | ||
| 9795 | /* Prepare a return value and a space to store the encoded bytes. */ | 9817 | /* Prepare return value and space to store the encoded bytes. */ |
| 9796 | if (BUFFERP (buffer)) | 9818 | if (BUFFERP (buffer)) |
| 9797 | { | 9819 | { |
| 9798 | val = make_fixnum (outbytes); | 9820 | val = make_fixnum (outbytes); |
| @@ -9822,38 +9844,51 @@ encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer, | |||
| 9822 | return val; | 9844 | return val; |
| 9823 | } | 9845 | } |
| 9824 | 9846 | ||
| 9825 | /* Decode STRING by the coding system utf-8-unix. | 9847 | /* Decode input string by the coding system utf-8-unix. |
| 9826 | 9848 | ||
| 9827 | Ignore any :pre-write-conversion and :encode-translation-table | 9849 | This function is optimized for speed when the input string is |
| 9828 | properties of that coding system. | 9850 | already a valid UTF-8 sequence, i.e. there are neither 8-bit raw |
| 9851 | bytes nor any UTF-8 sequences longer than 4 bytes in the string's | ||
| 9852 | contents. | ||
| 9829 | 9853 | ||
| 9830 | Assumes that arguments have values as described below. | 9854 | Ignore any :post-read-conversion and :decode-translation-table |
| 9831 | The validity must be assured by callers. | 9855 | properties. |
| 9832 | 9856 | ||
| 9833 | STRING is a unibyte string or an ASCII-only multibyte string. | 9857 | Assume that arguments have values as described below. |
| 9858 | The validity must be enforced and ensured by the caller. | ||
| 9834 | 9859 | ||
| 9835 | BUFFER is a multibyte buffer or Qnil. | 9860 | STRING is a unibyte string, an ASCII-only multibyte string, or Qnil. |
| 9861 | If STRING is Qnil, the input is a C string pointed by STR whose | ||
| 9862 | length in bytes is in STR_LEN. | ||
| 9836 | 9863 | ||
| 9864 | BUFFER is a multibyte buffer or Qnil. | ||
| 9837 | If BUFFER is a multibyte buffer, insert the decoding result of | 9865 | If BUFFER is a multibyte buffer, insert the decoding result of |
| 9838 | Unicode characters after point of the buffer, and return the number | 9866 | Unicode characters after point of the buffer, and return the number |
| 9839 | of inserted characters. The caller should have made BUFFER ready | 9867 | of inserted characters. The caller should have made BUFFER ready |
| 9840 | for modifying in advance (e.g., by calling invalidate_buffer_caches). | 9868 | for modifying in advance (e.g., by calling invalidate_buffer_caches). |
| 9841 | 9869 | ||
| 9842 | If BUFFER is Qnil, return a multibyte string from the decoded result. | 9870 | If BUFFER is Qnil, return a multibyte string from the decoded result. |
| 9843 | As a special case, return STRING itself in the following cases: | ||
| 9844 | 1. STRING contains only ASCII characters. | ||
| 9845 | 2. NOCOPY is true, and STRING contains only valid UTF-8 sequences. | ||
| 9846 | 9871 | ||
| 9847 | For maximum speed, always specify NOCOPY true when STRING is | 9872 | NOCOPY non-zero means it is OK to return the input STRING if it |
| 9848 | guaranteed to contain only valid UTF-8 sequences. | 9873 | contains only ASCII characters or only valid UTF-8 sequences of 2 |
| 9874 | to 4 bytes. WARNING: This will return a _unibyte_ string, something | ||
| 9875 | that callers might not expect, especially if STRING is not | ||
| 9876 | pure-ASCII; only use NOCOPY non-zero if the caller will only use | ||
| 9877 | the byte sequence of the decoded result accessed via SDATA or | ||
| 9878 | SSDATA, and if the original STRING will _not_ be modified after the | ||
| 9879 | decoding. When in doubt, always pass NOCOPY as zero. You _have_ | ||
| 9880 | been warned! | ||
| 9881 | |||
| 9882 | If STRING is Qnil, and the original string is passed via STR, NOCOPY | ||
| 9883 | is ignored. | ||
| 9849 | 9884 | ||
| 9850 | HANDLE-8-BIT and HANDLE-OVER-UNI specify how to handle an invalid | 9885 | HANDLE-8-BIT and HANDLE-OVER-UNI specify how to handle an invalid |
| 9851 | byte sequence. The former is for an 1-byte invalid sequence that | 9886 | byte sequence. The former is for a 1-byte invalid sequence that |
| 9852 | violates the fundamental UTF-8 encoding rule. The latter is for a | 9887 | violates the fundamental UTF-8 encoding rules. The latter is for a |
| 9853 | 4 or 5-byte invalid sequence that Emacs internally uses to | 9888 | 4 or 5-byte overlong sequence that Emacs internally uses to |
| 9854 | represent an over-unicode character (a character of code greater | 9889 | represent characters beyond the Unicode range (characters whose |
| 9855 | than #x10FFFF). Note that this function does not treat an overlong | 9890 | codepoints are greater than #x10FFFF). Note that this function does |
| 9856 | UTF-8 sequence as invalid. | 9891 | not in general treat such overlong UTF-8 sequences as invalid. |
| 9857 | 9892 | ||
| 9858 | If these two arguments are strings (typically a 1-char string of | 9893 | If these two arguments are strings (typically a 1-char string of |
| 9859 | the Unicode REPLACEMENT CHARACTER #xFFFD), decode an invalid byte | 9894 | the Unicode REPLACEMENT CHARACTER #xFFFD), decode an invalid byte |
| @@ -9862,24 +9897,28 @@ encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer, | |||
| 9862 | 9897 | ||
| 9863 | If the two arguments are characters, decode an invalid byte | 9898 | If the two arguments are characters, decode an invalid byte |
| 9864 | sequence into the corresponding multibyte representation of the | 9899 | sequence into the corresponding multibyte representation of the |
| 9865 | characters. | 9900 | respective character. |
| 9866 | 9901 | ||
| 9867 | If they are Qignored, skip an invalid byte sequence. | 9902 | If they are Qignored, skip an invalid byte sequence without |
| 9903 | producing anything in the decoded string. | ||
| 9868 | 9904 | ||
| 9869 | If HANDLE-8-BIT is Qt, decode a 1-byte invalid sequence into | 9905 | If HANDLE-8-BIT is Qt, decode a 1-byte invalid sequence into the |
| 9870 | the corresponding eight-bit character. | 9906 | corresponding eight-bit multibyte representation, like the usual |
| 9907 | Emacs decoding does. | ||
| 9871 | 9908 | ||
| 9872 | If HANDLE-OVER-UNI is Qt, decode a 4 or 5-byte invalid sequence | 9909 | If HANDLE-OVER-UNI is Qt, decode a 4 or 5-byte overlong sequence |
| 9873 | that follows Emacs' representation for an over-unicode character | 9910 | that follows Emacs' internal representation for a character beyond |
| 9874 | into the corresponding character. | 9911 | Unicode range into the corresponding character, like the usual |
| 9912 | Emacs decoding does. | ||
| 9875 | 9913 | ||
| 9876 | If the two arguments are Qnil, return Qnil if STRING has an invalid | 9914 | If the two arguments are Qnil, return Qnil if the input string has |
| 9877 | sequence. */ | 9915 | raw bytes or overlong sequences. This allows the caller to signal |
| 9916 | an error if such inputs are not allowed. */ | ||
| 9878 | 9917 | ||
| 9879 | Lisp_Object | 9918 | Lisp_Object |
| 9880 | decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer, | 9919 | decode_string_utf_8 (Lisp_Object string, const char *str, ptrdiff_t str_len, |
| 9881 | bool nocopy, Lisp_Object handle_8_bit, | 9920 | Lisp_Object buffer, bool nocopy, |
| 9882 | Lisp_Object handle_over_uni) | 9921 | Lisp_Object handle_8_bit, Lisp_Object handle_over_uni) |
| 9883 | { | 9922 | { |
| 9884 | /* This is like BYTES_BY_CHAR_HEAD, but it is assured that C >= 0x80 | 9923 | /* This is like BYTES_BY_CHAR_HEAD, but it is assured that C >= 0x80 |
| 9885 | and it returns 0 for an invalid sequence. */ | 9924 | and it returns 0 for an invalid sequence. */ |
| @@ -9891,24 +9930,26 @@ decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer, | |||
| 9891 | : (c) == 0xF8 ? 5 \ | 9930 | : (c) == 0xF8 ? 5 \ |
| 9892 | : 0) | 9931 | : 0) |
| 9893 | 9932 | ||
| 9894 | ptrdiff_t nbytes = SBYTES (string); | 9933 | ptrdiff_t nbytes = STRINGP (string) ? SBYTES (string) : str_len; |
| 9895 | unsigned char *p = SDATA (string), *pend = p + nbytes; | 9934 | unsigned char *p = STRINGP (string) ? SDATA (string) : (unsigned char *) str; |
| 9896 | ptrdiff_t num_8_bit = 0; /* number of invalid 1-byte sequences. */ | 9935 | unsigned char *str_orig = p; |
| 9897 | ptrdiff_t num_over_4 = 0; /* number of invalid 4-byte sequences. */ | 9936 | unsigned char *pend = p + nbytes; |
| 9898 | ptrdiff_t num_over_5 = 0; /* number of invalid 5-byte sequences. */ | 9937 | ptrdiff_t num_8_bit = 0; /* number of invalid 1-byte sequences */ |
| 9899 | ptrdiff_t outbytes = nbytes; /* number of decoded bytes. */ | 9938 | ptrdiff_t num_over_4 = 0; /* number of invalid 4-byte sequences */ |
| 9900 | ptrdiff_t outchars = 0; /* number of decoded characters. */ | 9939 | ptrdiff_t num_over_5 = 0; /* number of invalid 5-byte sequences */ |
| 9940 | ptrdiff_t outbytes = nbytes; /* number of decoded bytes */ | ||
| 9941 | ptrdiff_t outchars = 0; /* number of decoded characters */ | ||
| 9901 | unsigned char *src = NULL, *dst = NULL; | 9942 | unsigned char *src = NULL, *dst = NULL; |
| 9902 | bool change_byte_sequence = false; | 9943 | bool change_byte_sequence = false; |
| 9903 | 9944 | ||
| 9904 | /* Scan bytes in STRING twice. The first scan is to count invalid | 9945 | /* Scan input bytes twice. The first scan is to count invalid |
| 9905 | sequences, and the second scan is to decode STRING. If the | 9946 | sequences, and the second scan is to decode input. If the |
| 9906 | decoding is trivial (no need of changing the byte sequence), | 9947 | decoding is trivial (no need of changing the byte sequence), |
| 9907 | the second scan is avoided. */ | 9948 | the second scan is avoided. */ |
| 9908 | while (p < pend) | 9949 | while (p < pend) |
| 9909 | { | 9950 | { |
| 9910 | src = p; | 9951 | src = p; |
| 9911 | /* Try short cut for an ASCII-only case. */ | 9952 | /* Try short cut for an ASCII-only case. */ |
| 9912 | while (p < pend && *p < 0x80) p++; | 9953 | while (p < pend && *p < 0x80) p++; |
| 9913 | outchars += (p - src); | 9954 | outchars += (p - src); |
| 9914 | if (p == pend) | 9955 | if (p == pend) |
| @@ -9916,7 +9957,7 @@ decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer, | |||
| 9916 | int c = *p; | 9957 | int c = *p; |
| 9917 | outchars++; | 9958 | outchars++; |
| 9918 | int len = UTF_8_SEQUENCE_LENGTH (c); | 9959 | int len = UTF_8_SEQUENCE_LENGTH (c); |
| 9919 | /* len == 0, 2, 3, 4, 5 */ | 9960 | /* len == 0, 2, 3, 4, 5. */ |
| 9920 | if (UTF_8_EXTRA_OCTET_P (p[1]) | 9961 | if (UTF_8_EXTRA_OCTET_P (p[1]) |
| 9921 | && (len == 2 | 9962 | && (len == 2 |
| 9922 | || (UTF_8_EXTRA_OCTET_P (p[2]) | 9963 | || (UTF_8_EXTRA_OCTET_P (p[2]) |
| @@ -9930,7 +9971,7 @@ decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer, | |||
| 9930 | continue; | 9971 | continue; |
| 9931 | } | 9972 | } |
| 9932 | 9973 | ||
| 9933 | /* A sequence to change on decoding was found. A rare case. */ | 9974 | /* A sequence to change on decoding was found. A rare case. */ |
| 9934 | if (len == 0) | 9975 | if (len == 0) |
| 9935 | { | 9976 | { |
| 9936 | if (NILP (handle_8_bit)) | 9977 | if (NILP (handle_8_bit)) |
| @@ -9951,19 +9992,19 @@ decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer, | |||
| 9951 | p += len; | 9992 | p += len; |
| 9952 | } | 9993 | } |
| 9953 | 9994 | ||
| 9954 | Lisp_Object val; /* the return value. */ | 9995 | Lisp_Object val; /* the return value */ |
| 9955 | 9996 | ||
| 9956 | if (! change_byte_sequence | 9997 | if (! change_byte_sequence |
| 9957 | && NILP (buffer)) | 9998 | && NILP (buffer)) |
| 9958 | { | 9999 | { |
| 9959 | if (nocopy) | 10000 | if (nocopy && STRINGP (string)) |
| 9960 | return string; | 10001 | return string; |
| 9961 | val = make_uninit_multibyte_string (outchars, outbytes); | 10002 | val = make_uninit_multibyte_string (outchars, outbytes); |
| 9962 | memcpy (SDATA (val), SDATA (string), pend - SDATA (string)); | 10003 | memcpy (SDATA (val), str_orig, pend - str_orig); |
| 9963 | return val; | 10004 | return val; |
| 9964 | } | 10005 | } |
| 9965 | 10006 | ||
| 9966 | /* Count the number of resulting chars and bytes. */ | 10007 | /* Count the number of resulting chars and bytes. */ |
| 9967 | unsigned char *replace_8_bit = NULL, *replace_over_uni = NULL; | 10008 | unsigned char *replace_8_bit = NULL, *replace_over_uni = NULL; |
| 9968 | int replace_8_bit_len = 0, replace_over_uni_len = 0; | 10009 | int replace_8_bit_len = 0, replace_over_uni_len = 0; |
| 9969 | 10010 | ||
| @@ -10022,7 +10063,7 @@ decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer, | |||
| 10022 | } | 10063 | } |
| 10023 | } | 10064 | } |
| 10024 | 10065 | ||
| 10025 | /* Prepare a return value and a space to store the decoded bytes. */ | 10066 | /* Prepare return value and space to store the decoded bytes. */ |
| 10026 | if (BUFFERP (buffer)) | 10067 | if (BUFFERP (buffer)) |
| 10027 | { | 10068 | { |
| 10028 | val = make_fixnum (outchars); | 10069 | val = make_fixnum (outchars); |
| @@ -10030,19 +10071,20 @@ decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer, | |||
| 10030 | } | 10071 | } |
| 10031 | else | 10072 | else |
| 10032 | { | 10073 | { |
| 10033 | if (nocopy && (num_8_bit + num_over_4 + num_over_5) == 0) | 10074 | if (nocopy && (num_8_bit + num_over_4 + num_over_5) == 0 |
| 10075 | && STRINGP (string)) | ||
| 10034 | return string; | 10076 | return string; |
| 10035 | val = make_uninit_multibyte_string (outchars, outbytes); | 10077 | val = make_uninit_multibyte_string (outchars, outbytes); |
| 10036 | dst = SDATA (val); | 10078 | dst = SDATA (val); |
| 10037 | } | 10079 | } |
| 10038 | 10080 | ||
| 10039 | src = SDATA (string); | 10081 | src = str_orig; |
| 10040 | if (change_byte_sequence) | 10082 | if (change_byte_sequence) |
| 10041 | { | 10083 | { |
| 10042 | p = src; | 10084 | p = src; |
| 10043 | while (p < pend) | 10085 | while (p < pend) |
| 10044 | { | 10086 | { |
| 10045 | /* Try short cut for an ASCII-only case. */ | 10087 | /* Try short cut for an ASCII-only case. */ |
| 10046 | /* while (p < pend && *p < 0x80) p++; */ | 10088 | /* while (p < pend && *p < 0x80) p++; */ |
| 10047 | /* if (p == pend) */ | 10089 | /* if (p == pend) */ |
| 10048 | /* break; */ | 10090 | /* break; */ |
| @@ -10089,7 +10131,7 @@ decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer, | |||
| 10089 | } | 10131 | } |
| 10090 | else /* len == 4 or 5 */ | 10132 | else /* len == 4 or 5 */ |
| 10091 | { | 10133 | { |
| 10092 | /* Handle p[0]... by handle_over_uni */ | 10134 | /* Handle p[0]... by handle_over_uni. */ |
| 10093 | if (replace_over_uni) | 10135 | if (replace_over_uni) |
| 10094 | { | 10136 | { |
| 10095 | memcpy (dst, replace_over_uni, replace_over_uni_len); | 10137 | memcpy (dst, replace_over_uni, replace_over_uni_len); |