aboutsummaryrefslogtreecommitdiffstats
path: root/src/coding.c
diff options
context:
space:
mode:
authorK. Handa2019-08-04 21:14:26 +0900
committerK. Handa2019-08-04 21:14:26 +0900
commita8026dfde9734a03ad03a9872ec801871dd1d81a (patch)
tree92039f7268c2824470411cca944c7f638e645c15 /src/coding.c
parent5ec3f70527e330abf4c0c3519fa4914c5f094358 (diff)
downloademacs-a8026dfde9734a03ad03a9872ec801871dd1d81a.tar.gz
emacs-a8026dfde9734a03ad03a9872ec801871dd1d81a.zip
Add Unicode-safe UTF-8 converter
* src/coding.c (encode_string_utf_8, decode_string_utf_8): New functions. * src/coding.h (encode_string_utf_8, decode_string_utf_8): Extern them.
Diffstat (limited to 'src/coding.c')
-rw-r--r--src/coding.c730
1 files changed, 730 insertions, 0 deletions
diff --git a/src/coding.c b/src/coding.c
index 189a4b39d15..ab0e15119f3 100644
--- a/src/coding.c
+++ b/src/coding.c
@@ -9515,6 +9515,732 @@ code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9515 return code_convert_string (string, coding_system, Qt, encodep, 0, 1); 9515 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9516} 9516}
9517 9517
9518
9519/* Return the gap address of BUFFER. If the gap size is less than
9520 NBYTES, enlarge the gap in advance. */
9521
9522static unsigned char *
9523get_buffer_gap_address (Lisp_Object buffer, int nbytes)
9524{
9525 struct buffer *buf = XBUFFER (buffer);
9526
9527 if (BUF_GPT (buf) != BUF_PT (buf))
9528 {
9529 struct buffer *oldb = current_buffer;
9530
9531 current_buffer = buf;
9532 move_gap_both (PT, PT_BYTE);
9533 current_buffer = oldb;
9534 }
9535 if (BUF_GAP_SIZE (buf) < nbytes)
9536 make_gap_1 (buf, nbytes);
9537 return BUF_GPT_ADDR (buf);
9538}
9539
/* Return a pointer to the byte sequence for character C, and store
   the length of that sequence in *LEN.  This function is used to get
   a byte sequence for the HANDLE_8_BIT and HANDLE_OVER_UNI arguments
   of encode_string_utf_8 and decode_string_utf_8 when those
   arguments are given as characters.

   The returned pointer refers to static storage owned by this
   function; it is overwritten once two other distinct characters
   have been requested.  */

static unsigned char *
get_char_bytes (int c, int *len)
{
  /* We use two caches, considering the situation that
     encode/decode_string_utf_8 are called repeatedly with the same
     values for the HANDLE_8_BIT and HANDLE_OVER_UNI arguments.  */
  static int chars[2];
  static unsigned char bytes[2][6];
  /* Length of the cached sequence in each slot.  Zero means the slot
     has never been filled: the statics start zeroed, so without this
     check a request for character 0 would hit the empty slot and
     yield a zero-length sequence.  (Assumes CHAR_STRING always
     returns a positive length -- confirm.)  */
  static int nbytes[2];
  static int last_index;

  int other_index = 1 - last_index;

  if (nbytes[last_index] > 0 && chars[last_index] == c)
    {
      *len = nbytes[last_index];
      return bytes[last_index];
    }
  if (nbytes[other_index] > 0 && chars[other_index] == c)
    {
      *len = nbytes[other_index];
      return bytes[other_index];
    }

  /* Cache miss: overwrite the slot that was not filled most
     recently, and remember it as the most recent one.  */
  last_index = other_index;
  chars[last_index] = c;
  *len = nbytes[last_index] = CHAR_STRING (c, bytes[last_index]);
  return bytes[last_index];
}
9572
9573/* Encode STRING by the coding system utf-8-unix.
9574
9575 Even if :pre-write-conversion and :encode-translation-table
9576 properties are put to that coding system, they are ignored.
9577
9578 It ignores :pre-write-conversion and :encode-translation-table
9579 propeties of that coding system.
9580
9581 This function assumes that arguments have values as described
9582 below. The validity must be assured by callers.
9583
9584 STRING is a multibyte string or an ASCII-only unibyte string.
9585
9586 BUFFER is a unibyte buffer or Qnil.
9587
9588 If BUFFER is a unibyte buffer, the encoding result of UTF-8
9589 sequence is inserted after point of the buffer, and the number of
9590 inserted characters is returned. Note that a caller should have
9591 made BUFFER ready for modifying in advance (e.g. by calling
9592 invalidate_buffer_caches).
9593
9594 If BUFFER is Qnil, a unibyte string is made from the encodnig
9595 result of UTF-8 sequence, and it is returned. If NOCOPY and STRING
9596 contains only Unicode characters (i.e. the encoding does not change
9597 the byte sequence), STRING is returned even if it is multibyte.
9598
9599 HANDLE-8-BIT and HANDE-OVER-UNI specify how to handle a non-Unicode
9600 character. The former is for an eight-bit character (represented
9601 by 2-byte overlong sequence in multibyte STRING). The latter is
9602 for an over-unicode character (a character whose code is greater
9603 than the maximum Unicode character 0x10FFFF, and is represented by
9604 4 or 5-byte sequence in multibyte STRING).
9605
9606 If they are unibyte strings (typically "\357\277\275"; UTF-8
9607 sequence for the Unicode REPLACEMENT CHARACTER #xFFFD), a
9608 non-Unicode character is encoded into that sequence.
9609
9610 If they are characters, a non-Unicode chracters is encoded into the
9611 corresponding UTF-8 sequences.
9612
9613 If they are Qignored, a non-Unicode character is skipped on
9614 encoding.
9615
9616 If HANDLE-8-BIT is Qt, an eight-bit character is encoded into one
9617 byte of the same value.
9618
9619 If HANDLE-OVER-UNI is Qt, an over-unicode character is encoded
9620 into the the same 4 or 5-byte sequence.
9621
9622 If they are Qnil, Qnil is returned if STRING has a non-Unicode
9623 character. */
9624
9625Lisp_Object
9626encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
9627 bool nocopy, Lisp_Object handle_8_bit,
9628 Lisp_Object handle_over_uni)
9629{
9630 ptrdiff_t nchars = SCHARS (string), nbytes = SBYTES (string);
9631 if (NILP (buffer) && nchars == nbytes)
9632 /* STRING contains only ASCII characters. */
9633 return string;
9634
9635 ptrdiff_t num_8_bit = 0; /* number of eight-bit chars in STRING */
9636 /* The following two vars are counted only if handle_over_uni is not Qt */
9637 ptrdiff_t num_over_4 = 0; /* number of 4-byte non-Unicode chars in STRING */
9638 ptrdiff_t num_over_5 = 0; /* number of 5-byte non-Unicode chars in STRING */
9639 ptrdiff_t outbytes; /* number of bytes of decoding result. */
9640 unsigned char *p = SDATA (string);
9641 unsigned char *pend = p + nbytes;
9642 unsigned char *src = NULL, *dst = NULL;
9643 unsigned char *replace_8_bit = NULL, *replace_over_uni = NULL;
9644 int replace_8_bit_len = 0, replace_over_uni_len = 0;
9645 Lisp_Object val; /* the return value */
9646
9647 /* Scan bytes in STRING twice. The first scan is to count non-Unicode
9648 characters, and the second scan is to encode STRING. If the
9649 encoding is trivial (no need of changing the byte sequence),
9650 the second scan is avoided. */
9651 for (int scan_count = 0; scan_count < 2; scan_count++)
9652 {
9653 while (p < pend)
9654 {
9655 if (nchars == pend - p)
9656 /* There is no multibyte character remaining. */
9657 break;
9658
9659 int c = *p;
9660 int len = BYTES_BY_CHAR_HEAD (c);
9661
9662 nchars--;
9663 if (len == 1
9664 || len == 3
9665 || (len == 2 ? ! CHAR_BYTE8_HEAD_P (c)
9666 : (EQ (handle_over_uni, Qt)
9667 || (len == 4
9668 && string_char (p, NULL, NULL) <= MAX_UNICODE_CHAR))))
9669 {
9670 p += len;
9671 continue;
9672 }
9673
9674 /* A character to change the byte sequence on encoding was
9675 found. A rare case. */
9676 if (len == 2)
9677 {
9678 /* Handle an eight-bit character by handle_8_bit. */
9679 if (scan_count == 0)
9680 {
9681 if (NILP (handle_8_bit))
9682 return Qnil;
9683 num_8_bit++;
9684 }
9685 else
9686 {
9687 if (src < p)
9688 {
9689 memcpy (dst, src, p - src);
9690 dst += p - src;
9691 }
9692 if (replace_8_bit_len > 0)
9693 {
9694 memcpy (dst, replace_8_bit, replace_8_bit_len);
9695 dst += replace_8_bit_len;
9696 }
9697 else if (EQ (handle_8_bit, Qt))
9698 {
9699 int char8 = STRING_CHAR (p);
9700 *dst++ = CHAR_TO_BYTE8 (char8);
9701 }
9702 }
9703 }
9704 else /* len == 4 or 5 */
9705 {
9706 /* Handle an over-unicode character by handle_over_uni. */
9707 if (scan_count == 0)
9708 {
9709 if (NILP (handle_over_uni))
9710 return Qnil;
9711 if (len == 4)
9712 num_over_4++;
9713 else
9714 num_over_5++;
9715 }
9716 else
9717 {
9718 if (src < p)
9719 {
9720 memcpy (dst, src, p - src);
9721 dst += p - src;
9722 }
9723 if (replace_over_uni_len > 0)
9724 {
9725 memcpy (dst, replace_over_uni, replace_over_uni_len);
9726 dst += replace_over_uni_len;
9727 }
9728 }
9729 }
9730 p += len;
9731 src = p;
9732 }
9733
9734 if (scan_count == 0)
9735 {
9736 /* End of the first scane */
9737 outbytes = nbytes;
9738 if (num_8_bit == 0
9739 && (num_over_4 + num_over_5 == 0 || EQ (handle_over_uni, Qt)))
9740 {
9741 /* We can break the loop because there is no need of
9742 changing the byte sequence. This is the typical
9743 case. */
9744 scan_count = 1;
9745 }
9746 else
9747 {
9748 /* Prepare for the next scan to handle non-Unicode characters. */
9749 if (num_8_bit > 0)
9750 {
9751 if (CHARACTERP (handle_8_bit))
9752 replace_8_bit = get_char_bytes (XFIXNUM (handle_8_bit),
9753 &replace_8_bit_len);
9754 else if (STRINGP (handle_8_bit))
9755 {
9756 replace_8_bit = SDATA (handle_8_bit);
9757 replace_8_bit_len = SBYTES (handle_8_bit);
9758 }
9759 if (replace_8_bit)
9760 outbytes += (replace_8_bit_len - 2) * num_8_bit;
9761 else if (EQ (handle_8_bit, Qignored))
9762 outbytes -= 2 * num_8_bit;
9763 else if (EQ (handle_8_bit, Qt))
9764 outbytes -= num_8_bit;
9765 else
9766 return Qnil;
9767 }
9768 if (num_over_4 + num_over_5 > 0)
9769 {
9770 if (CHARACTERP (handle_over_uni))
9771 replace_over_uni = get_char_bytes (XFIXNUM (handle_over_uni),
9772 &replace_over_uni_len);
9773 else if (STRINGP (handle_over_uni))
9774 {
9775 replace_over_uni = SDATA (handle_over_uni);
9776 replace_over_uni_len = SBYTES (handle_over_uni);
9777 }
9778 if (num_over_4 > 0)
9779 {
9780 if (replace_over_uni)
9781 outbytes += (replace_over_uni_len - 4) * num_over_4;
9782 else if (EQ (handle_over_uni, Qignored))
9783 outbytes -= 4 * num_over_4;
9784 else if (! EQ (handle_over_uni, Qt))
9785 return Qnil;
9786 }
9787 if (num_over_5 > 0)
9788 {
9789 if (replace_over_uni)
9790 outbytes += (replace_over_uni_len - 5) * num_over_5;
9791 else if (EQ (handle_over_uni, Qignored))
9792 outbytes -= 5 * num_over_5;
9793 else if (! EQ (handle_over_uni, Qt))
9794 return Qnil;
9795 }
9796 }
9797 }
9798
9799 /* Prepare a return value and a space to store the encoded bytes. */
9800 if (BUFFERP (buffer))
9801 {
9802 val = make_fixnum (outbytes);
9803 dst = get_buffer_gap_address (buffer, nbytes);
9804 }
9805 else
9806 {
9807 if (nocopy && (num_8_bit + num_over_4 + num_over_5) == 0)
9808 return string;
9809 val = make_uninit_string (outbytes);
9810 dst = SDATA (val);
9811 }
9812 p = src = SDATA (string);
9813 }
9814 }
9815
9816 if (src < pend)
9817 memcpy (dst, src, pend - src);
9818 if (BUFFERP (buffer))
9819 {
9820 struct buffer *oldb = current_buffer;
9821
9822 current_buffer = XBUFFER (buffer);
9823 insert_from_gap (outbytes, outbytes, false);
9824 current_buffer = oldb;
9825 }
9826 return val;
9827}
9828
9829/* Decode STRING by the coding system utf-8-unix.
9830
9831 Even if :post-read-conversion and :decode-translation-table
9832 properties are put to that coding system, they are ignored.
9833
9834 This function assumes that arguments have values as described
9835 below. The validity must be assured by callers.
9836
9837 STRING is a unibyte string or an ASCII-only multibyte string.
9838
9839 BUFFER is a multibyte buffer or Qnil.
9840
9841 If BUFFER is a multibyte buffer, the decoding result of Unicode
9842 characters are inserted after point of the buffer, and the number
9843 of inserted characters is returned. Note that a caller should have
9844 made BUFFER ready for modifying in advance (e.g. by calling
9845 invalidate_buffer_caches).
9846
9847 If BUFFER is Qnil, a multibyte string is made from the decoding
9848 result of Unicode characters, and it is returned. As a special
9849 case, STRING itself is returned in the following cases:
9850 1. STRING contains only ASCII characters.
9851 2. NOCOPY, and STRING contains only valid UTF-8 sequences.
9852
9853 HANDLE-8-BIT and HANDLE-OVER-UNI specify how to handle a invalid
9854 byte sequence. The former is for an 1-byte invalid sequence that
9855 violates the fundamental UTF-8 encoding rule. The latter is for a
9856 4 or 5-byte invalid sequence that Emacs internally uses to
9857 represent an over-unicode character (a character of code greater
9858 than #x10FFFF). Note that this function does not treat an overlong
9859 UTF-8 sequence as invalid.
9860
9861 If they are strings (typically 1-char string of the Unicode
9862 REPLACEMENT CHARACTER #xFFFD), an invalid sequence is decoded into
9863 that string. They must be multibyte strings if they contain a
9864 non-ASCII character.
9865
9866 If they are characters, an invalid sequence is decoded into the
9867 corresponding multibyte representation of the characters.
9868
9869 If they are Qignored, an invalid sequence is skipped on decoding.
9870
9871 If HANDLE-8-BIT is Qt, an 1-byte invalid sequence is deoded into
9872 the corresponding eight-bit character.
9873
9874 If HANDLE-OVER-UNI is Qt, a 4 or 5-byte invalid sequence that
9875 follows Emacs' representation for an over-unicode character is
9876 decoded into the corresponding character.
9877
9878 If they are Qnil, Qnil is returned if STRING has an invalid sequence. */
9879
9880Lisp_Object
9881decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
9882 bool nocopy, Lisp_Object handle_8_bit,
9883 Lisp_Object handle_over_uni)
9884{
9885 /* This is like BYTES_BY_CHAR_HEAD, but it is assured that C >= 0x80
9886 and it returns 0 for invalid sequence. */
9887#define UTF_8_SEQUENCE_LENGTH(c) \
9888 ((c) < 0xC2 ? 0 \
9889 : (c) < 0xE0 ? 2 \
9890 : (c) < 0xF0 ? 3 \
9891 : (c) < 0xF8 ? 4 \
9892 : (c) == 0xF8 ? 5 \
9893 : 0)
9894
9895 ptrdiff_t nbytes = SBYTES (string);
9896 unsigned char *p = SDATA (string), *pend = p + nbytes;
9897 ptrdiff_t num_8_bit = 0; /* number of invalid 1-byte sequences. */
9898 ptrdiff_t num_over_4 = 0; /* number of invalid 4-byte sequences. */
9899 ptrdiff_t num_over_5 = 0; /* number of invalid 5-byte sequences. */
9900 ptrdiff_t outbytes = nbytes; /* number of decoded bytes. */
9901 ptrdiff_t outchars = 0; /* number of decoded characters. */
9902 unsigned char *src = NULL, *dst = NULL;
9903 bool change_byte_sequence = false;
9904
9905 /* Scan bytes in STRING twice. The first scan is to count invalid
9906 sequences, and the second scan is to decode STRING. If the
9907 decoding is trivial (no need of changing the byte sequence),
9908 the second scan is avoided. */
9909 while (p < pend)
9910 {
9911 src = p;
9912 /* Try short cut for an ASCII-only case. */
9913 while (p < pend && *p < 0x80) p++;
9914 outchars += (p - src);
9915 if (p == pend)
9916 break;
9917 int c = *p;
9918 outchars++;
9919 int len = UTF_8_SEQUENCE_LENGTH (c);
9920 /* len == 0, 2, 3, 4, 5 */
9921 if (UTF_8_EXTRA_OCTET_P (p[1])
9922 && (len == 2
9923 || (UTF_8_EXTRA_OCTET_P (p[2])
9924 && (len == 3
9925 || (UTF_8_EXTRA_OCTET_P (p[3])
9926 && len == 4
9927 && string_char (p, NULL, NULL) <= MAX_UNICODE_CHAR)))))
9928 {
9929 p += len;
9930 continue;
9931 }
9932
9933 /* A sequence to change on decoding was found. A rare case. */
9934 if (len == 0)
9935 {
9936 if (NILP (handle_8_bit))
9937 return Qnil;
9938 num_8_bit++;
9939 len = 1;
9940 }
9941 else /* len == 4 or 5 */
9942 {
9943 if (NILP (handle_over_uni))
9944 return Qnil;
9945 if (len == 4)
9946 num_over_4++;
9947 else
9948 num_over_5++;
9949 }
9950 change_byte_sequence = true;
9951 p += len;
9952 }
9953
9954 Lisp_Object val; /* the return value. */
9955
9956 if (! change_byte_sequence
9957 && NILP (buffer))
9958 {
9959 if (nocopy)
9960 return string;
9961 val = make_uninit_multibyte_string (outchars, outbytes);
9962 memcpy (SDATA (val), SDATA (string), pend - SDATA (string));
9963 return val;
9964 }
9965
9966 /* Count the number of resulting chars and bytes. */
9967 unsigned char *replace_8_bit = NULL, *replace_over_uni = NULL;
9968 int replace_8_bit_len = 0, replace_over_uni_len = 0;
9969
9970 if (change_byte_sequence)
9971 {
9972 if (num_8_bit > 0)
9973 {
9974 if (CHARACTERP (handle_8_bit))
9975 replace_8_bit = get_char_bytes (XFIXNUM (handle_8_bit),
9976 &replace_8_bit_len);
9977 else if (STRINGP (handle_8_bit))
9978 {
9979 replace_8_bit = SDATA (handle_8_bit);
9980 replace_8_bit_len = SBYTES (handle_8_bit);
9981 }
9982 if (replace_8_bit)
9983 outbytes += (replace_8_bit_len - 1) * num_8_bit;
9984 else if (EQ (handle_8_bit, Qignored))
9985 {
9986 outbytes -= num_8_bit;
9987 outchars -= num_8_bit;
9988 }
9989 else /* EQ (handle_8_bit, Qt)) */
9990 outbytes += num_8_bit;
9991 }
9992 else if (num_over_4 + num_over_5 > 0)
9993 {
9994 if (CHARACTERP (handle_over_uni))
9995 replace_over_uni = get_char_bytes (XFIXNUM (handle_over_uni),
9996 &replace_over_uni_len);
9997 else if (STRINGP (handle_over_uni))
9998 {
9999 replace_over_uni = SDATA (handle_over_uni);
10000 replace_over_uni_len = SBYTES (handle_over_uni);
10001 }
10002 if (num_over_4 > 0)
10003 {
10004 if (replace_over_uni)
10005 outbytes += (replace_over_uni_len - 4) * num_over_4;
10006 else if (EQ (handle_over_uni, Qignored))
10007 {
10008 outbytes -= 4 * num_over_4;
10009 outchars -= num_over_4;
10010 }
10011 }
10012 if (num_over_5 > 0)
10013 {
10014 if (replace_over_uni)
10015 outbytes += (replace_over_uni_len - 5) * num_over_5;
10016 else if (EQ (handle_over_uni, Qignored))
10017 {
10018 outbytes -= 5 * num_over_5;
10019 outchars -= num_over_5;
10020 }
10021 }
10022 }
10023 }
10024
10025 /* Prepare a return value and a space to store the decoded bytes. */
10026 if (BUFFERP (buffer))
10027 {
10028 val = make_fixnum (outchars);
10029 dst = get_buffer_gap_address (buffer, outbytes);
10030 }
10031 else
10032 {
10033 if (nocopy && (num_8_bit + num_over_4 + num_over_5) == 0)
10034 return string;
10035 val = make_uninit_multibyte_string (outchars, outbytes);
10036 dst = SDATA (val);
10037 }
10038
10039 src = SDATA (string);
10040 if (change_byte_sequence)
10041 {
10042 p = src;
10043 while (p < pend)
10044 {
10045 /* Try short cut for an ASCII-only case. */
10046 /* while (p < pend && *p < 0x80) p++; */
10047 /* if (p == pend) */
10048 /* break; */
10049 int c = *p;
10050 if (c < 0x80)
10051 {
10052 p++;
10053 continue;
10054 }
10055 int len = UTF_8_SEQUENCE_LENGTH (c);
10056 if (len > 1)
10057 {
10058 int mlen;
10059 for (mlen = 1; mlen < len && UTF_8_EXTRA_OCTET_P (p[mlen]);
10060 mlen++);
10061 if (mlen == len
10062 && (len <= 3
10063 || (len == 4
10064 && string_char (p, NULL, NULL) <= MAX_UNICODE_CHAR)
10065 || EQ (handle_over_uni, Qt)))
10066 {
10067 p += len;
10068 continue;
10069 }
10070 }
10071
10072 if (src < p)
10073 {
10074 memcpy (dst, src, p - src);
10075 dst += p - src;
10076 }
10077 if (len == 0)
10078 {
10079 if (replace_8_bit)
10080 {
10081 memcpy (dst, replace_8_bit, replace_8_bit_len);
10082 dst += replace_8_bit_len;
10083 }
10084 else if (EQ (handle_8_bit, Qt))
10085 {
10086 dst += BYTE8_STRING (c, dst);
10087 }
10088 len = 1;
10089 }
10090 else /* len == 4 or 5 */
10091 {
10092 /* Handle p[0]... by handle_over_uni */
10093 if (replace_over_uni)
10094 {
10095 memcpy (dst, replace_over_uni, replace_over_uni_len);
10096 dst += replace_over_uni_len;
10097 }
10098 }
10099 p += len;
10100 src = p;
10101 }
10102 }
10103
10104 if (src < pend)
10105 memcpy (dst, src, pend - src);
10106 if (BUFFERP (buffer))
10107 {
10108 struct buffer *oldb = current_buffer;
10109
10110 current_buffer = XBUFFER (buffer);
10111 insert_from_gap (outchars, outbytes, false);
10112 current_buffer = oldb;
10113 }
10114 return val;
10115}
10116
10117/* #define ENABLE_UTF_8_CONVERTER_TEST */
10118
10119#ifdef ENABLE_UTF_8_CONVERTER_TEST
10120
10121/* These functions are useful for testing and benchmarking
10122 encode_string_utf_8 and decode_string_utf_8. */
10123
10124/* ENCODE_METHOD specifies which internal decoder to use.
10125 If it is Qnil, use encode_string_utf_8.
10126 Otherwise, use code_convert_string.
10127
10128 COUNT, if integer, specifies how many times to call those functions
10129 with the same arguments (for benchmarking). */
10130
10131DEFUN ("internal-encode-string-utf-8", Finternal_encode_string_utf_8,
10132 Sinternal_encode_string_utf_8, 7, 7, 0,
10133 doc: /* Internal use only.*/)
10134 (Lisp_Object string, Lisp_Object buffer, Lisp_Object nocopy,
10135 Lisp_Object handle_8_bit, Lisp_Object handle_over_uni,
10136 Lisp_Object encode_method, Lisp_Object count)
10137{
10138 int repeat_count;
10139 Lisp_Object val;
10140
10141 /* Check arguments. Return Qnil when an argmement is invalid. */
10142 if (! STRINGP (string))
10143 return Qnil;
10144 if (! NILP (buffer)
10145 && (! BUFFERP (buffer)
10146 || ! NILP (BVAR (XBUFFER (buffer), enable_multibyte_characters))))
10147 return Qnil;
10148 if (! NILP (handle_8_bit) && ! EQ (handle_8_bit, Qt)
10149 && ! EQ (handle_8_bit, Qignored)
10150 && ! CHARACTERP (handle_8_bit)
10151 && (! STRINGP (handle_8_bit) || STRING_MULTIBYTE (handle_8_bit)))
10152 return Qnil;
10153 if (! NILP (handle_over_uni) && ! EQ (handle_over_uni, Qt)
10154 && ! EQ (handle_over_uni, Qignored)
10155 && ! CHARACTERP (handle_over_uni)
10156 && (! STRINGP (handle_over_uni) || STRING_MULTIBYTE (handle_over_uni)))
10157 return Qnil;
10158
10159 CHECK_FIXNUM (count);
10160 repeat_count = XFIXNUM (count);
10161
10162 val = Qnil;
10163 /* Run an encoder according to ENCODE_METHOD. */
10164 if (NILP (encode_method))
10165 {
10166 for (int i = 0; i < repeat_count; i++)
10167 val = encode_string_utf_8 (string, buffer, ! NILP (nocopy),
10168 handle_8_bit, handle_over_uni);
10169 }
10170 else
10171 {
10172 for (int i = 0; i < repeat_count; i++)
10173 val = code_convert_string (string, Qutf_8_unix, Qnil, true,
10174 ! NILP (nocopy), true);
10175 }
10176 return val;
10177}
10178
10179/* DECODE_METHOD specifies which internal decoder to use.
10180 If it is Qnil, use decode_string_utf_8.
10181 If it is Qt, use code_convert_string.
10182 Otherwise, use make_string_from_utf8.
10183
10184 COUNT, if integer, specifies how many times to call those functions
10185 with the same arguments (for benchmarking). */
10186
10187DEFUN ("internal-decode-string-utf-8", Finternal_decode_string_utf_8,
10188 Sinternal_decode_string_utf_8, 7, 7, 0,
10189 doc: /* Internal use only.*/)
10190 (Lisp_Object string, Lisp_Object buffer, Lisp_Object nocopy,
10191 Lisp_Object handle_8_bit, Lisp_Object handle_over_uni,
10192 Lisp_Object decode_method, Lisp_Object count)
10193{
10194 int repeat_count;
10195 Lisp_Object val;
10196
10197 /* Check arguments. Return Qnil when an argmement is invalid. */
10198 if (! STRINGP (string))
10199 return Qnil;
10200 if (! NILP (buffer)
10201 && (! BUFFERP (buffer)
10202 || NILP (BVAR (XBUFFER (buffer), enable_multibyte_characters))))
10203 return Qnil;
10204 if (! NILP (handle_8_bit) && ! EQ (handle_8_bit, Qt)
10205 && ! EQ (handle_8_bit, Qignored)
10206 && ! CHARACTERP (handle_8_bit)
10207 && (! STRINGP (handle_8_bit) || ! STRING_MULTIBYTE (handle_8_bit)))
10208 return Qnil;
10209 if (! NILP (handle_over_uni) && ! EQ (handle_over_uni, Qt)
10210 && ! EQ (handle_over_uni, Qignored)
10211 && ! CHARACTERP (handle_over_uni)
10212 && (! STRINGP (handle_over_uni) || ! STRING_MULTIBYTE (handle_over_uni)))
10213 return Qnil;
10214
10215 CHECK_FIXNUM (count);
10216 repeat_count = XFIXNUM (count);
10217
10218 val = Qnil;
10219 /* Run a decoder according to DECODE_METHOD. */
10220 if (NILP (decode_method))
10221 {
10222 for (int i = 0; i < repeat_count; i++)
10223 val = decode_string_utf_8 (string, buffer, ! NILP (nocopy),
10224 handle_8_bit, handle_over_uni);
10225 }
10226 else if (EQ (decode_method, Qt))
10227 {
10228 if (! BUFFERP (buffer))
10229 buffer = Qt;
10230 for (int i = 0; i < repeat_count; i++)
10231 val = code_convert_string (string, Qutf_8_unix, buffer, false,
10232 ! NILP (nocopy), true);
10233 }
10234 else if (! NILP (decode_method))
10235 {
10236 for (int i = 0; i < repeat_count; i++)
10237 val = make_string_from_utf8 ((char *) SDATA (string), SBYTES (string));
10238 }
10239 return val;
10240}
10241
10242#endif /* ENABLE_UTF_8_CONVERTER_TEST */
10243
9518/* Encode or decode a file name, to or from a unibyte string suitable 10244/* Encode or decode a file name, to or from a unibyte string suitable
9519 for passing to C library functions. */ 10245 for passing to C library functions. */
9520Lisp_Object 10246Lisp_Object
@@ -10974,6 +11700,10 @@ syms_of_coding (void)
10974 defsubr (&Sencode_coding_region); 11700 defsubr (&Sencode_coding_region);
10975 defsubr (&Sdecode_coding_string); 11701 defsubr (&Sdecode_coding_string);
10976 defsubr (&Sencode_coding_string); 11702 defsubr (&Sencode_coding_string);
11703#ifdef ENABLE_UTF_8_CONVERTER_TEST
11704 defsubr (&Sinternal_encode_string_utf_8);
11705 defsubr (&Sinternal_decode_string_utf_8);
11706#endif /* ENABLE_UTF_8_CONVERTER_TEST */
10977 defsubr (&Sdecode_sjis_char); 11707 defsubr (&Sdecode_sjis_char);
10978 defsubr (&Sencode_sjis_char); 11708 defsubr (&Sencode_sjis_char);
10979 defsubr (&Sdecode_big5_char); 11709 defsubr (&Sdecode_big5_char);