diff options
| author | K. Handa | 2019-08-04 21:14:26 +0900 |
|---|---|---|
| committer | K. Handa | 2019-08-04 21:14:26 +0900 |
| commit | a8026dfde9734a03ad03a9872ec801871dd1d81a (patch) | |
| tree | 92039f7268c2824470411cca944c7f638e645c15 /src/coding.c | |
| parent | 5ec3f70527e330abf4c0c3519fa4914c5f094358 (diff) | |
| download | emacs-a8026dfde9734a03ad03a9872ec801871dd1d81a.tar.gz emacs-a8026dfde9734a03ad03a9872ec801871dd1d81a.zip | |
Add Unicode-safe UTF-8 converter
* src/coding.c (encode_string_utf_8, decode_string_utf_8): New functions.
* src/coding.h (encode_string_utf_8, decode_string_utf_8): Extern them.
Diffstat (limited to 'src/coding.c')
| -rw-r--r-- | src/coding.c | 730 |
1 files changed, 730 insertions, 0 deletions
diff --git a/src/coding.c b/src/coding.c index 189a4b39d15..ab0e15119f3 100644 --- a/src/coding.c +++ b/src/coding.c | |||
| @@ -9515,6 +9515,732 @@ code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system, | |||
| 9515 | return code_convert_string (string, coding_system, Qt, encodep, 0, 1); | 9515 | return code_convert_string (string, coding_system, Qt, encodep, 0, 1); |
| 9516 | } | 9516 | } |
| 9517 | 9517 | ||
| 9518 | |||
| 9519 | /* Return the gap address of BUFFER, after moving the gap to BUFFER's point if it is elsewhere. If the gap size is less than | ||
| 9520 | NBYTES, enlarge the gap in advance. NBYTES is a ptrdiff_t because callers pass byte counts of that type. */ | ||
| 9521 | |||
| 9522 | static unsigned char * | ||
| 9523 | get_buffer_gap_address (Lisp_Object buffer, ptrdiff_t nbytes) | ||
| 9524 | { | ||
| 9525 | struct buffer *buf = XBUFFER (buffer); | ||
| 9526 | |||
| 9527 | if (BUF_GPT (buf) != BUF_PT (buf)) | ||
| 9528 | { | ||
| 9529 | struct buffer *oldb = current_buffer; | ||
| 9530 | |||
| 9531 | current_buffer = buf; | ||
| 9532 | move_gap_both (PT, PT_BYTE); | ||
| 9533 | current_buffer = oldb; | ||
| 9534 | } | ||
| 9535 | if (BUF_GAP_SIZE (buf) < nbytes) | ||
| 9536 | make_gap_1 (buf, nbytes); | ||
| 9537 | return BUF_GPT_ADDR (buf); | ||
| 9538 | } | ||
| 9539 | |||
| 9540 | /* Return a pointer to the byte sequence for C, and set the length in | ||
| 9541 | LEN. This function is used to get a byte sequence for HANDLE_8_BIT | ||
| 9542 | and HANDLE_OVER_UNI arguments of encode_string_utf_8 and | ||
| 9543 | decode_string_utf_8 when those arguments are given by | ||
| 9544 | characters. The returned pointer refers to a static cache slot that a later call with another character may overwrite. */ | ||
| 9545 | |||
| 9546 | static unsigned char * | ||
| 9547 | get_char_bytes (int c, int *len) | ||
| 9548 | { | ||
| 9549 | /* We use two caches considering the situation that | ||
| 9550 | encode/decode_string_utf_8 are called repeatedly with the same | ||
| 9551 | values for HANDLE_8_BIT and HANDLE_OVER_UNI arguments. The slots start at -1 so that a first call with C == 0 is not mistaken for a cache hit of length 0. */ | ||
| 9552 | static int chars[2] = { -1, -1 }; | ||
| 9553 | static unsigned char bytes[2][6]; | ||
| 9554 | static int nbytes[2]; | ||
| 9555 | static int last_index; | ||
| 9556 | |||
| 9557 | if (chars[last_index] == c) | ||
| 9558 | { | ||
| 9559 | *len = nbytes[last_index]; | ||
| 9560 | return bytes[last_index]; | ||
| 9561 | } | ||
| 9562 | if (chars[1 - last_index] == c) | ||
| 9563 | { | ||
| 9564 | *len = nbytes[1 - last_index]; | ||
| 9565 | return bytes[1 - last_index]; | ||
| 9566 | } | ||
| 9567 | last_index = 1 - last_index; | ||
| 9568 | chars[last_index] = c; | ||
| 9569 | *len = nbytes[last_index] = CHAR_STRING (c, bytes[last_index]); | ||
| 9570 | return bytes[last_index]; | ||
| 9571 | } | ||
| 9572 | |||
| 9573 | /* Encode STRING by the coding system utf-8-unix. | ||
| 9574 | |||
| 9575 | Even if :pre-write-conversion and :encode-translation-table | ||
| 9576 | properties are put to that coding system, they are ignored. | ||
| 9577 | |||
| 9578 | It ignores :pre-write-conversion and :encode-translation-table | ||
| 9579 | properties of that coding system. | ||
| 9580 | |||
| 9581 | This function assumes that arguments have values as described | ||
| 9582 | below. The validity must be assured by callers. | ||
| 9583 | |||
| 9584 | STRING is a multibyte string or an ASCII-only unibyte string. | ||
| 9585 | |||
| 9586 | BUFFER is a unibyte buffer or Qnil. | ||
| 9587 | |||
| 9588 | If BUFFER is a unibyte buffer, the encoding result of UTF-8 | ||
| 9589 | sequence is inserted after point of the buffer, and the number of | ||
| 9590 | inserted characters is returned. Note that a caller should have | ||
| 9591 | made BUFFER ready for modifying in advance (e.g. by calling | ||
| 9592 | invalidate_buffer_caches). | ||
| 9593 | |||
| 9594 | If BUFFER is Qnil, a unibyte string is made from the encoding | ||
| 9595 | result of UTF-8 sequence, and it is returned. If NOCOPY and STRING | ||
| 9596 | contains only Unicode characters (i.e. the encoding does not change | ||
| 9597 | the byte sequence), STRING is returned even if it is multibyte. | ||
| 9598 | |||
| 9599 | HANDLE-8-BIT and HANDLE-OVER-UNI specify how to handle a non-Unicode | ||
| 9600 | character. The former is for an eight-bit character (represented | ||
| 9601 | by 2-byte overlong sequence in multibyte STRING). The latter is | ||
| 9602 | for an over-unicode character (a character whose code is greater | ||
| 9603 | than the maximum Unicode character 0x10FFFF, and is represented by | ||
| 9604 | 4 or 5-byte sequence in multibyte STRING). | ||
| 9605 | |||
| 9606 | If they are unibyte strings (typically "\357\277\275"; UTF-8 | ||
| 9607 | sequence for the Unicode REPLACEMENT CHARACTER #xFFFD), a | ||
| 9608 | non-Unicode character is encoded into that sequence. | ||
| 9609 | |||
| 9610 | If they are characters, a non-Unicode character is encoded into the | ||
| 9611 | UTF-8 sequence of that character. | ||
| 9612 | |||
| 9613 | If they are Qignored, a non-Unicode character is skipped on | ||
| 9614 | encoding. | ||
| 9615 | |||
| 9616 | If HANDLE-8-BIT is Qt, an eight-bit character is encoded into one | ||
| 9617 | byte of the same value. | ||
| 9618 | |||
| 9619 | If HANDLE-OVER-UNI is Qt, an over-unicode character is encoded | ||
| 9620 | into the same 4 or 5-byte sequence. | ||
| 9621 | |||
| 9622 | If they are Qnil, Qnil is returned if STRING has a non-Unicode | ||
| 9623 | character. */ | ||
| 9624 | |||
| 9625 | Lisp_Object | ||
| 9626 | encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer, | ||
| 9627 | bool nocopy, Lisp_Object handle_8_bit, | ||
| 9628 | Lisp_Object handle_over_uni) | ||
| 9629 | { | ||
| 9630 | ptrdiff_t nchars = SCHARS (string), nbytes = SBYTES (string); | ||
| 9631 | if (NILP (buffer) && nchars == nbytes) | ||
| 9632 | /* STRING contains only ASCII characters. */ | ||
| 9633 | return string; | ||
| 9634 | |||
| 9635 | ptrdiff_t num_8_bit = 0; /* number of eight-bit chars in STRING */ | ||
| 9636 | /* The following two vars are counted only if handle_over_uni is not Qt */ | ||
| 9637 | ptrdiff_t num_over_4 = 0; /* number of 4-byte non-Unicode chars in STRING */ | ||
| 9638 | ptrdiff_t num_over_5 = 0; /* number of 5-byte non-Unicode chars in STRING */ | ||
| 9639 | ptrdiff_t outbytes; /* number of bytes of encoding result. */ | ||
| 9640 | unsigned char *p = SDATA (string); | ||
| 9641 | unsigned char *pend = p + nbytes; | ||
| 9642 | unsigned char *src = NULL, *dst = NULL; | ||
| 9643 | unsigned char *replace_8_bit = NULL, *replace_over_uni = NULL; | ||
| 9644 | int replace_8_bit_len = 0, replace_over_uni_len = 0; | ||
| 9645 | Lisp_Object val; /* the return value */ | ||
| 9646 | |||
| 9647 | /* Scan bytes in STRING twice. The first scan is to count non-Unicode | ||
| 9648 | characters, and the second scan is to encode STRING. If the | ||
| 9649 | encoding is trivial (no need of changing the byte sequence), | ||
| 9650 | the second scan is avoided. */ | ||
| 9651 | for (int scan_count = 0; scan_count < 2; scan_count++) | ||
| 9652 | { | ||
| 9653 | while (p < pend) | ||
| 9654 | { | ||
| 9655 | if (nchars == pend - p) | ||
| 9656 | /* There is no multibyte character remaining. */ | ||
| 9657 | break; | ||
| 9658 | |||
| 9659 | int c = *p; | ||
| 9660 | int len = BYTES_BY_CHAR_HEAD (c); | ||
| 9661 | |||
| 9662 | nchars--; | ||
| 9663 | if (len == 1 | ||
| 9664 | || len == 3 | ||
| 9665 | || (len == 2 ? ! CHAR_BYTE8_HEAD_P (c) | ||
| 9666 | : (EQ (handle_over_uni, Qt) | ||
| 9667 | || (len == 4 | ||
| 9668 | && string_char (p, NULL, NULL) <= MAX_UNICODE_CHAR)))) | ||
| 9669 | { | ||
| 9670 | p += len; | ||
| 9671 | continue; | ||
| 9672 | } | ||
| 9673 | |||
| 9674 | /* A character to change the byte sequence on encoding was | ||
| 9675 | found. A rare case. */ | ||
| 9676 | if (len == 2) | ||
| 9677 | { | ||
| 9678 | /* Handle an eight-bit character by handle_8_bit. */ | ||
| 9679 | if (scan_count == 0) | ||
| 9680 | { | ||
| 9681 | if (NILP (handle_8_bit)) | ||
| 9682 | return Qnil; | ||
| 9683 | num_8_bit++; | ||
| 9684 | } | ||
| 9685 | else | ||
| 9686 | { | ||
| 9687 | if (src < p) | ||
| 9688 | { | ||
| 9689 | memcpy (dst, src, p - src); | ||
| 9690 | dst += p - src; | ||
| 9691 | } | ||
| 9692 | if (replace_8_bit_len > 0) | ||
| 9693 | { | ||
| 9694 | memcpy (dst, replace_8_bit, replace_8_bit_len); | ||
| 9695 | dst += replace_8_bit_len; | ||
| 9696 | } | ||
| 9697 | else if (EQ (handle_8_bit, Qt)) | ||
| 9698 | { | ||
| 9699 | int char8 = STRING_CHAR (p); | ||
| 9700 | *dst++ = CHAR_TO_BYTE8 (char8); | ||
| 9701 | } | ||
| 9702 | } | ||
| 9703 | } | ||
| 9704 | else /* len == 4 or 5 */ | ||
| 9705 | { | ||
| 9706 | /* Handle an over-unicode character by handle_over_uni. */ | ||
| 9707 | if (scan_count == 0) | ||
| 9708 | { | ||
| 9709 | if (NILP (handle_over_uni)) | ||
| 9710 | return Qnil; | ||
| 9711 | if (len == 4) | ||
| 9712 | num_over_4++; | ||
| 9713 | else | ||
| 9714 | num_over_5++; | ||
| 9715 | } | ||
| 9716 | else | ||
| 9717 | { | ||
| 9718 | if (src < p) | ||
| 9719 | { | ||
| 9720 | memcpy (dst, src, p - src); | ||
| 9721 | dst += p - src; | ||
| 9722 | } | ||
| 9723 | if (replace_over_uni_len > 0) | ||
| 9724 | { | ||
| 9725 | memcpy (dst, replace_over_uni, replace_over_uni_len); | ||
| 9726 | dst += replace_over_uni_len; | ||
| 9727 | } | ||
| 9728 | } | ||
| 9729 | } | ||
| 9730 | p += len; | ||
| 9731 | src = p; | ||
| 9732 | } | ||
| 9733 | |||
| 9734 | if (scan_count == 0) | ||
| 9735 | { | ||
| 9736 | /* End of the first scan */ | ||
| 9737 | outbytes = nbytes; | ||
| 9738 | if (num_8_bit == 0 | ||
| 9739 | && (num_over_4 + num_over_5 == 0 || EQ (handle_over_uni, Qt))) | ||
| 9740 | { | ||
| 9741 | /* We can break the loop because there is no need of | ||
| 9742 | changing the byte sequence. This is the typical | ||
| 9743 | case. */ | ||
| 9744 | scan_count = 1; | ||
| 9745 | } | ||
| 9746 | else | ||
| 9747 | { | ||
| 9748 | /* Prepare for the next scan to handle non-Unicode characters. */ | ||
| 9749 | if (num_8_bit > 0) | ||
| 9750 | { | ||
| 9751 | if (CHARACTERP (handle_8_bit)) | ||
| 9752 | replace_8_bit = get_char_bytes (XFIXNUM (handle_8_bit), | ||
| 9753 | &replace_8_bit_len); | ||
| 9754 | else if (STRINGP (handle_8_bit)) | ||
| 9755 | { | ||
| 9756 | replace_8_bit = SDATA (handle_8_bit); | ||
| 9757 | replace_8_bit_len = SBYTES (handle_8_bit); | ||
| 9758 | } | ||
| 9759 | if (replace_8_bit) | ||
| 9760 | outbytes += (replace_8_bit_len - 2) * num_8_bit; | ||
| 9761 | else if (EQ (handle_8_bit, Qignored)) | ||
| 9762 | outbytes -= 2 * num_8_bit; | ||
| 9763 | else if (EQ (handle_8_bit, Qt)) | ||
| 9764 | outbytes -= num_8_bit; | ||
| 9765 | else | ||
| 9766 | return Qnil; | ||
| 9767 | } | ||
| 9768 | if (num_over_4 + num_over_5 > 0) | ||
| 9769 | { | ||
| 9770 | if (CHARACTERP (handle_over_uni)) | ||
| 9771 | replace_over_uni = get_char_bytes (XFIXNUM (handle_over_uni), | ||
| 9772 | &replace_over_uni_len); | ||
| 9773 | else if (STRINGP (handle_over_uni)) | ||
| 9774 | { | ||
| 9775 | replace_over_uni = SDATA (handle_over_uni); | ||
| 9776 | replace_over_uni_len = SBYTES (handle_over_uni); | ||
| 9777 | } | ||
| 9778 | if (num_over_4 > 0) | ||
| 9779 | { | ||
| 9780 | if (replace_over_uni) | ||
| 9781 | outbytes += (replace_over_uni_len - 4) * num_over_4; | ||
| 9782 | else if (EQ (handle_over_uni, Qignored)) | ||
| 9783 | outbytes -= 4 * num_over_4; | ||
| 9784 | else if (! EQ (handle_over_uni, Qt)) | ||
| 9785 | return Qnil; | ||
| 9786 | } | ||
| 9787 | if (num_over_5 > 0) | ||
| 9788 | { | ||
| 9789 | if (replace_over_uni) | ||
| 9790 | outbytes += (replace_over_uni_len - 5) * num_over_5; | ||
| 9791 | else if (EQ (handle_over_uni, Qignored)) | ||
| 9792 | outbytes -= 5 * num_over_5; | ||
| 9793 | else if (! EQ (handle_over_uni, Qt)) | ||
| 9794 | return Qnil; | ||
| 9795 | } | ||
| 9796 | } | ||
| 9797 | } | ||
| 9798 | |||
| 9799 | /* Prepare a return value and a space to store the encoded bytes. The gap must hold OUTBYTES, which exceeds NBYTES when a replacement is longer than the sequence it replaces. */ | ||
| 9800 | if (BUFFERP (buffer)) | ||
| 9801 | { | ||
| 9802 | val = make_fixnum (outbytes); | ||
| 9803 | dst = get_buffer_gap_address (buffer, outbytes); | ||
| 9804 | } | ||
| 9805 | else | ||
| 9806 | { | ||
| 9807 | if (nocopy && (num_8_bit + num_over_4 + num_over_5) == 0) | ||
| 9808 | return string; | ||
| 9809 | val = make_uninit_string (outbytes); | ||
| 9810 | dst = SDATA (val); | ||
| 9811 | } | ||
| 9812 | p = src = SDATA (string); | ||
| 9813 | } | ||
| 9814 | } | ||
| 9815 | |||
| 9816 | if (src < pend) | ||
| 9817 | memcpy (dst, src, pend - src); | ||
| 9818 | if (BUFFERP (buffer)) | ||
| 9819 | { | ||
| 9820 | struct buffer *oldb = current_buffer; | ||
| 9821 | |||
| 9822 | current_buffer = XBUFFER (buffer); | ||
| 9823 | insert_from_gap (outbytes, outbytes, false); | ||
| 9824 | current_buffer = oldb; | ||
| 9825 | } | ||
| 9826 | return val; | ||
| 9827 | } | ||
| 9828 | |||
| 9829 | /* Decode STRING by the coding system utf-8-unix. | ||
| 9830 | |||
| 9831 | Even if :post-read-conversion and :decode-translation-table | ||
| 9832 | properties are put to that coding system, they are ignored. | ||
| 9833 | |||
| 9834 | This function assumes that arguments have values as described | ||
| 9835 | below. The validity must be assured by callers. | ||
| 9836 | |||
| 9837 | STRING is a unibyte string or an ASCII-only multibyte string. | ||
| 9838 | |||
| 9839 | BUFFER is a multibyte buffer or Qnil. | ||
| 9840 | |||
| 9841 | If BUFFER is a multibyte buffer, the decoding result of Unicode | ||
| 9842 | characters are inserted after point of the buffer, and the number | ||
| 9843 | of inserted characters is returned. Note that a caller should have | ||
| 9844 | made BUFFER ready for modifying in advance (e.g. by calling | ||
| 9845 | invalidate_buffer_caches). | ||
| 9846 | |||
| 9847 | If BUFFER is Qnil, a multibyte string is made from the decoding | ||
| 9848 | result of Unicode characters, and it is returned. As a special | ||
| 9849 | case, STRING itself is returned in the following cases: | ||
| 9850 | 1. STRING contains only ASCII characters. | ||
| 9851 | 2. NOCOPY, and STRING contains only valid UTF-8 sequences. | ||
| 9852 | |||
| 9853 | HANDLE-8-BIT and HANDLE-OVER-UNI specify how to handle an invalid | ||
| 9854 | byte sequence. The former is for a 1-byte invalid sequence that | ||
| 9855 | violates the fundamental UTF-8 encoding rule. The latter is for a | ||
| 9856 | 4 or 5-byte invalid sequence that Emacs internally uses to | ||
| 9857 | represent an over-unicode character (a character of code greater | ||
| 9858 | than #x10FFFF). Note that this function does not treat an overlong | ||
| 9859 | UTF-8 sequence as invalid. | ||
| 9860 | |||
| 9861 | If they are strings (typically 1-char string of the Unicode | ||
| 9862 | REPLACEMENT CHARACTER #xFFFD), an invalid sequence is decoded into | ||
| 9863 | that string. They must be multibyte strings if they contain a | ||
| 9864 | non-ASCII character. | ||
| 9865 | |||
| 9866 | If they are characters, an invalid sequence is decoded into the | ||
| 9867 | corresponding multibyte representation of the characters. | ||
| 9868 | |||
| 9869 | If they are Qignored, an invalid sequence is skipped on decoding. | ||
| 9870 | |||
| 9871 | If HANDLE-8-BIT is Qt, a 1-byte invalid sequence is decoded into | ||
| 9872 | the corresponding eight-bit character. | ||
| 9873 | |||
| 9874 | If HANDLE-OVER-UNI is Qt, a 4 or 5-byte invalid sequence that | ||
| 9875 | follows Emacs' representation for an over-unicode character is | ||
| 9876 | decoded into the corresponding character. | ||
| 9877 | |||
| 9878 | If they are Qnil, Qnil is returned if STRING has an invalid sequence. */ | ||
| 9879 | |||
| 9880 | Lisp_Object | ||
| 9881 | decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer, | ||
| 9882 | bool nocopy, Lisp_Object handle_8_bit, | ||
| 9883 | Lisp_Object handle_over_uni) | ||
| 9884 | { | ||
| 9885 | /* This is like BYTES_BY_CHAR_HEAD, but it is assured that C >= 0x80 | ||
| 9886 | and it returns 0 for invalid sequence. */ | ||
| 9887 | #define UTF_8_SEQUENCE_LENGTH(c) \ | ||
| 9888 | ((c) < 0xC2 ? 0 \ | ||
| 9889 | : (c) < 0xE0 ? 2 \ | ||
| 9890 | : (c) < 0xF0 ? 3 \ | ||
| 9891 | : (c) < 0xF8 ? 4 \ | ||
| 9892 | : (c) == 0xF8 ? 5 \ | ||
| 9893 | : 0) | ||
| 9894 | |||
| 9895 | ptrdiff_t nbytes = SBYTES (string); | ||
| 9896 | unsigned char *p = SDATA (string), *pend = p + nbytes; | ||
| 9897 | ptrdiff_t num_8_bit = 0; /* number of invalid 1-byte sequences. */ | ||
| 9898 | ptrdiff_t num_over_4 = 0; /* number of invalid 4-byte sequences. */ | ||
| 9899 | ptrdiff_t num_over_5 = 0; /* number of invalid 5-byte sequences. */ | ||
| 9900 | ptrdiff_t outbytes = nbytes; /* number of decoded bytes. */ | ||
| 9901 | ptrdiff_t outchars = 0; /* number of decoded characters. */ | ||
| 9902 | unsigned char *src = NULL, *dst = NULL; | ||
| 9903 | bool change_byte_sequence = false; | ||
| 9904 | |||
| 9905 | /* Scan bytes in STRING twice. The first scan is to count invalid | ||
| 9906 | sequences, and the second scan is to decode STRING. If the | ||
| 9907 | decoding is trivial (no need of changing the byte sequence), | ||
| 9908 | the second scan is avoided. */ | ||
| 9909 | while (p < pend) | ||
| 9910 | { | ||
| 9911 | src = p; | ||
| 9912 | /* Try short cut for an ASCII-only case. */ | ||
| 9913 | while (p < pend && *p < 0x80) p++; | ||
| 9914 | outchars += (p - src); | ||
| 9915 | if (p == pend) | ||
| 9916 | break; | ||
| 9917 | int c = *p; | ||
| 9918 | outchars++; | ||
| 9919 | int len = UTF_8_SEQUENCE_LENGTH (c); | ||
| 9920 | /* len == 0, 2, 3, 4, 5 */ | ||
| 9921 | if (UTF_8_EXTRA_OCTET_P (p[1]) | ||
| 9922 | && (len == 2 | ||
| 9923 | || (UTF_8_EXTRA_OCTET_P (p[2]) | ||
| 9924 | && (len == 3 | ||
| 9925 | || (UTF_8_EXTRA_OCTET_P (p[3]) | ||
| 9926 | && len == 4 | ||
| 9927 | && string_char (p, NULL, NULL) <= MAX_UNICODE_CHAR))))) | ||
| 9928 | { | ||
| 9929 | p += len; | ||
| 9930 | continue; | ||
| 9931 | } | ||
| 9932 | |||
| 9933 | /* A sequence to change on decoding was found. A rare case. */ | ||
| 9934 | if (len == 0) | ||
| 9935 | { | ||
| 9936 | if (NILP (handle_8_bit)) | ||
| 9937 | return Qnil; | ||
| 9938 | num_8_bit++; | ||
| 9939 | len = 1; | ||
| 9940 | } | ||
| 9941 | else /* len == 4 or 5. NOTE(review): len is 2 or 3 here when a 2- or 3-byte head lacks its trailing bytes; such input is counted as a 5-byte sequence -- confirm and fix. */ | ||
| 9942 | { | ||
| 9943 | if (NILP (handle_over_uni)) | ||
| 9944 | return Qnil; | ||
| 9945 | if (len == 4) | ||
| 9946 | num_over_4++; | ||
| 9947 | else | ||
| 9948 | num_over_5++; | ||
| 9949 | } | ||
| 9950 | change_byte_sequence = true; | ||
| 9951 | p += len; | ||
| 9952 | } | ||
| 9953 | |||
| 9954 | Lisp_Object val; /* the return value. */ | ||
| 9955 | |||
| 9956 | if (! change_byte_sequence | ||
| 9957 | && NILP (buffer)) | ||
| 9958 | { | ||
| 9959 | if (nocopy) | ||
| 9960 | return string; | ||
| 9961 | val = make_uninit_multibyte_string (outchars, outbytes); | ||
| 9962 | memcpy (SDATA (val), SDATA (string), pend - SDATA (string)); | ||
| 9963 | return val; | ||
| 9964 | } | ||
| 9965 | |||
| 9966 | /* Count the number of resulting chars and bytes. The 8-bit and over-unicode adjustments are made independently because STRING may contain both kinds of invalid sequence. */ | ||
| 9967 | unsigned char *replace_8_bit = NULL, *replace_over_uni = NULL; | ||
| 9968 | int replace_8_bit_len = 0, replace_over_uni_len = 0; | ||
| 9969 | |||
| 9970 | if (change_byte_sequence) | ||
| 9971 | { | ||
| 9972 | if (num_8_bit > 0) | ||
| 9973 | { | ||
| 9974 | if (CHARACTERP (handle_8_bit)) | ||
| 9975 | replace_8_bit = get_char_bytes (XFIXNUM (handle_8_bit), | ||
| 9976 | &replace_8_bit_len); | ||
| 9977 | else if (STRINGP (handle_8_bit)) | ||
| 9978 | { | ||
| 9979 | replace_8_bit = SDATA (handle_8_bit); | ||
| 9980 | replace_8_bit_len = SBYTES (handle_8_bit); | ||
| 9981 | } | ||
| 9982 | if (replace_8_bit) | ||
| 9983 | outbytes += (replace_8_bit_len - 1) * num_8_bit; | ||
| 9984 | else if (EQ (handle_8_bit, Qignored)) | ||
| 9985 | { | ||
| 9986 | outbytes -= num_8_bit; | ||
| 9987 | outchars -= num_8_bit; | ||
| 9988 | } | ||
| 9989 | else /* EQ (handle_8_bit, Qt)) */ | ||
| 9990 | outbytes += num_8_bit; | ||
| 9991 | } | ||
| 9992 | if (num_over_4 + num_over_5 > 0) | ||
| 9993 | { | ||
| 9994 | if (CHARACTERP (handle_over_uni)) | ||
| 9995 | replace_over_uni = get_char_bytes (XFIXNUM (handle_over_uni), | ||
| 9996 | &replace_over_uni_len); | ||
| 9997 | else if (STRINGP (handle_over_uni)) | ||
| 9998 | { | ||
| 9999 | replace_over_uni = SDATA (handle_over_uni); | ||
| 10000 | replace_over_uni_len = SBYTES (handle_over_uni); | ||
| 10001 | } | ||
| 10002 | if (num_over_4 > 0) | ||
| 10003 | { | ||
| 10004 | if (replace_over_uni) | ||
| 10005 | outbytes += (replace_over_uni_len - 4) * num_over_4; | ||
| 10006 | else if (EQ (handle_over_uni, Qignored)) | ||
| 10007 | { | ||
| 10008 | outbytes -= 4 * num_over_4; | ||
| 10009 | outchars -= num_over_4; | ||
| 10010 | } | ||
| 10011 | } | ||
| 10012 | if (num_over_5 > 0) | ||
| 10013 | { | ||
| 10014 | if (replace_over_uni) | ||
| 10015 | outbytes += (replace_over_uni_len - 5) * num_over_5; | ||
| 10016 | else if (EQ (handle_over_uni, Qignored)) | ||
| 10017 | { | ||
| 10018 | outbytes -= 5 * num_over_5; | ||
| 10019 | outchars -= num_over_5; | ||
| 10020 | } | ||
| 10021 | } | ||
| 10022 | } | ||
| 10023 | } | ||
| 10024 | |||
| 10025 | /* Prepare a return value and a space to store the decoded bytes. */ | ||
| 10026 | if (BUFFERP (buffer)) | ||
| 10027 | { | ||
| 10028 | val = make_fixnum (outchars); | ||
| 10029 | dst = get_buffer_gap_address (buffer, outbytes); | ||
| 10030 | } | ||
| 10031 | else | ||
| 10032 | { | ||
| 10033 | if (nocopy && (num_8_bit + num_over_4 + num_over_5) == 0) | ||
| 10034 | return string; | ||
| 10035 | val = make_uninit_multibyte_string (outchars, outbytes); | ||
| 10036 | dst = SDATA (val); | ||
| 10037 | } | ||
| 10038 | |||
| 10039 | src = SDATA (string); | ||
| 10040 | if (change_byte_sequence) | ||
| 10041 | { | ||
| 10042 | p = src; | ||
| 10043 | while (p < pend) | ||
| 10044 | { | ||
| 10045 | /* Try short cut for an ASCII-only case. */ | ||
| 10046 | /* while (p < pend && *p < 0x80) p++; */ | ||
| 10047 | /* if (p == pend) */ | ||
| 10048 | /* break; */ | ||
| 10049 | int c = *p; | ||
| 10050 | if (c < 0x80) | ||
| 10051 | { | ||
| 10052 | p++; | ||
| 10053 | continue; | ||
| 10054 | } | ||
| 10055 | int len = UTF_8_SEQUENCE_LENGTH (c); | ||
| 10056 | if (len > 1) | ||
| 10057 | { | ||
| 10058 | int mlen; | ||
| 10059 | for (mlen = 1; mlen < len && UTF_8_EXTRA_OCTET_P (p[mlen]); | ||
| 10060 | mlen++); | ||
| 10061 | if (mlen == len | ||
| 10062 | && (len <= 3 | ||
| 10063 | || (len == 4 | ||
| 10064 | && string_char (p, NULL, NULL) <= MAX_UNICODE_CHAR) | ||
| 10065 | || EQ (handle_over_uni, Qt))) | ||
| 10066 | { | ||
| 10067 | p += len; | ||
| 10068 | continue; | ||
| 10069 | } | ||
| 10070 | } | ||
| 10071 | |||
| 10072 | if (src < p) | ||
| 10073 | { | ||
| 10074 | memcpy (dst, src, p - src); | ||
| 10075 | dst += p - src; | ||
| 10076 | } | ||
| 10077 | if (len == 0) | ||
| 10078 | { | ||
| 10079 | if (replace_8_bit) | ||
| 10080 | { | ||
| 10081 | memcpy (dst, replace_8_bit, replace_8_bit_len); | ||
| 10082 | dst += replace_8_bit_len; | ||
| 10083 | } | ||
| 10084 | else if (EQ (handle_8_bit, Qt)) | ||
| 10085 | { | ||
| 10086 | dst += BYTE8_STRING (c, dst); | ||
| 10087 | } | ||
| 10088 | len = 1; | ||
| 10089 | } | ||
| 10090 | else /* len == 4 or 5. NOTE(review): also 2 or 3 when a 2- or 3-byte head lacks its trailing bytes, as in the first scan. */ | ||
| 10091 | { | ||
| 10092 | /* Handle p[0]... by handle_over_uni */ | ||
| 10093 | if (replace_over_uni) | ||
| 10094 | { | ||
| 10095 | memcpy (dst, replace_over_uni, replace_over_uni_len); | ||
| 10096 | dst += replace_over_uni_len; | ||
| 10097 | } | ||
| 10098 | } | ||
| 10099 | p += len; | ||
| 10100 | src = p; | ||
| 10101 | } | ||
| 10102 | } | ||
| 10103 | |||
| 10104 | if (src < pend) | ||
| 10105 | memcpy (dst, src, pend - src); | ||
| 10106 | if (BUFFERP (buffer)) | ||
| 10107 | { | ||
| 10108 | struct buffer *oldb = current_buffer; | ||
| 10109 | |||
| 10110 | current_buffer = XBUFFER (buffer); | ||
| 10111 | insert_from_gap (outchars, outbytes, false); | ||
| 10112 | current_buffer = oldb; | ||
| 10113 | } | ||
| 10114 | return val; | ||
| 10115 | } | ||
| 10116 | |||
| 10117 | /* #define ENABLE_UTF_8_CONVERTER_TEST */ | ||
| 10118 | |||
| 10119 | #ifdef ENABLE_UTF_8_CONVERTER_TEST | ||
| 10120 | |||
| 10121 | /* These functions are useful for testing and benchmarking | ||
| 10122 | encode_string_utf_8 and decode_string_utf_8. */ | ||
| 10123 | |||
| 10124 | /* ENCODE_METHOD specifies which internal encoder to use. | ||
| 10125 | If it is Qnil, use encode_string_utf_8. | ||
| 10126 | Otherwise, use code_convert_string. | ||
| 10127 | |||
| 10128 | COUNT must be a fixnum; it specifies how many times to call the selected function | ||
| 10129 | with the same arguments (for benchmarking). The value of the last call is returned, or Qnil if COUNT is not positive. */ | ||
| 10130 | |||
| 10131 | DEFUN ("internal-encode-string-utf-8", Finternal_encode_string_utf_8, | ||
| 10132 | Sinternal_encode_string_utf_8, 7, 7, 0, | ||
| 10133 | doc: /* Internal use only.*/) | ||
| 10134 | (Lisp_Object string, Lisp_Object buffer, Lisp_Object nocopy, | ||
| 10135 | Lisp_Object handle_8_bit, Lisp_Object handle_over_uni, | ||
| 10136 | Lisp_Object encode_method, Lisp_Object count) | ||
| 10137 | { | ||
| 10138 | int repeat_count; | ||
| 10139 | Lisp_Object val; | ||
| 10140 | |||
| 10141 | /* Check arguments. Return Qnil when an argument is invalid. BUFFER must be nil or a unibyte buffer, and a string handler must be unibyte. */ | ||
| 10142 | if (! STRINGP (string)) | ||
| 10143 | return Qnil; | ||
| 10144 | if (! NILP (buffer) | ||
| 10145 | && (! BUFFERP (buffer) | ||
| 10146 | || ! NILP (BVAR (XBUFFER (buffer), enable_multibyte_characters)))) | ||
| 10147 | return Qnil; | ||
| 10148 | if (! NILP (handle_8_bit) && ! EQ (handle_8_bit, Qt) | ||
| 10149 | && ! EQ (handle_8_bit, Qignored) | ||
| 10150 | && ! CHARACTERP (handle_8_bit) | ||
| 10151 | && (! STRINGP (handle_8_bit) || STRING_MULTIBYTE (handle_8_bit))) | ||
| 10152 | return Qnil; | ||
| 10153 | if (! NILP (handle_over_uni) && ! EQ (handle_over_uni, Qt) | ||
| 10154 | && ! EQ (handle_over_uni, Qignored) | ||
| 10155 | && ! CHARACTERP (handle_over_uni) | ||
| 10156 | && (! STRINGP (handle_over_uni) || STRING_MULTIBYTE (handle_over_uni))) | ||
| 10157 | return Qnil; | ||
| 10158 | |||
| 10159 | CHECK_FIXNUM (count); | ||
| 10160 | repeat_count = XFIXNUM (count); | ||
| 10161 | |||
| 10162 | val = Qnil; | ||
| 10163 | /* Run an encoder according to ENCODE_METHOD. */ | ||
| 10164 | if (NILP (encode_method)) | ||
| 10165 | { | ||
| 10166 | for (int i = 0; i < repeat_count; i++) | ||
| 10167 | val = encode_string_utf_8 (string, buffer, ! NILP (nocopy), | ||
| 10168 | handle_8_bit, handle_over_uni); | ||
| 10169 | } | ||
| 10170 | else | ||
| 10171 | { | ||
| 10172 | for (int i = 0; i < repeat_count; i++) | ||
| 10173 | val = code_convert_string (string, Qutf_8_unix, Qnil, true, | ||
| 10174 | ! NILP (nocopy), true); | ||
| 10175 | } | ||
| 10176 | return val; | ||
| 10177 | } | ||
| 10178 | |||
| 10179 | /* DECODE_METHOD specifies which internal decoder to use. | ||
| 10180 | If it is Qnil, use decode_string_utf_8. | ||
| 10181 | If it is Qt, use code_convert_string. | ||
| 10182 | Otherwise, use make_string_from_utf8. | ||
| 10183 | |||
| 10184 | COUNT must be a fixnum; it specifies how many times to call the selected function | ||
| 10185 | with the same arguments (for benchmarking). The value of the last call is returned, or Qnil if COUNT is not positive. */ | ||
| 10186 | |||
| 10187 | DEFUN ("internal-decode-string-utf-8", Finternal_decode_string_utf_8, | ||
| 10188 | Sinternal_decode_string_utf_8, 7, 7, 0, | ||
| 10189 | doc: /* Internal use only.*/) | ||
| 10190 | (Lisp_Object string, Lisp_Object buffer, Lisp_Object nocopy, | ||
| 10191 | Lisp_Object handle_8_bit, Lisp_Object handle_over_uni, | ||
| 10192 | Lisp_Object decode_method, Lisp_Object count) | ||
| 10193 | { | ||
| 10194 | int repeat_count; | ||
| 10195 | Lisp_Object val; | ||
| 10196 | |||
| 10197 | /* Check arguments. Return Qnil when an argument is invalid. BUFFER must be nil or a multibyte buffer, and a string handler must be multibyte. */ | ||
| 10198 | if (! STRINGP (string)) | ||
| 10199 | return Qnil; | ||
| 10200 | if (! NILP (buffer) | ||
| 10201 | && (! BUFFERP (buffer) | ||
| 10202 | || NILP (BVAR (XBUFFER (buffer), enable_multibyte_characters)))) | ||
| 10203 | return Qnil; | ||
| 10204 | if (! NILP (handle_8_bit) && ! EQ (handle_8_bit, Qt) | ||
| 10205 | && ! EQ (handle_8_bit, Qignored) | ||
| 10206 | && ! CHARACTERP (handle_8_bit) | ||
| 10207 | && (! STRINGP (handle_8_bit) || ! STRING_MULTIBYTE (handle_8_bit))) | ||
| 10208 | return Qnil; | ||
| 10209 | if (! NILP (handle_over_uni) && ! EQ (handle_over_uni, Qt) | ||
| 10210 | && ! EQ (handle_over_uni, Qignored) | ||
| 10211 | && ! CHARACTERP (handle_over_uni) | ||
| 10212 | && (! STRINGP (handle_over_uni) || ! STRING_MULTIBYTE (handle_over_uni))) | ||
| 10213 | return Qnil; | ||
| 10214 | |||
| 10215 | CHECK_FIXNUM (count); | ||
| 10216 | repeat_count = XFIXNUM (count); | ||
| 10217 | |||
| 10218 | val = Qnil; | ||
| 10219 | /* Run a decoder according to DECODE_METHOD. */ | ||
| 10220 | if (NILP (decode_method)) | ||
| 10221 | { | ||
| 10222 | for (int i = 0; i < repeat_count; i++) | ||
| 10223 | val = decode_string_utf_8 (string, buffer, ! NILP (nocopy), | ||
| 10224 | handle_8_bit, handle_over_uni); | ||
| 10225 | } | ||
| 10226 | else if (EQ (decode_method, Qt)) | ||
| 10227 | { | ||
| 10228 | if (! BUFFERP (buffer)) | ||
| 10229 | buffer = Qt; | ||
| 10230 | for (int i = 0; i < repeat_count; i++) | ||
| 10231 | val = code_convert_string (string, Qutf_8_unix, buffer, false, | ||
| 10232 | ! NILP (nocopy), true); | ||
| 10233 | } | ||
| 10234 | else if (! NILP (decode_method)) | ||
| 10235 | { | ||
| 10236 | for (int i = 0; i < repeat_count; i++) | ||
| 10237 | val = make_string_from_utf8 ((char *) SDATA (string), SBYTES (string)); | ||
| 10238 | } | ||
| 10239 | return val; | ||
| 10240 | } | ||
| 10241 | |||
| 10242 | #endif /* ENABLE_UTF_8_CONVERTER_TEST */ | ||
| 10243 | |||
| 9518 | /* Encode or decode a file name, to or from a unibyte string suitable | 10244 | /* Encode or decode a file name, to or from a unibyte string suitable |
| 9519 | for passing to C library functions. */ | 10245 | for passing to C library functions. */ |
| 9520 | Lisp_Object | 10246 | Lisp_Object |
| @@ -10974,6 +11700,10 @@ syms_of_coding (void) | |||
| 10974 | defsubr (&Sencode_coding_region); | 11700 | defsubr (&Sencode_coding_region); |
| 10975 | defsubr (&Sdecode_coding_string); | 11701 | defsubr (&Sdecode_coding_string); |
| 10976 | defsubr (&Sencode_coding_string); | 11702 | defsubr (&Sencode_coding_string); |
| 11703 | #ifdef ENABLE_UTF_8_CONVERTER_TEST | ||
| 11704 | defsubr (&Sinternal_encode_string_utf_8); | ||
| 11705 | defsubr (&Sinternal_decode_string_utf_8); | ||
| 11706 | #endif /* ENABLE_UTF_8_CONVERTER_TEST */ | ||
| 10977 | defsubr (&Sdecode_sjis_char); | 11707 | defsubr (&Sdecode_sjis_char); |
| 10978 | defsubr (&Sencode_sjis_char); | 11708 | defsubr (&Sencode_sjis_char); |
| 10979 | defsubr (&Sdecode_big5_char); | 11709 | defsubr (&Sdecode_big5_char); |