aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorKenichi Handa2000-05-19 23:52:27 +0000
committerKenichi Handa2000-05-19 23:52:27 +0000
commit2e344af3e731463b6733239d0c9520645072ce11 (patch)
tree93361dd3c2f08fe274ebc46d83a5a48fd86b5272 /src
parentbd045987c20414ae1de41518d39dcb34e652b696 (diff)
downloademacs-2e344af3e731463b6733239d0c9520645072ce11.tar.gz
emacs-2e344af3e731463b6733239d0c9520645072ce11.zip
(Qeight_bit_control, Qeight_bit_graphic): New
variables. (SPLIT_CHARACTER_SEQ): This macro deleted. (SPLIT_MULTIBYTE_SEQ): Assume that multibyte sequence at STR is valid. (CHAR_COMPONENTS_VALID_P): Handle new charsets; eight-bit-control and eight-bit-graphic. (char_to_string): Likewise. Signal an error for too large character code. (char_printable_p): Return 0 for 8-bit characters. (update_charset_table): Update iso_charset_table only when a final character is non-negative. (find_charset_in_text): Renamed from find_charset_in_str. Arguments and return value changed. Callers changed. (Fdefine_charset): Args ISO-FINAL-CHAR and ISO-GRAPHIC-PLANE can be -1 if CHARSET is used only internally. (Fmake_char_internal): Handle new charsets; eight-bit-control and eight-bit-graphic. (Fcharset_after): Simplified. (char_valid_p): Use SPLIT_CHAR, not SPLIT_NON_ASCII_CHAR. (char_bytes): Return 2 for chars of the range 0x80..0x9F. (multibyte_chars_in_text): Simplified by assuming there's no invalid multibyte sequence. (parse_str_as_multibyte, str_as_multibyte, str_to_multibyte, str_as_unibyte): New functions. (Fstring): Simplified by assuming that byte combining never happens. (init_charset_once): Initialization for LEADING_CODE_8_BIT_CONTROL. (syms_of_charset): Intern and staticpro Qeight_bit_control and Qeight_bit_graphic. Include them in Vcharset_list. Make charsets eight-bit-control and eight-bit-graphic.
Diffstat (limited to 'src')
-rw-r--r--src/charset.c595
1 files changed, 370 insertions, 225 deletions
diff --git a/src/charset.c b/src/charset.c
index cea0c57207c..bf9ce66d927 100644
--- a/src/charset.c
+++ b/src/charset.c
@@ -43,7 +43,7 @@ Boston, MA 02111-1307, USA. */
43 43
44#endif /* emacs */ 44#endif /* emacs */
45 45
46Lisp_Object Qcharset, Qascii; 46Lisp_Object Qcharset, Qascii, Qeight_bit_control, Qeight_bit_graphic;
47Lisp_Object Qunknown; 47Lisp_Object Qunknown;
48 48
49/* Declaration of special leading-codes. */ 49/* Declaration of special leading-codes. */
@@ -52,8 +52,8 @@ int leading_code_private_12; /* for private DIMENSION1 of 2-column */
52int leading_code_private_21; /* for private DIMENSION2 of 1-column */ 52int leading_code_private_21; /* for private DIMENSION2 of 1-column */
53int leading_code_private_22; /* for private DIMENSION2 of 2-column */ 53int leading_code_private_22; /* for private DIMENSION2 of 2-column */
54 54
55/* Declaration of special charsets. */ 55/* Declaration of special charsets. The values are set by
56int charset_ascii; /* ASCII */ 56 Fsetup_special_charsets. */
57int charset_latin_iso8859_1; /* ISO8859-1 (Latin-1) */ 57int charset_latin_iso8859_1; /* ISO8859-1 (Latin-1) */
58int charset_jisx0208_1978; /* JISX0208.1978 (Japanese Kanji old set) */ 58int charset_jisx0208_1978; /* JISX0208.1978 (Japanese Kanji old set) */
59int charset_jisx0208; /* JISX0208.1983 (Japanese Kanji) */ 59int charset_jisx0208; /* JISX0208.1983 (Japanese Kanji) */
@@ -115,63 +115,46 @@ invalid_character (c)
115 error ("Invalid character: 0%o, %d, 0x%x", c, c, c); 115 error ("Invalid character: 0%o, %d, 0x%x", c, c, c);
116} 116}
117 117
118/* Parse a multibyte character string STR of length LENGTH (>= 2) set 118/* Parse string STR of length LENGTH and fetch information of a
119 BYTES to the length of actual multibyte sequence, CHARSET, C1, and 119 character at STR. Set BYTES to the byte length the character
120 C2 to such values that MAKE_CHAR can make the multibyte character 120 occupies, CHARSET, C1, C2 to proper values of the character. */
121 from them. 121
122 122#define SPLIT_MULTIBYTE_SEQ(str, length, bytes, charset, c1, c2) \
123 It is assumed that *STR is one of base leading codes and the 123 do { \
124 following (LENGTH - 1) bytes satisfy !CHAR_HEAD_P. 124 (c1) = *(str); \
125 125 (bytes) = BYTES_BY_CHAR_HEAD (c1); \
126 This macro should be called only from SPLIT_MULTIBYTE_SEQ. */ 126 if ((bytes) == 1) \
127 127 (charset) = ASCII_BYTE_P (c1) ? CHARSET_ASCII : CHARSET_8_BIT_GRAPHIC; \
128#define SPLIT_CHARACTER_SEQ(str, length, bytes, charset, c1, c2) \ 128 else if ((bytes) == 2) \
129 do { \ 129 { \
130 (bytes) = 1; \ 130 if ((c1) == LEADING_CODE_8_BIT_CONTROL) \
131 (charset) = (str)[0]; \ 131 (charset) = CHARSET_8_BIT_CONTROL, (c1) = (str)[1] - 0x20; \
132 if ((charset) >= LEADING_CODE_PRIVATE_11 \ 132 else \
133 && (charset) <= LEADING_CODE_PRIVATE_22) \ 133 (charset) = (c1), (c1) = (str)[1] & 0x7F; \
134 (charset) = (str)[(bytes)++]; \ 134 } \
135 if ((bytes) < (length)) \ 135 else if ((bytes) == 3) \
136 { \ 136 { \
137 (c1) = (str)[(bytes)++] & 0x7F; \ 137 if ((c1) < LEADING_CODE_PRIVATE_11) \
138 if ((bytes) < (length)) \ 138 (charset) = (c1), (c1) = (str)[1] & 0x7F, (c2) = (str)[2] & 0x7F; \
139 (c2) = (str)[(bytes)++] & 0x7F; \ 139 else \
140 else \ 140 (charset) = (str)[1], (c1) = (str)[2] & 0x7F; \
141 (c2) = -1; \ 141 } \
142 } \ 142 else \
143 else \ 143 (charset) = (str)[1], (c1) = (str)[2] & 0x7F, (c2) = (str)[3] & 0x7F; \
144 (c1) = (c2) = -1; \
145 } while (0)
146
147/* Parse string STR of length LENGTH and check if a multibyte
148 characters is at STR. Set BYTES to the actual length, CHARSET, C1,
149 C2 to proper values for that character. */
150
151#define SPLIT_MULTIBYTE_SEQ(str, length, bytes, charset, c1, c2) \
152 do { \
153 int i; \
154 if (ASCII_BYTE_P ((str)[0])) \
155 i = 1; \
156 else \
157 for (i = 1; i < (length) && ! CHAR_HEAD_P ((str)[i]); i++); \
158 if (i == 1) \
159 (bytes) = 1, (charset) = CHARSET_ASCII, (c1) = (str)[0] ; \
160 else \
161 { \
162 if (i > BYTES_BY_CHAR_HEAD ((str)[0])) \
163 i = BYTES_BY_CHAR_HEAD ((str)[0]); \
164 SPLIT_CHARACTER_SEQ (str, i, bytes, charset, c1, c2); \
165 } \
166 } while (0) 144 } while (0)
167 145
168/* 1 if CHARSET, C1, and C2 compose a valid character, else 0. */ 146/* 1 if CHARSET, C1, and C2 compose a valid character, else 0. */
169#define CHAR_COMPONENTS_VALID_P(charset, c1, c2) \ 147#define CHAR_COMPONENTS_VALID_P(charset, c1, c2) \
170 (charset == CHARSET_ASCII \ 148 ((charset) == CHARSET_ASCII \
171 ? ((c1) >= 0 && (c1) <= 0x7F) \ 149 ? ((c1) >= 0 && (c1) <= 0x7F) \
172 : (CHARSET_DIMENSION (charset) == 1 \ 150 : ((charset) == CHARSET_8_BIT_CONTROL \
173 ? ((c1) >= 0x20 && (c1) <= 0x7F) \ 151 ? ((c1) >= 0x80 && (c1) <= 0x9F) \
174 : ((c1) >= 0x20 && (c1) <= 0x7F && (c2) >= 0x20 && (c2) <= 0x7F))) 152 : ((charset) == CHARSET_8_BIT_GRAPHIC \
153 ? ((c1) >= 0x80 && (c1) <= 0xFF) \
154 : (CHARSET_DIMENSION (charset) == 1 \
155 ? ((c1) >= 0x20 && (c1) <= 0x7F) \
156 : ((c1) >= 0x20 && (c1) <= 0x7F \
157 && (c2) >= 0x20 && (c2) <= 0x7F)))))
175 158
176/* Store multi-byte form of the character C in STR. The caller should 159/* Store multi-byte form of the character C in STR. The caller should
177 allocate at least 4-byte area at STR in advance. Returns the 160 allocate at least 4-byte area at STR in advance. Returns the
@@ -227,14 +210,22 @@ char_to_string (c, str)
227 /* If C still has any modifier bits, it is an invalid character. */ 210 /* If C still has any modifier bits, it is an invalid character. */
228 if (c & CHAR_MODIFIER_MASK) 211 if (c & CHAR_MODIFIER_MASK)
229 invalid_character (c); 212 invalid_character (c);
230 213 }
231 *p++ = c; 214 if (SINGLE_BYTE_CHAR_P (c))
215 {
216 if (ASCII_BYTE_P (c) || c >= 0xA0)
217 *p++ = c;
218 else
219 {
220 *p++ = LEADING_CODE_8_BIT_CONTROL;
221 *p++ = c + 0x20;
222 }
232 } 223 }
233 else if (c < MAX_CHAR) 224 else if (c < MAX_CHAR)
234 { 225 {
235 int charset, c1, c2; 226 int charset, c1, c2;
236 227
237 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2); 228 SPLIT_CHAR (c, charset, c1, c2);
238 229
239 if (charset >= LEADING_CODE_EXT_11) 230 if (charset >= LEADING_CODE_EXT_11)
240 *p++ = (charset < LEADING_CODE_EXT_12 231 *p++ = (charset < LEADING_CODE_EXT_12
@@ -254,8 +245,10 @@ char_to_string (c, str)
254 *p++ = c2 | 0x80; 245 *p++ = c2 | 0x80;
255 } 246 }
256 } 247 }
248 else
249 invalid_character (c);
257 250
258 return (p -str); 251 return (p - str);
259} 252}
260 253
261/* Return the non-ASCII character corresponding to multi-byte form at 254/* Return the non-ASCII character corresponding to multi-byte form at
@@ -324,12 +317,14 @@ char_printable_p (c)
324{ 317{
325 int charset, c1, c2, chars; 318 int charset, c1, c2, chars;
326 319
327 if (SINGLE_BYTE_CHAR_P (c)) 320 if (ASCII_BYTE_P (c))
328 return 1; 321 return 1;
329 if (c >= MAX_CHAR) 322 else if (SINGLE_BYTE_CHAR_P (c))
323 return 0;
324 else if (c >= MAX_CHAR)
330 return 0; 325 return 0;
331 326
332 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2); 327 SPLIT_CHAR (c, charset, c1, c2);
333 if (! CHARSET_DEFINED_P (charset)) 328 if (! CHARSET_DEFINED_P (charset))
334 return 0; 329 return 0;
335 if (CHARSET_CHARS (charset) == 94 330 if (CHARSET_CHARS (charset) == 94
@@ -479,7 +474,7 @@ update_charset_table (charset_id, dimension, chars, width, direction,
479 if (charset < MIN_CHARSET_PRIVATE_DIMENSION1) 474 if (charset < MIN_CHARSET_PRIVATE_DIMENSION1)
480 { 475 {
481 /* Official charset, it doesn't have an extended leading-code. */ 476 /* Official charset, it doesn't have an extended leading-code. */
482 if (charset != CHARSET_ASCII) 477 if (charset != CHARSET_ASCII && charset != CHARSET_8_BIT_GRAPHIC)
483 bytes += 1; /* For a base leading-code. */ 478 bytes += 1; /* For a base leading-code. */
484 leading_code_base = charset; 479 leading_code_base = charset;
485 leading_code_ext = 0; 480 leading_code_ext = 0;
@@ -499,7 +494,8 @@ update_charset_table (charset_id, dimension, chars, width, direction,
499 leading_code_ext = charset; 494 leading_code_ext = charset;
500 } 495 }
501 496
502 if (BYTES_BY_CHAR_HEAD (leading_code_base) != bytes) 497 if (charset != CHARSET_ASCII && charset != CHARSET_8_BIT_GRAPHIC
498 &&BYTES_BY_CHAR_HEAD (leading_code_base) != bytes)
503 error ("Invalid dimension for the charset-ID %d", charset); 499 error ("Invalid dimension for the charset-ID %d", charset);
504 500
505 CHARSET_TABLE_INFO (charset, CHARSET_ID_IDX) = charset_id; 501 CHARSET_TABLE_INFO (charset, CHARSET_ID_IDX) = charset_id;
@@ -562,7 +558,8 @@ update_charset_table (charset_id, dimension, chars, width, direction,
562 } 558 }
563 559
564 /* Update table iso_charset_table. */ 560 /* Update table iso_charset_table. */
565 if (ISO_CHARSET_TABLE (dimension, chars, iso_final_char) < 0) 561 if (iso_final_char >= 0
562 && ISO_CHARSET_TABLE (dimension, chars, iso_final_char) < 0)
566 ISO_CHARSET_TABLE (dimension, chars, iso_final_char) = charset; 563 ISO_CHARSET_TABLE (dimension, chars, iso_final_char) = charset;
567} 564}
568 565
@@ -633,10 +630,12 @@ render from right to left.\n\
633\n\ 630\n\
634ISO-FINAL-CHAR (character) is the final character of the\n\ 631ISO-FINAL-CHAR (character) is the final character of the\n\
635corresponding ISO 2022 charset.\n\ 632corresponding ISO 2022 charset.\n\
633It may be -1 if the charset is internal use only.\n\
636\n\ 634\n\
637ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked\n\ 635ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked\n\
638while encoding to variants of ISO 2022 coding system, one of the\n\ 636while encoding to variants of ISO 2022 coding system, one of the\n\
639following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).\n\ 637following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).\n\
638It may be -1 if the charset is internal use only.\n\
640\n\ 639\n\
641SHORT-NAME (string) is the short name to refer to the charset.\n\ 640SHORT-NAME (string) is the short name to refer to the charset.\n\
642\n\ 641\n\
@@ -667,8 +666,10 @@ DESCRIPTION (string) is the description string of the charset.")
667 || !INTEGERP (vec[1]) || !(XINT (vec[1]) == 94 || XINT (vec[1]) == 96) 666 || !INTEGERP (vec[1]) || !(XINT (vec[1]) == 94 || XINT (vec[1]) == 96)
668 || !INTEGERP (vec[2]) || !(XINT (vec[2]) == 1 || XINT (vec[2]) == 2) 667 || !INTEGERP (vec[2]) || !(XINT (vec[2]) == 1 || XINT (vec[2]) == 2)
669 || !INTEGERP (vec[3]) || !(XINT (vec[3]) == 0 || XINT (vec[3]) == 1) 668 || !INTEGERP (vec[3]) || !(XINT (vec[3]) == 0 || XINT (vec[3]) == 1)
670 || !INTEGERP (vec[4]) || !(XINT (vec[4]) >= '0' && XINT (vec[4]) <= '~') 669 || !INTEGERP (vec[4])
671 || !INTEGERP (vec[5]) || !(XINT (vec[5]) == 0 || XINT (vec[5]) == 1) 670 || !(XINT (vec[4]) == -1 || XINT (vec[4]) >= '0' && XINT (vec[4]) <= '~')
671 || !INTEGERP (vec[5])
672 || !(XINT (vec[5]) == -1 || XINT (vec[5]) == 0 || XINT (vec[5]) == 1)
672 || !STRINGP (vec[6]) 673 || !STRINGP (vec[6])
673 || !STRINGP (vec[7]) 674 || !STRINGP (vec[7])
674 || !STRINGP (vec[8])) 675 || !STRINGP (vec[8]))
@@ -757,69 +758,84 @@ CHARSET should be defined by `defined-charset' in advance.")
757 return Qnil; 758 return Qnil;
758} 759}
759 760
760/* Return number of different charsets in STR of length LEN. In 761/* Return information about charsets in the text at PTR of NBYTES
761 addition, for each found charset N, CHARSETS[N] is set 1. The 762 bytes, which are NCHARS characters. The value is:
762 caller should allocate CHARSETS (MAX_CHARSET + 1 elements) in advance. 763 0: No multibyte characters (including 8-bit code of range 0x80..0x9F)
763 It may lookup a translation table TABLE if supplied. 764 are found.
765 1: No charsets other than ascii, eight-bit-control,
766 eight-bit-graphic, and latin-1 are found.
767 2: Otherwise.
764 768
765 If MULTIBYTE is zero, do not check multibyte characters, i.e. if 769 In addition, if CHARSETS is nonzero, for each found charset N, set
766 any ASCII codes (7-bit) are found, CHARSET[0] is set to 1, if any 770 CHARSETS[N] to 1. For that, callers should allocate CHARSETS
767 8-bit codes are found CHARSET[1] is set to 1. */ 771 (MAX_CHARSET + 1 elements) in advance. It may lookup a translation
772 table TABLE if supplied. For invalid charsets, set CHARSETS[1] to
773 1 (note that there's no charset whose ID is 1). */
768 774
769int 775int
770find_charset_in_str (str, len, charsets, table, multibyte) 776find_charset_in_text (ptr, nchars, nbytes, charsets, table)
771 unsigned char *str; 777 unsigned char *ptr;
772 int len, *charsets; 778 int nchars, nbytes, *charsets;
773 Lisp_Object table; 779 Lisp_Object table;
774 int multibyte;
775{ 780{
776 register int num = 0; 781 if (nchars == nbytes)
777
778 if (! multibyte)
779 { 782 {
780 unsigned char *endp = str + len; 783 if (charsets && nbytes > 0)
781 int maskbits = 0;
782
783 while (str < endp && maskbits != 3)
784 maskbits |= (*str++ < 0x80 ? 1 : 2);
785 if (maskbits & 1)
786 {
787 charsets[0] = 1;
788 num++;
789 }
790 if (maskbits & 2)
791 { 784 {
792 charsets[1] = 1; 785 unsigned char *endp = ptr + nbytes;
793 num++; 786 int maskbits = 0;
787
788 while (ptr < endp && maskbits != 7)
789 {
790 maskbits |= (*ptr < 0x80 ? 1 : *ptr < 0xA0 ? 2 : 4);
791 ptr++;
792 }
793
794 if (maskbits & 1)
795 charsets[CHARSET_ASCII] = 1;
796 if (maskbits & 2)
797 charsets[CHARSET_8_BIT_CONTROL] = 1;
798 if (maskbits & 4)
799 charsets[CHARSET_8_BIT_GRAPHIC] = 1;
794 } 800 }
795 return num; 801 return 0;
796 } 802 }
797 803 else
798 if (! CHAR_TABLE_P (table))
799 table = Qnil;
800
801 while (len > 0)
802 { 804 {
805 int return_val = 1;
803 int bytes, charset, c1, c2; 806 int bytes, charset, c1, c2;
804 807
805 SPLIT_MULTIBYTE_SEQ (str, len, bytes, charset, c1, c2); 808 if (! CHAR_TABLE_P (table))
809 table = Qnil;
806 810
807 if (! NILP (table)) 811 while (nchars-- > 0)
808 { 812 {
809 int c1 = translate_char (table, -1, charset, c1, c2); 813 SPLIT_MULTIBYTE_SEQ (ptr, len, bytes, charset, c1, c2);
810 if (c1 >= 0) 814 ptr += bytes;
811 charset = CHAR_CHARSET (c1);
812 }
813 815
814 if (!charsets[charset]) 816 if (!CHARSET_DEFINED_P (charset))
815 { 817 charset = 1;
816 charsets[charset] = 1; 818 else if (! NILP (table))
817 num += 1; 819 {
820 int c = translate_char (table, -1, charset, c1, c2);
821 if (c >= 0)
822 charset = CHAR_CHARSET (c);
823 }
824
825 if (return_val == 1
826 && charset != CHARSET_ASCII
827 && charset != CHARSET_8_BIT_CONTROL
828 && charset != CHARSET_8_BIT_GRAPHIC
829 && charset != charset_latin_iso8859_1)
830 return_val = 2;
831
832 if (charsets)
833 charsets[charset] = 1;
834 else if (return_val == 2)
835 break;
818 } 836 }
819 str += bytes; 837 return return_val;
820 len -= bytes;
821 } 838 }
822 return num;
823} 839}
824 840
825DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region, 841DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
@@ -831,17 +847,14 @@ Optional arg TABLE if non-nil is a translation table to look up.\n\
831If the region contains invalid multiybte characters,\n\ 847If the region contains invalid multiybte characters,\n\
832`unknown' is included in the returned list.\n\ 848`unknown' is included in the returned list.\n\
833\n\ 849\n\
834If the current buffer is unibyte, the returned list contains\n\ 850If the current buffer is unibyte, the returned list may contain\n\
835`ascii' if any 7-bit characters are found,\n\ 851only `ascii', `eight-bit-control', and `eight-bit-graphic'.")
836and `unknown' if any 8-bit characters are found.")
837 (beg, end, table) 852 (beg, end, table)
838 Lisp_Object beg, end, table; 853 Lisp_Object beg, end, table;
839{ 854{
840 int charsets[MAX_CHARSET + 1]; 855 int charsets[MAX_CHARSET + 1];
841 int from, from_byte, to, stop, stop_byte, i; 856 int from, from_byte, to, stop, stop_byte, i;
842 Lisp_Object val; 857 Lisp_Object val;
843 int undefined;
844 int multibyte = !NILP (current_buffer->enable_multibyte_characters);
845 858
846 validate_region (&beg, &end); 859 validate_region (&beg, &end);
847 from = XFASTINT (beg); 860 from = XFASTINT (beg);
@@ -860,8 +873,8 @@ and `unknown' if any 8-bit characters are found.")
860 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int)); 873 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
861 while (1) 874 while (1)
862 { 875 {
863 find_charset_in_str (BYTE_POS_ADDR (from_byte), stop_byte - from_byte, 876 find_charset_in_text (BYTE_POS_ADDR (from_byte), stop - from,
864 charsets, table, multibyte); 877 stop_byte - from_byte, charsets, table);
865 if (stop < to) 878 if (stop < to)
866 { 879 {
867 from = stop, from_byte = stop_byte; 880 from = stop, from_byte = stop_byte;
@@ -872,17 +885,13 @@ and `unknown' if any 8-bit characters are found.")
872 } 885 }
873 886
874 val = Qnil; 887 val = Qnil;
875 undefined = 0; 888 if (charsets[1])
876 for (i = (multibyte ? MAX_CHARSET : 1); i >= 0; i--)
877 if (charsets[i])
878 {
879 if (CHARSET_DEFINED_P (i))
880 val = Fcons (CHARSET_SYMBOL (i), val);
881 else
882 undefined = 1;
883 }
884 if (undefined)
885 val = Fcons (Qunknown, val); 889 val = Fcons (Qunknown, val);
890 for (i = MAX_CHARSET; i >= MIN_CHARSET_OFFICIAL_DIMENSION1; i--)
891 if (charsets[i])
892 val = Fcons (CHARSET_SYMBOL (i), val);
893 if (charsets[0])
894 val = Fcons (Qascii, val);
886 return val; 895 return val;
887} 896}
888 897
@@ -894,38 +903,32 @@ Optional arg TABLE if non-nil is a translation table to look up.\n\
894If the region contains invalid multiybte characters,\n\ 903If the region contains invalid multiybte characters,\n\
895`unknown' is included in the returned list.\n\ 904`unknown' is included in the returned list.\n\
896\n\ 905\n\
897If STR is unibyte, the returned list contains\n\ 906If STR is unibyte, the returned list may contain\n\
898`ascii' if any 7-bit characters are found,\n\ 907only `ascii', `eight-bit-control', and `eight-bit-graphic'.")
899and `unknown' if any 8-bit characters are found.")
900 (str, table) 908 (str, table)
901 Lisp_Object str, table; 909 Lisp_Object str, table;
902{ 910{
903 int charsets[MAX_CHARSET + 1]; 911 int charsets[MAX_CHARSET + 1];
904 int i; 912 int i;
905 Lisp_Object val; 913 Lisp_Object val;
906 int undefined;
907 int multibyte;
908 914
909 CHECK_STRING (str, 0); 915 CHECK_STRING (str, 0);
910 multibyte = STRING_MULTIBYTE (str);
911 916
912 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int)); 917 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
913 find_charset_in_str (XSTRING (str)->data, STRING_BYTES (XSTRING (str)), 918 find_charset_in_text (XSTRING (str)->data, XSTRING (str)->size,
914 charsets, table, multibyte); 919 STRING_BYTES (XSTRING (str)), charsets, table);
920
915 val = Qnil; 921 val = Qnil;
916 undefined = 0; 922 if (charsets[1])
917 for (i = (multibyte ? MAX_CHARSET : 1); i >= 0; i--)
918 if (charsets[i])
919 {
920 if (CHARSET_DEFINED_P (i))
921 val = Fcons (CHARSET_SYMBOL (i), val);
922 else
923 undefined = 1;
924 }
925 if (undefined)
926 val = Fcons (Qunknown, val); 923 val = Fcons (Qunknown, val);
924 for (i = MAX_CHARSET; i >= MIN_CHARSET_OFFICIAL_DIMENSION1; i--)
925 if (charsets[i])
926 val = Fcons (CHARSET_SYMBOL (i), val);
927 if (charsets[0])
928 val = Fcons (Qascii, val);
927 return val; 929 return val;
928} 930}
931
929 932
930DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0, 933DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0,
931 "") 934 "")
@@ -954,8 +957,26 @@ DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0,
954 c2 = XINT (code2); 957 c2 = XINT (code2);
955 } 958 }
956 959
957 if (c1 < 0 || c1 > 0xFF || c2 < 0 || c2 > 0xFF) 960 if (charset_id == CHARSET_ASCII)
958 error ("Invalid code points for charset ID %d: %d %d", charset_id, c1, c2); 961 {
962 if (c1 < 0 || c1 > 0x7F)
963 goto invalid_code_posints;
964 return make_number (c1);
965 }
966 else if (charset_id == CHARSET_8_BIT_CONTROL)
967 {
968 if (c1 < 0x80 || c1 > 0x9F)
969 goto invalid_code_posints;
970 return make_number (c1);
971 }
972 else if (charset_id == CHARSET_8_BIT_GRAPHIC)
973 {
974 if (c1 < 0xA0 || c1 > 0xFF)
975 goto invalid_code_posints;
976 return make_number (c1);
977 }
978 else if (c1 < 0 || c1 > 0xFF || c2 < 0 || c2 > 0xFF)
979 goto invalid_code_posints;
959 c1 &= 0x7F; 980 c1 &= 0x7F;
960 c2 &= 0x7F; 981 c2 &= 0x7F;
961 if (c1 == 0 982 if (c1 == 0
@@ -963,9 +984,11 @@ DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0,
963 : (c2 == 0 984 : (c2 == 0
964 ? !CHAR_COMPONENTS_VALID_P (charset_id, c1, 0x20) 985 ? !CHAR_COMPONENTS_VALID_P (charset_id, c1, 0x20)
965 : !CHAR_COMPONENTS_VALID_P (charset_id, c1, c2))) 986 : !CHAR_COMPONENTS_VALID_P (charset_id, c1, c2)))
966 error ("Invalid code points for charset ID %d: %d %d", charset_id, c1, c2); 987 goto invalid_code_posints;
967
968 return make_number (MAKE_CHAR (charset_id, c1, c2)); 988 return make_number (MAKE_CHAR (charset_id, c1, c2));
989
990 invalid_code_posints:
991 error ("Invalid code points for charset ID %d: %d %d", charset_id, c1, c2);
969} 992}
970 993
971DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0, 994DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
@@ -1006,34 +1029,13 @@ If POS is out of range, the value is nil.")
1006 (pos) 1029 (pos)
1007 Lisp_Object pos; 1030 Lisp_Object pos;
1008{ 1031{
1009 register int pos_byte, bytes, charset, c1, c2; 1032 Lisp_Object ch;
1010 register unsigned char *p; 1033 int charset;
1011
1012 if (NILP (pos))
1013 pos_byte = PT_BYTE;
1014 else if (MARKERP (pos))
1015 {
1016 pos_byte = marker_byte_position (pos);
1017 if (pos_byte < BEGV_BYTE || pos_byte >= ZV_BYTE)
1018 return Qnil;
1019 }
1020 else
1021 {
1022 CHECK_NUMBER (pos, 0);
1023 if (XINT (pos) < BEGV || XINT (pos) >= ZV)
1024 return Qnil;
1025 pos_byte = CHAR_TO_BYTE (XINT (pos));
1026 }
1027 p = BYTE_POS_ADDR (pos_byte);
1028 if (BASE_LEADING_CODE_P (*p))
1029 {
1030 SPLIT_MULTIBYTE_SEQ (p, Z_BYTE - pos_byte, bytes, charset, c1, c2);
1031 if (charset < 0)
1032 charset = 1;
1033 }
1034 else
1035 charset = CHARSET_ASCII;
1036 1034
1035 ch = Fchar_after (pos);
1036 if (! INTEGERP (ch))
1037 return ch;
1038 charset = CHAR_CHARSET (XINT (ch));
1037 return CHARSET_SYMBOL (charset); 1039 return CHARSET_SYMBOL (charset);
1038} 1040}
1039 1041
@@ -1073,7 +1075,7 @@ char_valid_p (c, genericp)
1073 return 0; 1075 return 0;
1074 if (SINGLE_BYTE_CHAR_P (c)) 1076 if (SINGLE_BYTE_CHAR_P (c))
1075 return 1; 1077 return 1;
1076 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2); 1078 SPLIT_CHAR (c, charset, c1, c2);
1077 if (genericp) 1079 if (genericp)
1078 { 1080 {
1079 if (c1) 1081 if (c1)
@@ -1161,7 +1163,9 @@ char_bytes (c)
1161{ 1163{
1162 int charset; 1164 int charset;
1163 1165
1164 if (SINGLE_BYTE_CHAR_P (c) || (c & ~((1 << CHARACTERBITS) - 1))) 1166 if (ASCII_BYTE_P (c) || (c & ~((1 << CHARACTERBITS) -1)))
1167 return 1;
1168 if (SINGLE_BYTE_CHAR_P (c) && c >= 0xA0)
1165 return 1; 1169 return 1;
1166 1170
1167 charset = CHAR_CHARSET (c); 1171 charset = CHAR_CHARSET (c);
@@ -1334,23 +1338,146 @@ multibyte_chars_in_text (ptr, nbytes)
1334 1338
1335 while (ptr < endp) 1339 while (ptr < endp)
1336 { 1340 {
1337 if (BASE_LEADING_CODE_P (*ptr)) 1341 PARSE_MULTIBYTE_SEQ (ptr, endp - ptr, bytes);
1342 ptr += bytes;
1343 chars++;
1344 }
1345
1346 return chars;
1347}
1348
1349/* Parse unibyte text at STR of LEN bytes as a multibyte text, and
1350 count the numbers of characters and bytes in it. On counting
1351 bytes, pay attention to that 8-bit characters in the range
1352 0x80..0x9F are represented by 2-byte in a multibyte text. */
1353void
1354parse_str_as_multibyte (str, len, nchars, nbytes)
1355 unsigned char *str;
1356 int len, *nchars, *nbytes;
1357{
1358 unsigned char *endp = str + len;
1359 int n, chars = 0, bytes = 0;
1360
1361 while (str < endp)
1362 {
1363 if (UNIBYTE_STR_AS_MULTIBYTE_P (str, endp - str, n))
1364 str += n, bytes += n;
1365 else
1366 str++, bytes += 2;
1367 chars++;
1368 }
1369 *nchars = chars;
1370 *nbytes = bytes;
1371 return;
1372}
1373
1374/* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
1375 It actually converts only 8-bit characters in the range 0x80..0x9F
1376 that don't construct multibyte characters to multibyte forms. If
1377 NCHARS is nonzero, set *NCHARS to the number of characters in the
1378 text. It is assured that we can use LEN bytes at STR as a work
1379 area and that is enough. Return the number of bytes of the
1380 resulting text. */
1381
1382int
1383str_as_multibyte (str, len, nbytes, nchars)
1384 unsigned char *str;
1385 int len, nbytes, *nchars;
1386{
1387 unsigned char *p = str, *endp = str + nbytes;
1388 unsigned char *to;
1389 int chars = 0;
1390 int n;
1391
1392 while (p < endp && UNIBYTE_STR_AS_MULTIBYTE_P (p, endp - p, n))
1393 p += n, chars++;
1394 if (nchars)
1395 *nchars = chars;
1396 if (p == endp)
1397 return nbytes;
1398
1399 to = p;
1400 nbytes = endp - p;
1401 endp = str + len;
1402 safe_bcopy (p, endp - nbytes, nbytes);
1403 p = endp - nbytes;
1404 while (p < endp)
1405 {
1406 if (UNIBYTE_STR_AS_MULTIBYTE_P (p, endp - p, n))
1338 { 1407 {
1339 PARSE_MULTIBYTE_SEQ (ptr, nbytes, bytes); 1408 while (n--)
1340 ptr += bytes; 1409 *to++ = *p++;
1341 nbytes -= bytes; 1410 }
1342 }
1343 else 1411 else
1344 { 1412 {
1345 ptr++; 1413 *to++ = LEADING_CODE_8_BIT_CONTROL;
1346 nbytes--; 1414 *to++ = *p++ + 0x20;
1347 } 1415 }
1348 chars++; 1416 chars++;
1349 } 1417 }
1418 if (nchars)
1419 *nchars = chars;
1420 return (to - str);
1421}
1350 1422
1351 return chars; 1423/* Convert unibyte text at STR of NBYTES bytes to a multibyte text
1424 that contains the same single-byte characters. It actually
1425 converts all 8-bit characters to multibyte forms. It is assured
1426 that we can use LEN bytes at STR as a work area and that is
1427 enough. */
1428
1429int
1430str_to_multibyte (str, len, bytes)
1431 unsigned char *str;
1432 int len, bytes;
1433{
1434 unsigned char *p = str, *endp = str + bytes;
1435 unsigned char *to;
1436 int c;
1437
1438 while (p < endp && (*p < 0x80 || *p >= 0xA0)) p++;
1439 if (p == endp)
1440 return bytes;
1441 to = p;
1442 bytes = endp - p;
1443 endp = str + len;
1444 safe_bcopy (p, endp - bytes, bytes);
1445 p = endp - bytes;
1446 while (p < endp)
1447 {
1448 if (*p < 0x80 || *p >= 0xA0)
1449 *to++ = *p++;
1450 else
1451 *to++ = LEADING_CODE_8_BIT_CONTROL, *to++ = *p++ + 0x20;
1452 }
1453 return (to - str);
1352} 1454}
1353 1455
1456/* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
1457 actually converts only 8-bit characters in the range 0x80..0x9F to
1458 unibyte forms. */
1459
1460int
1461str_as_unibyte (str, bytes)
1462 unsigned char *str;
1463 int bytes;
1464{
1465 unsigned char *p = str, *endp = str + bytes;
1466 unsigned char *to = str;
1467
1468 while (p < endp && *p != LEADING_CODE_8_BIT_CONTROL) p++;
1469 to = p;
1470 while (p < endp)
1471 {
1472 if (*p == LEADING_CODE_8_BIT_CONTROL)
1473 *to++ = *(p + 1) - 0x20, p += 2;
1474 else
1475 *to++ = *p++;
1476 }
1477 return (to - str);
1478}
1479
1480
1354DEFUN ("string", Fstring, Sstring, 1, MANY, 0, 1481DEFUN ("string", Fstring, Sstring, 1, MANY, 0,
1355 "Concatenate all the argument characters and make the result a string.") 1482 "Concatenate all the argument characters and make the result a string.")
1356 (n, args) 1483 (n, args)
@@ -1360,28 +1487,16 @@ DEFUN ("string", Fstring, Sstring, 1, MANY, 0,
1360 int i; 1487 int i;
1361 unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n); 1488 unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
1362 unsigned char *p = buf; 1489 unsigned char *p = buf;
1363 Lisp_Object val; 1490 int c;
1364 int c, multibyte_p = 0;
1365 1491
1366 for (i = 0; i < n; i++) 1492 for (i = 0; i < n; i++)
1367 { 1493 {
1368 CHECK_NUMBER (args[i], 0); 1494 CHECK_NUMBER (args[i], 0);
1369 c = XINT (args[i]); 1495 c = XINT (args[i]);
1370 p += CHAR_STRING (c, p); 1496 p += CHAR_STRING (c, p);
1371
1372 if (!SINGLE_BYTE_CHAR_P (c))
1373 multibyte_p = 1;
1374 } 1497 }
1375 1498
1376 /* Here, we can't use make_string_from_bytes because of the byte 1499 return make_string_from_bytes (buf, n, p - buf);
1377 combining problem. Make a multibyte string if there is any
1378 multibyte character in ARGS to make sure that `(string 2276)'
1379 returns a multibyte string if running --unibyte. */
1380 if (multibyte_p)
1381 val = make_multibyte_string (buf, n, p - buf);
1382 else
1383 val = make_unibyte_string (buf, p - buf);
1384 return val;
1385} 1500}
1386 1501
1387#endif /* emacs */ 1502#endif /* emacs */
@@ -1448,31 +1563,27 @@ init_charset_once ()
1448 iso_charset_table [i][j][k] = -1; 1563 iso_charset_table [i][j][k] = -1;
1449 1564
1450 for (i = 0; i < 256; i++) 1565 for (i = 0; i < 256; i++)
1451 BYTES_BY_CHAR_HEAD (i) = 1; 1566 bytes_by_char_head[i] = 1;
1452 for (i = MIN_CHARSET_OFFICIAL_DIMENSION1; 1567 for (i = MIN_CHARSET_OFFICIAL_DIMENSION1;
1453 i <= MAX_CHARSET_OFFICIAL_DIMENSION1; i++) 1568 i <= MAX_CHARSET_OFFICIAL_DIMENSION1; i++)
1454 BYTES_BY_CHAR_HEAD (i) = 2; 1569 bytes_by_char_head[i] = 2;
1455 for (i = MIN_CHARSET_OFFICIAL_DIMENSION2; 1570 for (i = MIN_CHARSET_OFFICIAL_DIMENSION2;
1456 i <= MAX_CHARSET_OFFICIAL_DIMENSION2; i++) 1571 i <= MAX_CHARSET_OFFICIAL_DIMENSION2; i++)
1457 BYTES_BY_CHAR_HEAD (i) = 3; 1572 bytes_by_char_head[i] = 3;
1458 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 3; 1573 bytes_by_char_head[LEADING_CODE_PRIVATE_11] = 3;
1459 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 3; 1574 bytes_by_char_head[LEADING_CODE_PRIVATE_12] = 3;
1460 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 4; 1575 bytes_by_char_head[LEADING_CODE_PRIVATE_21] = 4;
1461 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 4; 1576 bytes_by_char_head[LEADING_CODE_PRIVATE_22] = 4;
1462 /* The followings don't reflect the actual bytes, but just to tell 1577 bytes_by_char_head[LEADING_CODE_8_BIT_CONTROL] = 2;
1463 that it is a start of a multibyte character. */
1464 BYTES_BY_CHAR_HEAD (0x80) = 2;
1465 BYTES_BY_CHAR_HEAD (0x9E) = 2;
1466 BYTES_BY_CHAR_HEAD (0x9F) = 2;
1467 1578
1468 for (i = 0; i < 128; i++) 1579 for (i = 0; i < 128; i++)
1469 WIDTH_BY_CHAR_HEAD (i) = 1; 1580 width_by_char_head[i] = 1;
1470 for (; i < 256; i++) 1581 for (; i < 256; i++)
1471 WIDTH_BY_CHAR_HEAD (i) = 4; 1582 width_by_char_head[i] = 4;
1472 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 1; 1583 width_by_char_head[LEADING_CODE_PRIVATE_11] = 1;
1473 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 2; 1584 width_by_char_head[LEADING_CODE_PRIVATE_12] = 2;
1474 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 1; 1585 width_by_char_head[LEADING_CODE_PRIVATE_21] = 1;
1475 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 2; 1586 width_by_char_head[LEADING_CODE_PRIVATE_22] = 2;
1476 1587
1477 { 1588 {
1478 Lisp_Object val; 1589 Lisp_Object val;
@@ -1498,13 +1609,20 @@ init_charset_once ()
1498void 1609void
1499syms_of_charset () 1610syms_of_charset ()
1500{ 1611{
1612 Qcharset = intern ("charset");
1613 staticpro (&Qcharset);
1614
1501 Qascii = intern ("ascii"); 1615 Qascii = intern ("ascii");
1502 staticpro (&Qascii); 1616 staticpro (&Qascii);
1503 1617
1504 Qcharset = intern ("charset"); 1618 Qeight_bit_control = intern ("eight-bit-control");
1505 staticpro (&Qcharset); 1619 staticpro (&Qeight_bit_control);
1620
1621 Qeight_bit_graphic = intern ("eight-bit-graphic");
1622 staticpro (&Qeight_bit_graphic);
1506 1623
1507 /* Define ASCII charset now. */ 1624 /* Define special charsets ascii, eight-bit-control, and
1625 eight-bit-graphic. */
1508 update_charset_table (make_number (CHARSET_ASCII), 1626 update_charset_table (make_number (CHARSET_ASCII),
1509 make_number (1), make_number (94), 1627 make_number (1), make_number (94),
1510 make_number (1), 1628 make_number (1),
@@ -1517,6 +1635,32 @@ syms_of_charset ()
1517 CHARSET_SYMBOL (CHARSET_ASCII) = Qascii; 1635 CHARSET_SYMBOL (CHARSET_ASCII) = Qascii;
1518 Fput (Qascii, Qcharset, CHARSET_TABLE_ENTRY (CHARSET_ASCII)); 1636 Fput (Qascii, Qcharset, CHARSET_TABLE_ENTRY (CHARSET_ASCII));
1519 1637
1638 update_charset_table (make_number (CHARSET_8_BIT_CONTROL),
1639 make_number (1), make_number (96),
1640 make_number (4),
1641 make_number (0),
1642 make_number (-1),
1643 make_number (-1),
1644 build_string ("8-bit control code (0x80..0x9F)"),
1645 build_string ("8-bit control code (0x80..0x9F)"),
1646 build_string ("8-bit control code (0x80..0x9F)"));
1647 CHARSET_SYMBOL (CHARSET_8_BIT_CONTROL) = Qeight_bit_control;
1648 Fput (Qeight_bit_control, Qcharset,
1649 CHARSET_TABLE_ENTRY (CHARSET_8_BIT_CONTROL));
1650
1651 update_charset_table (make_number (CHARSET_8_BIT_GRAPHIC),
1652 make_number (1), make_number (96),
1653 make_number (4),
1654 make_number (0),
1655 make_number (-1),
1656 make_number (-1),
1657 build_string ("8-bit graphic char"),
1658 build_string ("8-bit graphic char (0xA0..0xFF)"),
1659 build_string ("8-bit graphic char (0xA0..0xFF)"));
1660 CHARSET_SYMBOL (CHARSET_8_BIT_GRAPHIC) = Qeight_bit_graphic;
1661 Fput (Qeight_bit_graphic, Qcharset,
1662 CHARSET_TABLE_ENTRY (CHARSET_8_BIT_GRAPHIC));
1663
1520 Qauto_fill_chars = intern ("auto-fill-chars"); 1664 Qauto_fill_chars = intern ("auto-fill-chars");
1521 staticpro (&Qauto_fill_chars); 1665 staticpro (&Qauto_fill_chars);
1522 Fput (Qauto_fill_chars, Qchar_table_extra_slots, make_number (0)); 1666 Fput (Qauto_fill_chars, Qchar_table_extra_slots, make_number (0));
@@ -1545,7 +1689,8 @@ syms_of_charset ()
1545 1689
1546 DEFVAR_LISP ("charset-list", &Vcharset_list, 1690 DEFVAR_LISP ("charset-list", &Vcharset_list,
1547 "List of charsets ever defined."); 1691 "List of charsets ever defined.");
1548 Vcharset_list = Fcons (Qascii, Qnil); 1692 Vcharset_list = Fcons (Qascii, Fcons (Qeight_bit_control,
1693 Fcons (Qeight_bit_graphic, Qnil)));
1549 1694
1550 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector, 1695 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
1551 "Vector of cons cell of a symbol and translation table ever defined.\n\ 1696 "Vector of cons cell of a symbol and translation table ever defined.\n\