aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/charset.c2788
-rw-r--r--src/charset.h1179
2 files changed, 1700 insertions, 2267 deletions
diff --git a/src/charset.c b/src/charset.c
index ff177a6cb29..72a30b410d4 100644
--- a/src/charset.c
+++ b/src/charset.c
@@ -1,7 +1,10 @@
1/* Basic multilingual character support. 1/* Basic character set support.
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN. 2 Copyright (C) 1995, 97, 98, 2000, 2001 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation. 3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001 Free Software Foundation, Inc. 4 Copyright (C) 2001 Free Software Foundation, Inc.
5 Copyright (C) 2001, 2002
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
5 8
6This file is part of GNU Emacs. 9This file is part of GNU Emacs.
7 10
@@ -20,24 +23,23 @@ along with GNU Emacs; see the file COPYING. If not, write to
20the Free Software Foundation, Inc., 59 Temple Place - Suite 330, 23the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21Boston, MA 02111-1307, USA. */ 24Boston, MA 02111-1307, USA. */
22 25
23/* At first, see the document in `charset.h' to understand the code in
24 this file. */
25
26#ifdef emacs 26#ifdef emacs
27#include <config.h> 27#include <config.h>
28#endif 28#endif
29 29
30#include <stdio.h> 30#include <stdio.h>
31#include <unistd.h>
32#include <ctype.h>
31 33
32#ifdef emacs 34#ifdef emacs
33 35
34#include <sys/types.h> 36#include <sys/types.h>
35#include "lisp.h" 37#include "lisp.h"
36#include "buffer.h" 38#include "character.h"
37#include "charset.h" 39#include "charset.h"
38#include "composite.h"
39#include "coding.h" 40#include "coding.h"
40#include "disptab.h" 41#include "disptab.h"
42#include "buffer.h"
41 43
42#else /* not emacs */ 44#else /* not emacs */
43 45
@@ -45,694 +47,873 @@ Boston, MA 02111-1307, USA. */
45 47
46#endif /* emacs */ 48#endif /* emacs */
47 49
48Lisp_Object Qcharset, Qascii, Qeight_bit_control, Qeight_bit_graphic;
49Lisp_Object Qunknown;
50 50
51/* Declaration of special leading-codes. */ 51/*** GENERAL NOTE on CODED CHARACTER SET (CHARSET) ***
52int leading_code_private_11; /* for private DIMENSION1 of 1-column */
53int leading_code_private_12; /* for private DIMENSION1 of 2-column */
54int leading_code_private_21; /* for private DIMENSION2 of 1-column */
55int leading_code_private_22; /* for private DIMENSION2 of 2-column */
56 52
57/* Declaration of special charsets. The values are set by 53 A coded character set ("charset" hereafter) is a meaningful
58 Fsetup_special_charsets. */ 54 collection (i.e. language, culture, functionality, etc) of
59int charset_latin_iso8859_1; /* ISO8859-1 (Latin-1) */ 55 characters. Emacs handles multiple charsets at once. In Emacs Lisp
60int charset_jisx0208_1978; /* JISX0208.1978 (Japanese Kanji old set) */ 56 code, a charset is represented by symbol. In C code, a charset is
61int charset_jisx0208; /* JISX0208.1983 (Japanese Kanji) */ 57 represented by its ID number or by a pointer to the struct charset.
62int charset_katakana_jisx0201; /* JISX0201.Kana (Japanese Katakana) */
63int charset_latin_jisx0201; /* JISX0201.Roman (Japanese Roman) */
64int charset_big5_1; /* Big5 Level 1 (Chinese Traditional) */
65int charset_big5_2; /* Big5 Level 2 (Chinese Traditional) */
66 58
67Lisp_Object Qcharset_table; 59 The actual information about each charset is stored in two places.
60 Lispy information is stored in the hash table Vcharset_hash_table as
61 a vector (charset attributes). The other information is stored in
62 charset_table as struct charset.
68 63
69/* A char-table containing information of each character set. */ 64*/
70Lisp_Object Vcharset_table;
71 65
72/* A vector of charset symbol indexed by charset-id. This is used 66/* List of all charsets. This variable is used only from Emacs
73 only for returning charset symbol from C functions. */ 67 Lisp. */
74Lisp_Object Vcharset_symbol_table;
75
76/* A list of charset symbols ever defined. */
77Lisp_Object Vcharset_list; 68Lisp_Object Vcharset_list;
78 69
79/* Vector of translation table ever defined. 70/* Hash table that contains attributes of each charset. Keys are
80 ID of a translation table is used to index this vector. */ 71 charset symbols, and values are vectors of charset attributes. */
81Lisp_Object Vtranslation_table_vector; 72Lisp_Object Vcharset_hash_table;
73
74/* Table of struct charset. */
75struct charset *charset_table;
76
77static int charset_table_size;
78int charset_table_used;
79
80Lisp_Object Qcharsetp;
81
82/* Special charset symbols. */
83Lisp_Object Qascii;
84Lisp_Object Qeight_bit_control;
85Lisp_Object Qeight_bit_graphic;
86Lisp_Object Qiso_8859_1;
87Lisp_Object Qunicode;
88
89/* The corresponding charsets. */
90int charset_ascii;
91int charset_8_bit_control;
92int charset_8_bit_graphic;
93int charset_iso_8859_1;
94int charset_unicode;
82 95
83/* A char-table for characters which may invoke auto-filling. */ 96/* Value of charset attribute `charset-iso-plane'. */
84Lisp_Object Vauto_fill_chars; 97Lisp_Object Qgl, Qgr;
85 98
86Lisp_Object Qauto_fill_chars; 99/* The primary charset. It is a charset of unibyte characters. */
100int charset_primary;
87 101
88/* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD. */ 102/* List of charsets ordered by the priority. */
89int bytes_by_char_head[256]; 103Lisp_Object Vcharset_ordered_list;
90int width_by_char_head[256]; 104
105/* List of iso-2022 charsets. */
106Lisp_Object Viso_2022_charset_list;
107
108/* List of emacs-mule charsets. */
109Lisp_Object Vemacs_mule_charset_list;
110
111struct charset *emacs_mule_charset[256];
91 112
92/* Mapping table from ISO2022's charset (specified by DIMENSION, 113/* Mapping table from ISO2022's charset (specified by DIMENSION,
93 CHARS, and FINAL-CHAR) to Emacs' charset. */ 114 CHARS, and FINAL-CHAR) to Emacs' charset. */
94int iso_charset_table[2][2][128]; 115int iso_charset_table[ISO_MAX_DIMENSION][ISO_MAX_CHARS][ISO_MAX_FINAL];
116
117Lisp_Object Vcharset_map_directory;
118
119Lisp_Object Vchar_unified_charset_table;
120
121#define CODE_POINT_TO_INDEX(charset, code) \
122 ((charset)->code_linear_p \
123 ? (code) - (charset)->min_code \
124 : ((((code) >> 24) <= (charset)->code_space[13]) \
125 && ((((code) >> 16) & 0xFF) <= (charset)->code_space[9]) \
126 && ((((code) >> 8) & 0xFF) <= (charset)->code_space[5]) \
127 && (((code) & 0xFF) <= (charset)->code_space[1])) \
128 ? (((((code) >> 24) - (charset)->code_space[12]) \
129 * (charset)->code_space[11]) \
130 + (((((code) >> 16) & 0xFF) - (charset)->code_space[8]) \
131 * (charset)->code_space[7]) \
132 + (((((code) >> 8) & 0xFF) - (charset)->code_space[4]) \
133 * (charset)->code_space[3]) \
134 + (((code) & 0xFF) - (charset)->code_space[0])) \
135 : -1)
136
137
138/* Convert the character index IDX to code-point CODE for CHARSET.
139 It is assumed that IDX is in a valid range. */
140
141#define INDEX_TO_CODE_POINT(charset, idx) \
142 ((charset)->code_linear_p \
143 ? (idx) + (charset)->min_code \
144 : (((charset)->code_space[0] + (idx) % (charset)->code_space[2]) \
145 | (((charset)->code_space[4] \
146 + ((idx) / (charset)->code_space[3] % (charset)->code_space[6])) \
147 << 8) \
148 | (((charset)->code_space[8] \
149 + ((idx) / (charset)->code_space[7] % (charset)->code_space[10])) \
150 << 16) \
151 | (((charset)->code_space[12] + ((idx) / (charset)->code_space[11])) \
152 << 24)))
95 153
96/* Variables used locally in the macro FETCH_MULTIBYTE_CHAR. */ 154
97unsigned char *_fetch_multibyte_char_p;
98int _fetch_multibyte_char_len;
99 155
100/* Offset to add to a non-ASCII value when inserting it. */ 156/* Set to 1 when a charset map is loaded to warn that a buffer text
101int nonascii_insert_offset; 157 and a string data may be relocated. */
158int charset_map_loaded;
102 159
103/* Translation table for converting non-ASCII unibyte characters 160/* Parse the mapping vector MAP which has this form:
104 to multibyte codes, or nil. */ 161 [CODE0 CHAR0 CODE1 CHAR1 ... ]
105Lisp_Object Vnonascii_translation_table;
106 162
107/* List of all possible generic characters. */ 163 If CONTROL_FLAG is 0, setup CHARSET->min_char and CHARSET->max_char.
108Lisp_Object Vgeneric_character_list;
109 164
110 165 If CONTROL_FLAG is 1, setup CHARSET->min_char, CHARSET->max_char,
111void 166 CHARSET->decoder, and CHARSET->encoder.
112invalid_character (c)
113 int c;
114{
115 error ("Invalid character: 0%o, %d, 0x%x", c, c, c);
116}
117 167
118/* Parse string STR of length LENGTH and fetch information of a 168 If CONTROL_FLAG is 2, setup CHARSET->deunifier and
119 character at STR. Set BYTES to the byte length the character 169 Vchar_unify_table. If Vchar_unified_charset_table is non-nil,
120 occupies, CHARSET, C1, C2 to proper values of the character. */ 170 setup it too. */
121
122#define SPLIT_MULTIBYTE_SEQ(str, length, bytes, charset, c1, c2) \
123 do { \
124 (c1) = *(str); \
125 (bytes) = BYTES_BY_CHAR_HEAD (c1); \
126 if ((bytes) == 1) \
127 (charset) = ASCII_BYTE_P (c1) ? CHARSET_ASCII : CHARSET_8_BIT_GRAPHIC; \
128 else if ((bytes) == 2) \
129 { \
130 if ((c1) == LEADING_CODE_8_BIT_CONTROL) \
131 (charset) = CHARSET_8_BIT_CONTROL, (c1) = (str)[1] - 0x20; \
132 else \
133 (charset) = (c1), (c1) = (str)[1] & 0x7F; \
134 } \
135 else if ((bytes) == 3) \
136 { \
137 if ((c1) < LEADING_CODE_PRIVATE_11) \
138 (charset) = (c1), (c1) = (str)[1] & 0x7F, (c2) = (str)[2] & 0x7F; \
139 else \
140 (charset) = (str)[1], (c1) = (str)[2] & 0x7F; \
141 } \
142 else \
143 (charset) = (str)[1], (c1) = (str)[2] & 0x7F, (c2) = (str)[3] & 0x7F; \
144 } while (0)
145
146/* 1 if CHARSET, C1, and C2 compose a valid character, else 0. */
147#define CHAR_COMPONENTS_VALID_P(charset, c1, c2) \
148 ((charset) == CHARSET_ASCII \
149 ? ((c1) >= 0 && (c1) <= 0x7F) \
150 : ((charset) == CHARSET_8_BIT_CONTROL \
151 ? ((c1) >= 0x80 && (c1) <= 0x9F) \
152 : ((charset) == CHARSET_8_BIT_GRAPHIC \
153 ? ((c1) >= 0x80 && (c1) <= 0xFF) \
154 : (CHARSET_DIMENSION (charset) == 1 \
155 ? ((c1) >= 0x20 && (c1) <= 0x7F) \
156 : ((c1) >= 0x20 && (c1) <= 0x7F \
157 && (c2) >= 0x20 && (c2) <= 0x7F)))))
158
159/* Store multi-byte form of the character C in STR. The caller should
160 allocate at least 4-byte area at STR in advance. Returns the
161 length of the multi-byte form. If C is an invalid character code,
162 return -1. */
163 171
164int 172static void
165char_to_string_1 (c, str) 173parse_charset_map (charset, map, control_flag)
166 int c; 174 struct charset *charset;
167 unsigned char *str; 175 Lisp_Object map;
176 int control_flag;
168{ 177{
169 unsigned char *p = str; 178 Lisp_Object vec, table;
179 unsigned min_code = CHARSET_MIN_CODE (charset);
180 unsigned max_code = CHARSET_MAX_CODE (charset);
181 int ascii_compatible_p = charset->ascii_compatible_p;
182 int min_char, max_char, nonascii_min_char;
183 int size;
184 int i;
185 int first;
186 unsigned char *fast_map = charset->fast_map;
170 187
171 if (c & CHAR_MODIFIER_MASK) /* This includes the case C is negative. */ 188 if (control_flag)
172 { 189 {
173 /* Multibyte character can't have a modifier bit. */ 190 int n = CODE_POINT_TO_INDEX (charset, max_code) + 1;
174 if (! SINGLE_BYTE_CHAR_P ((c & ~CHAR_MODIFIER_MASK))) 191 unsigned invalid_code = CHARSET_INVALID_CODE (charset);
175 return -1;
176 192
177 /* For Meta, Shift, and Control modifiers, we need special care. */ 193 table = Fmake_char_table (Qnil, make_number (invalid_code));
178 if (c & CHAR_META) 194 if (control_flag == 1)
179 { 195 vec = Fmake_vector (make_number (n), make_number (-1));
180 /* Move the meta bit to the right place for a string. */ 196 else if (! CHAR_TABLE_P (Vchar_unify_table))
181 c = (c & ~CHAR_META) | 0x80; 197 Vchar_unify_table = Fmake_char_table (Qnil, make_number (-1));
182 }
183 if (c & CHAR_SHIFT)
184 {
185 /* Shift modifier is valid only with [A-Za-z]. */
186 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
187 c &= ~CHAR_SHIFT;
188 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
189 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
190 }
191 if (c & CHAR_CTL)
192 {
193 /* Simulate the code in lread.c. */
194 /* Allow `\C- ' and `\C-?'. */
195 if (c == (CHAR_CTL | ' '))
196 c = 0;
197 else if (c == (CHAR_CTL | '?'))
198 c = 127;
199 /* ASCII control chars are made from letters (both cases),
200 as well as the non-letters within 0100...0137. */
201 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
202 c &= (037 | (~0177 & ~CHAR_CTL));
203 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
204 c &= (037 | (~0177 & ~CHAR_CTL));
205 }
206 198
207 /* If C still has any modifier bits, just ignore it. */ 199 charset_map_loaded = 1;
208 c &= ~CHAR_MODIFIER_MASK;
209 } 200 }
210 201
211 if (SINGLE_BYTE_CHAR_P (c)) 202 size = ASIZE (map);
203 nonascii_min_char = MAX_CHAR;
204 CHARSET_COMPACT_CODES_P (charset) = 1;
205 for (first = 1, i = 0; i < size; i += 2)
212 { 206 {
213 if (ASCII_BYTE_P (c) || c >= 0xA0) 207 Lisp_Object val;
214 *p++ = c; 208 unsigned code, temp;
215 else 209 int c, char_index;
210
211 val = AREF (map, i);
212 CHECK_NATNUM (val);
213 code = XFASTINT (val);
214 val = AREF (map, i + 1);
215 CHECK_NATNUM (val);
216 c = XFASTINT (val);
217
218 if (code < min_code || code > max_code)
219 continue;
220 char_index = CODE_POINT_TO_INDEX (charset, code);
221 if (char_index < 0
222 || c > MAX_CHAR)
223 continue;
224
225 if (control_flag < 2)
216 { 226 {
217 *p++ = LEADING_CODE_8_BIT_CONTROL; 227 if (first)
218 *p++ = c + 0x20; 228 {
229 min_char = max_char = c;
230 first = 0;
231 }
232 else if (c > max_char)
233 max_char = c;
234 else if (c < min_char)
235 min_char = c;
236 if (ascii_compatible_p && ! ASCII_BYTE_P (c)
237 && c < nonascii_min_char)
238 nonascii_min_char = c;
239
240 CHARSET_FAST_MAP_SET (c, fast_map);
241 }
242
243 if (control_flag)
244 {
245 if (control_flag == 1)
246 {
247 if (char_index >= ASIZE (vec))
248 abort ();
249 ASET (vec, char_index, make_number (c));
250 if (code > 0x7FFFFFF)
251 {
252 CHAR_TABLE_SET (table, c,
253 Fcons (make_number (code >> 16),
254 make_number (code & 0xFFFF)));
255 CHARSET_COMPACT_CODES_P (charset) = 0;
256 }
257 else
258 CHAR_TABLE_SET (table, c, make_number (code));
259 }
260 else
261 {
262 int c1 = DECODE_CHAR (charset, code);
263 if (c1 >= 0)
264 {
265 CHAR_TABLE_SET (table, c, make_number (c1));
266 CHAR_TABLE_SET (Vchar_unify_table, c1, c);
267 if (CHAR_TABLE_P (Vchar_unified_charset_table))
268 CHAR_TABLE_SET (Vchar_unified_charset_table, c1,
269 CHARSET_NAME (charset));
270 }
271 }
219 } 272 }
220 } 273 }
221 else if (CHAR_VALID_P (c, 0)) 274
275 if (control_flag < 2)
222 { 276 {
223 int charset, c1, c2; 277 CHARSET_MIN_CHAR (charset) = (ascii_compatible_p
224 278 ? nonascii_min_char : min_char);
225 SPLIT_CHAR (c, charset, c1, c2); 279 CHARSET_MAX_CHAR (charset) = max_char;
226 280 if (control_flag)
227 if (charset >= LEADING_CODE_EXT_11)
228 *p++ = (charset < LEADING_CODE_EXT_12
229 ? LEADING_CODE_PRIVATE_11
230 : (charset < LEADING_CODE_EXT_21
231 ? LEADING_CODE_PRIVATE_12
232 : (charset < LEADING_CODE_EXT_22
233 ? LEADING_CODE_PRIVATE_21
234 : LEADING_CODE_PRIVATE_22)));
235 *p++ = charset;
236 if ((c1 > 0 && c1 < 32) || (c2 > 0 && c2 < 32))
237 return -1;
238 if (c1)
239 { 281 {
240 *p++ = c1 | 0x80; 282 CHARSET_DECODER (charset) = vec;
241 if (c2 > 0) 283 CHARSET_ENCODER (charset) = table;
242 *p++ = c2 | 0x80;
243 } 284 }
244 } 285 }
245 else 286 else
246 return -1; 287 CHARSET_DEUNIFIER (charset) = table;
247
248 return (p - str);
249} 288}
250 289
251 290
252/* Store multi-byte form of the character C in STR. The caller should 291/* Read a hexadecimal number (preceded by "0x") from the file FP while
253 allocate at least 4-byte area at STR in advance. Returns the 292 paying attention to comment character '#'. */
254 length of the multi-byte form. If C is an invalid character code,
255 signal an error.
256
257 Use macro `CHAR_STRING (C, STR)' instead of calling this function
258 directly if C can be an ASCII character. */
259 293
260int 294static INLINE unsigned
261char_to_string (c, str) 295read_hex (fp, eof)
262 int c; 296 FILE *fp;
263 unsigned char *str; 297 int *eof;
264{ 298{
265 int len; 299 int c;
266 len = char_to_string_1 (c, str); 300 unsigned n;
267 if (len == -1)
268 invalid_character (c);
269 return len;
270}
271 301
302 while ((c = getc (fp)) != EOF)
303 {
304 if (c == '#' || c == ' ')
305 {
306 while ((c = getc (fp)) != EOF && c != '\n');
307 }
308 else if (c == '0')
309 {
310 if ((c = getc (fp)) == EOF || c == 'x')
311 break;
312 }
313 }
314 if (c == EOF)
315 {
316 *eof = 1;
317 return 0;
318 }
319 *eof = 0;
320 n = 0;
321 if (c == 'x')
322 while ((c = getc (fp)) != EOF && isxdigit (c))
323 n = ((n << 4)
324 | (c <= '9' ? c - '0' : c <= 'F' ? c - 'A' + 10 : c - 'a' + 10));
325 else
326 while ((c = getc (fp)) != EOF && isdigit (c))
327 n = (n * 10) + c - '0';
328 return n;
329}
272 330
273/* Return the non-ASCII character corresponding to multi-byte form at
274 STR of length LEN. If ACTUAL_LEN is not NULL, store the byte
275 length of the multibyte form in *ACTUAL_LEN.
276 331
277 Use macros STRING_CHAR or STRING_CHAR_AND_LENGTH instead of calling 332/* Return a mapping vector for CHARSET loaded from MAPFILE.
278 this function directly if you want to handle ASCII characters as 333 Each line of MAPFILE has this form:
279 well. */ 334 0xAAAA 0xBBBB
335 where 0xAAAA is a code-point and 0xBBBB is the corresponding
336 character code.
337 The returned vector has this form:
338 [ CODE1 CHAR1 CODE2 CHAR2 .... ]
339*/
280 340
281int 341static Lisp_Object
282string_to_char (str, len, actual_len) 342load_charset_map (charset, mapfile)
283 const unsigned char *str; 343 struct charset *charset;
284 int len, *actual_len; 344 Lisp_Object mapfile;
285{ 345{
286 int c, bytes, charset, c1, c2; 346 int fd;
347 FILE *fp;
348 int num;
349 unsigned *numbers_table[256];
350 int numbers_table_used;
351 unsigned *numbers;
352 int eof;
353 Lisp_Object suffixes;
354 Lisp_Object vec;
355 int i;
287 356
288 SPLIT_MULTIBYTE_SEQ (str, len, bytes, charset, c1, c2); 357 suffixes = Fcons (build_string (".map"),
289 c = MAKE_CHAR (charset, c1, c2); 358 Fcons (build_string (".TXT"), Qnil));
290 if (actual_len)
291 *actual_len = bytes;
292 return c;
293}
294 359
295/* Return the length of the multi-byte form at string STR of length LEN. 360 fd = openp (Fcons (Vcharset_map_directory, Qnil), mapfile, suffixes,
296 Use the macro MULTIBYTE_FORM_LENGTH instead. */ 361 NULL, 0);
297int 362 if (fd < 0
298multibyte_form_length (str, len) 363 || ! (fp = fdopen (fd, "r")))
299 const unsigned char *str; 364 {
300 int len; 365 add_to_log ("Failure in loading charset map: %S", mapfile, Qnil);
301{ 366 return Qnil;
302 int bytes; 367 }
303 368
304 PARSE_MULTIBYTE_SEQ (str, len, bytes); 369 numbers_table_used = 0;
305 return bytes; 370 num = 0;
306} 371 eof = 0;
372 while (1)
373 {
374 unsigned n = read_hex (fp, &eof);
307 375
308/* Check multibyte form at string STR of length LEN and set variables 376 if (eof)
309 pointed by CHARSET, C1, and C2 to charset and position codes of the 377 break;
310 character at STR, and return 0. If there's no multibyte character, 378 if ((num % 0x10000) == 0)
311 return -1. This should be used only in the macro SPLIT_STRING 379 {
312 which checks range of STR in advance. */ 380 if (numbers_table_used == 256)
381 break;
382 numbers = (unsigned *) alloca (sizeof (unsigned) * 0x10000);
383 numbers_table[numbers_table_used++] = numbers;
384 }
385 *numbers++ = n;
386 num++;
387 }
388 fclose (fp);
389 close (fd);
313 390
314int 391 vec = Fmake_vector (make_number (num), Qnil);
315split_string (str, len, charset, c1, c2) 392 for (i = 0; i < num; i++, numbers++)
316 const unsigned char *str; 393 {
317 unsigned char *c1, *c2; 394 if ((i % 0x10000) == 0)
318 int len, *charset; 395 numbers = numbers_table[i / 0x10000];
319{ 396 ASET (vec, i, make_number (*numbers));
320 register int bytes, cs, code1, code2 = -1; 397 }
321 398
322 SPLIT_MULTIBYTE_SEQ (str, len, bytes, cs, code1, code2); 399 charset_map_loaded = 1;
323 if (cs == CHARSET_ASCII) 400
324 return -1; 401 return vec;
325 *charset = cs;
326 *c1 = code1;
327 *c2 = code2;
328 return 0;
329} 402}
330 403
331/* Return 1 iff character C has valid printable glyph. 404static void
332 Use the macro CHAR_PRINTABLE_P instead. */ 405load_charset (charset)
333int 406 struct charset *charset;
334char_printable_p (c)
335 int c;
336{ 407{
337 int charset, c1, c2; 408 if (CHARSET_METHOD (charset) == CHARSET_METHOD_MAP_DEFERRED)
409 {
410 Lisp_Object map;
338 411
339 if (ASCII_BYTE_P (c)) 412 map = CHARSET_MAP (charset);
340 return 1; 413 if (STRINGP (map))
341 else if (SINGLE_BYTE_CHAR_P (c)) 414 map = load_charset_map (charset, map);
342 return 0; 415 parse_charset_map (charset, map, 1);
343 else if (c >= MAX_CHAR) 416 CHARSET_METHOD (charset) = CHARSET_METHOD_MAP;
344 return 0; 417 }
345
346 SPLIT_CHAR (c, charset, c1, c2);
347 if (! CHARSET_DEFINED_P (charset))
348 return 0;
349 if (CHARSET_CHARS (charset) == 94
350 ? c1 <= 32 || c1 >= 127
351 : c1 < 32)
352 return 0;
353 if (CHARSET_DIMENSION (charset) == 2
354 && (CHARSET_CHARS (charset) == 94
355 ? c2 <= 32 || c2 >= 127
356 : c2 < 32))
357 return 0;
358 return 1;
359} 418}
360 419
361/* Translate character C by translation table TABLE. If C 420
362 is negative, translate a character specified by CHARSET, C1, and C2 421DEFUN ("charsetp", Fcharsetp, Scharsetp, 1, 1, 0,
363 (C1 and C2 are code points of the character). If no translation is 422 doc: /* Return non-nil if and only if OBJECT is a charset.*/)
364 found in TABLE, return C. */ 423 (object)
365int 424 Lisp_Object object;
366translate_char (table, c, charset, c1, c2)
367 Lisp_Object table;
368 int c, charset, c1, c2;
369{ 425{
370 Lisp_Object ch; 426 return (CHARSETP (object) ? Qt : Qnil);
371 int alt_charset, alt_c1, alt_c2, dimension;
372
373 if (c < 0) c = MAKE_CHAR (charset, (c1 & 0x7F) , (c2 & 0x7F));
374 if (!CHAR_TABLE_P (table)
375 || (ch = Faref (table, make_number (c)), !NATNUMP (ch)))
376 return c;
377
378 SPLIT_CHAR (XFASTINT (ch), alt_charset, alt_c1, alt_c2);
379 dimension = CHARSET_DIMENSION (alt_charset);
380 if ((dimension == 1 && alt_c1 > 0) || (dimension == 2 && alt_c2 > 0))
381 /* CH is not a generic character, just return it. */
382 return XFASTINT (ch);
383
384 /* Since CH is a generic character, we must return a specific
385 character which has the same position codes as C from CH. */
386 if (charset < 0)
387 SPLIT_CHAR (c, charset, c1, c2);
388 if (dimension != CHARSET_DIMENSION (charset))
389 /* We can't make such a character because of dimension mismatch. */
390 return c;
391 return MAKE_CHAR (alt_charset, c1, c2);
392} 427}
393 428
394/* Convert the unibyte character C to multibyte based on
395 Vnonascii_translation_table or nonascii_insert_offset. If they can't
396 convert C to a valid multibyte character, convert it based on
397 DEFAULT_NONASCII_INSERT_OFFSET which makes C a Latin-1 character. */
398 429
399int 430void
400unibyte_char_to_multibyte (c) 431map_charset_chars (c_function, function, charset_symbol, arg)
401 int c; 432 void (*c_function) (Lisp_Object, Lisp_Object, Lisp_Object);
433 Lisp_Object function, charset_symbol, arg;
402{ 434{
403 if (c < 0400 && c >= 0200) 435 int id;
404 { 436 struct charset *charset;
405 int c_save = c; 437 Lisp_Object range;
438
439 CHECK_CHARSET_GET_ID (charset_symbol, id);
440 charset = CHARSET_FROM_ID (id);
441
442 if (CHARSET_METHOD (charset) == CHARSET_METHOD_MAP_DEFERRED)
443 load_charset (charset);
406 444
407 if (! NILP (Vnonascii_translation_table)) 445 if (CHARSET_METHOD (charset) == CHARSET_METHOD_OFFSET)
446 {
447 range = Fcons (make_number (CHARSET_MIN_CHAR (charset)),
448 make_number (CHARSET_MAX_CHAR (charset)));
449 if (NILP (function))
450 (*c_function) (arg, range, Qnil);
451 else
452 call2 (function, range, arg);
453 }
454 else if (CHARSET_METHOD (charset) == CHARSET_METHOD_MAP)
455 {
456 if (! CHAR_TABLE_P (CHARSET_ENCODER (charset)))
457 return;
458 if (CHARSET_ASCII_COMPATIBLE_P (charset))
408 { 459 {
409 c = XINT (Faref (Vnonascii_translation_table, make_number (c))); 460 range = Fcons (make_number (0), make_number (127));
410 if (c >= 0400 && ! char_valid_p (c, 0)) 461 if (NILP (function))
411 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET; 462 (*c_function) (arg, range, Qnil);
463 else
464 call2 (function, range, arg);
412 } 465 }
413 else if (c >= 0240 && nonascii_insert_offset > 0) 466 map_char_table (c_function, function, CHARSET_ENCODER (charset), arg,
467 0, NULL);
468 }
469 else /* i.e. CHARSET_METHOD_PARENT */
470 {
471 int from, to, c;
472 unsigned code;
473 int i, j, k, l;
474 int *code_space = CHARSET_CODE_SPACE (charset);
475 Lisp_Object val;
476
477 range = Fcons (Qnil, Qnil);
478 from = to = -2;
479 for (i = code_space[12]; i <= code_space[13]; i++)
480 for (j = code_space[8]; j <= code_space[9]; j++)
481 for (k = code_space[4]; k <= code_space[5]; k++)
482 for (l = code_space[0]; l <= code_space[1]; l++)
483 {
484 code = (i << 24) | (j << 16) | (k << 8) | l;
485 c = DECODE_CHAR (charset, code);
486 if (c == to + 1)
487 {
488 to++;
489 continue;
490 }
491 if (from >= 0)
492 {
493 if (from < to)
494 {
495 XSETCAR (range, make_number (from));
496 XSETCDR (range, make_number (to));
497 val = range;
498 }
499 else
500 val = make_number (from);
501 if (NILP (function))
502 (*c_function) (arg, val, Qnil);
503 else
504 call2 (function, val, arg);
505 }
506 from = to = (c < 0 ? -2 : c);
507 }
508 if (from >= 0)
414 { 509 {
415 c += nonascii_insert_offset; 510 if (from < to)
416 if (c < 0400 || ! char_valid_p (c, 0)) 511 {
417 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET; 512 XSETCAR (range, make_number (from));
513 XSETCDR (range, make_number (to));
514 val = range;
515 }
516 else
517 val = make_number (from);
518 if (NILP (function))
519 (*c_function) (arg, val, Qnil);
520 else
521 call2 (function, val, arg);
418 } 522 }
419 else if (c >= 0240)
420 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
421 } 523 }
422 return c; 524}
525
526DEFUN ("map-charset-chars", Fmap_charset_chars, Smap_charset_chars, 2, 3, 0,
527 doc: /* Call FUNCTION for each character in CHARSET.
528FUNCTION is called with three arguments; FROM, TO, and the 3rd optional
529argument ARG.
530FROM and TO indicate a range of characters that are contained
531in CHARSET. */)
532 (function, charset, arg)
533 Lisp_Object function, charset, arg;
534{
535 map_charset_chars (NULL, function, charset, arg);
536 return Qnil;
423} 537}
424 538
425 539
426/* Convert the multibyte character C to unibyte 8-bit character based 540/* Define a charset according to the arguments. The Nth argument is
427 on Vnonascii_translation_table or nonascii_insert_offset. If 541 the Nth attribute of the charset (the last attribute `charset-id'
428 REV_TBL is non-nil, it should be a reverse table of 542 is not included). See the docstring of `define-charset' for the
429 Vnonascii_translation_table, i.e. what given by: 543 detail. */
430 Fchar_table_extra_slot (Vnonascii_translation_table, make_number (0)) */
431 544
432int 545DEFUN ("define-charset-internal", Fdefine_charset_internal,
433multibyte_char_to_unibyte (c, rev_tbl) 546 Sdefine_charset_internal, charset_arg_max, MANY, 0,
434 int c; 547 doc: /* For internal use only. */)
435 Lisp_Object rev_tbl; 548 (nargs, args)
549 int nargs;
550 Lisp_Object *args;
436{ 551{
437 if (!SINGLE_BYTE_CHAR_P (c)) 552 /* Charset attr vector. */
553 Lisp_Object attrs;
554 Lisp_Object val;
555 unsigned hash_code;
556 struct Lisp_Hash_Table *hash_table = XHASH_TABLE (Vcharset_hash_table);
557 int i;
558 struct charset charset;
559 int id;
560 int dimension;
561 int new_definition_p;
562 int nchars;
563
564 if (nargs != charset_arg_max)
565 return Fsignal (Qwrong_number_of_arguments,
566 Fcons (intern ("define-charset-internal"),
567 make_number (nargs)));
568
569 attrs = Fmake_vector (make_number (charset_attr_max), Qnil);
570
571 CHECK_SYMBOL (args[charset_arg_name]);
572 ASET (attrs, charset_name, args[charset_arg_name]);
573
574 val = args[charset_arg_code_space];
575 for (i = 0, dimension = 0, nchars = 1; i < 4; i++)
438 { 576 {
439 int c_save = c; 577 int min_byte, max_byte;
578
579 min_byte = XINT (Faref (val, make_number (i * 2)));
580 max_byte = XINT (Faref (val, make_number (i * 2 + 1)));
581 if (min_byte < 0 || min_byte > max_byte || max_byte >= 256)
582 error ("Invalid :code-space value");
583 charset.code_space[i * 4] = min_byte;
584 charset.code_space[i * 4 + 1] = max_byte;
585 charset.code_space[i * 4 + 2] = max_byte - min_byte + 1;
586 nchars *= charset.code_space[i * 4 + 2];
587 charset.code_space[i * 4 + 3] = nchars;
588 if (max_byte > 0)
589 dimension = i + 1;
590 }
440 591
441 if (! CHAR_TABLE_P (rev_tbl) 592 val = args[charset_arg_dimension];
442 && CHAR_TABLE_P (Vnonascii_translation_table)) 593 if (NILP (val))
443 rev_tbl = Fchar_table_extra_slot (Vnonascii_translation_table, 594 charset.dimension = dimension;
444 make_number (0)); 595 else
445 if (CHAR_TABLE_P (rev_tbl)) 596 {
446 { 597 CHECK_NATNUM (val);
447 Lisp_Object temp; 598 charset.dimension = XINT (val);
448 temp = Faref (rev_tbl, make_number (c)); 599 if (charset.dimension < 1 || charset.dimension > 4)
449 if (INTEGERP (temp)) 600 args_out_of_range_3 (val, make_number (1), make_number (4));
450 c = XINT (temp); 601 }
451 if (c >= 256) 602
452 c = (c_save & 0177) + 0200; 603 charset.code_linear_p
453 } 604 = (charset.dimension == 1
605 || (charset.code_space[2] == 256
606 && (charset.dimension == 2
607 || (charset.code_space[6] == 256
608 && (charset.dimension == 3
609 || charset.code_space[10] == 256)))));
610
611 charset.iso_chars_96 = charset.code_space[2] == 96;
612
613 charset.min_code = (charset.code_space[0]
614 | (charset.code_space[4] << 8)
615 | (charset.code_space[8] << 16)
616 | (charset.code_space[12] << 24));
617 charset.max_code = (charset.code_space[1]
618 | (charset.code_space[5] << 8)
619 | (charset.code_space[9] << 16)
620 | (charset.code_space[13] << 24));
621
622 val = args[charset_arg_invalid_code];
623 if (NILP (val))
624 {
625 if (charset.min_code > 0)
626 charset.invalid_code = 0;
454 else 627 else
455 { 628 {
456 if (nonascii_insert_offset > 0) 629 XSETINT (val, charset.max_code + 1);
457 c -= nonascii_insert_offset; 630 if (XINT (val) == charset.max_code + 1)
458 if (c < 128 || c >= 256) 631 charset.invalid_code = charset.max_code + 1;
459 c = (c_save & 0177) + 0200; 632 else
633 error ("Attribute :invalid-code must be specified");
460 } 634 }
461 } 635 }
636 else
637 {
638 CHECK_NATNUM (val);
639 charset.invalid_code = XFASTINT (val);
640 }
462 641
463 return c; 642 val = args[charset_arg_iso_final];
464} 643 if (NILP (val))
465 644 charset.iso_final = -1;
466 645 else
467/* Update the table Vcharset_table with the given arguments (see the 646 {
468 document of `define-charset' for the meaning of each argument). 647 CHECK_NUMBER (val);
469 Several other table contents are also updated. The caller should 648 if (XINT (val) < '0' || XINT (val) > 127)
470 check the validity of CHARSET-ID and the remaining arguments in 649 error ("Invalid iso-final-char: %d", XINT (val));
471 advance. */ 650 charset.iso_final = XINT (val);
472 651 }
473void 652
474update_charset_table (charset_id, dimension, chars, width, direction, 653 val = args[charset_arg_iso_revision];
475 iso_final_char, iso_graphic_plane, 654 if (NILP (val))
476 short_name, long_name, description) 655 charset.iso_revision = -1;
477 Lisp_Object charset_id, dimension, chars, width, direction; 656 else
478 Lisp_Object iso_final_char, iso_graphic_plane;
479 Lisp_Object short_name, long_name, description;
480{
481 int charset = XINT (charset_id);
482 int bytes;
483 unsigned char leading_code_base, leading_code_ext;
484
485 if (NILP (CHARSET_TABLE_ENTRY (charset)))
486 CHARSET_TABLE_ENTRY (charset)
487 = Fmake_vector (make_number (CHARSET_MAX_IDX), Qnil);
488
489 if (NILP (long_name))
490 long_name = short_name;
491 if (NILP (description))
492 description = long_name;
493
494 /* Get byte length of multibyte form, base leading-code, and
495 extended leading-code of the charset. See the comment under the
496 title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h. */
497 bytes = XINT (dimension);
498 if (charset < MIN_CHARSET_PRIVATE_DIMENSION1)
499 { 657 {
500 /* Official charset, it doesn't have an extended leading-code. */ 658 CHECK_NUMBER (val);
501 if (charset != CHARSET_ASCII && charset != CHARSET_8_BIT_GRAPHIC) 659 if (XINT (val) > 63)
502 bytes += 1; /* For a base leading-code. */ 660 args_out_of_range (make_number (63), val);
503 leading_code_base = charset; 661 charset.iso_revision = XINT (val);
504 leading_code_ext = 0;
505 } 662 }
663
664 val = args[charset_arg_emacs_mule_id];
665 if (NILP (val))
666 charset.emacs_mule_id = -1;
506 else 667 else
507 { 668 {
508 /* Private charset. */ 669 CHECK_NATNUM (val);
509 bytes += 2; /* For base and extended leading-codes. */ 670 if ((XINT (val) > 0 && XINT (val) <= 128) || XINT (val) >= 256)
510 leading_code_base 671 error ("Invalid emacs-mule-id: %d", XINT (val));
511 = (charset < LEADING_CODE_EXT_12 672 charset.emacs_mule_id = XINT (val);
512 ? LEADING_CODE_PRIVATE_11
513 : (charset < LEADING_CODE_EXT_21
514 ? LEADING_CODE_PRIVATE_12
515 : (charset < LEADING_CODE_EXT_22
516 ? LEADING_CODE_PRIVATE_21
517 : LEADING_CODE_PRIVATE_22)));
518 leading_code_ext = charset;
519 if (BYTES_BY_CHAR_HEAD (leading_code_base) != bytes)
520 error ("Invalid dimension for the charset-ID %d", charset);
521 } 673 }
522 674
523 CHARSET_TABLE_INFO (charset, CHARSET_ID_IDX) = charset_id; 675 charset.ascii_compatible_p = ! NILP (args[charset_arg_ascii_compatible_p]);
524 CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX) = make_number (bytes);
525 CHARSET_TABLE_INFO (charset, CHARSET_DIMENSION_IDX) = dimension;
526 CHARSET_TABLE_INFO (charset, CHARSET_CHARS_IDX) = chars;
527 CHARSET_TABLE_INFO (charset, CHARSET_WIDTH_IDX) = width;
528 CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX) = direction;
529 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_BASE_IDX)
530 = make_number (leading_code_base);
531 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_EXT_IDX)
532 = make_number (leading_code_ext);
533 CHARSET_TABLE_INFO (charset, CHARSET_ISO_FINAL_CHAR_IDX) = iso_final_char;
534 CHARSET_TABLE_INFO (charset, CHARSET_ISO_GRAPHIC_PLANE_IDX)
535 = iso_graphic_plane;
536 CHARSET_TABLE_INFO (charset, CHARSET_SHORT_NAME_IDX) = short_name;
537 CHARSET_TABLE_INFO (charset, CHARSET_LONG_NAME_IDX) = long_name;
538 CHARSET_TABLE_INFO (charset, CHARSET_DESCRIPTION_IDX) = description;
539 CHARSET_TABLE_INFO (charset, CHARSET_PLIST_IDX) = Qnil;
540 676
541 { 677 charset.supplementary_p = ! NILP (args[charset_arg_supplementary_p]);
542 /* If we have already defined a charset which has the same 678
543 DIMENSION, CHARS and ISO-FINAL-CHAR but the different 679 charset.unified_p = 0;
544 DIRECTION, we must update the entry REVERSE-CHARSET of both 680
545 charsets. If there's no such charset, the value of the entry 681 bzero (charset.fast_map, sizeof (charset.fast_map));
546 is set to nil. */ 682
547 int i; 683 if (! NILP (args[charset_arg_code_offset]))
548 684 {
549 for (i = 0; i <= MAX_CHARSET; i++) 685 val = args[charset_arg_code_offset];
550 if (!NILP (CHARSET_TABLE_ENTRY (i))) 686 CHECK_NUMBER (val);
687
688 charset.method = CHARSET_METHOD_OFFSET;
689 charset.code_offset = XINT (val);
690
691 i = CODE_POINT_TO_INDEX (&charset, charset.min_code);
692 charset.min_char = i + charset.code_offset;
693 i = CODE_POINT_TO_INDEX (&charset, charset.max_code);
694 charset.max_char = i + charset.code_offset;
695 if (charset.max_char > MAX_CHAR)
696 error ("Unsupported max char: %d", charset.max_char);
697
698 for (i = charset.min_char; i < 0x10000 && i <= charset.max_char;
699 i += 128)
700 CHARSET_FAST_MAP_SET (i, charset.fast_map);
701 for (; i <= charset.max_char; i += 0x1000)
702 CHARSET_FAST_MAP_SET (i, charset.fast_map);
703 }
704 else if (! NILP (args[charset_arg_map]))
705 {
706 val = args[charset_arg_map];
707 ASET (attrs, charset_map, val);
708 if (STRINGP (val))
709 val = load_charset_map (&charset, val);
710 CHECK_VECTOR (val);
711 parse_charset_map (&charset, val, 0);
712 charset.method = CHARSET_METHOD_MAP_DEFERRED;
713 }
714 else if (! NILP (args[charset_arg_parents]))
715 {
716 val = args[charset_arg_parents];
717 CHECK_LIST (val);
718 charset.method = CHARSET_METHOD_INHERIT;
719 val = Fcopy_sequence (val);
720 ASET (attrs, charset_parents, val);
721
722 charset.min_char = MAX_CHAR;
723 charset.max_char = 0;
724 for (; ! NILP (val); val = Fcdr (val))
551 { 725 {
552 if (CHARSET_DIMENSION (i) == XINT (dimension) 726 Lisp_Object elt, car_part, cdr_part;
553 && CHARSET_CHARS (i) == XINT (chars) 727 int this_id, offset;
554 && CHARSET_ISO_FINAL_CHAR (i) == XINT (iso_final_char) 728 struct charset *this_charset;
555 && CHARSET_DIRECTION (i) != XINT (direction)) 729
730 elt = Fcar (val);
731 if (CONSP (elt))
556 { 732 {
557 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX) 733 car_part = XCAR (elt);
558 = make_number (i); 734 cdr_part = XCDR (elt);
559 CHARSET_TABLE_INFO (i, CHARSET_REVERSE_CHARSET_IDX) = charset_id; 735 CHECK_CHARSET_GET_ID (car_part, this_id);
560 break; 736 CHECK_NUMBER (cdr_part);
737 offset = XINT (cdr_part);
561 } 738 }
739 else
740 {
741 CHECK_CHARSET_GET_ID (elt, this_id);
742 offset = 0;
743 }
744 XSETCAR (val, Fcons (make_number (this_id), make_number (offset)));
745
746 this_charset = CHARSET_FROM_ID (this_id);
747 if (charset.min_char > this_charset->min_char)
748 charset.min_char = this_charset->min_char;
749 if (charset.max_char < this_charset->max_char)
750 charset.max_char = this_charset->max_char;
751 for (i = 0; i < 190; i++)
752 charset.fast_map[i] |= this_charset->fast_map[i];
562 } 753 }
563 if (i > MAX_CHARSET)
564 /* No such a charset. */
565 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
566 = make_number (-1);
567 }
568
569 if (charset != CHARSET_ASCII && charset != CHARSET_8_BIT_GRAPHIC
570 && charset < MIN_CHARSET_PRIVATE_DIMENSION1)
571 {
572 bytes_by_char_head[leading_code_base] = bytes;
573 width_by_char_head[leading_code_base] = XINT (width);
574
575 /* Update table emacs_code_class. */
576 emacs_code_class[charset] = (bytes == 2
577 ? EMACS_leading_code_2
578 : (bytes == 3
579 ? EMACS_leading_code_3
580 : EMACS_leading_code_4));
581 } 754 }
755 else
756 error ("None of :code-offset, :map, :parents are specified");
582 757
583 /* Update table iso_charset_table. */ 758 val = args[charset_arg_unify_map];
584 if (XINT (iso_final_char) >= 0 759 if (! NILP (val) && !STRINGP (val))
585 && ISO_CHARSET_TABLE (dimension, chars, iso_final_char) < 0) 760 CHECK_VECTOR (val);
586 ISO_CHARSET_TABLE (dimension, chars, iso_final_char) = charset; 761 ASET (attrs, charset_unify_map, val);
587}
588 762
589#ifdef emacs 763 CHECK_LIST (args[charset_arg_plist]);
764 ASET (attrs, charset_plist, args[charset_arg_plist]);
590 765
591/* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL 766 charset.hash_index = hash_lookup (hash_table, args[charset_arg_name],
592 is invalid. */ 767 &hash_code);
593int 768 if (charset.hash_index >= 0)
594get_charset_id (charset_symbol) 769 {
595 Lisp_Object charset_symbol; 770 new_definition_p = 0;
596{ 771 HASH_VALUE (hash_table, charset.hash_index) = attrs;
597 Lisp_Object val; 772 }
598 int charset;
599
600 /* This originally used a ?: operator, but reportedly the HP-UX
601 compiler version HP92453-01 A.10.32.22 miscompiles that. */
602 if (SYMBOLP (charset_symbol)
603 && VECTORP (val = Fget (charset_symbol, Qcharset))
604 && CHARSET_VALID_P (charset =
605 XINT (XVECTOR (val)->contents[CHARSET_ID_IDX])))
606 return charset;
607 else 773 else
608 return -1; 774 {
609} 775 charset.hash_index = hash_put (hash_table, args[charset_arg_name], attrs,
776 hash_code);
777 if (charset_table_used == charset_table_size)
778 {
779 charset_table_size += 256;
780 charset_table
781 = ((struct charset *)
782 xrealloc (charset_table,
783 sizeof (struct charset) * charset_table_size));
784 }
785 id = charset_table_used++;
786 ASET (attrs, charset_id, make_number (id));
787 new_definition_p = 1;
788 }
610 789
611/* Return an identification number for a new private charset of
612 DIMENSION and WIDTH. If there's no more room for the new charset,
613 return 0. */
614Lisp_Object
615get_new_private_charset_id (dimension, width)
616 int dimension, width;
617{
618 int charset, from, to;
619 790
620 if (dimension == 1) 791 charset.id = id;
792 charset_table[id] = charset;
793
794 if (charset.iso_final >= 0)
621 { 795 {
622 from = LEADING_CODE_EXT_11; 796 ISO_CHARSET_TABLE (charset.dimension, charset.iso_chars_96,
623 to = LEADING_CODE_EXT_21; 797 charset.iso_final) = id;
798 if (new_definition_p)
799 Viso_2022_charset_list = nconc2 (Viso_2022_charset_list,
800 Fcons (make_number (id), Qnil));
624 } 801 }
625 else 802
803 if (charset.emacs_mule_id >= 0)
626 { 804 {
627 from = LEADING_CODE_EXT_21; 805 emacs_mule_charset[charset.emacs_mule_id] = CHARSET_FROM_ID (id);
628 to = LEADING_CODE_EXT_MAX + 1; 806 if (new_definition_p)
807 Vemacs_mule_charset_list = nconc2 (Vemacs_mule_charset_list,
808 Fcons (make_number (id), Qnil));
629 } 809 }
630 810
631 for (charset = from; charset < to; charset++) 811 if (new_definition_p)
632 if (!CHARSET_DEFINED_P (charset)) break; 812 {
813 Vcharset_list = Fcons (args[charset_arg_name], Vcharset_list);
814 Vcharset_ordered_list = nconc2 (Vcharset_ordered_list,
815 Fcons (make_number (id), Qnil));
816 }
633 817
634 return make_number (charset < to ? charset : 0); 818 return Qnil;
635} 819}
636 820
637DEFUN ("define-charset", Fdefine_charset, Sdefine_charset, 3, 3, 0, 821
638 doc: /* Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR. 822DEFUN ("define-charset-alias", Fdefine_charset_alias,
639If CHARSET-ID is nil, it is decided automatically, which means CHARSET is 823 Sdefine_charset_alias, 2, 2, 0,
640 treated as a private charset. 824 doc: /* Define ALIAS as an alias for charset CHARSET. */)
641INFO-VECTOR is a vector of the format: 825 (alias, charset)
642 [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE 826 Lisp_Object alias, charset;
643 SHORT-NAME LONG-NAME DESCRIPTION]
644The meanings of each elements is as follows:
645DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.
646CHARS (integer) is the number of characters in a dimension: 94 or 96.
647WIDTH (integer) is the number of columns a character in the charset
648occupies on the screen: one of 0, 1, and 2.
649
650DIRECTION (integer) is the rendering direction of characters in the
651charset when rendering. If 0, render from left to right, else
652render from right to left.
653
654ISO-FINAL-CHAR (character) is the final character of the
655corresponding ISO 2022 charset.
656It may be -1 if the charset is internal use only.
657
658ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked
659while encoding to variants of ISO 2022 coding system, one of the
660following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).
661It may be -1 if the charset is internal use only.
662
663SHORT-NAME (string) is the short name to refer to the charset.
664
665LONG-NAME (string) is the long name to refer to the charset.
666
667DESCRIPTION (string) is the description string of the charset. */)
668 (charset_id, charset_symbol, info_vector)
669 Lisp_Object charset_id, charset_symbol, info_vector;
670{ 827{
671 Lisp_Object *vec; 828 Lisp_Object attr;
672 829
673 if (!NILP (charset_id)) 830 CHECK_CHARSET_GET_ATTR (charset, attr);
674 CHECK_NUMBER (charset_id); 831 Fputhash (alias, attr, Vcharset_hash_table);
675 CHECK_SYMBOL (charset_symbol); 832 return Qnil;
676 CHECK_VECTOR (info_vector); 833}
677 834
678 if (! NILP (charset_id))
679 {
680 if (! CHARSET_VALID_P (XINT (charset_id)))
681 error ("Invalid CHARSET: %d", XINT (charset_id));
682 else if (CHARSET_DEFINED_P (XINT (charset_id)))
683 error ("Already defined charset: %d", XINT (charset_id));
684 }
685 835
686 vec = XVECTOR (info_vector)->contents; 836DEFUN ("primary-charset", Fprimary_charset, Sprimary_charset, 0, 0, 0,
687 if (XVECTOR (info_vector)->size != 9 837 doc: /* Return the primary charset. */)
688 || !INTEGERP (vec[0]) || !(XINT (vec[0]) == 1 || XINT (vec[0]) == 2) 838 ()
689 || !INTEGERP (vec[1]) || !(XINT (vec[1]) == 94 || XINT (vec[1]) == 96) 839{
690 || !INTEGERP (vec[2]) || !(XINT (vec[2]) == 1 || XINT (vec[2]) == 2) 840 return CHARSET_NAME (CHARSET_FROM_ID (charset_primary));
691 || !INTEGERP (vec[3]) || !(XINT (vec[3]) == 0 || XINT (vec[3]) == 1) 841}
692 || !INTEGERP (vec[4]) 842
693 || !(XINT (vec[4]) == -1 || (XINT (vec[4]) >= '0' && XINT (vec[4]) <= '~'))
694 || !INTEGERP (vec[5])
695 || !(XINT (vec[5]) == -1 || XINT (vec[5]) == 0 || XINT (vec[5]) == 1)
696 || !STRINGP (vec[6])
697 || !STRINGP (vec[7])
698 || !STRINGP (vec[8]))
699 error ("Invalid info-vector argument for defining charset %s",
700 XSYMBOL (charset_symbol)->name->data);
701
702 if (NILP (charset_id))
703 {
704 charset_id = get_new_private_charset_id (XINT (vec[0]), XINT (vec[2]));
705 if (XINT (charset_id) == 0)
706 error ("There's no room for a new private charset %s",
707 XSYMBOL (charset_symbol)->name->data);
708 }
709 843
710 update_charset_table (charset_id, vec[0], vec[1], vec[2], vec[3], 844DEFUN ("set-primary-charset", Fset_primary_charset, Sset_primary_charset,
711 vec[4], vec[5], vec[6], vec[7], vec[8]); 845 1, 1, 0,
712 Fput (charset_symbol, Qcharset, CHARSET_TABLE_ENTRY (XINT (charset_id))); 846 doc: /* Set the primary charset to CHARSET. */)
713 CHARSET_SYMBOL (XINT (charset_id)) = charset_symbol; 847 (charset)
714 Vcharset_list = Fcons (charset_symbol, Vcharset_list); 848 Lisp_Object charset;
715 Fupdate_coding_systems_internal (); 849{
850 int id;
851
852 CHECK_CHARSET_GET_ID (charset, id);
853 charset_primary = id;
716 return Qnil; 854 return Qnil;
717} 855}
718 856
719DEFUN ("generic-character-list", Fgeneric_character_list, 857
720 Sgeneric_character_list, 0, 0, 0, 858DEFUN ("charset-plist", Fcharset_plist, Scharset_plist, 1, 1, 0,
721 doc: /* Return a list of all possible generic characters. 859 doc: /* Return a property list of CHARSET. */)
722It includes a generic character for a charset not yet defined. */) 860 (charset)
723 () 861 Lisp_Object charset;
724{ 862{
725 return Vgeneric_character_list; 863 Lisp_Object attrs;
864
865 CHECK_CHARSET_GET_ATTR (charset, attrs);
866 return CHARSET_ATTR_PLIST (attrs);
867}
868
869
870DEFUN ("set-charset-plist", Fset_charset_plist, Sset_charset_plist, 2, 2, 0,
871 doc: /* Set CHARSET's property list to PLIST. */)
872 (charset, plist)
873 Lisp_Object charset, plist;
874{
875 Lisp_Object attrs;
876
877 CHECK_CHARSET_GET_ATTR (charset, attrs);
878 CHARSET_ATTR_PLIST (attrs) = plist;
879 return plist;
880}
881
882
883DEFUN ("unify-charset", Funify_charset, Sunify_charset, 1, 2, 0,
884 doc: /* Unify characters of CHARSET with Unicode. */)
885 (charset, unify_map)
886 Lisp_Object charset, unify_map;
887{
888 int id;
889 struct charset *cs;
890
891 CHECK_CHARSET_GET_ID (charset, id);
892 cs = CHARSET_FROM_ID (id);
893 if (CHARSET_METHOD (cs) == CHARSET_METHOD_MAP_DEFERRED)
894 load_charset (cs);
895 if (CHARSET_UNIFIED_P (cs)
896 && CHAR_TABLE_P (CHARSET_DEUNIFIER (cs)))
897 return Qnil;
898 CHARSET_UNIFIED_P (cs) = 0;
899 if (NILP (unify_map))
900 unify_map = CHARSET_UNIFY_MAP (cs);
901 if (STRINGP (unify_map))
902 unify_map = load_charset_map (cs, unify_map);
903 parse_charset_map (cs, unify_map, 2);
904 CHARSET_UNIFIED_P (cs) = 1;
905 return Qnil;
726} 906}
727 907
728DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char, 908DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char,
729 Sget_unused_iso_final_char, 2, 2, 0, 909 Sget_unused_iso_final_char, 2, 2, 0,
730 doc: /* Return an unused ISO's final char for a charset of DIMENSION and CHARS. 910 doc: /*
911Return an unused ISO's final char for a charset of DIMENSION and CHARS.
731DIMENSION is the number of bytes to represent a character: 1 or 2. 912DIMENSION is the number of bytes to represent a character: 1 or 2.
732CHARS is the number of characters in a dimension: 94 or 96. 913CHARS is the number of characters in a dimension: 94 or 96.
733 914
734This final char is for private use, thus the range is `0' (48) .. `?' (63). 915This final char is for private use, thus the range is `0' (48) .. `?' (63).
735If there's no unused final char for the specified kind of charset, 916If there's no unused final char for the specified kind of charset,
736return nil. */) 917return nil. */)
737 (dimension, chars) 918 (dimension, chars)
738 Lisp_Object dimension, chars; 919 Lisp_Object dimension, chars;
@@ -741,128 +922,136 @@ return nil. */)
741 922
742 CHECK_NUMBER (dimension); 923 CHECK_NUMBER (dimension);
743 CHECK_NUMBER (chars); 924 CHECK_NUMBER (chars);
744 if (XINT (dimension) != 1 && XINT (dimension) != 2) 925 if (XINT (dimension) != 1 && XINT (dimension) != 2 && XINT (dimension) != 3)
745 error ("Invalid charset dimension %d, it should be 1 or 2", 926 args_out_of_range_3 (dimension, make_number (1), make_number (3));
746 XINT (dimension));
747 if (XINT (chars) != 94 && XINT (chars) != 96) 927 if (XINT (chars) != 94 && XINT (chars) != 96)
748 error ("Invalid charset chars %d, it should be 94 or 96", 928 args_out_of_range_3 (chars, make_number (94), make_number (96));
749 XINT (chars));
750 for (final_char = '0'; final_char <= '?'; final_char++) 929 for (final_char = '0'; final_char <= '?'; final_char++)
751 { 930 if (ISO_CHARSET_TABLE (XINT (dimension), XINT (chars), final_char) < 0)
752 if (ISO_CHARSET_TABLE (dimension, chars, make_number (final_char)) < 0) 931 break;
753 break;
754 }
755 return (final_char <= '?' ? make_number (final_char) : Qnil); 932 return (final_char <= '?' ? make_number (final_char) : Qnil);
756} 933}
757 934
758DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset, 935static void
759 4, 4, 0, 936check_iso_charset_parameter (dimension, chars, final_char)
760 doc: /* Declare a charset of DIMENSION, CHARS, FINAL-CHAR is the same as CHARSET. 937 Lisp_Object dimension, chars, final_char;
761CHARSET should be defined by `define-charset' in advance. */)
762 (dimension, chars, final_char, charset_symbol)
763 Lisp_Object dimension, chars, final_char, charset_symbol;
764{ 938{
765 int charset; 939 CHECK_NATNUM (dimension);
940 CHECK_NATNUM (chars);
941 CHECK_NATNUM (final_char);
766 942
767 CHECK_NUMBER (dimension); 943 if (XINT (dimension) > 3)
768 CHECK_NUMBER (chars); 944 error ("Invalid DIMENSION %d, it should be 1, 2, or 3", XINT (dimension));
769 CHECK_NUMBER (final_char);
770 CHECK_SYMBOL (charset_symbol);
771
772 if (XINT (dimension) != 1 && XINT (dimension) != 2)
773 error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension));
774 if (XINT (chars) != 94 && XINT (chars) != 96) 945 if (XINT (chars) != 94 && XINT (chars) != 96)
775 error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars)); 946 error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars));
776 if (XINT (final_char) < '0' || XFASTINT (final_char) > '~') 947 if (XINT (final_char) < '0' || XINT (final_char) > '~')
777 error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars)); 948 error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars));
778 if ((charset = get_charset_id (charset_symbol)) < 0) 949}
779 error ("Invalid charset %s", XSYMBOL (charset_symbol)->name->data);
780 950
781 ISO_CHARSET_TABLE (dimension, chars, final_char) = charset; 951
952DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset,
953 4, 4, 0,
954 doc: /*
955Declare a charset of DIMENSION, CHARS, FINAL-CHAR is the same as CHARSET.
956CHARSET should be defined by `define-charset' in advance. */)
957 (dimension, chars, final_char, charset)
958 Lisp_Object dimension, chars, final_char, charset;
959{
960 int id;
961
962 CHECK_CHARSET_GET_ID (charset, id);
963 check_iso_charset_parameter (dimension, chars, final_char);
964
965 ISO_CHARSET_TABLE (dimension, chars, final_char) = id;
782 return Qnil; 966 return Qnil;
783} 967}
784 968
969
785/* Return information about charsets in the text at PTR of NBYTES 970/* Return information about charsets in the text at PTR of NBYTES
786 bytes, which are NCHARS characters. The value is: 971 bytes, which are NCHARS characters. The value is:
787 972
788 0: Each character is represented by one byte. This is always 973 0: Each character is represented by one byte. This is always
789 true for unibyte text. 974 true for a unibyte string. For a multibyte string, true if
790 1: No charsets other than ascii eight-bit-control, 975 it contains only ASCII characters.
791 eight-bit-graphic, and latin-1 are found.
792 2: Otherwise.
793 976
794 In addition, if CHARSETS is nonzero, for each found charset N, set 977 1: No charsets other than ascii, eight-bit-control, and
795 CHARSETS[N] to 1. For that, callers should allocate CHARSETS 978 latin-1 are found.
796 (MAX_CHARSET + 1 elements) in advance. It may lookup a translation 979
797 table TABLE if supplied. For invalid charsets, set CHARSETS[1] to 980 2: Otherwise.
798 1 (note that there's no charset whose ID is 1). */ 981*/
799 982
800int 983int
801find_charset_in_text (ptr, nchars, nbytes, charsets, table) 984string_xstring_p (string)
802 unsigned char *ptr; 985 Lisp_Object string;
803 int nchars, nbytes, *charsets;
804 Lisp_Object table;
805{ 986{
806 if (nchars == nbytes) 987 unsigned char *p = XSTRING (string)->data;
988 unsigned char *endp = p + STRING_BYTES (XSTRING (string));
989 struct charset *charset;
990
991 if (XSTRING (string)->size == STRING_BYTES (XSTRING (string)))
992 return 0;
993
994 charset = CHARSET_FROM_ID (charset_iso_8859_1);
995 while (p < endp)
807 { 996 {
808 if (charsets && nbytes > 0) 997 int c = STRING_CHAR_ADVANCE (p);
809 {
810 unsigned char *endp = ptr + nbytes;
811 int maskbits = 0;
812 998
813 while (ptr < endp && maskbits != 7) 999 if (ENCODE_CHAR (charset, c) < 0)
814 { 1000 return 2;
815 maskbits |= (*ptr < 0x80 ? 1 : *ptr < 0xA0 ? 2 : 4);
816 ptr++;
817 }
818
819 if (maskbits & 1)
820 charsets[CHARSET_ASCII] = 1;
821 if (maskbits & 2)
822 charsets[CHARSET_8_BIT_CONTROL] = 1;
823 if (maskbits & 4)
824 charsets[CHARSET_8_BIT_GRAPHIC] = 1;
825 }
826 return 0;
827 } 1001 }
828 else 1002 return 1;
829 { 1003}
830 int return_val = 1;
831 int bytes, charset, c1, c2;
832 1004
833 if (! CHAR_TABLE_P (table))
834 table = Qnil;
835 1005
836 while (nchars-- > 0) 1006/* Find charsets in the string at PTR of NCHARS and NBYTES.
837 {
838 SPLIT_MULTIBYTE_SEQ (ptr, len, bytes, charset, c1, c2);
839 ptr += bytes;
840 1007
841 if (!CHARSET_DEFINED_P (charset)) 1008 CHARSETS is a vector. Each element is a cons of CHARSET and
842 charset = 1; 1009 FOUND-FLAG. CHARSET is a charset id, and FOUND-FLAG is nil or t.
843 else if (! NILP (table)) 1010 FOUND-FLAG t (or nil) means that the corresponding charset is
844 { 1011 already found (or not yet found).
845 int c = translate_char (table, -1, charset, c1, c2);
846 if (c >= 0)
847 charset = CHAR_CHARSET (c);
848 }
849 1012
850 if (return_val == 1 1013 It may lookup a translation table TABLE if supplied. */
851 && charset != CHARSET_ASCII
852 && charset != CHARSET_8_BIT_CONTROL
853 && charset != CHARSET_8_BIT_GRAPHIC
854 && charset != charset_latin_iso8859_1)
855 return_val = 2;
856 1014
857 if (charsets) 1015static void
858 charsets[charset] = 1; 1016find_charsets_in_text (ptr, nchars, nbytes, charsets, table)
859 else if (return_val == 2) 1017 unsigned char *ptr;
860 break; 1018 int nchars, nbytes;
1019 Lisp_Object charsets, table;
1020{
1021 unsigned char *pend = ptr + nbytes;
1022 int ncharsets = ASIZE (charsets);
1023
1024 if (nchars == nbytes)
1025 return;
1026
1027 while (ptr < pend)
1028 {
1029 int c = STRING_CHAR_ADVANCE (ptr);
1030 int i;
1031 int all_found = 1;
1032 Lisp_Object elt;
1033
1034 if (!NILP (table))
1035 c = translate_char (table, c);
1036 for (i = 0; i < ncharsets; i++)
1037 {
1038 elt = AREF (charsets, i);
1039 if (NILP (XCDR (elt)))
1040 {
1041 struct charset *charset = CHARSET_FROM_ID (XINT (XCAR (elt)));
1042
1043 if (ENCODE_CHAR (charset, c) != CHARSET_INVALID_CODE (charset))
1044 XCDR (elt) = Qt;
1045 else
1046 all_found = 0;
1047 }
861 } 1048 }
862 return return_val; 1049 if (all_found)
1050 break;
863 } 1051 }
864} 1052}
865 1053
1054
866DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region, 1055DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
867 2, 3, 0, 1056 2, 3, 0,
868 doc: /* Return a list of charsets in the region between BEG and END. 1057 doc: /* Return a list of charsets in the region between BEG and END.
@@ -877,7 +1066,7 @@ only `ascii', `eight-bit-control', and `eight-bit-graphic'. */)
877 (beg, end, table) 1066 (beg, end, table)
878 Lisp_Object beg, end, table; 1067 Lisp_Object beg, end, table;
879{ 1068{
880 int charsets[MAX_CHARSET + 1]; 1069 Lisp_Object charsets;
881 int from, from_byte, to, stop, stop_byte, i; 1070 int from, from_byte, to, stop, stop_byte, i;
882 Lisp_Object val; 1071 Lisp_Object val;
883 1072
@@ -895,11 +1084,14 @@ only `ascii', `eight-bit-control', and `eight-bit-graphic'. */)
895 1084
896 from_byte = CHAR_TO_BYTE (from); 1085 from_byte = CHAR_TO_BYTE (from);
897 1086
898 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int)); 1087 charsets = Fmake_vector (make_number (charset_table_used), Qnil);
1088 for (i = 0; i < charset_table_used; i++)
1089 ASET (charsets, i, Fcons (make_number (i), Qnil));
1090
899 while (1) 1091 while (1)
900 { 1092 {
901 find_charset_in_text (BYTE_POS_ADDR (from_byte), stop - from, 1093 find_charsets_in_text (BYTE_POS_ADDR (from_byte), stop - from,
902 stop_byte - from_byte, charsets, table); 1094 stop_byte - from_byte, charsets, table);
903 if (stop < to) 1095 if (stop < to)
904 { 1096 {
905 from = stop, from_byte = stop_byte; 1097 from = stop, from_byte = stop_byte;
@@ -910,13 +1102,9 @@ only `ascii', `eight-bit-control', and `eight-bit-graphic'. */)
910 } 1102 }
911 1103
912 val = Qnil; 1104 val = Qnil;
913 if (charsets[1]) 1105 for (i = charset_table_used - 1; i >= 0; i--)
914 val = Fcons (Qunknown, val); 1106 if (!NILP (XCDR (AREF (charsets, i))))
915 for (i = MAX_CHARSET; i >= MIN_CHARSET_OFFICIAL_DIMENSION1; i--) 1107 val = Fcons (CHARSET_NAME (charset_table + i), val);
916 if (charsets[i])
917 val = Fcons (CHARSET_SYMBOL (i), val);
918 if (charsets[0])
919 val = Fcons (Qascii, val);
920 return val; 1108 return val;
921} 1109}
922 1110
@@ -929,838 +1117,471 @@ If the string contains invalid multibyte characters,
929`unknown' is included in the returned list. 1117`unknown' is included in the returned list.
930 1118
931If STR is unibyte, the returned list may contain 1119If STR is unibyte, the returned list may contain
932only `ascii', `eight-bit-control', and `eight-bit-graphic'. */) 1120only `ascii', `eight-bit-control', and `eight-bit-graphic'. */)
933 (str, table) 1121 (str, table)
934 Lisp_Object str, table; 1122 Lisp_Object str, table;
935{ 1123{
936 int charsets[MAX_CHARSET + 1]; 1124 Lisp_Object charsets;
937 int i; 1125 int i;
938 Lisp_Object val; 1126 Lisp_Object val;
939 1127
940 CHECK_STRING (str); 1128 CHECK_STRING (str);
941 1129
942 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int)); 1130 charsets = Fmake_vector (make_number (charset_table_used), Qnil);
943 find_charset_in_text (XSTRING (str)->data, XSTRING (str)->size, 1131 find_charsets_in_text (XSTRING (str)->data, XSTRING (str)->size,
944 STRING_BYTES (XSTRING (str)), charsets, table); 1132 STRING_BYTES (XSTRING (str)), charsets, table);
945 1133
946 val = Qnil; 1134 val = Qnil;
947 if (charsets[1]) 1135 for (i = charset_table_used - 1; i >= 0; i--)
948 val = Fcons (Qunknown, val); 1136 if (!NILP (XCDR (AREF (charsets, i))))
949 for (i = MAX_CHARSET; i >= MIN_CHARSET_OFFICIAL_DIMENSION1; i--) 1137 val = Fcons (CHARSET_NAME (charset_table + i), val);
950 if (charsets[i])
951 val = Fcons (CHARSET_SYMBOL (i), val);
952 if (charsets[0])
953 val = Fcons (Qascii, val);
954 return val; 1138 return val;
955} 1139}
956 1140
957 1141
958DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0, 1142
959 doc: /* Return a character made from arguments. 1143/* Return a character correponding to the code-point CODE of
960Internal use only. */) 1144 CHARSET. */
961 (charset, code1, code2) 1145
962 Lisp_Object charset, code1, code2; 1146int
1147decode_char (charset, code)
1148 struct charset *charset;
1149 unsigned code;
963{ 1150{
964 int charset_id, c1, c2; 1151 int c, char_index;
1152 enum charset_method method = CHARSET_METHOD (charset);
965 1153
966 CHECK_NUMBER (charset); 1154 if (code < CHARSET_MIN_CODE (charset) || code > CHARSET_MAX_CODE (charset))
967 charset_id = XINT (charset); 1155 return -1;
968 if (!CHARSET_DEFINED_P (charset_id))
969 error ("Invalid charset ID: %d", XINT (charset));
970 1156
971 if (NILP (code1)) 1157 if (method == CHARSET_METHOD_MAP_DEFERRED)
972 c1 = 0;
973 else
974 {
975 CHECK_NUMBER (code1);
976 c1 = XINT (code1);
977 }
978 if (NILP (code2))
979 c2 = 0;
980 else
981 { 1158 {
982 CHECK_NUMBER (code2); 1159 load_charset (charset);
983 c2 = XINT (code2); 1160 method = CHARSET_METHOD (charset);
984 } 1161 }
985 1162
986 if (charset_id == CHARSET_ASCII) 1163 if (method == CHARSET_METHOD_INHERIT)
987 {
988 if (c1 < 0 || c1 > 0x7F)
989 goto invalid_code_posints;
990 return make_number (c1);
991 }
992 else if (charset_id == CHARSET_8_BIT_CONTROL)
993 { 1164 {
994 if (NILP (code1)) 1165 Lisp_Object parents;
995 c1 = 0x80;
996 else if (c1 < 0x80 || c1 > 0x9F)
997 goto invalid_code_posints;
998 return make_number (c1);
999 }
1000 else if (charset_id == CHARSET_8_BIT_GRAPHIC)
1001 {
1002 if (NILP (code1))
1003 c1 = 0xA0;
1004 else if (c1 < 0xA0 || c1 > 0xFF)
1005 goto invalid_code_posints;
1006 return make_number (c1);
1007 }
1008 else if (c1 < 0 || c1 > 0xFF || c2 < 0 || c2 > 0xFF)
1009 goto invalid_code_posints;
1010 c1 &= 0x7F;
1011 c2 &= 0x7F;
1012 if (c1 == 0
1013 ? c2 != 0
1014 : (c2 == 0
1015 ? !CHAR_COMPONENTS_VALID_P (charset_id, c1, 0x20)
1016 : !CHAR_COMPONENTS_VALID_P (charset_id, c1, c2)))
1017 goto invalid_code_posints;
1018 return make_number (MAKE_CHAR (charset_id, c1, c2));
1019
1020 invalid_code_posints:
1021 error ("Invalid code points for charset ID %d: %d %d", charset_id, c1, c2);
1022}
1023
1024DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
1025 doc: /* Return list of charset and one or two position-codes of CHAR.
1026If CHAR is invalid as a character code,
1027return a list of symbol `unknown' and CHAR. */)
1028 (ch)
1029 Lisp_Object ch;
1030{
1031 int c, charset, c1, c2;
1032
1033 CHECK_NUMBER (ch);
1034 c = XFASTINT (ch);
1035 if (!CHAR_VALID_P (c, 1))
1036 return Fcons (Qunknown, Fcons (ch, Qnil));
1037 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
1038 return (c2 >= 0
1039 ? Fcons (CHARSET_SYMBOL (charset),
1040 Fcons (make_number (c1), Fcons (make_number (c2), Qnil)))
1041 : Fcons (CHARSET_SYMBOL (charset), Fcons (make_number (c1), Qnil)));
1042}
1043
1044DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 1, 0,
1045 doc: /* Return charset of CHAR. */)
1046 (ch)
1047 Lisp_Object ch;
1048{
1049 CHECK_NUMBER (ch);
1050
1051 return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch)));
1052}
1053
1054DEFUN ("charset-after", Fcharset_after, Scharset_after, 0, 1, 0,
1055 doc: /* Return charset of a character in the current buffer at position POS.
1056If POS is nil, it defauls to the current point.
1057If POS is out of range, the value is nil. */)
1058 (pos)
1059 Lisp_Object pos;
1060{
1061 Lisp_Object ch;
1062 int charset;
1063 1166
1064 ch = Fchar_after (pos); 1167 parents = CHARSET_PARENTS (charset);
1065 if (! INTEGERP (ch)) 1168 c = -1;
1066 return ch; 1169 for (; CONSP (parents); parents = XCDR (parents))
1067 charset = CHAR_CHARSET (XINT (ch)); 1170 {
1068 return CHARSET_SYMBOL (charset); 1171 int id = XINT (XCAR (XCAR (parents)));
1069} 1172 int code_offset = XINT (XCDR (XCAR (parents)));
1070 1173 unsigned this_code = code + code_offset;
1071DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
1072 doc: /* Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.
1073
1074ISO 2022's designation sequence (escape sequence) distinguishes charsets
1075by their DIMENSION, CHARS, and FINAL-CHAR,
1076where as Emacs distinguishes them by charset symbol.
1077See the documentation of the function `charset-info' for the meanings of
1078DIMENSION, CHARS, and FINAL-CHAR. */)
1079 (dimension, chars, final_char)
1080 Lisp_Object dimension, chars, final_char;
1081{
1082 int charset;
1083
1084 CHECK_NUMBER (dimension);
1085 CHECK_NUMBER (chars);
1086 CHECK_NUMBER (final_char);
1087
1088 if ((charset = ISO_CHARSET_TABLE (dimension, chars, final_char)) < 0)
1089 return Qnil;
1090 return CHARSET_SYMBOL (charset);
1091}
1092
1093/* If GENERICP is nonzero, return nonzero iff C is a valid normal or
1094 generic character. If GENERICP is zero, return nonzero iff C is a
1095 valid normal character. Do not call this function directly,
1096 instead use macro CHAR_VALID_P. */
1097int
1098char_valid_p (c, genericp)
1099 int c, genericp;
1100{
1101 int charset, c1, c2;
1102 1174
1103 if (c < 0 || c >= MAX_CHAR) 1175 charset = CHARSET_FROM_ID (id);
1104 return 0; 1176 if ((c = DECODE_CHAR (charset, this_code)) >= 0)
1105 if (SINGLE_BYTE_CHAR_P (c)) 1177 break;
1106 return 1; 1178 }
1107 SPLIT_CHAR (c, charset, c1, c2); 1179 }
1108 if (genericp) 1180 else
1109 { 1181 {
1110 if (c1) 1182 char_index = CODE_POINT_TO_INDEX (charset, code);
1183
1184 if (method == CHARSET_METHOD_MAP)
1111 { 1185 {
1112 if (c2 <= 0) c2 = 0x20; 1186 Lisp_Object decoder;
1187
1188 decoder = CHARSET_DECODER (charset);
1189 if (! VECTORP (decoder))
1190 return -1;
1191 c = XINT (AREF (decoder, char_index));
1113 } 1192 }
1114 else 1193 else
1115 { 1194 {
1116 if (c2 <= 0) c1 = c2 = 0x20; 1195 c = char_index + CHARSET_CODE_OFFSET (charset);
1117 } 1196 }
1118 } 1197 }
1119 return (CHARSET_DEFINED_P (charset)
1120 && CHAR_COMPONENTS_VALID_P (charset, c1, c2));
1121}
1122 1198
1123DEFUN ("char-valid-p", Fchar_valid_p, Schar_valid_p, 1, 2, 0, 1199 if (CHARSET_UNIFIED_P (charset)
1124 doc: /* Return t if OBJECT is a valid normal character. 1200 && c >= 0)
1125If optional arg GENERICP is non-nil, also return t if OBJECT is 1201 MAYBE_UNIFY_CHAR (c);
1126a valid generic character. */)
1127 (object, genericp)
1128 Lisp_Object object, genericp;
1129{
1130 if (! NATNUMP (object))
1131 return Qnil;
1132 return (CHAR_VALID_P (XFASTINT (object), !NILP (genericp)) ? Qt : Qnil);
1133}
1134
1135DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
1136 Sunibyte_char_to_multibyte, 1, 1, 0,
1137 doc: /* Convert the unibyte character CH to multibyte character.
1138The conversion is done based on `nonascii-translation-table' (which see)
1139 or `nonascii-insert-offset' (which see). */)
1140 (ch)
1141 Lisp_Object ch;
1142{
1143 int c;
1144 1202
1145 CHECK_NUMBER (ch); 1203 return c;
1146 c = XINT (ch);
1147 if (c < 0 || c >= 0400)
1148 error ("Invalid unibyte character: %d", c);
1149 c = unibyte_char_to_multibyte (c);
1150 if (c < 0)
1151 error ("Can't convert to multibyte character: %d", XINT (ch));
1152 return make_number (c);
1153} 1204}
1154 1205
1155DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
1156 Smultibyte_char_to_unibyte, 1, 1, 0,
1157 doc: /* Convert the multibyte character CH to unibyte character.
1158The conversion is done based on `nonascii-translation-table' (which see)
1159 or `nonascii-insert-offset' (which see). */)
1160 (ch)
1161 Lisp_Object ch;
1162{
1163 int c;
1164
1165 CHECK_NUMBER (ch);
1166 c = XINT (ch);
1167 if (! CHAR_VALID_P (c, 0))
1168 error ("Invalid multibyte character: %d", c);
1169 c = multibyte_char_to_unibyte (c, Qnil);
1170 if (c < 0)
1171 error ("Can't convert to unibyte character: %d", XINT (ch));
1172 return make_number (c);
1173}
1174 1206
1175DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0, 1207/* Return a code-point of CHAR in CHARSET. If CHAR doesn't belong to
1176 doc: /* Return 1 regardless of the argument CHAR. 1208 CHARSET, return CHARSET_INVALID_CODE (CHARSET). */
1177This is now an obsolete function. We keep it just for backward compatibility. */)
1178 (ch)
1179 Lisp_Object ch;
1180{
1181 CHECK_NUMBER (ch);
1182 return make_number (1);
1183}
1184 1209
1185/* Return how many bytes C will occupy in a multibyte buffer. 1210unsigned
1186 Don't call this function directly, instead use macro CHAR_BYTES. */ 1211encode_char (charset, c)
1187int 1212 struct charset *charset;
1188char_bytes (c)
1189 int c; 1213 int c;
1190{ 1214{
1191 int charset; 1215 unsigned code;
1216 enum charset_method method = CHARSET_METHOD (charset);
1192 1217
1193 if (ASCII_BYTE_P (c) || (c & ~((1 << CHARACTERBITS) -1))) 1218 if (CHARSET_UNIFIED_P (charset))
1194 return 1;
1195 if (SINGLE_BYTE_CHAR_P (c) && c >= 0xA0)
1196 return 1;
1197
1198 charset = CHAR_CHARSET (c);
1199 return (CHARSET_DEFINED_P (charset) ? CHARSET_BYTES (charset) : 1);
1200}
1201
1202/* Return the width of character of which multi-byte form starts with
1203 C. The width is measured by how many columns occupied on the
1204 screen when displayed in the current buffer. */
1205
1206#define ONE_BYTE_CHAR_WIDTH(c) \
1207 (c < 0x20 \
1208 ? (c == '\t' \
1209 ? XFASTINT (current_buffer->tab_width) \
1210 : (c == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2))) \
1211 : (c < 0x7f \
1212 ? 1 \
1213 : (c == 0x7F \
1214 ? (NILP (current_buffer->ctl_arrow) ? 4 : 2) \
1215 : ((! NILP (current_buffer->enable_multibyte_characters) \
1216 && BASE_LEADING_CODE_P (c)) \
1217 ? WIDTH_BY_CHAR_HEAD (c) \
1218 : 4))))
1219
1220DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
1221 doc: /* Return width of CHAR when displayed in the current buffer.
1222The width is measured by how many columns it occupies on the screen.
1223Tab is taken to occupy `tab-width' columns. */)
1224 (ch)
1225 Lisp_Object ch;
1226{
1227 Lisp_Object val, disp;
1228 int c;
1229 struct Lisp_Char_Table *dp = buffer_display_table ();
1230
1231 CHECK_NUMBER (ch);
1232
1233 c = XINT (ch);
1234
1235 /* Get the way the display table would display it. */
1236 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
1237
1238 if (VECTORP (disp))
1239 XSETINT (val, XVECTOR (disp)->size);
1240 else if (SINGLE_BYTE_CHAR_P (c))
1241 XSETINT (val, ONE_BYTE_CHAR_WIDTH (c));
1242 else
1243 { 1219 {
1244 int charset = CHAR_CHARSET (c); 1220 Lisp_Object deunifier;
1221 int deunified;
1245 1222
1246 XSETFASTINT (val, CHARSET_WIDTH (charset)); 1223 deunifier = CHARSET_DEUNIFIER (charset);
1224 if (! CHAR_TABLE_P (deunifier))
1225 {
1226 Funify_charset (CHARSET_NAME (charset), Qnil);
1227 deunifier = CHARSET_DEUNIFIER (charset);
1228 }
1229 deunified = XINT (CHAR_TABLE_REF (deunifier, c));
1230 if (deunified > 0)
1231 c = deunified;
1247 } 1232 }
1248 return val;
1249}
1250
1251/* Return width of string STR of length LEN when displayed in the
1252 current buffer. The width is measured by how many columns it
1253 occupies on the screen. */
1254 1233
1255int 1234 if (! CHARSET_FAST_MAP_REF ((c), charset->fast_map)
1256strwidth (str, len) 1235 || c < CHARSET_MIN_CHAR (charset) || c > CHARSET_MAX_CHAR (charset))
1257 unsigned char *str; 1236 return CHARSET_INVALID_CODE (charset);
1258 int len;
1259{
1260 return c_string_width (str, len, -1, NULL, NULL);
1261}
1262 1237
1263/* Return width of string STR of length LEN when displayed in the 1238 if (method == CHARSET_METHOD_INHERIT)
1264 current buffer. The width is measured by how many columns it
1265 occupies on the screen. If PRECISION > 0, return the width of
1266 longest substring that doesn't exceed PRECISION, and set number of
1267 characters and bytes of the substring in *NCHARS and *NBYTES
1268 respectively. */
1269
1270int
1271c_string_width (str, len, precision, nchars, nbytes)
1272 unsigned char *str;
1273 int precision, *nchars, *nbytes;
1274{
1275 int i = 0, i_byte = 0;
1276 int width = 0;
1277 int chars;
1278 struct Lisp_Char_Table *dp = buffer_display_table ();
1279
1280 while (i_byte < len)
1281 { 1239 {
1282 int bytes, thiswidth; 1240 Lisp_Object parents;
1283 Lisp_Object val;
1284
1285 if (dp)
1286 {
1287 int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
1288 1241
1289 chars = 1; 1242 parents = CHARSET_PARENTS (charset);
1290 val = DISP_CHAR_VECTOR (dp, c); 1243 for (; CONSP (parents); parents = XCDR (parents))
1291 if (VECTORP (val))
1292 thiswidth = XVECTOR (val)->size;
1293 else
1294 thiswidth = ONE_BYTE_CHAR_WIDTH (str[i_byte]);
1295 }
1296 else
1297 { 1244 {
1298 chars = 1; 1245 int id = XINT (XCAR (XCAR (parents)));
1299 PARSE_MULTIBYTE_SEQ (str + i_byte, len - i_byte, bytes); 1246 int code_offset = XINT (XCDR (XCAR (parents)));
1300 thiswidth = ONE_BYTE_CHAR_WIDTH (str[i_byte]); 1247 struct charset *this_charset = CHARSET_FROM_ID (id);
1301 }
1302 1248
1303 if (precision > 0 1249 code = ENCODE_CHAR (this_charset, c);
1304 && (width + thiswidth > precision)) 1250 if (code != CHARSET_INVALID_CODE (this_charset)
1305 { 1251 && (code_offset < 0 || code >= code_offset))
1306 *nchars = i; 1252 {
1307 *nbytes = i_byte; 1253 code -= code_offset;
1308 return width; 1254 if (CODE_POINT_TO_INDEX (charset, code) >= 0)
1255 return code;
1256 }
1309 } 1257 }
1310 i++; 1258 return CHARSET_INVALID_CODE (charset);
1311 i_byte += bytes; 1259 }
1312 width += thiswidth;
1313 }
1314 1260
1315 if (precision > 0) 1261 if (method == CHARSET_METHOD_MAP_DEFERRED)
1316 { 1262 {
1317 *nchars = i; 1263 load_charset (charset);
1318 *nbytes = i_byte; 1264 method = CHARSET_METHOD (charset);
1319 } 1265 }
1320 1266
1321 return width; 1267 if (method == CHARSET_METHOD_MAP)
1322}
1323
1324/* Return width of Lisp string STRING when displayed in the current
1325 buffer. The width is measured by how many columns it occupies on
1326 the screen while paying attention to compositions. If PRECISION >
1327 0, return the width of longest substring that doesn't exceed
1328 PRECISION, and set number of characters and bytes of the substring
1329 in *NCHARS and *NBYTES respectively. */
1330
1331int
1332lisp_string_width (string, precision, nchars, nbytes)
1333 Lisp_Object string;
1334 int precision, *nchars, *nbytes;
1335{
1336 int len = XSTRING (string)->size;
1337 int len_byte = STRING_BYTES (XSTRING (string));
1338 unsigned char *str = XSTRING (string)->data;
1339 int i = 0, i_byte = 0;
1340 int width = 0;
1341 struct Lisp_Char_Table *dp = buffer_display_table ();
1342
1343 while (i < len)
1344 { 1268 {
1345 int chars, bytes, thiswidth; 1269 Lisp_Object encoder;
1346 Lisp_Object val; 1270 Lisp_Object val;
1347 int cmp_id;
1348 int ignore, end;
1349 1271
1350 if (find_composition (i, -1, &ignore, &end, &val, string) 1272 encoder = CHARSET_ENCODER (charset);
1351 && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string)) 1273 if (! CHAR_TABLE_P (CHARSET_ENCODER (charset)))
1352 >= 0)) 1274 return CHARSET_INVALID_CODE (charset);
1353 { 1275 val = CHAR_TABLE_REF (encoder, c);
1354 thiswidth = composition_table[cmp_id]->width; 1276 if (CONSP (val))
1355 chars = end - i; 1277 code = (XINT (XCAR (val)) << 16) | XINT (XCDR (val));
1356 bytes = string_char_to_byte (string, end) - i_byte;
1357 }
1358 else if (dp)
1359 {
1360 int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
1361
1362 chars = 1;
1363 val = DISP_CHAR_VECTOR (dp, c);
1364 if (VECTORP (val))
1365 thiswidth = XVECTOR (val)->size;
1366 else
1367 thiswidth = ONE_BYTE_CHAR_WIDTH (str[i_byte]);
1368 }
1369 else 1278 else
1370 { 1279 code = XINT (val);
1371 chars = 1; 1280 }
1372 PARSE_MULTIBYTE_SEQ (str + i_byte, len_byte - i_byte, bytes); 1281 else
1373 thiswidth = ONE_BYTE_CHAR_WIDTH (str[i_byte]);
1374 }
1375
1376 if (precision > 0
1377 && (width + thiswidth > precision))
1378 {
1379 *nchars = i;
1380 *nbytes = i_byte;
1381 return width;
1382 }
1383 i += chars;
1384 i_byte += bytes;
1385 width += thiswidth;
1386 }
1387
1388 if (precision > 0)
1389 { 1282 {
1390 *nchars = i; 1283 code = c - CHARSET_CODE_OFFSET (charset);
1391 *nbytes = i_byte; 1284 code = INDEX_TO_CODE_POINT (charset, code);
1392 } 1285 }
1393 1286
1394 return width; 1287 return code;
1395} 1288}
1396 1289
1397DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
1398 doc: /* Return width of STRING when displayed in the current buffer.
1399Width is measured by how many columns it occupies on the screen.
1400When calculating width of a multibyte character in STRING,
1401only the base leading-code is considered; the validity of
1402the following bytes is not checked. Tabs in STRING are always
1403taken to occupy `tab-width' columns. */)
1404 (str)
1405 Lisp_Object str;
1406{
1407 Lisp_Object val;
1408 1290
1409 CHECK_STRING (str); 1291DEFUN ("decode-char", Fdecode_char, Sdecode_char, 2, 3, 0,
1410 XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL)); 1292 doc: /* Decode the pair of CHARSET and CODE-POINT into a character.
1411 return val; 1293Return nil if CODE-POINT is not valid in CHARSET.
1412}
1413 1294
1414DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0, 1295CODE-POINT may be a cons (HIGHER-16-BIT-VALUE . LOWER-16-BIT-VALUE).
1415 doc: /* Return the direction of CHAR. 1296
1416The returned value is 0 for left-to-right and 1 for right-to-left. */) 1297Optional argument RESTRICTION specifies a way to map the pair of CCS
1417 (ch) 1298and CODE-POINT to a chracter. Currently not supported and just ignored. */)
1418 Lisp_Object ch; 1299 (charset, code_point, restriction)
1300 Lisp_Object charset, code_point, restriction;
1419{ 1301{
1420 int charset; 1302 int c, id;
1303 unsigned code;
1304 struct charset *charsetp;
1421 1305
1422 CHECK_NUMBER (ch); 1306 CHECK_CHARSET_GET_ID (charset, id);
1423 charset = CHAR_CHARSET (XFASTINT (ch)); 1307 if (CONSP (code_point))
1424 if (!CHARSET_DEFINED_P (charset)) 1308 {
1425 invalid_character (XINT (ch)); 1309 CHECK_NATNUM (XCAR (code_point));
1426 return CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX); 1310 CHECK_NATNUM (XCDR (code_point));
1311 code = (XINT (XCAR (code_point)) << 16) | (XINT (XCAR (code_point)));
1312 }
1313 else
1314 {
1315 CHECK_NATNUM (code_point);
1316 code = XINT (code_point);
1317 }
1318 charsetp = CHARSET_FROM_ID (id);
1319 c = DECODE_CHAR (charsetp, code);
1320 return (c >= 0 ? make_number (c) : Qnil);
1427} 1321}
1428 1322
1429DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0,
1430 doc: /* Return number of characters between BEG and END. */)
1431 (beg, end)
1432 Lisp_Object beg, end;
1433{
1434 int from, to;
1435 1323
1436 CHECK_NUMBER_COERCE_MARKER (beg); 1324DEFUN ("encode-char", Fencode_char, Sencode_char, 2, 3, 0,
1437 CHECK_NUMBER_COERCE_MARKER (end); 1325 doc: /* Encode the character CH into a code-point of CHARSET.
1326Return nil if CHARSET doesn't include CH.
1438 1327
1439 from = min (XFASTINT (beg), XFASTINT (end)); 1328Optional argument RESTRICTION specifies a way to map CHAR to a
1440 to = max (XFASTINT (beg), XFASTINT (end)); 1329code-point in CCS. Currently not supported and just ignored. */)
1330 (ch, charset, restriction)
1331 Lisp_Object ch, charset, restriction;
1332{
1333 int c, id;
1334 unsigned code;
1335 struct charset *charsetp;
1441 1336
1442 return make_number (to - from); 1337 CHECK_CHARSET_GET_ID (charset, id);
1338 CHECK_NATNUM (ch);
1339 c = XINT (ch);
1340 charsetp = CHARSET_FROM_ID (id);
1341 code = ENCODE_CHAR (charsetp, ch);
1342 if (code == CHARSET_INVALID_CODE (charsetp))
1343 return Qnil;
1344 if (code > 0x7FFFFFF)
1345 return Fcons (make_number (code >> 16), make_number (code & 0xFFFF));
1346 return make_number (code);
1443} 1347}
1444 1348
1445/* Return the number of characters in the NBYTES bytes at PTR.
1446 This works by looking at the contents and checking for multibyte sequences.
1447 However, if the current buffer has enable-multibyte-characters = nil,
1448 we treat each byte as a character. */
1449 1349
1450int 1350DEFUN ("make-char", Fmake_char, Smake_char, 1, 4, 0,
1451chars_in_text (ptr, nbytes) 1351 doc: /* Return a character of CHARSET whose position code is CODE.
1452 unsigned char *ptr;
1453 int nbytes;
1454{
1455 /* current_buffer is null at early stages of Emacs initialization. */
1456 if (current_buffer == 0
1457 || NILP (current_buffer->enable_multibyte_characters))
1458 return nbytes;
1459
1460 return multibyte_chars_in_text (ptr, nbytes);
1461}
1462 1352
1463/* Return the number of characters in the NBYTES bytes at PTR. 1353If dimension of CHARSET is two, and the third optional arg CODE2 is
1464 This works by looking at the contents and checking for multibyte sequences. 1354non-nil, CODE actually specifies the first byte of the position code,
1465 It ignores enable-multibyte-characters. */ 1355and CODE2 specifies the second byte.
1466 1356
1467int 1357If dimension of CHARSET is three, and the third optional arg CODE2 and
1468multibyte_chars_in_text (ptr, nbytes) 1358the fourth optional arg CODE3 are both non-nil, CODE actually
1469 unsigned char *ptr; 1359specifies the first byte of the position code, CODE2 the second byte,
1470 int nbytes; 1360and CODE3 the third byte. */)
1361 (charset, code, code2, code3)
1362 Lisp_Object charset, code, code2, code3;
1471{ 1363{
1472 unsigned char *endp; 1364 int id, dimension;
1473 int chars, bytes; 1365 struct charset *charsetp;
1366 unsigned c;
1474 1367
1475 endp = ptr + nbytes; 1368 CHECK_CHARSET_GET_ID (charset, id);
1476 chars = 0; 1369 charsetp = CHARSET_FROM_ID (id);
1477 1370
1478 while (ptr < endp) 1371 if (NILP (code))
1372 code = make_number (CHARSET_MIN_CODE (charsetp));
1373 else
1479 { 1374 {
1480 PARSE_MULTIBYTE_SEQ (ptr, endp - ptr, bytes); 1375 CHECK_NATNUM (code);
1481 ptr += bytes; 1376 dimension = CHARSET_DIMENSION (charsetp);
1482 chars++; 1377
1378 if (!NILP (code2))
1379 {
1380 CHECK_NATNUM (code2);
1381 if (dimension == 3)
1382 CHECK_NATNUM (code3);
1383 }
1483 } 1384 }
1484 1385
1485 return chars; 1386 if (dimension == 1 || NILP (code2))
1387 c = XFASTINT (code);
1388 else if (dimension == 2)
1389 c = (XFASTINT (code) << 8) | XFASTINT (code2);
1390 else if (dimension == 3)
1391 c = (XFASTINT (code) << 16) | (XFASTINT (code2) << 8) | XFASTINT (code3);
1392
1393 c = DECODE_CHAR (charsetp, c);
1394 return make_number (c);
1486} 1395}
1487 1396
1488/* Parse unibyte text at STR of LEN bytes as multibyte text, and 1397
1489 count the numbers of characters and bytes in it. On counting 1398/* Return the first charset in CHARSET_LIST that contains C.
1490 bytes, pay attention to the fact that 8-bit characters in the range 1399 CHARSET_LIST is a list of charset IDs. If it is nil, use
1491 0x80..0x9F are represented by 2 bytes in multibyte text. */ 1400 Vcharset_ordered_list. */
1492void 1401
1493parse_str_as_multibyte (str, len, nchars, nbytes) 1402struct charset *
1494 unsigned char *str; 1403char_charset (c, charset_list, code_return)
1495 int len, *nchars, *nbytes; 1404 int c;
1405 Lisp_Object charset_list;
1406 unsigned *code_return;
1496{ 1407{
1497 unsigned char *endp = str + len; 1408 if (NILP (charset_list))
1498 int n, chars = 0, bytes = 0; 1409 charset_list = Vcharset_ordered_list;
1499 1410
1500 while (str < endp) 1411 while (CONSP (charset_list))
1501 { 1412 {
1502 if (UNIBYTE_STR_AS_MULTIBYTE_P (str, endp - str, n)) 1413 struct charset *charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
1503 str += n, bytes += n; 1414 unsigned code = ENCODE_CHAR (charset, c);
1504 else 1415
1505 str++, bytes += 2; 1416 if (code != CHARSET_INVALID_CODE (charset))
1506 chars++; 1417 {
1418 if (code_return)
1419 *code_return = code;
1420 return charset;
1421 }
1422 charset_list = XCDR (charset_list);
1507 } 1423 }
1508 *nchars = chars; 1424 return NULL;
1509 *nbytes = bytes;
1510 return;
1511} 1425}
1512 1426
1513/* Arrange unibyte text at STR of NBYTES bytes as multibyte text.
1514 It actually converts only 8-bit characters in the range 0x80..0x9F
1515 that don't contruct multibyte characters to multibyte forms. If
1516 NCHARS is nonzero, set *NCHARS to the number of characters in the
1517 text. It is assured that we can use LEN bytes at STR as a work
1518 area and that is enough. Return the number of bytes of the
1519 resulting text. */
1520 1427
1521int 1428DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
1522str_as_multibyte (str, len, nbytes, nchars) 1429 doc: /*Return list of charset and one or two position-codes of CHAR.
1523 unsigned char *str; 1430If CHAR is invalid as a character code,
1524 int len, nbytes, *nchars; 1431return a list of symbol `unknown' and CHAR. */)
1432 (ch)
1433 Lisp_Object ch;
1525{ 1434{
1526 unsigned char *p = str, *endp = str + nbytes; 1435 struct charset *charset;
1527 unsigned char *to; 1436 int c, dimension;
1528 int chars = 0; 1437 unsigned code;
1529 int n; 1438 Lisp_Object val;
1530 1439
1531 while (p < endp && UNIBYTE_STR_AS_MULTIBYTE_P (p, endp - p, n)) 1440 CHECK_CHARACTER (ch);
1532 p += n, chars++; 1441 c = XFASTINT (ch);
1533 if (nchars) 1442 charset = CHAR_CHARSET (c);
1534 *nchars = chars; 1443 if (! charset)
1535 if (p == endp) 1444 return Fcons (intern ("unknown"), Fcons (ch, Qnil));
1536 return nbytes; 1445
1537 1446 code = ENCODE_CHAR (charset, c);
1538 to = p; 1447 if (code == CHARSET_INVALID_CODE (charset))
1539 nbytes = endp - p; 1448 abort ();
1540 endp = str + len; 1449 dimension = CHARSET_DIMENSION (charset);
1541 safe_bcopy (p, endp - nbytes, nbytes); 1450 val = (dimension == 1 ? Fcons (make_number (code), Qnil)
1542 p = endp - nbytes; 1451 : dimension == 2 ? Fcons (make_number (code >> 8),
1543 while (p < endp) 1452 Fcons (make_number (code & 0xFF), Qnil))
1544 { 1453 : Fcons (make_number (code >> 16),
1545 if (UNIBYTE_STR_AS_MULTIBYTE_P (p, endp - p, n)) 1454 Fcons (make_number ((code >> 8) & 0xFF),
1546 { 1455 Fcons (make_number (code & 0xFF), Qnil))));
1547 while (n--) 1456 return Fcons (CHARSET_NAME (charset), val);
1548 *to++ = *p++;
1549 }
1550 else
1551 {
1552 *to++ = LEADING_CODE_8_BIT_CONTROL;
1553 *to++ = *p++ + 0x20;
1554 }
1555 chars++;
1556 }
1557 if (nchars)
1558 *nchars = chars;
1559 return (to - str);
1560} 1457}
1561 1458
1562/* Parse unibyte string at STR of LEN bytes, and return the number of
1563 bytes it may ocupy when converted to multibyte string by
1564 `str_to_multibyte'. */
1565 1459
1566int 1460DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 1, 0,
1567parse_str_to_multibyte (str, len) 1461 doc: /* Return the charset of highest priority that contains CHAR. */)
1568 unsigned char *str; 1462 (ch)
1569 int len; 1463 Lisp_Object ch;
1570{ 1464{
1571 unsigned char *endp = str + len; 1465 struct charset *charset;
1572 int bytes;
1573 1466
1574 for (bytes = 0; str < endp; str++) 1467 CHECK_CHARACTER (ch);
1575 bytes += (*str < 0x80 || *str >= 0xA0) ? 1 : 2; 1468 charset = CHAR_CHARSET (XINT (ch));
1576 return bytes; 1469 return (CHARSET_NAME (charset));
1577} 1470}
1578 1471
1579/* Convert unibyte text at STR of NBYTES bytes to multibyte text
1580 that contains the same single-byte characters. It actually
1581 converts all 8-bit characters to multibyte forms. It is assured
1582 that we can use LEN bytes at STR as a work area and that is
1583 enough. */
1584 1472
1585int 1473DEFUN ("charset-after", Fcharset_after, Scharset_after, 0, 1, 0,
1586str_to_multibyte (str, len, bytes) 1474 doc: /*
1587 unsigned char *str; 1475Return charset of a character in the current buffer at position POS.
1588 int len, bytes; 1476If POS is nil, it defauls to the current point.
1477If POS is out of range, the value is nil. */)
1478 (pos)
1479 Lisp_Object pos;
1589{ 1480{
1590 unsigned char *p = str, *endp = str + bytes; 1481 Lisp_Object ch;
1591 unsigned char *to; 1482 struct charset *charset;
1592 1483
1593 while (p < endp && (*p < 0x80 || *p >= 0xA0)) p++; 1484 ch = Fchar_after (pos);
1594 if (p == endp) 1485 if (! INTEGERP (ch))
1595 return bytes; 1486 return ch;
1596 to = p; 1487 charset = CHAR_CHARSET (XINT (ch));
1597 bytes = endp - p; 1488 return (CHARSET_NAME (charset));
1598 endp = str + len;
1599 safe_bcopy (p, endp - bytes, bytes);
1600 p = endp - bytes;
1601 while (p < endp)
1602 {
1603 if (*p < 0x80 || *p >= 0xA0)
1604 *to++ = *p++;
1605 else
1606 *to++ = LEADING_CODE_8_BIT_CONTROL, *to++ = *p++ + 0x20;
1607 }
1608 return (to - str);
1609} 1489}
1610 1490
1611/* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
1612 actually converts only 8-bit characters in the range 0x80..0x9F to
1613 unibyte forms. */
1614 1491
1615int 1492DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
1616str_as_unibyte (str, bytes) 1493 doc: /*
1617 unsigned char *str; 1494Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.
1618 int bytes; 1495
1496ISO 2022's designation sequence (escape sequence) distinguishes charsets
1497by their DIMENSION, CHARS, and FINAL-CHAR,
1498where as Emacs distinguishes them by charset symbol.
1499See the documentation of the function `charset-info' for the meanings of
1500DIMENSION, CHARS, and FINAL-CHAR. */)
1501 (dimension, chars, final_char)
1502 Lisp_Object dimension, chars, final_char;
1619{ 1503{
1620 unsigned char *p = str, *endp = str + bytes; 1504 int id;
1621 unsigned char *to = str;
1622 1505
1623 while (p < endp && *p != LEADING_CODE_8_BIT_CONTROL) p++; 1506 check_iso_charset_parameter (dimension, chars, final_char);
1624 to = p; 1507 id = ISO_CHARSET_TABLE (XFASTINT (dimension), XFASTINT (chars),
1625 while (p < endp) 1508 XFASTINT (final_char));
1626 { 1509 return (id >= 0 ? CHARSET_NAME (CHARSET_FROM_ID (id)) : Qnil);
1627 if (*p == LEADING_CODE_8_BIT_CONTROL)
1628 *to++ = *(p + 1) - 0x20, p += 2;
1629 else
1630 *to++ = *p++;
1631 }
1632 return (to - str);
1633} 1510}
1634 1511
1635 1512
1636DEFUN ("string", Fstring, Sstring, 1, MANY, 0, 1513DEFUN ("clear-charset-maps", Fclear_charset_maps, Sclear_charset_maps,
1637 doc: /* Concatenate all the argument characters and make the result a string. 1514 0, 0, 0,
1638usage: (string &rest CHARACTERS) */) 1515 doc: /*
1639 (n, args) 1516Clear encoder and decoder of charsets that are loaded from mapfiles. */)
1640 int n; 1517 ()
1641 Lisp_Object *args;
1642{ 1518{
1643 int i; 1519 int i;
1644 unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n); 1520 struct charset *charset;
1645 unsigned char *p = buf; 1521 Lisp_Object attrs;
1646 int c;
1647 int multibyte = 0;
1648 1522
1649 for (i = 0; i < n; i++) 1523 for (i = 0; i < charset_table_used; i++)
1650 { 1524 {
1651 CHECK_NUMBER (args[i]); 1525 charset = CHARSET_FROM_ID (i);
1652 if (!multibyte && !SINGLE_BYTE_CHAR_P (XFASTINT (args[i]))) 1526 attrs = CHARSET_ATTRIBUTES (charset);
1653 multibyte = 1; 1527
1528 if (CHARSET_METHOD (charset) == CHARSET_METHOD_MAP)
1529 {
1530 CHARSET_ATTR_DECODER (attrs) = Qnil;
1531 CHARSET_ATTR_ENCODER (attrs) = Qnil;
1532 CHARSET_METHOD (charset) = CHARSET_METHOD_MAP_DEFERRED;
1533 }
1534
1535 if (CHARSET_UNIFIED_P (charset))
1536 CHARSET_ATTR_DEUNIFIER (attrs) = Qnil;
1654 } 1537 }
1655 1538
1656 for (i = 0; i < n; i++) 1539 if (CHAR_TABLE_P (Vchar_unified_charset_table))
1657 { 1540 {
1658 c = XINT (args[i]); 1541 Foptimize_char_table (Vchar_unified_charset_table);
1659 if (multibyte) 1542 Vchar_unify_table = Vchar_unified_charset_table;
1660 p += CHAR_STRING (c, p); 1543 Vchar_unified_charset_table = Qnil;
1661 else
1662 *p++ = c;
1663 } 1544 }
1664 1545
1665 return make_string_from_bytes (buf, n, p - buf); 1546 return Qnil;
1666} 1547}
1667 1548
1668#endif /* emacs */
1669 1549
1670int 1550void
1671charset_id_internal (charset_name) 1551init_charset ()
1672 char *charset_name;
1673{ 1552{
1674 Lisp_Object val;
1675
1676 val= Fget (intern (charset_name), Qcharset);
1677 if (!VECTORP (val))
1678 error ("Charset %s is not defined", charset_name);
1679 1553
1680 return (XINT (XVECTOR (val)->contents[0]));
1681} 1554}
1682 1555
1683DEFUN ("setup-special-charsets", Fsetup_special_charsets,
1684 Ssetup_special_charsets, 0, 0, 0, doc: /* Internal use only. */)
1685 ()
1686{
1687 charset_latin_iso8859_1 = charset_id_internal ("latin-iso8859-1");
1688 charset_jisx0208_1978 = charset_id_internal ("japanese-jisx0208-1978");
1689 charset_jisx0208 = charset_id_internal ("japanese-jisx0208");
1690 charset_katakana_jisx0201 = charset_id_internal ("katakana-jisx0201");
1691 charset_latin_jisx0201 = charset_id_internal ("latin-jisx0201");
1692 charset_big5_1 = charset_id_internal ("chinese-big5-1");
1693 charset_big5_2 = charset_id_internal ("chinese-big5-2");
1694 return Qnil;
1695}
1696 1556
1697void 1557void
1698init_charset_once () 1558init_charset_once ()
1699{ 1559{
1700 int i, j, k; 1560 int i, j, k;
1701 1561
1702 staticpro (&Vcharset_table); 1562 for (i = 0; i < ISO_MAX_DIMENSION; i++)
1703 staticpro (&Vcharset_symbol_table); 1563 for (j = 0; j < ISO_MAX_CHARS; j++)
1704 staticpro (&Vgeneric_character_list); 1564 for (k = 0; k < ISO_MAX_FINAL; k++)
1565 iso_charset_table[i][j][k] = -1;
1705 1566
1706 /* This has to be done here, before we call Fmake_char_table. */ 1567 for (i = 0; i < 255; i++)
1707 Qcharset_table = intern ("charset-table"); 1568 emacs_mule_charset[i] = NULL;
1708 staticpro (&Qcharset_table); 1569
1570#if 0
1571 Vchar_charset_set = Fmake_char_table (Qnil, Qnil);
1572 CHAR_TABLE_SET (Vchar_charset_set, make_number (97), Qnil);
1573
1574 DEFSYM (Qcharset_encode_table, "charset-encode-table");
1709 1575
1710 /* Intern this now in case it isn't already done. 1576 /* Intern this now in case it isn't already done.
1711 Setting this variable twice is harmless. 1577 Setting this variable twice is harmless.
1712 But don't staticpro it here--that is done in alloc.c. */ 1578 But don't staticpro it here--that is done in alloc.c. */
1713 Qchar_table_extra_slots = intern ("char-table-extra-slots"); 1579 Qchar_table_extra_slots = intern ("char-table-extra-slots");
1714 1580
1715 /* Now we are ready to set up this property, so we can 1581 /* Now we are ready to set up this property, so we can create syntax
1716 create the charset table. */ 1582 tables. */
1717 Fput (Qcharset_table, Qchar_table_extra_slots, make_number (0)); 1583 Fput (Qcharset_encode_table, Qchar_table_extra_slots, make_number (0));
1718 Vcharset_table = Fmake_char_table (Qcharset_table, Qnil); 1584#endif
1719
1720 Qunknown = intern ("unknown");
1721 staticpro (&Qunknown);
1722 Vcharset_symbol_table = Fmake_vector (make_number (MAX_CHARSET + 1),
1723 Qunknown);
1724
1725 /* Setup tables. */
1726 for (i = 0; i < 2; i++)
1727 for (j = 0; j < 2; j++)
1728 for (k = 0; k < 128; k++)
1729 iso_charset_table [i][j][k] = -1;
1730
1731 for (i = 0; i < 256; i++)
1732 bytes_by_char_head[i] = 1;
1733 bytes_by_char_head[LEADING_CODE_PRIVATE_11] = 3;
1734 bytes_by_char_head[LEADING_CODE_PRIVATE_12] = 3;
1735 bytes_by_char_head[LEADING_CODE_PRIVATE_21] = 4;
1736 bytes_by_char_head[LEADING_CODE_PRIVATE_22] = 4;
1737
1738 for (i = 0; i < 128; i++)
1739 width_by_char_head[i] = 1;
1740 for (; i < 256; i++)
1741 width_by_char_head[i] = 4;
1742 width_by_char_head[LEADING_CODE_PRIVATE_11] = 1;
1743 width_by_char_head[LEADING_CODE_PRIVATE_12] = 2;
1744 width_by_char_head[LEADING_CODE_PRIVATE_21] = 1;
1745 width_by_char_head[LEADING_CODE_PRIVATE_22] = 2;
1746
1747 {
1748 Lisp_Object val;
1749
1750 val = Qnil;
1751 for (i = 0x81; i < 0x90; i++)
1752 val = Fcons (make_number ((i - 0x70) << 7), val);
1753 for (; i < 0x9A; i++)
1754 val = Fcons (make_number ((i - 0x8F) << 14), val);
1755 for (i = 0xA0; i < 0xF0; i++)
1756 val = Fcons (make_number ((i - 0x70) << 7), val);
1757 for (; i < 0xFF; i++)
1758 val = Fcons (make_number ((i - 0xE0) << 14), val);
1759 Vgeneric_character_list = Fnreverse (val);
1760 }
1761
1762 nonascii_insert_offset = 0;
1763 Vnonascii_translation_table = Qnil;
1764} 1585}
1765 1586
1766#ifdef emacs 1587#ifdef emacs
@@ -1768,141 +1589,128 @@ init_charset_once ()
1768void 1589void
1769syms_of_charset () 1590syms_of_charset ()
1770{ 1591{
1771 Qcharset = intern ("charset"); 1592 char *p;
1772 staticpro (&Qcharset); 1593
1773 1594 DEFSYM (Qcharsetp, "charsetp");
1774 Qascii = intern ("ascii"); 1595
1775 staticpro (&Qascii); 1596 DEFSYM (Qascii, "ascii");
1776 1597 DEFSYM (Qunicode, "unicode");
1777 Qeight_bit_control = intern ("eight-bit-control"); 1598 DEFSYM (Qeight_bit_control, "eight-bit-control");
1778 staticpro (&Qeight_bit_control); 1599 DEFSYM (Qeight_bit_graphic, "eight-bit-graphic");
1779 1600 DEFSYM (Qiso_8859_1, "iso-8859-1");
1780 Qeight_bit_graphic = intern ("eight-bit-graphic"); 1601
1781 staticpro (&Qeight_bit_graphic); 1602 DEFSYM (Qgl, "gl");
1782 1603 DEFSYM (Qgr, "gr");
1783 /* Define special charsets ascii, eight-bit-control, and 1604
1784 eight-bit-graphic. */ 1605 p = (char *) xmalloc (30000);
1785 update_charset_table (make_number (CHARSET_ASCII), 1606
1786 make_number (1), make_number (94), 1607 staticpro (&Vcharset_ordered_list);
1787 make_number (1), 1608 Vcharset_ordered_list = Qnil;
1788 make_number (0), 1609
1789 make_number ('B'), 1610 staticpro (&Viso_2022_charset_list);
1790 make_number (0), 1611 Viso_2022_charset_list = Qnil;
1791 build_string ("ASCII"), 1612
1792 Qnil, /* same as above */ 1613 staticpro (&Vemacs_mule_charset_list);
1793 build_string ("ASCII (ISO646 IRV)")); 1614 Vemacs_mule_charset_list = Qnil;
1794 CHARSET_SYMBOL (CHARSET_ASCII) = Qascii; 1615
1795 Fput (Qascii, Qcharset, CHARSET_TABLE_ENTRY (CHARSET_ASCII)); 1616 staticpro (&Vcharset_hash_table);
1796 1617 Vcharset_hash_table = Fmakehash (Qeq);
1797 update_charset_table (make_number (CHARSET_8_BIT_CONTROL), 1618
1798 make_number (1), make_number (96), 1619 charset_table_size = 128;
1799 make_number (4), 1620 charset_table = ((struct charset *)
1800 make_number (0), 1621 xmalloc (sizeof (struct charset) * charset_table_size));
1801 make_number (-1), 1622 charset_table_used = 0;
1802 make_number (-1), 1623
1803 build_string ("8-bit control code (0x80..0x9F)"), 1624 staticpro (&Vchar_unified_charset_table);
1804 Qnil, /* same as above */ 1625 Vchar_unified_charset_table = Fmake_char_table (Qnil, make_number (-1));
1805 Qnil); /* same as above */ 1626
1806 CHARSET_SYMBOL (CHARSET_8_BIT_CONTROL) = Qeight_bit_control; 1627 defsubr (&Scharsetp);
1807 Fput (Qeight_bit_control, Qcharset, 1628 defsubr (&Smap_charset_chars);
1808 CHARSET_TABLE_ENTRY (CHARSET_8_BIT_CONTROL)); 1629 defsubr (&Sdefine_charset_internal);
1809 1630 defsubr (&Sdefine_charset_alias);
1810 update_charset_table (make_number (CHARSET_8_BIT_GRAPHIC), 1631 defsubr (&Sprimary_charset);
1811 make_number (1), make_number (96), 1632 defsubr (&Sset_primary_charset);
1812 make_number (4), 1633 defsubr (&Scharset_plist);
1813 make_number (0), 1634 defsubr (&Sset_charset_plist);
1814 make_number (-1), 1635 defsubr (&Sunify_charset);
1815 make_number (-1),
1816 build_string ("8-bit graphic char (0xA0..0xFF)"),
1817 Qnil, /* same as above */
1818 Qnil); /* same as above */
1819 CHARSET_SYMBOL (CHARSET_8_BIT_GRAPHIC) = Qeight_bit_graphic;
1820 Fput (Qeight_bit_graphic, Qcharset,
1821 CHARSET_TABLE_ENTRY (CHARSET_8_BIT_GRAPHIC));
1822
1823 Qauto_fill_chars = intern ("auto-fill-chars");
1824 staticpro (&Qauto_fill_chars);
1825 Fput (Qauto_fill_chars, Qchar_table_extra_slots, make_number (0));
1826
1827 defsubr (&Sdefine_charset);
1828 defsubr (&Sgeneric_character_list);
1829 defsubr (&Sget_unused_iso_final_char); 1636 defsubr (&Sget_unused_iso_final_char);
1830 defsubr (&Sdeclare_equiv_charset); 1637 defsubr (&Sdeclare_equiv_charset);
1831 defsubr (&Sfind_charset_region); 1638 defsubr (&Sfind_charset_region);
1832 defsubr (&Sfind_charset_string); 1639 defsubr (&Sfind_charset_string);
1833 defsubr (&Smake_char_internal); 1640 defsubr (&Sdecode_char);
1641 defsubr (&Sencode_char);
1834 defsubr (&Ssplit_char); 1642 defsubr (&Ssplit_char);
1643 defsubr (&Smake_char);
1835 defsubr (&Schar_charset); 1644 defsubr (&Schar_charset);
1836 defsubr (&Scharset_after); 1645 defsubr (&Scharset_after);
1837 defsubr (&Siso_charset); 1646 defsubr (&Siso_charset);
1838 defsubr (&Schar_valid_p); 1647 defsubr (&Sclear_charset_maps);
1839 defsubr (&Sunibyte_char_to_multibyte); 1648
1840 defsubr (&Smultibyte_char_to_unibyte); 1649 DEFVAR_LISP ("charset-map-directory", &Vcharset_map_directory,
1841 defsubr (&Schar_bytes); 1650 doc: /* Directory of charset map files that come with GNU Emacs.
1842 defsubr (&Schar_width); 1651The default value is \"\\[data-directory]/charsets\". */);
1843 defsubr (&Sstring_width); 1652 Vcharset_map_directory = Fexpand_file_name (build_string ("charsets"),
1844 defsubr (&Schar_direction); 1653 Vdata_directory);
1845 defsubr (&Schars_in_region);
1846 defsubr (&Sstring);
1847 defsubr (&Ssetup_special_charsets);
1848 1654
1849 DEFVAR_LISP ("charset-list", &Vcharset_list, 1655 DEFVAR_LISP ("charset-list", &Vcharset_list,
1850 doc: /* List of charsets ever defined. */); 1656 doc: /* List of charsets ever defined. */);
1851 Vcharset_list = Fcons (Qascii, Fcons (Qeight_bit_control, 1657 Vcharset_list = Qnil;
1852 Fcons (Qeight_bit_graphic, Qnil))); 1658
1853 1659 /* Make the prerequisite charsets `ascii' and `unicode'. */
1854 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector, 1660 {
1855 doc: /* Vector of cons cell of a symbol and translation table ever defined. 1661 Lisp_Object args[charset_arg_max];
1856An ID of a translation table is an index of this vector. */); 1662 Lisp_Object plist[14];
1857 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil); 1663 Lisp_Object val;
1858 1664
1859 DEFVAR_INT ("leading-code-private-11", &leading_code_private_11, 1665 plist[0] = intern (":name");
1860 doc: /* Leading-code of private TYPE9N charset of column-width 1. */); 1666 plist[1] = args[charset_arg_name] = Qascii;
1861 leading_code_private_11 = LEADING_CODE_PRIVATE_11; 1667 plist[2] = intern (":dimension");
1862 1668 plist[3] = args[charset_arg_dimension] = make_number (1);
1863 DEFVAR_INT ("leading-code-private-12", &leading_code_private_12, 1669 val = Fmake_vector (make_number (8), make_number (0));
1864 doc: /* Leading-code of private TYPE9N charset of column-width 2. */); 1670 ASET (val, 1, make_number (127));
1865 leading_code_private_12 = LEADING_CODE_PRIVATE_12; 1671 plist[4] = intern (":code-space");
1866 1672 plist[5] = args[charset_arg_code_space] = val;
1867 DEFVAR_INT ("leading-code-private-21", &leading_code_private_21, 1673 plist[6] = intern (":iso-final-char");
1868 doc: /* Leading-code of private TYPE9Nx9N charset of column-width 1. */); 1674 plist[7] = args[charset_arg_iso_final] = make_number ('B');
1869 leading_code_private_21 = LEADING_CODE_PRIVATE_21; 1675 args[charset_arg_iso_revision] = Qnil;
1870 1676 plist[8] = intern (":emacs-mule-id");
1871 DEFVAR_INT ("leading-code-private-22", &leading_code_private_22, 1677 plist[9] = args[charset_arg_emacs_mule_id] = make_number (0);
1872 doc: /* Leading-code of private TYPE9Nx9N charset of column-width 2. */); 1678 plist[10] = intern (":ascii-compatible-p");
1873 leading_code_private_22 = LEADING_CODE_PRIVATE_22; 1679 plist[11] = args[charset_arg_ascii_compatible_p] = Qt;
1874 1680 args[charset_arg_supplementary_p] = Qnil;
1875 DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset, 1681 args[charset_arg_invalid_code] = Qnil;
1876 doc: /* Offset for converting non-ASCII unibyte codes 0240...0377 to multibyte. 1682 plist[12] = intern (":code-offset");
1877This is used for converting unibyte text to multibyte, 1683 plist[13] = args[charset_arg_code_offset] = make_number (0);
1878and for inserting character codes specified by number. 1684 args[charset_arg_map] = Qnil;
1879 1685 args[charset_arg_parents] = Qnil;
1880This serves to convert a Latin-1 or similar 8-bit character code 1686 args[charset_arg_unify_map] = Qnil;
1881to the corresponding Emacs multibyte character code. 1687 /* The actual plist is set by mule-conf.el. */
1882Typically the value should be (- (make-char CHARSET 0) 128), 1688 args[charset_arg_plist] = Flist (14, plist);
1883for your choice of character set. 1689 Fdefine_charset_internal (charset_arg_max, args);
1884If `nonascii-translation-table' is non-nil, it overrides this variable. */); 1690 charset_ascii = CHARSET_SYMBOL_ID (Qascii);
1885 nonascii_insert_offset = 0; 1691
1886 1692 plist[1] = args[charset_arg_name] = Qunicode;
1887 DEFVAR_LISP ("nonascii-translation-table", &Vnonascii_translation_table, 1693 plist[3] = args[charset_arg_dimension] = make_number (3);
1888 doc: /* Translation table to convert non-ASCII unibyte codes to multibyte. 1694 val = Fmake_vector (make_number (8), make_number (0));
1889This is used for converting unibyte text to multibyte, 1695 ASET (val, 1, make_number (255));
1890and for inserting character codes specified by number. 1696 ASET (val, 3, make_number (255));
1891 1697 ASET (val, 5, make_number (16));
1892Conversion is performed only when multibyte characters are enabled, 1698 plist[5] = args[charset_arg_code_space] = val;
1893and it serves to convert a Latin-1 or similar 8-bit character code 1699 plist[7] = args[charset_arg_iso_final] = Qnil;
1894to the corresponding Emacs character code. 1700 args[charset_arg_iso_revision] = Qnil;
1895 1701 plist[9] = args[charset_arg_emacs_mule_id] = Qnil;
1896If this is nil, `nonascii-insert-offset' is used instead. 1702 plist[11] = args[charset_arg_ascii_compatible_p] = Qt;
1897See also the docstring of `make-translation-table'. */); 1703 args[charset_arg_supplementary_p] = Qnil;
1898 Vnonascii_translation_table = Qnil; 1704 args[charset_arg_invalid_code] = Qnil;
1899 1705 plist[13] = args[charset_arg_code_offset] = make_number (0);
1900 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars, 1706 args[charset_arg_map] = Qnil;
1901 doc: /* A char-table for characters which invoke auto-filling. 1707 args[charset_arg_parents] = Qnil;
1902Such characters have value t in this table. */); 1708 args[charset_arg_unify_map] = Qnil;
1903 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil); 1709 /* The actual plist is set by mule-conf.el. */
1904 CHAR_TABLE_SET (Vauto_fill_chars, make_number (' '), Qt); 1710 args[charset_arg_plist] = Flist (14, plist);
1905 CHAR_TABLE_SET (Vauto_fill_chars, make_number ('\n'), Qt); 1711 Fdefine_charset_internal (charset_arg_max, args);
1712 charset_unicode = CHARSET_SYMBOL_ID (Qunicode);
1713 }
1906} 1714}
1907 1715
1908#endif /* emacs */ 1716#endif /* emacs */
diff --git a/src/charset.h b/src/charset.h
index d4e85d91ebf..58649eabb51 100644
--- a/src/charset.h
+++ b/src/charset.h
@@ -1,7 +1,10 @@
1/* Header for multibyte character handler. 1/* Header for charset handler.
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN. 2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation. 3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001 Free Software Foundation, Inc. 4 Copyright (C) 2001 Free Software Foundation, Inc.
5 Copyright (C) 2001, 2002
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
5 8
6This file is part of GNU Emacs. 9This file is part of GNU Emacs.
7 10
@@ -23,806 +26,428 @@ Boston, MA 02111-1307, USA. */
23#ifndef EMACS_CHARSET_H 26#ifndef EMACS_CHARSET_H
24#define EMACS_CHARSET_H 27#define EMACS_CHARSET_H
25 28
26/* #define BYTE_COMBINING_DEBUG */ 29/* Index to arguments of Fdefine_charset_internal. */
27 30
28/*** GENERAL NOTE on CHARACTER SET (CHARSET) *** 31enum define_charset_arg_index
29 32 {
30 A character set ("charset" hereafter) is a meaningful collection 33 charset_arg_name,
31 (i.e. language, culture, functionality, etc) of characters. Emacs 34 charset_arg_dimension,
32 handles multiple charsets at once. Each charset corresponds to one 35 charset_arg_code_space,
33 of the ISO charsets. Emacs identifies a charset by a unique 36 charset_arg_iso_final,
34 identification number, whereas ISO identifies a charset by a triplet 37 charset_arg_iso_revision,
35 of DIMENSION, CHARS and FINAL-CHAR. So, hereafter, just saying 38 charset_arg_emacs_mule_id,
36 "charset" means an identification number (integer value). 39 charset_arg_ascii_compatible_p,
37 40 charset_arg_supplementary_p,
38 The value range of charsets is 0x00, 0x81..0xFE. There are four 41 charset_arg_invalid_code,
39 kinds of charset depending on DIMENSION (1 or 2) and CHARS (94 or 42 charset_arg_code_offset,
40 96). For instance, a charset of DIMENSION2_CHARS94 contains 94x94 43 charset_arg_map,
41 characters. 44 charset_arg_parents,
42 45 charset_arg_unify_map,
43 Within Emacs Lisp, a charset is treated as a symbol which has a 46 charset_arg_plist,
44 property `charset'. The property value is a vector containing 47 charset_arg_max
45 various information about the charset. For readability of C code, 48 };
46 we use the following convention for C variable names: 49
47 charset_symbol: Emacs Lisp symbol of a charset 50
48 charset_id: Emacs Lisp integer of an identification number of a charset 51/* Indices to charset attributes vector. */
49 charset: C integer of an identification number of a charset 52
50 53enum charset_attr_index
51 Each charset (except for ascii) is assigned a base leading-code 54 {
52 (range 0x80..0x9E). In addition, a charset of greater than 0xA0 55 /* ID number of the charset. */
53 (whose base leading-code is 0x9A..0x9D) is assigned an extended 56 charset_id,
54 leading-code (range 0xA0..0xFE). In this case, each base
55 leading-code specifies the allowable range of extended leading-code
56 as shown in the table below. A leading-code is used to represent a
57 character in Emacs' buffer and string.
58
59 We call a charset which has extended leading-code a "private
60 charset" because those are mainly for a charset which is not yet
61 registered by ISO. On the contrary, we call a charset which does
62 not have extended leading-code an "official charset".
63
64 ---------------------------------------------------------------------------
65 charset dimension base leading-code extended leading-code
66 ---------------------------------------------------------------------------
67 0x00 official dim1 -- none -- -- none --
68 (ASCII)
69 0x01..0x7F --never used--
70 0x80 official dim1 -- none -- -- none --
71 (eight-bit-graphic)
72 0x81..0x8F official dim1 same as charset -- none --
73 0x90..0x99 official dim2 same as charset -- none --
74 0x9A..0x9D --never used--
75 0x9E official dim1 same as charset -- none --
76 (eight-bit-control)
77 0x9F --never used--
78 0xA0..0xDF private dim1 0x9A same as charset
79 of 1-column width
80 0xE0..0xEF private dim1 0x9B same as charset
81 of 2-column width
82 0xF0..0xF4 private dim2 0x9C same as charset
83 of 1-column width
84 0xF5..0xFE private dim2 0x9D same as charset
85 of 2-column width
86 0xFF --never used--
87 ---------------------------------------------------------------------------
88
89*/
90
91/* Definition of special leading-codes. */
92/* Leading-code followed by extended leading-code. */
93#define LEADING_CODE_PRIVATE_11 0x9A /* for private DIMENSION1 of 1-column */
94#define LEADING_CODE_PRIVATE_12 0x9B /* for private DIMENSION1 of 2-column */
95#define LEADING_CODE_PRIVATE_21 0x9C /* for private DIMENSION2 of 1-column */
96#define LEADING_CODE_PRIVATE_22 0x9D /* for private DIMENSION2 of 2-column */
97
98#define LEADING_CODE_8_BIT_CONTROL 0x9E /* for `eight-bit-control' */
99
100/* Extended leading-code. */
101/* Start of each extended leading-codes. */
102#define LEADING_CODE_EXT_11 0xA0 /* follows LEADING_CODE_PRIVATE_11 */
103#define LEADING_CODE_EXT_12 0xE0 /* follows LEADING_CODE_PRIVATE_12 */
104#define LEADING_CODE_EXT_21 0xF0 /* follows LEADING_CODE_PRIVATE_21 */
105#define LEADING_CODE_EXT_22 0xF5 /* follows LEADING_CODE_PRIVATE_22 */
106/* Maximum value of extended leading-codes. */
107#define LEADING_CODE_EXT_MAX 0xFE
108
109/* Definition of minimum/maximum charset of each DIMENSION. */
110#define MIN_CHARSET_OFFICIAL_DIMENSION1 0x80
111#define MAX_CHARSET_OFFICIAL_DIMENSION1 0x8F
112#define MIN_CHARSET_OFFICIAL_DIMENSION2 0x90
113#define MAX_CHARSET_OFFICIAL_DIMENSION2 0x99
114#define MIN_CHARSET_PRIVATE_DIMENSION1 LEADING_CODE_EXT_11
115#define MIN_CHARSET_PRIVATE_DIMENSION2 LEADING_CODE_EXT_21
116
117/* Maximum value of overall charset identification number. */
118#define MAX_CHARSET 0xFE
119
120/* Definition of special charsets. */
121#define CHARSET_ASCII 0 /* 0x00..0x7F */
122#define CHARSET_8_BIT_CONTROL 0x9E /* 0x80..0x9F */
123#define CHARSET_8_BIT_GRAPHIC 0x80 /* 0xA0..0xFF */
124
125extern int charset_latin_iso8859_1; /* ISO8859-1 (Latin-1) */
126extern int charset_jisx0208_1978; /* JISX0208.1978 (Japanese Kanji old set) */
127extern int charset_jisx0208; /* JISX0208.1983 (Japanese Kanji) */
128extern int charset_katakana_jisx0201; /* JISX0201.Kana (Japanese Katakana) */
129extern int charset_latin_jisx0201; /* JISX0201.Roman (Japanese Roman) */
130extern int charset_big5_1; /* Big5 Level 1 (Chinese Traditional) */
131extern int charset_big5_2; /* Big5 Level 2 (Chinese Traditional) */
132
133/* Check if CH is an ASCII character or a base leading-code.
134 Nowadays, any byte can be the first byte of a character in a
135 multibyte buffer/string. So this macro name is not appropriate. */
136#define CHAR_HEAD_P(ch) ((unsigned char) (ch) < 0xA0)
137
138/*** GENERAL NOTE on CHARACTER REPRESENTATION ***
139
140 Firstly, the term "character" or "char" is used for a multilingual
141 character (of course, including ASCII characters), not for a byte in
142 computer memory. We use the term "code" or "byte" for the latter
143 case.
144
145 A character is identified by charset and one or two POSITION-CODEs.
146 POSITION-CODE is the position of the character in the charset. A
147 character of DIMENSION1 charset has one POSITION-CODE: POSITION-CODE-1.
148 A character of DIMENSION2 charset has two POSITION-CODE:
149 POSITION-CODE-1 and POSITION-CODE-2. The code range of
150 POSITION-CODE is 0x20..0x7F.
151
152 Emacs has two kinds of representation of a character: multi-byte
153 form (for buffers and strings) and single-word form (for character
154 objects in Emacs Lisp). The latter is called "character code"
155 hereafter. Both representations encode the information of charset
156 and POSITION-CODE but in a different way (for instance, the MSB of
157 POSITION-CODE is set in multi-byte form).
158
159 For details of the multi-byte form, see the section "2. Emacs
160 internal format handlers" of `coding.c'.
161
162 Emacs uses 19 bits for a character code. The bits are divided into
163 3 fields: FIELD1(5bits):FIELD2(7bits):FIELD3(7bits).
164
165 A character code of DIMENSION1 character uses FIELD2 to hold charset
166 and FIELD3 to hold POSITION-CODE-1. A character code of DIMENSION2
167 character uses FIELD1 to hold charset, FIELD2 and FIELD3 to hold
168 POSITION-CODE-1 and POSITION-CODE-2 respectively.
169
170 More precisely...
171
172 FIELD2 of DIMENSION1 character (except for ascii, eight-bit-control,
173 and eight-bit-graphic) is "charset - 0x70". This is to make all
174 character codes except for ASCII and 8-bit codes greater than 256.
175 So, the range of FIELD2 of DIMENSION1 character is 0, 1, or
176 0x11..0x7F.
177
178 FIELD1 of DIMENSION2 character is "charset - 0x8F" for official
179 charset and "charset - 0xE0" for private charset. So, the range of
180 FIELD1 of DIMENSION2 character is 0x01..0x1E.
181
182 -----------------------------------------------------------------------------
183 charset FIELD1 (5-bit) FIELD2 (7-bit) FIELD3 (7-bit)
184 -----------------------------------------------------------------------------
185 ascii 0 0 0x00..0x7F
186 eight-bit-control 0 1 0x00..0x1F
187 eight-bit-graphic 0 1 0x20..0x7F
188 DIMENSION1 0 charset - 0x70 POSITION-CODE-1
189 DIMENSION2(o) charset - 0x8F POSITION-CODE-1 POSITION-CODE-2
190 DIMENSION2(p) charset - 0xE0 POSITION-CODE-1 POSITION-CODE-2
191 -----------------------------------------------------------------------------
192 "(o)": official, "(p)": private
193 -----------------------------------------------------------------------------
194*/
195
196/* Masks of each field of character code. */
197#define CHAR_FIELD1_MASK (0x1F << 14)
198#define CHAR_FIELD2_MASK (0x7F << 7)
199#define CHAR_FIELD3_MASK 0x7F
200
201/* Macros to access each field of character C. */
202#define CHAR_FIELD1(c) (((c) & CHAR_FIELD1_MASK) >> 14)
203#define CHAR_FIELD2(c) (((c) & CHAR_FIELD2_MASK) >> 7)
204#define CHAR_FIELD3(c) ((c) & CHAR_FIELD3_MASK)
205
206/* Minimum character code of character of each DIMENSION. */
207#define MIN_CHAR_OFFICIAL_DIMENSION1 \
208 ((0x81 - 0x70) << 7)
209#define MIN_CHAR_PRIVATE_DIMENSION1 \
210 ((MIN_CHARSET_PRIVATE_DIMENSION1 - 0x70) << 7)
211#define MIN_CHAR_OFFICIAL_DIMENSION2 \
212 ((MIN_CHARSET_OFFICIAL_DIMENSION2 - 0x8F) << 14)
213#define MIN_CHAR_PRIVATE_DIMENSION2 \
214 ((MIN_CHARSET_PRIVATE_DIMENSION2 - 0xE0) << 14)
215/* Maximum character code currently used plus 1. */
216#define MAX_CHAR (0x1F << 14)
217
218/* 1 if C is a single byte character, else 0. */
219#define SINGLE_BYTE_CHAR_P(c) ((unsigned) (c) < 0x100)
220
221/* 1 if BYTE is an ASCII character in itself, in multibyte mode. */
222#define ASCII_BYTE_P(byte) ((byte) < 0x80)
223
224/* A char-table containing information on each character set.
225
226 Unlike ordinary char-tables, this doesn't contain any nested tables.
227 Only the top level elements are used. Each element is a vector of
228 the following information:
229 CHARSET-ID, BYTES, DIMENSION, CHARS, WIDTH, DIRECTION,
230 LEADING-CODE-BASE, LEADING-CODE-EXT,
231 ISO-FINAL-CHAR, ISO-GRAPHIC-PLANE,
232 REVERSE-CHARSET, SHORT-NAME, LONG-NAME, DESCRIPTION,
233 PLIST.
234
235 CHARSET-ID (integer) is the identification number of the charset.
236
237 BYTES (integer) is the length of the multi-byte form of a character
238 in the charset: one of 1, 2, 3, and 4.
239
240 DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.
241
242 CHARS (integer) is the number of characters in a dimension: 94 or 96.
243
244 WIDTH (integer) is the number of columns a character in the charset
245 occupies on the screen: one of 0, 1, and 2..
246
247 DIRECTION (integer) is the rendering direction of characters in the
248 charset when rendering. If 0, render from left to right, else
249 render from right to left.
250
251 LEADING-CODE-BASE (integer) is the base leading-code for the
252 charset.
253
254 LEADING-CODE-EXT (integer) is the extended leading-code for the
255 charset. All charsets of less than 0xA0 have the value 0.
256
257 ISO-FINAL-CHAR (character) is the final character of the
258 corresponding ISO 2022 charset. It is -1 for a charset
259 that is used only internally (e.g. `eight-bit-control').
260
261 ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked
262 while encoding to variants of ISO 2022 coding system, one of the
263 following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR). It
264 is -1 for a charset that is used only internally
265 (e.g. `eight-bit-control').
266
267 REVERSE-CHARSET (integer) is the charset which differs only in
268 LEFT-TO-RIGHT value from the charset. If there is no such
269 charset, the value is -1.
270
271 SHORT-NAME (string) is the short name to refer to the charset.
272
273 LONG-NAME (string) is the long name to refer to the charset.
274
275 DESCRIPTION (string) is the description string of the charset.
276
277 PLIST (property list) may contain any type of information a user
278 wants to put and get by functions `put-charset-property' and
279 `get-charset-property' respectively. */
280extern Lisp_Object Vcharset_table;
281
282/* Macros to access various information of CHARSET in Vcharset_table.
283 We provide these macros for efficiency. No range check of CHARSET. */
284
285/* Return entry of CHARSET (C integer) in Vcharset_table. */
286#define CHARSET_TABLE_ENTRY(charset) \
287 XCHAR_TABLE (Vcharset_table)->contents[((charset) == CHARSET_ASCII \
288 ? 0 : (charset) + 128)]
289
290/* Return information INFO-IDX of CHARSET. */
291#define CHARSET_TABLE_INFO(charset, info_idx) \
292 XVECTOR (CHARSET_TABLE_ENTRY (charset))->contents[info_idx]
293
294#define CHARSET_ID_IDX (0)
295#define CHARSET_BYTES_IDX (1)
296#define CHARSET_DIMENSION_IDX (2)
297#define CHARSET_CHARS_IDX (3)
298#define CHARSET_WIDTH_IDX (4)
299#define CHARSET_DIRECTION_IDX (5)
300#define CHARSET_LEADING_CODE_BASE_IDX (6)
301#define CHARSET_LEADING_CODE_EXT_IDX (7)
302#define CHARSET_ISO_FINAL_CHAR_IDX (8)
303#define CHARSET_ISO_GRAPHIC_PLANE_IDX (9)
304#define CHARSET_REVERSE_CHARSET_IDX (10)
305#define CHARSET_SHORT_NAME_IDX (11)
306#define CHARSET_LONG_NAME_IDX (12)
307#define CHARSET_DESCRIPTION_IDX (13)
308#define CHARSET_PLIST_IDX (14)
309/* Size of a vector of each entry of Vcharset_table. */
310#define CHARSET_MAX_IDX (15)
311
312/* And several more macros to be used frequently. */
313#define CHARSET_BYTES(charset) \
314 XFASTINT (CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX))
315#define CHARSET_DIMENSION(charset) \
316 XFASTINT (CHARSET_TABLE_INFO (charset, CHARSET_DIMENSION_IDX))
317#define CHARSET_CHARS(charset) \
318 XFASTINT (CHARSET_TABLE_INFO (charset, CHARSET_CHARS_IDX))
319#define CHARSET_WIDTH(charset) \
320 XFASTINT (CHARSET_TABLE_INFO (charset, CHARSET_WIDTH_IDX))
321#define CHARSET_DIRECTION(charset) \
322 XFASTINT (CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX))
323#define CHARSET_LEADING_CODE_BASE(charset) \
324 XFASTINT (CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_BASE_IDX))
325#define CHARSET_LEADING_CODE_EXT(charset) \
326 XFASTINT (CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_EXT_IDX))
327#define CHARSET_ISO_FINAL_CHAR(charset) \
328 XINT (CHARSET_TABLE_INFO (charset, CHARSET_ISO_FINAL_CHAR_IDX))
329#define CHARSET_ISO_GRAPHIC_PLANE(charset) \
330 XINT (CHARSET_TABLE_INFO (charset, CHARSET_ISO_GRAPHIC_PLANE_IDX))
331#define CHARSET_REVERSE_CHARSET(charset) \
332 XINT (CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX))
333
334/* Macros to specify direction of a charset. */
335#define CHARSET_DIRECTION_LEFT_TO_RIGHT 0
336#define CHARSET_DIRECTION_RIGHT_TO_LEFT 1
337
338/* A vector of charset symbol indexed by charset-id. This is used
339 only for returning charset symbol from C functions. */
340extern Lisp_Object Vcharset_symbol_table;
341
342/* Return symbol of CHARSET. */
343#define CHARSET_SYMBOL(charset) \
344 XVECTOR (Vcharset_symbol_table)->contents[charset]
345
346/* 1 if CHARSET is in valid value range, else 0. */
347#define CHARSET_VALID_P(charset) \
348 ((charset) == 0 \
349 || ((charset) > 0x80 && (charset) <= MAX_CHARSET_OFFICIAL_DIMENSION2) \
350 || ((charset) >= MIN_CHARSET_PRIVATE_DIMENSION1 \
351 && (charset) <= MAX_CHARSET) \
352 || ((charset) == CHARSET_8_BIT_CONTROL) \
353 || ((charset) == CHARSET_8_BIT_GRAPHIC))
354
355/* 1 if CHARSET is already defined, else 0. */
356#define CHARSET_DEFINED_P(charset) \
357 (((charset) >= 0) && ((charset) <= MAX_CHARSET) \
358 && !NILP (CHARSET_TABLE_ENTRY (charset)))
359
360/* Since the information CHARSET-BYTES and CHARSET-WIDTH of
361 Vcharset_table can be retrieved only by the first byte of
362 multi-byte form (an ASCII code or a base leading-code), we provide
363 here tables to be used by macros BYTES_BY_CHAR_HEAD and
364 WIDTH_BY_CHAR_HEAD for faster information retrieval. */
365extern int bytes_by_char_head[256];
366extern int width_by_char_head[256];
367
368#define BYTES_BY_CHAR_HEAD(char_head) \
369 (ASCII_BYTE_P (char_head) ? 1 : bytes_by_char_head[char_head])
370#define WIDTH_BY_CHAR_HEAD(char_head) \
371 (ASCII_BYTE_P (char_head) ? 1 : width_by_char_head[char_head])
372
373/* Charset of the character C. */
374#define CHAR_CHARSET(c) \
375 (SINGLE_BYTE_CHAR_P (c) \
376 ? (ASCII_BYTE_P (c) \
377 ? CHARSET_ASCII \
378 : (c) < 0xA0 ? CHARSET_8_BIT_CONTROL : CHARSET_8_BIT_GRAPHIC) \
379 : ((c) < MIN_CHAR_OFFICIAL_DIMENSION2 \
380 ? CHAR_FIELD2 (c) + 0x70 \
381 : ((c) < MIN_CHAR_PRIVATE_DIMENSION2 \
382 ? CHAR_FIELD1 (c) + 0x8F \
383 : CHAR_FIELD1 (c) + 0xE0)))
384 57
385/* Check if two characters C1 and C2 belong to the same charset. */ 58 /* Name of the charset (symbol). */
386#define SAME_CHARSET_P(c1, c2) \ 59 charset_name,
387 (c1 < MIN_CHAR_OFFICIAL_DIMENSION2 \ 60
388 ? (c1 & CHAR_FIELD2_MASK) == (c2 & CHAR_FIELD2_MASK) \ 61 /* Property list of the charset. */
389 : (c1 & CHAR_FIELD1_MASK) == (c2 & CHAR_FIELD1_MASK)) 62 charset_plist,
390 63
391/* Return a character of which charset is CHARSET and position-codes 64 /* If the method of the charset is `MAP_DEFERRED', the value is a
392 are C1 and C2. DIMENSION1 character ignores C2. */ 65 mapping vector or a file name that contains mapping vector.
393#define MAKE_CHAR(charset, c1, c2) \ 66 Otherwise, nil. */
394 ((charset) == CHARSET_ASCII \ 67 charset_map,
395 ? (c1) & 0x7F \ 68
396 : (((charset) == CHARSET_8_BIT_CONTROL \ 69 /* If the method of the charset is `MAP', the value is a vector
397 || (charset) == CHARSET_8_BIT_GRAPHIC) \ 70 that maps code points of the charset to characters. The vector
398 ? ((c1) & 0x7F) | 0x80 \ 71 is indexed by a character index. A character index is
399 : ((CHARSET_DEFINED_P (charset) \ 72 calculated from a code point and the code-space table of the
400 ? CHARSET_DIMENSION (charset) == 1 \ 73 charset. */
401 : (charset) < MIN_CHARSET_PRIVATE_DIMENSION2) \ 74 charset_decoder,
402 ? (((charset) - 0x70) << 7) | ((c1) <= 0 ? 0 : ((c1) & 0x7F)) \ 75
403 : ((((charset) \ 76 /* If the method of the charset is `MAP', the value is a
404 - ((charset) < MIN_CHARSET_PRIVATE_DIMENSION2 ? 0x8F : 0xE0)) \ 77 char-table that maps characters of the charset to code
405 << 14) \ 78 points. */
406 | ((c2) <= 0 ? 0 : ((c2) & 0x7F)) \ 79 charset_encoder,
407 | ((c1) <= 0 ? 0 : (((c1) & 0x7F) << 7)))))) 80
408 81 /* If the method of the charset is `INHERIT', the value is a list
409 82 of the form (PARENT-CHARSET-ID . CODE-OFFSET). */
410/* If GENERICP is nonzero, return nonzero iff C is a valid normal or 83 charset_parents,
411 generic character. If GENERICP is zero, return nonzero iff C is a 84
412 valid normal character. */ 85 /* */
413#define CHAR_VALID_P(c, genericp) \ 86 charset_unify_map,
414 ((c) >= 0 \ 87
415 && (SINGLE_BYTE_CHAR_P (c) || char_valid_p (c, genericp))) 88 /* */
416 89 charset_deunifier,
417/* This default value is used when nonascii-translation-table or 90
418 nonascii-insert-offset fail to convert unibyte character to a valid 91 /* The length of charset attribute vector. */
419 multibyte character. This makes a Latin-1 character. */ 92 charset_attr_max
420 93 };
421#define DEFAULT_NONASCII_INSERT_OFFSET 0x800 94
422 95/* Methods for converting code points and characters of charsets. */
423/* Parse multibyte string STR of length LENGTH and set BYTES to the 96
424 byte length of a character at STR. */ 97enum charset_method
425 98 {
426#ifdef BYTE_COMBINING_DEBUG 99 /* For a charset of this method, a character code is calculated
427 100 from a character index (which is calculated from a code point)
428#define PARSE_MULTIBYTE_SEQ(str, length, bytes) \ 101 simply by adding an offset value. */
102 CHARSET_METHOD_OFFSET,
103
104 /* For a charset of this method, a decoder vector and an encoder
105 char-table is used for code point <-> character code
106 conversion. */
107 CHARSET_METHOD_MAP,
108
109 /* Same as above but decoder and encoder are loaded from a file on
110 demand. Once loaded, the method is changed to
111 CHARSET_METHOD_MAP. */
112 CHARSET_METHOD_MAP_DEFERRED,
113
114 /* A charset of this method inherits characters from the other
115 charsets. */
116 CHARSET_METHOD_INHERIT,
117 };
118
119struct charset
120{
121 int id; /* Returned by CHARSET_ID. Presumably the index of this entry in charset_table (cf. CHARSET_FROM_ID) -- TODO confirm. */
122 /* Index that CHARSET_ATTRIBUTES passes to HASH_VALUE on Vcharset_hash_table to fetch the attribute vector of this charset. */
123 int hash_index;
124
125 /* Dimension of the charset: 1, 2, 3, or 4. */
126 int dimension;
127
128 /* Minimum byte code in each dimension. NOTE(review): the array has 16 elements, so presumably more than one value per dimension is stored -- confirm the layout. */
129 int code_space[16];
130
131 /* 1 if there's no gap in code-points. */
132 int code_linear_p;
133
134 /* If the charset is treated as 94-chars in ISO-2022, the value is 0.
135 If the charset is treated as 96-chars in ISO-2022, the value is 1. */
136 int iso_chars_96;
137
138 /* ISO final character code for the charset: 48..127.
139 It may be 0 if the charset doesn't conform to ISO-2022. */
140 int iso_final;
141 /* NOTE(review): presumably the ISO-2022 revision number of the charset -- TODO confirm. */
142 int iso_revision;
143 /* NOTE(review): CHARSET_ISO_PLANE reads (charset)->iso_plane, but no iso_plane member is declared in this struct. */
144 /* If the charset is identical to one supported by Emacs 21 and
145 earlier, the identification number of the charset used in those
146 versions. Otherwise, -1. */
147 int emacs_mule_id;
148
149 /* Nonzero iff the charset is compatible with ASCII. */
150 int ascii_compatible_p;
151
152 /* Nonzero iff the charset is supplementary. */
153 int supplementary_p;
154
155 /* Nonzero iff all the code points are representable by Lisp_Int. */
156 int compact_codes_p;
157
158 /* The method for encoding/decoding characters of the charset. */
159 enum charset_method method;
160
161 /* Minimum and maximum code points of the charset. */
162 unsigned min_code, max_code;
163
164 /* Minimum and maximum character codes of the charset. If the
165 charset is compatible with ASCII, min_char is a minimum non-ASCII
166 character of the charset. */
167 int min_char, max_char;
168
169 /* The code returned by ENCODE_CHAR if a character is not encodable
170 by the charset. */
171 unsigned invalid_code;
172
173 /* If the method of the charset is CHARSET_METHOD_MAP, this is a
174 table of bits used to quickly and roughly guess if a character
175 belongs to the charset.
176
177 The first 64 elements are 512 bits for characters less than
178 0x10000. Each bit corresponds to a 128-character block. The last
179 126 elements are 1008 bits for the greater characters
180 (0x10000..0x3FFFFF). Each bit corresponds to a 4096-character
181 block.
182
183 If a bit is 1, at least one character in the corresponding block is
184 in this charset. */
185 unsigned char fast_map[190];
186
187 /* Offset value to calculate a character code from code-point, and
188 vice versa. */
189 int code_offset;
190 /* When nonzero, DECODE_CHAR and ENCODE_CHAR defer to decode_char / encode_char instead of their method-specific inline conversions. */
191 int unified_p;
192};
193
194/* Hash table of charset symbols vs. the corresponding attribute
195 vectors. */
196extern Lisp_Object Vcharset_hash_table;
197
198/* Table of struct charset. NOTE(review): charset_table_used is presumably the number of entries in use -- TODO confirm. */
199extern struct charset *charset_table;
200extern int charset_table_used;
201/* Return a pointer to the charset_table entry at index ID. No range check is done. */
202#define CHARSET_FROM_ID(id) (charset_table + (id))
203
204extern Lisp_Object Vcharset_list;
205extern Lisp_Object Viso_2022_charset_list;
206extern Lisp_Object Vemacs_mule_charset_list;
207/* NOTE(review): presumably indexed by the Emacs 21 charset id (cf. emacs_mule_id in struct charset) -- confirm. */
208extern struct charset *emacs_mule_charset[256];
209
210
211/* Macros to access information about charset. */
212
213/* Return the attribute vector of the charset whose symbol is SYMBOL, or nil (the Fgethash default given here) if there is none. */
214#define CHARSET_SYMBOL_ATTRIBUTES(symbol) \
215 Fgethash ((symbol), Vcharset_hash_table, Qnil)
216/* Accessors for the individual elements of an attribute vector ATTRS. */
217#define CHARSET_ATTR_ID(attrs) AREF ((attrs), charset_id)
218#define CHARSET_ATTR_NAME(attrs) AREF ((attrs), charset_name)
219#define CHARSET_ATTR_PLIST(attrs) AREF ((attrs), charset_plist)
220#define CHARSET_ATTR_MAP(attrs) AREF ((attrs), charset_map)
221#define CHARSET_ATTR_DECODER(attrs) AREF ((attrs), charset_decoder)
222#define CHARSET_ATTR_ENCODER(attrs) AREF ((attrs), charset_encoder)
223#define CHARSET_ATTR_PARENTS(attrs) AREF ((attrs), charset_parents)
224#define CHARSET_ATTR_UNIFY_MAP(attrs) AREF ((attrs), charset_unify_map)
225#define CHARSET_ATTR_DEUNIFIER(attrs) AREF ((attrs), charset_deunifier)
226/* Return the charset_id element of the attribute vector of the charset whose symbol is SYMBOL. */
227#define CHARSET_SYMBOL_ID(symbol) \
228 CHARSET_ATTR_ID (CHARSET_SYMBOL_ATTRIBUTES (symbol))
229
230/* Return an index to Vcharset_hash_table of the charset whose symbol
231 is SYMBOL. CHARSETP and CHECK_CHARSET treat a negative result as "not a charset". */
232#define CHARSET_SYMBOL_HASH_INDEX(symbol) \
233 hash_lookup (XHASH_TABLE (Vcharset_hash_table), symbol, NULL)
234
235/* Return the attribute vector of CHARSET. */
236#define CHARSET_ATTRIBUTES(charset) \
237 (HASH_VALUE (XHASH_TABLE (Vcharset_hash_table), (charset)->hash_index))
238/* Accessors for the members of struct charset; CHARSET is a pointer to one. */
239#define CHARSET_ID(charset) ((charset)->id)
240#define CHARSET_HASH_INDEX(charset) ((charset)->hash_index)
241#define CHARSET_DIMENSION(charset) ((charset)->dimension)
242#define CHARSET_CODE_SPACE(charset) ((charset)->code_space)
243#define CHARSET_CODE_LINEAR_P(charset) ((charset)->code_linear_p)
244#define CHARSET_ISO_CHARS_96(charset) ((charset)->iso_chars_96)
245#define CHARSET_ISO_FINAL(charset) ((charset)->iso_final)
246#define CHARSET_ISO_PLANE(charset) ((charset)->iso_plane) /* NOTE(review): struct charset declares no iso_plane member; any use of this macro will not compile. */
247#define CHARSET_ISO_REVISION(charset) ((charset)->iso_revision)
248#define CHARSET_EMACS_MULE_ID(charset) ((charset)->emacs_mule_id)
249#define CHARSET_ASCII_COMPATIBLE_P(charset) ((charset)->ascii_compatible_p)
250#define CHARSET_COMPACT_CODES_P(charset) ((charset)->compact_codes_p)
251#define CHARSET_METHOD(charset) ((charset)->method)
252#define CHARSET_MIN_CODE(charset) ((charset)->min_code)
253#define CHARSET_MAX_CODE(charset) ((charset)->max_code)
254#define CHARSET_INVALID_CODE(charset) ((charset)->invalid_code)
255#define CHARSET_MIN_CHAR(charset) ((charset)->min_char)
256#define CHARSET_MAX_CHAR(charset) ((charset)->max_char)
257#define CHARSET_CODE_OFFSET(charset) ((charset)->code_offset)
258#define CHARSET_UNIFIED_P(charset) ((charset)->unified_p)
259/* Accessors that go through the attribute vector of CHARSET (see CHARSET_ATTRIBUTES). */
260#define CHARSET_NAME(charset) \
261 (CHARSET_ATTR_NAME (CHARSET_ATTRIBUTES (charset)))
262#define CHARSET_MAP(charset) \
263 (CHARSET_ATTR_MAP (CHARSET_ATTRIBUTES (charset)))
264#define CHARSET_DECODER(charset) \
265 (CHARSET_ATTR_DECODER (CHARSET_ATTRIBUTES (charset)))
266#define CHARSET_ENCODER(charset) \
267 (CHARSET_ATTR_ENCODER (CHARSET_ATTRIBUTES (charset)))
268#define CHARSET_PARENTS(charset) \
269 (CHARSET_ATTR_PARENTS (CHARSET_ATTRIBUTES (charset)))
270#define CHARSET_UNIFY_MAP(charset) \
271 (CHARSET_ATTR_UNIFY_MAP (CHARSET_ATTRIBUTES (charset)))
272#define CHARSET_DEUNIFIER(charset) \
273 (CHARSET_ATTR_DEUNIFIER (CHARSET_ATTRIBUTES (charset)))
274
275
276/* Nonzero iff OBJ is a valid charset symbol, i.e. CHARSET_SYMBOL_HASH_INDEX finds it in Vcharset_hash_table (index >= 0). */
277#define CHARSETP(obj) (CHARSET_SYMBOL_HASH_INDEX (obj) >= 0)
278
279/* Check if X is a valid charset symbol. If not, signal an error. */
280#define CHECK_CHARSET(x) \
429 do { \ 281 do { \
430 int i = 1; \ 282 if (! SYMBOLP (x) || CHARSET_SYMBOL_HASH_INDEX (x) < 0) \
431 while (i < (length) && ! CHAR_HEAD_P ((str)[i])) i++; \ 283 x = wrong_type_argument (Qcharsetp, (x)); \
432 (bytes) = BYTES_BY_CHAR_HEAD ((str)[0]); \
433 if ((bytes) > i) \
434 abort (); \
435 } while (0) 284 } while (0)
436 285
437#else /* not BYTE_COMBINING_DEBUG */
438
439#define PARSE_MULTIBYTE_SEQ(str, length, bytes) \
440 (bytes) = BYTES_BY_CHAR_HEAD ((str)[0])
441
442#endif /* not BYTE_COMBINING_DEBUG */
443
444/* Return 1 iff the byte sequence at unibyte string STR (LENGTH bytes)
445 is valid as a multibyte form. If valid, by a side effect, BYTES is
446 set to the byte length of the multibyte form. */
447
448#define UNIBYTE_STR_AS_MULTIBYTE_P(str, length, bytes) \
449 (((str)[0] < 0x80 || (str)[0] >= 0xA0) \
450 ? ((bytes) = 1) \
451 : (((bytes) = BYTES_BY_CHAR_HEAD ((str)[0])), \
452 ((bytes) > 1 && (bytes) <= (length) \
453 && (str)[0] != LEADING_CODE_8_BIT_CONTROL \
454 && !CHAR_HEAD_P ((str)[1]) \
455 && ((bytes) == 2 \
456 || (!CHAR_HEAD_P ((str)[2]) \
457 && ((bytes) == 3 \
458 || !CHAR_HEAD_P ((str)[3])))))))
459
460/* Return 1 iff the byte sequence at multibyte string STR is valid as
461 a unibyte form. By a side effect, BYTES is set to the byte length
462 of one character at STR. */
463
464#define MULTIBYTE_STR_AS_UNIBYTE_P(str, bytes) \
465 ((bytes) = BYTES_BY_CHAR_HEAD ((str)[0]), \
466 (str)[0] != LEADING_CODE_8_BIT_CONTROL)
467
468/* The charset of character C is stored in CHARSET, and the
469 position-codes of C are stored in C1 and C2.
470 We store -1 in C2 if the dimension of the charset is 1. */
471
472#define SPLIT_CHAR(c, charset, c1, c2) \
473 (SINGLE_BYTE_CHAR_P (c) \
474 ? ((charset \
475 = (ASCII_BYTE_P (c) \
476 ? CHARSET_ASCII \
477 : ((c) < 0xA0 ? CHARSET_8_BIT_CONTROL : CHARSET_8_BIT_GRAPHIC))), \
478 c1 = (c), c2 = -1) \
479 : ((c) & CHAR_FIELD1_MASK \
480 ? (charset = (CHAR_FIELD1 (c) \
481 + ((c) < MIN_CHAR_PRIVATE_DIMENSION2 ? 0x8F : 0xE0)), \
482 c1 = CHAR_FIELD2 (c), \
483 c2 = CHAR_FIELD3 (c)) \
484 : (charset = CHAR_FIELD2 (c) + 0x70, \
485 c1 = CHAR_FIELD3 (c), \
486 c2 = -1)))
487
488/* Return 1 iff character C has valid printable glyph. */
489#define CHAR_PRINTABLE_P(c) (ASCII_BYTE_P (c) || char_printable_p (c))
490
491/* The charset of the character at STR is stored in CHARSET, and the
492 position-codes are stored in C1 and C2.
493 We store -1 in C2 if the character is just 2 bytes. */
494
495#define SPLIT_STRING(str, len, charset, c1, c2) \
496 ((BYTES_BY_CHAR_HEAD ((unsigned char) *(str)) < 2 \
497 || BYTES_BY_CHAR_HEAD ((unsigned char) *(str)) > len \
498 || split_string (str, len, &charset, &c1, &c2) < 0) \
499 ? c1 = *(str), charset = CHARSET_ASCII \
500 : charset)
501 286
502/* Mapping table from ISO2022's charset (specified by DIMENSION, 287/* Check if X is a valid charset symbol. If valid, set ID to the id
503 CHARS, and FINAL_CHAR) to Emacs' charset. Should be accessed by 288 number of the charset. Otherwise, signal an error. */
504 macro ISO_CHARSET_TABLE (DIMENSION, CHARS, FINAL_CHAR). */ 289#define CHECK_CHARSET_GET_ID(x, id) \
505extern int iso_charset_table[2][2][128]; 290 do { \
506 291 int idx; \
507#define ISO_CHARSET_TABLE(dimension, chars, final_char) \ 292 \
508 iso_charset_table[XINT (dimension) - 1][XINT (chars) > 94][XINT (final_char)] 293 if (! SYMBOLP (x) || (idx = CHARSET_SYMBOL_HASH_INDEX (x)) < 0) \
509 294 x = wrong_type_argument (Qcharsetp, (x)); \
510#define BASE_LEADING_CODE_P(c) (BYTES_BY_CHAR_HEAD ((unsigned char) (c)) > 1) 295 id = AREF (HASH_VALUE (XHASH_TABLE (Vcharset_hash_table), idx), \
511 296 charset_id); \
512/* Return how many bytes C will occupy in a multibyte buffer. */
513#define CHAR_BYTES(c) \
514 (SINGLE_BYTE_CHAR_P (c) \
515 ? ((ASCII_BYTE_P (c) || (c) >= 0xA0) ? 1 : 2) \
516 : char_bytes (c))
517
518/* The following two macros CHAR_STRING and STRING_CHAR are the main
519 entry points to convert between Emacs's two types of character
520 representations: multi-byte form and single-word form (character
521 code). */
522
523/* Store multi-byte form of the character C in STR. The caller should
524 allocate at least MAX_MULTIBYTE_LENGTH bytes area at STR in
525 advance. Returns the length of the multi-byte form. If C is an
526 invalid character code, signal an error. */
527
528#define CHAR_STRING(c, str) \
529 (SINGLE_BYTE_CHAR_P (c) \
530 ? ((ASCII_BYTE_P (c) || c >= 0xA0) \
531 ? (*(str) = (unsigned char)(c), 1) \
532 : (*(str) = LEADING_CODE_8_BIT_CONTROL, *((str)+ 1) = c + 0x20, 2)) \
533 : char_to_string (c, (unsigned char *) str))
534
535/* Like CHAR_STRING but don't signal an error if C is invalid.
536 Value is -1 in this case. */
537
538#define CHAR_STRING_NO_SIGNAL(c, str) \
539 (SINGLE_BYTE_CHAR_P (c) \
540 ? ((ASCII_BYTE_P (c) || c >= 0xA0) \
541 ? (*(str) = (unsigned char)(c), 1) \
542 : (*(str) = LEADING_CODE_8_BIT_CONTROL, *((str)+ 1) = c + 0x20, 2)) \
543 : char_to_string_1 (c, (unsigned char *) str))
544
545/* Return a character code of the character of which multi-byte form
546 is at STR and the length is LEN. If STR doesn't contain valid
547 multi-byte form, only the first byte in STR is returned. */
548
549#define STRING_CHAR(str, len) \
550 (BYTES_BY_CHAR_HEAD ((unsigned char) *(str)) == 1 \
551 ? (unsigned char) *(str) \
552 : string_to_char (str, len, 0))
553
554/* This is like STRING_CHAR but the third arg ACTUAL_LEN is set to the
555 length of the multi-byte form. Just to know the length, use
556 MULTIBYTE_FORM_LENGTH. */
557
558#define STRING_CHAR_AND_LENGTH(str, len, actual_len) \
559 (BYTES_BY_CHAR_HEAD ((unsigned char) *(str)) == 1 \
560 ? ((actual_len) = 1), (unsigned char) *(str) \
561 : string_to_char (str, len, &(actual_len)))
562
563/* Fetch the "next" character from Lisp string STRING at byte position
564 BYTEIDX, character position CHARIDX. Store it into OUTPUT.
565
566 All the args must be side-effect-free.
567 BYTEIDX and CHARIDX must be lvalues;
568 we increment them past the character fetched. */
569
570#define FETCH_STRING_CHAR_ADVANCE(OUTPUT, STRING, CHARIDX, BYTEIDX) \
571if (1) \
572 { \
573 CHARIDX++; \
574 if (STRING_MULTIBYTE (STRING)) \
575 { \
576 unsigned char *ptr = &XSTRING (STRING)->data[BYTEIDX]; \
577 int space_left = XSTRING (STRING)->size_byte - BYTEIDX; \
578 int actual_len; \
579 \
580 OUTPUT = STRING_CHAR_AND_LENGTH (ptr, space_left, actual_len); \
581 BYTEIDX += actual_len; \
582 } \
583 else \
584 OUTPUT = XSTRING (STRING)->data[BYTEIDX++]; \
585 } \
586else
587
588/* Like FETCH_STRING_CHAR_ADVANCE but assume STRING is multibyte. */
589
590#define FETCH_STRING_CHAR_ADVANCE_NO_CHECK(OUTPUT, STRING, CHARIDX, BYTEIDX) \
591if (1) \
592 { \
593 unsigned char *fetch_string_char_ptr = &XSTRING (STRING)->data[BYTEIDX]; \
594 int fetch_string_char_space_left = XSTRING (STRING)->size_byte - BYTEIDX; \
595 int actual_len; \
596 \
597 OUTPUT \
598 = STRING_CHAR_AND_LENGTH (fetch_string_char_ptr, \
599 fetch_string_char_space_left, actual_len); \
600 \
601 BYTEIDX += actual_len; \
602 CHARIDX++; \
603 } \
604else
605
606/* Like FETCH_STRING_CHAR_ADVANCE but fetch character from the current
607 buffer. */
608
609#define FETCH_CHAR_ADVANCE(OUTPUT, CHARIDX, BYTEIDX) \
610if (1) \
611 { \
612 CHARIDX++; \
613 if (!NILP (current_buffer->enable_multibyte_characters)) \
614 { \
615 unsigned char *ptr = BYTE_POS_ADDR (BYTEIDX); \
616 int space_left = ((CHARIDX < GPT ? GPT_BYTE : Z_BYTE) - BYTEIDX); \
617 int actual_len; \
618 \
619 OUTPUT= STRING_CHAR_AND_LENGTH (ptr, space_left, actual_len); \
620 BYTEIDX += actual_len; \
621 } \
622 else \
623 { \
624 OUTPUT = *(BYTE_POS_ADDR (BYTEIDX)); \
625 BYTEIDX++; \
626 } \
627 } \
628else
629
630/* Return the length of the multi-byte form at string STR of length LEN. */
631
632#define MULTIBYTE_FORM_LENGTH(str, len) \
633 (BYTES_BY_CHAR_HEAD (*(unsigned char *)(str)) == 1 \
634 ? 1 \
635 : multibyte_form_length (str, len))
636
637#ifdef emacs
638
639/* Increase the buffer byte position POS_BYTE of the current buffer to
640 the next character boundary. This macro relies on the fact that
641 *GPT_ADDR and *Z_ADDR are always accessible and the values are
642 '\0'. No range checking of POS. */
643
644#ifdef BYTE_COMBINING_DEBUG
645
646#define INC_POS(pos_byte) \
647 do { \
648 unsigned char *p = BYTE_POS_ADDR (pos_byte); \
649 if (BASE_LEADING_CODE_P (*p)) \
650 { \
651 int len, bytes; \
652 len = Z_BYTE - pos_byte; \
653 PARSE_MULTIBYTE_SEQ (p, len, bytes); \
654 pos_byte += bytes; \
655 } \
656 else \
657 pos_byte++; \
658 } while (0) 297 } while (0)
659 298
660#else /* not BYTE_COMBINING_DEBUG */
661 299
662#define INC_POS(pos_byte) \ 300/* Check if X is a valid charset symbol. If valid, set ATTR to the
663 do { \ 301 attr vector of the charset. Otherwise, signal an error. */
664 unsigned char *p = BYTE_POS_ADDR (pos_byte); \ 302#define CHECK_CHARSET_GET_ATTR(x, attr) \
665 pos_byte += BYTES_BY_CHAR_HEAD (*p); \ 303 do { \
304 if (!SYMBOLP (x) || NILP (attr = CHARSET_SYMBOL_ATTRIBUTES (x))) \
305 x = wrong_type_argument (Qcharsetp, (x)); \
666 } while (0) 306 } while (0)
667 307
668#endif /* not BYTE_COMBINING_DEBUG */
669 308
670/* Decrease the buffer byte position POS_BYTE of the current buffer to 309#define CHECK_CHARSET_GET_CHARSET(x, charset) \
671 the previous character boundary. No range checking of POS. */ 310 do { \
672#define DEC_POS(pos_byte) \ 311 int id; \
673 do { \ 312 CHECK_CHARSET_GET_ID (x, id); \
674 unsigned char *p, *p_min; \ 313 charset = CHARSET_FROM_ID (id); \
675 \
676 pos_byte--; \
677 if (pos_byte < GPT_BYTE) \
678 p = BEG_ADDR + pos_byte - 1, p_min = BEG_ADDR; \
679 else \
680 p = BEG_ADDR + GAP_SIZE + pos_byte - 1, p_min = GAP_END_ADDR; \
681 if (p > p_min && !CHAR_HEAD_P (*p)) \
682 { \
683 unsigned char *pend = p--; \
684 int len, bytes; \
685 while (p > p_min && !CHAR_HEAD_P (*p)) p--; \
686 len = pend + 1 - p; \
687 PARSE_MULTIBYTE_SEQ (p, len, bytes); \
688 if (bytes == len) \
689 pos_byte -= len - 1; \
690 } \
691 } while (0) 314 } while (0)
692 315
693/* Increment both CHARPOS and BYTEPOS, each in the appropriate way. */
694 316
695#define INC_BOTH(charpos, bytepos) \ 317/* Lookup Vcharset_order_list and return the first charset that
696do \ 318 contains the character C. */
697 { \ 319#define CHAR_CHARSET(c) \
698 (charpos)++; \ 320 char_charset ((c), Qnil, NULL)
699 if (NILP (current_buffer->enable_multibyte_characters)) \ 321
700 (bytepos)++; \ 322#if 0
701 else \ 323/* Char-table of charset-sets. Each element is a bool vector indexed
702 INC_POS ((bytepos)); \ 324 by a charset ID. */
703 } \ 325extern Lisp_Object Vchar_charset_set;
704while (0) 326
705 327/* Charset-bag of character C. */
706/* Decrement both CHARPOS and BYTEPOS, each in the appropriate way. */ 328#define CHAR_CHARSET_SET(c) \
707 329 CHAR_TABLE_REF (Vchar_charset_set, c)
708#define DEC_BOTH(charpos, bytepos) \ 330
709do \ 331/* Check if two characters C1 and C2 belong to the same charset. */
710 { \ 332#define SAME_CHARSET_P(c1, c2) \
711 (charpos)--; \ 333 intersection_p (CHAR_CHARSET_SET (c1), CHAR_CHARSET_SET (c2))
712 if (NILP (current_buffer->enable_multibyte_characters)) \ 334
713 (bytepos)--; \ 335#endif
714 else \ 336
715 DEC_POS ((bytepos)); \ 337
716 } \ 338/* Return a character corresponding to the code-point CODE of CHARSET.
717while (0) 339 Try some optimization before calling decode_char. */
340/* DECODE_CHAR evaluates CHARSET and CODE more than once. It yields CODE itself for an ASCII byte of an ASCII-compatible charset, and -1 when CODE is outside min_code..max_code. */
341#define DECODE_CHAR(charset, code) \
342 ((ASCII_BYTE_P (code) && (charset)->ascii_compatible_p) \
343 ? (code) \
344 : ((code) < (charset)->min_code || (code) > (charset)->max_code) \
345 ? -1 \
346 : (charset)->unified_p \
347 ? decode_char ((charset), (code)) \
348 : (charset)->method == CHARSET_METHOD_OFFSET \
349 ? ((charset)->code_linear_p \
350 ? (code) - (charset)->min_code + (charset)->code_offset \
351 : decode_char ((charset), (code))) \
352 : (charset)->method == CHARSET_METHOD_MAP \
353 ? ((charset)->code_linear_p \
354 ? XINT (AREF (CHARSET_DECODER (charset), \
355 (code) - (charset)->min_code)) \
356 : decode_char ((charset), (code))) \
357 : decode_char ((charset), (code)))
358
359
360/* Return the code point of character C in CHARSET.
361 Try some optimization before calling encode_char. */
362/* C and CHARSET are evaluated more than once. Apart from the ASCII shortcut, a non-unified charset yields its invalid_code when C is outside min_char..max_char. */
363#define ENCODE_CHAR(charset, c) \
364 ((ASCII_CHAR_P (c) && (charset)->ascii_compatible_p) \
365 ? (c) \
366 : (charset)->unified_p \
367 ? encode_char ((charset), (c)) \
368 : ((c) < (charset)->min_char || (c) > (charset)->max_char) \
369 ? (charset)->invalid_code \
370 : (charset)->method == CHARSET_METHOD_OFFSET \
371 ? ((charset)->code_linear_p \
372 ? (c) - (charset)->code_offset + (charset)->min_code \
373 : encode_char ((charset), (c))) \
374 : (charset)->method == CHARSET_METHOD_MAP \
375 ? ((charset)->compact_codes_p \
376 ? XFASTINT (CHAR_TABLE_REF (CHARSET_ENCODER (charset), (c))) \
377 : encode_char ((charset), (c))) \
378 : encode_char ((charset), (c)))
379
380
381/* Set to 1 when a charset map is loaded, to warn that buffer text
382 and string data may have been relocated. */
383extern int charset_map_loaded;
384
385
386/* Set CHARSET to the highest-priority charset of C, and CODE to the
387 code-point of C in CHARSET. CODE has its address taken and is passed to char_charset, which takes an unsigned *. */
388#define SPLIT_CHAR(c, charset, code) \
389 ((charset) = char_charset ((c), Qnil, &(code)))
390
391/* Sizes of the three dimensions of iso_charset_table. */
392#define ISO_MAX_DIMENSION 3
393#define ISO_MAX_CHARS 2
394#define ISO_MAX_FINAL 0x80 /* only 0x30..0x7F are used */
395
396/* Mapping table from ISO2022's charset (specified by DIMENSION,
397 CHARS, and FINAL_CHAR) to Emacs' charset ID. Should be accessed by
398 macro ISO_CHARSET_TABLE (DIMENSION, CHARS, FINAL_CHAR). */
399extern int iso_charset_table[ISO_MAX_DIMENSION][ISO_MAX_CHARS][ISO_MAX_FINAL];
718 400
719/* Increase the buffer byte position POS_BYTE of the current buffer to 401/* A charset of type iso2022 who has DIMENSION, CHARS, and FINAL
720 the next character boundary. This macro relies on the fact that 402 (final character). */
721 *GPT_ADDR and *Z_ADDR are always accessible and the values are 403#define ISO_CHARSET_TABLE(dimension, chars_96, final) \
722 '\0'. No range checking of POS_BYTE. */ 404 iso_charset_table[(dimension) - 1][(chars_96)][(final)]
723 405
724#ifdef BYTE_COMBINING_DEBUG 406/* Nonzero iff the charset who has FAST_MAP may contain C. */
407#define CHARSET_FAST_MAP_REF(c, fast_map) \
408 ((c) < 0x10000 \
409 ? fast_map[(c) >> 10] & (1 << (((c) >> 7) & 7)) \
410 : fast_map[((c) >> 15) + 62] & (1 << (((c) >> 12) & 7))) /* C is evaluated more than once. NOTE(review): FAST_MAP is used unparenthesized here, whereas CHARSET_FAST_MAP_SET writes (fast_map)[...]; consider parenthesizing. */
725 411
726#define BUF_INC_POS(buf, pos_byte) \ 412#define CHARSET_FAST_MAP_SET(c, fast_map) \
727 do { \ 413 do { \
728 unsigned char *p = BUF_BYTE_ADDRESS (buf, pos_byte); \ 414 if ((c) < 0x10000) \
729 if (BASE_LEADING_CODE_P (*p)) \ 415 (fast_map)[(c) >> 10] |= 1 << (((c) >> 7) & 7); \
730 { \
731 int len, bytes; \
732 len = BUF_Z_BYTE (buf) - pos_byte; \
733 PARSE_MULTIBYTE_SEQ (p, len, bytes); \
734 pos_byte += bytes; \
735 } \
736 else \ 416 else \
737 pos_byte++; \ 417 (fast_map)[((c) >> 15) + 62] |= 1 << (((c) >> 12) & 7); \
738 } while (0) 418 } while (0)
739 419
740#else /* not BYTE_COMBINING_DEBUG */
741 420
742#define BUF_INC_POS(buf, pos_byte) \
743 do { \
744 unsigned char *p = BUF_BYTE_ADDRESS (buf, pos_byte); \
745 pos_byte += BYTES_BY_CHAR_HEAD (*p); \
746 } while (0)
747 421
748#endif /* not BYTE_COMBINING_DEBUG */ 422/* 1 iff CHARSET may contain the character C. */
423#define CHAR_CHARSET_P(c, charset) \
424 ((ASCII_CHAR_P (c) && (charset)->ascii_compatible_p) \
425 || (CHARSET_UNIFIED_P (charset) \
426 ? encode_char ((charset), (c)) != (charset)->invalid_code \
427 : (CHARSET_FAST_MAP_REF ((c), (charset)->fast_map) \
428 && ((charset)->method == CHARSET_METHOD_OFFSET \
429 ? (c) >= (charset)->min_char && (c) <= (charset)->max_char \
430 : ((charset)->method == CHARSET_METHOD_MAP \
431 && (charset)->compact_codes_p) \
432 ? (XFASTINT (CHAR_TABLE_REF (CHARSET_ENCODER (charset), (c))) \
433 != (charset)->invalid_code) \
434 : encode_char ((charset), (c)) != (charset)->invalid_code)))) /* C and CHARSET are evaluated more than once. Unified charsets skip the fast_map test and ask encode_char directly. */
749 435
750/* Decrease the buffer byte position POS_BYTE of the current buffer to
751 the previous character boundary. No range checking of POS_BYTE. */
752#define BUF_DEC_POS(buf, pos_byte) \
753 do { \
754 unsigned char *p, *p_min; \
755 pos_byte--; \
756 if (pos_byte < BUF_GPT_BYTE (buf)) \
757 { \
758 p = BUF_BEG_ADDR (buf) + pos_byte - 1; \
759 p_min = BUF_BEG_ADDR (buf); \
760 } \
761 else \
762 { \
763 p = BUF_BEG_ADDR (buf) + BUF_GAP_SIZE (buf) + pos_byte - 1; \
764 p_min = BUF_GAP_END_ADDR (buf); \
765 } \
766 if (p > p_min && !CHAR_HEAD_P (*p)) \
767 { \
768 unsigned char *pend = p--; \
769 int len, bytes; \
770 while (p > p_min && !CHAR_HEAD_P (*p)) p--; \
771 len = pend + 1 - p; \
772 PARSE_MULTIBYTE_SEQ (p, len, bytes); \
773 if (bytes == len) \
774 pos_byte -= len - 1; \
775 } \
776 } while (0)
777 436
778#endif /* emacs */ 437extern Lisp_Object Qcharsetp;
779 438
780/* This is the maximum byte length of multi-byte sequence. */ 439extern Lisp_Object Qascii, Qunicode;
781#define MAX_MULTIBYTE_LENGTH 4 440extern int charset_ascii, charset_8_bit_control, charset_8_bit_graphic;
782 441extern int charset_iso_8859_1;
783extern void invalid_character P_ ((int)); 442extern int charset_primary;
784 443
785extern int translate_char P_ ((Lisp_Object, int, int, int, int)); 444extern struct charset *char_charset P_ ((int, Lisp_Object, unsigned *));
786extern int split_string P_ ((const unsigned char *, int, int *, 445extern Lisp_Object charset_attributes P_ ((int));
787 unsigned char *, unsigned char *)); 446
788extern int char_to_string P_ ((int, unsigned char *)); 447extern int decode_char P_ ((struct charset *, unsigned));
789extern int char_to_string_1 P_ ((int, unsigned char *)); 448extern unsigned encode_char P_ ((struct charset *, int));
790extern int string_to_char P_ ((const unsigned char *, int, int *)); 449extern int string_xstring_p P_ ((Lisp_Object));
791extern int char_printable_p P_ ((int c)); 450
792extern int multibyte_form_length P_ ((const unsigned char *, int)); 451EXFUN (Funify_charset, 2);
793extern void parse_str_as_multibyte P_ ((unsigned char *, int, int *, int *));
794extern int str_as_multibyte P_ ((unsigned char *, int, int, int *));
795extern int parse_str_to_multibyte P_ ((unsigned char *, int));
796extern int str_to_multibyte P_ ((unsigned char *, int, int));
797extern int str_as_unibyte P_ ((unsigned char *, int));
798extern int get_charset_id P_ ((Lisp_Object));
799extern int find_charset_in_text P_ ((unsigned char *, int, int, int *,
800 Lisp_Object));
801extern int strwidth P_ ((unsigned char *, int));
802extern int c_string_width P_ ((unsigned char *, int, int, int *, int *));
803extern int lisp_string_width P_ ((Lisp_Object, int, int *, int *));
804extern int char_bytes P_ ((int));
805extern int char_valid_p P_ ((int, int));
806
807extern Lisp_Object Vtranslation_table_vector;
808
809/* Return a translation table of id number ID. */
810#define GET_TRANSLATION_TABLE(id) \
811 (XCDR(XVECTOR(Vtranslation_table_vector)->contents[(id)]))
812
813/* A char-table for characters which may invoke auto-filling. */
814extern Lisp_Object Vauto_fill_chars;
815
816/* Copy LEN bytes from FROM to TO. This macro should be used only
817 when a caller knows that LEN is short and the obvious copy loop is
818 faster than calling bcopy which has some overhead. Copying a
819 multibyte sequence of a multibyte character is the typical case. */
820
821#define BCOPY_SHORT(from, to, len) \
822 do { \
823 int i = len; \
824 unsigned char *from_p = from, *to_p = to; \
825 while (i--) *to_p++ = *from_p++; \
826 } while (0)
827 452
828#endif /* EMACS_CHARSET_H */ 453#endif /* EMACS_CHARSET_H */