diff options
| author | Kenichi Handa | 1999-09-03 01:28:42 +0000 |
|---|---|---|
| committer | Kenichi Handa | 1999-09-03 01:28:42 +0000 |
| commit | ac4137cca636cd011ce43a595c9cf57126c0a558 (patch) | |
| tree | a30b7c701ff4729f3dd766e0f22fafe9109fbbe1 /src | |
| parent | 384107f281bd6a43a4c66e9bbb1826e1bc8cec05 (diff) | |
| download | emacs-ac4137cca636cd011ce43a595c9cf57126c0a558.tar.gz emacs-ac4137cca636cd011ce43a595c9cf57126c0a558.zip | |
(SPLIT_COMPOSITE_SEQ): New macro.
(SPLIT_CHARACTER_SEQ): New macro.
(SPLIT_MULTIBYTE_SEQ): New macro.
(CHAR_COMPONENT_VALID_P): New macro.
(non_ascii_char_to_string): Generate a multibyte sequence as far
as possible.
(string_to_non_ascii_char): The 4th arg exclude_tail_garbage is
deleted. Caller changed. Use the macro SPLIT_MULTIBYTE_SEQ.
(split_non_ascii_string): Likewise.
(multibyte_form_length): Use the macro PARSE_MULTIBYTE_SEQ.
(char_printable_p): New function.
(translate_char): Check character by NATNUMP instead of INTEGERP.
(unibyte_char_to_multibyte): Call char_valid_p instead of
VALID_MULTIBYTE_CHAR_P.
(Fmake_char_internal): Check the arguments more rigidly.
(Fcharset_after): Use the macro SPLIT_MULTIBYTE_SEQ.
(char_valid_p): Check the validity by CHAR_COMPONENT_VALID_P.
(Fmultibyte_char_to_unibyte): Check the validity of character by
CHAR_VALID_P.
(chars_in_text): Call multibyte_chars_in_text.
(multibyte_chars_in_text): Use the macro PARSE_MULTIBYTE_SEQ.
(Fcompose_string): Use the macro STRING_CHAR_AND_LENGTH instead of
STRING_CHAR_AND_CHAR_LENGTH (which is obsolete now).
Diffstat (limited to 'src')
| -rw-r--r-- | src/charset.c | 417 |
1 files changed, 245 insertions, 172 deletions
diff --git a/src/charset.c b/src/charset.c index 9c6da218436..0876644ee94 100644 --- a/src/charset.c +++ b/src/charset.c | |||
| @@ -124,13 +124,89 @@ invalid_character (c) | |||
| 124 | error ("Invalid character: 0%o, %d, 0x%x", c, c, c); | 124 | error ("Invalid character: 0%o, %d, 0x%x", c, c, c); |
| 125 | } | 125 | } |
| 126 | 126 | ||
| 127 | /* Parse string STR of length LENGTH (>= 2) and check if a composite | ||
| 128 | character is at STR. If there is a valid composite character, set | ||
| 129 | CHARSET, C1, and C2 to proper values so that MAKE_CHAR can compose | ||
| 130 | the composite character from them. Otherwise, set CHARSET to | ||
| 131 | CHARSET_COMPOSITION, but set C1 to the second byte of the sequence, | ||
| 132 | C2 to -1 so that MAKE_CHAR can compose the invalid multibyte | ||
| 133 | character whose string representation is two bytes of STR[0] and | ||
| 134 | STR[1]. In any case, set BYTES to LENGTH. */ | ||
| 135 | |||
| 136 | #define SPLIT_COMPOSITE_SEQ(str, length, bytes, charset, c1, c2) \ | ||
| 137 | do { \ | ||
| 138 | int cmpchar_id = str_cmpchar_id ((str), (length)); \ | ||
| 139 | \ | ||
| 140 | (charset) = CHARSET_COMPOSITION; \ | ||
| 141 | (bytes) = (length); \ | ||
| 142 | if (cmpchar_id >= 0) \ | ||
| 143 | { \ | ||
| 144 | (c1) = CHAR_FIELD2 (cmpchar_id); \ | ||
| 145 | (c2) = CHAR_FIELD3 (cmpchar_id); \ | ||
| 146 | } \ | ||
| 147 | else \ | ||
| 148 | { \ | ||
| 149 | (c1) = (str)[1] & 0x7F; \ | ||
| 150 | (c2) = -1; \ | ||
| 151 | } \ | ||
| 152 | } while (0) | ||
| 153 | |||
| 154 | /* Parse string STR of length LENGTH (>= 2) and check if a | ||
| 155 | non-composite multibyte character is at STR. Set BYTES to the | ||
| 156 | actual length, CHARSET, C1, and C2 to proper values so that | ||
| 157 | MAKE_CHAR can compose the multibyte character from them. */ | ||
| 158 | |||
| 159 | #define SPLIT_CHARACTER_SEQ(str, length, bytes, charset, c1, c2) \ | ||
| 160 | do { \ | ||
| 161 | (bytes) = 1; \ | ||
| 162 | (charset) = (str)[0]; \ | ||
| 163 | if ((charset) >= LEADING_CODE_PRIVATE_11 \ | ||
| 164 | && (charset) <= LEADING_CODE_PRIVATE_22) \ | ||
| 165 | (charset) = (str)[(bytes)++]; \ | ||
| 166 | if ((bytes) < (length)) \ | ||
| 167 | { \ | ||
| 168 | (c1) = (str)[(bytes)++] & 0x7F; \ | ||
| 169 | if ((bytes) < (length)) \ | ||
| 170 | (c2) = (str)[(bytes)++] & 0x7F; \ | ||
| 171 | else \ | ||
| 172 | (c2) = -1; \ | ||
| 173 | } \ | ||
| 174 | else \ | ||
| 175 | (c1) = (c2) = -1; \ | ||
| 176 | } while (0) | ||
| 177 | |||
| 178 | /* Parse string STR of length LENGTH and check if a multibyte | ||
| 179 | character is at STR. Set BYTES to the actual length, and CHARSET, C1, | ||
| 180 | and C2 to proper values for that character. */ | ||
| 181 | |||
| 182 | #define SPLIT_MULTIBYTE_SEQ(str, length, bytes, charset, c1, c2) \ | ||
| 183 | do { \ | ||
| 184 | int i; \ | ||
| 185 | for (i = 1; i < (length) && ! CHAR_HEAD_P ((str)[i]); i++); \ | ||
| 186 | if (i == 1) \ | ||
| 187 | (bytes) = 1, (charset) = CHARSET_ASCII, (c1) = (str)[0] ; \ | ||
| 188 | else if ((str)[0] == LEADING_CODE_COMPOSITION) \ | ||
| 189 | SPLIT_COMPOSITE_SEQ (str, i, bytes, charset, c1, c2); \ | ||
| 190 | else \ | ||
| 191 | { \ | ||
| 192 | if (i > BYTES_BY_CHAR_HEAD ((str)[0])) \ | ||
| 193 | i = BYTES_BY_CHAR_HEAD ((str)[0]); \ | ||
| 194 | SPLIT_CHARACTER_SEQ (str, i, bytes, charset, c1, c2); \ | ||
| 195 | } \ | ||
| 196 | } while (0) | ||
| 197 | |||
| 198 | /* 1 if CHARSET, C1, and C2 compose a valid character, else 0. */ | ||
| 199 | #define CHAR_COMPONENT_VALID_P(charset, c1, c2) \ | ||
| 200 | (CHARSET_DIMENSION (charset) == 1 \ | ||
| 201 | ? ((c1) >= 0x20 && (c1) <= 0x7F) \ | ||
| 202 | : ((c1) >= 0x20 && (c1) <= 0x7F && (c2) >= 0x20 && (c2) <= 0x7F)) | ||
| 127 | 203 | ||
| 128 | /* Set STR a pointer to the multi-byte form of the character C. If C | 204 | /* Set STR a pointer to the multi-byte form of the character C. If C |
| 129 | is not a composite character, the multi-byte form is set in WORKBUF | 205 | is not a composite character, the multi-byte form is set in WORKBUF |
| 130 | and STR points WORKBUF. The caller should allocate at least 4-byte | 206 | and STR points WORKBUF. The caller should allocate at least 4-byte |
| 131 | area at WORKBUF in advance. Returns the length of the multi-byte | 207 | area at WORKBUF in advance. Returns the length of the multi-byte |
| 132 | form. If C is an invalid character to have a multi-byte form, | 208 | form. If C is an invalid character, store (C & 0xFF) in WORKBUF[0] |
| 133 | signal an error. | 209 | and return 1. |
| 134 | 210 | ||
| 135 | Use macro `CHAR_STRING (C, WORKBUF, STR)' instead of calling this | 211 | Use macro `CHAR_STRING (C, WORKBUF, STR)' instead of calling this |
| 136 | function directly if C can be an ASCII character. */ | 212 | function directly if C can be an ASCII character. */ |
| @@ -140,8 +216,6 @@ non_ascii_char_to_string (c, workbuf, str) | |||
| 140 | int c; | 216 | int c; |
| 141 | unsigned char *workbuf, **str; | 217 | unsigned char *workbuf, **str; |
| 142 | { | 218 | { |
| 143 | int charset, c1, c2; | ||
| 144 | |||
| 145 | if (c & CHAR_MODIFIER_MASK) /* This includes the case C is negative. */ | 219 | if (c & CHAR_MODIFIER_MASK) /* This includes the case C is negative. */ |
| 146 | { | 220 | { |
| 147 | /* Multibyte character can't have a modifier bit. */ | 221 | /* Multibyte character can't have a modifier bit. */ |
| @@ -183,111 +257,79 @@ non_ascii_char_to_string (c, workbuf, str) | |||
| 183 | invalid_character (c); | 257 | invalid_character (c); |
| 184 | 258 | ||
| 185 | *str = workbuf; | 259 | *str = workbuf; |
| 186 | *workbuf = c; | 260 | *workbuf++ = c; |
| 187 | return 1; | ||
| 188 | } | 261 | } |
| 189 | 262 | else | |
| 190 | if (c < 0) | ||
| 191 | invalid_character (c); | ||
| 192 | |||
| 193 | if (COMPOSITE_CHAR_P (c)) | ||
| 194 | { | 263 | { |
| 195 | int cmpchar_id = COMPOSITE_CHAR_ID (c); | 264 | int charset, c1, c2; |
| 196 | 265 | ||
| 197 | if (cmpchar_id < n_cmpchars) | 266 | SPLIT_NON_ASCII_CHAR (c, charset, c1, c2); |
| 267 | if (charset == CHARSET_COMPOSITION) | ||
| 198 | { | 268 | { |
| 199 | *str = cmpchar_table[cmpchar_id]->data; | 269 | if (c >= MAX_CHAR) |
| 200 | return cmpchar_table[cmpchar_id]->len; | 270 | invalid_character (c); |
| 271 | if (c >= MIN_CHAR_COMPOSITION) | ||
| 272 | { | ||
| 273 | /* Valid composite character. */ | ||
| 274 | *str = cmpchar_table[COMPOSITE_CHAR_ID (c)]->data; | ||
| 275 | workbuf = *str + cmpchar_table[COMPOSITE_CHAR_ID (c)]->len; | ||
| 276 | } | ||
| 277 | else | ||
| 278 | { | ||
| 279 | /* Invalid but can have multibyte form. */ | ||
| 280 | *str = workbuf; | ||
| 281 | *workbuf++ = LEADING_CODE_COMPOSITION; | ||
| 282 | *workbuf++ = c1 | 0x80; | ||
| 283 | } | ||
| 201 | } | 284 | } |
| 202 | else | 285 | else if (charset > CHARSET_COMPOSITION) |
| 203 | { | 286 | { |
| 204 | invalid_character (c); | 287 | *str = workbuf; |
| 288 | if (charset >= LEADING_CODE_EXT_11) | ||
| 289 | *workbuf++ = (charset < LEADING_CODE_EXT_12 | ||
| 290 | ? LEADING_CODE_PRIVATE_11 | ||
| 291 | : (charset < LEADING_CODE_EXT_21 | ||
| 292 | ? LEADING_CODE_PRIVATE_12 | ||
| 293 | : (charset < LEADING_CODE_EXT_22 | ||
| 294 | ? LEADING_CODE_PRIVATE_21 | ||
| 295 | : LEADING_CODE_PRIVATE_22))); | ||
| 296 | *workbuf++ = charset; | ||
| 297 | if (c1 > 0 && c1 < 32 || c2 > 0 && c2 < 32) | ||
| 298 | invalid_character (c); | ||
| 299 | if (c1) | ||
| 300 | { | ||
| 301 | *workbuf++ = c1 | 0x80; | ||
| 302 | if (c2 > 0) | ||
| 303 | *workbuf++ = c2 | 0x80; | ||
| 304 | } | ||
| 205 | } | 305 | } |
| 306 | else if (charset == CHARSET_ASCII) | ||
| 307 | *workbuf++= c & 0x7F; | ||
| 308 | else | ||
| 309 | invalid_character (c); | ||
| 206 | } | 310 | } |
| 207 | 311 | ||
| 208 | SPLIT_NON_ASCII_CHAR (c, charset, c1, c2); | ||
| 209 | if (!charset | ||
| 210 | || ! CHARSET_DEFINED_P (charset) | ||
| 211 | || c1 >= 0 && c1 < 32 | ||
| 212 | || c2 >= 0 && c2 < 32) | ||
| 213 | invalid_character (c); | ||
| 214 | |||
| 215 | *str = workbuf; | ||
| 216 | *workbuf++ = CHARSET_LEADING_CODE_BASE (charset); | ||
| 217 | if (*workbuf = CHARSET_LEADING_CODE_EXT (charset)) | ||
| 218 | workbuf++; | ||
| 219 | *workbuf++ = c1 | 0x80; | ||
| 220 | if (c2 >= 0) | ||
| 221 | *workbuf++ = c2 | 0x80; | ||
| 222 | |||
| 223 | return (workbuf - *str); | 312 | return (workbuf - *str); |
| 224 | } | 313 | } |
| 225 | 314 | ||
| 226 | /* Return a non-ASCII character of which multi-byte form is at STR of | 315 | /* Return a non-ASCII character of which multi-byte form is at STR of |
| 227 | length LEN. If ACTUAL_LEN is not NULL, the actual length of the | 316 | length LEN. If ACTUAL_LEN is not NULL, the byte length of the |
| 228 | multibyte form is set to the address ACTUAL_LEN. | 317 | multibyte form is set to the address ACTUAL_LEN. |
| 229 | 318 | ||
| 230 | If exclude_tail_garbage is nonzero, ACTUAL_LEN excludes gabage | ||
| 231 | bytes following the non-ASCII character. | ||
| 232 | |||
| 233 | Use macro `STRING_CHAR (STR, LEN)' instead of calling this function | 319 | Use macro `STRING_CHAR (STR, LEN)' instead of calling this function |
| 234 | directly if STR can hold an ASCII character. */ | 320 | directly if STR can hold an ASCII character. */ |
| 235 | 321 | ||
| 236 | int | 322 | int |
| 237 | string_to_non_ascii_char (str, len, actual_len, exclude_tail_garbage) | 323 | string_to_non_ascii_char (str, len, actual_len) |
| 238 | const unsigned char *str; | 324 | const unsigned char *str; |
| 239 | int len, *actual_len, exclude_tail_garbage; | 325 | int len, *actual_len; |
| 240 | { | 326 | { |
| 241 | int charset; | 327 | int c, bytes, charset, c1, c2; |
| 242 | unsigned char c1, c2; | ||
| 243 | int c, bytes; | ||
| 244 | const unsigned char *begp = str; | ||
| 245 | |||
| 246 | c = *str++; | ||
| 247 | bytes = 1; | ||
| 248 | |||
| 249 | if (BASE_LEADING_CODE_P (c)) | ||
| 250 | do { | ||
| 251 | while (bytes < len && ! CHAR_HEAD_P (begp[bytes])) bytes++; | ||
| 252 | |||
| 253 | if (c == LEADING_CODE_COMPOSITION) | ||
| 254 | { | ||
| 255 | int cmpchar_id = str_cmpchar_id (begp, bytes); | ||
| 256 | |||
| 257 | if (cmpchar_id >= 0) | ||
| 258 | { | ||
| 259 | c = MAKE_COMPOSITE_CHAR (cmpchar_id); | ||
| 260 | str += cmpchar_table[cmpchar_id]->len - 1; | ||
| 261 | } | ||
| 262 | else | ||
| 263 | str += bytes - 1; | ||
| 264 | } | ||
| 265 | else | ||
| 266 | { | ||
| 267 | const unsigned char *endp = begp + bytes; | ||
| 268 | int charset = c, c1, c2 = 0; | ||
| 269 | |||
| 270 | if (str >= endp) break; | ||
| 271 | if (c >= LEADING_CODE_PRIVATE_11 && c <= LEADING_CODE_PRIVATE_22) | ||
| 272 | { | ||
| 273 | charset = *str++; | ||
| 274 | if (str < endp) | ||
| 275 | c1 = *str++ & 0x7F; | ||
| 276 | else | ||
| 277 | c1 = charset, charset = c; | ||
| 278 | } | ||
| 279 | else | ||
| 280 | c1 = *str++ & 0x7f; | ||
| 281 | if (CHARSET_DEFINED_P (charset) | ||
| 282 | && CHARSET_DIMENSION (charset) == 2 | ||
| 283 | && str < endp) | ||
| 284 | c2 = *str++ & 0x7F; | ||
| 285 | c = MAKE_NON_ASCII_CHAR (charset, c1, c2); | ||
| 286 | } | ||
| 287 | } while (0); | ||
| 288 | 328 | ||
| 329 | SPLIT_MULTIBYTE_SEQ (str, len, bytes, charset, c1, c2); | ||
| 330 | c = MAKE_CHAR (charset, c1, c2); | ||
| 289 | if (actual_len) | 331 | if (actual_len) |
| 290 | *actual_len = exclude_tail_garbage ? str - begp : bytes; | 332 | *actual_len = bytes; |
| 291 | return c; | 333 | return c; |
| 292 | } | 334 | } |
| 293 | 335 | ||
| @@ -297,53 +339,59 @@ multibyte_form_length (str, len) | |||
| 297 | const unsigned char *str; | 339 | const unsigned char *str; |
| 298 | int len; | 340 | int len; |
| 299 | { | 341 | { |
| 300 | int bytes = 1; | 342 | int bytes; |
| 301 | |||
| 302 | if (BASE_LEADING_CODE_P (*str)) | ||
| 303 | while (bytes < len && ! CHAR_HEAD_P (str[bytes])) bytes++; | ||
| 304 | 343 | ||
| 344 | PARSE_MULTIBYTE_SEQ (str, len, bytes); | ||
| 305 | return bytes; | 345 | return bytes; |
| 306 | } | 346 | } |
| 307 | 347 | ||
| 308 | /* Check if string STR of length LEN contains valid multi-byte form of | 348 | /* Check multibyte form at string STR of length LEN and set variables |
| 309 | a character. If valid, charset and position codes of the character | 349 | pointed by CHARSET, C1, and C2 to charset and position codes of the |
| 310 | is set at *CHARSET, *C1, and *C2, and return 0. If not valid, | 350 | character at STR, and return 0. If there's no multibyte character, |
| 311 | return -1. This should be used only in the macro SPLIT_STRING | 351 | return -1. This should be used only in the macro SPLIT_STRING |
| 312 | which checks range of STR in advance. */ | 352 | which checks range of STR in advance. */ |
| 313 | 353 | ||
| 314 | int | 354 | int |
| 315 | split_non_ascii_string (str, len, charset, c1, c2) | 355 | split_non_ascii_string (str, len, charset, c1, c2) |
| 316 | register const unsigned char *str; | 356 | const unsigned char *str; |
| 317 | register unsigned char *c1, *c2; | 357 | unsigned char *c1, *c2; |
| 318 | register int len, *charset; | 358 | int len, *charset; |
| 319 | { | 359 | { |
| 320 | register unsigned int cs = *str++; | 360 | register int bytes, cs, code1, code2 = -1; |
| 321 | |||
| 322 | if (cs == LEADING_CODE_COMPOSITION) | ||
| 323 | { | ||
| 324 | int cmpchar_id = str_cmpchar_id (str - 1, len); | ||
| 325 | 361 | ||
| 326 | if (cmpchar_id < 0) | 362 | SPLIT_MULTIBYTE_SEQ (str, len, bytes, cs, code1, code2); |
| 327 | return -1; | 363 | if (cs == CHARSET_ASCII) |
| 328 | *charset = cs, *c1 = cmpchar_id >> 7, *c2 = cmpchar_id & 0x7F; | ||
| 329 | } | ||
| 330 | else if ((cs < LEADING_CODE_PRIVATE_11 || (cs = *str++) >= 0xA0) | ||
| 331 | && CHARSET_DEFINED_P (cs)) | ||
| 332 | { | ||
| 333 | *charset = cs; | ||
| 334 | if (*str < 0xA0) | ||
| 335 | return -1; | ||
| 336 | *c1 = (*str++) & 0x7F; | ||
| 337 | if (CHARSET_DIMENSION (cs) == 2) | ||
| 338 | { | ||
| 339 | if (*str < 0xA0) | ||
| 340 | return -1; | ||
| 341 | *c2 = (*str++) & 0x7F; | ||
| 342 | } | ||
| 343 | } | ||
| 344 | else | ||
| 345 | return -1; | 364 | return -1; |
| 346 | return 0; | 365 | *charset = cs; |
| 366 | *c1 = code1; | ||
| 367 | *c2 = code2; | ||
| 368 | } | ||
| 369 | |||
| 370 | /* Return 1 iff character C has valid printable glyph. */ | ||
| 371 | int | ||
| 372 | char_printable_p (c) | ||
| 373 | int c; | ||
| 374 | { | ||
| 375 | int charset, c1, c2, chars; | ||
| 376 | |||
| 377 | if (SINGLE_BYTE_CHAR_P (c)) | ||
| 378 | return 1; | ||
| 379 | if (c >= MIN_CHAR_COMPOSITION) | ||
| 380 | return (c < MAX_CHAR); | ||
| 381 | |||
| 382 | SPLIT_NON_ASCII_CHAR (c, charset, c1, c2); | ||
| 383 | if (! CHARSET_DEFINED_P (charset)) | ||
| 384 | return 0; | ||
| 385 | if (CHARSET_CHARS (charset) == 94 | ||
| 386 | ? c1 <= 32 || c1 >= 127 | ||
| 387 | : c1 < 32) | ||
| 388 | return 0; | ||
| 389 | if (CHARSET_DIMENSION (charset) == 2 | ||
| 390 | && (CHARSET_CHARS (charset) == 94 | ||
| 391 | ? c2 <= 32 || c2 >= 127 | ||
| 392 | : c2 < 32)) | ||
| 393 | return 0; | ||
| 394 | return 1; | ||
| 347 | } | 395 | } |
| 348 | 396 | ||
| 349 | /* Translate character C by translation table TABLE. If C | 397 | /* Translate character C by translation table TABLE. If C |
| @@ -360,8 +408,7 @@ translate_char (table, c, charset, c1, c2) | |||
| 360 | 408 | ||
| 361 | if (c < 0) c = MAKE_CHAR (charset, c1, c2); | 409 | if (c < 0) c = MAKE_CHAR (charset, c1, c2); |
| 362 | if (!CHAR_TABLE_P (table) | 410 | if (!CHAR_TABLE_P (table) |
| 363 | || (ch = Faref (table, make_number (c)), !INTEGERP (ch)) | 411 | || (ch = Faref (table, make_number (c)), !NATNUMP (ch))) |
| 364 | || XINT (ch) < 0) | ||
| 365 | return c; | 412 | return c; |
| 366 | 413 | ||
| 367 | SPLIT_CHAR (XFASTINT (ch), alt_charset, alt_c1, alt_c2); | 414 | SPLIT_CHAR (XFASTINT (ch), alt_charset, alt_c1, alt_c2); |
| @@ -396,13 +443,13 @@ unibyte_char_to_multibyte (c) | |||
| 396 | if (! NILP (Vnonascii_translation_table)) | 443 | if (! NILP (Vnonascii_translation_table)) |
| 397 | { | 444 | { |
| 398 | c = XINT (Faref (Vnonascii_translation_table, make_number (c))); | 445 | c = XINT (Faref (Vnonascii_translation_table, make_number (c))); |
| 399 | if (c >= 0400 && ! VALID_MULTIBYTE_CHAR_P (c)) | 446 | if (c >= 0400 && ! char_valid_p (c, 0)) |
| 400 | c = c_save + DEFAULT_NONASCII_INSERT_OFFSET; | 447 | c = c_save + DEFAULT_NONASCII_INSERT_OFFSET; |
| 401 | } | 448 | } |
| 402 | else if (c >= 0240 && nonascii_insert_offset > 0) | 449 | else if (c >= 0240 && nonascii_insert_offset > 0) |
| 403 | { | 450 | { |
| 404 | c += nonascii_insert_offset; | 451 | c += nonascii_insert_offset; |
| 405 | if (c < 0400 || ! VALID_MULTIBYTE_CHAR_P (c)) | 452 | if (c < 0400 || ! char_valid_p (c, 0)) |
| 406 | c = c_save + DEFAULT_NONASCII_INSERT_OFFSET; | 453 | c = c_save + DEFAULT_NONASCII_INSERT_OFFSET; |
| 407 | } | 454 | } |
| 408 | else if (c >= 0240) | 455 | else if (c >= 0240) |
| @@ -987,21 +1034,40 @@ DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0, | |||
| 987 | (charset, code1, code2) | 1034 | (charset, code1, code2) |
| 988 | Lisp_Object charset, code1, code2; | 1035 | Lisp_Object charset, code1, code2; |
| 989 | { | 1036 | { |
| 1037 | int charset_id, c1, c2; | ||
| 1038 | |||
| 990 | CHECK_NUMBER (charset, 0); | 1039 | CHECK_NUMBER (charset, 0); |
| 1040 | charset_id = XINT (charset); | ||
| 1041 | if (!CHARSET_DEFINED_P (charset_id)) | ||
| 1042 | error ("Invalid charset ID: %d", XINT (charset)); | ||
| 991 | 1043 | ||
| 992 | if (NILP (code1)) | 1044 | if (NILP (code1)) |
| 993 | XSETFASTINT (code1, 0); | 1045 | c1 = 0; |
| 994 | else | 1046 | else |
| 995 | CHECK_NUMBER (code1, 1); | 1047 | { |
| 1048 | CHECK_NUMBER (code1, 1); | ||
| 1049 | c1 = XINT (code1); | ||
| 1050 | } | ||
| 996 | if (NILP (code2)) | 1051 | if (NILP (code2)) |
| 997 | XSETFASTINT (code2, 0); | 1052 | c2 = 0; |
| 998 | else | 1053 | else |
| 999 | CHECK_NUMBER (code2, 2); | 1054 | { |
| 1000 | 1055 | CHECK_NUMBER (code2, 2); | |
| 1001 | if (!CHARSET_DEFINED_P (XINT (charset))) | 1056 | c2 = XINT (code2); |
| 1002 | error ("Invalid charset: %d", XINT (charset)); | 1057 | } |
| 1003 | 1058 | ||
| 1004 | return make_number (MAKE_CHAR (XINT (charset), XINT (code1), XINT (code2))); | 1059 | if (c1 < 0 || c1 > 0xFF || c2 < 0 || c2 > 0xFF) |
| 1060 | error ("Invalid code points: %d %d", c1, c2); | ||
| 1061 | c1 &= 0x7F; | ||
| 1062 | c2 &= 0x7F; | ||
| 1063 | if (c1 == 0 | ||
| 1064 | ? c2 != 0 | ||
| 1065 | : (c2 == 0 | ||
| 1066 | ? !CHAR_COMPONENT_VALID_P (charset, c1, 0x20) | ||
| 1067 | : !CHAR_COMPONENT_VALID_P (charset, c1, c2))) | ||
| 1068 | error ("Invalid code points: %d %d", c1, c2); | ||
| 1069 | |||
| 1070 | return make_number (MAKE_CHAR (charset_id, c1, c2)); | ||
| 1005 | } | 1071 | } |
| 1006 | 1072 | ||
| 1007 | DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0, | 1073 | DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0, |
| @@ -1036,13 +1102,13 @@ DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 1, 0, | |||
| 1036 | } | 1102 | } |
| 1037 | 1103 | ||
| 1038 | DEFUN ("charset-after", Fcharset_after, Scharset_after, 0, 1, 0, | 1104 | DEFUN ("charset-after", Fcharset_after, Scharset_after, 0, 1, 0, |
| 1039 | "Return charset of a character in current buffer at position POS.\n\ | 1105 | "Return charset of a character in the current buffer at position POS.\n\ |
| 1040 | If POS is nil, it defauls to the current point.\n\ | 1106 | If POS is nil, it defauls to the current point.\n\ |
| 1041 | If POS is out of range, the value is nil.") | 1107 | If POS is out of range, the value is nil.") |
| 1042 | (pos) | 1108 | (pos) |
| 1043 | Lisp_Object pos; | 1109 | Lisp_Object pos; |
| 1044 | { | 1110 | { |
| 1045 | register int pos_byte, c, charset; | 1111 | register int pos_byte, bytes, charset, c1, c2; |
| 1046 | register unsigned char *p; | 1112 | register unsigned char *p; |
| 1047 | 1113 | ||
| 1048 | if (NILP (pos)) | 1114 | if (NILP (pos)) |
| @@ -1061,8 +1127,15 @@ If POS is out of range, the value is nil.") | |||
| 1061 | pos_byte = CHAR_TO_BYTE (XINT (pos)); | 1127 | pos_byte = CHAR_TO_BYTE (XINT (pos)); |
| 1062 | } | 1128 | } |
| 1063 | p = BYTE_POS_ADDR (pos_byte); | 1129 | p = BYTE_POS_ADDR (pos_byte); |
| 1064 | c = STRING_CHAR (p, Z_BYTE - pos_byte); | 1130 | if (BASE_LEADING_CODE_P (*p)) |
| 1065 | charset = CHAR_CHARSET (c); | 1131 | { |
| 1132 | SPLIT_MULTIBYTE_SEQ (p, Z_BYTE - pos_byte, bytes, charset, c1, c2); | ||
| 1133 | if (charset < 0) | ||
| 1134 | charset = 1; | ||
| 1135 | } | ||
| 1136 | else | ||
| 1137 | charset = CHARSET_ASCII; | ||
| 1138 | |||
| 1066 | return CHARSET_SYMBOL (charset); | 1139 | return CHARSET_SYMBOL (charset); |
| 1067 | } | 1140 | } |
| 1068 | 1141 | ||
| @@ -1103,15 +1176,23 @@ char_valid_p (c, genericp) | |||
| 1103 | if (SINGLE_BYTE_CHAR_P (c)) | 1176 | if (SINGLE_BYTE_CHAR_P (c)) |
| 1104 | return 1; | 1177 | return 1; |
| 1105 | SPLIT_NON_ASCII_CHAR (c, charset, c1, c2); | 1178 | SPLIT_NON_ASCII_CHAR (c, charset, c1, c2); |
| 1106 | if (charset != CHARSET_COMPOSITION && !CHARSET_DEFINED_P (charset)) | 1179 | if (charset == CHARSET_COMPOSITION) |
| 1107 | return 0; | 1180 | return ((c >= MIN_CHAR_COMPOSITION |
| 1108 | return (c < MIN_CHAR_COMPOSITION | 1181 | && c < MIN_CHAR_COMPOSITION + n_cmpchars) |
| 1109 | ? ((c & CHAR_FIELD1_MASK) /* i.e. dimension of C is two. */ | 1182 | || (genericp && c == GENERIC_COMPOSITION_CHAR)); |
| 1110 | ? (genericp && c1 == 0 && c2 == 0 | 1183 | if (genericp) |
| 1111 | || c1 >= 32 && c2 >= 32) | 1184 | { |
| 1112 | : (genericp && c1 == 0 | 1185 | if (c1) |
| 1113 | || c1 >= 32)) | 1186 | { |
| 1114 | : c < MIN_CHAR_COMPOSITION + n_cmpchars); | 1187 | if (c2 <= 0) c2 = 0x20; |
| 1188 | } | ||
| 1189 | else | ||
| 1190 | { | ||
| 1191 | if (c2 <= 0) c1 = c2 = 0x20; | ||
| 1192 | } | ||
| 1193 | } | ||
| 1194 | return (CHARSET_DEFINED_P (charset) | ||
| 1195 | && CHAR_COMPONENT_VALID_P (charset, c1, c2)); | ||
| 1115 | } | 1196 | } |
| 1116 | 1197 | ||
| 1117 | DEFUN ("char-valid-p", Fchar_valid_p, Schar_valid_p, 1, 2, 0, | 1198 | DEFUN ("char-valid-p", Fchar_valid_p, Schar_valid_p, 1, 2, 0, |
| @@ -1158,7 +1239,7 @@ The conversion is done based on `nonascii-translation-table' (which see)\n\ | |||
| 1158 | 1239 | ||
| 1159 | CHECK_NUMBER (ch, 0); | 1240 | CHECK_NUMBER (ch, 0); |
| 1160 | c = XINT (ch); | 1241 | c = XINT (ch); |
| 1161 | if (c < 0) | 1242 | if (! CHAR_VALID_P (c, 0)) |
| 1162 | error ("Invalid multibyte character: %d", c); | 1243 | error ("Invalid multibyte character: %d", c); |
| 1163 | c = multibyte_char_to_unibyte (c, Qnil); | 1244 | c = multibyte_char_to_unibyte (c, Qnil); |
| 1164 | if (c < 0) | 1245 | if (c < 0) |
| @@ -1369,27 +1450,12 @@ chars_in_text (ptr, nbytes) | |||
| 1369 | unsigned char *ptr; | 1450 | unsigned char *ptr; |
| 1370 | int nbytes; | 1451 | int nbytes; |
| 1371 | { | 1452 | { |
| 1372 | unsigned char *endp, c; | ||
| 1373 | int chars; | ||
| 1374 | |||
| 1375 | /* current_buffer is null at early stages of Emacs initialization. */ | 1453 | /* current_buffer is null at early stages of Emacs initialization. */ |
| 1376 | if (current_buffer == 0 | 1454 | if (current_buffer == 0 |
| 1377 | || NILP (current_buffer->enable_multibyte_characters)) | 1455 | || NILP (current_buffer->enable_multibyte_characters)) |
| 1378 | return nbytes; | 1456 | return nbytes; |
| 1379 | 1457 | ||
| 1380 | endp = ptr + nbytes; | 1458 | return multibyte_chars_in_text (ptr, nbytes); |
| 1381 | chars = 0; | ||
| 1382 | |||
| 1383 | while (ptr < endp) | ||
| 1384 | { | ||
| 1385 | c = *ptr++; | ||
| 1386 | |||
| 1387 | if (BASE_LEADING_CODE_P (c)) | ||
| 1388 | while (ptr < endp && ! CHAR_HEAD_P (*ptr)) ptr++; | ||
| 1389 | chars++; | ||
| 1390 | } | ||
| 1391 | |||
| 1392 | return chars; | ||
| 1393 | } | 1459 | } |
| 1394 | 1460 | ||
| 1395 | /* Return the number of characters in the NBYTES bytes at PTR. | 1461 | /* Return the number of characters in the NBYTES bytes at PTR. |
| @@ -1401,18 +1467,25 @@ multibyte_chars_in_text (ptr, nbytes) | |||
| 1401 | unsigned char *ptr; | 1467 | unsigned char *ptr; |
| 1402 | int nbytes; | 1468 | int nbytes; |
| 1403 | { | 1469 | { |
| 1404 | unsigned char *endp, c; | 1470 | unsigned char *endp; |
| 1405 | int chars; | 1471 | int chars, bytes; |
| 1406 | 1472 | ||
| 1407 | endp = ptr + nbytes; | 1473 | endp = ptr + nbytes; |
| 1408 | chars = 0; | 1474 | chars = 0; |
| 1409 | 1475 | ||
| 1410 | while (ptr < endp) | 1476 | while (ptr < endp) |
| 1411 | { | 1477 | { |
| 1412 | c = *ptr++; | 1478 | if (BASE_LEADING_CODE_P (*ptr)) |
| 1413 | 1479 | { | |
| 1414 | if (BASE_LEADING_CODE_P (c)) | 1480 | PARSE_MULTIBYTE_SEQ (ptr, nbytes, bytes); |
| 1415 | while (ptr < endp && ! CHAR_HEAD_P (*ptr)) ptr++; | 1481 | ptr += bytes; |
| 1482 | nbytes -= bytes; | ||
| 1483 | } | ||
| 1484 | else | ||
| 1485 | { | ||
| 1486 | ptr++; | ||
| 1487 | nbytes--; | ||
| 1488 | } | ||
| 1416 | chars++; | 1489 | chars++; |
| 1417 | } | 1490 | } |
| 1418 | 1491 | ||
| @@ -1514,7 +1587,7 @@ str_cmpchar_id (str, len) | |||
| 1514 | int i; | 1587 | int i; |
| 1515 | struct cmpchar_info *cmpcharp; | 1588 | struct cmpchar_info *cmpcharp; |
| 1516 | 1589 | ||
| 1517 | /* The second byte 0xFF means compostion rule is embedded. */ | 1590 | /* The second byte 0xFF means COMPOSITION rule is embedded. */ |
| 1518 | embedded_rule = (str[1] == 0xFF); | 1591 | embedded_rule = (str[1] == 0xFF); |
| 1519 | 1592 | ||
| 1520 | /* At first, get the actual length of the composite character. */ | 1593 | /* At first, get the actual length of the composite character. */ |
| @@ -1650,7 +1723,7 @@ str_cmpchar_id (str, len) | |||
| 1650 | /* Make `bufp' point to the normal multi-byte form temporarily. */ | 1723 | /* Make `bufp' point to the normal multi-byte form temporarily. */ |
| 1651 | *bufp -= 0x20; | 1724 | *bufp -= 0x20; |
| 1652 | cmpcharp->glyph[i] | 1725 | cmpcharp->glyph[i] |
| 1653 | = FAST_MAKE_GLYPH (string_to_non_ascii_char (bufp, 4, 0, 0), 0); | 1726 | = FAST_MAKE_GLYPH (string_to_non_ascii_char (bufp, 4, 0), 0); |
| 1654 | width = WIDTH_BY_CHAR_HEAD (*bufp); | 1727 | width = WIDTH_BY_CHAR_HEAD (*bufp); |
| 1655 | *bufp += 0x20; | 1728 | *bufp += 0x20; |
| 1656 | bufp += BYTES_BY_CHAR_HEAD (*bufp - 0x20); | 1729 | bufp += BYTES_BY_CHAR_HEAD (*bufp - 0x20); |
| @@ -1870,7 +1943,7 @@ DEFUN ("compose-string", Fcompose_string, Scompose_string, | |||
| 1870 | { | 1943 | { |
| 1871 | /* Add 0x20 to the base leading-code, keep the remaining | 1944 | /* Add 0x20 to the base leading-code, keep the remaining |
| 1872 | bytes unchanged. */ | 1945 | bytes unchanged. */ |
| 1873 | int c = STRING_CHAR_AND_CHAR_LENGTH (p, pend - p, len); | 1946 | int c = STRING_CHAR_AND_LENGTH (p, pend - p, len); |
| 1874 | 1947 | ||
| 1875 | if (len <= 1 || ! CHAR_VALID_P (c, 0)) | 1948 | if (len <= 1 || ! CHAR_VALID_P (c, 0)) |
| 1876 | error ("Can't compose an invalid character"); | 1949 | error ("Can't compose an invalid character"); |