diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/buffer.c | 3 | ||||
| -rw-r--r-- | src/character.c | 42 | ||||
| -rw-r--r-- | src/character.h | 96 | ||||
| -rw-r--r-- | src/coding.c | 14 |
4 files changed, 87 insertions, 68 deletions
diff --git a/src/buffer.c b/src/buffer.c index 5398414e6eb..53b3bd960c4 100644 --- a/src/buffer.c +++ b/src/buffer.c | |||
| @@ -2634,8 +2634,7 @@ current buffer is cleared. */) | |||
| 2634 | if (ASCII_CHAR_P (*p)) | 2634 | if (ASCII_CHAR_P (*p)) |
| 2635 | p++, pos++; | 2635 | p++, pos++; |
| 2636 | else if (EQ (flag, Qt) | 2636 | else if (EQ (flag, Qt) |
| 2637 | && ! CHAR_BYTE8_HEAD_P (*p) | 2637 | && 0 < (bytes = multibyte_length (p, pend, true, false))) |
| 2638 | && (bytes = MULTIBYTE_LENGTH (p, pend)) > 0) | ||
| 2639 | p += bytes, pos += bytes; | 2638 | p += bytes, pos += bytes; |
| 2640 | else | 2639 | else |
| 2641 | { | 2640 | { |
diff --git a/src/character.c b/src/character.c index 303c83ccec3..da09e77e131 100644 --- a/src/character.c +++ b/src/character.c | |||
| @@ -486,7 +486,7 @@ multibyte_chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes) | |||
| 486 | 486 | ||
| 487 | while (ptr < endp) | 487 | while (ptr < endp) |
| 488 | { | 488 | { |
| 489 | int len = MULTIBYTE_LENGTH (ptr, endp); | 489 | int len = multibyte_length (ptr, endp, true, true); |
| 490 | 490 | ||
| 491 | if (len == 0) | 491 | if (len == 0) |
| 492 | emacs_abort (); | 492 | emacs_abort (); |
| @@ -508,7 +508,6 @@ parse_str_as_multibyte (const unsigned char *str, ptrdiff_t len, | |||
| 508 | ptrdiff_t *nchars, ptrdiff_t *nbytes) | 508 | ptrdiff_t *nchars, ptrdiff_t *nbytes) |
| 509 | { | 509 | { |
| 510 | const unsigned char *endp = str + len; | 510 | const unsigned char *endp = str + len; |
| 511 | int n; | ||
| 512 | ptrdiff_t chars = 0, bytes = 0; | 511 | ptrdiff_t chars = 0, bytes = 0; |
| 513 | 512 | ||
| 514 | if (len >= MAX_MULTIBYTE_LENGTH) | 513 | if (len >= MAX_MULTIBYTE_LENGTH) |
| @@ -516,8 +515,8 @@ parse_str_as_multibyte (const unsigned char *str, ptrdiff_t len, | |||
| 516 | const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH; | 515 | const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH; |
| 517 | while (str < adjusted_endp) | 516 | while (str < adjusted_endp) |
| 518 | { | 517 | { |
| 519 | if (! CHAR_BYTE8_HEAD_P (*str) | 518 | int n = multibyte_length (str, NULL, false, false); |
| 520 | && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0) | 519 | if (0 < n) |
| 521 | str += n, bytes += n; | 520 | str += n, bytes += n; |
| 522 | else | 521 | else |
| 523 | str++, bytes += 2; | 522 | str++, bytes += 2; |
| @@ -526,8 +525,8 @@ parse_str_as_multibyte (const unsigned char *str, ptrdiff_t len, | |||
| 526 | } | 525 | } |
| 527 | while (str < endp) | 526 | while (str < endp) |
| 528 | { | 527 | { |
| 529 | if (! CHAR_BYTE8_HEAD_P (*str) | 528 | int n = multibyte_length (str, endp, true, false); |
| 530 | && (n = MULTIBYTE_LENGTH (str, endp)) > 0) | 529 | if (0 < n) |
| 531 | str += n, bytes += n; | 530 | str += n, bytes += n; |
| 532 | else | 531 | else |
| 533 | str++, bytes += 2; | 532 | str++, bytes += 2; |
| @@ -554,20 +553,25 @@ str_as_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t nbytes, | |||
| 554 | unsigned char *p = str, *endp = str + nbytes; | 553 | unsigned char *p = str, *endp = str + nbytes; |
| 555 | unsigned char *to; | 554 | unsigned char *to; |
| 556 | ptrdiff_t chars = 0; | 555 | ptrdiff_t chars = 0; |
| 557 | int n; | ||
| 558 | 556 | ||
| 559 | if (nbytes >= MAX_MULTIBYTE_LENGTH) | 557 | if (nbytes >= MAX_MULTIBYTE_LENGTH) |
| 560 | { | 558 | { |
| 561 | unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH; | 559 | unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH; |
| 562 | while (p < adjusted_endp | 560 | while (p < adjusted_endp) |
| 563 | && ! CHAR_BYTE8_HEAD_P (*p) | 561 | { |
| 564 | && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0) | 562 | int n = multibyte_length (p, NULL, false, false); |
| 565 | p += n, chars++; | 563 | if (n <= 0) |
| 564 | break; | ||
| 565 | p += n, chars++; | ||
| 566 | } | ||
| 567 | } | ||
| 568 | while (true) | ||
| 569 | { | ||
| 570 | int n = multibyte_length (p, endp, true, false); | ||
| 571 | if (n <= 0) | ||
| 572 | break; | ||
| 573 | p += n, chars++; | ||
| 566 | } | 574 | } |
| 567 | while (p < endp | ||
| 568 | && ! CHAR_BYTE8_HEAD_P (*p) | ||
| 569 | && (n = MULTIBYTE_LENGTH (p, endp)) > 0) | ||
| 570 | p += n, chars++; | ||
| 571 | if (nchars) | 575 | if (nchars) |
| 572 | *nchars = chars; | 576 | *nchars = chars; |
| 573 | if (p == endp) | 577 | if (p == endp) |
| @@ -584,8 +588,8 @@ str_as_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t nbytes, | |||
| 584 | unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH; | 588 | unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH; |
| 585 | while (p < adjusted_endp) | 589 | while (p < adjusted_endp) |
| 586 | { | 590 | { |
| 587 | if (! CHAR_BYTE8_HEAD_P (*p) | 591 | int n = multibyte_length (p, NULL, false, false); |
| 588 | && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0) | 592 | if (0 < n) |
| 589 | { | 593 | { |
| 590 | while (n--) | 594 | while (n--) |
| 591 | *to++ = *p++; | 595 | *to++ = *p++; |
| @@ -601,8 +605,8 @@ str_as_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t nbytes, | |||
| 601 | } | 605 | } |
| 602 | while (p < endp) | 606 | while (p < endp) |
| 603 | { | 607 | { |
| 604 | if (! CHAR_BYTE8_HEAD_P (*p) | 608 | int n = multibyte_length (p, endp, true, false); |
| 605 | && (n = MULTIBYTE_LENGTH (p, endp)) > 0) | 609 | if (0 < n) |
| 606 | { | 610 | { |
| 607 | while (n--) | 611 | while (n--) |
| 608 | *to++ = *p++; | 612 | *to++ = *p++; |
diff --git a/src/character.h b/src/character.h index 81320dedd17..4887473b27e 100644 --- a/src/character.h +++ b/src/character.h | |||
| @@ -31,15 +31,19 @@ INLINE_HEADER_BEGIN | |||
| 31 | /* character code 1st byte byte sequence | 31 | /* character code 1st byte byte sequence |
| 32 | -------------- -------- ------------- | 32 | -------------- -------- ------------- |
| 33 | 0-7F 00..7F 0xxxxxxx | 33 | 0-7F 00..7F 0xxxxxxx |
| 34 | 80-7FF C2..DF 110xxxxx 10xxxxxx | 34 | 80-7FF C2..DF 110yyyyx 10xxxxxx |
| 35 | 800-FFFF E0..EF 1110xxxx 10xxxxxx 10xxxxxx | 35 | 800-FFFF E0..EF 1110yyyy 10yxxxxx 10xxxxxx |
| 36 | 10000-1FFFFF F0..F7 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx | 36 | 10000-1FFFFF F0..F7 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx |
| 37 | 200000-3FFF7F F8 11111000 1000xxxx 10xxxxxx 10xxxxxx 10xxxxxx | 37 | 200000-3FFF7F F8 11111000 1000yxxx 10xxxxxx 10xxxxxx 10xxxxxx |
| 38 | 3FFF80-3FFFFF C0..C1 1100000x 10xxxxxx (for eight-bit-char) | 38 | 3FFF80-3FFFFF C0..C1 1100000x 10xxxxxx (for eight-bit-char) |
| 39 | 400000-... invalid | 39 | 400000-... invalid |
| 40 | 40 | ||
| 41 | invalid 1st byte 80..BF 10xxxxxx | 41 | invalid 1st byte 80..BF 10xxxxxx |
| 42 | F9..FF 11111xxx (xxx != 000) | 42 | F9..FF 11111yyy |
| 43 | |||
| 44 | In each bit pattern, 'x' and 'y' each represent a single bit of the | ||
| 45 | character code payload, and at least one 'y' must be a 1 bit. | ||
| 46 | In the 5-byte sequence, the 22-bit payload cannot exceed 3FFF7F. | ||
| 43 | */ | 47 | */ |
| 44 | 48 | ||
| 45 | /* Maximum character code ((1 << CHARACTERBITS) - 1). */ | 49 | /* Maximum character code ((1 << CHARACTERBITS) - 1). */ |
| @@ -284,7 +288,7 @@ CHAR_HEAD_P (int byte) | |||
| 284 | } | 288 | } |
| 285 | 289 | ||
| 286 | /* How many bytes a character that starts with BYTE occupies in a | 290 | /* How many bytes a character that starts with BYTE occupies in a |
| 287 | multibyte form. Unlike MULTIBYTE_LENGTH below, this function does not | 291 | multibyte form. Unlike multibyte_length, this function does not |
| 288 | validate the multibyte form, but looks only at its first byte. */ | 292 | validate the multibyte form, but looks only at its first byte. */ |
| 289 | INLINE int | 293 | INLINE int |
| 290 | BYTES_BY_CHAR_HEAD (int byte) | 294 | BYTES_BY_CHAR_HEAD (int byte) |
| @@ -297,44 +301,54 @@ BYTES_BY_CHAR_HEAD (int byte) | |||
| 297 | } | 301 | } |
| 298 | 302 | ||
| 299 | 303 | ||
| 300 | /* The byte length of multibyte form at unibyte string P ending at | 304 | /* The byte length of the multibyte form at the unibyte string P, |
| 301 | PEND. If the string doesn't point to a valid multibyte form, | 305 | ending at PEND if CHECK, and without a length check if !CHECK. |
| 302 | return 0. Unlike BYTES_BY_CHAR_HEAD, this macro validates the | 306 | If ALLOW_8BIT, allow multibyte forms of eight-bit characters. |
| 303 | multibyte form. */ | 307 | If the string doesn't point to a valid multibyte form, return 0. |
| 308 | Unlike BYTES_BY_CHAR_HEAD, this function validates the multibyte form. */ | ||
| 304 | 309 | ||
| 305 | INLINE int | 310 | INLINE int |
| 306 | MULTIBYTE_LENGTH (unsigned char const *p, unsigned char const *pend) | 311 | multibyte_length (unsigned char const *p, unsigned char const *pend, |
| 307 | { | 312 | bool check, bool allow_8bit) |
| 308 | return (! (p < pend) ? 0 | 313 | { |
| 309 | : ! (p[0] & 0x80) ? 1 | 314 | if (!check || p < pend) |
| 310 | : ! (p + 1 < pend && (p[1] & 0xC0) == 0x80) ? 0 | 315 | { |
| 311 | : (p[0] & 0xE0) == 0xC0 ? 2 | 316 | unsigned char c = p[0]; |
| 312 | : ! (p + 2 < pend && (p[2] & 0xC0) == 0x80) ? 0 | 317 | if (c < 0x80) |
| 313 | : (p[0] & 0xF0) == 0xE0 ? 3 | 318 | return 1; |
| 314 | : ! (p + 3 < pend && (p[3] & 0xC0) == 0x80) ? 0 | 319 | if (!check || p + 1 < pend) |
| 315 | : (p[0] & 0xF8) == 0xF0 ? 4 | 320 | { |
| 316 | : ! (p + 4 < pend && (p[4] & 0xC0) == 0x80) ? 0 | 321 | /* The 'unsigned int' avoids int overflow in the 5-byte case. */ |
| 317 | : p[0] == 0xF8 && (p[1] & 0xF0) == 0x80 ? 5 | 322 | unsigned int d = p[1]; |
| 318 | : 0); | 323 | |
| 319 | } | 324 | if (TRAILING_CODE_P (d)) |
| 320 | 325 | { | |
| 321 | 326 | if (allow_8bit ? (c & 0xE0) == 0xC0 : 0xC2 <= c && c <= 0xDF) | |
| 322 | /* Like MULTIBYTE_LENGTH, but don't check the ending address. The | 327 | return 2; |
| 323 | multibyte form is still validated, unlike BYTES_BY_CHAR_HEAD. */ | 328 | if ((!check || p + 2 < pend) |
| 329 | && TRAILING_CODE_P (p[2])) | ||
| 330 | { | ||
| 331 | if ((c & 0xF0) == 0xE0 && ((c & 0x0F) | (d & 0x20))) | ||
| 332 | return 3; | ||
| 333 | if ((!check || p + 3 < pend) && TRAILING_CODE_P (p[3])) | ||
| 334 | { | ||
| 335 | if ((c & 0xF8) == 0xF0 && ((c & 0x07) | (d & 0x30))) | ||
| 336 | return 4; | ||
| 337 | if (c == 0xF8 && (!check || p + 4 < pend) | ||
| 338 | && TRAILING_CODE_P (p[4])) | ||
| 339 | { | ||
| 340 | unsigned int w = ((d << 24) + (p[2] << 16) | ||
| 341 | + (p[3] << 8) + p[4]); | ||
| 342 | if (0x88808080 <= w && w <= 0x8FBFBDBF) | ||
| 343 | return 5; | ||
| 344 | } | ||
| 345 | } | ||
| 346 | } | ||
| 347 | } | ||
| 348 | } | ||
| 349 | } | ||
| 324 | 350 | ||
| 325 | INLINE int | 351 | return 0; |
| 326 | MULTIBYTE_LENGTH_NO_CHECK (unsigned char const *p) | ||
| 327 | { | ||
| 328 | return (!(p[0] & 0x80) ? 1 | ||
| 329 | : (p[1] & 0xC0) != 0x80 ? 0 | ||
| 330 | : (p[0] & 0xE0) == 0xC0 ? 2 | ||
| 331 | : (p[2] & 0xC0) != 0x80 ? 0 | ||
| 332 | : (p[0] & 0xF0) == 0xE0 ? 3 | ||
| 333 | : (p[3] & 0xC0) != 0x80 ? 0 | ||
| 334 | : (p[0] & 0xF8) == 0xF0 ? 4 | ||
| 335 | : (p[4] & 0xC0) != 0x80 ? 0 | ||
| 336 | : p[0] == 0xF8 && (p[1] & 0xF0) == 0x80 ? 5 | ||
| 337 | : 0); | ||
| 338 | } | 352 | } |
| 339 | 353 | ||
| 340 | 354 | ||
diff --git a/src/coding.c b/src/coding.c index 716b0d99792..34f36d5a86a 100644 --- a/src/coding.c +++ b/src/coding.c | |||
| @@ -7670,15 +7670,17 @@ consume_chars (struct coding_system *coding, Lisp_Object translation_table, | |||
| 7670 | 7670 | ||
| 7671 | if (! multibytep) | 7671 | if (! multibytep) |
| 7672 | { | 7672 | { |
| 7673 | int bytes; | ||
| 7674 | |||
| 7675 | if (coding->encoder == encode_coding_raw_text | 7673 | if (coding->encoder == encode_coding_raw_text |
| 7676 | || coding->encoder == encode_coding_ccl) | 7674 | || coding->encoder == encode_coding_ccl) |
| 7677 | c = *src++, pos++; | 7675 | c = *src++, pos++; |
| 7678 | else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0) | ||
| 7679 | c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes; | ||
| 7680 | else | 7676 | else |
| 7681 | c = BYTE8_TO_CHAR (*src), src++, pos++; | 7677 | { |
| 7678 | int bytes = multibyte_length (src, src_end, true, true); | ||
| 7679 | if (0 < bytes) | ||
| 7680 | c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes; | ||
| 7681 | else | ||
| 7682 | c = BYTE8_TO_CHAR (*src), src++, pos++; | ||
| 7683 | } | ||
| 7682 | } | 7684 | } |
| 7683 | else | 7685 | else |
| 7684 | c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++; | 7686 | c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++; |
| @@ -7727,7 +7729,7 @@ consume_chars (struct coding_system *coding, Lisp_Object translation_table, | |||
| 7727 | for (i = 1; i < to_nchars; i++) | 7729 | for (i = 1; i < to_nchars; i++) |
| 7728 | *buf++ = XFIXNUM (AREF (trans, i)); | 7730 | *buf++ = XFIXNUM (AREF (trans, i)); |
| 7729 | for (i = 1; i < from_nchars; i++, pos++) | 7731 | for (i = 1; i < from_nchars; i++, pos++) |
| 7730 | src += MULTIBYTE_LENGTH_NO_CHECK (src); | 7732 | src += multibyte_length (src, NULL, false, true); |
| 7731 | } | 7733 | } |
| 7732 | } | 7734 | } |
| 7733 | 7735 | ||