aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/buffer.c3
-rw-r--r--src/character.c42
-rw-r--r--src/character.h96
-rw-r--r--src/coding.c14
4 files changed, 87 insertions, 68 deletions
diff --git a/src/buffer.c b/src/buffer.c
index 5398414e6eb..53b3bd960c4 100644
--- a/src/buffer.c
+++ b/src/buffer.c
@@ -2634,8 +2634,7 @@ current buffer is cleared. */)
2634 if (ASCII_CHAR_P (*p)) 2634 if (ASCII_CHAR_P (*p))
2635 p++, pos++; 2635 p++, pos++;
2636 else if (EQ (flag, Qt) 2636 else if (EQ (flag, Qt)
2637 && ! CHAR_BYTE8_HEAD_P (*p) 2637 && 0 < (bytes = multibyte_length (p, pend, true, false)))
2638 && (bytes = MULTIBYTE_LENGTH (p, pend)) > 0)
2639 p += bytes, pos += bytes; 2638 p += bytes, pos += bytes;
2640 else 2639 else
2641 { 2640 {
diff --git a/src/character.c b/src/character.c
index 303c83ccec3..da09e77e131 100644
--- a/src/character.c
+++ b/src/character.c
@@ -486,7 +486,7 @@ multibyte_chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
486 486
487 while (ptr < endp) 487 while (ptr < endp)
488 { 488 {
489 int len = MULTIBYTE_LENGTH (ptr, endp); 489 int len = multibyte_length (ptr, endp, true, true);
490 490
491 if (len == 0) 491 if (len == 0)
492 emacs_abort (); 492 emacs_abort ();
@@ -508,7 +508,6 @@ parse_str_as_multibyte (const unsigned char *str, ptrdiff_t len,
508 ptrdiff_t *nchars, ptrdiff_t *nbytes) 508 ptrdiff_t *nchars, ptrdiff_t *nbytes)
509{ 509{
510 const unsigned char *endp = str + len; 510 const unsigned char *endp = str + len;
511 int n;
512 ptrdiff_t chars = 0, bytes = 0; 511 ptrdiff_t chars = 0, bytes = 0;
513 512
514 if (len >= MAX_MULTIBYTE_LENGTH) 513 if (len >= MAX_MULTIBYTE_LENGTH)
@@ -516,8 +515,8 @@ parse_str_as_multibyte (const unsigned char *str, ptrdiff_t len,
516 const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH; 515 const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
517 while (str < adjusted_endp) 516 while (str < adjusted_endp)
518 { 517 {
519 if (! CHAR_BYTE8_HEAD_P (*str) 518 int n = multibyte_length (str, NULL, false, false);
520 && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0) 519 if (0 < n)
521 str += n, bytes += n; 520 str += n, bytes += n;
522 else 521 else
523 str++, bytes += 2; 522 str++, bytes += 2;
@@ -526,8 +525,8 @@ parse_str_as_multibyte (const unsigned char *str, ptrdiff_t len,
526 } 525 }
527 while (str < endp) 526 while (str < endp)
528 { 527 {
529 if (! CHAR_BYTE8_HEAD_P (*str) 528 int n = multibyte_length (str, endp, true, false);
530 && (n = MULTIBYTE_LENGTH (str, endp)) > 0) 529 if (0 < n)
531 str += n, bytes += n; 530 str += n, bytes += n;
532 else 531 else
533 str++, bytes += 2; 532 str++, bytes += 2;
@@ -554,20 +553,25 @@ str_as_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t nbytes,
554 unsigned char *p = str, *endp = str + nbytes; 553 unsigned char *p = str, *endp = str + nbytes;
555 unsigned char *to; 554 unsigned char *to;
556 ptrdiff_t chars = 0; 555 ptrdiff_t chars = 0;
557 int n;
558 556
559 if (nbytes >= MAX_MULTIBYTE_LENGTH) 557 if (nbytes >= MAX_MULTIBYTE_LENGTH)
560 { 558 {
561 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH; 559 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
562 while (p < adjusted_endp 560 while (p < adjusted_endp)
563 && ! CHAR_BYTE8_HEAD_P (*p) 561 {
564 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0) 562 int n = multibyte_length (p, NULL, false, false);
565 p += n, chars++; 563 if (n <= 0)
564 break;
565 p += n, chars++;
566 }
567 }
568 while (true)
569 {
570 int n = multibyte_length (p, endp, true, false);
571 if (n <= 0)
572 break;
573 p += n, chars++;
566 } 574 }
567 while (p < endp
568 && ! CHAR_BYTE8_HEAD_P (*p)
569 && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
570 p += n, chars++;
571 if (nchars) 575 if (nchars)
572 *nchars = chars; 576 *nchars = chars;
573 if (p == endp) 577 if (p == endp)
@@ -584,8 +588,8 @@ str_as_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t nbytes,
584 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH; 588 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
585 while (p < adjusted_endp) 589 while (p < adjusted_endp)
586 { 590 {
587 if (! CHAR_BYTE8_HEAD_P (*p) 591 int n = multibyte_length (p, NULL, false, false);
588 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0) 592 if (0 < n)
589 { 593 {
590 while (n--) 594 while (n--)
591 *to++ = *p++; 595 *to++ = *p++;
@@ -601,8 +605,8 @@ str_as_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t nbytes,
601 } 605 }
602 while (p < endp) 606 while (p < endp)
603 { 607 {
604 if (! CHAR_BYTE8_HEAD_P (*p) 608 int n = multibyte_length (p, endp, true, false);
605 && (n = MULTIBYTE_LENGTH (p, endp)) > 0) 609 if (0 < n)
606 { 610 {
607 while (n--) 611 while (n--)
608 *to++ = *p++; 612 *to++ = *p++;
diff --git a/src/character.h b/src/character.h
index 81320dedd17..4887473b27e 100644
--- a/src/character.h
+++ b/src/character.h
@@ -31,15 +31,19 @@ INLINE_HEADER_BEGIN
31/* character code 1st byte byte sequence 31/* character code 1st byte byte sequence
32 -------------- -------- ------------- 32 -------------- -------- -------------
33 0-7F 00..7F 0xxxxxxx 33 0-7F 00..7F 0xxxxxxx
34 80-7FF C2..DF 110xxxxx 10xxxxxx 34 80-7FF C2..DF 110yyyyx 10xxxxxx
35 800-FFFF E0..EF 1110xxxx 10xxxxxx 10xxxxxx 35 800-FFFF E0..EF 1110yyyy 10yxxxxx 10xxxxxx
36 10000-1FFFFF F0..F7 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 36 10000-1FFFFF F0..F7 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx
37 200000-3FFF7F F8 11111000 1000xxxx 10xxxxxx 10xxxxxx 10xxxxxx 37 200000-3FFF7F F8 11111000 1000yxxx 10xxxxxx 10xxxxxx 10xxxxxx
38 3FFF80-3FFFFF C0..C1 1100000x 10xxxxxx (for eight-bit-char) 38 3FFF80-3FFFFF C0..C1 1100000x 10xxxxxx (for eight-bit-char)
39 400000-... invalid 39 400000-... invalid
40 40
41 invalid 1st byte 80..BF 10xxxxxx 41 invalid 1st byte 80..BF 10xxxxxx
42 F9..FF 11111xxx (xxx != 000) 42 F9..FF 11111yyy
43
44 In each bit pattern, 'x' and 'y' each represent a single bit of the
45 character code payload, and at least one 'y' must be a 1 bit.
46 In the 5-byte sequence, the 22-bit payload cannot exceed 3FFF7F.
43*/ 47*/
44 48
45/* Maximum character code ((1 << CHARACTERBITS) - 1). */ 49/* Maximum character code ((1 << CHARACTERBITS) - 1). */
@@ -284,7 +288,7 @@ CHAR_HEAD_P (int byte)
284} 288}
285 289
286/* How many bytes a character that starts with BYTE occupies in a 290/* How many bytes a character that starts with BYTE occupies in a
287 multibyte form. Unlike MULTIBYTE_LENGTH below, this function does not 291 multibyte form. Unlike multibyte_length, this function does not
288 validate the multibyte form, but looks only at its first byte. */ 292 validate the multibyte form, but looks only at its first byte. */
289INLINE int 293INLINE int
290BYTES_BY_CHAR_HEAD (int byte) 294BYTES_BY_CHAR_HEAD (int byte)
@@ -297,44 +301,54 @@ BYTES_BY_CHAR_HEAD (int byte)
297} 301}
298 302
299 303
300/* The byte length of multibyte form at unibyte string P ending at 304/* The byte length of the multibyte form at the unibyte string P,
301 PEND. If the string doesn't point to a valid multibyte form, 305 ending at PEND if CHECK, and without a length check if !CHECK.
302 return 0. Unlike BYTES_BY_CHAR_HEAD, this macro validates the 306 If ALLOW_8BIT, allow multibyte forms of eight-bit characters.
303 multibyte form. */ 307 If the string doesn't point to a valid multibyte form, return 0.
308 Unlike BYTES_BY_CHAR_HEAD, this function validates the multibyte form. */
304 309
305INLINE int 310INLINE int
306MULTIBYTE_LENGTH (unsigned char const *p, unsigned char const *pend) 311multibyte_length (unsigned char const *p, unsigned char const *pend,
307{ 312 bool check, bool allow_8bit)
308 return (! (p < pend) ? 0 313{
309 : ! (p[0] & 0x80) ? 1 314 if (!check || p < pend)
310 : ! (p + 1 < pend && (p[1] & 0xC0) == 0x80) ? 0 315 {
311 : (p[0] & 0xE0) == 0xC0 ? 2 316 unsigned char c = p[0];
312 : ! (p + 2 < pend && (p[2] & 0xC0) == 0x80) ? 0 317 if (c < 0x80)
313 : (p[0] & 0xF0) == 0xE0 ? 3 318 return 1;
314 : ! (p + 3 < pend && (p[3] & 0xC0) == 0x80) ? 0 319 if (!check || p + 1 < pend)
315 : (p[0] & 0xF8) == 0xF0 ? 4 320 {
316 : ! (p + 4 < pend && (p[4] & 0xC0) == 0x80) ? 0 321 /* The 'unsigned int' avoids int overflow in the 5-byte case. */
317 : p[0] == 0xF8 && (p[1] & 0xF0) == 0x80 ? 5 322 unsigned int d = p[1];
318 : 0); 323
319} 324 if (TRAILING_CODE_P (d))
320 325 {
321 326 if (allow_8bit ? (c & 0xE0) == 0xC0 : 0xC2 <= c && c <= 0xDF)
322/* Like MULTIBYTE_LENGTH, but don't check the ending address. The 327 return 2;
323 multibyte form is still validated, unlike BYTES_BY_CHAR_HEAD. */ 328 if ((!check || p + 2 < pend)
329 && TRAILING_CODE_P (p[2]))
330 {
331 if ((c & 0xF0) == 0xE0 && ((c & 0x0F) | (d & 0x20)))
332 return 3;
333 if ((!check || p + 3 < pend) && TRAILING_CODE_P (p[3]))
334 {
335 if ((c & 0xF8) == 0xF0 && ((c & 0x07) | (d & 0x30)))
336 return 4;
337 if (c == 0xF8 && (!check || p + 4 < pend)
338 && TRAILING_CODE_P (p[4]))
339 {
340 unsigned int w = ((d << 24) + (p[2] << 16)
341 + (p[3] << 8) + p[4]);
342 if (0x88808080 <= w && w <= 0x8FBFBDBF)
343 return 5;
344 }
345 }
346 }
347 }
348 }
349 }
324 350
325INLINE int 351 return 0;
326MULTIBYTE_LENGTH_NO_CHECK (unsigned char const *p)
327{
328 return (!(p[0] & 0x80) ? 1
329 : (p[1] & 0xC0) != 0x80 ? 0
330 : (p[0] & 0xE0) == 0xC0 ? 2
331 : (p[2] & 0xC0) != 0x80 ? 0
332 : (p[0] & 0xF0) == 0xE0 ? 3
333 : (p[3] & 0xC0) != 0x80 ? 0
334 : (p[0] & 0xF8) == 0xF0 ? 4
335 : (p[4] & 0xC0) != 0x80 ? 0
336 : p[0] == 0xF8 && (p[1] & 0xF0) == 0x80 ? 5
337 : 0);
338} 352}
339 353
340 354
diff --git a/src/coding.c b/src/coding.c
index 716b0d99792..34f36d5a86a 100644
--- a/src/coding.c
+++ b/src/coding.c
@@ -7670,15 +7670,17 @@ consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7670 7670
7671 if (! multibytep) 7671 if (! multibytep)
7672 { 7672 {
7673 int bytes;
7674
7675 if (coding->encoder == encode_coding_raw_text 7673 if (coding->encoder == encode_coding_raw_text
7676 || coding->encoder == encode_coding_ccl) 7674 || coding->encoder == encode_coding_ccl)
7677 c = *src++, pos++; 7675 c = *src++, pos++;
7678 else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7679 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7680 else 7676 else
7681 c = BYTE8_TO_CHAR (*src), src++, pos++; 7677 {
7678 int bytes = multibyte_length (src, src_end, true, true);
7679 if (0 < bytes)
7680 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7681 else
7682 c = BYTE8_TO_CHAR (*src), src++, pos++;
7683 }
7682 } 7684 }
7683 else 7685 else
7684 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++; 7686 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
@@ -7727,7 +7729,7 @@ consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7727 for (i = 1; i < to_nchars; i++) 7729 for (i = 1; i < to_nchars; i++)
7728 *buf++ = XFIXNUM (AREF (trans, i)); 7730 *buf++ = XFIXNUM (AREF (trans, i));
7729 for (i = 1; i < from_nchars; i++, pos++) 7731 for (i = 1; i < from_nchars; i++, pos++)
7730 src += MULTIBYTE_LENGTH_NO_CHECK (src); 7732 src += multibyte_length (src, NULL, false, true);
7731 } 7733 }
7732 } 7734 }
7733 7735