diff options
| author | Mattias Engdegård | 2022-06-01 11:39:44 +0200 |
|---|---|---|
| committer | Mattias Engdegård | 2022-06-01 19:47:30 +0200 |
| commit | c50718dcfa54293b695f8a3fa5cd4d77848ee084 (patch) | |
| tree | ff0c224f50af5365e7fc3cb03c7cab5011eca50f | |
| parent | 84e122dc9676f1bcf36db62f313b0343a073982b (diff) | |
| download | emacs-c50718dcfa54293b695f8a3fa5cd4d77848ee084.tar.gz emacs-c50718dcfa54293b695f8a3fa5cd4d77848ee084.zip | |
Fix reader char escape bugs (bug#55738)
Make the character literal ?\LF (linefeed) generate 10, not -1.
Ensure that Control escape sequences in character literals are
idempotent: ?\C-\C-a and ?\^\^a mean the same thing as ?\C-a and ?\^a,
generating the control character with value 1. "\C-\C-a" no longer
signals an error.
* src/lread.c (read_escape): Make nonrecursive and only combine
the base char with modifiers at the end, creating control chars
if applicable. Remove the `stringp` argument; assume character
literal syntax. Never return -1.
(read_string_literal): Handle string-specific escape semantics here
and simplify.
* test/src/lread-tests.el (lread-misc-2): New test.
| -rw-r--r-- | src/lread.c | 201 | ||||
| -rw-r--r-- | test/src/lread-tests.el | 10 |
2 files changed, 112 insertions, 99 deletions
diff --git a/src/lread.c b/src/lread.c index a1045184d9b..670413efc02 100644 --- a/src/lread.c +++ b/src/lread.c | |||
| @@ -2631,93 +2631,88 @@ character_name_to_code (char const *name, ptrdiff_t name_len, | |||
| 2631 | enum { UNICODE_CHARACTER_NAME_LENGTH_BOUND = 200 }; | 2631 | enum { UNICODE_CHARACTER_NAME_LENGTH_BOUND = 200 }; |
| 2632 | 2632 | ||
| 2633 | /* Read a \-escape sequence, assuming we already read the `\'. | 2633 | /* Read a \-escape sequence, assuming we already read the `\'. |
| 2634 | When there is a difference between string and character literal \-sequences, | ||
| 2635 | the latter is assumed. | ||
| 2634 | If the escape sequence forces unibyte, return eight-bit char. */ | 2636 | If the escape sequence forces unibyte, return eight-bit char. */ |
| 2635 | 2637 | ||
| 2636 | static int | 2638 | static int |
| 2637 | read_escape (Lisp_Object readcharfun, bool stringp) | 2639 | read_escape (Lisp_Object readcharfun) |
| 2638 | { | 2640 | { |
| 2641 | int modifiers = 0; | ||
| 2642 | again: ; | ||
| 2639 | int c = READCHAR; | 2643 | int c = READCHAR; |
| 2640 | /* \u allows up to four hex digits, \U up to eight. Default to the | 2644 | int unicode_hex_count; |
| 2641 | behavior for \u, and change this value in the case that \U is seen. */ | ||
| 2642 | int unicode_hex_count = 4; | ||
| 2643 | 2645 | ||
| 2644 | switch (c) | 2646 | switch (c) |
| 2645 | { | 2647 | { |
| 2646 | case -1: | 2648 | case -1: |
| 2647 | end_of_file_error (); | 2649 | end_of_file_error (); |
| 2648 | 2650 | ||
| 2649 | case 'a': | 2651 | case 'a': c = '\a'; break; |
| 2650 | return '\007'; | 2652 | case 'b': c = '\b'; break; |
| 2651 | case 'b': | 2653 | case 'd': c = 127; break; |
| 2652 | return '\b'; | 2654 | case 'e': c = 27; break; |
| 2653 | case 'd': | 2655 | case 'f': c = '\f'; break; |
| 2654 | return 0177; | 2656 | case 'n': c = '\n'; break; |
| 2655 | case 'e': | 2657 | case 'r': c = '\r'; break; |
| 2656 | return 033; | 2658 | case 't': c = '\t'; break; |
| 2657 | case 'f': | 2659 | case 'v': c = '\v'; break; |
| 2658 | return '\f'; | ||
| 2659 | case 'n': | ||
| 2660 | return '\n'; | ||
| 2661 | case 'r': | ||
| 2662 | return '\r'; | ||
| 2663 | case 't': | ||
| 2664 | return '\t'; | ||
| 2665 | case 'v': | ||
| 2666 | return '\v'; | ||
| 2667 | case '\n': | ||
| 2668 | return -1; | ||
| 2669 | case ' ': | ||
| 2670 | if (stringp) | ||
| 2671 | return -1; | ||
| 2672 | return ' '; | ||
| 2673 | 2660 | ||
| 2674 | case 'M': | 2661 | case 'M': |
| 2675 | c = READCHAR; | 2662 | c = READCHAR; |
| 2676 | if (c != '-') | 2663 | if (c != '-') |
| 2677 | error ("Invalid escape character syntax"); | 2664 | error ("Invalid escape character syntax"); |
| 2665 | modifiers |= meta_modifier; | ||
| 2678 | c = READCHAR; | 2666 | c = READCHAR; |
| 2679 | if (c == '\\') | 2667 | if (c == '\\') |
| 2680 | c = read_escape (readcharfun, 0); | 2668 | goto again; |
| 2681 | return c | meta_modifier; | 2669 | break; |
| 2682 | 2670 | ||
| 2683 | case 'S': | 2671 | case 'S': |
| 2684 | c = READCHAR; | 2672 | c = READCHAR; |
| 2685 | if (c != '-') | 2673 | if (c != '-') |
| 2686 | error ("Invalid escape character syntax"); | 2674 | error ("Invalid escape character syntax"); |
| 2675 | modifiers |= shift_modifier; | ||
| 2687 | c = READCHAR; | 2676 | c = READCHAR; |
| 2688 | if (c == '\\') | 2677 | if (c == '\\') |
| 2689 | c = read_escape (readcharfun, 0); | 2678 | goto again; |
| 2690 | return c | shift_modifier; | 2679 | break; |
| 2691 | 2680 | ||
| 2692 | case 'H': | 2681 | case 'H': |
| 2693 | c = READCHAR; | 2682 | c = READCHAR; |
| 2694 | if (c != '-') | 2683 | if (c != '-') |
| 2695 | error ("Invalid escape character syntax"); | 2684 | error ("Invalid escape character syntax"); |
| 2685 | modifiers |= hyper_modifier; | ||
| 2696 | c = READCHAR; | 2686 | c = READCHAR; |
| 2697 | if (c == '\\') | 2687 | if (c == '\\') |
| 2698 | c = read_escape (readcharfun, 0); | 2688 | goto again; |
| 2699 | return c | hyper_modifier; | 2689 | break; |
| 2700 | 2690 | ||
| 2701 | case 'A': | 2691 | case 'A': |
| 2702 | c = READCHAR; | 2692 | c = READCHAR; |
| 2703 | if (c != '-') | 2693 | if (c != '-') |
| 2704 | error ("Invalid escape character syntax"); | 2694 | error ("Invalid escape character syntax"); |
| 2695 | modifiers |= alt_modifier; | ||
| 2705 | c = READCHAR; | 2696 | c = READCHAR; |
| 2706 | if (c == '\\') | 2697 | if (c == '\\') |
| 2707 | c = read_escape (readcharfun, 0); | 2698 | goto again; |
| 2708 | return c | alt_modifier; | 2699 | break; |
| 2709 | 2700 | ||
| 2710 | case 's': | 2701 | case 's': |
| 2711 | c = READCHAR; | 2702 | c = READCHAR; |
| 2712 | if (stringp || c != '-') | 2703 | if (c == '-') |
| 2704 | { | ||
| 2705 | modifiers |= super_modifier; | ||
| 2706 | c = READCHAR; | ||
| 2707 | if (c == '\\') | ||
| 2708 | goto again; | ||
| 2709 | } | ||
| 2710 | else | ||
| 2713 | { | 2711 | { |
| 2714 | UNREAD (c); | 2712 | UNREAD (c); |
| 2715 | return ' '; | 2713 | c = ' '; |
| 2716 | } | 2714 | } |
| 2717 | c = READCHAR; | 2715 | break; |
| 2718 | if (c == '\\') | ||
| 2719 | c = read_escape (readcharfun, 0); | ||
| 2720 | return c | super_modifier; | ||
| 2721 | 2716 | ||
| 2722 | case 'C': | 2717 | case 'C': |
| 2723 | c = READCHAR; | 2718 | c = READCHAR; |
| @@ -2725,21 +2720,11 @@ read_escape (Lisp_Object readcharfun, bool stringp) | |||
| 2725 | error ("Invalid escape character syntax"); | 2720 | error ("Invalid escape character syntax"); |
| 2726 | FALLTHROUGH; | 2721 | FALLTHROUGH; |
| 2727 | case '^': | 2722 | case '^': |
| 2723 | modifiers |= ctrl_modifier; | ||
| 2728 | c = READCHAR; | 2724 | c = READCHAR; |
| 2729 | if (c == '\\') | 2725 | if (c == '\\') |
| 2730 | c = read_escape (readcharfun, 0); | 2726 | goto again; |
| 2731 | if ((c & ~CHAR_MODIFIER_MASK) == '?') | 2727 | break; |
| 2732 | return 0177 | (c & CHAR_MODIFIER_MASK); | ||
| 2733 | else if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK))) | ||
| 2734 | return c | ctrl_modifier; | ||
| 2735 | /* ASCII control chars are made from letters (both cases), | ||
| 2736 | as well as the non-letters within 0100...0137. */ | ||
| 2737 | else if ((c & 0137) >= 0101 && (c & 0137) <= 0132) | ||
| 2738 | return (c & (037 | ~0177)); | ||
| 2739 | else if ((c & 0177) >= 0100 && (c & 0177) <= 0137) | ||
| 2740 | return (c & (037 | ~0177)); | ||
| 2741 | else | ||
| 2742 | return c | ctrl_modifier; | ||
| 2743 | 2728 | ||
| 2744 | case '0': | 2729 | case '0': |
| 2745 | case '1': | 2730 | case '1': |
| @@ -2749,31 +2734,30 @@ read_escape (Lisp_Object readcharfun, bool stringp) | |||
| 2749 | case '5': | 2734 | case '5': |
| 2750 | case '6': | 2735 | case '6': |
| 2751 | case '7': | 2736 | case '7': |
| 2752 | /* An octal escape, as in ANSI C. */ | 2737 | /* 1-3 octal digits. */ |
| 2753 | { | 2738 | { |
| 2754 | register int i = c - '0'; | 2739 | int i = c - '0'; |
| 2755 | register int count = 0; | 2740 | int count = 0; |
| 2756 | while (++count < 3) | 2741 | while (++count < 3) |
| 2757 | { | 2742 | { |
| 2758 | if ((c = READCHAR) >= '0' && c <= '7') | 2743 | c = READCHAR; |
| 2759 | { | 2744 | if (c < '0' || c > '7') |
| 2760 | i *= 8; | ||
| 2761 | i += c - '0'; | ||
| 2762 | } | ||
| 2763 | else | ||
| 2764 | { | 2745 | { |
| 2765 | UNREAD (c); | 2746 | UNREAD (c); |
| 2766 | break; | 2747 | break; |
| 2767 | } | 2748 | } |
| 2749 | i *= 8; | ||
| 2750 | i += c - '0'; | ||
| 2768 | } | 2751 | } |
| 2769 | 2752 | ||
| 2770 | if (i >= 0x80 && i < 0x100) | 2753 | if (i >= 0x80 && i < 0x100) |
| 2771 | i = BYTE8_TO_CHAR (i); | 2754 | i = BYTE8_TO_CHAR (i); |
| 2772 | return i; | 2755 | c = i; |
| 2756 | break; | ||
| 2773 | } | 2757 | } |
| 2774 | 2758 | ||
| 2775 | case 'x': | 2759 | case 'x': |
| 2776 | /* A hex escape, as in ANSI C. */ | 2760 | /* One or more hex digits. */ |
| 2777 | { | 2761 | { |
| 2778 | unsigned int i = 0; | 2762 | unsigned int i = 0; |
| 2779 | int count = 0; | 2763 | int count = 0; |
| @@ -2795,16 +2779,18 @@ read_escape (Lisp_Object readcharfun, bool stringp) | |||
| 2795 | } | 2779 | } |
| 2796 | 2780 | ||
| 2797 | if (count < 3 && i >= 0x80) | 2781 | if (count < 3 && i >= 0x80) |
| 2798 | return BYTE8_TO_CHAR (i); | 2782 | i = BYTE8_TO_CHAR (i); |
| 2799 | return i; | 2783 | c = i; |
| 2784 | break; | ||
| 2800 | } | 2785 | } |
| 2801 | 2786 | ||
| 2802 | case 'U': | 2787 | case 'U': /* Eight hex digits. */ |
| 2803 | /* Post-Unicode-2.0: Up to eight hex chars. */ | ||
| 2804 | unicode_hex_count = 8; | 2788 | unicode_hex_count = 8; |
| 2805 | FALLTHROUGH; | 2789 | goto unicode; |
| 2806 | case 'u': | ||
| 2807 | 2790 | ||
| 2791 | case 'u': /* Four hex digits. */ | ||
| 2792 | unicode_hex_count = 4; | ||
| 2793 | unicode: | ||
| 2808 | /* A Unicode escape. We only permit them in strings and characters, | 2794 | /* A Unicode escape. We only permit them in strings and characters, |
| 2809 | not arbitrarily in the source code, as in some other languages. */ | 2795 | not arbitrarily in the source code, as in some other languages. */ |
| 2810 | { | 2796 | { |
| @@ -2815,12 +2801,8 @@ read_escape (Lisp_Object readcharfun, bool stringp) | |||
| 2815 | { | 2801 | { |
| 2816 | c = READCHAR; | 2802 | c = READCHAR; |
| 2817 | if (c < 0) | 2803 | if (c < 0) |
| 2818 | { | 2804 | error ("Malformed Unicode escape: \\%c%x", |
| 2819 | if (unicode_hex_count > 4) | 2805 | unicode_hex_count == 4 ? 'u' : 'U', i); |
| 2820 | error ("Malformed Unicode escape: \\U%x", i); | ||
| 2821 | else | ||
| 2822 | error ("Malformed Unicode escape: \\u%x", i); | ||
| 2823 | } | ||
| 2824 | /* `isdigit' and `isalpha' may be locale-specific, which we don't | 2806 | /* `isdigit' and `isalpha' may be locale-specific, which we don't |
| 2825 | want. */ | 2807 | want. */ |
| 2826 | int digit = char_hexdigit (c); | 2808 | int digit = char_hexdigit (c); |
| @@ -2831,7 +2813,8 @@ read_escape (Lisp_Object readcharfun, bool stringp) | |||
| 2831 | } | 2813 | } |
| 2832 | if (i > 0x10FFFF) | 2814 | if (i > 0x10FFFF) |
| 2833 | error ("Non-Unicode character: 0x%x", i); | 2815 | error ("Non-Unicode character: 0x%x", i); |
| 2834 | return i; | 2816 | c = i; |
| 2817 | break; | ||
| 2835 | } | 2818 | } |
| 2836 | 2819 | ||
| 2837 | case 'N': | 2820 | case 'N': |
| @@ -2880,12 +2863,31 @@ read_escape (Lisp_Object readcharfun, bool stringp) | |||
| 2880 | 2863 | ||
| 2881 | /* character_name_to_code can invoke read0, recursively. | 2864 | /* character_name_to_code can invoke read0, recursively. |
| 2882 | This is why read0's buffer is not static. */ | 2865 | This is why read0's buffer is not static. */ |
| 2883 | return character_name_to_code (name, length, readcharfun); | 2866 | c = character_name_to_code (name, length, readcharfun); |
| 2867 | break; | ||
| 2884 | } | 2868 | } |
| 2869 | } | ||
| 2885 | 2870 | ||
| 2886 | default: | 2871 | c |= modifiers; |
| 2887 | return c; | 2872 | if (c & ctrl_modifier) |
| 2873 | { | ||
| 2874 | int b = c & ~CHAR_MODIFIER_MASK; | ||
| 2875 | /* If the base char is in the 0x3f..0x5f range or a lower case | ||
| 2876 | letter, drop the ctrl_modifier bit and generate a C0 control | ||
| 2877 | character instead. */ | ||
| 2878 | if ((b >= 0x3f && b <= 0x5f) || (b >= 'a' && b <= 'z')) | ||
| 2879 | { | ||
| 2880 | c &= ~ctrl_modifier; | ||
| 2881 | if (b == '?') | ||
| 2882 | /* Special case: ^? is DEL. */ | ||
| 2883 | b = 127; | ||
| 2884 | else | ||
| 2885 | /* Make a C0 control in 0..31 by clearing bits 5 and 6. */ | ||
| 2886 | b &= 0x1f; | ||
| 2887 | } | ||
| 2888 | c = b | (c & CHAR_MODIFIER_MASK); | ||
| 2888 | } | 2889 | } |
| 2890 | return c; | ||
| 2889 | } | 2891 | } |
| 2890 | 2892 | ||
| 2891 | /* Return the digit that CHARACTER stands for in the given BASE. | 2893 | /* Return the digit that CHARACTER stands for in the given BASE. |
| @@ -3012,7 +3014,7 @@ read_char_literal (Lisp_Object readcharfun) | |||
| 3012 | } | 3014 | } |
| 3013 | 3015 | ||
| 3014 | if (ch == '\\') | 3016 | if (ch == '\\') |
| 3015 | ch = read_escape (readcharfun, 0); | 3017 | ch = read_escape (readcharfun); |
| 3016 | 3018 | ||
| 3017 | int modifiers = ch & CHAR_MODIFIER_MASK; | 3019 | int modifiers = ch & CHAR_MODIFIER_MASK; |
| 3018 | ch &= ~CHAR_MODIFIER_MASK; | 3020 | ch &= ~CHAR_MODIFIER_MASK; |
| @@ -3066,14 +3068,21 @@ read_string_literal (char stackbuf[VLA_ELEMS (stackbufsize)], | |||
| 3066 | 3068 | ||
| 3067 | if (ch == '\\') | 3069 | if (ch == '\\') |
| 3068 | { | 3070 | { |
| 3069 | ch = read_escape (readcharfun, 1); | 3071 | ch = READCHAR; |
| 3070 | 3072 | switch (ch) | |
| 3071 | /* CH is -1 if \ newline or \ space has just been seen. */ | ||
| 3072 | if (ch == -1) | ||
| 3073 | { | 3073 | { |
| 3074 | case 's': | ||
| 3075 | ch = ' '; | ||
| 3076 | break; | ||
| 3077 | case ' ': | ||
| 3078 | case '\n': | ||
| 3074 | if (p == read_buffer) | 3079 | if (p == read_buffer) |
| 3075 | cancel = true; | 3080 | cancel = true; |
| 3076 | continue; | 3081 | continue; |
| 3082 | default: | ||
| 3083 | UNREAD (ch); | ||
| 3084 | ch = read_escape (readcharfun); | ||
| 3085 | break; | ||
| 3077 | } | 3086 | } |
| 3078 | 3087 | ||
| 3079 | int modifiers = ch & CHAR_MODIFIER_MASK; | 3088 | int modifiers = ch & CHAR_MODIFIER_MASK; |
| @@ -3085,19 +3094,13 @@ read_string_literal (char stackbuf[VLA_ELEMS (stackbufsize)], | |||
| 3085 | force_multibyte = true; | 3094 | force_multibyte = true; |
| 3086 | else /* I.e. ASCII_CHAR_P (ch). */ | 3095 | else /* I.e. ASCII_CHAR_P (ch). */ |
| 3087 | { | 3096 | { |
| 3088 | /* Allow `\C- ' and `\C-?'. */ | 3097 | /* Allow `\C-SPC' and `\^SPC'. This is done here because |
| 3089 | if (modifiers == CHAR_CTL) | 3098 | the literals ?\C-SPC and ?\^SPC (rather inconsistently) |
| 3099 | yield (' ' | CHAR_CTL); see bug#55738. */ | ||
| 3100 | if (modifiers == CHAR_CTL && ch == ' ') | ||
| 3090 | { | 3101 | { |
| 3091 | if (ch == ' ') | 3102 | ch = 0; |
| 3092 | { | 3103 | modifiers = 0; |
| 3093 | ch = 0; | ||
| 3094 | modifiers = 0; | ||
| 3095 | } | ||
| 3096 | else if (ch == '?') | ||
| 3097 | { | ||
| 3098 | ch = 127; | ||
| 3099 | modifiers = 0; | ||
| 3100 | } | ||
| 3101 | } | 3104 | } |
| 3102 | if (modifiers & CHAR_SHIFT) | 3105 | if (modifiers & CHAR_SHIFT) |
| 3103 | { | 3106 | { |
diff --git a/test/src/lread-tests.el b/test/src/lread-tests.el index 47351c1d116..59d5ca076f1 100644 --- a/test/src/lread-tests.el +++ b/test/src/lread-tests.el | |||
| @@ -317,4 +317,14 @@ literals (Bug#20852)." | |||
| 317 | (should (equal (read-from-string "#_") | 317 | (should (equal (read-from-string "#_") |
| 318 | '(## . 2)))) | 318 | '(## . 2)))) |
| 319 | 319 | ||
| 320 | (ert-deftest lread-misc-2 () | ||
| 321 | ;; ?\LF should produce LF (only inside string literals do we ignore \LF). | ||
| 322 | (should (equal (read-from-string "?\\\n") '(?\n . 3))) | ||
| 323 | (should (equal (read-from-string "\"a\\\nb\"") '("ab" . 6))) | ||
| 324 | ;; The Control modifier constructs should be idempotent. | ||
| 325 | (should (equal ?\C-\C-x ?\C-x)) | ||
| 326 | (should (equal ?\^\^x ?\C-x)) | ||
| 327 | (should (equal ?\C-\^x ?\C-x)) | ||
| 328 | (should (equal ?\^\C-x ?\C-x))) | ||
| 329 | |||
| 320 | ;;; lread-tests.el ends here | 330 | ;;; lread-tests.el ends here |