aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Mattias Engdegård 2022-06-01 11:39:44 +0200
committer Mattias Engdegård 2022-06-01 19:47:30 +0200
commitc50718dcfa54293b695f8a3fa5cd4d77848ee084 (patch)
treeff0c224f50af5365e7fc3cb03c7cab5011eca50f
parent84e122dc9676f1bcf36db62f313b0343a073982b (diff)
downloademacs-c50718dcfa54293b695f8a3fa5cd4d77848ee084.tar.gz
emacs-c50718dcfa54293b695f8a3fa5cd4d77848ee084.zip
Fix reader char escape bugs (bug#55738)
Make the character literal ?\LF (linefeed) generate 10, not -1.

Ensure that Control escape sequences in character literals are
idempotent: ?\C-\C-a and ?\^\^a mean the same thing as ?\C-a and ?\^a,
generating the control character with value 1.
"\C-\C-a" no longer signals an error.

* src/lread.c (read_escape): Make nonrecursive and only combine the
base char with modifiers at the end, creating control chars if
applicable. Remove the `stringp` argument; assume character literal
syntax. Never return -1.
(read_string_literal): Handle string-specific escape semantics here
and simplify.
* test/src/lread-tests.el (lread-misc-2): New test.
-rw-r--r--src/lread.c201
-rw-r--r--test/src/lread-tests.el10
2 files changed, 112 insertions, 99 deletions
diff --git a/src/lread.c b/src/lread.c
index a1045184d9b..670413efc02 100644
--- a/src/lread.c
+++ b/src/lread.c
@@ -2631,93 +2631,88 @@ character_name_to_code (char const *name, ptrdiff_t name_len,
2631enum { UNICODE_CHARACTER_NAME_LENGTH_BOUND = 200 }; 2631enum { UNICODE_CHARACTER_NAME_LENGTH_BOUND = 200 };
2632 2632
2633/* Read a \-escape sequence, assuming we already read the `\'. 2633/* Read a \-escape sequence, assuming we already read the `\'.
2634 When there is a difference between string and character literal \-sequences,
2635 the latter is assumed.
2634 If the escape sequence forces unibyte, return eight-bit char. */ 2636 If the escape sequence forces unibyte, return eight-bit char. */
2635 2637
2636static int 2638static int
2637read_escape (Lisp_Object readcharfun, bool stringp) 2639read_escape (Lisp_Object readcharfun)
2638{ 2640{
2641 int modifiers = 0;
2642 again: ;
2639 int c = READCHAR; 2643 int c = READCHAR;
2640 /* \u allows up to four hex digits, \U up to eight. Default to the 2644 int unicode_hex_count;
2641 behavior for \u, and change this value in the case that \U is seen. */
2642 int unicode_hex_count = 4;
2643 2645
2644 switch (c) 2646 switch (c)
2645 { 2647 {
2646 case -1: 2648 case -1:
2647 end_of_file_error (); 2649 end_of_file_error ();
2648 2650
2649 case 'a': 2651 case 'a': c = '\a'; break;
2650 return '\007'; 2652 case 'b': c = '\b'; break;
2651 case 'b': 2653 case 'd': c = 127; break;
2652 return '\b'; 2654 case 'e': c = 27; break;
2653 case 'd': 2655 case 'f': c = '\f'; break;
2654 return 0177; 2656 case 'n': c = '\n'; break;
2655 case 'e': 2657 case 'r': c = '\r'; break;
2656 return 033; 2658 case 't': c = '\t'; break;
2657 case 'f': 2659 case 'v': c = '\v'; break;
2658 return '\f';
2659 case 'n':
2660 return '\n';
2661 case 'r':
2662 return '\r';
2663 case 't':
2664 return '\t';
2665 case 'v':
2666 return '\v';
2667 case '\n':
2668 return -1;
2669 case ' ':
2670 if (stringp)
2671 return -1;
2672 return ' ';
2673 2660
2674 case 'M': 2661 case 'M':
2675 c = READCHAR; 2662 c = READCHAR;
2676 if (c != '-') 2663 if (c != '-')
2677 error ("Invalid escape character syntax"); 2664 error ("Invalid escape character syntax");
2665 modifiers |= meta_modifier;
2678 c = READCHAR; 2666 c = READCHAR;
2679 if (c == '\\') 2667 if (c == '\\')
2680 c = read_escape (readcharfun, 0); 2668 goto again;
2681 return c | meta_modifier; 2669 break;
2682 2670
2683 case 'S': 2671 case 'S':
2684 c = READCHAR; 2672 c = READCHAR;
2685 if (c != '-') 2673 if (c != '-')
2686 error ("Invalid escape character syntax"); 2674 error ("Invalid escape character syntax");
2675 modifiers |= shift_modifier;
2687 c = READCHAR; 2676 c = READCHAR;
2688 if (c == '\\') 2677 if (c == '\\')
2689 c = read_escape (readcharfun, 0); 2678 goto again;
2690 return c | shift_modifier; 2679 break;
2691 2680
2692 case 'H': 2681 case 'H':
2693 c = READCHAR; 2682 c = READCHAR;
2694 if (c != '-') 2683 if (c != '-')
2695 error ("Invalid escape character syntax"); 2684 error ("Invalid escape character syntax");
2685 modifiers |= hyper_modifier;
2696 c = READCHAR; 2686 c = READCHAR;
2697 if (c == '\\') 2687 if (c == '\\')
2698 c = read_escape (readcharfun, 0); 2688 goto again;
2699 return c | hyper_modifier; 2689 break;
2700 2690
2701 case 'A': 2691 case 'A':
2702 c = READCHAR; 2692 c = READCHAR;
2703 if (c != '-') 2693 if (c != '-')
2704 error ("Invalid escape character syntax"); 2694 error ("Invalid escape character syntax");
2695 modifiers |= alt_modifier;
2705 c = READCHAR; 2696 c = READCHAR;
2706 if (c == '\\') 2697 if (c == '\\')
2707 c = read_escape (readcharfun, 0); 2698 goto again;
2708 return c | alt_modifier; 2699 break;
2709 2700
2710 case 's': 2701 case 's':
2711 c = READCHAR; 2702 c = READCHAR;
2712 if (stringp || c != '-') 2703 if (c == '-')
2704 {
2705 modifiers |= super_modifier;
2706 c = READCHAR;
2707 if (c == '\\')
2708 goto again;
2709 }
2710 else
2713 { 2711 {
2714 UNREAD (c); 2712 UNREAD (c);
2715 return ' '; 2713 c = ' ';
2716 } 2714 }
2717 c = READCHAR; 2715 break;
2718 if (c == '\\')
2719 c = read_escape (readcharfun, 0);
2720 return c | super_modifier;
2721 2716
2722 case 'C': 2717 case 'C':
2723 c = READCHAR; 2718 c = READCHAR;
@@ -2725,21 +2720,11 @@ read_escape (Lisp_Object readcharfun, bool stringp)
2725 error ("Invalid escape character syntax"); 2720 error ("Invalid escape character syntax");
2726 FALLTHROUGH; 2721 FALLTHROUGH;
2727 case '^': 2722 case '^':
2723 modifiers |= ctrl_modifier;
2728 c = READCHAR; 2724 c = READCHAR;
2729 if (c == '\\') 2725 if (c == '\\')
2730 c = read_escape (readcharfun, 0); 2726 goto again;
2731 if ((c & ~CHAR_MODIFIER_MASK) == '?') 2727 break;
2732 return 0177 | (c & CHAR_MODIFIER_MASK);
2733 else if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
2734 return c | ctrl_modifier;
2735 /* ASCII control chars are made from letters (both cases),
2736 as well as the non-letters within 0100...0137. */
2737 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
2738 return (c & (037 | ~0177));
2739 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
2740 return (c & (037 | ~0177));
2741 else
2742 return c | ctrl_modifier;
2743 2728
2744 case '0': 2729 case '0':
2745 case '1': 2730 case '1':
@@ -2749,31 +2734,30 @@ read_escape (Lisp_Object readcharfun, bool stringp)
2749 case '5': 2734 case '5':
2750 case '6': 2735 case '6':
2751 case '7': 2736 case '7':
2752 /* An octal escape, as in ANSI C. */ 2737 /* 1-3 octal digits. */
2753 { 2738 {
2754 register int i = c - '0'; 2739 int i = c - '0';
2755 register int count = 0; 2740 int count = 0;
2756 while (++count < 3) 2741 while (++count < 3)
2757 { 2742 {
2758 if ((c = READCHAR) >= '0' && c <= '7') 2743 c = READCHAR;
2759 { 2744 if (c < '0' || c > '7')
2760 i *= 8;
2761 i += c - '0';
2762 }
2763 else
2764 { 2745 {
2765 UNREAD (c); 2746 UNREAD (c);
2766 break; 2747 break;
2767 } 2748 }
2749 i *= 8;
2750 i += c - '0';
2768 } 2751 }
2769 2752
2770 if (i >= 0x80 && i < 0x100) 2753 if (i >= 0x80 && i < 0x100)
2771 i = BYTE8_TO_CHAR (i); 2754 i = BYTE8_TO_CHAR (i);
2772 return i; 2755 c = i;
2756 break;
2773 } 2757 }
2774 2758
2775 case 'x': 2759 case 'x':
2776 /* A hex escape, as in ANSI C. */ 2760 /* One or more hex digits. */
2777 { 2761 {
2778 unsigned int i = 0; 2762 unsigned int i = 0;
2779 int count = 0; 2763 int count = 0;
@@ -2795,16 +2779,18 @@ read_escape (Lisp_Object readcharfun, bool stringp)
2795 } 2779 }
2796 2780
2797 if (count < 3 && i >= 0x80) 2781 if (count < 3 && i >= 0x80)
2798 return BYTE8_TO_CHAR (i); 2782 i = BYTE8_TO_CHAR (i);
2799 return i; 2783 c = i;
2784 break;
2800 } 2785 }
2801 2786
2802 case 'U': 2787 case 'U': /* Eight hex digits. */
2803 /* Post-Unicode-2.0: Up to eight hex chars. */
2804 unicode_hex_count = 8; 2788 unicode_hex_count = 8;
2805 FALLTHROUGH; 2789 goto unicode;
2806 case 'u':
2807 2790
2791 case 'u': /* Four hex digits. */
2792 unicode_hex_count = 4;
2793 unicode:
2808 /* A Unicode escape. We only permit them in strings and characters, 2794 /* A Unicode escape. We only permit them in strings and characters,
2809 not arbitrarily in the source code, as in some other languages. */ 2795 not arbitrarily in the source code, as in some other languages. */
2810 { 2796 {
@@ -2815,12 +2801,8 @@ read_escape (Lisp_Object readcharfun, bool stringp)
2815 { 2801 {
2816 c = READCHAR; 2802 c = READCHAR;
2817 if (c < 0) 2803 if (c < 0)
2818 { 2804 error ("Malformed Unicode escape: \\%c%x",
2819 if (unicode_hex_count > 4) 2805 unicode_hex_count == 4 ? 'u' : 'U', i);
2820 error ("Malformed Unicode escape: \\U%x", i);
2821 else
2822 error ("Malformed Unicode escape: \\u%x", i);
2823 }
2824 /* `isdigit' and `isalpha' may be locale-specific, which we don't 2806 /* `isdigit' and `isalpha' may be locale-specific, which we don't
2825 want. */ 2807 want. */
2826 int digit = char_hexdigit (c); 2808 int digit = char_hexdigit (c);
@@ -2831,7 +2813,8 @@ read_escape (Lisp_Object readcharfun, bool stringp)
2831 } 2813 }
2832 if (i > 0x10FFFF) 2814 if (i > 0x10FFFF)
2833 error ("Non-Unicode character: 0x%x", i); 2815 error ("Non-Unicode character: 0x%x", i);
2834 return i; 2816 c = i;
2817 break;
2835 } 2818 }
2836 2819
2837 case 'N': 2820 case 'N':
@@ -2880,12 +2863,31 @@ read_escape (Lisp_Object readcharfun, bool stringp)
2880 2863
2881 /* character_name_to_code can invoke read0, recursively. 2864 /* character_name_to_code can invoke read0, recursively.
2882 This is why read0's buffer is not static. */ 2865 This is why read0's buffer is not static. */
2883 return character_name_to_code (name, length, readcharfun); 2866 c = character_name_to_code (name, length, readcharfun);
2867 break;
2884 } 2868 }
2869 }
2885 2870
2886 default: 2871 c |= modifiers;
2887 return c; 2872 if (c & ctrl_modifier)
2873 {
2874 int b = c & ~CHAR_MODIFIER_MASK;
2875 /* If the base char is in the 0x3f..0x5f range or a lower case
2876 letter, drop the ctrl_modifier bit and generate a C0 control
2877 character instead. */
2878 if ((b >= 0x3f && b <= 0x5f) || (b >= 'a' && b <= 'z'))
2879 {
2880 c &= ~ctrl_modifier;
2881 if (b == '?')
2882 /* Special case: ^? is DEL. */
2883 b = 127;
2884 else
2885 /* Make a C0 control in 0..31 by clearing bits 5 and 6. */
2886 b &= 0x1f;
2887 }
2888 c = b | (c & CHAR_MODIFIER_MASK);
2888 } 2889 }
2890 return c;
2889} 2891}
2890 2892
2891/* Return the digit that CHARACTER stands for in the given BASE. 2893/* Return the digit that CHARACTER stands for in the given BASE.
@@ -3012,7 +3014,7 @@ read_char_literal (Lisp_Object readcharfun)
3012 } 3014 }
3013 3015
3014 if (ch == '\\') 3016 if (ch == '\\')
3015 ch = read_escape (readcharfun, 0); 3017 ch = read_escape (readcharfun);
3016 3018
3017 int modifiers = ch & CHAR_MODIFIER_MASK; 3019 int modifiers = ch & CHAR_MODIFIER_MASK;
3018 ch &= ~CHAR_MODIFIER_MASK; 3020 ch &= ~CHAR_MODIFIER_MASK;
@@ -3066,14 +3068,21 @@ read_string_literal (char stackbuf[VLA_ELEMS (stackbufsize)],
3066 3068
3067 if (ch == '\\') 3069 if (ch == '\\')
3068 { 3070 {
3069 ch = read_escape (readcharfun, 1); 3071 ch = READCHAR;
3070 3072 switch (ch)
3071 /* CH is -1 if \ newline or \ space has just been seen. */
3072 if (ch == -1)
3073 { 3073 {
3074 case 's':
3075 ch = ' ';
3076 break;
3077 case ' ':
3078 case '\n':
3074 if (p == read_buffer) 3079 if (p == read_buffer)
3075 cancel = true; 3080 cancel = true;
3076 continue; 3081 continue;
3082 default:
3083 UNREAD (ch);
3084 ch = read_escape (readcharfun);
3085 break;
3077 } 3086 }
3078 3087
3079 int modifiers = ch & CHAR_MODIFIER_MASK; 3088 int modifiers = ch & CHAR_MODIFIER_MASK;
@@ -3085,19 +3094,13 @@ read_string_literal (char stackbuf[VLA_ELEMS (stackbufsize)],
3085 force_multibyte = true; 3094 force_multibyte = true;
3086 else /* I.e. ASCII_CHAR_P (ch). */ 3095 else /* I.e. ASCII_CHAR_P (ch). */
3087 { 3096 {
3088 /* Allow `\C- ' and `\C-?'. */ 3097 /* Allow `\C-SPC' and `\^SPC'. This is done here because
3089 if (modifiers == CHAR_CTL) 3098 the literals ?\C-SPC and ?\^SPC (rather inconsistently)
3099 yield (' ' | CHAR_CTL); see bug#55738. */
3100 if (modifiers == CHAR_CTL && ch == ' ')
3090 { 3101 {
3091 if (ch == ' ') 3102 ch = 0;
3092 { 3103 modifiers = 0;
3093 ch = 0;
3094 modifiers = 0;
3095 }
3096 else if (ch == '?')
3097 {
3098 ch = 127;
3099 modifiers = 0;
3100 }
3101 } 3104 }
3102 if (modifiers & CHAR_SHIFT) 3105 if (modifiers & CHAR_SHIFT)
3103 { 3106 {
diff --git a/test/src/lread-tests.el b/test/src/lread-tests.el
index 47351c1d116..59d5ca076f1 100644
--- a/test/src/lread-tests.el
+++ b/test/src/lread-tests.el
@@ -317,4 +317,14 @@ literals (Bug#20852)."
317 (should (equal (read-from-string "#_") 317 (should (equal (read-from-string "#_")
318 '(## . 2)))) 318 '(## . 2))))
319 319
320(ert-deftest lread-misc-2 ()
321 ;; ?\LF should produce LF (only inside string literals do we ignore \LF).
322 (should (equal (read-from-string "?\\\n") '(?\n . 3)))
323 (should (equal (read-from-string "\"a\\\nb\"") '("ab" . 6)))
324 ;; The Control modifier constructs should be idempotent.
325 (should (equal ?\C-\C-x ?\C-x))
326 (should (equal ?\^\^x ?\C-x))
327 (should (equal ?\C-\^x ?\C-x))
328 (should (equal ?\^\C-x ?\C-x)))
329
320;;; lread-tests.el ends here 330;;; lread-tests.el ends here