(read_escape): Provide a Unicode character escape syntax: \u followed by
exactly four hex digits, or \U followed by exactly eight hex digits, in a
character or string literal is read as the Unicode character with that code point.
author: Eli Zaretskii 2006-06-09 18:22:30 +0000
committer: Eli Zaretskii 2006-06-09 18:22:30 +0000
commit: 71b169b8c49d4c2f593b7074e8555f6e479b10f3 (patch)
tree: 351837ea1b19a7f1b4a68cff90c9596cf983da0d /src
parent: a9ab79a844b232ce7971c6234c86be3cc634a78e (diff)
download: emacs-71b169b8c49d4c2f593b7074e8555f6e479b10f3.tar.gz
emacs-71b169b8c49d4c2f593b7074e8555f6e479b10f3.zip
1 files changed, 49 insertions, 0 deletions
diff --git a/src/lread.c b/src/lread.c
index 31f974d9bc0..a0d4ad825dd 100644
--- a/src/lread.c
+++ b/src/lread.c
@@ -1764,6 +1764,9 @@ read_escape (readcharfun, stringp, byterep)
     int *byterep;
 {
  register int c = READCHAR;
+  /* \u allows up to four hex digits, \U up to eight. Default to the
+     behaviour for \u, and change this value in the case that \U is seen. */
+  int unicode_hex_count = 4;
  *byterep = 0;
@@ -1928,6 +1931,52 @@ read_escape (readcharfun, stringp, byterep)
        return i;
      }
+    case 'U':
+      /* Post-Unicode-2.0: Up to eight hex chars.  */
+      unicode_hex_count = 8;
+    case 'u':
+      /* A Unicode escape. We only permit them in strings and characters,
+         not arbitrarily in the source code, as in some other languages.  */
+      {
+        int i = 0;
+        int count = 0;
+        Lisp_Object lisp_char;
+        struct gcpro gcpro1;
+        while (++count <= unicode_hex_count)
+          {
+            c = READCHAR;
+            /* isdigit(), isalpha() may be locale-specific, which we don't
+               want. */
+            if      (c >= '0' && c <= '9')  i = (i << 4) + (c - '0');
+            else if (c >= 'a' && c <= 'f')  i = (i << 4) + (c - 'a') + 10;
+            else if (c >= 'A' && c <= 'F')  i = (i << 4) + (c - 'A') + 10;
+            else
+              {
+                error ("Non-hex digit used for Unicode escape");
+                break;
+              }
+          }
+        GCPRO1 (readcharfun);
+        lisp_char = call2(intern("decode-char"), intern("ucs"),
+                          make_number(i));
+        UNGCPRO;
+        if (EQ(Qnil, lisp_char))
+          {
+            /* This is ugly and horrible and trashes the user's data.  */
+            XSETFASTINT (i, MAKE_CHAR (charset_katakana_jisx0201,
+                                       34 + 128, 46 + 128));
+            return i;
+          }
+        else
+          {
+            return XFASTINT (lisp_char);
+          }
+      }
    default:
      if (BASE_LEADING_CODE_P (c))
        c = read_multibyte (c, readcharfun);
author	Eli Zaretskii	2006-06-09 18:22:30 +0000
committer	Eli Zaretskii	2006-06-09 18:22:30 +0000
commit	71b169b8c49d4c2f593b7074e8555f6e479b10f3 (patch)
tree	351837ea1b19a7f1b4a68cff90c9596cf983da0d /src
parent	a9ab79a844b232ce7971c6234c86be3cc634a78e (diff)
download	emacs-71b169b8c49d4c2f593b7074e8555f6e479b10f3.tar.gz emacs-71b169b8c49d4c2f593b7074e8555f6e479b10f3.zip

diff --git a/src/lread.c b/src/lread.c
index 31f974d9bc0..a0d4ad825dd 100644
--- a/src/lread.c
+++ b/src/lread.c
@@ -1764,6 +1764,9 @@ read_escape (readcharfun, stringp, byterep)
1764	int *byterep;	1764	int *byterep;
1765	{	1765	{
1766	register int c = READCHAR;	1766	register int c = READCHAR;
		1767	/* \u allows up to four hex digits, \U up to eight. Default to the
		1768	behaviour for \u, and change this value in the case that \U is seen. */
		1769	int unicode_hex_count = 4;
1767		1770
1768	*byterep = 0;	1771	*byterep = 0;
1769		1772
@@ -1928,6 +1931,52 @@ read_escape (readcharfun, stringp, byterep)
1928	return i;	1931	return i;
1929	}	1932	}
1930		1933
		1934	case 'U':
		1935	/* Post-Unicode-2.0: Up to eight hex chars. */
		1936	unicode_hex_count = 8;
		1937	case 'u':
		1938
		1939	/* A Unicode escape. We only permit them in strings and characters,
		1940	not arbitrarily in the source code, as in some other languages. */
		1941	{
		1942	int i = 0;
		1943	int count = 0;
		1944	Lisp_Object lisp_char;
		1945	struct gcpro gcpro1;
		1946
		1947	while (++count <= unicode_hex_count)
		1948	{
		1949	c = READCHAR;
		1950	/* isdigit(), isalpha() may be locale-specific, which we don't
		1951	want. */
		1952	if (c >= '0' && c <= '9') i = (i << 4) + (c - '0');
		1953	else if (c >= 'a' && c <= 'f') i = (i << 4) + (c - 'a') + 10;
		1954	else if (c >= 'A' && c <= 'F') i = (i << 4) + (c - 'A') + 10;
		1955	else
		1956	{
		1957	error ("Non-hex digit used for Unicode escape");
		1958	break;
		1959	}
		1960	}
		1961
		1962	GCPRO1 (readcharfun);
		1963	lisp_char = call2(intern("decode-char"), intern("ucs"),
		1964	make_number(i));
		1965	UNGCPRO;
		1966
		1967	if (EQ(Qnil, lisp_char))
		1968	{
		1969	/* This is ugly and horrible and trashes the user's data. */
		1970	XSETFASTINT (i, MAKE_CHAR (charset_katakana_jisx0201,
		1971	34 + 128, 46 + 128));
		1972	return i;
		1973	}
		1974	else
		1975	{
		1976	return XFASTINT (lisp_char);
		1977	}
		1978	}
		1979
1931	default:	1980	default:
1932	if (BASE_LEADING_CODE_P (c))	1981	if (BASE_LEADING_CODE_P (c))
1933	c = read_multibyte (c, readcharfun);	1982	c = read_multibyte (c, readcharfun);