aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
author: Eli Zaretskii, 2015-04-14 18:47:04 +0300
committer: Eli Zaretskii, 2015-04-14 18:47:04 +0300
commit: 6c284c6b5828bc4407f7201499e0507ce0e5a0a0 (patch)
tree: 0e89f736a55245dc7e59725e57effb3b36b6dfe8 /src
parent: 8802474a219ad3be01825466a8837d3775f8b31b (diff)
download: emacs-6c284c6b5828bc4407f7201499e0507ce0e5a0a0.tar.gz
download: emacs-6c284c6b5828bc4407f7201499e0507ce0e5a0a0.zip
Make [:print:] support non-ASCII characters correctly
* src/regex.c (ISPRINT): Call 'printablep' for multibyte characters.
(BIT_PRINT): New bit mask.
(re_wctype_to_bit): Return BIT_PRINT for RECC_PRINT.
* src/character.c (printablep): New function.
* src/character.h (printablep): Add prototype.
* lisp/emacs-lisp/rx.el (rx): Doc fix: document the new behavior of 'print', 'alnum', and 'alphabetic'.
* doc/lispref/searching.texi (Char Classes): Document the new behavior of [:print:].
* etc/NEWS: Mention the new behavior of [:print:].
Diffstat (limited to 'src')
-rw-r--r-- src/character.c | 16
-rw-r--r-- src/character.h | 1
-rw-r--r-- src/regex.c | 9
3 files changed, 23 insertions, 3 deletions
diff --git a/src/character.c b/src/character.c
index ad78f512f43..b357dd5a334 100644
--- a/src/character.c
+++ b/src/character.c
@@ -1022,6 +1022,22 @@ decimalnump (int c)
1022 return gen_cat == UNICODE_CATEGORY_Nd; 1022 return gen_cat == UNICODE_CATEGORY_Nd;
1023} 1023}
1024 1024
1025/* Return 'true' if C is a printable character as defined by its
1026 Unicode properties. The general category of C is looked up in Vunicode_category_table; C counts as printable unless that category is Cc, Cs, or Cn. If the table entry for C is not an integer, the result is 'false'. */
1027bool
1028printablep (int c)
1029{
1030 Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
1031 if (! INTEGERP (category))
1032 return false; /* no integer general category recorded for C */
1033 EMACS_INT gen_cat = XINT (category);
1034
1035 /* See UTS #18. Every category other than the three tested below is treated as printable. */
1036 return (!(gen_cat == UNICODE_CATEGORY_Cc /* control */
1037 || gen_cat == UNICODE_CATEGORY_Cs /* surrogate */
1038 || gen_cat == UNICODE_CATEGORY_Cn)); /* unassigned */
1039}
1040
1025void 1041void
1026syms_of_character (void) 1042syms_of_character (void)
1027{ 1043{
diff --git a/src/character.h b/src/character.h
index 7d902952db6..1a5d2c8a670 100644
--- a/src/character.h
+++ b/src/character.h
@@ -662,6 +662,7 @@ extern Lisp_Object string_escape_byte8 (Lisp_Object);
662 662
663extern bool alphabeticp (int); 663extern bool alphabeticp (int);
664extern bool decimalnump (int); 664extern bool decimalnump (int);
665extern bool printablep (int);
665 666
666/* Return a translation table of id number ID. */ 667/* Return a translation table of id number ID. */
667#define GET_TRANSLATION_TABLE(id) \ 668#define GET_TRANSLATION_TABLE(id) \
diff --git a/src/regex.c b/src/regex.c
index 1afc5037594..b9d09d02c22 100644
--- a/src/regex.c
+++ b/src/regex.c
@@ -318,7 +318,7 @@ enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
318 318
319# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \ 319# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
320 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \ 320 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
321 : 1) 321 : printablep (c))
322 322
323# define ISALNUM(c) (IS_REAL_ASCII (c) \ 323# define ISALNUM(c) (IS_REAL_ASCII (c) \
324 ? (((c) >= 'a' && (c) <= 'z') \ 324 ? (((c) >= 'a' && (c) <= 'z') \
@@ -1865,7 +1865,8 @@ struct range_table_work_area
1865#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i]) 1865#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
1866 1866
1867/* Bits used to implement the multibyte-part of the various character classes 1867/* Bits used to implement the multibyte-part of the various character classes
1868 such as [:alnum:] in a charset's range table. */ 1868 such as [:alnum:] in a charset's range table. The code currently assumes
1869 that only the low 16 bits are used. */
1869#define BIT_WORD 0x1 1870#define BIT_WORD 0x1
1870#define BIT_LOWER 0x2 1871#define BIT_LOWER 0x2
1871#define BIT_PUNCT 0x4 1872#define BIT_PUNCT 0x4
@@ -1874,6 +1875,7 @@ struct range_table_work_area
1874#define BIT_MULTIBYTE 0x20 1875#define BIT_MULTIBYTE 0x20
1875#define BIT_ALPHA 0x40 1876#define BIT_ALPHA 0x40
1876#define BIT_ALNUM 0x80 1877#define BIT_ALNUM 0x80
1878#define BIT_PRINT 0x100
1877 1879
1878 1880
1879/* Set the bit for character C in a list. */ 1881/* Set the bit for character C in a list. */
@@ -2072,7 +2074,7 @@ re_wctype_to_bit (re_wctype_t cc)
2072{ 2074{
2073 switch (cc) 2075 switch (cc)
2074 { 2076 {
2075 case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH: 2077 case RECC_NONASCII: case RECC_GRAPH:
2076 case RECC_MULTIBYTE: return BIT_MULTIBYTE; 2078 case RECC_MULTIBYTE: return BIT_MULTIBYTE;
2077 case RECC_ALPHA: return BIT_ALPHA; 2079 case RECC_ALPHA: return BIT_ALPHA;
2078 case RECC_ALNUM: return BIT_ALNUM; 2080 case RECC_ALNUM: return BIT_ALNUM;
@@ -2081,6 +2083,7 @@ re_wctype_to_bit (re_wctype_t cc)
2081 case RECC_UPPER: return BIT_UPPER; 2083 case RECC_UPPER: return BIT_UPPER;
2082 case RECC_PUNCT: return BIT_PUNCT; 2084 case RECC_PUNCT: return BIT_PUNCT;
2083 case RECC_SPACE: return BIT_SPACE; 2085 case RECC_SPACE: return BIT_SPACE;
2086 case RECC_PRINT: return BIT_PRINT;
2084 case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL: 2087 case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
2085 case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0; 2088 case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
2086 default: 2089 default: