about | summary | refs | log | tree | commit | diff | stats
path: root/src
diff options
context:
space:
mode:
author: Philipp Stephani, 2017-01-06 15:56:51 +0100
committer: Philipp Stephani, 2017-01-06 20:12:48 +0100
commit: 512e9886be693f61f9d1932f19461bf4482fba51 (patch)
tree: 84a9576c26b01fc8990e9290a3e52a3cb38c4550 /src
parent: 8f0376309ee37e4f1da21d78971c4df2df5fd7b6 (diff)
download: emacs-512e9886be693f61f9d1932f19461bf4482fba51.tar.gz
download: emacs-512e9886be693f61f9d1932f19461bf4482fba51.zip
Add support for Unicode whitespace in [:blank:]
See Bug#25366.

* src/character.c (blankp): New function for checking Unicode
horizontal whitespace.
* src/regex.c (ISBLANK): Use 'blankp' for non-ASCII horizontal
whitespace.
(BIT_BLANK): New bit for range table.
(re_wctype_to_bit, execute_charset): Use it.
* test/lisp/subr-tests.el (subr-tests--string-match-p--blank): Add unit
test for [:blank:] character class.
* test/src/regex-tests.el (test): Adapt unit test.
* doc/lispref/searching.texi (Char Classes): Document new Unicode
behavior for [:blank:].
Diffstat (limited to 'src')
-rw-r--r--  src/character.c  17
-rw-r--r--  src/character.h   1
-rw-r--r--  src/regex.c      12
3 files changed, 26 insertions, 4 deletions
diff --git a/src/character.c b/src/character.c
index b594af040c1..bc99daf0df0 100644
--- a/src/character.c
+++ b/src/character.c
@@ -1038,6 +1038,23 @@ printablep (int c)
1038 || gen_cat == UNICODE_CATEGORY_Cn)); /* unassigned */ 1038 || gen_cat == UNICODE_CATEGORY_Cn)); /* unassigned */
1039} 1039}
1040 1040
1041/* Return true if C is a horizontal whitespace character, as defined
1042 by http://www.unicode.org/reports/tr18/tr18-19.html#blank. */
1043bool
1044blankp (int c)
1045{
1046 /* Fast path for ASCII characters that are always assumed to
1047 constitute horizontal whitespace. */
1048 if (c == ' ' || c == '\t')
1049 return true;
1050
1051 Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
1052 if (! INTEGERP (category))
1053 return false;
1054
1055 return XINT (category) == UNICODE_CATEGORY_Zs; /* separator, space */
1056}
1057
1041void 1058void
1042syms_of_character (void) 1059syms_of_character (void)
1043{ 1060{
diff --git a/src/character.h b/src/character.h
index fc8a0dd74d2..62d252e91ba 100644
--- a/src/character.h
+++ b/src/character.h
@@ -680,6 +680,7 @@ extern bool alphabeticp (int);
680extern bool alphanumericp (int); 680extern bool alphanumericp (int);
681extern bool graphicp (int); 681extern bool graphicp (int);
682extern bool printablep (int); 682extern bool printablep (int);
683extern bool blankp (int);
683 684
684/* Return a translation table of id number ID. */ 685/* Return a translation table of id number ID. */
685#define GET_TRANSLATION_TABLE(id) \ 686#define GET_TRANSLATION_TABLE(id) \
diff --git a/src/regex.c b/src/regex.c
index ae3fde80c9e..7e70c494f47 100644
--- a/src/regex.c
+++ b/src/regex.c
@@ -310,11 +310,12 @@ enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
310 || ((c) >= 'a' && (c) <= 'f') \ 310 || ((c) >= 'a' && (c) <= 'f') \
311 || ((c) >= 'A' && (c) <= 'F')) 311 || ((c) >= 'A' && (c) <= 'F'))
312 312
313/* This is only used for single-byte characters. */
314# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
315
316/* The rest must handle multibyte characters. */ 313/* The rest must handle multibyte characters. */
317 314
315# define ISBLANK(c) (IS_REAL_ASCII (c) \
316 ? ((c) == ' ' || (c) == '\t') \
317 : blankp (c))
318
318# define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \ 319# define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
319 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0240) \ 320 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0240) \
320 : graphicp (c)) 321 : graphicp (c))
@@ -1790,6 +1791,7 @@ struct range_table_work_area
1790#define BIT_ALNUM 0x80 1791#define BIT_ALNUM 0x80
1791#define BIT_GRAPH 0x100 1792#define BIT_GRAPH 0x100
1792#define BIT_PRINT 0x200 1793#define BIT_PRINT 0x200
1794#define BIT_BLANK 0x400
1793 1795
1794 1796
1795/* Set the bit for character C in a list. */ 1797/* Set the bit for character C in a list. */
@@ -2066,8 +2068,9 @@ re_wctype_to_bit (re_wctype_t cc)
2066 case RECC_SPACE: return BIT_SPACE; 2068 case RECC_SPACE: return BIT_SPACE;
2067 case RECC_GRAPH: return BIT_GRAPH; 2069 case RECC_GRAPH: return BIT_GRAPH;
2068 case RECC_PRINT: return BIT_PRINT; 2070 case RECC_PRINT: return BIT_PRINT;
2071 case RECC_BLANK: return BIT_BLANK;
2069 case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL: 2072 case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
2070 case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0; 2073 case RECC_UNIBYTE: case RECC_ERROR: return 0;
2071 default: 2074 default:
2072 abort (); 2075 abort ();
2073 } 2076 }
@@ -4658,6 +4661,7 @@ execute_charset (const_re_char **pp, unsigned c, unsigned corig, bool unibyte)
4658 (class_bits & BIT_ALNUM && ISALNUM (c)) || 4661 (class_bits & BIT_ALNUM && ISALNUM (c)) ||
4659 (class_bits & BIT_ALPHA && ISALPHA (c)) || 4662 (class_bits & BIT_ALPHA && ISALPHA (c)) ||
4660 (class_bits & BIT_SPACE && ISSPACE (c)) || 4663 (class_bits & BIT_SPACE && ISSPACE (c)) ||
4664 (class_bits & BIT_BLANK && ISBLANK (c)) ||
4661 (class_bits & BIT_WORD && ISWORD (c)) || 4665 (class_bits & BIT_WORD && ISWORD (c)) ||
4662 ((class_bits & BIT_UPPER) && 4666 ((class_bits & BIT_UPPER) &&
4663 (ISUPPER (c) || (corig != c && 4667 (ISUPPER (c) || (corig != c &&