diff options
| author | Philipp Stephani | 2017-01-06 15:56:51 +0100 |
|---|---|---|
| committer | Philipp Stephani | 2017-01-06 20:12:48 +0100 |
| commit | 512e9886be693f61f9d1932f19461bf4482fba51 (patch) | |
| tree | 84a9576c26b01fc8990e9290a3e52a3cb38c4550 /src | |
| parent | 8f0376309ee37e4f1da21d78971c4df2df5fd7b6 (diff) | |
| download | emacs-512e9886be693f61f9d1932f19461bf4482fba51.tar.gz emacs-512e9886be693f61f9d1932f19461bf4482fba51.zip | |
Add support for Unicode whitespace in [:blank:]
See Bug#25366.
* src/character.c (blankp): New function for checking Unicode
horizontal whitespace.
* src/regex.c (ISBLANK): Use 'blankp' for non-ASCII horizontal
whitespace.
(BIT_BLANK): New bit for range table.
(re_wctype_to_bit, execute_charset): Use it.
* test/lisp/subr-tests.el (subr-tests--string-match-p--blank): Add
unit test for [:blank:] character class.
* test/src/regex-tests.el (test): Adapt unit test.
* doc/lispref/searching.texi (Char Classes): Document new Unicode
behavior for [:blank:].
Diffstat (limited to 'src')
| -rw-r--r-- | src/character.c | 17 | ||||
| -rw-r--r-- | src/character.h | 1 | ||||
| -rw-r--r-- | src/regex.c | 12 |
3 files changed, 26 insertions, 4 deletions
diff --git a/src/character.c b/src/character.c
index b594af040c1..bc99daf0df0 100644
--- a/src/character.c
+++ b/src/character.c
| @@ -1038,6 +1038,23 @@ printablep (int c) | |||
| 1038 | || gen_cat == UNICODE_CATEGORY_Cn)); /* unassigned */ | 1038 | || gen_cat == UNICODE_CATEGORY_Cn)); /* unassigned */ |
| 1039 | } | 1039 | } |
| 1040 | 1040 | ||
| 1041 | /* Return true if C is a horizontal whitespace character, as defined | ||
| 1042 | by http://www.unicode.org/reports/tr18/tr18-19.html#blank. */ | ||
| 1043 | bool | ||
| 1044 | blankp (int c) | ||
| 1045 | { | ||
| 1046 | /* Fast path for ASCII characters that are always assumed to | ||
| 1047 | constitute horizontal whitespace. */ | ||
| 1048 | if (c == ' ' || c == '\t') | ||
| 1049 | return true; | ||
| 1050 | |||
| 1051 | Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c); | ||
| 1052 | if (! INTEGERP (category)) | ||
| 1053 | return false; | ||
| 1054 | |||
| 1055 | return XINT (category) == UNICODE_CATEGORY_Zs; /* separator, space */ | ||
| 1056 | } | ||
| 1057 | |||
| 1041 | void | 1058 | void |
| 1042 | syms_of_character (void) | 1059 | syms_of_character (void) |
| 1043 | { | 1060 | { |
diff --git a/src/character.h b/src/character.h
index fc8a0dd74d2..62d252e91ba 100644
--- a/src/character.h
+++ b/src/character.h
| @@ -680,6 +680,7 @@ extern bool alphabeticp (int); | |||
| 680 | extern bool alphanumericp (int); | 680 | extern bool alphanumericp (int); |
| 681 | extern bool graphicp (int); | 681 | extern bool graphicp (int); |
| 682 | extern bool printablep (int); | 682 | extern bool printablep (int); |
| 683 | extern bool blankp (int); | ||
| 683 | 684 | ||
| 684 | /* Return a translation table of id number ID. */ | 685 | /* Return a translation table of id number ID. */ |
| 685 | #define GET_TRANSLATION_TABLE(id) \ | 686 | #define GET_TRANSLATION_TABLE(id) \ |
diff --git a/src/regex.c b/src/regex.c
index ae3fde80c9e..7e70c494f47 100644
--- a/src/regex.c
+++ b/src/regex.c
| @@ -310,11 +310,12 @@ enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 }; | |||
| 310 | || ((c) >= 'a' && (c) <= 'f') \ | 310 | || ((c) >= 'a' && (c) <= 'f') \ |
| 311 | || ((c) >= 'A' && (c) <= 'F')) | 311 | || ((c) >= 'A' && (c) <= 'F')) |
| 312 | 312 | ||
| 313 | /* This is only used for single-byte characters. */ | ||
| 314 | # define ISBLANK(c) ((c) == ' ' || (c) == '\t') | ||
| 315 | |||
| 316 | /* The rest must handle multibyte characters. */ | 313 | /* The rest must handle multibyte characters. */ |
| 317 | 314 | ||
| 315 | # define ISBLANK(c) (IS_REAL_ASCII (c) \ | ||
| 316 | ? ((c) == ' ' || (c) == '\t') \ | ||
| 317 | : blankp (c)) | ||
| 318 | |||
| 318 | # define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \ | 319 | # define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \ |
| 319 | ? (c) > ' ' && !((c) >= 0177 && (c) <= 0240) \ | 320 | ? (c) > ' ' && !((c) >= 0177 && (c) <= 0240) \ |
| 320 | : graphicp (c)) | 321 | : graphicp (c)) |
| @@ -1790,6 +1791,7 @@ struct range_table_work_area | |||
| 1790 | #define BIT_ALNUM 0x80 | 1791 | #define BIT_ALNUM 0x80 |
| 1791 | #define BIT_GRAPH 0x100 | 1792 | #define BIT_GRAPH 0x100 |
| 1792 | #define BIT_PRINT 0x200 | 1793 | #define BIT_PRINT 0x200 |
| 1794 | #define BIT_BLANK 0x400 | ||
| 1793 | 1795 | ||
| 1794 | 1796 | ||
| 1795 | /* Set the bit for character C in a list. */ | 1797 | /* Set the bit for character C in a list. */ |
| @@ -2066,8 +2068,9 @@ re_wctype_to_bit (re_wctype_t cc) | |||
| 2066 | case RECC_SPACE: return BIT_SPACE; | 2068 | case RECC_SPACE: return BIT_SPACE; |
| 2067 | case RECC_GRAPH: return BIT_GRAPH; | 2069 | case RECC_GRAPH: return BIT_GRAPH; |
| 2068 | case RECC_PRINT: return BIT_PRINT; | 2070 | case RECC_PRINT: return BIT_PRINT; |
| 2071 | case RECC_BLANK: return BIT_BLANK; | ||
| 2069 | case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL: | 2072 | case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL: |
| 2070 | case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0; | 2073 | case RECC_UNIBYTE: case RECC_ERROR: return 0; |
| 2071 | default: | 2074 | default: |
| 2072 | abort (); | 2075 | abort (); |
| 2073 | } | 2076 | } |
| @@ -4658,6 +4661,7 @@ execute_charset (const_re_char **pp, unsigned c, unsigned corig, bool unibyte) | |||
| 4658 | (class_bits & BIT_ALNUM && ISALNUM (c)) || | 4661 | (class_bits & BIT_ALNUM && ISALNUM (c)) || |
| 4659 | (class_bits & BIT_ALPHA && ISALPHA (c)) || | 4662 | (class_bits & BIT_ALPHA && ISALPHA (c)) || |
| 4660 | (class_bits & BIT_SPACE && ISSPACE (c)) || | 4663 | (class_bits & BIT_SPACE && ISSPACE (c)) || |
| 4664 | (class_bits & BIT_BLANK && ISBLANK (c)) || | ||
| 4661 | (class_bits & BIT_WORD && ISWORD (c)) || | 4665 | (class_bits & BIT_WORD && ISWORD (c)) || |
| 4662 | ((class_bits & BIT_UPPER) && | 4666 | ((class_bits & BIT_UPPER) && |
| 4663 | (ISUPPER (c) || (corig != c && | 4667 | (ISUPPER (c) || (corig != c && |