diff options
| author | Eli Zaretskii | 2015-02-28 14:25:35 +0200 |
|---|---|---|
| committer | Eli Zaretskii | 2015-02-28 14:25:35 +0200 |
| commit | 1a50945fa4c666ae2ab5cd9419d23ad063ea1249 (patch) | |
| tree | e44604490fdf92f8912679a30eec7bb7ecb1929f /src | |
| parent | 31ecbf8d513540855aa07588f6746942aed453ba (diff) | |
| download | emacs-1a50945fa4c666ae2ab5cd9419d23ad063ea1249.tar.gz emacs-1a50945fa4c666ae2ab5cd9419d23ad063ea1249.zip | |
Improve [:alpha:] and [:alnum:] for multibyte characters (Bug#19878)
src/character.c (alphabeticp, decimalnump): New functions.
src/character.h (alphabeticp, decimalnump): Add prototypes.
src/regex.c (ISALNUM, ISALPHA): Check Unicode character properties
for multibyte characters by calling alphabeticp and decimalnump.
(BIT_ALPHA, BIT_ALNUM): New bit masks.
(re_wctype_to_bit): Return them when the class is RECC_ALPHA or
RECC_ALNUM.
(re_match_2_internal): Call ISALPHA and ISALNUM when appropriate.
doc/lispref/searching.texi (Char Classes): Update the documentation of
[:alpha:] and [:alnum:].
etc/NEWS: Mention the changes in [:alpha:] and [:alnum:].
Diffstat (limited to 'src')
| -rw-r--r-- | src/ChangeLog | 13 | ||||
| -rw-r--r-- | src/character.c | 42 | ||||
| -rw-r--r-- | src/character.h | 3 | ||||
| -rw-r--r-- | src/regex.c | 18 |
4 files changed, 70 insertions, 6 deletions
diff --git a/src/ChangeLog b/src/ChangeLog index df687914911..97ecbac0953 100644 --- a/src/ChangeLog +++ b/src/ChangeLog | |||
| @@ -1,3 +1,16 @@ | |||
| 1 | 2015-02-28 Eli Zaretskii <eliz@gnu.org> | ||
| 2 | |||
| 3 | * character.c (alphabeticp, decimalnump): New functions. | ||
| 4 | * character.h (alphabeticp, decimalnump): Add prototypes. | ||
| 5 | |||
| 6 | * regex.c (ISALNUM, ISALPHA): Check Unicode character properties | ||
| 7 | for multibyte characters by calling alphabeticp and decimalnump. | ||
| 8 | (BIT_ALPHA, BIT_ALNUM): New bit masks. | ||
| 9 | (re_wctype_to_bit): Return them when the class is RECC_ALPHA or | ||
| 10 | RECC_ALNUM. | ||
| 11 | (re_match_2_internal): Call ISALPHA and ISALNUM when appropriate. | ||
| 12 | (Bug#19878) | ||
| 13 | |||
| 1 | 2015-02-27 Jan Djärv <jan.h.d@swipnet.se> | 14 | 2015-02-27 Jan Djärv <jan.h.d@swipnet.se> |
| 2 | 15 | ||
| 3 | * xterm.h (x_real_pos_and_offsets): Take outer_border as arg also. | 16 | * xterm.h (x_real_pos_and_offsets): Take outer_border as arg also. |
diff --git a/src/character.c b/src/character.c index 39d32c9d41a..999f99aa003 100644 --- a/src/character.c +++ b/src/character.c | |||
| @@ -984,6 +984,48 @@ character is not ASCII nor 8-bit character, an error is signaled. */) | |||
| 984 | 984 | ||
| 985 | #ifdef emacs | 985 | #ifdef emacs |
| 986 | 986 | ||
| 987 | /* Return 'true' if C is an alphabetic character as defined by its | ||
| 988 | Unicode properties. Return 'false' if the entry for C in | ||
| 989 | Vunicode_category_table is not an integer. */ | ||
| 990 | bool | ||
| 991 | alphabeticp (int c) | ||
| 992 | { | ||
| 993 | Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c); | ||
| 994 | |||
| 995 | if (! INTEGERP (category)) | ||
| 996 | return false; | ||
| 997 | unicode_category_t gen_cat = XINT (category); | ||
| 998 | |||
| 999 | /* See UTS #18. There are additional characters that should be | ||
| 1000 | here, those designated as Other_uppercase, Other_lowercase, | ||
| 1001 | and Other_alphabetic; FIXME. */ | ||
| 1002 | return (gen_cat == UNICODE_CATEGORY_Lu | ||
| 1003 | || gen_cat == UNICODE_CATEGORY_Ll | ||
| 1004 | || gen_cat == UNICODE_CATEGORY_Lt | ||
| 1005 | || gen_cat == UNICODE_CATEGORY_Lm | ||
| 1006 | || gen_cat == UNICODE_CATEGORY_Lo | ||
| 1007 | || gen_cat == UNICODE_CATEGORY_Mn | ||
| 1008 | || gen_cat == UNICODE_CATEGORY_Mc | ||
| 1009 | || gen_cat == UNICODE_CATEGORY_Me | ||
| 1010 | || gen_cat == UNICODE_CATEGORY_Nl); | ||
| 1011 | } | ||
| 1012 | |||
| 1013 | /* Return 'true' if C is a decimal-number character as defined by its | ||
| 1014 | Unicode properties. Return 'false' if the entry for C in | ||
| 1015 | Vunicode_category_table is not an integer. */ | ||
| 1016 | bool | ||
| 1017 | decimalnump (int c) | ||
| 1018 | { | ||
| 1019 | Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c); | ||
| 1020 | |||
| 1021 | if (! INTEGERP (category)) | ||
| 1022 | return false; | ||
| 1023 | unicode_category_t gen_cat = XINT (category); | ||
| 1024 | |||
| 1025 | /* See UTS #18. */ | ||
| 1026 | return gen_cat == UNICODE_CATEGORY_Nd; | ||
| 1027 | } | ||
| 1028 | |||
| 987 | void | 1029 | void |
| 988 | syms_of_character (void) | 1030 | syms_of_character (void) |
| 989 | { | 1031 | { |
diff --git a/src/character.h b/src/character.h index 5043880cb42..7d902952db6 100644 --- a/src/character.h +++ b/src/character.h | |||
| @@ -660,6 +660,9 @@ extern ptrdiff_t lisp_string_width (Lisp_Object, ptrdiff_t, | |||
| 660 | extern Lisp_Object Vchar_unify_table; | 660 | extern Lisp_Object Vchar_unify_table; |
| 661 | extern Lisp_Object string_escape_byte8 (Lisp_Object); | 661 | extern Lisp_Object string_escape_byte8 (Lisp_Object); |
| 662 | 662 | ||
| 663 | extern bool alphabeticp (int); | ||
| 664 | extern bool decimalnump (int); | ||
| 665 | |||
| 663 | /* Return a translation table of id number ID. */ | 666 | /* Return a translation table of id number ID. */ |
| 664 | #define GET_TRANSLATION_TABLE(id) \ | 667 | #define GET_TRANSLATION_TABLE(id) \ |
| 665 | (XCDR (XVECTOR (Vtranslation_table_vector)->contents[(id)])) | 668 | (XCDR (XVECTOR (Vtranslation_table_vector)->contents[(id)])) |
diff --git a/src/regex.c b/src/regex.c index 41fe3fa8088..1afc5037594 100644 --- a/src/regex.c +++ b/src/regex.c | |||
| @@ -324,12 +324,12 @@ enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 }; | |||
| 324 | ? (((c) >= 'a' && (c) <= 'z') \ | 324 | ? (((c) >= 'a' && (c) <= 'z') \ |
| 325 | || ((c) >= 'A' && (c) <= 'Z') \ | 325 | || ((c) >= 'A' && (c) <= 'Z') \ |
| 326 | || ((c) >= '0' && (c) <= '9')) \ | 326 | || ((c) >= '0' && (c) <= '9')) \ |
| 327 | : SYNTAX (c) == Sword) | 327 | : (alphabeticp (c) || decimalnump (c))) |
| 328 | 328 | ||
| 329 | # define ISALPHA(c) (IS_REAL_ASCII (c) \ | 329 | # define ISALPHA(c) (IS_REAL_ASCII (c) \ |
| 330 | ? (((c) >= 'a' && (c) <= 'z') \ | 330 | ? (((c) >= 'a' && (c) <= 'z') \ |
| 331 | || ((c) >= 'A' && (c) <= 'Z')) \ | 331 | || ((c) >= 'A' && (c) <= 'Z')) \ |
| 332 | : SYNTAX (c) == Sword) | 332 | : alphabeticp (c)) |
| 333 | 333 | ||
| 334 | # define ISLOWER(c) lowercasep (c) | 334 | # define ISLOWER(c) lowercasep (c) |
| 335 | 335 | ||
| @@ -1872,6 +1872,8 @@ struct range_table_work_area | |||
| 1872 | #define BIT_SPACE 0x8 | 1872 | #define BIT_SPACE 0x8 |
| 1873 | #define BIT_UPPER 0x10 | 1873 | #define BIT_UPPER 0x10 |
| 1874 | #define BIT_MULTIBYTE 0x20 | 1874 | #define BIT_MULTIBYTE 0x20 |
| 1875 | #define BIT_ALPHA 0x40 | ||
| 1876 | #define BIT_ALNUM 0x80 | ||
| 1875 | 1877 | ||
| 1876 | 1878 | ||
| 1877 | /* Set the bit for character C in a list. */ | 1879 | /* Set the bit for character C in a list. */ |
| @@ -2072,7 +2074,9 @@ re_wctype_to_bit (re_wctype_t cc) | |||
| 2072 | { | 2074 | { |
| 2073 | case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH: | 2075 | case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH: |
| 2074 | case RECC_MULTIBYTE: return BIT_MULTIBYTE; | 2076 | case RECC_MULTIBYTE: return BIT_MULTIBYTE; |
| 2075 | case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD; | 2077 | case RECC_ALPHA: return BIT_ALPHA; |
| 2078 | case RECC_ALNUM: return BIT_ALNUM; | ||
| 2079 | case RECC_WORD: return BIT_WORD; | ||
| 2076 | case RECC_LOWER: return BIT_LOWER; | 2080 | case RECC_LOWER: return BIT_LOWER; |
| 2077 | case RECC_UPPER: return BIT_UPPER; | 2081 | case RECC_UPPER: return BIT_UPPER; |
| 2078 | case RECC_PUNCT: return BIT_PUNCT; | 2082 | case RECC_PUNCT: return BIT_PUNCT; |
| @@ -2930,7 +2934,7 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax, | |||
| 2930 | #endif /* emacs */ | 2934 | #endif /* emacs */ |
| 2931 | /* In most cases the matching rule for char classes | 2935 | /* In most cases the matching rule for char classes |
| 2932 | only uses the syntax table for multibyte chars, | 2936 | only uses the syntax table for multibyte chars, |
| 2933 | so that the content of the syntax-table it is not | 2937 | so that the content of the syntax-table is not |
| 2934 | hardcoded in the range_table. SPACE and WORD are | 2938 | hardcoded in the range_table. SPACE and WORD are |
| 2935 | the two exceptions. */ | 2939 | the two exceptions. */ |
| 2936 | if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD))) | 2940 | if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD))) |
| @@ -2945,7 +2949,7 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax, | |||
| 2945 | p = class_beg; | 2949 | p = class_beg; |
| 2946 | SET_LIST_BIT ('['); | 2950 | SET_LIST_BIT ('['); |
| 2947 | 2951 | ||
| 2948 | /* Because the `:' may starts the range, we | 2952 | /* Because the `:' may start the range, we |
| 2949 | can't simply set bit and repeat the loop. | 2953 | can't simply set bit and repeat the loop. |
| 2950 | Instead, just set it to C and handle below. */ | 2954 | Instead, just set it to C and handle below. */ |
| 2951 | c = ':'; | 2955 | c = ':'; |
| @@ -5513,7 +5517,9 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1, | |||
| 5513 | | (class_bits & BIT_PUNCT && ISPUNCT (c)) | 5517 | | (class_bits & BIT_PUNCT && ISPUNCT (c)) |
| 5514 | | (class_bits & BIT_SPACE && ISSPACE (c)) | 5518 | | (class_bits & BIT_SPACE && ISSPACE (c)) |
| 5515 | | (class_bits & BIT_UPPER && ISUPPER (c)) | 5519 | | (class_bits & BIT_UPPER && ISUPPER (c)) |
| 5516 | | (class_bits & BIT_WORD && ISWORD (c))) | 5520 | | (class_bits & BIT_WORD && ISWORD (c)) |
| 5521 | | (class_bits & BIT_ALPHA && ISALPHA (c)) | ||
| 5522 | | (class_bits & BIT_ALNUM && ISALNUM (c))) | ||
| 5517 | not = !not; | 5523 | not = !not; |
| 5518 | else | 5524 | else |
| 5519 | CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count); | 5525 | CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count); |