aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
author: Eli Zaretskii, 2015-02-28 14:25:35 +0200
committer: Eli Zaretskii, 2015-02-28 14:25:35 +0200
commit: 1a50945fa4c666ae2ab5cd9419d23ad063ea1249 (patch)
tree: e44604490fdf92f8912679a30eec7bb7ecb1929f /src
parent: 31ecbf8d513540855aa07588f6746942aed453ba (diff)
download: emacs-1a50945fa4c666ae2ab5cd9419d23ad063ea1249.tar.gz
          emacs-1a50945fa4c666ae2ab5cd9419d23ad063ea1249.zip
Improve [:alpha:] and [:alnum:] for multibyte characters (Bug#19878)
src/character.c (alphabeticp, decimalnump): New functions. src/character.h (alphabeticp, decimalnump): Add prototypes. src/regex.c (ISALNUM, ISALPHA): Check Unicode character properties for multibyte characters by calling alphabeticp and decimalnump. (BIT_ALPHA, BIT_ALNUM): New bit masks. (re_wctype_to_bit): Return them when the class is RECC_ALPHA or RECC_ALNUM. (re_match_2_internal): Call ISALPHA and ISALNUM when appropriate. doc/lispref/searching.texi (Char Classes): Update the documentation of [:alpha:] and [:alnum:]. etc/NEWS: Mention the changes in [:alpha:] and [:alnum:].
Diffstat (limited to 'src')
-rw-r--r-- src/ChangeLog 13
-rw-r--r-- src/character.c 42
-rw-r--r-- src/character.h 3
-rw-r--r-- src/regex.c 18
4 files changed, 70 insertions, 6 deletions
diff --git a/src/ChangeLog b/src/ChangeLog
index df687914911..97ecbac0953 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,16 @@
12015-02-28 Eli Zaretskii <eliz@gnu.org>
2
3 * character.c (alphabeticp, decimalnump): New functions.
4 * character.h (alphabeticp, decimalnump): Add prototypes.
5
6 * regex.c (ISALNUM, ISALPHA): Check Unicode character properties
7 for multibyte characters by calling alphabeticp and decimalnump.
8 (BIT_ALPHA, BIT_ALNUM): New bit masks.
9 (re_wctype_to_bit): Return them when the class is RECC_ALPHA or
10 RECC_ALNUM.
11 (re_match_2_internal): Call ISALPHA and ISALNUM when appropriate.
12 (Bug#19878)
13
12015-02-27 Jan Djärv <jan.h.d@swipnet.se> 142015-02-27 Jan Djärv <jan.h.d@swipnet.se>
2 15
3 * xterm.h (x_real_pos_and_offsets): Take outer_border as arg also. 16 * xterm.h (x_real_pos_and_offsets): Take outer_border as arg also.
diff --git a/src/character.c b/src/character.c
index 39d32c9d41a..999f99aa003 100644
--- a/src/character.c
+++ b/src/character.c
@@ -984,6 +984,48 @@ character is not ASCII nor 8-bit character, an error is signaled. */)
984 984
985#ifdef emacs 985#ifdef emacs
986 986
987/* Return 'true' if C is an alphabetic character as defined by its
988 Unicode properties. */
989bool
990alphabeticp (int c)
991{
992 Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
993
994 if (INTEGERP (category))
995 {
996 unicode_category_t gen_cat = XINT (category);
997
998 /* See UTS #18. There are additional characters that should be
999 here, those designated as Other_uppercase, Other_lowercase,
1000 and Other_alphabetic; FIXME. */
1001 return (gen_cat == UNICODE_CATEGORY_Lu
1002 || gen_cat == UNICODE_CATEGORY_Ll
1003 || gen_cat == UNICODE_CATEGORY_Lt
1004 || gen_cat == UNICODE_CATEGORY_Lm
1005 || gen_cat == UNICODE_CATEGORY_Lo
1006 || gen_cat == UNICODE_CATEGORY_Mn
1007 || gen_cat == UNICODE_CATEGORY_Mc
1008 || gen_cat == UNICODE_CATEGORY_Me
1009 || gen_cat == UNICODE_CATEGORY_Nl) ? true : false;
1010 }
1011}
1012
1013/* Return 'true' if C is an decimal-number character as defined by its
1014 Unicode properties. */
1015bool
1016decimalnump (int c)
1017{
1018 Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
1019
1020 if (INTEGERP (category))
1021 {
1022 unicode_category_t gen_cat = XINT (category);
1023
1024 /* See UTS #18. */
1025 return (gen_cat == UNICODE_CATEGORY_Nd) ? true : false;
1026 }
1027}
1028
987void 1029void
988syms_of_character (void) 1030syms_of_character (void)
989{ 1031{
diff --git a/src/character.h b/src/character.h
index 5043880cb42..7d902952db6 100644
--- a/src/character.h
+++ b/src/character.h
@@ -660,6 +660,9 @@ extern ptrdiff_t lisp_string_width (Lisp_Object, ptrdiff_t,
660extern Lisp_Object Vchar_unify_table; 660extern Lisp_Object Vchar_unify_table;
661extern Lisp_Object string_escape_byte8 (Lisp_Object); 661extern Lisp_Object string_escape_byte8 (Lisp_Object);
662 662
663extern bool alphabeticp (int);
664extern bool decimalnump (int);
665
663/* Return a translation table of id number ID. */ 666/* Return a translation table of id number ID. */
664#define GET_TRANSLATION_TABLE(id) \ 667#define GET_TRANSLATION_TABLE(id) \
665 (XCDR (XVECTOR (Vtranslation_table_vector)->contents[(id)])) 668 (XCDR (XVECTOR (Vtranslation_table_vector)->contents[(id)]))
diff --git a/src/regex.c b/src/regex.c
index 41fe3fa8088..1afc5037594 100644
--- a/src/regex.c
+++ b/src/regex.c
@@ -324,12 +324,12 @@ enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
324 ? (((c) >= 'a' && (c) <= 'z') \ 324 ? (((c) >= 'a' && (c) <= 'z') \
325 || ((c) >= 'A' && (c) <= 'Z') \ 325 || ((c) >= 'A' && (c) <= 'Z') \
326 || ((c) >= '0' && (c) <= '9')) \ 326 || ((c) >= '0' && (c) <= '9')) \
327 : SYNTAX (c) == Sword) 327 : (alphabeticp (c) || decimalnump (c)))
328 328
329# define ISALPHA(c) (IS_REAL_ASCII (c) \ 329# define ISALPHA(c) (IS_REAL_ASCII (c) \
330 ? (((c) >= 'a' && (c) <= 'z') \ 330 ? (((c) >= 'a' && (c) <= 'z') \
331 || ((c) >= 'A' && (c) <= 'Z')) \ 331 || ((c) >= 'A' && (c) <= 'Z')) \
332 : SYNTAX (c) == Sword) 332 : alphabeticp (c))
333 333
334# define ISLOWER(c) lowercasep (c) 334# define ISLOWER(c) lowercasep (c)
335 335
@@ -1872,6 +1872,8 @@ struct range_table_work_area
1872#define BIT_SPACE 0x8 1872#define BIT_SPACE 0x8
1873#define BIT_UPPER 0x10 1873#define BIT_UPPER 0x10
1874#define BIT_MULTIBYTE 0x20 1874#define BIT_MULTIBYTE 0x20
1875#define BIT_ALPHA 0x40
1876#define BIT_ALNUM 0x80
1875 1877
1876 1878
1877/* Set the bit for character C in a list. */ 1879/* Set the bit for character C in a list. */
@@ -2072,7 +2074,9 @@ re_wctype_to_bit (re_wctype_t cc)
2072 { 2074 {
2073 case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH: 2075 case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
2074 case RECC_MULTIBYTE: return BIT_MULTIBYTE; 2076 case RECC_MULTIBYTE: return BIT_MULTIBYTE;
2075 case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD; 2077 case RECC_ALPHA: return BIT_ALPHA;
2078 case RECC_ALNUM: return BIT_ALNUM;
2079 case RECC_WORD: return BIT_WORD;
2076 case RECC_LOWER: return BIT_LOWER; 2080 case RECC_LOWER: return BIT_LOWER;
2077 case RECC_UPPER: return BIT_UPPER; 2081 case RECC_UPPER: return BIT_UPPER;
2078 case RECC_PUNCT: return BIT_PUNCT; 2082 case RECC_PUNCT: return BIT_PUNCT;
@@ -2930,7 +2934,7 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax,
2930#endif /* emacs */ 2934#endif /* emacs */
2931 /* In most cases the matching rule for char classes 2935 /* In most cases the matching rule for char classes
2932 only uses the syntax table for multibyte chars, 2936 only uses the syntax table for multibyte chars,
2933 so that the content of the syntax-table it is not 2937 so that the content of the syntax-table is not
2934 hardcoded in the range_table. SPACE and WORD are 2938 hardcoded in the range_table. SPACE and WORD are
2935 the two exceptions. */ 2939 the two exceptions. */
2936 if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD))) 2940 if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
@@ -2945,7 +2949,7 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax,
2945 p = class_beg; 2949 p = class_beg;
2946 SET_LIST_BIT ('['); 2950 SET_LIST_BIT ('[');
2947 2951
2948 /* Because the `:' may starts the range, we 2952 /* Because the `:' may start the range, we
2949 can't simply set bit and repeat the loop. 2953 can't simply set bit and repeat the loop.
2950 Instead, just set it to C and handle below. */ 2954 Instead, just set it to C and handle below. */
2951 c = ':'; 2955 c = ':';
@@ -5513,7 +5517,9 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1,
5513 | (class_bits & BIT_PUNCT && ISPUNCT (c)) 5517 | (class_bits & BIT_PUNCT && ISPUNCT (c))
5514 | (class_bits & BIT_SPACE && ISSPACE (c)) 5518 | (class_bits & BIT_SPACE && ISSPACE (c))
5515 | (class_bits & BIT_UPPER && ISUPPER (c)) 5519 | (class_bits & BIT_UPPER && ISUPPER (c))
5516 | (class_bits & BIT_WORD && ISWORD (c))) 5520 | (class_bits & BIT_WORD && ISWORD (c))
5521 | (class_bits & BIT_ALPHA && ISALPHA (c))
5522 | (class_bits & BIT_ALNUM && ISALNUM (c)))
5517 not = !not; 5523 not = !not;
5518 else 5524 else
5519 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count); 5525 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);