Refactor regex character class parsing in [:name:]

The re_wctype function is used in three separate places, and in each of them it is surrounded by almost identical code that extracts the name from [:name:]. Furthermore, re_wctype requires a NUL-terminated string, so the name of the character class has to be copied to a temporary buffer.

The code duplication and the unnecessary memory copying can be avoided by pushing the responsibility for parsing the whole [:name:] sequence into the function.

Furthermore, since the function now has access to the length of the character class name (because it is doing the parsing), it can use that information to skip some string comparisons and to use a constant-length memcmp instead of strcmp, which needs to take care of NUL bytes.

* src/regex.c (re_wctype): Delete function. Replace it with:
(re_wctype_parse): New function which parses a whole [:name:] string and returns a RECC_* constant, or -1 if the string is not of [:name:] format.
(regex_compile): Use re_wctype_parse.
* src/syntax.c (skip_chars): Use re_wctype_parse.
author: Michal Nazarewicz 2016-07-17 03:09:38 +0200
committer: Michal Nazarewicz 2016-08-02 15:39:10 +0200
commit: 4538a5e37e8dacde4b3e828d832c4c558a146912 (patch)
tree: 43a158bf0635a01bf5946730ac439fd0b3b8f606 /src
parent: e7257061317c604492d20f26f312b9e925aa1860 (diff)
download: emacs-4538a5e37e8dacde4b3e828d832c4c558a146912.tar.gz
emacs-4538a5e37e8dacde4b3e828d832c4c558a146912.zip
3 files changed, 181 insertions, 239 deletions
diff --git a/src/regex.c b/src/regex.c
index 1f2a1f086de..3a25835f452 100644
--- a/src/regex.c
+++ b/src/regex.c
@@ -1969,29 +1969,96 @@ struct range_table_work_area
 #if ! WIDE_CHAR_SUPPORT
-/* Map a string to the char class it names (if any).  */
+/* Parse a character class, i.e. string such as "[:name:]".  *strp
+   points to the string to be parsed and limit is length, in bytes, of
+   that string.
+   If *strp point to a string that begins with "[:name:]", where name is
+   a non-empty sequence of lower case letters, *strp will be advanced past the
+   closing square bracket and RECC_* constant which maps to the name will be
+   returned.  If name is not a valid character class name zero, or RECC_ERROR,
+   is returned.
+   Otherwise, if *strp doesn’t begin with "[:name:]", -1 is returned.
+   The function can be used on ASCII and multibyte (UTF-8-encoded) strings.
+ */
 re_wctype_t
-re_wctype (const_re_char *str)
+re_wctype_parse (const unsigned char **strp, unsigned limit)
 {
-  const char *string = (const char *) str;
+  const char *beg = (const char *)*strp, *it;
-  if      (STREQ (string, "alnum"))     return RECC_ALNUM;
-  else if (STREQ (string, "alpha"))     return RECC_ALPHA;
+  if (limit < 4 || beg[0] != '[' || beg[1] != ':')
-  else if (STREQ (string, "word"))      return RECC_WORD;
+    return -1;
-  else if (STREQ (string, "ascii"))     return RECC_ASCII;
-  else if (STREQ (string, "nonascii"))  return RECC_NONASCII;
+  beg += 2;  /* skip opening ‘[:’ */
-  else if (STREQ (string, "graph"))     return RECC_GRAPH;
+  limit -= 3;  /* opening ‘[:’ and half of closing ‘:]’; --limit handles rest */
-  else if (STREQ (string, "lower"))     return RECC_LOWER;
+  for (it = beg; it[0] != ':' || it[1] != ']'; ++it)
-  else if (STREQ (string, "print"))     return RECC_PRINT;
+    if (!--limit)
-  else if (STREQ (string, "punct"))     return RECC_PUNCT;
+      return -1;
-  else if (STREQ (string, "space"))     return RECC_SPACE;
-  else if (STREQ (string, "upper"))     return RECC_UPPER;
+  *strp = (const unsigned char *)(it + 2);
-  else if (STREQ (string, "unibyte"))   return RECC_UNIBYTE;
-  else if (STREQ (string, "multibyte")) return RECC_MULTIBYTE;
+  /* Sort tests in the length=five case by frequency the classes to minimise
-  else if (STREQ (string, "digit"))     return RECC_DIGIT;
+     number of times we fail the comparison.  The frequencies of character class
-  else if (STREQ (string, "xdigit"))    return RECC_XDIGIT;
+     names used in Emacs sources as of 2016-07-27:
-  else if (STREQ (string, "cntrl"))     return RECC_CNTRL;
-  else if (STREQ (string, "blank"))     return RECC_BLANK;
+     $ find \( -name \*.c -o -name \*.el \) -exec grep -h '\[:[a-z]*:]' {} + |
-  else return 0;
+           sed 's/]/]\n/g' |grep -o '\[:[a-z]*:]' |sort |uniq -c |sort -nr
+         213 [:alnum:]
+         104 [:alpha:]
+          62 [:space:]
+          39 [:digit:]
+          36 [:blank:]
+          26 [:word:]
+          26 [:upper:]
+          21 [:lower:]
+          10 [:xdigit:]
+          10 [:punct:]
+          10 [:ascii:]
+           4 [:nonascii:]
+           4 [:graph:]
+           2 [:print:]
+           2 [:cntrl:]
+           1 [:ff:]
+     If you update this list, consider also updating chain of or’ed conditions
+     in execute_charset function.
+   */
+  switch (it - beg) {
+  case 4:
+    if (!memcmp (beg, "word", 4))      return RECC_WORD;
+    break;
+  case 5:
+    if (!memcmp (beg, "alnum", 5))     return RECC_ALNUM;
+    if (!memcmp (beg, "alpha", 5))     return RECC_ALPHA;
+    if (!memcmp (beg, "space", 5))     return RECC_SPACE;
+    if (!memcmp (beg, "digit", 5))     return RECC_DIGIT;
+    if (!memcmp (beg, "blank", 5))     return RECC_BLANK;
+    if (!memcmp (beg, "upper", 5))     return RECC_UPPER;
+    if (!memcmp (beg, "lower", 5))     return RECC_LOWER;
+    if (!memcmp (beg, "punct", 5))     return RECC_PUNCT;
+    if (!memcmp (beg, "ascii", 5))     return RECC_ASCII;
+    if (!memcmp (beg, "graph", 5))     return RECC_GRAPH;
+    if (!memcmp (beg, "print", 5))     return RECC_PRINT;
+    if (!memcmp (beg, "cntrl", 5))     return RECC_CNTRL;
+    break;
+  case 6:
+    if (!memcmp (beg, "xdigit", 6))    return RECC_XDIGIT;
+    break;
+  case 7:
+    if (!memcmp (beg, "unibyte", 7))   return RECC_UNIBYTE;
+    break;
+  case 8:
+    if (!memcmp (beg, "nonascii", 8))  return RECC_NONASCII;
+    break;
+  case 9:
+    if (!memcmp (beg, "multibyte", 9)) return RECC_MULTIBYTE;
+    break;
+  }
+  return RECC_ERROR;
 }
 /* True if CH is in the char class CC.  */
@@ -2776,10 +2843,74 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax,
              {
                boolean escaped_char = false;
                const unsigned char *p2 = p;
+                re_wctype_t cc;
                re_wchar_t ch;
                if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
+                /* See if we're at the beginning of a possible character
+                   class.  */
+                if (syntax & RE_CHAR_CLASSES &&
+                    (cc = re_wctype_parse(&p, pend - p)) != -1)
+                  {
+                    if (cc == 0)
+                      FREE_STACK_RETURN (REG_ECTYPE);
+                    if (p == pend)
+                      FREE_STACK_RETURN (REG_EBRACK);
+#ifndef emacs
+                    for (ch = 0; ch < (1 << BYTEWIDTH); ++ch)
+                      if (re_iswctype (btowc (ch), cc))
+                        {
+                          c = TRANSLATE (ch);
+                          if (c < (1 << BYTEWIDTH))
+                            SET_LIST_BIT (c);
+                        }
+#else  /* emacs */
+                    /* Most character classes in a multibyte match just set
+                       a flag.  Exceptions are is_blank, is_digit, is_cntrl, and
+                       is_xdigit, since they can only match ASCII characters.
+                       We don't need to handle them for multibyte.  They are
+                       distinguished by a negative wctype.  */
+                    /* Setup the gl_state object to its buffer-defined value.
+                       This hardcodes the buffer-global syntax-table for ASCII
+                       chars, while the other chars will obey syntax-table
+                       properties.  It's not ideal, but it's the way it's been
+                       done until now.  */
+                    SETUP_BUFFER_SYNTAX_TABLE ();
+                    for (ch = 0; ch < 256; ++ch)
+                      {
+                        c = RE_CHAR_TO_MULTIBYTE (ch);
+                        if (! CHAR_BYTE8_P (c)
+                            && re_iswctype (c, cc))
+                          {
+                            SET_LIST_BIT (ch);
+                            c1 = TRANSLATE (c);
+                            if (c1 == c)
+                              continue;
+                            if (ASCII_CHAR_P (c1))
+                              SET_LIST_BIT (c1);
+                            else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0)
+                              SET_LIST_BIT (c1);
+                          }
+                      }
+                    SET_RANGE_TABLE_WORK_AREA_BIT
+                      (range_table_work, re_wctype_to_bit (cc));
+#endif  /* emacs */
+                    /* In most cases the matching rule for char classes only
+                       uses the syntax table for multibyte chars, so that the
+                       content of the syntax-table is not hardcoded in the
+                       range_table.  SPACE and WORD are the two exceptions.  */
+                    if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
+                      bufp->used_syntax = 1;
+                    /* Repeat the loop. */
+                    continue;
+                  }
                /* Don't translate yet.  The range TRANSLATE(X..Y) cannot
                   always be determined from TRANSLATE(X) and TRANSLATE(Y)
                   So the translation is done later in a loop.  Example:
@@ -2803,119 +2934,6 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax,
                      break;
                  }
-                /* See if we're at the beginning of a possible character
-                   class.  */
-                if (!escaped_char &&
-                    syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
-                  {
-                    /* Leave room for the null.  */
-                    unsigned char str[CHAR_CLASS_MAX_LENGTH + 1];
-                    const unsigned char *class_beg;
-                    PATFETCH (c);
-                    c1 = 0;
-                    class_beg = p;
-                    /* If pattern is `[[:'.  */
-                    if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
-                    for (;;)
-                      {
-                        PATFETCH (c);
-                        if ((c == ':' && *p == ']') || p == pend)
-                          break;
-                        if (c1 < CHAR_CLASS_MAX_LENGTH)
-                          str[c1++] = c;
-                        else
-                          /* This is in any case an invalid class name.  */
-                          str[0] = '\0';
-                      }
-                    str[c1] = '\0';
-                    /* If isn't a word bracketed by `[:' and `:]':
-                       undo the ending character, the letters, and
-                       leave the leading `:' and `[' (but set bits for
-                       them).  */
-                    if (c == ':' && *p == ']')
-                      {
-                        re_wctype_t cc = re_wctype (str);
-                        if (cc == 0)
-                          FREE_STACK_RETURN (REG_ECTYPE);
-                        /* Throw away the ] at the end of the character
-                           class.  */
-                        PATFETCH (c);
-                        if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
-#ifndef emacs
-                        for (ch = 0; ch < (1 << BYTEWIDTH); ++ch)
-                          if (re_iswctype (btowc (ch), cc))
-                            {
-                              c = TRANSLATE (ch);
-                              if (c < (1 << BYTEWIDTH))
-                                SET_LIST_BIT (c);
-                            }
-#else  /* emacs */
-                        /* Most character classes in a multibyte match
-                           just set a flag.  Exceptions are is_blank,
-                           is_digit, is_cntrl, and is_xdigit, since
-                           they can only match ASCII characters.  We
-                           don't need to handle them for multibyte.
-                           They are distinguished by a negative wctype.  */
-                        /* Setup the gl_state object to its buffer-defined
-                           value.  This hardcodes the buffer-global
-                           syntax-table for ASCII chars, while the other chars
-                           will obey syntax-table properties.  It's not ideal,
-                           but it's the way it's been done until now.  */
-                        SETUP_BUFFER_SYNTAX_TABLE ();
-                        for (ch = 0; ch < 256; ++ch)
-                          {
-                            c = RE_CHAR_TO_MULTIBYTE (ch);
-                            if (! CHAR_BYTE8_P (c)
-                                && re_iswctype (c, cc))
-                              {
-                                SET_LIST_BIT (ch);
-                                c1 = TRANSLATE (c);
-                                if (c1 == c)
-                                  continue;
-                                if (ASCII_CHAR_P (c1))
-                                  SET_LIST_BIT (c1);
-                                else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0)
-                                  SET_LIST_BIT (c1);
-                              }
-                          }
-                        SET_RANGE_TABLE_WORK_AREA_BIT
-                          (range_table_work, re_wctype_to_bit (cc));
-#endif  /* emacs */
-                        /* In most cases the matching rule for char classes
-                           only uses the syntax table for multibyte chars,
-                           so that the content of the syntax-table is not
-                           hardcoded in the range_table.  SPACE and WORD are
-                           the two exceptions.  */
-                        if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
-                          bufp->used_syntax = 1;
-                        /* Repeat the loop. */
-                        continue;
-                      }
-                    else
-                      {
-                        /* Go back to right after the "[:".  */
-                        p = class_beg;
-                        SET_LIST_BIT ('[');
-                        /* Because the `:' may start the range, we
-                           can't simply set bit and repeat the loop.
-                           Instead, just set it to C and handle below.  */
-                        c = ':';
-                      }
-                  }
                if (p < pend && p[0] == '-' && p[1] != ']')
                  {
@@ -4659,28 +4677,8 @@ execute_charset (const_re_char **pp, unsigned c, unsigned corig, bool unibyte)
      re_wchar_t range_start, range_end;
  /* Sort tests by the most commonly used classes with some adjustment to which
-     tests are easiest to perform.  Frequencies of character class names used in
+     tests are easiest to perform.  Take a look at comment in re_wctype_parse
-     Emacs sources as of 2016-07-15:
+     for table with frequencies of character class names. */
-     $ find \( -name \*.c -o -name \*.el \) -exec grep -h '\[:[a-z]*:]' {} + |
-           sed 's/]/]\n/g' |grep -o '\[:[a-z]*:]' |sort |uniq -c |sort -nr
-         213 [:alnum:]
-         104 [:alpha:]
-          62 [:space:]
-          39 [:digit:]
-          36 [:blank:]
-          26 [:upper:]
-          24 [:word:]
-          21 [:lower:]
-          10 [:punct:]
-          10 [:ascii:]
-           9 [:xdigit:]
-           4 [:nonascii:]
-           4 [:graph:]
-           2 [:print:]
-           2 [:cntrl:]
-           1 [:ff:]
-   */
      if ((class_bits & BIT_MULTIBYTE) ||
          (class_bits & BIT_ALNUM && ISALNUM (c)) ||
diff --git a/src/regex.h b/src/regex.h
index 817167a07ca..01b659addbb 100644
--- a/src/regex.h
+++ b/src/regex.h
@@ -585,25 +585,13 @@ extern void regfree (regex_t *__preg);
 /* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>.  */
 # include <wchar.h>
 # include <wctype.h>
-#endif
-#if WIDE_CHAR_SUPPORT
-/* The GNU C library provides support for user-defined character classes
-   and the functions from ISO C amendment 1.  */
-# ifdef CHARCLASS_NAME_MAX
-#  define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX
-# else
-/* This shouldn't happen but some implementation might still have this
-   problem.  Use a reasonable default value.  */
-#  define CHAR_CLASS_MAX_LENGTH 256
-# endif
 typedef wctype_t re_wctype_t;
 typedef wchar_t re_wchar_t;
 # define re_wctype wctype
 # define re_iswctype iswctype
 # define re_wctype_to_bit(cc) 0
 #else
-# define CHAR_CLASS_MAX_LENGTH  9 /* Namely, `multibyte'.  */
 # ifndef emacs
 #  define btowc(c) c
 # endif
@@ -621,7 +609,7 @@ typedef enum { RECC_ERROR = 0,
 } re_wctype_t;
 extern char re_iswctype (int ch,    re_wctype_t cc);
-extern re_wctype_t re_wctype (const unsigned char* str);
+extern re_wctype_t re_wctype_parse (const unsigned char **strp, unsigned limit);
 typedef int re_wchar_t;
diff --git a/src/syntax.c b/src/syntax.c
index f8d987b377c..667de402ec4 100644
--- a/src/syntax.c
+++ b/src/syntax.c
@@ -1691,44 +1691,22 @@ skip_chars (bool forwardp, Lisp_Object string, Lisp_Object lim,
      /* At first setup fastmap.  */
      while (i_byte < size_byte)
        {
-          c = str[i_byte++];
+          if (handle_iso_classes)
-          if (handle_iso_classes && c == '['
-              && i_byte < size_byte
-              && str[i_byte] == ':')
            {
-              const unsigned char *class_beg = str + i_byte + 1;
+              const unsigned char *ch = str + i_byte;
-              const unsigned char *class_end = class_beg;
+              re_wctype_t cc = re_wctype_parse (&ch, size_byte - i_byte);
-              const unsigned char *class_limit = str + size_byte - 2;
-              /* Leave room for the null.  */
-              unsigned char class_name[CHAR_CLASS_MAX_LENGTH + 1];
-              re_wctype_t cc;
-              if (class_limit - class_beg > CHAR_CLASS_MAX_LENGTH)
-                class_limit = class_beg + CHAR_CLASS_MAX_LENGTH;
-              while (class_end < class_limit
-                     && *class_end >= 'a' && *class_end <= 'z')
-                class_end++;
-              if (class_end == class_beg
-                  || *class_end != ':' || class_end[1] != ']')
-                goto not_a_class_name;
-              memcpy (class_name, class_beg, class_end - class_beg);
-              class_name[class_end - class_beg] = 0;
-              cc = re_wctype (class_name);
              if (cc == 0)
                error ("Invalid ISO C character class");
+              if (cc != -1)
-              iso_classes = Fcons (make_number (cc), iso_classes);
+                {
+                  iso_classes = Fcons (make_number (cc), iso_classes);
-              i_byte = class_end + 2 - str;
+                  i_byte = ch - str;
-              continue;
+                  continue;
+                }
            }
-        not_a_class_name:
+          c = str[i_byte++];
          if (c == '\\')
            {
              if (i_byte == size_byte)
@@ -1808,54 +1786,32 @@ skip_chars (bool forwardp, Lisp_Object string, Lisp_Object lim,
      while (i_byte < size_byte)
        {
          int leading_code = str[i_byte];
-          c = STRING_CHAR_AND_LENGTH (str + i_byte, len);
-          i_byte += len;
-          if (handle_iso_classes && c == '['
+          if (handle_iso_classes)
-              && i_byte < size_byte
-              && STRING_CHAR (str + i_byte) == ':')
            {
-              const unsigned char *class_beg = str + i_byte + 1;
+              const unsigned char *ch = str + i_byte;
-              const unsigned char *class_end = class_beg;
+              re_wctype_t cc = re_wctype_parse (&ch, size_byte - i_byte);
-              const unsigned char *class_limit = str + size_byte - 2;
-              /* Leave room for the null.        */
-              unsigned char class_name[CHAR_CLASS_MAX_LENGTH + 1];
-              re_wctype_t cc;
-              if (class_limit - class_beg > CHAR_CLASS_MAX_LENGTH)
-                class_limit = class_beg + CHAR_CLASS_MAX_LENGTH;
-              while (class_end < class_limit
-                     && *class_end >= 'a' && *class_end <= 'z')
-                class_end++;
-              if (class_end == class_beg
-                  || *class_end != ':' || class_end[1] != ']')
-                goto not_a_class_name_multibyte;
-              memcpy (class_name, class_beg, class_end - class_beg);
-              class_name[class_end - class_beg] = 0;
-              cc = re_wctype (class_name);
              if (cc == 0)
                error ("Invalid ISO C character class");
+              if (cc != -1)
-              iso_classes = Fcons (make_number (cc), iso_classes);
+                {
+                  iso_classes = Fcons (make_number (cc), iso_classes);
-              i_byte = class_end + 2 - str;
+                  i_byte = ch - str;
-              continue;
+                  continue;
+                }
            }
-        not_a_class_name_multibyte:
+          if (leading_code== '\\')
-          if (c == '\\')
            {
-              if (i_byte == size_byte)
+              if (++i_byte == size_byte)
                break;
              leading_code = str[i_byte];
-              c = STRING_CHAR_AND_LENGTH (str + i_byte, len);
-              i_byte += len;
            }
+          c = STRING_CHAR_AND_LENGTH (str + i_byte, len);
+          i_byte += len;
          /* Treat `-' as range character only if another character
             follows.  */
          if (i_byte + 1 < size_byte
author	Michal Nazarewicz	2016-07-17 03:09:38 +0200
committer	Michal Nazarewicz	2016-08-02 15:39:10 +0200
commit	4538a5e37e8dacde4b3e828d832c4c558a146912 (patch)
tree	43a158bf0635a01bf5946730ac439fd0b3b8f606 /src
parent	e7257061317c604492d20f26f312b9e925aa1860 (diff)
download	emacs-4538a5e37e8dacde4b3e828d832c4c558a146912.tar.gz emacs-4538a5e37e8dacde4b3e828d832c4c558a146912.zip

diff --git a/src/regex.c b/src/regex.c index 1f2a1f086de..3a25835f452 100644 --- a/src/regex.c +++ b/src/regex.c
@@ -1969,29 +1969,96 @@ struct range_table_work_area
1969		1969
1970	#if ! WIDE_CHAR_SUPPORT	1970	#if ! WIDE_CHAR_SUPPORT
1971		1971
1972	/* Map a string to the char class it names (if any). */	1972	/* Parse a character class, i.e. string such as "[:name:]". *strp
		1973	points to the string to be parsed and limit is length, in bytes, of
		1974	that string.
		1975
		1976	If *strp point to a string that begins with "[:name:]", where name is
		1977	a non-empty sequence of lower case letters, *strp will be advanced past the
		1978	closing square bracket and RECC_* constant which maps to the name will be
		1979	returned. If name is not a valid character class name zero, or RECC_ERROR,
		1980	is returned.
		1981
		1982	Otherwise, if *strp doesn’t begin with "[:name:]", -1 is returned.
		1983
		1984	The function can be used on ASCII and multibyte (UTF-8-encoded) strings.
		1985	*/
1973	re_wctype_t	1986	re_wctype_t
1974	re_wctype (const_re_char *str)	1987	re_wctype_parse (const unsigned char **strp, unsigned limit)
1975	{	1988	{
1976	const char *string = (const char *) str;	1989	const char *beg = (const char *)*strp, *it;
1977	if (STREQ (string, "alnum")) return RECC_ALNUM;	1990
1978	else if (STREQ (string, "alpha")) return RECC_ALPHA;	1991	if (limit < 4 \|\| beg[0] != '[' \|\| beg[1] != ':')
1979	else if (STREQ (string, "word")) return RECC_WORD;	1992	return -1;
1980	else if (STREQ (string, "ascii")) return RECC_ASCII;	1993
1981	else if (STREQ (string, "nonascii")) return RECC_NONASCII;	1994	beg += 2; /* skip opening ‘[:’ */
1982	else if (STREQ (string, "graph")) return RECC_GRAPH;	1995	limit -= 3; /* opening ‘[:’ and half of closing ‘:]’; --limit handles rest */
1983	else if (STREQ (string, "lower")) return RECC_LOWER;	1996	for (it = beg; it[0] != ':' \|\| it[1] != ']'; ++it)
1984	else if (STREQ (string, "print")) return RECC_PRINT;	1997	if (!--limit)
1985	else if (STREQ (string, "punct")) return RECC_PUNCT;	1998	return -1;
1986	else if (STREQ (string, "space")) return RECC_SPACE;	1999
1987	else if (STREQ (string, "upper")) return RECC_UPPER;	2000	*strp = (const unsigned char *)(it + 2);
1988	else if (STREQ (string, "unibyte")) return RECC_UNIBYTE;	2001
1989	else if (STREQ (string, "multibyte")) return RECC_MULTIBYTE;	2002	/* Sort tests in the length=five case by frequency the classes to minimise
1990	else if (STREQ (string, "digit")) return RECC_DIGIT;	2003	number of times we fail the comparison. The frequencies of character class
1991	else if (STREQ (string, "xdigit")) return RECC_XDIGIT;	2004	names used in Emacs sources as of 2016-07-27:
1992	else if (STREQ (string, "cntrl")) return RECC_CNTRL;	2005
1993	else if (STREQ (string, "cntrl")) return RECC_CNTRL;	2006	$ find \( -name \*.c -o -name \*.el \) -exec grep -h '\[:[a-z]*:]' {} + \|
1994	else return 0;	2007	sed 's/]/]\n/g' \|grep -o '\[:[a-z]*:]' \|sort \|uniq -c \|sort -nr
		2008	213 [:alnum:]
		2009	104 [:alpha:]
		2010	62 [:space:]
		2011	39 [:digit:]
		2012	36 [:blank:]
		2013	26 [:word:]
		2014	26 [:upper:]
		2015	21 [:lower:]
		2016	10 [:xdigit:]
		2017	10 [:punct:]
		2018	10 [:ascii:]
		2019	4 [:nonascii:]
		2020	4 [:graph:]
		2021	2 [:print:]
		2022	2 [:cntrl:]
		2023	1 [:ff:]
		2024
		2025	If you update this list, consider also updating chain of or’ed conditions
		2026	in execute_charset function.
		2027	*/
		2028
		2029	switch (it - beg) {
		2030	case 4:
		2031	if (!memcmp (beg, "word", 4)) return RECC_WORD;
		2032	break;
		2033	case 5:
		2034	if (!memcmp (beg, "alnum", 5)) return RECC_ALNUM;
		2035	if (!memcmp (beg, "alpha", 5)) return RECC_ALPHA;
		2036	if (!memcmp (beg, "space", 5)) return RECC_SPACE;
		2037	if (!memcmp (beg, "digit", 5)) return RECC_DIGIT;
		2038	if (!memcmp (beg, "blank", 5)) return RECC_BLANK;
		2039	if (!memcmp (beg, "upper", 5)) return RECC_UPPER;
		2040	if (!memcmp (beg, "lower", 5)) return RECC_LOWER;
		2041	if (!memcmp (beg, "punct", 5)) return RECC_PUNCT;
		2042	if (!memcmp (beg, "ascii", 5)) return RECC_ASCII;
		2043	if (!memcmp (beg, "graph", 5)) return RECC_GRAPH;
		2044	if (!memcmp (beg, "print", 5)) return RECC_PRINT;
		2045	if (!memcmp (beg, "cntrl", 5)) return RECC_CNTRL;
		2046	break;
		2047	case 6:
		2048	if (!memcmp (beg, "xdigit", 6)) return RECC_XDIGIT;
		2049	break;
		2050	case 7:
		2051	if (!memcmp (beg, "unibyte", 7)) return RECC_UNIBYTE;
		2052	break;
		2053	case 8:
		2054	if (!memcmp (beg, "nonascii", 8)) return RECC_NONASCII;
		2055	break;
		2056	case 9:
		2057	if (!memcmp (beg, "multibyte", 9)) return RECC_MULTIBYTE;
		2058	break;
		2059	}
		2060
		2061	return RECC_ERROR;
1995	}	2062	}
1996		2063
1997	/* True if CH is in the char class CC. */	2064	/* True if CH is in the char class CC. */
@@ -2776,10 +2843,74 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax,
2776	{	2843	{
2777	boolean escaped_char = false;	2844	boolean escaped_char = false;
2778	const unsigned char *p2 = p;	2845	const unsigned char *p2 = p;
		2846	re_wctype_t cc;
2779	re_wchar_t ch;	2847	re_wchar_t ch;
2780		2848
2781	if (p == pend) FREE_STACK_RETURN (REG_EBRACK);	2849	if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2782		2850
		2851	/* See if we're at the beginning of a possible character
		2852	class. */
		2853	if (syntax & RE_CHAR_CLASSES &&
		2854	(cc = re_wctype_parse(&p, pend - p)) != -1)
		2855	{
		2856	if (cc == 0)
		2857	FREE_STACK_RETURN (REG_ECTYPE);
		2858
		2859	if (p == pend)
		2860	FREE_STACK_RETURN (REG_EBRACK);
		2861
		2862	#ifndef emacs
		2863	for (ch = 0; ch < (1 << BYTEWIDTH); ++ch)
		2864	if (re_iswctype (btowc (ch), cc))
		2865	{
		2866	c = TRANSLATE (ch);
		2867	if (c < (1 << BYTEWIDTH))
		2868	SET_LIST_BIT (c);
		2869	}
		2870	#else /* emacs */
		2871	/* Most character classes in a multibyte match just set
		2872	a flag. Exceptions are is_blank, is_digit, is_cntrl, and
		2873	is_xdigit, since they can only match ASCII characters.
		2874	We don't need to handle them for multibyte. They are
		2875	distinguished by a negative wctype. */
		2876
		2877	/* Setup the gl_state object to its buffer-defined value.
		2878	This hardcodes the buffer-global syntax-table for ASCII
		2879	chars, while the other chars will obey syntax-table
		2880	properties. It's not ideal, but it's the way it's been
		2881	done until now. */
		2882	SETUP_BUFFER_SYNTAX_TABLE ();
		2883
		2884	for (ch = 0; ch < 256; ++ch)
		2885	{
		2886	c = RE_CHAR_TO_MULTIBYTE (ch);
		2887	if (! CHAR_BYTE8_P (c)
		2888	&& re_iswctype (c, cc))
		2889	{
		2890	SET_LIST_BIT (ch);
		2891	c1 = TRANSLATE (c);
		2892	if (c1 == c)
		2893	continue;
		2894	if (ASCII_CHAR_P (c1))
		2895	SET_LIST_BIT (c1);
		2896	else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0)
		2897	SET_LIST_BIT (c1);
		2898	}
		2899	}
		2900	SET_RANGE_TABLE_WORK_AREA_BIT
		2901	(range_table_work, re_wctype_to_bit (cc));
		2902	#endif /* emacs */
		2903	/* In most cases the matching rule for char classes only
		2904	uses the syntax table for multibyte chars, so that the
		2905	content of the syntax-table is not hardcoded in the
		2906	range_table. SPACE and WORD are the two exceptions. */
		2907	if ((1 << cc) & ((1 << RECC_SPACE) \| (1 << RECC_WORD)))
		2908	bufp->used_syntax = 1;
		2909
		2910	/* Repeat the loop. */
		2911	continue;
		2912	}
		2913
2783	/* Don't translate yet. The range TRANSLATE(X..Y) cannot	2914	/* Don't translate yet. The range TRANSLATE(X..Y) cannot
2784	always be determined from TRANSLATE(X) and TRANSLATE(Y)	2915	always be determined from TRANSLATE(X) and TRANSLATE(Y)
2785	So the translation is done later in a loop. Example:	2916	So the translation is done later in a loop. Example:
@@ -2803,119 +2934,6 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax,
2803	break;	2934	break;
2804	}	2935	}
2805		2936
2806	/* See if we're at the beginning of a possible character
2807	class. */
2808
2809	if (!escaped_char &&
2810	syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2811	{
2812	/* Leave room for the null. */
2813	unsigned char str[CHAR_CLASS_MAX_LENGTH + 1];
2814	const unsigned char *class_beg;
2815
2816	PATFETCH (c);
2817	c1 = 0;
2818	class_beg = p;
2819
2820	/* If pattern is `[[:'. */
2821	if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2822
2823	for (;;)
2824	{
2825	PATFETCH (c);
2826	if ((c == ':' && *p == ']') \|\| p == pend)
2827	break;
2828	if (c1 < CHAR_CLASS_MAX_LENGTH)
2829	str[c1++] = c;
2830	else
2831	/* This is in any case an invalid class name. */
2832	str[0] = '\0';
2833	}
2834	str[c1] = '\0';
2835
2836	/* If isn't a word bracketed by `[:' and `:]':
2837	undo the ending character, the letters, and
2838	leave the leading `:' and `[' (but set bits for
2839	them). */
2840	if (c == ':' && *p == ']')
2841	{
2842	re_wctype_t cc = re_wctype (str);
2843
2844	if (cc == 0)
2845	FREE_STACK_RETURN (REG_ECTYPE);
2846
2847	/* Throw away the ] at the end of the character
2848	class. */
2849	PATFETCH (c);
2850
2851	if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2852
2853	#ifndef emacs
2854	for (ch = 0; ch < (1 << BYTEWIDTH); ++ch)
2855	if (re_iswctype (btowc (ch), cc))
2856	{
2857	c = TRANSLATE (ch);
2858	if (c < (1 << BYTEWIDTH))
2859	SET_LIST_BIT (c);
2860	}
2861	#else /* emacs */
2862	/* Most character classes in a multibyte match
2863	just set a flag. Exceptions are is_blank,
2864	is_digit, is_cntrl, and is_xdigit, since
2865	they can only match ASCII characters. We
2866	don't need to handle them for multibyte.
2867	They are distinguished by a negative wctype. */
2868
2869	/* Setup the gl_state object to its buffer-defined
2870	value. This hardcodes the buffer-global
2871	syntax-table for ASCII chars, while the other chars
2872	will obey syntax-table properties. It's not ideal,
2873	but it's the way it's been done until now. */
2874	SETUP_BUFFER_SYNTAX_TABLE ();
2875
2876	for (ch = 0; ch < 256; ++ch)
2877	{
2878	c = RE_CHAR_TO_MULTIBYTE (ch);
2879	if (! CHAR_BYTE8_P (c)
2880	&& re_iswctype (c, cc))
2881	{
2882	SET_LIST_BIT (ch);
2883	c1 = TRANSLATE (c);
2884	if (c1 == c)
2885	continue;
2886	if (ASCII_CHAR_P (c1))
2887	SET_LIST_BIT (c1);
2888	else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0)
2889	SET_LIST_BIT (c1);
2890	}
2891	}
2892	SET_RANGE_TABLE_WORK_AREA_BIT
2893	(range_table_work, re_wctype_to_bit (cc));
2894	#endif /* emacs */
2895	/* In most cases the matching rule for char classes
2896	only uses the syntax table for multibyte chars,
2897	so that the content of the syntax-table is not
2898	hardcoded in the range_table. SPACE and WORD are
2899	the two exceptions. */
2900	if ((1 << cc) & ((1 << RECC_SPACE) \| (1 << RECC_WORD)))
2901	bufp->used_syntax = 1;
2902
2903	/* Repeat the loop. */
2904	continue;
2905	}
2906	else
2907	{
2908	/* Go back to right after the "[:". */
2909	p = class_beg;
2910	SET_LIST_BIT ('[');
2911
2912	/* Because the `:' may start the range, we
2913	can't simply set bit and repeat the loop.
2914	Instead, just set it to C and handle below. */
2915	c = ':';
2916	}
2917	}
2918
2919	if (p < pend && p[0] == '-' && p[1] != ']')	2937	if (p < pend && p[0] == '-' && p[1] != ']')
2920	{	2938	{
2921		2939
@@ -4659,28 +4677,8 @@ execute_charset (const_re_char **pp, unsigned c, unsigned corig, bool unibyte)
4659	re_wchar_t range_start, range_end;	4677	re_wchar_t range_start, range_end;
4660		4678
4661	/* Sort tests by the most commonly used classes with some adjustment to which	4679	/* Sort tests by the most commonly used classes with some adjustment to which
4662	tests are easiest to perform. Frequencies of character class names used in	4680	tests are easiest to perform. Take a look at comment in re_wctype_parse
4663	Emacs sources as of 2016-07-15:	4681	for table with frequencies of character class names. */
4664
4665	$ find \( -name \*.c -o -name \*.el \) -exec grep -h '\[:[a-z]*:]' {} + |
4666	sed 's/]/]\n/g' |grep -o '\[:[a-z]*:]' |sort |uniq -c |sort -nr
4667	213 [:alnum:]
4668	104 [:alpha:]
4669	62 [:space:]
4670	39 [:digit:]
4671	36 [:blank:]
4672	26 [:upper:]
4673	24 [:word:]
4674	21 [:lower:]
4675	10 [:punct:]
4676	10 [:ascii:]
4677	9 [:xdigit:]
4678	4 [:nonascii:]
4679	4 [:graph:]
4680	2 [:print:]
4681	2 [:cntrl:]
4682	1 [:ff:]
4683	*/
4684		4682
4685	if ((class_bits & BIT_MULTIBYTE) ||	4683	if ((class_bits & BIT_MULTIBYTE) ||
4686	(class_bits & BIT_ALNUM && ISALNUM (c)) ||	4684	(class_bits & BIT_ALNUM && ISALNUM (c)) ||


diff --git a/src/regex.h b/src/regex.h
index 817167a07ca..01b659addbb 100644
--- a/src/regex.h
+++ b/src/regex.h
@@ -585,25 +585,13 @@ extern void regfree (regex_t *__preg);
585	/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */	585	/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
586	# include <wchar.h>	586	# include <wchar.h>
587	# include <wctype.h>	587	# include <wctype.h>
588	#endif
589		588
590	#if WIDE_CHAR_SUPPORT
591	/* The GNU C library provides support for user-defined character classes
592	and the functions from ISO C amendment 1. */
593	# ifdef CHARCLASS_NAME_MAX
594	# define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX
595	# else
596	/* This shouldn't happen but some implementation might still have this
597	problem. Use a reasonable default value. */
598	# define CHAR_CLASS_MAX_LENGTH 256
599	# endif
600	typedef wctype_t re_wctype_t;	589	typedef wctype_t re_wctype_t;
601	typedef wchar_t re_wchar_t;	590	typedef wchar_t re_wchar_t;
602	# define re_wctype wctype	591	# define re_wctype wctype
603	# define re_iswctype iswctype	592	# define re_iswctype iswctype
604	# define re_wctype_to_bit(cc) 0	593	# define re_wctype_to_bit(cc) 0
605	#else	594	#else
606	# define CHAR_CLASS_MAX_LENGTH 9 /* Namely, `multibyte'. */
607	# ifndef emacs	595	# ifndef emacs
608	# define btowc(c) c	596	# define btowc(c) c
609	# endif	597	# endif
@@ -621,7 +609,7 @@ typedef enum { RECC_ERROR = 0,
621	} re_wctype_t;	609	} re_wctype_t;
622		610
623	extern char re_iswctype (int ch, re_wctype_t cc);	611	extern char re_iswctype (int ch, re_wctype_t cc);
624	extern re_wctype_t re_wctype (const unsigned char* str);	612	extern re_wctype_t re_wctype_parse (const unsigned char **strp, unsigned limit);
625		613
626	typedef int re_wchar_t;	614	typedef int re_wchar_t;
627		615


diff --git a/src/syntax.c b/src/syntax.c
index f8d987b377c..667de402ec4 100644
--- a/src/syntax.c
+++ b/src/syntax.c
@@ -1691,44 +1691,22 @@ skip_chars (bool forwardp, Lisp_Object string, Lisp_Object lim,
1691	/* At first setup fastmap. */	1691	/* At first setup fastmap. */
1692	while (i_byte < size_byte)	1692	while (i_byte < size_byte)
1693	{	1693	{
1694	c = str[i_byte++];	1694	if (handle_iso_classes)
1695
1696	if (handle_iso_classes && c == '['
1697	&& i_byte < size_byte
1698	&& str[i_byte] == ':')
1699	{	1695	{
1700	const unsigned char *class_beg = str + i_byte + 1;	1696	const unsigned char *ch = str + i_byte;
1701	const unsigned char *class_end = class_beg;	1697	re_wctype_t cc = re_wctype_parse (&ch, size_byte - i_byte);
1702	const unsigned char *class_limit = str + size_byte - 2;
1703	/* Leave room for the null. */
1704	unsigned char class_name[CHAR_CLASS_MAX_LENGTH + 1];
1705	re_wctype_t cc;
1706
1707	if (class_limit - class_beg > CHAR_CLASS_MAX_LENGTH)
1708	class_limit = class_beg + CHAR_CLASS_MAX_LENGTH;
1709
1710	while (class_end < class_limit
1711	&& class_end >= 'a' && class_end <= 'z')
1712	class_end++;
1713
1714	if (class_end == class_beg
1715	|| *class_end != ':' || class_end[1] != ']')
1716	goto not_a_class_name;
1717
1718	memcpy (class_name, class_beg, class_end - class_beg);
1719	class_name[class_end - class_beg] = 0;
1720
1721	cc = re_wctype (class_name);
1722	if (cc == 0)	1698	if (cc == 0)
1723	error ("Invalid ISO C character class");	1699	error ("Invalid ISO C character class");
1724		1700	if (cc != -1)
1725	iso_classes = Fcons (make_number (cc), iso_classes);	1701	{
1726		1702	iso_classes = Fcons (make_number (cc), iso_classes);
1727	i_byte = class_end + 2 - str;	1703	i_byte = ch - str;
1728	continue;	1704	continue;
		1705	}
1729	}	1706	}
1730		1707
1731	not_a_class_name:	1708	c = str[i_byte++];
		1709
1732	if (c == '\\')	1710	if (c == '\\')
1733	{	1711	{
1734	if (i_byte == size_byte)	1712	if (i_byte == size_byte)
@@ -1808,54 +1786,32 @@ skip_chars (bool forwardp, Lisp_Object string, Lisp_Object lim,
1808	while (i_byte < size_byte)	1786	while (i_byte < size_byte)
1809	{	1787	{
1810	int leading_code = str[i_byte];	1788	int leading_code = str[i_byte];
1811	c = STRING_CHAR_AND_LENGTH (str + i_byte, len);
1812	i_byte += len;
1813		1789
1814	if (handle_iso_classes && c == '['	1790	if (handle_iso_classes)
1815	&& i_byte < size_byte
1816	&& STRING_CHAR (str + i_byte) == ':')
1817	{	1791	{
1818	const unsigned char *class_beg = str + i_byte + 1;	1792	const unsigned char *ch = str + i_byte;
1819	const unsigned char *class_end = class_beg;	1793	re_wctype_t cc = re_wctype_parse (&ch, size_byte - i_byte);
1820	const unsigned char *class_limit = str + size_byte - 2;
1821	/* Leave room for the null. */
1822	unsigned char class_name[CHAR_CLASS_MAX_LENGTH + 1];
1823	re_wctype_t cc;
1824
1825	if (class_limit - class_beg > CHAR_CLASS_MAX_LENGTH)
1826	class_limit = class_beg + CHAR_CLASS_MAX_LENGTH;
1827
1828	while (class_end < class_limit
1829	&& class_end >= 'a' && class_end <= 'z')
1830	class_end++;
1831
1832	if (class_end == class_beg
1833	|| *class_end != ':' || class_end[1] != ']')
1834	goto not_a_class_name_multibyte;
1835
1836	memcpy (class_name, class_beg, class_end - class_beg);
1837	class_name[class_end - class_beg] = 0;
1838
1839	cc = re_wctype (class_name);
1840	if (cc == 0)	1794	if (cc == 0)
1841	error ("Invalid ISO C character class");	1795	error ("Invalid ISO C character class");
1842		1796	if (cc != -1)
1843	iso_classes = Fcons (make_number (cc), iso_classes);	1797	{
1844		1798	iso_classes = Fcons (make_number (cc), iso_classes);
1845	i_byte = class_end + 2 - str;	1799	i_byte = ch - str;
1846	continue;	1800	continue;
		1801	}
1847	}	1802	}
1848		1803
1849	not_a_class_name_multibyte:	1804	if (leading_code== '\\')
1850	if (c == '\\')
1851	{	1805	{
1852	if (i_byte == size_byte)	1806	if (++i_byte == size_byte)
1853	break;	1807	break;
1854		1808
1855	leading_code = str[i_byte];	1809	leading_code = str[i_byte];
1856	c = STRING_CHAR_AND_LENGTH (str + i_byte, len);
1857	i_byte += len;
1858	}	1810	}
		1811	c = STRING_CHAR_AND_LENGTH (str + i_byte, len);
		1812	i_byte += len;
		1813
		1814
1859	/* Treat `-' as range character only if another character	1815	/* Treat `-' as range character only if another character
1860	follows. */	1816	follows. */
1861	if (i_byte + 1 < size_byte	1817	if (i_byte + 1 < size_byte