Refactor regex character class parsing in [:name:]

The re_wctype function is used in three separate places, and in each of them it is surrounded by almost identical code that extracts the name from [:name:]. Furthermore, re_wctype requires a NUL-terminated string, so the name of the character class is copied to a temporary buffer. The code duplication and unnecessary memory copying can be avoided by pushing the responsibility of parsing the whole [:name:] sequence to the function. Furthermore, since the function now has access to the length of the character class name (because it is doing the parsing), it can take advantage of that information to skip some string comparisons and to use a constant-length memcmp instead of strcmp, which needs to take care of NUL bytes. * src/regex.c (re_wctype): Delete function. Replace it with: (re_wctype_parse): New function which parses a whole [:name:] string and returns a RECC_* constant or -1 if the string is not of [:name:] format. (regex_compile): Use re_wctype_parse. * src/syntax.c (skip_chars): Use re_wctype_parse.
author: Michal Nazarewicz 2016-07-17 03:09:38 +0200
committer: Michal Nazarewicz 2016-08-02 15:39:10 +0200
commit: 4538a5e37e8dacde4b3e828d832c4c558a146912 (patch)
tree: 43a158bf0635a01bf5946730ac439fd0b3b8f606 /src/syntax.c
parent: e7257061317c604492d20f26f312b9e925aa1860 (diff)
download: emacs-4538a5e37e8dacde4b3e828d832c4c558a146912.tar.gz
emacs-4538a5e37e8dacde4b3e828d832c4c558a146912.zip
1 files changed, 26 insertions, 70 deletions
diff --git a/src/syntax.c b/src/syntax.c
index f8d987b377c..667de402ec4 100644
--- a/src/syntax.c
+++ b/src/syntax.c
@@ -1691,44 +1691,22 @@ skip_chars (bool forwardp, Lisp_Object string, Lisp_Object lim,
      /* At first setup fastmap.  */
      while (i_byte < size_byte)
        {
-          c = str[i_byte++];
+          if (handle_iso_classes)
-          if (handle_iso_classes && c == '['
-              && i_byte < size_byte
-              && str[i_byte] == ':')
            {
-              const unsigned char *class_beg = str + i_byte + 1;
+              const unsigned char *ch = str + i_byte;
-              const unsigned char *class_end = class_beg;
+              re_wctype_t cc = re_wctype_parse (&ch, size_byte - i_byte);
-              const unsigned char *class_limit = str + size_byte - 2;
-              /* Leave room for the null.  */
-              unsigned char class_name[CHAR_CLASS_MAX_LENGTH + 1];
-              re_wctype_t cc;
-              if (class_limit - class_beg > CHAR_CLASS_MAX_LENGTH)
-                class_limit = class_beg + CHAR_CLASS_MAX_LENGTH;
-              while (class_end < class_limit
-                     && *class_end >= 'a' && *class_end <= 'z')
-                class_end++;
-              if (class_end == class_beg
-                  || *class_end != ':' || class_end[1] != ']')
-                goto not_a_class_name;
-              memcpy (class_name, class_beg, class_end - class_beg);
-              class_name[class_end - class_beg] = 0;
-              cc = re_wctype (class_name);
              if (cc == 0)
                error ("Invalid ISO C character class");
+              if (cc != -1)
-              iso_classes = Fcons (make_number (cc), iso_classes);
+                {
+                  iso_classes = Fcons (make_number (cc), iso_classes);
-              i_byte = class_end + 2 - str;
+                  i_byte = ch - str;
-              continue;
+                  continue;
+                }
            }
-        not_a_class_name:
+          c = str[i_byte++];
          if (c == '\\')
            {
              if (i_byte == size_byte)
@@ -1808,54 +1786,32 @@ skip_chars (bool forwardp, Lisp_Object string, Lisp_Object lim,
      while (i_byte < size_byte)
        {
          int leading_code = str[i_byte];
-          c = STRING_CHAR_AND_LENGTH (str + i_byte, len);
-          i_byte += len;
-          if (handle_iso_classes && c == '['
+          if (handle_iso_classes)
-              && i_byte < size_byte
-              && STRING_CHAR (str + i_byte) == ':')
            {
-              const unsigned char *class_beg = str + i_byte + 1;
+              const unsigned char *ch = str + i_byte;
-              const unsigned char *class_end = class_beg;
+              re_wctype_t cc = re_wctype_parse (&ch, size_byte - i_byte);
-              const unsigned char *class_limit = str + size_byte - 2;
-              /* Leave room for the null.        */
-              unsigned char class_name[CHAR_CLASS_MAX_LENGTH + 1];
-              re_wctype_t cc;
-              if (class_limit - class_beg > CHAR_CLASS_MAX_LENGTH)
-                class_limit = class_beg + CHAR_CLASS_MAX_LENGTH;
-              while (class_end < class_limit
-                     && *class_end >= 'a' && *class_end <= 'z')
-                class_end++;
-              if (class_end == class_beg
-                  || *class_end != ':' || class_end[1] != ']')
-                goto not_a_class_name_multibyte;
-              memcpy (class_name, class_beg, class_end - class_beg);
-              class_name[class_end - class_beg] = 0;
-              cc = re_wctype (class_name);
              if (cc == 0)
                error ("Invalid ISO C character class");
+              if (cc != -1)
-              iso_classes = Fcons (make_number (cc), iso_classes);
+                {
+                  iso_classes = Fcons (make_number (cc), iso_classes);
-              i_byte = class_end + 2 - str;
+                  i_byte = ch - str;
-              continue;
+                  continue;
+                }
            }
-        not_a_class_name_multibyte:
+          if (leading_code== '\\')
-          if (c == '\\')
            {
-              if (i_byte == size_byte)
+              if (++i_byte == size_byte)
                break;
              leading_code = str[i_byte];
-              c = STRING_CHAR_AND_LENGTH (str + i_byte, len);
-              i_byte += len;
            }
+          c = STRING_CHAR_AND_LENGTH (str + i_byte, len);
+          i_byte += len;
          /* Treat `-' as range character only if another character
             follows.  */
          if (i_byte + 1 < size_byte
author	Michal Nazarewicz	2016-07-17 03:09:38 +0200
committer	Michal Nazarewicz	2016-08-02 15:39:10 +0200
commit	4538a5e37e8dacde4b3e828d832c4c558a146912 (patch)
tree	43a158bf0635a01bf5946730ac439fd0b3b8f606 /src/syntax.c
parent	e7257061317c604492d20f26f312b9e925aa1860 (diff)
download	emacs-4538a5e37e8dacde4b3e828d832c4c558a146912.tar.gz emacs-4538a5e37e8dacde4b3e828d832c4c558a146912.zip

diff --git a/src/syntax.c b/src/syntax.c index f8d987b377c..667de402ec4 100644 --- a/src/syntax.c +++ b/src/syntax.c
@@ -1691,44 +1691,22 @@ skip_chars (bool forwardp, Lisp_Object string, Lisp_Object lim,
1691	/* At first setup fastmap. */	1691	/* At first setup fastmap. */
1692	while (i_byte < size_byte)	1692	while (i_byte < size_byte)
1693	{	1693	{
1694	c = str[i_byte++];	1694	if (handle_iso_classes)
1695
1696	if (handle_iso_classes && c == '['
1697	&& i_byte < size_byte
1698	&& str[i_byte] == ':')
1699	{	1695	{
1700	const unsigned char *class_beg = str + i_byte + 1;	1696	const unsigned char *ch = str + i_byte;
1701	const unsigned char *class_end = class_beg;	1697	re_wctype_t cc = re_wctype_parse (&ch, size_byte - i_byte);
1702	const unsigned char *class_limit = str + size_byte - 2;
1703	/* Leave room for the null. */
1704	unsigned char class_name[CHAR_CLASS_MAX_LENGTH + 1];
1705	re_wctype_t cc;
1706
1707	if (class_limit - class_beg > CHAR_CLASS_MAX_LENGTH)
1708	class_limit = class_beg + CHAR_CLASS_MAX_LENGTH;
1709
1710	while (class_end < class_limit
1711	&& *class_end >= 'a' && *class_end <= 'z')
1712	class_end++;
1713
1714	if (class_end == class_beg
1715	|| *class_end != ':' || class_end[1] != ']')
1716	goto not_a_class_name;
1717
1718	memcpy (class_name, class_beg, class_end - class_beg);
1719	class_name[class_end - class_beg] = 0;
1720
1721	cc = re_wctype (class_name);
1722	if (cc == 0)	1698	if (cc == 0)
1723	error ("Invalid ISO C character class");	1699	error ("Invalid ISO C character class");
1724		1700	if (cc != -1)
1725	iso_classes = Fcons (make_number (cc), iso_classes);	1701	{
1726		1702	iso_classes = Fcons (make_number (cc), iso_classes);
1727	i_byte = class_end + 2 - str;	1703	i_byte = ch - str;
1728	continue;	1704	continue;
		1705	}
1729	}	1706	}
1730		1707
1731	not_a_class_name:	1708	c = str[i_byte++];
		1709
1732	if (c == '\\')	1710	if (c == '\\')
1733	{	1711	{
1734	if (i_byte == size_byte)	1712	if (i_byte == size_byte)
@@ -1808,54 +1786,32 @@ skip_chars (bool forwardp, Lisp_Object string, Lisp_Object lim,
1808	while (i_byte < size_byte)	1786	while (i_byte < size_byte)
1809	{	1787	{
1810	int leading_code = str[i_byte];	1788	int leading_code = str[i_byte];
1811	c = STRING_CHAR_AND_LENGTH (str + i_byte, len);
1812	i_byte += len;
1813		1789
1814	if (handle_iso_classes && c == '['	1790	if (handle_iso_classes)
1815	&& i_byte < size_byte
1816	&& STRING_CHAR (str + i_byte) == ':')
1817	{	1791	{
1818	const unsigned char *class_beg = str + i_byte + 1;	1792	const unsigned char *ch = str + i_byte;
1819	const unsigned char *class_end = class_beg;	1793	re_wctype_t cc = re_wctype_parse (&ch, size_byte - i_byte);
1820	const unsigned char *class_limit = str + size_byte - 2;
1821	/* Leave room for the null. */
1822	unsigned char class_name[CHAR_CLASS_MAX_LENGTH + 1];
1823	re_wctype_t cc;
1824
1825	if (class_limit - class_beg > CHAR_CLASS_MAX_LENGTH)
1826	class_limit = class_beg + CHAR_CLASS_MAX_LENGTH;
1827
1828	while (class_end < class_limit
1829	&& *class_end >= 'a' && *class_end <= 'z')
1830	class_end++;
1831
1832	if (class_end == class_beg
1833	|| *class_end != ':' || class_end[1] != ']')
1834	goto not_a_class_name_multibyte;
1835
1836	memcpy (class_name, class_beg, class_end - class_beg);
1837	class_name[class_end - class_beg] = 0;
1838
1839	cc = re_wctype (class_name);
1840	if (cc == 0)	1794	if (cc == 0)
1841	error ("Invalid ISO C character class");	1795	error ("Invalid ISO C character class");
1842		1796	if (cc != -1)
1843	iso_classes = Fcons (make_number (cc), iso_classes);	1797	{
1844		1798	iso_classes = Fcons (make_number (cc), iso_classes);
1845	i_byte = class_end + 2 - str;	1799	i_byte = ch - str;
1846	continue;	1800	continue;
		1801	}
1847	}	1802	}
1848		1803
1849	not_a_class_name_multibyte:	1804	if (leading_code== '\\')
1850	if (c == '\\')
1851	{	1805	{
1852	if (i_byte == size_byte)	1806	if (++i_byte == size_byte)
1853	break;	1807	break;
1854		1808
1855	leading_code = str[i_byte];	1809	leading_code = str[i_byte];
1856	c = STRING_CHAR_AND_LENGTH (str + i_byte, len);
1857	i_byte += len;
1858	}	1810	}
		1811	c = STRING_CHAR_AND_LENGTH (str + i_byte, len);
		1812	i_byte += len;
		1813
		1814
1859	/* Treat `-' as range character only if another character	1815	/* Treat `-' as range character only if another character
1860	follows. */	1816	follows. */
1861	if (i_byte + 1 < size_byte	1817	if (i_byte + 1 < size_byte