1 files changed, 55 insertions, 8 deletions
diff --git a/src/regex.c b/src/regex.c
index 846c87041b1..ae80ad0cee8 100644
--- a/src/regex.c
+++ b/src/regex.c
@@ -2530,6 +2530,7 @@ regex_compile (pattern, size, syntax, bufp)
  bufp->syntax = syntax;
  bufp->fastmap_accurate = 0;
  bufp->not_bol = bufp->not_eol = 0;
+  bufp->used_syntax = 0;
  /* Set `used' to zero, so that if we return an error, the pattern
     printer (for debugging) will think there's no pattern.  We reset it
@@ -2942,6 +2943,14 @@ regex_compile (pattern, size, syntax, bufp)
                              SET_LIST_BIT (translated);
                          }
+                        /* In most cases the matching rule for char classes
+                           only uses the syntax table for multibyte chars,
+                           so that the content of the syntax-table is not
+                           hardcoded in the range_table.  SPACE and WORD are
+                           the two exceptions.  */
+                        if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
+                          bufp->used_syntax = 1;
                        /* Repeat the loop. */
                        continue;
                      }
@@ -3877,11 +3886,13 @@ analyse_first (p, pend, fastmap, multibyte)
          if (fastmap)
            {
              int c = RE_STRING_CHAR (p + 1, pend - p);
+              /* When fast-scanning, the fastmap can be indexed either with
+                 a char (smaller than 256) or with the first byte of
+                 a char's byte sequence.  So we have to conservatively add
+                 both to the table.  */
              if (SINGLE_BYTE_CHAR_P (c))
                fastmap[c] = 1;
-              else
+              fastmap[p[1]] = 1;
-                fastmap[p[1]] = 1;
            }
          break;
@@ -3899,6 +3910,10 @@ analyse_first (p, pend, fastmap, multibyte)
             So any that are not listed in the charset
             are possible matches, even in multibyte buffers.  */
          if (!fastmap) break;
+          /* We don't need to mark LEADING_CODE_8_BIT_CONTROL specially
+             because it will automatically be set when needed by virtue of
+             being larger than the highest char of its charset (0xbf) but
+             smaller than (1<<BYTEWIDTH).  */
          for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
               j < (1 << BYTEWIDTH); j++)
            fastmap[j] = 1;
@@ -3909,7 +3924,13 @@ analyse_first (p, pend, fastmap, multibyte)
          for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
               j >= 0; j--)
            if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
-              fastmap[j] = 1;
+              {
+                fastmap[j] = 1;
+#ifdef emacs
+                if (j >= 0x80 && j < 0xa0)
+                  fastmap[LEADING_CODE_8_BIT_CONTROL] = 1;
+#endif
+              }
          if ((not && multibyte)
              /* Any character set can possibly contain a character
@@ -4352,11 +4373,33 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
                    }
                }
              else
-                while (range > lim && !fastmap[*d])
+                do
                  {
-                    d++;
+                    re_char *d_start = d;
-                    range--;
+                    while (range > lim && !fastmap[*d])
-                  }
+                      {
+                        d++;
+                        range--;
+                      }
+#ifdef emacs
+                    if (multibyte && range > lim)
+                      {
+                        /* Check that we are at the beginning of a char.  */
+                        int at_boundary;
+                        AT_CHAR_BOUNDARY_P (at_boundary, d, d_start);
+                        if (at_boundary)
+                          break;
+                        else
+                          { /* We have matched an internal byte of a char
+                               rather than the leading byte, so it's a false
+                               positive: we should keep scanning.  */
+                            d++; range--;
+                          }
+                      }
+                    else
+#endif
+                      break;
+                  } while (1);
              startpos += irange - range;
            }
@@ -6197,6 +6240,10 @@ re_compile_pattern (pattern, length, bufp)
 {
  reg_errcode_t ret;
+#ifdef emacs
+  gl_state.current_syntax_table = current_buffer->syntax_table;
+#endif
  /* GNU code is written to assume at least RE_NREGS registers will be set
     (and at least one extra will be -1).  */
  bufp->regs_allocated = REGS_UNALLOCATED;

diff --git a/src/regex.c b/src/regex.c index 846c87041b1..ae80ad0cee8 100644 --- a/src/regex.c +++ b/src/regex.c
@@ -2530,6 +2530,7 @@ regex_compile (pattern, size, syntax, bufp)
2530	bufp->syntax = syntax;	2530	bufp->syntax = syntax;
2531	bufp->fastmap_accurate = 0;	2531	bufp->fastmap_accurate = 0;
2532	bufp->not_bol = bufp->not_eol = 0;	2532	bufp->not_bol = bufp->not_eol = 0;
		2533	bufp->used_syntax = 0;
2533		2534
2534	/* Set `used' to zero, so that if we return an error, the pattern	2535	/* Set `used' to zero, so that if we return an error, the pattern
2535	printer (for debugging) will think there's no pattern. We reset it	2536	printer (for debugging) will think there's no pattern. We reset it
@@ -2942,6 +2943,14 @@ regex_compile (pattern, size, syntax, bufp)
2942	SET_LIST_BIT (translated);	2943	SET_LIST_BIT (translated);
2943	}	2944	}
2944		2945
		2946	/* In most cases the matching rule for char classes
		2947	only uses the syntax table for multibyte chars,
		2948	so that the content of the syntax-table is not
		2949	hardcoded in the range_table. SPACE and WORD are
		2950	the two exceptions. */
		2951	if ((1 << cc) & ((1 << RECC_SPACE) \| (1 << RECC_WORD)))
		2952	bufp->used_syntax = 1;
		2953
2945	/* Repeat the loop. */	2954	/* Repeat the loop. */
2946	continue;	2955	continue;
2947	}	2956	}
@@ -3877,11 +3886,13 @@ analyse_first (p, pend, fastmap, multibyte)
3877	if (fastmap)	3886	if (fastmap)
3878	{	3887	{
3879	int c = RE_STRING_CHAR (p + 1, pend - p);	3888	int c = RE_STRING_CHAR (p + 1, pend - p);
3880		3889	/* When fast-scanning, the fastmap can be indexed either with
		3890	a char (smaller than 256) or with the first byte of
		3891	a char's byte sequence. So we have to conservatively add
		3892	both to the table. */
3881	if (SINGLE_BYTE_CHAR_P (c))	3893	if (SINGLE_BYTE_CHAR_P (c))
3882	fastmap[c] = 1;	3894	fastmap[c] = 1;
3883	else	3895	fastmap[p[1]] = 1;
3884	fastmap[p[1]] = 1;
3885	}	3896	}
3886	break;	3897	break;
3887		3898
@@ -3899,6 +3910,10 @@ analyse_first (p, pend, fastmap, multibyte)
3899	So any that are not listed in the charset	3910	So any that are not listed in the charset
3900	are possible matches, even in multibyte buffers. */	3911	are possible matches, even in multibyte buffers. */
3901	if (!fastmap) break;	3912	if (!fastmap) break;
		3913	/* We don't need to mark LEADING_CODE_8_BIT_CONTROL specially
		3914	because it will automatically be set when needed by virtue of
		3915	being larger than the highest char of its charset (0xbf) but
		3916	smaller than (1<<BYTEWIDTH). */
3902	for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;	3917	for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
3903	j < (1 << BYTEWIDTH); j++)	3918	j < (1 << BYTEWIDTH); j++)
3904	fastmap[j] = 1;	3919	fastmap[j] = 1;
@@ -3909,7 +3924,13 @@ analyse_first (p, pend, fastmap, multibyte)
3909	for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;	3924	for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
3910	j >= 0; j--)	3925	j >= 0; j--)
3911	if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)	3926	if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
3912	fastmap[j] = 1;	3927	{
		3928	fastmap[j] = 1;
		3929	#ifdef emacs
		3930	if (j >= 0x80 && j < 0xa0)
		3931	fastmap[LEADING_CODE_8_BIT_CONTROL] = 1;
		3932	#endif
		3933	}
3913		3934
3914	if ((not && multibyte)	3935	if ((not && multibyte)
3915	/* Any character set can possibly contain a character	3936	/* Any character set can possibly contain a character
@@ -4352,11 +4373,33 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
4352	}	4373	}
4353	}	4374	}
4354	else	4375	else
4355	while (range > lim && !fastmap[*d])	4376	do
4356	{	4377	{
4357	d++;	4378	re_char *d_start = d;
4358	range--;	4379	while (range > lim && !fastmap[*d])
4359	}	4380	{
		4381	d++;
		4382	range--;
		4383	}
		4384	#ifdef emacs
		4385	if (multibyte && range > lim)
		4386	{
		4387	/* Check that we are at the beginning of a char. */
		4388	int at_boundary;
		4389	AT_CHAR_BOUNDARY_P (at_boundary, d, d_start);
		4390	if (at_boundary)
		4391	break;
		4392	else
		4393	{ /* We have matched an internal byte of a char
		4394	rather than the leading byte, so it's a false
		4395	positive: we should keep scanning. */
		4396	d++; range--;
		4397	}
		4398	}
		4399	else
		4400	#endif
		4401	break;
		4402	} while (1);
4360		4403
4361	startpos += irange - range;	4404	startpos += irange - range;
4362	}	4405	}
@@ -6197,6 +6240,10 @@ re_compile_pattern (pattern, length, bufp)
6197	{	6240	{
6198	reg_errcode_t ret;	6241	reg_errcode_t ret;
6199		6242
		6243	#ifdef emacs
		6244	gl_state.current_syntax_table = current_buffer->syntax_table;
		6245	#endif
		6246
6200	/* GNU code is written to assume at least RE_NREGS registers will be set	6247	/* GNU code is written to assume at least RE_NREGS registers will be set
6201	(and at least one extra will be -1). */	6248	(and at least one extra will be -1). */
6202	bufp->regs_allocated = REGS_UNALLOCATED;	6249	bufp->regs_allocated = REGS_UNALLOCATED;