(PATFETCH): Remove the translating fetch.

(PATFETCH_RAW): Rename to PATFETCH.
(set_image_of_range): New fun.
(SET_RANGE_TABLE_WORK_AREA): Use it.
(regex_compile): Don't translate the pattern chars so eagerly.
Only do it when inserting an `exactn' bytecode or when handling
a char-range.
(mutually_exclusive_p): Avoid empty statement.
author: Stefan Monnier 2002-08-23 22:21:51 +0000
committer: Stefan Monnier 2002-08-23 22:21:51 +0000
commit: 365958144ea38255d543a4232b926ca81e849fa9 (patch)
tree: 43beeeefed478bcbfac634c44348351456decaff /src
parent: d846a776e1043ad6d23a71a8daf42cc8b197c4f9 (diff)
download: emacs-365958144ea38255d543a4232b926ca81e849fa9.tar.gz
emacs-365958144ea38255d543a4232b926ca81e849fa9.zip
2 files changed, 66 insertions, 32 deletions
diff --git a/src/ChangeLog b/src/ChangeLog
index 6dcc95b7f8d..c6180468193 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,14 @@
+2002-08-23  Stefan Monnier  <monnier@cs.yale.edu>
+        * regex.c (PATFETCH): Remove the translating fetch.
+        (PATFETCH_RAW): Rename to PATFETCH.
+        (set_image_of_range): New fun.
+        (SET_RANGE_TABLE_WORK_AREA): Use it.
+        (regex_compile): Don't translate the pattern chars so eagerly.
+        Only do it when inserting an `exactn' bytecode or when handling
+        a char-range.
+        (mutually_exclusive_p): Avoid empty statement.
 2002-08-22  Kim F. Storm  <storm@cua.dk>
        * xdisp.c (redisplay_window): Do not `goto try_to_scroll' when we
@@ -511,11 +522,10 @@
        (parse_solitary_modifier, Fexecute_extended_command): Likewise.
        * textprop.c (validate_interval_range, interval_of): Likewise.
-        * fontset.c (Fset_fontset_font): Use SDATA instead of
+        * fontset.c (Fset_fontset_font): Use SDATA instead of XSTRING()->data.
-        XSTRING()->data.
-        * charset.h (FETCH_STRING_CHAR_ADVANCE,
+        * charset.h (FETCH_STRING_CHAR_ADVANCE)
-        FETCH_STRING_CHAR_ADVANCE_NO_CHECK): Use SBYTES instead of
+        (FETCH_STRING_CHAR_ADVANCE_NO_CHECK): Use SBYTES instead of
        XSTRING()->size_byte.
        * lisp.h (SDATA, SREF): Produce rvalue.
@@ -524,8 +534,8 @@
        * buffer.c (Fother_buffer): Use SREF when retrieving a byte from
        a string.
        * casefiddle.c (casify_object): Use SSET.
-        * charset.h (FETCH_STRING_CHAR_ADVANCE,
+        * charset.h (FETCH_STRING_CHAR_ADVANCE)
-        FETCH_STRING_CHAR_ADVANCE_NO_CHECK): Use SDATA when getting
+        (FETCH_STRING_CHAR_ADVANCE_NO_CHECK): Use SDATA when getting
        address of string contents.
        * data.c (Faref): Use SDATA.
        (Faset): Use SDATA, SSET.
diff --git a/src/regex.c b/src/regex.c
index 591d6f14e12..e01259cc85a 100644
--- a/src/regex.c
+++ b/src/regex.c
@@ -19,7 +19,9 @@
   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
   USA.  */
-/* TODO:
+/* BUGS:
+   - (x?)*y\1z should match both xxxxyxz and xxxyz.
+   TODO:
   - structure the opcode space into opcode+flag.
   - merge with glibc's regex.[ch].
   - replace (succeed_n + jump_n + set_number_at) with something that doesn't
@@ -1682,17 +1684,9 @@ static re_char *skip_one_char _RE_ARGS ((re_char *p));
 static int analyse_first _RE_ARGS ((re_char *p, re_char *pend,
                                    char *fastmap, const int multibyte));
-/* Fetch the next character in the uncompiled pattern---translating it
-   if necessary.  */
-#define PATFETCH(c)                                                     \
-  do {                                                                  \
-    PATFETCH_RAW (c);                                                   \
-    c = TRANSLATE (c);                                                  \
-  } while (0)
 /* Fetch the next character in the uncompiled pattern, with no
   translation.  */
-#define PATFETCH_RAW(c)                                                 \
+#define PATFETCH(c)                                                     \
  do {                                                                  \
    int len;                                                            \
    if (p == pend) return REG_EEND;                                     \
@@ -1914,12 +1908,13 @@ struct range_table_work_area
 #define BIT_UPPER       0x10
 #define BIT_MULTIBYTE   0x20
-/* Set a range (RANGE_START, RANGE_END) to WORK_AREA.  */
+/* Set a range START..END to WORK_AREA.
-#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end)    \
+   The range is passed through TRANSLATE, so START and END
-  do {                                                                  \
+   should be untranslated.  */
-    EXTEND_RANGE_TABLE_WORK_AREA ((work_area), 2);                      \
+#define SET_RANGE_TABLE_WORK_AREA(work_area, start, end)        \
-    (work_area).table[(work_area).used++] = (range_start);              \
+  do {                                                          \
-    (work_area).table[(work_area).used++] = (range_end);                \
+    EXTEND_RANGE_TABLE_WORK_AREA ((work_area), 2);              \
+    set_image_of_range (&work_area, start, end, translate);     \
  } while (0)
 /* Free allocated memory for WORK_AREA.  */
@@ -2077,6 +2072,31 @@ re_wctype_to_bit (cc)
 }
 #endif
+/* We need to find the image of the range start..end when passed through
+   TRANSLATE.  This is not necessarily TRANSLATE(start)..TRANSLATE(end)
+   and is not even necessarily contiguous.
+   We approximate it with the smallest contiguous range that contains
+   all the chars we need.  */
+static void
+set_image_of_range (work_area, start, end, translate)
+     RE_TRANSLATE_TYPE translate;
+     struct range_table_work_area *work_area;
+     re_wchar_t start, end;
+{
+  re_wchar_t cmin = TRANSLATE (start), cmax = TRANSLATE (end);
+  if (RE_TRANSLATE_P (translate))
+    for (; start <= end; start++)
+      {
+        re_wchar_t c = TRANSLATE (start);
+        cmin = MIN (cmin, c);
+        cmax = MAX (cmax, c);
+      }
+  work_area->table[work_area->used++] = (cmin);
+  work_area->table[work_area->used++] = (cmax);
+}
 /* Explicit quit checking is only used on NTemacs.  */
 #if defined WINDOWSNT && defined emacs && defined QUIT
 extern int immediate_quit;
@@ -2525,6 +2545,10 @@ regex_compile (pattern, size, syntax, bufp)
                if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
+                /* Don't translate yet.  The range TRANSLATE(X..Y) cannot
+                   always be determined from TRANSLATE(X) and TRANSLATE(Y).
+                   So the translation is done later in a loop.  Example:
+                   (let ((case-fold-search t)) (string-match "[A-_]" "A"))  */
                PATFETCH (c);
                /* \ might escape characters inside [...] and [^...].  */
@@ -2584,7 +2608,7 @@ regex_compile (pattern, size, syntax, bufp)
                       them).  */
                    if (c == ':' && *p == ']')
                      {
-                        int ch;
+                        re_wchar_t ch;
                        re_wctype_t cc;
                        cc = re_wctype (str);
@@ -2653,8 +2677,8 @@ regex_compile (pattern, size, syntax, bufp)
                               starting at the smallest character in
                               the charset of C1 and ending at C1.  */
                            int charset = CHAR_CHARSET (c1);
-                            int c2 = MAKE_CHAR (charset, 0, 0);
+                            re_wchar_t c2 = MAKE_CHAR (charset, 0, 0);
-                            
                            SET_RANGE_TABLE_WORK_AREA (range_table_work,
                                                       c2, c1);
                            c1 = 0377;
@@ -2672,7 +2696,7 @@ regex_compile (pattern, size, syntax, bufp)
                  /* ... into bitmap.  */
                  {
                    re_wchar_t this_char;
-                    int range_start = c, range_end = c1;
+                    re_wchar_t range_start = c, range_end = c1;
                    /* If the start is after the end, the range is empty.  */
                    if (range_start > range_end)
@@ -2769,7 +2793,7 @@ regex_compile (pattern, size, syntax, bufp)
          /* Do not translate the character after the \, so that we can
             distinguish, e.g., \B from \b, even if we normally would
             translate, e.g., B to b.  */
-          PATFETCH_RAW (c);
+          PATFETCH (c);
          switch (c)
            {
@@ -3129,13 +3153,13 @@ regex_compile (pattern, size, syntax, bufp)
            case 'c':
              laststart = b;
-              PATFETCH_RAW (c);
+              PATFETCH (c);
              BUF_PUSH_2 (categoryspec, c);
              break;
            case 'C':
              laststart = b;
-              PATFETCH_RAW (c);
+              PATFETCH (c);
              BUF_PUSH_2 (notcategoryspec, c);
              break;
 #endif /* emacs */
@@ -3225,7 +3249,6 @@ regex_compile (pattern, size, syntax, bufp)
              /* You might think it would be useful for \ to mean
                 not to translate; but if we don't translate it
                 it will never match anything.  */
-              c = TRANSLATE (c);
              goto normal_char;
            }
          break;
@@ -3234,7 +3257,7 @@ regex_compile (pattern, size, syntax, bufp)
        default:
        /* Expects the character in `c'.  */
        normal_char:
-              /* If no exactn currently being built.  */
+          /* If no exactn currently being built.  */
          if (!pending_exact
              /* If last exactn not at current position.  */
@@ -3265,6 +3288,7 @@ regex_compile (pattern, size, syntax, bufp)
          {
            int len;
+            c = TRANSLATE (c);
            if (multibyte)
              len = CHAR_STRING (c, b);
            else
@@ -4427,7 +4451,7 @@ mutually_exclusive_p (bufp, p1, p2)
             they don't overlap.  The union of the two sets of excluded
             chars should cover all possible chars, which, as a matter of
             fact, is virtually impossible in multibyte buffers.  */
-          ;
+          break;
        }
      break;
author	Stefan Monnier	2002-08-23 22:21:51 +0000
committer	Stefan Monnier	2002-08-23 22:21:51 +0000
commit	365958144ea38255d543a4232b926ca81e849fa9 (patch)
tree	43beeeefed478bcbfac634c44348351456decaff /src
parent	d846a776e1043ad6d23a71a8daf42cc8b197c4f9 (diff)
download	emacs-365958144ea38255d543a4232b926ca81e849fa9.tar.gz emacs-365958144ea38255d543a4232b926ca81e849fa9.zip