(RE_STRING_CHAR, RE_STRING_CHAR_AND_LENGTH): New arg

multibte. Callers changed. (RE_CHAR_TO_MULTIBYTE, RE_CHAR_TO_UNIBYTE): New macros. (MAKE_CHAR_MULTIBYTE, MAKE_CHAR_UNIBYTE): Deleted. Callers changed to use RE_CHAR_TO_MULTIBYTE and RE_CHAR_TO_UNIBYTE respectively. (SETUP_ASCII_RANGE, SETUP_UNIBYTE_RANGE): New macros. (SETUP_MULTIBYTE_RANGE): Generate more compact range_table. (regex_compile): Make the compiled pattern usable both for multibyte and unibyte targets. (analyse_first): Make the fastmap usable both for multibyte and unibyte targets. (TRANSLATE_VIA_MULTIBYTE): Deleted. (re_match_2_internal): Pay attention to the case that the multibyteness of bufp and target may be different.
author: Kenichi Handa 2007-02-15 11:23:52 +0000
committer: Kenichi Handa 2007-02-15 11:23:52 +0000
commit: cf9c99bcf551d4a50b029f6fc766b7403c846dd5 (patch)
tree: 32465f5419610ab16c4b2d96f992b390d66883ea /src
parent: b13abc5d94ee9e85223cb620b91206bda70d7685 (diff)
download: emacs-cf9c99bcf551d4a50b029f6fc766b7403c846dd5.tar.gz
emacs-cf9c99bcf551d4a50b029f6fc766b7403c846dd5.zip
1 files changed, 300 insertions, 153 deletions
diff --git a/src/regex.c b/src/regex.c
index 177908cb751..782f758468f 100644
--- a/src/regex.c
+++ b/src/regex.c
@@ -145,11 +145,18 @@
 # define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
 # define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
-# define RE_STRING_CHAR(p, s) \
+# define RE_STRING_CHAR(p, s, multibyte) \
  (multibyte ? (STRING_CHAR (p, s)) : (*(p)))
-# define RE_STRING_CHAR_AND_LENGTH(p, s, len) \
+# define RE_STRING_CHAR_AND_LENGTH(p, s, len, multibyte) \
  (multibyte ? (STRING_CHAR_AND_LENGTH (p, s, len)) : ((len) = 1, *(p)))
+# define RE_CHAR_TO_MULTIBYTE(c) unibyte_to_multibyte_table[(c)]
+# define RE_CHAR_TO_UNIBYTE(c)                  \
+  (ASCII_CHAR_P (c) ? (c)                       \
+   : CHAR_BYTE8_P (c) ? CHAR_TO_BYTE8 (c)       \
+   : multibyte_char_to_unibyte_safe (c))
 /* Set C a (possibly converted to multibyte) character before P.  P
   points into a string which is the virtual concatenation of STR1
   (which ends at END1) or STR2 (which ends at END2).  */
@@ -165,7 +172,7 @@
    else                                                                     \
      {                                                                      \
        (c = ((p) == (str2) ? (end1) : (p))[-1]);                            \
-        MAKE_CHAR_MULTIBYTE (c);                                             \
+        (c) = RE_CHAR_TO_MULTIBYTE (c);                                      \
      }                                                                      \
  } while (0)
@@ -174,12 +181,12 @@
 # define GET_CHAR_AFTER(c, p, len)              \
  do {                                          \
    if (multibyte)                              \
-      c = STRING_CHAR_AND_LENGTH (p, 0, len);   \
+      (c) = STRING_CHAR_AND_LENGTH (p, 0, len); \
    else                                        \
      {                                         \
-        c = *p;                                 \
+        (c) = *p;                               \
        len = 1;                                \
-        MAKE_CHAR_MULTIBYTE (c);                \
+        (c) = RE_CHAR_TO_MULTIBYTE (c);         \
      }                                         \
   } while (0)
@@ -301,10 +308,12 @@ enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
 # define MULTIBYTE_FORM_LENGTH(p, s) (1)
 # define PREV_CHAR_BOUNDARY(p, limit) ((p)--)
 # define STRING_CHAR(p, s) (*(p))
-# define RE_STRING_CHAR STRING_CHAR
+# define RE_STRING_CHAR(p, s, multibyte) STRING_CHAR ((p), (s))
 # define CHAR_STRING(c, s) (*(s) = (c), 1)
 # define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p))
-# define RE_STRING_CHAR_AND_LENGTH STRING_CHAR_AND_LENGTH
+# define RE_STRING_CHAR_AND_LENGTH(p, s, multibyte) STRING_CHAR_AND_LENGTH ((p), (s))
+# define RE_CHAR_TO_MULTIBYTE(c) (c)
+# define RE_CHAR_TO_UNIBYTE(c) (c)
 # define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
  (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
 # define GET_CHAR_AFTER(c, p, len)      \
@@ -312,8 +321,6 @@ enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
 # define MAKE_CHAR(charset, c1, c2) (c1)
 # define BYTE8_TO_CHAR(c) (c)
 # define CHAR_BYTE8_P(c) (0)
-# define MAKE_CHAR_MULTIBYTE(c) (c)
-# define MAKE_CHAR_UNIBYTE(c) (c)
 # define CHAR_LEADING_CODE(c) (c)
 #endif /* not emacs */
@@ -1761,7 +1768,7 @@ static int analyse_first _RE_ARGS ((re_char *p, re_char *pend,
  do {                                                                  \
    int len;                                                            \
    if (p == pend) return REG_EEND;                                     \
-    c = RE_STRING_CHAR_AND_LENGTH (p, pend - p, len);                   \
+    c = RE_STRING_CHAR_AND_LENGTH (p, pend - p, len, multibyte);        \
    p += len;                                                           \
  } while (0)
@@ -2019,32 +2026,107 @@ struct range_table_work_area
 #ifdef emacs
-/* Store characters in the rage range C0 to C1 in WORK_AREA while
+/* Store characters in the range FROM to TO in the bitmap at B (for
-   translating them and paying attention to the continuity of
+   ASCII and unibyte characters) and WORK_AREA (for multibyte
-   translated characters.
+   characters) while translating them and paying attention to the
+   continuity of translated characters.
-   Implementation note: It is better to implement this fairly big
+   Implementation note: It is better to implement these fairly big
-   macro by a function, but it's not that easy because macros called
+   macros by a function, but it's not that easy because macros called
   in this macro assume various local variables already declared.  */
-#define SETUP_MULTIBYTE_RANGE(work_area, c0, c1)                              \
+/* Both FROM and TO are ASCII characters.  */
-  do {                                                                        \
-    re_wchar_t c, t, t_last;                                                  \
+#define SETUP_ASCII_RANGE(work_area, FROM, TO)                  \
-    int n;                                                                    \
+  do {                                                          \
-                                                                              \
+    int C0, C1;                                                 \
-    c = (c0);                                                                 \
+                                                                \
-    t_last = multibyte ? TRANSLATE (c) : TRANSLATE (MAKE_CHAR_MULTIBYTE (c)); \
+    for (C0 = (FROM); C0 <= (TO); C0++)                         \
-    for (c++, n = 1; c <= (c1); c++, n++)                                     \
+      {                                                         \
-      {                                                                       \
+        C1 = TRANSLATE (C0);                                    \
-        t = multibyte ? TRANSLATE (c) : TRANSLATE (MAKE_CHAR_MULTIBYTE (c));  \
+        if (! ASCII_CHAR_P (C1))                                \
-        if (t_last + n == t)                                                  \
+          {                                                     \
-          continue;                                                           \
+            SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1);    \
-        SET_RANGE_TABLE_WORK_AREA ((work_area), t_last, t_last + n - 1);      \
+            if ((C1 = RE_CHAR_TO_UNIBYTE (C1)) < 0)             \
-        t_last = t;                                                           \
+              C1 = C0;                                          \
-        n = 0;                                                                \
+          }                                                     \
-      }                                                                       \
+        SET_LIST_BIT (C1);                                      \
-    if (n > 0)                                                                \
+      }                                                         \
-      SET_RANGE_TABLE_WORK_AREA ((work_area), t_last, t_last + n - 1);        \
+  } while (0)
+/* Both FROM and TO are unibyte characters (0x80..0xFF).  */
+#define SETUP_UNIBYTE_RANGE(work_area, FROM, TO)                               \
+  do {                                                                         \
+    int C0, C1, C2, I;                                                         \
+    int USED = RANGE_TABLE_WORK_USED (work_area);                              \
+                                                                               \
+    for (C0 = (FROM); C0 <= (TO); C0++)                                        \
+      {                                                                        \
+        C1 = RE_CHAR_TO_MULTIBYTE (C0);                                        \
+        if (CHAR_BYTE8_P (C1))                                                 \
+          SET_LIST_BIT (C0);                                                   \
+        else                                                                   \
+          {                                                                    \
+            C2 = TRANSLATE (C1);                                               \
+            if (C2 == C1                                                       \
+                || (C1 = RE_CHAR_TO_UNIBYTE (C2)) < 0)                         \
+              C1 = C0;                                                         \
+            SET_LIST_BIT (C1);                                                 \
+            for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
+              {                                                                \
+                int from = RANGE_TABLE_WORK_ELT (work_area, I);                \
+                int to = RANGE_TABLE_WORK_ELT (work_area, I + 1);              \
+                                                                               \
+                if (C2 >= from - 1 && C2 <= to + 1)                            \
+                  {                                                            \
+                    if (C2 == from - 1)                                        \
+                      RANGE_TABLE_WORK_ELT (work_area, I)--;                   \
+                    else if (C2 == to + 1)                                     \
+                      RANGE_TABLE_WORK_ELT (work_area, I + 1)++;               \
+                    break;                                                     \
+                  }                                                            \
+              }                                                                \
+            if (I < USED)                                                      \
+              SET_RANGE_TABLE_WORK_AREA ((work_area), C2, C2);                 \
+          }                                                                    \
+      }                                                                        \
+  } while (0)
+/* Both FROM and TO are mulitbyte characters.  */
+#define SETUP_MULTIBYTE_RANGE(work_area, FROM, TO)                         \
+  do {                                                                     \
+    int C0, C1, C2, I, USED = RANGE_TABLE_WORK_USED (work_area);           \
+                                                                           \
+    SET_RANGE_TABLE_WORK_AREA ((work_area), (FROM), (TO));                 \
+    for (C0 = (FROM); C0 <= (TO); C0++)                                    \
+      {                                                                    \
+        C1 = TRANSLATE (C0);                                               \
+        if ((C2 = RE_CHAR_TO_UNIBYTE (C1)) >= 0                            \
+            || (C1 != C0 && (C2 = RE_CHAR_TO_UNIBYTE (C0)) >= 0))          \
+          SET_LIST_BIT (C2);                                               \
+        if (C1 >= (FROM) && C1 <= (TO))                                    \
+          continue;                                                        \
+        for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
+          {                                                                \
+            int from = RANGE_TABLE_WORK_ELT (work_area, I);                \
+            int to = RANGE_TABLE_WORK_ELT (work_area, I + 1);              \
+                                                                           \
+            if (C1 >= from - 1 && C1 <= to + 1)                            \
+              {                                                            \
+                if (C1 == from - 1)                                        \
+                  RANGE_TABLE_WORK_ELT (work_area, I)--;                   \
+                else if (C1 == to + 1)                                     \
+                  RANGE_TABLE_WORK_ELT (work_area, I + 1)++;               \
+                break;                                                     \
+              }                                                            \
+          }                                                                \
+        if (I < USED)                                                      \
+          SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1);                 \
+      }                                                                    \
  } while (0)
 #endif /* emacs */
@@ -2904,6 +2986,7 @@ regex_compile (pattern, size, syntax, bufp)
              {
                boolean escaped_char = false;
                const unsigned char *p2 = p;
+                re_wchar_t ch, c2;
                if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
@@ -2966,7 +3049,6 @@ regex_compile (pattern, size, syntax, bufp)
                       them).  */
                    if (c == ':' && *p == ']')
                      {
-                        re_wchar_t ch;
                        re_wctype_t cc;
                        int limit;
@@ -2981,41 +3063,41 @@ regex_compile (pattern, size, syntax, bufp)
                        if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
-                        /* Most character classes in a multibyte match
+#ifndef emacs
-                           just set a flag.  Exceptions are is_blank,
+                        for (ch = 0; ch < (1 << BYTEWIDTH); ++ch)
-                           is_digit, is_cntrl, and is_xdigit, since
-                           they can only match ASCII characters.  We
-                           don't need to handle them for multibyte.
-                           They are distinguished by a negative wctype.  */
-                        for (ch = 0; ch < 128; ++ch)
                          if (re_iswctype (btowc (ch), cc))
                            {
                              c = TRANSLATE (ch);
                              if (c < (1 << BYTEWIDTH))
                                SET_LIST_BIT (c);
                            }
+#else  /* emacs */
+                        /* Most character classes in a multibyte match
+                           just set a flag.  Exceptions are is_blank,
+                           is_digit, is_cntrl, and is_xdigit, since
+                           they can only match ASCII characters.  We
+                           don't need to handle them for multibyte.
+                           They are distinguished by a negative wctype.  */
-                        if (target_multibyte)
+                        for (ch = 0; ch < 256; ++ch)
                          {
-                            SET_RANGE_TABLE_WORK_AREA_BIT
+                            c = RE_CHAR_TO_MULTIBYTE (ch);
-                              (range_table_work, re_wctype_to_bit (cc));
+                            if (! CHAR_BYTE8_P (c)
-                          }
+                                && re_iswctype (c, cc))
-                        else
-                          {
-                            for (ch = 0; ch < (1 << BYTEWIDTH); ++ch)
                              {
-                                c = ch;
+                                SET_LIST_BIT (ch);
-                                MAKE_CHAR_MULTIBYTE (c);
+                                c1 = TRANSLATE (c);
-                                if (re_iswctype (btowc (c), cc))
+                                if (c1 == c)
-                                  {
+                                  continue;
-                                    c = TRANSLATE (c);
+                                if (ASCII_CHAR_P (c1))
-                                    MAKE_CHAR_UNIBYTE (c);
+                                  SET_LIST_BIT (c1);
-                                    SET_LIST_BIT (c);
+                                else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0)
-                                  }
+                                  SET_LIST_BIT (c1);
                              }
                          }
+                        SET_RANGE_TABLE_WORK_AREA_BIT
+                          (range_table_work, re_wctype_to_bit (cc));
+#endif  /* emacs */
                        /* In most cases the matching rule for char classes
                           only uses the syntax table for multibyte chars,
                           so that the content of the syntax-table it is not
@@ -3048,51 +3130,63 @@ regex_compile (pattern, size, syntax, bufp)
                    /* Fetch the character which ends the range. */
                    PATFETCH (c1);
-                    if (c > c1)
+#ifdef emacs
-                      {
+                    if (CHAR_BYTE8_P (c1)
-                        if (syntax & RE_NO_EMPTY_RANGES)
+                        && ! ASCII_CHAR_P (c) && ! CHAR_BYTE8_P (c))
-                          FREE_STACK_RETURN (REG_ERANGEX);
+                      /* Treat the range from a multibyte character to
-                        /* Else, repeat the loop.  */
+                         raw-byte character as empty.  */
-                      }
+                      c = c1 + 1;
+#endif  /* emacs */
                  }
                else
                  /* Range from C to C. */
                  c1 = c;
-#ifndef emacs
+                if (c > c1)
-                c = TRANSLATE (c);
-                c1 = TRANSLATE (c1);
-                /* Set the range into bitmap */
-                for (; c <= c1; c++)
-                  SET_LIST_BIT (TRANSLATE (c));
-#else  /* not emacs */
-                if (target_multibyte)
                  {
-                    if (c1 >= 128)
+                    if (syntax & RE_NO_EMPTY_RANGES)
-                      {
+                      FREE_STACK_RETURN (REG_ERANGEX);
-                        re_wchar_t c0 = MAX (c, 128);
+                    /* Else, repeat the loop.  */
-                        SETUP_MULTIBYTE_RANGE (range_table_work, c0, c1);
-                        c1 = 127;
-                      }
-                    for (; c <= c1; c++)
-                      SET_LIST_BIT (TRANSLATE (c));
                  }
                else
                  {
-                    re_wchar_t c0;
+#ifndef emacs
+                    /* Set the range into bitmap */
                    for (; c <= c1; c++)
                      {
-                        c0 = c;
+                        ch = TRANSLATE (c);
-                        if (! multibyte)
+                        if (ch < (1 << BYTEWIDTH))
-                          MAKE_CHAR_MULTIBYTE (c0);
+                          SET_LIST_BIT (ch);
-                        c0 = TRANSLATE (c0);
+                      }
-                        MAKE_CHAR_UNIBYTE (c0);
+#else  /* emacs */
-                        SET_LIST_BIT (c0);
+                    if (c < 128)
+                      {
+                        ch = MIN (127, c1);
+                        SETUP_ASCII_RANGE (range_table_work, c, ch);
+                        c = ch + 1;
+                        if (CHAR_BYTE8_P (c1))
+                          c = BYTE8_TO_CHAR (128);
+                      }
+                    if (c <= c1)
+                      {
+                        if (CHAR_BYTE8_P (c))
+                          {
+                            c = CHAR_TO_BYTE8 (c);
+                            c1 = CHAR_TO_BYTE8 (c1);
+                            for (; c <= c1; c++)
+                              SET_LIST_BIT (c);
+                          }
+                        else if (multibyte)
+                          {
+                            SETUP_MULTIBYTE_RANGE (range_table_work, c, c1);
+                          }
+                        else
+                          {
+                            SETUP_UNIBYTE_RANGE (range_table_work, c, c1);
+                          }
                      }
+#endif /* emacs */
                  }
-#endif /* not emacs */
              }
            /* Discard any (non)matching list bytes that are all 0 at the
@@ -3677,17 +3771,22 @@ regex_compile (pattern, size, syntax, bufp)
          {
            int len;
-            if (! multibyte)
+            if (multibyte)
-              MAKE_CHAR_MULTIBYTE (c);
-            c = TRANSLATE (c);
-            if (target_multibyte)
              {
+                c = TRANSLATE (c);
                len = CHAR_STRING (c, b);
                b += len;
              }
            else
              {
-                MAKE_CHAR_UNIBYTE (c);
+                c1 = RE_CHAR_TO_MULTIBYTE (c);
+                if (! CHAR_BYTE8_P (c1))
+                  {
+                    re_wchar_t c2 = TRANSLATE (c1);
+                    if (c1 != c2 && (c1 = RE_CHAR_TO_UNIBYTE (c2)) >= 0)
+                      c = c1;
+                  }                   
                *b++ = c;
                len = 1;
              }
@@ -3714,11 +3813,6 @@ regex_compile (pattern, size, syntax, bufp)
  /* We have succeeded; set the length of the buffer.  */
  bufp->used = b - bufp->buffer;
-#ifdef emacs
-  /* Now the buffer is adjusted for the multibyteness of a target.  */
-  bufp->multibyte = bufp->target_multibyte;
-#endif
 #ifdef DEBUG
  if (debug > 0)
    {
@@ -3964,11 +4058,23 @@ analyse_first (p, pend, fastmap, multibyte)
        case exactn:
          if (fastmap)
-            /* If multibyte is nonzero, the first byte of each
+            {
-               character is an ASCII or a leading code.  Otherwise,
+              /* If multibyte is nonzero, the first byte of each
-               each byte is a character.  Thus, this works in both
+                 character is an ASCII or a leading code.  Otherwise,
-               cases. */
+                 each byte is a character.  Thus, this works in both
-            fastmap[p[1]] = 1;
+                 cases. */
+              fastmap[p[1]] = 1;
+              if (! multibyte)
+                {
+                  /* For the case of matching this unibyte regex
+                     against multibyte, we must set a leading code of
+                     the corresponding multibyte character.  */
+                  int c = RE_CHAR_TO_MULTIBYTE (p[1]);
+                  if (! CHAR_BYTE8_P (c))
+                    fastmap[CHAR_LEADING_CODE (c)] = 1;
+                }
+            }
          break;
@@ -3983,12 +4089,8 @@ analyse_first (p, pend, fastmap, multibyte)
          if (!fastmap) break;
          {
            /* Chars beyond end of bitmap are possible matches.  */
-            /* In a multibyte case, the bitmap is used only for ASCII
-               characters.  */
-            int limit = multibyte ? 128 : (1 << BYTEWIDTH);
            for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
-                 j < limit; j++)
+                 j < (1 << BYTEWIDTH); j++)
              fastmap[j] = 1;
          }
@@ -4031,7 +4133,7 @@ analyse_first (p, pend, fastmap, multibyte)
              /* Extract the number of ranges in range table into COUNT.  */
              EXTRACT_NUMBER_AND_INCR (count, p);
-              for (; count > 0; count--, p += 2 * 3) /* XXX */
+              for (; count > 0; count--, p += 3)
                {
                  /* Extract the start and end of each range.  */
                  EXTRACT_CHARACTER (c, p);
@@ -4329,9 +4431,8 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
  int total_size = size1 + size2;
  int endpos = startpos + range;
  boolean anchored_start;
-  /* Nonzero if BUFP is setup for multibyte characters.  We are sure
+  /* Nonzero if we are searching multibyte string.  */
-     that it is the same as RE_TARGET_MULTIBYTE_P (bufp).  */
+  const boolean multibyte = RE_TARGET_MULTIBYTE_P (bufp);
-  const boolean multibyte = RE_MULTIBYTE_P (bufp);
  /* Check for out-of-range STARTPOS.  */
  if (startpos < 0 || startpos > total_size)
@@ -4437,10 +4538,14 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
                  else
                    while (range > lim)
                      {
+                        register re_wchar_t ch, translated;
                        buf_ch = *d;
-                        MAKE_CHAR_MULTIBYTE (buf_ch);
+                        ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
-                        buf_ch = RE_TRANSLATE (translate, buf_ch);
+                        translated = RE_TRANSLATE (translate, ch);
-                        MAKE_CHAR_UNIBYTE (buf_ch);
+                        if (translated != ch
+                            && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
+                          buf_ch = ch;
                        if (fastmap[buf_ch])
                          break;
                        d++;
@@ -4484,7 +4589,15 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
                }
              else
                {
-                  if (! fastmap[TRANSLATE (*d)])
+                  register re_wchar_t ch, translated;
+                  buf_ch = *d;
+                  ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
+                  translated = TRANSLATE (ch);
+                  if (translated != ch
+                      && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
+                    buf_ch = ch;
+                  if (! fastmap[TRANSLATE (buf_ch)])
                    goto advance;
                }
            }
@@ -4765,11 +4878,11 @@ mutually_exclusive_p (bufp, p1, p2)
      {
        register re_wchar_t c
          = (re_opcode_t) *p2 == endline ? '\n'
-          : RE_STRING_CHAR (p2 + 2, pend - p2 - 2);
+          : RE_STRING_CHAR (p2 + 2, pend - p2 - 2, multibyte);
        if ((re_opcode_t) *p1 == exactn)
          {
-            if (c != RE_STRING_CHAR (p1 + 2, pend - p1 - 2))
+            if (c != RE_STRING_CHAR (p1 + 2, pend - p1 - 2, multibyte))
              {
                DEBUG_PRINT3 ("  '%c' != '%c' => fast loop.\n", c, p1[2]);
                return 1;
@@ -4993,23 +5106,6 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
 }
 WEAK_ALIAS (__re_match_2, re_match_2)
-#ifdef emacs
-#define TRANSLATE_VIA_MULTIBYTE(c)      \
-  do {                                  \
-    if (multibyte)                      \
-      (c) = TRANSLATE (c);              \
-    else                                \
-      {                                 \
-        MAKE_CHAR_MULTIBYTE (c);        \
-        (c) = TRANSLATE (c);            \
-        MAKE_CHAR_UNIBYTE (c);          \
-      }                                 \
-  } while (0)
-#else
-#define TRANSLATE_VIA_MULTIBYTE(c) ((c) = TRANSLATE (c))
-#endif
 /* This is a separate function so that we can force an alloca cleanup
   afterwards.  */
@@ -5050,10 +5146,12 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
  /* We use this to map every character in the string.  */
  RE_TRANSLATE_TYPE translate = bufp->translate;
-  /* Nonzero if BUFP is setup for multibyte characters.  We are sure
+  /* Nonzero if BUFP is setup from a multibyte regex.  */
-     that it is the same as RE_TARGET_MULTIBYTE_P (bufp).  */
  const boolean multibyte = RE_MULTIBYTE_P (bufp);
+  /* Nonzero if STRING1/STRING2 are multibyte.  */
+  const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
  /* Failure point stack.  Each place that can handle a failure further
     down the line pushes a failure point on this stack.  It consists of
     regstart, and regend for all registers corresponding to
@@ -5433,14 +5531,20 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
            while (--mcnt);
 #else  /* emacs */
          /* The cost of testing `translate' is comparatively small.  */
-          if (multibyte)
+          if (target_multibyte)
            do
              {
                int pat_charlen, buf_charlen;
-                unsigned int pat_ch, buf_ch;
+                int pat_ch, buf_ch;
                PREFETCH ();
-                pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
+                if (multibyte)
+                  pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
+                else
+                  {
+                    pat_ch = RE_CHAR_TO_MULTIBYTE (*p);
+                    pat_charlen = 1;
+                  }
                buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
                if (TRANSLATE (buf_ch) != pat_ch)
@@ -5457,16 +5561,38 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
          else
            do
              {
-                unsigned int buf_ch;
+                int pat_charlen, buf_charlen;
+                int pat_ch, buf_ch;
                PREFETCH ();
-                buf_ch = *d++;
+                if (multibyte)
-                TRANSLATE_VIA_MULTIBYTE (buf_ch);
+                  {
-                if (buf_ch != *p++)
+                    pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
+                    if (CHAR_BYTE8_P (pat_ch))
+                      pat_ch = CHAR_TO_BYTE8 (pat_ch);
+                    else
+                      pat_ch = RE_CHAR_TO_UNIBYTE (pat_ch);
+                  }
+                else
+                  {
+                    pat_ch = *p;
+                    pat_charlen = 1;
+                  }
+                buf_ch = RE_CHAR_TO_MULTIBYTE (*d);
+                if (! CHAR_BYTE8_P (buf_ch))
+                  {
+                    buf_ch = TRANSLATE (buf_ch);
+                    buf_ch = RE_CHAR_TO_UNIBYTE (buf_ch);
+                    if (buf_ch < 0)
+                      buf_ch = *d;
+                  }
+                if (buf_ch != pat_ch)
                  {
                    d = dfail;
                    goto fail;
                  }
+                p += pat_charlen;
+                d++;
              }
            while (--mcnt);
 #endif
@@ -5482,7 +5608,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
            DEBUG_PRINT1 ("EXECUTING anychar.\n");
            PREFETCH ();
-            buf_ch = RE_STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
+            buf_ch = RE_STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen,
+                                                target_multibyte);
            buf_ch = TRANSLATE (buf_ch);
            if ((!(bufp->syntax & RE_DOT_NEWLINE)
@@ -5526,10 +5653,30 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
              }
            PREFETCH ();
-            c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
+            c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len, target_multibyte);
-            TRANSLATE_VIA_MULTIBYTE (c); /* The character to match.  */
+            if (target_multibyte)
+              {
+                int c1;
-            if (! multibyte || IS_REAL_ASCII (c))
+                c = TRANSLATE (c);
+                c1 = RE_CHAR_TO_UNIBYTE (c);
+                if (c1 >= 0)
+                  c = c1;
+              }
+            else
+              {
+                int c1 = RE_CHAR_TO_MULTIBYTE (c);
+                if (! CHAR_BYTE8_P (c1))
+                  {
+                    c1 = TRANSLATE (c1);
+                    c1 = RE_CHAR_TO_UNIBYTE (c1);
+                    if (c1 >= 0)
+                      c = c1;
+                  }
+              }
+            if (c < (1 << BYTEWIDTH))
              {                 /* Lookup bitmap.  */
                /* Cast to `unsigned' instead of `unsigned char' in
                   case the bit list is a full 32 bytes long.  */
@@ -6096,7 +6243,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
              UPDATE_SYNTAX_TABLE (charpos);
 #endif
              PREFETCH ();
-              c2 = RE_STRING_CHAR (d, dend - d);
+              c2 = RE_STRING_CHAR (d, dend - d, target_multibyte);
              s2 = SYNTAX (c2);
              /* Case 2: S2 is neither Sword nor Ssymbol. */
@@ -6149,7 +6296,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
              if (!AT_STRINGS_END (d))
                {
                  PREFETCH_NOLIMIT ();
-                  c2 = RE_STRING_CHAR (d, dend - d);
+                  c2 = RE_STRING_CHAR (d, dend - d, target_multibyte);
 #ifdef emacs
                  UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
 #endif
author	Kenichi Handa	2007-02-15 11:23:52 +0000
committer	Kenichi Handa	2007-02-15 11:23:52 +0000
commit	cf9c99bcf551d4a50b029f6fc766b7403c846dd5 (patch)
tree	32465f5419610ab16c4b2d96f992b390d66883ea /src
parent	b13abc5d94ee9e85223cb620b91206bda70d7685 (diff)
download	emacs-cf9c99bcf551d4a50b029f6fc766b7403c846dd5.tar.gz emacs-cf9c99bcf551d4a50b029f6fc766b7403c846dd5.zip