aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPaul Eggert2018-08-05 18:41:20 -0700
committerPaul Eggert2018-08-05 19:36:09 -0700
commitd904cc83f3036db96107a3976cee1a0112547de6 (patch)
tree7186eee577f0112462a024368c014819c4055152
parente5652268a993ad9117f7253553c143d60460eb8f (diff)
downloademacs-d904cc83f3036db96107a3976cee1a0112547de6.tar.gz
emacs-d904cc83f3036db96107a3976cee1a0112547de6.zip
Use Gnulib regex for lib-src
Emacs regular expressions forked from everyone else long ago. This makes it official and should allow simplification later. etags.c now uses the glibc regex API, falling back on a Gnulib-supplied substitute lib/regex.c if necessary. Emacs proper now uses its own regular expression module. Although this patch may look dauntingly large, most of it was generated automatically by admin/merge-gnulib and contains an exact copy of the glibc regex source, and the by-hand changes do not grow the Emacs source code. * admin/merge-gnulib (GNULIB_MODULES): Add regex. (AVOIDED_MODULES): Add btowc, langinfo, lock, mbrtowc, mbsinit, nl_langinfo, wchar, wcrtomb, wctype-h. * lib-src/Makefile.in (regex-emacs.o): Remove; Gnulib does it now. (etags_deps, etags_libs): Remove regex-emacs.o. * lib-src/etags.c: Go back to including regex.h. (add_regex): Use unsigned char translation array, since glibc regex requires that. * lib/Makefile.in (not_emacs_OBJECTS, for_emacs_OBJECTS): New macros. (libegnu_a_OBJECTS): Use them, to avoid building e-regex.o. * lib/gnulib.mk.in, m4/gnulib-comp.m4: Regenerate. * lib/regcomp.c, lib/regex.c, lib/regex.h, lib/regex_internal.c: * lib/regex_internal.h, lib/regexec.c, m4/builtin-expect.m4: * m4/eealloc.m4, m4/glibc21.m4, m4/mbstate_t.m4, m4/regex.m4: New files, copied from Gnulib. * src/regex-emacs.h, src/conf_post.h: (RE_TRANSLATE_TYPE, RE_TRANSLATE, RE_TRANSLATE_P): Move from src/conf_post.h to src/regex-emacs.h, so that they don’t interfere with compiling lib/regex.c.
-rwxr-xr-xadmin/merge-gnulib7
-rw-r--r--etc/NEWS7
-rw-r--r--lib-src/Makefile.in8
-rw-r--r--lib-src/etags.c4
-rw-r--r--lib/Makefile.in8
-rw-r--r--lib/gnulib.mk.in23
-rw-r--r--lib/regcomp.c3944
-rw-r--r--lib/regex.c81
-rw-r--r--lib/regex.h658
-rw-r--r--lib/regex_internal.c1740
-rw-r--r--lib/regex_internal.h911
-rw-r--r--lib/regexec.c4324
-rw-r--r--m4/builtin-expect.m449
-rw-r--r--m4/eealloc.m431
-rw-r--r--m4/glibc21.m434
-rw-r--r--m4/gnulib-comp.m430
-rw-r--r--m4/mbstate_t.m441
-rw-r--r--m4/regex.m4300
-rw-r--r--src/conf_post.h7
-rw-r--r--src/regex-emacs.h7
20 files changed, 12194 insertions, 20 deletions
diff --git a/admin/merge-gnulib b/admin/merge-gnulib
index 1397ecfb9f7..abb192911d9 100755
--- a/admin/merge-gnulib
+++ b/admin/merge-gnulib
@@ -37,7 +37,7 @@ GNULIB_MODULES='
37 getloadavg getopt-gnu gettime gettimeofday gitlog-to-changelog 37 getloadavg getopt-gnu gettime gettimeofday gitlog-to-changelog
38 ieee754-h ignore-value intprops largefile lstat 38 ieee754-h ignore-value intprops largefile lstat
39 manywarnings memrchr minmax mkostemp mktime nstrftime 39 manywarnings memrchr minmax mkostemp mktime nstrftime
40 pipe2 pselect pthread_sigmask putenv qcopy-acl readlink readlinkat 40 pipe2 pselect pthread_sigmask putenv qcopy-acl readlink readlinkat regex
41 sig2str socklen stat-time std-gnu11 stdalign stddef stdio 41 sig2str socklen stat-time std-gnu11 stdalign stddef stdio
42 stpcpy strtoimax symlink sys_stat sys_time 42 stpcpy strtoimax symlink sys_stat sys_time
43 tempname time time_r time_rz timegm timer-time timespec-add timespec-sub 43 tempname time time_r time_rz timegm timer-time timespec-add timespec-sub
@@ -46,11 +46,12 @@ GNULIB_MODULES='
46' 46'
47 47
48AVOIDED_MODULES=' 48AVOIDED_MODULES='
49 close dup fchdir fstat 49 btowc close dup fchdir fstat langinfo lock
50 malloc-posix msvc-inval msvc-nothrow 50 malloc-posix mbrtowc mbsinit msvc-inval msvc-nothrow nl_langinfo
51 openat-die opendir raise 51 openat-die opendir raise
52 save-cwd select setenv sigprocmask stat stdarg stdbool 52 save-cwd select setenv sigprocmask stat stdarg stdbool
53 threadlib tzset unsetenv utime utime-h 53 threadlib tzset unsetenv utime utime-h
54 wchar wcrtomb wctype-h
54' 55'
55 56
56GNULIB_TOOL_FLAGS=' 57GNULIB_TOOL_FLAGS='
diff --git a/etc/NEWS b/etc/NEWS
index fa8a7afd523..21887f5bfd3 100644
--- a/etc/NEWS
+++ b/etc/NEWS
@@ -31,6 +31,13 @@ functions 'json-serialize', 'json-insert', 'json-parse-string', and
31'json-parse-buffer' are typically much faster than their Lisp 31'json-parse-buffer' are typically much faster than their Lisp
32counterparts from json.el. 32counterparts from json.el.
33 33
34** The etags program now uses the C library's regular expression matcher
35when possible, and a compatible regex substitute otherwise. This will
36let developers maintain Emacs's own regex code without having to also
37support other programs. The new configure option '--without-included-regex'
38forces etags to use the C library's regex matcher even if the regex
39substitute ordinarily would be used to work around compatibility problems.
40
34** Emacs has been ported to the -fcheck-pointer-bounds option of GCC. 41** Emacs has been ported to the -fcheck-pointer-bounds option of GCC.
35This causes Emacs to check bounds of some arrays addressed by its 42This causes Emacs to check bounds of some arrays addressed by its
36internal pointers, which can be helpful when debugging the Emacs 43internal pointers, which can be helpful when debugging the Emacs
diff --git a/lib-src/Makefile.in b/lib-src/Makefile.in
index e70b23c4b3f..b2b901788a5 100644
--- a/lib-src/Makefile.in
+++ b/lib-src/Makefile.in
@@ -361,13 +361,9 @@ TAGS: etags${EXEEXT} ${tagsfiles}
361../lib/libgnu.a: $(config_h) 361../lib/libgnu.a: $(config_h)
362 $(MAKE) -C ../lib all 362 $(MAKE) -C ../lib all
363 363
364regex-emacs.o: $(srcdir)/../src/regex-emacs.c $(srcdir)/../src/regex-emacs.h $(config_h) 364etags_deps = ${srcdir}/etags.c $(NTLIB) $(config_h)
365 $(AM_V_CC)$(CC) -c $(CPP_CFLAGS) $<
366
367
368etags_deps = ${srcdir}/etags.c regex-emacs.o $(NTLIB) $(config_h)
369etags_cflags = -DEMACS_NAME="\"GNU Emacs\"" -DVERSION="\"${version}\"" -o $@ 365etags_cflags = -DEMACS_NAME="\"GNU Emacs\"" -DVERSION="\"${version}\"" -o $@
370etags_libs = regex-emacs.o $(NTLIB) $(LOADLIBES) 366etags_libs = $(NTLIB) $(LOADLIBES)
371 367
372etags${EXEEXT}: ${etags_deps} 368etags${EXEEXT}: ${etags_deps}
373 $(AM_V_CCLD)$(CC) ${ALL_CFLAGS} $(etags_cflags) $< $(etags_libs) 369 $(AM_V_CCLD)$(CC) ${ALL_CFLAGS} $(etags_cflags) $< $(etags_libs)
diff --git a/lib-src/etags.c b/lib-src/etags.c
index 47d13116db6..ee506703436 100644
--- a/lib-src/etags.c
+++ b/lib-src/etags.c
@@ -135,7 +135,7 @@ char pot_etags_version[] = "@(#) pot revision number is 17.38.1.4";
135#endif 135#endif
136 136
137#include <getopt.h> 137#include <getopt.h>
138#include <regex-emacs.h> 138#include <regex.h>
139 139
140/* Define CTAGS to make the program "ctags" compatible with the usual one. 140/* Define CTAGS to make the program "ctags" compatible with the usual one.
141 Leave it undefined to make the program "etags", which makes emacs-style 141 Leave it undefined to make the program "etags", which makes emacs-style
@@ -6401,7 +6401,7 @@ add_regex (char *regexp_pattern, language *lang)
6401 *patbuf = zeropattern; 6401 *patbuf = zeropattern;
6402 if (ignore_case) 6402 if (ignore_case)
6403 { 6403 {
6404 static char lc_trans[UCHAR_MAX + 1]; 6404 static unsigned char lc_trans[UCHAR_MAX + 1];
6405 int i; 6405 int i;
6406 for (i = 0; i < UCHAR_MAX + 1; i++) 6406 for (i = 0; i < UCHAR_MAX + 1; i++)
6407 lc_trans[i] = c_tolower (i); 6407 lc_trans[i] = c_tolower (i);
diff --git a/lib/Makefile.in b/lib/Makefile.in
index 201f4b53836..b26db27423d 100644
--- a/lib/Makefile.in
+++ b/lib/Makefile.in
@@ -79,9 +79,15 @@ endif
79Makefile: ../config.status $(srcdir)/Makefile.in 79Makefile: ../config.status $(srcdir)/Makefile.in
80 $(MAKE) -C .. src/$@ 80 $(MAKE) -C .. src/$@
81 81
82# Object modules that need not be built for Emacs.
83# Emacs does not need e-regex.o (it has its own regex-emacs.c),
84# and building it would just waste time.
85not_emacs_OBJECTS = regex.o
86
82libgnu_a_OBJECTS = $(gl_LIBOBJS) \ 87libgnu_a_OBJECTS = $(gl_LIBOBJS) \
83 $(patsubst %.c,%.o,$(filter %.c,$(libgnu_a_SOURCES))) 88 $(patsubst %.c,%.o,$(filter %.c,$(libgnu_a_SOURCES)))
84libegnu_a_OBJECTS = $(patsubst %.o,e-%.o,$(libgnu_a_OBJECTS)) 89for_emacs_OBJECTS = $(filter-out $(not_emacs_OBJECTS),$(libgnu_a_OBJECTS))
90libegnu_a_OBJECTS = $(patsubst %.o,e-%.o,$(for_emacs_OBJECTS))
85 91
86$(libegnu_a_OBJECTS) $(libgnu_a_OBJECTS): $(BUILT_SOURCES) 92$(libegnu_a_OBJECTS) $(libgnu_a_OBJECTS): $(BUILT_SOURCES)
87 93
diff --git a/lib/gnulib.mk.in b/lib/gnulib.mk.in
index 7d28dcc62b8..7ad390875b0 100644
--- a/lib/gnulib.mk.in
+++ b/lib/gnulib.mk.in
@@ -34,13 +34,19 @@
34# --no-libtool \ 34# --no-libtool \
35# --macro-prefix=gl \ 35# --macro-prefix=gl \
36# --no-vc-files \ 36# --no-vc-files \
37# --avoid=btowc \
37# --avoid=close \ 38# --avoid=close \
38# --avoid=dup \ 39# --avoid=dup \
39# --avoid=fchdir \ 40# --avoid=fchdir \
40# --avoid=fstat \ 41# --avoid=fstat \
42# --avoid=langinfo \
43# --avoid=lock \
41# --avoid=malloc-posix \ 44# --avoid=malloc-posix \
45# --avoid=mbrtowc \
46# --avoid=mbsinit \
42# --avoid=msvc-inval \ 47# --avoid=msvc-inval \
43# --avoid=msvc-nothrow \ 48# --avoid=msvc-nothrow \
49# --avoid=nl_langinfo \
44# --avoid=openat-die \ 50# --avoid=openat-die \
45# --avoid=opendir \ 51# --avoid=opendir \
46# --avoid=raise \ 52# --avoid=raise \
@@ -56,6 +62,9 @@
56# --avoid=unsetenv \ 62# --avoid=unsetenv \
57# --avoid=utime \ 63# --avoid=utime \
58# --avoid=utime-h \ 64# --avoid=utime-h \
65# --avoid=wchar \
66# --avoid=wcrtomb \
67# --avoid=wctype-h \
59# alloca-opt \ 68# alloca-opt \
60# binary-io \ 69# binary-io \
61# byteswap \ 70# byteswap \
@@ -113,6 +122,7 @@
113# qcopy-acl \ 122# qcopy-acl \
114# readlink \ 123# readlink \
115# readlinkat \ 124# readlinkat \
125# regex \
116# sig2str \ 126# sig2str \
117# socklen \ 127# socklen \
118# stat-time \ 128# stat-time \
@@ -216,6 +226,7 @@ GETOPT_CDEFS_H = @GETOPT_CDEFS_H@
216GETOPT_H = @GETOPT_H@ 226GETOPT_H = @GETOPT_H@
217GFILENOTIFY_CFLAGS = @GFILENOTIFY_CFLAGS@ 227GFILENOTIFY_CFLAGS = @GFILENOTIFY_CFLAGS@
218GFILENOTIFY_LIBS = @GFILENOTIFY_LIBS@ 228GFILENOTIFY_LIBS = @GFILENOTIFY_LIBS@
229GLIBC21 = @GLIBC21@
219GL_COND_LIBTOOL = @GL_COND_LIBTOOL@ 230GL_COND_LIBTOOL = @GL_COND_LIBTOOL@
220GL_GENERATE_ALLOCA_H = @GL_GENERATE_ALLOCA_H@ 231GL_GENERATE_ALLOCA_H = @GL_GENERATE_ALLOCA_H@
221GL_GENERATE_BYTESWAP_H = @GL_GENERATE_BYTESWAP_H@ 232GL_GENERATE_BYTESWAP_H = @GL_GENERATE_BYTESWAP_H@
@@ -1024,6 +1035,7 @@ gameuser = @gameuser@
1024gl_GNULIB_ENABLED_03e0aaad4cb89ca757653bd367a6ccb7 = @gl_GNULIB_ENABLED_03e0aaad4cb89ca757653bd367a6ccb7@ 1035gl_GNULIB_ENABLED_03e0aaad4cb89ca757653bd367a6ccb7 = @gl_GNULIB_ENABLED_03e0aaad4cb89ca757653bd367a6ccb7@
1025gl_GNULIB_ENABLED_2049e887c7e5308faad27b3f894bb8c9 = @gl_GNULIB_ENABLED_2049e887c7e5308faad27b3f894bb8c9@ 1036gl_GNULIB_ENABLED_2049e887c7e5308faad27b3f894bb8c9 = @gl_GNULIB_ENABLED_2049e887c7e5308faad27b3f894bb8c9@
1026gl_GNULIB_ENABLED_260941c0e5dc67ec9e87d1fb321c300b = @gl_GNULIB_ENABLED_260941c0e5dc67ec9e87d1fb321c300b@ 1037gl_GNULIB_ENABLED_260941c0e5dc67ec9e87d1fb321c300b = @gl_GNULIB_ENABLED_260941c0e5dc67ec9e87d1fb321c300b@
1038gl_GNULIB_ENABLED_37f71b604aa9c54446783d80f42fe547 = @gl_GNULIB_ENABLED_37f71b604aa9c54446783d80f42fe547@
1027gl_GNULIB_ENABLED_5264294aa0a5557541b53c8c741f7f31 = @gl_GNULIB_ENABLED_5264294aa0a5557541b53c8c741f7f31@ 1039gl_GNULIB_ENABLED_5264294aa0a5557541b53c8c741f7f31 = @gl_GNULIB_ENABLED_5264294aa0a5557541b53c8c741f7f31@
1028gl_GNULIB_ENABLED_6099e9737f757db36c47fa9d9f02e88c = @gl_GNULIB_ENABLED_6099e9737f757db36c47fa9d9f02e88c@ 1040gl_GNULIB_ENABLED_6099e9737f757db36c47fa9d9f02e88c = @gl_GNULIB_ENABLED_6099e9737f757db36c47fa9d9f02e88c@
1029gl_GNULIB_ENABLED_682e609604ccaac6be382e4ee3a4eaec = @gl_GNULIB_ENABLED_682e609604ccaac6be382e4ee3a4eaec@ 1041gl_GNULIB_ENABLED_682e609604ccaac6be382e4ee3a4eaec = @gl_GNULIB_ENABLED_682e609604ccaac6be382e4ee3a4eaec@
@@ -2095,6 +2107,17 @@ EXTRA_libgnu_a_SOURCES += at-func.c readlinkat.c
2095endif 2107endif
2096## end gnulib module readlinkat 2108## end gnulib module readlinkat
2097 2109
2110## begin gnulib module regex
2111ifeq (,$(OMIT_GNULIB_MODULE_regex))
2112
2113
2114EXTRA_DIST += regcomp.c regex.c regex.h regex_internal.c regex_internal.h regexec.c
2115
2116EXTRA_libgnu_a_SOURCES += regcomp.c regex.c regex_internal.c regexec.c
2117
2118endif
2119## end gnulib module regex
2120
2098## begin gnulib module root-uid 2121## begin gnulib module root-uid
2099ifeq (,$(OMIT_GNULIB_MODULE_root-uid)) 2122ifeq (,$(OMIT_GNULIB_MODULE_root-uid))
2100 2123
diff --git a/lib/regcomp.c b/lib/regcomp.c
new file mode 100644
index 00000000000..53eb2263740
--- /dev/null
+++ b/lib/regcomp.c
@@ -0,0 +1,3944 @@
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002-2018 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public
8 License as published by the Free Software Foundation; either
9 version 3 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
15
16 You should have received a copy of the GNU General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
19
20#ifdef _LIBC
21# include <locale/weight.h>
22#endif
23
24static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
25 size_t length, reg_syntax_t syntax);
26static void re_compile_fastmap_iter (regex_t *bufp,
27 const re_dfastate_t *init_state,
28 char *fastmap);
29static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
30#ifdef RE_ENABLE_I18N
31static void free_charset (re_charset_t *cset);
32#endif /* RE_ENABLE_I18N */
33static void free_workarea_compile (regex_t *preg);
34static reg_errcode_t create_initial_state (re_dfa_t *dfa);
35#ifdef RE_ENABLE_I18N
36static void optimize_utf8 (re_dfa_t *dfa);
37#endif
38static reg_errcode_t analyze (regex_t *preg);
39static reg_errcode_t preorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
41 void *extra);
42static reg_errcode_t postorder (bin_tree_t *root,
43 reg_errcode_t (fn (void *, bin_tree_t *)),
44 void *extra);
45static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
46static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
47static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
48 bin_tree_t *node);
49static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
50static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
51static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
52static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
53static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
54 unsigned int constraint);
55static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
56static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
57 Idx node, bool root);
58static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
59static Idx fetch_number (re_string_t *input, re_token_t *token,
60 reg_syntax_t syntax);
61static int peek_token (re_token_t *token, re_string_t *input,
62 reg_syntax_t syntax);
63static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
64 reg_syntax_t syntax, reg_errcode_t *err);
65static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 Idx nest, reg_errcode_t *err);
68static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 Idx nest, reg_errcode_t *err);
71static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 Idx nest, reg_errcode_t *err);
74static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
75 re_token_t *token, reg_syntax_t syntax,
76 Idx nest, reg_errcode_t *err);
77static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
78 re_dfa_t *dfa, re_token_t *token,
79 reg_syntax_t syntax, reg_errcode_t *err);
80static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
81 re_token_t *token, reg_syntax_t syntax,
82 reg_errcode_t *err);
83static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
84 re_string_t *regexp,
85 re_token_t *token, int token_len,
86 re_dfa_t *dfa,
87 reg_syntax_t syntax,
88 bool accept_hyphen);
89static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
90 re_string_t *regexp,
91 re_token_t *token);
92#ifdef RE_ENABLE_I18N
93static reg_errcode_t build_equiv_class (bitset_t sbcset,
94 re_charset_t *mbcset,
95 Idx *equiv_class_alloc,
96 const unsigned char *name);
97static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
98 bitset_t sbcset,
99 re_charset_t *mbcset,
100 Idx *char_class_alloc,
101 const char *class_name,
102 reg_syntax_t syntax);
103#else /* not RE_ENABLE_I18N */
104static reg_errcode_t build_equiv_class (bitset_t sbcset,
105 const unsigned char *name);
106static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
107 bitset_t sbcset,
108 const char *class_name,
109 reg_syntax_t syntax);
110#endif /* not RE_ENABLE_I18N */
111static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
112 RE_TRANSLATE_TYPE trans,
113 const char *class_name,
114 const char *extra,
115 bool non_match, reg_errcode_t *err);
116static bin_tree_t *create_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 re_token_type_t type);
119static bin_tree_t *create_token_tree (re_dfa_t *dfa,
120 bin_tree_t *left, bin_tree_t *right,
121 const re_token_t *token);
122static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
123static void free_token (re_token_t *node);
124static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
125static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
126
127/* This table gives an error message for each of the error codes listed
128 in regex.h. The order here has to be the same as the order there.
129 POSIX doesn't require that we do anything for REG_NOERROR,
130 but why not be nice?

 All messages live back to back in this single char array, each one
 followed by an explicit "\0". Every REG_*_IDX macro defined below is
 the byte offset of its message: the previous offset plus the sizeof
 of the previous message literal (sizeof counts that literal's
 terminating null byte). When a message is changed, the literal in
 the following #define must be changed to match. */
131
132static const char __re_error_msgid[] =
133 {
134#define REG_NOERROR_IDX 0
135 gettext_noop ("Success") /* REG_NOERROR */
136 "\0"
137#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
138 gettext_noop ("No match") /* REG_NOMATCH */
139 "\0"
140#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
141 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
142 "\0"
143#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
144 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
145 "\0"
146#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
147 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
148 "\0"
149#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
150 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
151 "\0"
152#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
153 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
154 "\0"
155#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
156 gettext_noop ("Unmatched [, [^, [:, [., or [=") /* REG_EBRACK */
157 "\0"
158#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
159 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
160 "\0"
161#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
162 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
163 "\0"
164#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
165 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
166 "\0"
167#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
168 gettext_noop ("Invalid range end") /* REG_ERANGE */
169 "\0"
170#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
171 gettext_noop ("Memory exhausted") /* REG_ESPACE */
172 "\0"
173#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
174 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
175 "\0"
176#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
177 gettext_noop ("Premature end of regular expression") /* REG_EEND */
178 "\0"
179#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
180 gettext_noop ("Regular expression too big") /* REG_ESIZE */
181 "\0"
182#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
183 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
184 };
185
/* Byte offsets into __re_error_msgid, one entry per error code and in
   the same order as the messages in that array. The message for an
   error code CODE is found at
   __re_error_msgid + __re_error_msgid_idx[CODE]. */
186static const size_t __re_error_msgid_idx[] =
187 {
188 REG_NOERROR_IDX,
189 REG_NOMATCH_IDX,
190 REG_BADPAT_IDX,
191 REG_ECOLLATE_IDX,
192 REG_ECTYPE_IDX,
193 REG_EESCAPE_IDX,
194 REG_ESUBREG_IDX,
195 REG_EBRACK_IDX,
196 REG_EPAREN_IDX,
197 REG_EBRACE_IDX,
198 REG_BADBR_IDX,
199 REG_ERANGE_IDX,
200 REG_ESPACE_IDX,
201 REG_BADRPT_IDX,
202 REG_EEND_IDX,
203 REG_ESIZE_IDX,
204 REG_ERPAREN_IDX
205 };
206
207/* Entry points for GNU code. */
208
209/* re_compile_pattern is the GNU regular expression compiler: it
210 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
211 Returns 0 if the pattern was valid, otherwise an error string.
212
213 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
214 are set in BUFP on entry. */
215
216const char *
217re_compile_pattern (const char *pattern, size_t length,
218 struct re_pattern_buffer *bufp)
219{
220 reg_errcode_t ret;
221
222 /* And GNU code determines whether or not to get register information
223 by passing null for the REGS argument to re_match, etc., not by
224 setting no_sub, unless RE_NO_SUB is set. */
225 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
226
227 /* Match anchors at newline. */
228 bufp->newline_anchor = 1;
229
230 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
231
232 if (!ret)
233 return NULL;
234 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
235}
236#ifdef _LIBC
237weak_alias (__re_compile_pattern, re_compile_pattern)
238#endif
239
/* The regexp syntax that re_compile_pattern will recognize.  It is
   normally changed through 're_set_syntax', but it may also be assigned
   to directly: every pattern buffer records its own syntax, so the
   value can differ from one compilation to the next.  */
/* Deliberately left without an initializer: initialized variables in
   Emacs become read-only after dumping.  */
reg_syntax_t re_syntax_options;


/* Select the exact regexp syntax used for subsequent compilations, so
   that utilities with historically different, incompatible syntaxes
   can all be served.

   SYNTAX is a bit mask built from the bits defined in regex.h.  The
   syntax that was in effect before the call is returned.  */

reg_syntax_t
re_set_syntax (reg_syntax_t syntax)
{
  const reg_syntax_t previous = re_syntax_options;

  re_syntax_options = syntax;
  return previous;
}
#ifdef _LIBC
weak_alias (__re_set_syntax, re_set_syntax)
#endif
266
267int
268re_compile_fastmap (struct re_pattern_buffer *bufp)
269{
270 re_dfa_t *dfa = bufp->buffer;
271 char *fastmap = bufp->fastmap;
272
273 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
274 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
275 if (dfa->init_state != dfa->init_state_word)
276 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
277 if (dfa->init_state != dfa->init_state_nl)
278 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
279 if (dfa->init_state != dfa->init_state_begbuf)
280 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
281 bufp->fastmap_accurate = 1;
282 return 0;
283}
284#ifdef _LIBC
285weak_alias (__re_compile_fastmap, re_compile_fastmap)
286#endif
287
/* Mark byte value CH in FASTMAP.  When ICASE is true, the entry for
   tolower (CH) is marked as well.  */
static inline void
__attribute__ ((always_inline))
re_set_fastmap (char *fastmap, bool icase, int ch)
{
  fastmap[ch] = 1;

  if (!icase)
    return;

  fastmap[tolower (ch)] = 1;
}
296
297/* Helper function for re_compile_fastmap.
298 Compile fastmap for the initial_state INIT_STATE.

 Walks every node in INIT_STATE's node set and marks in FASTMAP the
 byte values that node can begin with, dispatching on the node type
 (CHARACTER, SIMPLE_BRACKET, and with RE_ENABLE_I18N also
 COMPLEX_BRACKET). Entries are only ever set here, never cleared.
 On reaching an OP_PERIOD (or OP_UTF8_PERIOD) or END_OF_RE node the
 first SBC_MAX entries of FASTMAP are all set and the function returns
 at once; for END_OF_RE it also sets BUFP->can_be_null. */
299
300static void
301re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
302 char *fastmap)
303{
304 re_dfa_t *dfa = bufp->buffer;
305 Idx node_cnt;
 /* tolower-based folding in re_set_fastmap is requested only when
    dfa->mb_cur_max is 1; the RE_ICASE paths for mb_cur_max > 1 below
    lower-case through __towlower instead. */
306 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
307 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
308 {
309 Idx node = init_state->nodes.elems[node_cnt];
310 re_token_type_t type = dfa->nodes[node].type;
311
312 if (type == CHARACTER)
313 {
314 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
315#ifdef RE_ENABLE_I18N
316 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
317 {
318 unsigned char buf[MB_LEN_MAX];
319 unsigned char *p;
320 wchar_t wc;
321 mbstate_t state;
322
 /* Collect this node's byte plus the bytes of the following
    CHARACTER nodes flagged mb_partial, convert the sequence to a
    wide character, and mark the first byte of its lower-case
    form once converted back to bytes. */
323 p = buf;
324 *p++ = dfa->nodes[node].opr.c;
325 while (++node < dfa->nodes_len
326 && dfa->nodes[node].type == CHARACTER
327 && dfa->nodes[node].mb_partial)
328 *p++ = dfa->nodes[node].opr.c;
329 memset (&state, '\0', sizeof (state));
330 if (__mbrtowc (&wc, (const char *) buf, p - buf,
331 &state) == p - buf
332 && (__wcrtomb ((char *) buf, __towlower (wc), &state)
333 != (size_t) -1))
334 re_set_fastmap (fastmap, false, buf[0]);
335 }
336#endif
337 }
338 else if (type == SIMPLE_BRACKET)
339 {
 /* Mark every byte value whose bit is set in the node's bitset;
    CH counts bit positions across all BITSET_WORDS words. */
340 int i, ch;
341 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
342 {
343 int j;
344 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
345 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
346 if (w & ((bitset_word_t) 1 << j))
347 re_set_fastmap (fastmap, icase, ch);
348 }
349 }
350#ifdef RE_ENABLE_I18N
351 else if (type == COMPLEX_BRACKET)
352 {
353 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
354 Idx i;
355
356# ifdef _LIBC
357 /* See if we have to try all bytes which start multiple collation
358 elements.
359 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
360 collation element, and don't catch 'b' since 'b' is
361 the only collation element which starts from 'b' (and
362 it is caught by SIMPLE_BRACKET). */
363 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
364 && (cset->ncoll_syms || cset->nranges))
365 {
366 const int32_t *table = (const int32_t *)
367 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
368 for (i = 0; i < SBC_MAX; ++i)
369 if (table[i] < 0)
370 re_set_fastmap (fastmap, icase, i);
371 }
372# endif /* _LIBC */
373
374 /* See if we have to start the match at all multibyte characters,
375 i.e. where we would not find an invalid sequence. This only
376 applies to multibyte character sets; for single byte character
377 sets, the SIMPLE_BRACKET again suffices. */
378 if (dfa->mb_cur_max > 1
379 && (cset->nchar_classes || cset->non_match || cset->nranges
380# ifdef _LIBC
381 || cset->nequiv_classes
382# endif /* _LIBC */
383 ))
384 {
 /* Try each byte value once (C wraps back to 0 after the last
    one) and mark those for which __mbrtowc returns (size_t) -2
    when given that single byte. */
385 unsigned char c = 0;
386 do
387 {
388 mbstate_t mbs;
389 memset (&mbs, 0, sizeof (mbs));
390 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
391 re_set_fastmap (fastmap, false, (int) c);
392 }
393 while (++c != 0);
394 }
395
396 else
397 {
398 /* ... Else catch all bytes which can start the mbchars. */
399 for (i = 0; i < cset->nmbchars; ++i)
400 {
401 char buf[256];
402 mbstate_t state;
403 memset (&state, '\0', sizeof (state));
404 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
405 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
406 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
407 {
408 if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
409 != (size_t) -1)
410 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
411 }
412 }
413 }
414 }
415#endif /* RE_ENABLE_I18N */
416 else if (type == OP_PERIOD
417#ifdef RE_ENABLE_I18N
418 || type == OP_UTF8_PERIOD
419#endif /* RE_ENABLE_I18N */
420 || type == END_OF_RE)
421 {
 /* Set the whole map and stop: no remaining node can add anything
    once every entry is set. */
422 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
423 if (type == END_OF_RE)
424 bufp->can_be_null = 1;
425 return;
426 }
427 }
428}
429
430/* Entry point for POSIX code. */
431/* regcomp takes a regular expression as a string and compiles it.
432
433 PREG is a regex_t *. We do not expect any fields to be initialized,
434 since POSIX says we shouldn't. Thus, we set
435
436 'buffer' to the compiled pattern;
437 'used' to the length of the compiled pattern;
438 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
439 REG_EXTENDED bit in CFLAGS is set; otherwise, to
440 RE_SYNTAX_POSIX_BASIC;
441 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
442 'fastmap' to an allocated space for the fastmap;
443 'fastmap_accurate' to zero;
444 're_nsub' to the number of subexpressions in PATTERN.
445
446 PATTERN is the address of the pattern string.
447
448 CFLAGS is a series of bits which affect compilation.
449
450 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
451 use POSIX basic syntax.
452
453 If REG_NEWLINE is set, then . and [^...] don't match newline.
454 Also, regexec will try a match beginning after every newline.
455
456 If REG_ICASE is set, then we considers upper- and lowercase
457 versions of letters to be equivalent when matching.
458
459 If REG_NOSUB is set, then when PREG is passed to regexec, that
460 routine will report only success or failure, and nothing about the
461 registers.
462
463 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
464 the return codes and their meanings.) */
465
466int
467regcomp (regex_t *_Restrict_ preg, const char *_Restrict_ pattern, int cflags)
468{
469 reg_errcode_t ret;
470 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
471 : RE_SYNTAX_POSIX_BASIC);
472
473 preg->buffer = NULL;
474 preg->allocated = 0;
475 preg->used = 0;
476
477 /* Try to allocate space for the fastmap. */
478 preg->fastmap = re_malloc (char, SBC_MAX);
479 if (BE (preg->fastmap == NULL, 0))
480 return REG_ESPACE;
481
482 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
483
484 /* If REG_NEWLINE is set, newlines are treated differently. */
485 if (cflags & REG_NEWLINE)
486 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
487 syntax &= ~RE_DOT_NEWLINE;
488 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
489 /* It also changes the matching behavior. */
490 preg->newline_anchor = 1;
491 }
492 else
493 preg->newline_anchor = 0;
494 preg->no_sub = !!(cflags & REG_NOSUB);
495 preg->translate = NULL;
496
497 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
498
499 /* POSIX doesn't distinguish between an unmatched open-group and an
500 unmatched close-group: both are REG_EPAREN. */
501 if (ret == REG_ERPAREN)
502 ret = REG_EPAREN;
503
504 /* We have already checked preg->fastmap != NULL. */
505 if (BE (ret == REG_NOERROR, 1))
506 /* Compute the fastmap now, since regexec cannot modify the pattern
507 buffer. This function never fails in this implementation. */
508 (void) re_compile_fastmap (preg);
509 else
510 {
511 /* Some error occurred while compiling the expression. */
512 re_free (preg->fastmap);
513 preg->fastmap = NULL;
514 }
515
516 return (int) ret;
517}
518#ifdef _LIBC
519libc_hidden_def (__regcomp)
520weak_alias (__regcomp, regcomp)
521#endif
522
523/* Returns a message corresponding to an error code, ERRCODE, returned
524 from either regcomp or regexec. We don't use PREG here. */
525
526size_t
527regerror (int errcode, const regex_t *_Restrict_ preg, char *_Restrict_ errbuf,
528 size_t errbuf_size)
529{
530 const char *msg;
531 size_t msg_size;
532
533 if (BE (errcode < 0
534 || errcode >= (int) (sizeof (__re_error_msgid_idx)
535 / sizeof (__re_error_msgid_idx[0])), 0))
536 /* Only error codes returned by the rest of the code should be passed
537 to this routine. If we are given anything else, or if other regex
538 code generates an invalid error code, then the program has a bug.
539 Dump core so we can fix it. */
540 abort ();
541
542 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
543
544 msg_size = strlen (msg) + 1; /* Includes the null. */
545
546 if (BE (errbuf_size != 0, 1))
547 {
548 size_t cpy_size = msg_size;
549 if (BE (msg_size > errbuf_size, 0))
550 {
551 cpy_size = errbuf_size - 1;
552 errbuf[cpy_size] = '\0';
553 }
554 memcpy (errbuf, msg, cpy_size);
555 }
556
557 return msg_size;
558}
559#ifdef _LIBC
560weak_alias (__regerror, regerror)
561#endif
562
563
#ifdef RE_ENABLE_I18N
/* Map of the byte values that are complete single-byte characters when
   the encoding is UTF-8, i.e. the first ASCII_CHARS (128) bits set and
   every other bit clear.  Keeping it as a shared constant avoids
   allocating and filling an identical bitset for every compiled
   pattern; UTF-8 is the preferred encoding so this is a worthwhile
   optimization.  */
static const bitset_t utf8_sb_map =
{
  /* Set the first 128 bits.  */
# if defined __GNUC__ && !defined __STRICT_ANSI__
  [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
# else
  /* Portable form: emit as many completely-set words as fit strictly
     below ASCII_CHARS bits ...  */
#  if 4 * BITSET_WORD_BITS < ASCII_CHARS
#   error "bitset_word_t is narrower than 32 bits"
#  elif 3 * BITSET_WORD_BITS < ASCII_CHARS
  BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
#  elif 2 * BITSET_WORD_BITS < ASCII_CHARS
  BITSET_WORD_MAX, BITSET_WORD_MAX,
#  elif 1 * BITSET_WORD_BITS < ASCII_CHARS
  BITSET_WORD_MAX,
#  endif
  /* ... followed by one word holding the remaining bits.  The count of
     remaining bits is derived from ASCII_CHARS (the size of the set
     being described), so the map is exactly 128 bits wide even when
     BITSET_WORD_BITS does not divide 128.  When it does divide 128 the
     shift is zero and the word is completely set.  */
  (BITSET_WORD_MAX
   >> (ASCII_CHARS % BITSET_WORD_BITS == 0
       ? 0
       : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS))
# endif
};
#endif
591
592
593static void
594free_dfa_content (re_dfa_t *dfa)
595{
596 Idx i, j;
597
598 if (dfa->nodes)
599 for (i = 0; i < dfa->nodes_len; ++i)
600 free_token (dfa->nodes + i);
601 re_free (dfa->nexts);
602 for (i = 0; i < dfa->nodes_len; ++i)
603 {
604 if (dfa->eclosures != NULL)
605 re_node_set_free (dfa->eclosures + i);
606 if (dfa->inveclosures != NULL)
607 re_node_set_free (dfa->inveclosures + i);
608 if (dfa->edests != NULL)
609 re_node_set_free (dfa->edests + i);
610 }
611 re_free (dfa->edests);
612 re_free (dfa->eclosures);
613 re_free (dfa->inveclosures);
614 re_free (dfa->nodes);
615
616 if (dfa->state_table)
617 for (i = 0; i <= dfa->state_hash_mask; ++i)
618 {
619 struct re_state_table_entry *entry = dfa->state_table + i;
620 for (j = 0; j < entry->num; ++j)
621 {
622 re_dfastate_t *state = entry->array[j];
623 free_state (state);
624 }
625 re_free (entry->array);
626 }
627 re_free (dfa->state_table);
628#ifdef RE_ENABLE_I18N
629 if (dfa->sb_char != utf8_sb_map)
630 re_free (dfa->sb_char);
631#endif
632 re_free (dfa->subexp_map);
633#ifdef DEBUG
634 re_free (dfa->re_str);
635#endif
636
637 re_free (dfa);
638}
639
640
641/* Free dynamically allocated space used by PREG. */
642
643void
644regfree (regex_t *preg)
645{
646 re_dfa_t *dfa = preg->buffer;
647 if (BE (dfa != NULL, 1))
648 {
649 lock_fini (dfa->lock);
650 free_dfa_content (dfa);
651 }
652 preg->buffer = NULL;
653 preg->allocated = 0;
654
655 re_free (preg->fastmap);
656 preg->fastmap = NULL;
657
658 re_free (preg->translate);
659 preg->translate = NULL;
660}
661#ifdef _LIBC
662libc_hidden_def (__regfree)
663weak_alias (__regfree, regfree)
664#endif
665
666/* Entry points compatible with 4.2 BSD regex library. We don't define
667 them unless specifically requested. */
668
#if defined _REGEX_RE_COMP || defined _LIBC

/* The 4.2 BSD interface works on a single, implicit pattern buffer.  */
static struct re_pattern_buffer re_comp_buf;

/* Compile S into the implicit BSD pattern buffer using the global
   re_syntax_options.  Passing a null S asks whether a pattern has
   already been compiled.

   Returns a null pointer on success, otherwise a (possibly localized)
   message describing the problem.  */
char *
# ifdef _LIBC
/* Make these definitions weak in libc, so POSIX programs can redefine
   these names if they don't use our functions, and still use
   regcomp/regexec above without link errors.  */
weak_function
# endif
re_comp (const char *s)
{
  reg_errcode_t status;

  if (s == NULL)
    {
      /* Query only: succeed iff a pattern was compiled earlier.  */
      if (re_comp_buf.buffer)
        return 0;
      return gettext ("No previous regular expression");
    }

  if (re_comp_buf.buffer)
    {
      /* Discard the previous pattern, but detach the fastmap first so
         that it survives the free and the subsequent wipe.  */
      char *kept_fastmap = re_comp_buf.fastmap;
      re_comp_buf.fastmap = NULL;
      __regfree (&re_comp_buf);
      memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
      re_comp_buf.fastmap = kept_fastmap;
    }

  if (re_comp_buf.fastmap == NULL)
    {
      re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
      if (re_comp_buf.fastmap == NULL)
        return (char *) gettext (__re_error_msgid
                                 + __re_error_msgid_idx[(int) REG_ESPACE]);
    }

  /* Since 're_exec' always passes NULL for the 'regs' argument, we
     don't need to initialize the pattern buffer fields which affect it.  */

  /* Match anchors at newlines.  */
  re_comp_buf.newline_anchor = 1;

  status = re_compile_internal (&re_comp_buf, s, strlen (s),
                                re_syntax_options);
  if (status == REG_NOERROR)
    return NULL;

  /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
  return (char *) gettext (__re_error_msgid
                           + __re_error_msgid_idx[(int) status]);
}

#ifdef _LIBC
/* Hand the implicit buffer's storage back via libc's freeres hook.  */
libc_freeres_fn (free_mem)
{
  __regfree (&re_comp_buf);
}
#endif

#endif /* _REGEX_RE_COMP */
733
734/* Internal entry point.
735 Compile the regular expression PATTERN, whose length is LENGTH.
736 SYNTAX indicates the regular expression's syntax. */
737
738static reg_errcode_t
739re_compile_internal (regex_t *preg, const char * pattern, size_t length,
740 reg_syntax_t syntax)
741{
742 reg_errcode_t err = REG_NOERROR;
743 re_dfa_t *dfa;
744 re_string_t regexp;
745
746 /* Initialize the pattern buffer. */
747 preg->fastmap_accurate = 0;
748 preg->syntax = syntax;
749 preg->not_bol = preg->not_eol = 0;
750 preg->used = 0;
751 preg->re_nsub = 0;
752 preg->can_be_null = 0;
753 preg->regs_allocated = REGS_UNALLOCATED;
754
755 /* Initialize the dfa. */
756 dfa = preg->buffer;
757 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
758 {
759 /* If zero allocated, but buffer is non-null, try to realloc
760 enough space. This loses if buffer's address is bogus, but
761 that is the user's responsibility. If ->buffer is NULL this
762 is a simple allocation. */
763 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
764 if (dfa == NULL)
765 return REG_ESPACE;
766 preg->allocated = sizeof (re_dfa_t);
767 preg->buffer = dfa;
768 }
769 preg->used = sizeof (re_dfa_t);
770
771 err = init_dfa (dfa, length);
772 if (BE (err == REG_NOERROR && lock_init (dfa->lock) != 0, 0))
773 err = REG_ESPACE;
774 if (BE (err != REG_NOERROR, 0))
775 {
776 free_dfa_content (dfa);
777 preg->buffer = NULL;
778 preg->allocated = 0;
779 return err;
780 }
781#ifdef DEBUG
782 /* Note: length+1 will not overflow since it is checked in init_dfa. */
783 dfa->re_str = re_malloc (char, length + 1);
784 strncpy (dfa->re_str, pattern, length + 1);
785#endif
786
787 err = re_string_construct (&regexp, pattern, length, preg->translate,
788 (syntax & RE_ICASE) != 0, dfa);
789 if (BE (err != REG_NOERROR, 0))
790 {
791 re_compile_internal_free_return:
792 free_workarea_compile (preg);
793 re_string_destruct (&regexp);
794 lock_fini (dfa->lock);
795 free_dfa_content (dfa);
796 preg->buffer = NULL;
797 preg->allocated = 0;
798 return err;
799 }
800
801 /* Parse the regular expression, and build a structure tree. */
802 preg->re_nsub = 0;
803 dfa->str_tree = parse (&regexp, preg, syntax, &err);
804 if (BE (dfa->str_tree == NULL, 0))
805 goto re_compile_internal_free_return;
806
807 /* Analyze the tree and create the nfa. */
808 err = analyze (preg);
809 if (BE (err != REG_NOERROR, 0))
810 goto re_compile_internal_free_return;
811
812#ifdef RE_ENABLE_I18N
813 /* If possible, do searching in single byte encoding to speed things up. */
814 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
815 optimize_utf8 (dfa);
816#endif
817
818 /* Then create the initial state of the dfa. */
819 err = create_initial_state (dfa);
820
821 /* Release work areas. */
822 free_workarea_compile (preg);
823 re_string_destruct (&regexp);
824
825 if (BE (err != REG_NOERROR, 0))
826 {
827 lock_fini (dfa->lock);
828 free_dfa_content (dfa);
829 preg->buffer = NULL;
830 preg->allocated = 0;
831 }
832
833 return err;
834}
835
836/* Initialize DFA. We use the length of the regular expression PAT_LEN
837 as the initial length of some arrays. */
838
839static reg_errcode_t
840init_dfa (re_dfa_t *dfa, size_t pat_len)
841{
842 __re_size_t table_size;
843#ifndef _LIBC
844 const char *codeset_name;
845#endif
846#ifdef RE_ENABLE_I18N
847 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
848#else
849 size_t max_i18n_object_size = 0;
850#endif
851 size_t max_object_size =
852 MAX (sizeof (struct re_state_table_entry),
853 MAX (sizeof (re_token_t),
854 MAX (sizeof (re_node_set),
855 MAX (sizeof (regmatch_t),
856 max_i18n_object_size))));
857
858 memset (dfa, '\0', sizeof (re_dfa_t));
859
860 /* Force allocation of str_tree_storage the first time. */
861 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
862
863 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
864 calculation below, and for similar doubling calculations
865 elsewhere. And it's <= rather than <, because some of the
866 doubling calculations add 1 afterwards. */
867 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
868 return REG_ESPACE;
869
870 dfa->nodes_alloc = pat_len + 1;
871 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
872
873 /* table_size = 2 ^ ceil(log pat_len) */
874 for (table_size = 1; ; table_size <<= 1)
875 if (table_size > pat_len)
876 break;
877
878 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
879 dfa->state_hash_mask = table_size - 1;
880
881 dfa->mb_cur_max = MB_CUR_MAX;
882#ifdef _LIBC
883 if (dfa->mb_cur_max == 6
884 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
885 dfa->is_utf8 = 1;
886 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
887 != 0);
888#else
889 codeset_name = nl_langinfo (CODESET);
890 if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
891 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
892 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
893 && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
894 dfa->is_utf8 = 1;
895
896 /* We check exhaustively in the loop below if this charset is a
897 superset of ASCII. */
898 dfa->map_notascii = 0;
899#endif
900
901#ifdef RE_ENABLE_I18N
902 if (dfa->mb_cur_max > 1)
903 {
904 if (dfa->is_utf8)
905 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
906 else
907 {
908 int i, j, ch;
909
910 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
911 if (BE (dfa->sb_char == NULL, 0))
912 return REG_ESPACE;
913
914 /* Set the bits corresponding to single byte chars. */
915 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
916 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
917 {
918 wint_t wch = __btowc (ch);
919 if (wch != WEOF)
920 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
921# ifndef _LIBC
922 if (isascii (ch) && wch != ch)
923 dfa->map_notascii = 1;
924# endif
925 }
926 }
927 }
928#endif
929
930 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
931 return REG_ESPACE;
932 return REG_NOERROR;
933}
934
935/* Initialize the WORD_CHAR table, which indicates which characters are
936 "word" characters. Here "word" means a word-constituent character as
937 used by operators like "\<", "\>", etc. */
938
939static void
940init_word_char (re_dfa_t *dfa)
941{
942 int i = 0;
943 int j;
944 int ch = 0;
945 dfa->word_ops_used = 1;
946 if (BE (dfa->map_notascii == 0, 1))
947 {
948 /* Avoid uint32_t and uint64_t as some non-GCC platforms lack
949 them, an issue when this code is used in Gnulib. */
950 bitset_word_t bits0 = 0x00000000;
951 bitset_word_t bits1 = 0x03ff0000;
952 bitset_word_t bits2 = 0x87fffffe;
953 bitset_word_t bits3 = 0x07fffffe;
954 if (BITSET_WORD_BITS == 64)
955 {
956 /* Pacify gcc -Woverflow on 32-bit platformns. */
957 dfa->word_char[0] = bits1 << 31 << 1 | bits0;
958 dfa->word_char[1] = bits3 << 31 << 1 | bits2;
959 i = 2;
960 }
961 else if (BITSET_WORD_BITS == 32)
962 {
963 dfa->word_char[0] = bits0;
964 dfa->word_char[1] = bits1;
965 dfa->word_char[2] = bits2;
966 dfa->word_char[3] = bits3;
967 i = 4;
968 }
969 else
970 goto general_case;
971 ch = 128;
972
973 if (BE (dfa->is_utf8, 1))
974 {
975 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
976 return;
977 }
978 }
979
980 general_case:
981 for (; i < BITSET_WORDS; ++i)
982 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
983 if (isalnum (ch) || ch == '_')
984 dfa->word_char[i] |= (bitset_word_t) 1 << j;
985}
986
987/* Free the work areas which are only used while compiling. */
988
989static void
990free_workarea_compile (regex_t *preg)
991{
992 re_dfa_t *dfa = preg->buffer;
993 bin_tree_storage_t *storage, *next;
994 for (storage = dfa->str_tree_storage; storage; storage = next)
995 {
996 next = storage->next;
997 re_free (storage);
998 }
999 dfa->str_tree_storage = NULL;
1000 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1001 dfa->str_tree = NULL;
1002 re_free (dfa->org_indices);
1003 dfa->org_indices = NULL;
1004}
1005
1006/* Create initial states for all contexts. */
1007
1008static reg_errcode_t
1009create_initial_state (re_dfa_t *dfa)
1010{
1011 Idx first, i;
1012 reg_errcode_t err;
1013 re_node_set init_nodes;
1014
1015 /* Initial states have the epsilon closure of the node which is
1016 the first node of the regular expression. */
1017 first = dfa->str_tree->first->node_idx;
1018 dfa->init_node = first;
1019 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1020 if (BE (err != REG_NOERROR, 0))
1021 return err;
1022
1023 /* The back-references which are in initial states can epsilon transit,
1024 since in this case all of the subexpressions can be null.
1025 Then we add epsilon closures of the nodes which are the next nodes of
1026 the back-references. */
1027 if (dfa->nbackref > 0)
1028 for (i = 0; i < init_nodes.nelem; ++i)
1029 {
1030 Idx node_idx = init_nodes.elems[i];
1031 re_token_type_t type = dfa->nodes[node_idx].type;
1032
1033 Idx clexp_idx;
1034 if (type != OP_BACK_REF)
1035 continue;
1036 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1037 {
1038 re_token_t *clexp_node;
1039 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1040 if (clexp_node->type == OP_CLOSE_SUBEXP
1041 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1042 break;
1043 }
1044 if (clexp_idx == init_nodes.nelem)
1045 continue;
1046
1047 if (type == OP_BACK_REF)
1048 {
1049 Idx dest_idx = dfa->edests[node_idx].elems[0];
1050 if (!re_node_set_contains (&init_nodes, dest_idx))
1051 {
1052 reg_errcode_t merge_err
1053 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1054 if (merge_err != REG_NOERROR)
1055 return merge_err;
1056 i = 0;
1057 }
1058 }
1059 }
1060
1061 /* It must be the first time to invoke acquire_state. */
1062 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1063 /* We don't check ERR here, since the initial state must not be NULL. */
1064 if (BE (dfa->init_state == NULL, 0))
1065 return err;
1066 if (dfa->init_state->has_constraint)
1067 {
1068 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1069 CONTEXT_WORD);
1070 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1071 CONTEXT_NEWLINE);
1072 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1073 &init_nodes,
1074 CONTEXT_NEWLINE
1075 | CONTEXT_BEGBUF);
1076 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1077 || dfa->init_state_begbuf == NULL, 0))
1078 return err;
1079 }
1080 else
1081 dfa->init_state_word = dfa->init_state_nl
1082 = dfa->init_state_begbuf = dfa->init_state;
1083
1084 re_node_set_free (&init_nodes);
1085 return REG_NOERROR;
1086}
1087
1088#ifdef RE_ENABLE_I18N
1089/* If it is possible to do searching in single byte encoding instead of UTF-8
1090 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1091 DFA nodes where needed. */
1092
1093static void
1094optimize_utf8 (re_dfa_t *dfa)
1095{
1096 Idx node;
1097 int i;
1098 bool mb_chars = false;
1099 bool has_period = false;
1100
1101 for (node = 0; node < dfa->nodes_len; ++node)
1102 switch (dfa->nodes[node].type)
1103 {
1104 case CHARACTER:
1105 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1106 mb_chars = true;
1107 break;
1108 case ANCHOR:
1109 switch (dfa->nodes[node].opr.ctx_type)
1110 {
1111 case LINE_FIRST:
1112 case LINE_LAST:
1113 case BUF_FIRST:
1114 case BUF_LAST:
1115 break;
1116 default:
1117 /* Word anchors etc. cannot be handled. It's okay to test
1118 opr.ctx_type since constraints (for all DFA nodes) are
1119 created by ORing one or more opr.ctx_type values. */
1120 return;
1121 }
1122 break;
1123 case OP_PERIOD:
1124 has_period = true;
1125 break;
1126 case OP_BACK_REF:
1127 case OP_ALT:
1128 case END_OF_RE:
1129 case OP_DUP_ASTERISK:
1130 case OP_OPEN_SUBEXP:
1131 case OP_CLOSE_SUBEXP:
1132 break;
1133 case COMPLEX_BRACKET:
1134 return;
1135 case SIMPLE_BRACKET:
1136 /* Just double check. */
1137 {
1138 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1139 ? 0
1140 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1141 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1142 {
1143 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1144 return;
1145 rshift = 0;
1146 }
1147 }
1148 break;
1149 default:
1150 abort ();
1151 }
1152
1153 if (mb_chars || has_period)
1154 for (node = 0; node < dfa->nodes_len; ++node)
1155 {
1156 if (dfa->nodes[node].type == CHARACTER
1157 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1158 dfa->nodes[node].mb_partial = 0;
1159 else if (dfa->nodes[node].type == OP_PERIOD)
1160 dfa->nodes[node].type = OP_UTF8_PERIOD;
1161 }
1162
1163 /* The search can be in single byte locale. */
1164 dfa->mb_cur_max = 1;
1165 dfa->is_utf8 = 0;
1166 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1167}
1168#endif
1169
1170/* Analyze the structure tree, and calculate "first", "next", "edest",
1171 "eclosure", and "inveclosure". */
1172
1173static reg_errcode_t
1174analyze (regex_t *preg)
1175{
1176 re_dfa_t *dfa = preg->buffer;
1177 reg_errcode_t ret;
1178
1179 /* Allocate arrays. */
1180 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1181 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1182 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1183 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1184 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1185 || dfa->eclosures == NULL, 0))
1186 return REG_ESPACE;
1187
1188 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1189 if (dfa->subexp_map != NULL)
1190 {
1191 Idx i;
1192 for (i = 0; i < preg->re_nsub; i++)
1193 dfa->subexp_map[i] = i;
1194 preorder (dfa->str_tree, optimize_subexps, dfa);
1195 for (i = 0; i < preg->re_nsub; i++)
1196 if (dfa->subexp_map[i] != i)
1197 break;
1198 if (i == preg->re_nsub)
1199 {
1200 re_free (dfa->subexp_map);
1201 dfa->subexp_map = NULL;
1202 }
1203 }
1204
1205 ret = postorder (dfa->str_tree, lower_subexps, preg);
1206 if (BE (ret != REG_NOERROR, 0))
1207 return ret;
1208 ret = postorder (dfa->str_tree, calc_first, dfa);
1209 if (BE (ret != REG_NOERROR, 0))
1210 return ret;
1211 preorder (dfa->str_tree, calc_next, dfa);
1212 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1213 if (BE (ret != REG_NOERROR, 0))
1214 return ret;
1215 ret = calc_eclosure (dfa);
1216 if (BE (ret != REG_NOERROR, 0))
1217 return ret;
1218
1219 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1220 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1221 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1222 || dfa->nbackref)
1223 {
1224 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1225 if (BE (dfa->inveclosures == NULL, 0))
1226 return REG_ESPACE;
1227 ret = calc_inveclosure (dfa);
1228 }
1229
1230 return ret;
1231}
1232
1233/* Our parse trees are very unbalanced, so we cannot use a stack to
1234 implement parse tree visits. Instead, we use parent pointers and
1235 some hairy code in these two functions. */
1236static reg_errcode_t
1237postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1238 void *extra)
1239{
1240 bin_tree_t *node, *prev;
1241
1242 for (node = root; ; )
1243 {
1244 /* Descend down the tree, preferably to the left (or to the right
1245 if that's the only child). */
1246 while (node->left || node->right)
1247 if (node->left)
1248 node = node->left;
1249 else
1250 node = node->right;
1251
1252 do
1253 {
1254 reg_errcode_t err = fn (extra, node);
1255 if (BE (err != REG_NOERROR, 0))
1256 return err;
1257 if (node->parent == NULL)
1258 return REG_NOERROR;
1259 prev = node;
1260 node = node->parent;
1261 }
1262 /* Go up while we have a node that is reached from the right. */
1263 while (node->right == prev || node->right == NULL);
1264 node = node->right;
1265 }
1266}
1267
1268static reg_errcode_t
1269preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1270 void *extra)
1271{
1272 bin_tree_t *node;
1273
1274 for (node = root; ; )
1275 {
1276 reg_errcode_t err = fn (extra, node);
1277 if (BE (err != REG_NOERROR, 0))
1278 return err;
1279
1280 /* Go to the left node, or up and to the right. */
1281 if (node->left)
1282 node = node->left;
1283 else
1284 {
1285 bin_tree_t *prev = NULL;
1286 while (node->right == prev || node->right == NULL)
1287 {
1288 prev = node;
1289 node = node->parent;
1290 if (!node)
1291 return REG_NOERROR;
1292 }
1293 node = node->right;
1294 }
1295 }
1296}
1297
1298/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1299 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1300 backreferences as well. Requires a preorder visit. */
1301static reg_errcode_t
1302optimize_subexps (void *extra, bin_tree_t *node)
1303{
1304 re_dfa_t *dfa = (re_dfa_t *) extra;
1305
1306 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1307 {
1308 int idx = node->token.opr.idx;
1309 node->token.opr.idx = dfa->subexp_map[idx];
1310 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1311 }
1312
1313 else if (node->token.type == SUBEXP
1314 && node->left && node->left->token.type == SUBEXP)
1315 {
1316 Idx other_idx = node->left->token.opr.idx;
1317
1318 node->left = node->left->left;
1319 if (node->left)
1320 node->left->parent = node;
1321
1322 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1323 if (other_idx < BITSET_WORD_BITS)
1324 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1325 }
1326
1327 return REG_NOERROR;
1328}
1329
1330/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1331 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1332static reg_errcode_t
1333lower_subexps (void *extra, bin_tree_t *node)
1334{
1335 regex_t *preg = (regex_t *) extra;
1336 reg_errcode_t err = REG_NOERROR;
1337
1338 if (node->left && node->left->token.type == SUBEXP)
1339 {
1340 node->left = lower_subexp (&err, preg, node->left);
1341 if (node->left)
1342 node->left->parent = node;
1343 }
1344 if (node->right && node->right->token.type == SUBEXP)
1345 {
1346 node->right = lower_subexp (&err, preg, node->right);
1347 if (node->right)
1348 node->right->parent = node;
1349 }
1350
1351 return err;
1352}
1353
1354static bin_tree_t *
1355lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1356{
1357 re_dfa_t *dfa = preg->buffer;
1358 bin_tree_t *body = node->left;
1359 bin_tree_t *op, *cls, *tree1, *tree;
1360
1361 if (preg->no_sub
1362 /* We do not optimize empty subexpressions, because otherwise we may
1363 have bad CONCAT nodes with NULL children. This is obviously not
1364 very common, so we do not lose much. An example that triggers
1365 this case is the sed "script" /\(\)/x. */
1366 && node->left != NULL
1367 && (node->token.opr.idx >= BITSET_WORD_BITS
1368 || !(dfa->used_bkref_map
1369 & ((bitset_word_t) 1 << node->token.opr.idx))))
1370 return node->left;
1371
1372 /* Convert the SUBEXP node to the concatenation of an
1373 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1374 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1375 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1376 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1377 tree = create_tree (dfa, op, tree1, CONCAT);
1378 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1379 {
1380 *err = REG_ESPACE;
1381 return NULL;
1382 }
1383
1384 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1385 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1386 return tree;
1387}
1388
1389/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1390 nodes. Requires a postorder visit. */
1391static reg_errcode_t
1392calc_first (void *extra, bin_tree_t *node)
1393{
1394 re_dfa_t *dfa = (re_dfa_t *) extra;
1395 if (node->token.type == CONCAT)
1396 {
1397 node->first = node->left->first;
1398 node->node_idx = node->left->node_idx;
1399 }
1400 else
1401 {
1402 node->first = node;
1403 node->node_idx = re_dfa_add_node (dfa, node->token);
1404 if (BE (node->node_idx == -1, 0))
1405 return REG_ESPACE;
1406 if (node->token.type == ANCHOR)
1407 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1408 }
1409 return REG_NOERROR;
1410}
1411
1412/* Pass 2: compute NEXT on the tree. Preorder visit. */
1413static reg_errcode_t
1414calc_next (void *extra, bin_tree_t *node)
1415{
1416 switch (node->token.type)
1417 {
1418 case OP_DUP_ASTERISK:
1419 node->left->next = node;
1420 break;
1421 case CONCAT:
1422 node->left->next = node->right->first;
1423 node->right->next = node->next;
1424 break;
1425 default:
1426 if (node->left)
1427 node->left->next = node->next;
1428 if (node->right)
1429 node->right->next = node->next;
1430 break;
1431 }
1432 return REG_NOERROR;
1433}
1434
1435/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1436static reg_errcode_t
1437link_nfa_nodes (void *extra, bin_tree_t *node)
1438{
1439 re_dfa_t *dfa = (re_dfa_t *) extra;
1440 Idx idx = node->node_idx;
1441 reg_errcode_t err = REG_NOERROR;
1442
1443 switch (node->token.type)
1444 {
1445 case CONCAT:
1446 break;
1447
1448 case END_OF_RE:
1449 assert (node->next == NULL);
1450 break;
1451
1452 case OP_DUP_ASTERISK:
1453 case OP_ALT:
1454 {
1455 Idx left, right;
1456 dfa->has_plural_match = 1;
1457 if (node->left != NULL)
1458 left = node->left->first->node_idx;
1459 else
1460 left = node->next->node_idx;
1461 if (node->right != NULL)
1462 right = node->right->first->node_idx;
1463 else
1464 right = node->next->node_idx;
1465 assert (left > -1);
1466 assert (right > -1);
1467 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1468 }
1469 break;
1470
1471 case ANCHOR:
1472 case OP_OPEN_SUBEXP:
1473 case OP_CLOSE_SUBEXP:
1474 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1475 break;
1476
1477 case OP_BACK_REF:
1478 dfa->nexts[idx] = node->next->node_idx;
1479 if (node->token.type == OP_BACK_REF)
1480 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1481 break;
1482
1483 default:
1484 assert (!IS_EPSILON_NODE (node->token.type));
1485 dfa->nexts[idx] = node->next->node_idx;
1486 break;
1487 }
1488
1489 return err;
1490}
1491
1492/* Duplicate the epsilon closure of the node ROOT_NODE.
1493 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1494 to their own constraint. */
1495
1496static reg_errcode_t
1497duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1498 Idx root_node, unsigned int init_constraint)
1499{
1500 Idx org_node, clone_node;
1501 bool ok;
1502 unsigned int constraint = init_constraint;
1503 for (org_node = top_org_node, clone_node = top_clone_node;;)
1504 {
1505 Idx org_dest, clone_dest;
1506 if (dfa->nodes[org_node].type == OP_BACK_REF)
1507 {
1508 /* If the back reference epsilon-transit, its destination must
1509 also have the constraint. Then duplicate the epsilon closure
1510 of the destination of the back reference, and store it in
1511 edests of the back reference. */
1512 org_dest = dfa->nexts[org_node];
1513 re_node_set_empty (dfa->edests + clone_node);
1514 clone_dest = duplicate_node (dfa, org_dest, constraint);
1515 if (BE (clone_dest == -1, 0))
1516 return REG_ESPACE;
1517 dfa->nexts[clone_node] = dfa->nexts[org_node];
1518 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1519 if (BE (! ok, 0))
1520 return REG_ESPACE;
1521 }
1522 else if (dfa->edests[org_node].nelem == 0)
1523 {
1524 /* In case of the node can't epsilon-transit, don't duplicate the
1525 destination and store the original destination as the
1526 destination of the node. */
1527 dfa->nexts[clone_node] = dfa->nexts[org_node];
1528 break;
1529 }
1530 else if (dfa->edests[org_node].nelem == 1)
1531 {
1532 /* In case of the node can epsilon-transit, and it has only one
1533 destination. */
1534 org_dest = dfa->edests[org_node].elems[0];
1535 re_node_set_empty (dfa->edests + clone_node);
1536 /* If the node is root_node itself, it means the epsilon closure
1537 has a loop. Then tie it to the destination of the root_node. */
1538 if (org_node == root_node && clone_node != org_node)
1539 {
1540 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1541 if (BE (! ok, 0))
1542 return REG_ESPACE;
1543 break;
1544 }
1545 /* In case the node has another constraint, append it. */
1546 constraint |= dfa->nodes[org_node].constraint;
1547 clone_dest = duplicate_node (dfa, org_dest, constraint);
1548 if (BE (clone_dest == -1, 0))
1549 return REG_ESPACE;
1550 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1551 if (BE (! ok, 0))
1552 return REG_ESPACE;
1553 }
1554 else /* dfa->edests[org_node].nelem == 2 */
1555 {
1556 /* In case of the node can epsilon-transit, and it has two
1557 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1558 org_dest = dfa->edests[org_node].elems[0];
1559 re_node_set_empty (dfa->edests + clone_node);
1560 /* Search for a duplicated node which satisfies the constraint. */
1561 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1562 if (clone_dest == -1)
1563 {
1564 /* There is no such duplicated node, create a new one. */
1565 reg_errcode_t err;
1566 clone_dest = duplicate_node (dfa, org_dest, constraint);
1567 if (BE (clone_dest == -1, 0))
1568 return REG_ESPACE;
1569 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1570 if (BE (! ok, 0))
1571 return REG_ESPACE;
1572 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1573 root_node, constraint);
1574 if (BE (err != REG_NOERROR, 0))
1575 return err;
1576 }
1577 else
1578 {
1579 /* There is a duplicated node which satisfies the constraint,
1580 use it to avoid infinite loop. */
1581 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1582 if (BE (! ok, 0))
1583 return REG_ESPACE;
1584 }
1585
1586 org_dest = dfa->edests[org_node].elems[1];
1587 clone_dest = duplicate_node (dfa, org_dest, constraint);
1588 if (BE (clone_dest == -1, 0))
1589 return REG_ESPACE;
1590 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1591 if (BE (! ok, 0))
1592 return REG_ESPACE;
1593 }
1594 org_node = org_dest;
1595 clone_node = clone_dest;
1596 }
1597 return REG_NOERROR;
1598}
1599
1600/* Search for a node which is duplicated from the node ORG_NODE, and
1601 satisfies the constraint CONSTRAINT. */
1602
1603static Idx
1604search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1605 unsigned int constraint)
1606{
1607 Idx idx;
1608 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1609 {
1610 if (org_node == dfa->org_indices[idx]
1611 && constraint == dfa->nodes[idx].constraint)
1612 return idx; /* Found. */
1613 }
1614 return -1; /* Not found. */
1615}
1616
1617/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1618 Return the index of the new node, or -1 if insufficient storage is
1619 available. */
1620
1621static Idx
1622duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1623{
1624 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1625 if (BE (dup_idx != -1, 1))
1626 {
1627 dfa->nodes[dup_idx].constraint = constraint;
1628 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1629 dfa->nodes[dup_idx].duplicated = 1;
1630
1631 /* Store the index of the original node. */
1632 dfa->org_indices[dup_idx] = org_idx;
1633 }
1634 return dup_idx;
1635}
1636
1637static reg_errcode_t
1638calc_inveclosure (re_dfa_t *dfa)
1639{
1640 Idx src, idx;
1641 bool ok;
1642 for (idx = 0; idx < dfa->nodes_len; ++idx)
1643 re_node_set_init_empty (dfa->inveclosures + idx);
1644
1645 for (src = 0; src < dfa->nodes_len; ++src)
1646 {
1647 Idx *elems = dfa->eclosures[src].elems;
1648 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1649 {
1650 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1651 if (BE (! ok, 0))
1652 return REG_ESPACE;
1653 }
1654 }
1655
1656 return REG_NOERROR;
1657}
1658
1659/* Calculate "eclosure" for all the node in DFA. */
1660
1661static reg_errcode_t
1662calc_eclosure (re_dfa_t *dfa)
1663{
1664 Idx node_idx;
1665 bool incomplete;
1666#ifdef DEBUG
1667 assert (dfa->nodes_len > 0);
1668#endif
1669 incomplete = false;
1670 /* For each nodes, calculate epsilon closure. */
1671 for (node_idx = 0; ; ++node_idx)
1672 {
1673 reg_errcode_t err;
1674 re_node_set eclosure_elem;
1675 if (node_idx == dfa->nodes_len)
1676 {
1677 if (!incomplete)
1678 break;
1679 incomplete = false;
1680 node_idx = 0;
1681 }
1682
1683#ifdef DEBUG
1684 assert (dfa->eclosures[node_idx].nelem != -1);
1685#endif
1686
1687 /* If we have already calculated, skip it. */
1688 if (dfa->eclosures[node_idx].nelem != 0)
1689 continue;
1690 /* Calculate epsilon closure of 'node_idx'. */
1691 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1692 if (BE (err != REG_NOERROR, 0))
1693 return err;
1694
1695 if (dfa->eclosures[node_idx].nelem == 0)
1696 {
1697 incomplete = true;
1698 re_node_set_free (&eclosure_elem);
1699 }
1700 }
1701 return REG_NOERROR;
1702}
1703
1704/* Calculate epsilon closure of NODE. */
1705
1706static reg_errcode_t
1707calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1708{
1709 reg_errcode_t err;
1710 Idx i;
1711 re_node_set eclosure;
1712 bool ok;
1713 bool incomplete = false;
1714 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1715 if (BE (err != REG_NOERROR, 0))
1716 return err;
1717
1718 /* This indicates that we are calculating this node now.
1719 We reference this value to avoid infinite loop. */
1720 dfa->eclosures[node].nelem = -1;
1721
1722 /* If the current node has constraints, duplicate all nodes
1723 since they must inherit the constraints. */
1724 if (dfa->nodes[node].constraint
1725 && dfa->edests[node].nelem
1726 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1727 {
1728 err = duplicate_node_closure (dfa, node, node, node,
1729 dfa->nodes[node].constraint);
1730 if (BE (err != REG_NOERROR, 0))
1731 return err;
1732 }
1733
1734 /* Expand each epsilon destination nodes. */
1735 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1736 for (i = 0; i < dfa->edests[node].nelem; ++i)
1737 {
1738 re_node_set eclosure_elem;
1739 Idx edest = dfa->edests[node].elems[i];
1740 /* If calculating the epsilon closure of 'edest' is in progress,
1741 return intermediate result. */
1742 if (dfa->eclosures[edest].nelem == -1)
1743 {
1744 incomplete = true;
1745 continue;
1746 }
1747 /* If we haven't calculated the epsilon closure of 'edest' yet,
1748 calculate now. Otherwise use calculated epsilon closure. */
1749 if (dfa->eclosures[edest].nelem == 0)
1750 {
1751 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1752 if (BE (err != REG_NOERROR, 0))
1753 return err;
1754 }
1755 else
1756 eclosure_elem = dfa->eclosures[edest];
1757 /* Merge the epsilon closure of 'edest'. */
1758 err = re_node_set_merge (&eclosure, &eclosure_elem);
1759 if (BE (err != REG_NOERROR, 0))
1760 return err;
1761 /* If the epsilon closure of 'edest' is incomplete,
1762 the epsilon closure of this node is also incomplete. */
1763 if (dfa->eclosures[edest].nelem == 0)
1764 {
1765 incomplete = true;
1766 re_node_set_free (&eclosure_elem);
1767 }
1768 }
1769
1770 /* An epsilon closure includes itself. */
1771 ok = re_node_set_insert (&eclosure, node);
1772 if (BE (! ok, 0))
1773 return REG_ESPACE;
1774 if (incomplete && !root)
1775 dfa->eclosures[node].nelem = 0;
1776 else
1777 dfa->eclosures[node] = eclosure;
1778 *new_set = eclosure;
1779 return REG_NOERROR;
1780}
1781
1782/* Functions for token which are used in the parser. */
1783
1784/* Fetch a token from INPUT.
1785 We must not use this function inside bracket expressions. */
1786
1787static void
1788fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1789{
1790 re_string_skip_bytes (input, peek_token (result, input, syntax));
1791}
1792
1793/* Peek a token from INPUT, and return the length of the token.
1794 We must not use this function inside bracket expressions. */
1795
1796static int
1797peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1798{
1799 unsigned char c;
1800
1801 if (re_string_eoi (input))
1802 {
1803 token->type = END_OF_RE;
1804 return 0;
1805 }
1806
1807 c = re_string_peek_byte (input, 0);
1808 token->opr.c = c;
1809
1810 token->word_char = 0;
1811#ifdef RE_ENABLE_I18N
1812 token->mb_partial = 0;
1813 if (input->mb_cur_max > 1 &&
1814 !re_string_first_byte (input, re_string_cur_idx (input)))
1815 {
1816 token->type = CHARACTER;
1817 token->mb_partial = 1;
1818 return 1;
1819 }
1820#endif
1821 if (c == '\\')
1822 {
1823 unsigned char c2;
1824 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1825 {
1826 token->type = BACK_SLASH;
1827 return 1;
1828 }
1829
1830 c2 = re_string_peek_byte_case (input, 1);
1831 token->opr.c = c2;
1832 token->type = CHARACTER;
1833#ifdef RE_ENABLE_I18N
1834 if (input->mb_cur_max > 1)
1835 {
1836 wint_t wc = re_string_wchar_at (input,
1837 re_string_cur_idx (input) + 1);
1838 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1839 }
1840 else
1841#endif
1842 token->word_char = IS_WORD_CHAR (c2) != 0;
1843
1844 switch (c2)
1845 {
1846 case '|':
1847 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1848 token->type = OP_ALT;
1849 break;
1850 case '1': case '2': case '3': case '4': case '5':
1851 case '6': case '7': case '8': case '9':
1852 if (!(syntax & RE_NO_BK_REFS))
1853 {
1854 token->type = OP_BACK_REF;
1855 token->opr.idx = c2 - '1';
1856 }
1857 break;
1858 case '<':
1859 if (!(syntax & RE_NO_GNU_OPS))
1860 {
1861 token->type = ANCHOR;
1862 token->opr.ctx_type = WORD_FIRST;
1863 }
1864 break;
1865 case '>':
1866 if (!(syntax & RE_NO_GNU_OPS))
1867 {
1868 token->type = ANCHOR;
1869 token->opr.ctx_type = WORD_LAST;
1870 }
1871 break;
1872 case 'b':
1873 if (!(syntax & RE_NO_GNU_OPS))
1874 {
1875 token->type = ANCHOR;
1876 token->opr.ctx_type = WORD_DELIM;
1877 }
1878 break;
1879 case 'B':
1880 if (!(syntax & RE_NO_GNU_OPS))
1881 {
1882 token->type = ANCHOR;
1883 token->opr.ctx_type = NOT_WORD_DELIM;
1884 }
1885 break;
1886 case 'w':
1887 if (!(syntax & RE_NO_GNU_OPS))
1888 token->type = OP_WORD;
1889 break;
1890 case 'W':
1891 if (!(syntax & RE_NO_GNU_OPS))
1892 token->type = OP_NOTWORD;
1893 break;
1894 case 's':
1895 if (!(syntax & RE_NO_GNU_OPS))
1896 token->type = OP_SPACE;
1897 break;
1898 case 'S':
1899 if (!(syntax & RE_NO_GNU_OPS))
1900 token->type = OP_NOTSPACE;
1901 break;
1902 case '`':
1903 if (!(syntax & RE_NO_GNU_OPS))
1904 {
1905 token->type = ANCHOR;
1906 token->opr.ctx_type = BUF_FIRST;
1907 }
1908 break;
1909 case '\'':
1910 if (!(syntax & RE_NO_GNU_OPS))
1911 {
1912 token->type = ANCHOR;
1913 token->opr.ctx_type = BUF_LAST;
1914 }
1915 break;
1916 case '(':
1917 if (!(syntax & RE_NO_BK_PARENS))
1918 token->type = OP_OPEN_SUBEXP;
1919 break;
1920 case ')':
1921 if (!(syntax & RE_NO_BK_PARENS))
1922 token->type = OP_CLOSE_SUBEXP;
1923 break;
1924 case '+':
1925 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1926 token->type = OP_DUP_PLUS;
1927 break;
1928 case '?':
1929 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1930 token->type = OP_DUP_QUESTION;
1931 break;
1932 case '{':
1933 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1934 token->type = OP_OPEN_DUP_NUM;
1935 break;
1936 case '}':
1937 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1938 token->type = OP_CLOSE_DUP_NUM;
1939 break;
1940 default:
1941 break;
1942 }
1943 return 2;
1944 }
1945
1946 token->type = CHARACTER;
1947#ifdef RE_ENABLE_I18N
1948 if (input->mb_cur_max > 1)
1949 {
1950 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1951 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1952 }
1953 else
1954#endif
1955 token->word_char = IS_WORD_CHAR (token->opr.c);
1956
1957 switch (c)
1958 {
1959 case '\n':
1960 if (syntax & RE_NEWLINE_ALT)
1961 token->type = OP_ALT;
1962 break;
1963 case '|':
1964 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1965 token->type = OP_ALT;
1966 break;
1967 case '*':
1968 token->type = OP_DUP_ASTERISK;
1969 break;
1970 case '+':
1971 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1972 token->type = OP_DUP_PLUS;
1973 break;
1974 case '?':
1975 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1976 token->type = OP_DUP_QUESTION;
1977 break;
1978 case '{':
1979 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1980 token->type = OP_OPEN_DUP_NUM;
1981 break;
1982 case '}':
1983 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1984 token->type = OP_CLOSE_DUP_NUM;
1985 break;
1986 case '(':
1987 if (syntax & RE_NO_BK_PARENS)
1988 token->type = OP_OPEN_SUBEXP;
1989 break;
1990 case ')':
1991 if (syntax & RE_NO_BK_PARENS)
1992 token->type = OP_CLOSE_SUBEXP;
1993 break;
1994 case '[':
1995 token->type = OP_OPEN_BRACKET;
1996 break;
1997 case '.':
1998 token->type = OP_PERIOD;
1999 break;
2000 case '^':
2001 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2002 re_string_cur_idx (input) != 0)
2003 {
2004 char prev = re_string_peek_byte (input, -1);
2005 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2006 break;
2007 }
2008 token->type = ANCHOR;
2009 token->opr.ctx_type = LINE_FIRST;
2010 break;
2011 case '$':
2012 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2013 re_string_cur_idx (input) + 1 != re_string_length (input))
2014 {
2015 re_token_t next;
2016 re_string_skip_bytes (input, 1);
2017 peek_token (&next, input, syntax);
2018 re_string_skip_bytes (input, -1);
2019 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2020 break;
2021 }
2022 token->type = ANCHOR;
2023 token->opr.ctx_type = LINE_LAST;
2024 break;
2025 default:
2026 break;
2027 }
2028 return 1;
2029}
2030
2031/* Peek a token from INPUT, and return the length of the token.
2032 We must not use this function out of bracket expressions. */
2033
2034static int
2035peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2036{
2037 unsigned char c;
2038 if (re_string_eoi (input))
2039 {
2040 token->type = END_OF_RE;
2041 return 0;
2042 }
2043 c = re_string_peek_byte (input, 0);
2044 token->opr.c = c;
2045
2046#ifdef RE_ENABLE_I18N
2047 if (input->mb_cur_max > 1 &&
2048 !re_string_first_byte (input, re_string_cur_idx (input)))
2049 {
2050 token->type = CHARACTER;
2051 return 1;
2052 }
2053#endif /* RE_ENABLE_I18N */
2054
2055 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2056 && re_string_cur_idx (input) + 1 < re_string_length (input))
2057 {
2058 /* In this case, '\' escape a character. */
2059 unsigned char c2;
2060 re_string_skip_bytes (input, 1);
2061 c2 = re_string_peek_byte (input, 0);
2062 token->opr.c = c2;
2063 token->type = CHARACTER;
2064 return 1;
2065 }
2066 if (c == '[') /* '[' is a special char in a bracket exps. */
2067 {
2068 unsigned char c2;
2069 int token_len;
2070 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2071 c2 = re_string_peek_byte (input, 1);
2072 else
2073 c2 = 0;
2074 token->opr.c = c2;
2075 token_len = 2;
2076 switch (c2)
2077 {
2078 case '.':
2079 token->type = OP_OPEN_COLL_ELEM;
2080 break;
2081
2082 case '=':
2083 token->type = OP_OPEN_EQUIV_CLASS;
2084 break;
2085
2086 case ':':
2087 if (syntax & RE_CHAR_CLASSES)
2088 {
2089 token->type = OP_OPEN_CHAR_CLASS;
2090 break;
2091 }
2092 FALLTHROUGH;
2093 default:
2094 token->type = CHARACTER;
2095 token->opr.c = c;
2096 token_len = 1;
2097 break;
2098 }
2099 return token_len;
2100 }
2101 switch (c)
2102 {
2103 case '-':
2104 token->type = OP_CHARSET_RANGE;
2105 break;
2106 case ']':
2107 token->type = OP_CLOSE_BRACKET;
2108 break;
2109 case '^':
2110 token->type = OP_NON_MATCH_LIST;
2111 break;
2112 default:
2113 token->type = CHARACTER;
2114 }
2115 return 1;
2116}
2117
2118/* Functions for parser. */
2119
2120/* Entry point of the parser.
2121 Parse the regular expression REGEXP and return the structure tree.
2122 If an error occurs, ERR is set by error code, and return NULL.
2123 This function build the following tree, from regular expression <reg_exp>:
2124 CAT
2125 / \
2126 / \
2127 <reg_exp> EOR
2128
2129 CAT means concatenation.
2130 EOR means end of regular expression. */
2131
2132static bin_tree_t *
2133parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2134 reg_errcode_t *err)
2135{
2136 re_dfa_t *dfa = preg->buffer;
2137 bin_tree_t *tree, *eor, *root;
2138 re_token_t current_token;
2139 dfa->syntax = syntax;
2140 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2141 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2142 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2143 return NULL;
2144 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2145 if (tree != NULL)
2146 root = create_tree (dfa, tree, eor, CONCAT);
2147 else
2148 root = eor;
2149 if (BE (eor == NULL || root == NULL, 0))
2150 {
2151 *err = REG_ESPACE;
2152 return NULL;
2153 }
2154 return root;
2155}
2156
2157/* This function build the following tree, from regular expression
2158 <branch1>|<branch2>:
2159 ALT
2160 / \
2161 / \
2162 <branch1> <branch2>
2163
2164 ALT means alternative, which represents the operator '|'. */
2165
2166static bin_tree_t *
2167parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2168 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2169{
2170 re_dfa_t *dfa = preg->buffer;
2171 bin_tree_t *tree, *branch = NULL;
2172 bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2173 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2174 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2175 return NULL;
2176
2177 while (token->type == OP_ALT)
2178 {
2179 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2180 if (token->type != OP_ALT && token->type != END_OF_RE
2181 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2182 {
2183 bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2184 dfa->completed_bkref_map = initial_bkref_map;
2185 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2186 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2187 {
2188 if (tree != NULL)
2189 postorder (tree, free_tree, NULL);
2190 return NULL;
2191 }
2192 dfa->completed_bkref_map |= accumulated_bkref_map;
2193 }
2194 else
2195 branch = NULL;
2196 tree = create_tree (dfa, tree, branch, OP_ALT);
2197 if (BE (tree == NULL, 0))
2198 {
2199 *err = REG_ESPACE;
2200 return NULL;
2201 }
2202 }
2203 return tree;
2204}
2205
2206/* This function build the following tree, from regular expression
2207 <exp1><exp2>:
2208 CAT
2209 / \
2210 / \
2211 <exp1> <exp2>
2212
2213 CAT means concatenation. */
2214
2215static bin_tree_t *
2216parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2217 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2218{
2219 bin_tree_t *tree, *expr;
2220 re_dfa_t *dfa = preg->buffer;
2221 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2222 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2223 return NULL;
2224
2225 while (token->type != OP_ALT && token->type != END_OF_RE
2226 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2227 {
2228 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2229 if (BE (*err != REG_NOERROR && expr == NULL, 0))
2230 {
2231 if (tree != NULL)
2232 postorder (tree, free_tree, NULL);
2233 return NULL;
2234 }
2235 if (tree != NULL && expr != NULL)
2236 {
2237 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2238 if (newtree == NULL)
2239 {
2240 postorder (expr, free_tree, NULL);
2241 postorder (tree, free_tree, NULL);
2242 *err = REG_ESPACE;
2243 return NULL;
2244 }
2245 tree = newtree;
2246 }
2247 else if (tree == NULL)
2248 tree = expr;
2249 /* Otherwise expr == NULL, we don't need to create new tree. */
2250 }
2251 return tree;
2252}
2253
2254/* This function build the following tree, from regular expression a*:
2255 *
2256 |
2257 a
2258*/
2259
2260static bin_tree_t *
2261parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2262 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2263{
2264 re_dfa_t *dfa = preg->buffer;
2265 bin_tree_t *tree;
2266 switch (token->type)
2267 {
2268 case CHARACTER:
2269 tree = create_token_tree (dfa, NULL, NULL, token);
2270 if (BE (tree == NULL, 0))
2271 {
2272 *err = REG_ESPACE;
2273 return NULL;
2274 }
2275#ifdef RE_ENABLE_I18N
2276 if (dfa->mb_cur_max > 1)
2277 {
2278 while (!re_string_eoi (regexp)
2279 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2280 {
2281 bin_tree_t *mbc_remain;
2282 fetch_token (token, regexp, syntax);
2283 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2284 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2285 if (BE (mbc_remain == NULL || tree == NULL, 0))
2286 {
2287 *err = REG_ESPACE;
2288 return NULL;
2289 }
2290 }
2291 }
2292#endif
2293 break;
2294
2295 case OP_OPEN_SUBEXP:
2296 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2297 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2298 return NULL;
2299 break;
2300
2301 case OP_OPEN_BRACKET:
2302 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2303 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2304 return NULL;
2305 break;
2306
2307 case OP_BACK_REF:
2308 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2309 {
2310 *err = REG_ESUBREG;
2311 return NULL;
2312 }
2313 dfa->used_bkref_map |= 1 << token->opr.idx;
2314 tree = create_token_tree (dfa, NULL, NULL, token);
2315 if (BE (tree == NULL, 0))
2316 {
2317 *err = REG_ESPACE;
2318 return NULL;
2319 }
2320 ++dfa->nbackref;
2321 dfa->has_mb_node = 1;
2322 break;
2323
2324 case OP_OPEN_DUP_NUM:
2325 if (syntax & RE_CONTEXT_INVALID_DUP)
2326 {
2327 *err = REG_BADRPT;
2328 return NULL;
2329 }
2330 FALLTHROUGH;
2331 case OP_DUP_ASTERISK:
2332 case OP_DUP_PLUS:
2333 case OP_DUP_QUESTION:
2334 if (syntax & RE_CONTEXT_INVALID_OPS)
2335 {
2336 *err = REG_BADRPT;
2337 return NULL;
2338 }
2339 else if (syntax & RE_CONTEXT_INDEP_OPS)
2340 {
2341 fetch_token (token, regexp, syntax);
2342 return parse_expression (regexp, preg, token, syntax, nest, err);
2343 }
2344 FALLTHROUGH;
2345 case OP_CLOSE_SUBEXP:
2346 if ((token->type == OP_CLOSE_SUBEXP) &&
2347 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2348 {
2349 *err = REG_ERPAREN;
2350 return NULL;
2351 }
2352 FALLTHROUGH;
2353 case OP_CLOSE_DUP_NUM:
2354 /* We treat it as a normal character. */
2355
2356 /* Then we can these characters as normal characters. */
2357 token->type = CHARACTER;
2358 /* mb_partial and word_char bits should be initialized already
2359 by peek_token. */
2360 tree = create_token_tree (dfa, NULL, NULL, token);
2361 if (BE (tree == NULL, 0))
2362 {
2363 *err = REG_ESPACE;
2364 return NULL;
2365 }
2366 break;
2367
2368 case ANCHOR:
2369 if ((token->opr.ctx_type
2370 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2371 && dfa->word_ops_used == 0)
2372 init_word_char (dfa);
2373 if (token->opr.ctx_type == WORD_DELIM
2374 || token->opr.ctx_type == NOT_WORD_DELIM)
2375 {
2376 bin_tree_t *tree_first, *tree_last;
2377 if (token->opr.ctx_type == WORD_DELIM)
2378 {
2379 token->opr.ctx_type = WORD_FIRST;
2380 tree_first = create_token_tree (dfa, NULL, NULL, token);
2381 token->opr.ctx_type = WORD_LAST;
2382 }
2383 else
2384 {
2385 token->opr.ctx_type = INSIDE_WORD;
2386 tree_first = create_token_tree (dfa, NULL, NULL, token);
2387 token->opr.ctx_type = INSIDE_NOTWORD;
2388 }
2389 tree_last = create_token_tree (dfa, NULL, NULL, token);
2390 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2391 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2392 {
2393 *err = REG_ESPACE;
2394 return NULL;
2395 }
2396 }
2397 else
2398 {
2399 tree = create_token_tree (dfa, NULL, NULL, token);
2400 if (BE (tree == NULL, 0))
2401 {
2402 *err = REG_ESPACE;
2403 return NULL;
2404 }
2405 }
2406 /* We must return here, since ANCHORs can't be followed
2407 by repetition operators.
2408 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2409 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2410 fetch_token (token, regexp, syntax);
2411 return tree;
2412
2413 case OP_PERIOD:
2414 tree = create_token_tree (dfa, NULL, NULL, token);
2415 if (BE (tree == NULL, 0))
2416 {
2417 *err = REG_ESPACE;
2418 return NULL;
2419 }
2420 if (dfa->mb_cur_max > 1)
2421 dfa->has_mb_node = 1;
2422 break;
2423
2424 case OP_WORD:
2425 case OP_NOTWORD:
2426 tree = build_charclass_op (dfa, regexp->trans,
2427 "alnum",
2428 "_",
2429 token->type == OP_NOTWORD, err);
2430 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2431 return NULL;
2432 break;
2433
2434 case OP_SPACE:
2435 case OP_NOTSPACE:
2436 tree = build_charclass_op (dfa, regexp->trans,
2437 "space",
2438 "",
2439 token->type == OP_NOTSPACE, err);
2440 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2441 return NULL;
2442 break;
2443
2444 case OP_ALT:
2445 case END_OF_RE:
2446 return NULL;
2447
2448 case BACK_SLASH:
2449 *err = REG_EESCAPE;
2450 return NULL;
2451
2452 default:
2453 /* Must not happen? */
2454#ifdef DEBUG
2455 assert (0);
2456#endif
2457 return NULL;
2458 }
2459 fetch_token (token, regexp, syntax);
2460
2461 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2462 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2463 {
2464 bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2465 syntax, err);
2466 if (BE (*err != REG_NOERROR && dup_tree == NULL, 0))
2467 {
2468 if (tree != NULL)
2469 postorder (tree, free_tree, NULL);
2470 return NULL;
2471 }
2472 tree = dup_tree;
2473 /* In BRE consecutive duplications are not allowed. */
2474 if ((syntax & RE_CONTEXT_INVALID_DUP)
2475 && (token->type == OP_DUP_ASTERISK
2476 || token->type == OP_OPEN_DUP_NUM))
2477 {
2478 if (tree != NULL)
2479 postorder (tree, free_tree, NULL);
2480 *err = REG_BADRPT;
2481 return NULL;
2482 }
2483 }
2484
2485 return tree;
2486}
2487
2488/* This function build the following tree, from regular expression
2489 (<reg_exp>):
2490 SUBEXP
2491 |
2492 <reg_exp>
2493*/
2494
2495static bin_tree_t *
2496parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2497 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2498{
2499 re_dfa_t *dfa = preg->buffer;
2500 bin_tree_t *tree;
2501 size_t cur_nsub;
2502 cur_nsub = preg->re_nsub++;
2503
2504 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2505
2506 /* The subexpression may be a null string. */
2507 if (token->type == OP_CLOSE_SUBEXP)
2508 tree = NULL;
2509 else
2510 {
2511 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2512 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2513 {
2514 if (tree != NULL)
2515 postorder (tree, free_tree, NULL);
2516 *err = REG_EPAREN;
2517 }
2518 if (BE (*err != REG_NOERROR, 0))
2519 return NULL;
2520 }
2521
2522 if (cur_nsub <= '9' - '1')
2523 dfa->completed_bkref_map |= 1 << cur_nsub;
2524
2525 tree = create_tree (dfa, tree, NULL, SUBEXP);
2526 if (BE (tree == NULL, 0))
2527 {
2528 *err = REG_ESPACE;
2529 return NULL;
2530 }
2531 tree->token.opr.idx = cur_nsub;
2532 return tree;
2533}
2534
2535/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2536
2537static bin_tree_t *
2538parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2539 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2540{
2541 bin_tree_t *tree = NULL, *old_tree = NULL;
2542 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2543 re_token_t start_token = *token;
2544
2545 if (token->type == OP_OPEN_DUP_NUM)
2546 {
2547 end = 0;
2548 start = fetch_number (regexp, token, syntax);
2549 if (start == -1)
2550 {
2551 if (token->type == CHARACTER && token->opr.c == ',')
2552 start = 0; /* We treat "{,m}" as "{0,m}". */
2553 else
2554 {
2555 *err = REG_BADBR; /* <re>{} is invalid. */
2556 return NULL;
2557 }
2558 }
2559 if (BE (start != -2, 1))
2560 {
2561 /* We treat "{n}" as "{n,n}". */
2562 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2563 : ((token->type == CHARACTER && token->opr.c == ',')
2564 ? fetch_number (regexp, token, syntax) : -2));
2565 }
2566 if (BE (start == -2 || end == -2, 0))
2567 {
2568 /* Invalid sequence. */
2569 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2570 {
2571 if (token->type == END_OF_RE)
2572 *err = REG_EBRACE;
2573 else
2574 *err = REG_BADBR;
2575
2576 return NULL;
2577 }
2578
2579 /* If the syntax bit is set, rollback. */
2580 re_string_set_index (regexp, start_idx);
2581 *token = start_token;
2582 token->type = CHARACTER;
2583 /* mb_partial and word_char bits should be already initialized by
2584 peek_token. */
2585 return elem;
2586 }
2587
2588 if (BE ((end != -1 && start > end)
2589 || token->type != OP_CLOSE_DUP_NUM, 0))
2590 {
2591 /* First number greater than second. */
2592 *err = REG_BADBR;
2593 return NULL;
2594 }
2595
2596 if (BE (RE_DUP_MAX < (end == -1 ? start : end), 0))
2597 {
2598 *err = REG_ESIZE;
2599 return NULL;
2600 }
2601 }
2602 else
2603 {
2604 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2605 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2606 }
2607
2608 fetch_token (token, regexp, syntax);
2609
2610 if (BE (elem == NULL, 0))
2611 return NULL;
2612 if (BE (start == 0 && end == 0, 0))
2613 {
2614 postorder (elem, free_tree, NULL);
2615 return NULL;
2616 }
2617
2618 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2619 if (BE (start > 0, 0))
2620 {
2621 tree = elem;
2622 for (i = 2; i <= start; ++i)
2623 {
2624 elem = duplicate_tree (elem, dfa);
2625 tree = create_tree (dfa, tree, elem, CONCAT);
2626 if (BE (elem == NULL || tree == NULL, 0))
2627 goto parse_dup_op_espace;
2628 }
2629
2630 if (start == end)
2631 return tree;
2632
2633 /* Duplicate ELEM before it is marked optional. */
2634 elem = duplicate_tree (elem, dfa);
2635 if (BE (elem == NULL, 0))
2636 goto parse_dup_op_espace;
2637 old_tree = tree;
2638 }
2639 else
2640 old_tree = NULL;
2641
2642 if (elem->token.type == SUBEXP)
2643 {
2644 uintptr_t subidx = elem->token.opr.idx;
2645 postorder (elem, mark_opt_subexp, (void *) subidx);
2646 }
2647
2648 tree = create_tree (dfa, elem, NULL,
2649 (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2650 if (BE (tree == NULL, 0))
2651 goto parse_dup_op_espace;
2652
2653 /* This loop is actually executed only when end != -1,
2654 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2655 already created the start+1-th copy. */
2656 if (TYPE_SIGNED (Idx) || end != -1)
2657 for (i = start + 2; i <= end; ++i)
2658 {
2659 elem = duplicate_tree (elem, dfa);
2660 tree = create_tree (dfa, tree, elem, CONCAT);
2661 if (BE (elem == NULL || tree == NULL, 0))
2662 goto parse_dup_op_espace;
2663
2664 tree = create_tree (dfa, tree, NULL, OP_ALT);
2665 if (BE (tree == NULL, 0))
2666 goto parse_dup_op_espace;
2667 }
2668
2669 if (old_tree)
2670 tree = create_tree (dfa, old_tree, tree, CONCAT);
2671
2672 return tree;
2673
2674 parse_dup_op_espace:
2675 *err = REG_ESPACE;
2676 return NULL;
2677}
2678
2679/* Size of the names for collating symbol/equivalence_class/character_class.
2680 I'm not sure, but maybe enough. */
2681#define BRACKET_NAME_BUF_SIZE 32
2682
2683#ifndef _LIBC
2684
2685# ifdef RE_ENABLE_I18N
2686/* Convert the byte B to the corresponding wide character. In a
2687 unibyte locale, treat B as itself if it is an encoding error.
2688 In a multibyte locale, return WEOF if B is an encoding error. */
2689static wint_t
2690parse_byte (unsigned char b, re_charset_t *mbcset)
2691{
2692 wint_t wc = __btowc (b);
2693 return wc == WEOF && !mbcset ? b : wc;
2694}
2695#endif
2696
2697 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2698 Build the range expression which starts from START_ELEM, and ends
2699 at END_ELEM. The result are written to MBCSET and SBCSET.
2700 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2701 mbcset->range_ends, is a pointer argument since we may
2702 update it. */
2703
2704static reg_errcode_t
2705# ifdef RE_ENABLE_I18N
2706build_range_exp (const reg_syntax_t syntax,
2707 bitset_t sbcset,
2708 re_charset_t *mbcset,
2709 Idx *range_alloc,
2710 const bracket_elem_t *start_elem,
2711 const bracket_elem_t *end_elem)
2712# else /* not RE_ENABLE_I18N */
2713build_range_exp (const reg_syntax_t syntax,
2714 bitset_t sbcset,
2715 const bracket_elem_t *start_elem,
2716 const bracket_elem_t *end_elem)
2717# endif /* not RE_ENABLE_I18N */
2718{
2719 unsigned int start_ch, end_ch;
2720 /* Equivalence Classes and Character Classes can't be a range start/end. */
2721 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2722 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2723 0))
2724 return REG_ERANGE;
2725
2726 /* We can handle no multi character collating elements without libc
2727 support. */
2728 if (BE ((start_elem->type == COLL_SYM
2729 && strlen ((char *) start_elem->opr.name) > 1)
2730 || (end_elem->type == COLL_SYM
2731 && strlen ((char *) end_elem->opr.name) > 1), 0))
2732 return REG_ECOLLATE;
2733
2734# ifdef RE_ENABLE_I18N
2735 {
2736 wchar_t wc;
2737 wint_t start_wc;
2738 wint_t end_wc;
2739
2740 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2741 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2742 : 0));
2743 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2744 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2745 : 0));
2746 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2747 ? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
2748 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2749 ? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
2750 if (start_wc == WEOF || end_wc == WEOF)
2751 return REG_ECOLLATE;
2752 else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0))
2753 return REG_ERANGE;
2754
2755 /* Got valid collation sequence values, add them as a new entry.
2756 However, for !_LIBC we have no collation elements: if the
2757 character set is single byte, the single byte character set
2758 that we build below suffices. parse_bracket_exp passes
2759 no MBCSET if dfa->mb_cur_max == 1. */
2760 if (mbcset)
2761 {
2762 /* Check the space of the arrays. */
2763 if (BE (*range_alloc == mbcset->nranges, 0))
2764 {
2765 /* There is not enough space, need realloc. */
2766 wchar_t *new_array_start, *new_array_end;
2767 Idx new_nranges;
2768
2769 /* +1 in case of mbcset->nranges is 0. */
2770 new_nranges = 2 * mbcset->nranges + 1;
2771 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2772 are NULL if *range_alloc == 0. */
2773 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2774 new_nranges);
2775 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2776 new_nranges);
2777
2778 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2779 {
2780 re_free (new_array_start);
2781 re_free (new_array_end);
2782 return REG_ESPACE;
2783 }
2784
2785 mbcset->range_starts = new_array_start;
2786 mbcset->range_ends = new_array_end;
2787 *range_alloc = new_nranges;
2788 }
2789
2790 mbcset->range_starts[mbcset->nranges] = start_wc;
2791 mbcset->range_ends[mbcset->nranges++] = end_wc;
2792 }
2793
2794 /* Build the table for single byte characters. */
2795 for (wc = 0; wc < SBC_MAX; ++wc)
2796 {
2797 if (start_wc <= wc && wc <= end_wc)
2798 bitset_set (sbcset, wc);
2799 }
2800 }
2801# else /* not RE_ENABLE_I18N */
2802 {
2803 unsigned int ch;
2804 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2805 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2806 : 0));
2807 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2808 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2809 : 0));
2810 if (start_ch > end_ch)
2811 return REG_ERANGE;
2812 /* Build the table for single byte characters. */
2813 for (ch = 0; ch < SBC_MAX; ++ch)
2814 if (start_ch <= ch && ch <= end_ch)
2815 bitset_set (sbcset, ch);
2816 }
2817# endif /* not RE_ENABLE_I18N */
2818 return REG_NOERROR;
2819}
2820#endif /* not _LIBC */
2821
2822#ifndef _LIBC
2823/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2824 Build the collating element which is represented by NAME.
2825 The result are written to MBCSET and SBCSET.
2826 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2827 pointer argument since we may update it. */
2828
2829static reg_errcode_t
2830# ifdef RE_ENABLE_I18N
2831build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2832 Idx *coll_sym_alloc, const unsigned char *name)
2833# else /* not RE_ENABLE_I18N */
2834build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2835# endif /* not RE_ENABLE_I18N */
2836{
2837 size_t name_len = strlen ((const char *) name);
2838 if (BE (name_len != 1, 0))
2839 return REG_ECOLLATE;
2840 else
2841 {
2842 bitset_set (sbcset, name[0]);
2843 return REG_NOERROR;
2844 }
2845}
2846#endif /* not _LIBC */
2847
2848/* This function parse bracket expression like "[abc]", "[a-c]",
2849 "[[.a-a.]]" etc. */
2850
2851static bin_tree_t *
2852parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2853 reg_syntax_t syntax, reg_errcode_t *err)
2854{
2855#ifdef _LIBC
2856 const unsigned char *collseqmb;
2857 const char *collseqwc;
2858 uint32_t nrules;
2859 int32_t table_size;
2860 const int32_t *symb_table;
2861 const unsigned char *extra;
2862
2863 /* Local function for parse_bracket_exp used in _LIBC environment.
2864 Seek the collating symbol entry corresponding to NAME.
2865 Return the index of the symbol in the SYMB_TABLE,
2866 or -1 if not found. */
2867
2868 auto inline int32_t
2869 __attribute__ ((always_inline))
2870 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2871 {
2872 int32_t elem;
2873
2874 for (elem = 0; elem < table_size; elem++)
2875 if (symb_table[2 * elem] != 0)
2876 {
2877 int32_t idx = symb_table[2 * elem + 1];
2878 /* Skip the name of collating element name. */
2879 idx += 1 + extra[idx];
2880 if (/* Compare the length of the name. */
2881 name_len == extra[idx]
2882 /* Compare the name. */
2883 && memcmp (name, &extra[idx + 1], name_len) == 0)
2884 /* Yep, this is the entry. */
2885 return elem;
2886 }
2887 return -1;
2888 }
2889
2890 /* Local function for parse_bracket_exp used in _LIBC environment.
2891 Look up the collation sequence value of BR_ELEM.
2892 Return the value if succeeded, UINT_MAX otherwise. */
2893
2894 auto inline unsigned int
2895 __attribute__ ((always_inline))
2896 lookup_collation_sequence_value (bracket_elem_t *br_elem)
2897 {
2898 if (br_elem->type == SB_CHAR)
2899 {
2900 /*
2901 if (MB_CUR_MAX == 1)
2902 */
2903 if (nrules == 0)
2904 return collseqmb[br_elem->opr.ch];
2905 else
2906 {
2907 wint_t wc = __btowc (br_elem->opr.ch);
2908 return __collseq_table_lookup (collseqwc, wc);
2909 }
2910 }
2911 else if (br_elem->type == MB_CHAR)
2912 {
2913 if (nrules != 0)
2914 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2915 }
2916 else if (br_elem->type == COLL_SYM)
2917 {
2918 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2919 if (nrules != 0)
2920 {
2921 int32_t elem, idx;
2922 elem = seek_collating_symbol_entry (br_elem->opr.name,
2923 sym_name_len);
2924 if (elem != -1)
2925 {
2926 /* We found the entry. */
2927 idx = symb_table[2 * elem + 1];
2928 /* Skip the name of collating element name. */
2929 idx += 1 + extra[idx];
2930 /* Skip the byte sequence of the collating element. */
2931 idx += 1 + extra[idx];
2932 /* Adjust for the alignment. */
2933 idx = (idx + 3) & ~3;
2934 /* Skip the multibyte collation sequence value. */
2935 idx += sizeof (unsigned int);
2936 /* Skip the wide char sequence of the collating element. */
2937 idx += sizeof (unsigned int) *
2938 (1 + *(unsigned int *) (extra + idx));
2939 /* Return the collation sequence value. */
2940 return *(unsigned int *) (extra + idx);
2941 }
2942 else if (sym_name_len == 1)
2943 {
2944 /* No valid character. Match it as a single byte
2945 character. */
2946 return collseqmb[br_elem->opr.name[0]];
2947 }
2948 }
2949 else if (sym_name_len == 1)
2950 return collseqmb[br_elem->opr.name[0]];
2951 }
2952 return UINT_MAX;
2953 }
2954
2955 /* Local function for parse_bracket_exp used in _LIBC environment.
2956 Build the range expression which starts from START_ELEM, and ends
2957 at END_ELEM. The result are written to MBCSET and SBCSET.
2958 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2959 mbcset->range_ends, is a pointer argument since we may
2960 update it. */
2961
2962 auto inline reg_errcode_t
2963 __attribute__ ((always_inline))
2964 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2965 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2966 {
2967 unsigned int ch;
2968 uint32_t start_collseq;
2969 uint32_t end_collseq;
2970
2971 /* Equivalence Classes and Character Classes can't be a range
2972 start/end. */
2973 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2974 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2975 0))
2976 return REG_ERANGE;
2977
2978 /* FIXME: Implement rational ranges here, too. */
2979 start_collseq = lookup_collation_sequence_value (start_elem);
2980 end_collseq = lookup_collation_sequence_value (end_elem);
2981 /* Check start/end collation sequence values. */
2982 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2983 return REG_ECOLLATE;
2984 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2985 return REG_ERANGE;
2986
2987 /* Got valid collation sequence values, add them as a new entry.
2988 However, if we have no collation elements, and the character set
2989 is single byte, the single byte character set that we
2990 build below suffices. */
2991 if (nrules > 0 || dfa->mb_cur_max > 1)
2992 {
2993 /* Check the space of the arrays. */
2994 if (BE (*range_alloc == mbcset->nranges, 0))
2995 {
2996 /* There is not enough space, need realloc. */
2997 uint32_t *new_array_start;
2998 uint32_t *new_array_end;
2999 Idx new_nranges;
3000
3001 /* +1 in case of mbcset->nranges is 0. */
3002 new_nranges = 2 * mbcset->nranges + 1;
3003 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
3004 new_nranges);
3005 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
3006 new_nranges);
3007
3008 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
3009 return REG_ESPACE;
3010
3011 mbcset->range_starts = new_array_start;
3012 mbcset->range_ends = new_array_end;
3013 *range_alloc = new_nranges;
3014 }
3015
3016 mbcset->range_starts[mbcset->nranges] = start_collseq;
3017 mbcset->range_ends[mbcset->nranges++] = end_collseq;
3018 }
3019
3020 /* Build the table for single byte characters. */
3021 for (ch = 0; ch < SBC_MAX; ch++)
3022 {
3023 uint32_t ch_collseq;
3024 /*
3025 if (MB_CUR_MAX == 1)
3026 */
3027 if (nrules == 0)
3028 ch_collseq = collseqmb[ch];
3029 else
3030 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3031 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3032 bitset_set (sbcset, ch);
3033 }
3034 return REG_NOERROR;
3035 }
3036
3037 /* Local function for parse_bracket_exp used in _LIBC environment.
3038 Build the collating element which is represented by NAME.
3039 The result are written to MBCSET and SBCSET.
3040 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3041 pointer argument since we may update it. */
3042
3043 auto inline reg_errcode_t
3044 __attribute__ ((always_inline))
3045 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3046 Idx *coll_sym_alloc, const unsigned char *name)
3047 {
3048 int32_t elem, idx;
3049 size_t name_len = strlen ((const char *) name);
3050 if (nrules != 0)
3051 {
3052 elem = seek_collating_symbol_entry (name, name_len);
3053 if (elem != -1)
3054 {
3055 /* We found the entry. */
3056 idx = symb_table[2 * elem + 1];
3057 /* Skip the name of collating element name. */
3058 idx += 1 + extra[idx];
3059 }
3060 else if (name_len == 1)
3061 {
3062 /* No valid character, treat it as a normal
3063 character. */
3064 bitset_set (sbcset, name[0]);
3065 return REG_NOERROR;
3066 }
3067 else
3068 return REG_ECOLLATE;
3069
3070 /* Got valid collation sequence, add it as a new entry. */
3071 /* Check the space of the arrays. */
3072 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3073 {
3074 /* Not enough, realloc it. */
3075 /* +1 in case of mbcset->ncoll_syms is 0. */
3076 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3077 /* Use realloc since mbcset->coll_syms is NULL
3078 if *alloc == 0. */
3079 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3080 new_coll_sym_alloc);
3081 if (BE (new_coll_syms == NULL, 0))
3082 return REG_ESPACE;
3083 mbcset->coll_syms = new_coll_syms;
3084 *coll_sym_alloc = new_coll_sym_alloc;
3085 }
3086 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3087 return REG_NOERROR;
3088 }
3089 else
3090 {
3091 if (BE (name_len != 1, 0))
3092 return REG_ECOLLATE;
3093 else
3094 {
3095 bitset_set (sbcset, name[0]);
3096 return REG_NOERROR;
3097 }
3098 }
3099 }
3100#endif
3101
3102 re_token_t br_token;
3103 re_bitset_ptr_t sbcset;
3104#ifdef RE_ENABLE_I18N
3105 re_charset_t *mbcset;
3106 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3107 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3108#endif /* not RE_ENABLE_I18N */
3109 bool non_match = false;
3110 bin_tree_t *work_tree;
3111 int token_len;
3112 bool first_round = true;
3113#ifdef _LIBC
3114 collseqmb = (const unsigned char *)
3115 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3116 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3117 if (nrules)
3118 {
3119 /*
3120 if (MB_CUR_MAX > 1)
3121 */
3122 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3123 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3124 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3125 _NL_COLLATE_SYMB_TABLEMB);
3126 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3127 _NL_COLLATE_SYMB_EXTRAMB);
3128 }
3129#endif
3130 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3131#ifdef RE_ENABLE_I18N
3132 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3133#endif /* RE_ENABLE_I18N */
3134#ifdef RE_ENABLE_I18N
3135 if (BE (sbcset == NULL || mbcset == NULL, 0))
3136#else
3137 if (BE (sbcset == NULL, 0))
3138#endif /* RE_ENABLE_I18N */
3139 {
3140 re_free (sbcset);
3141#ifdef RE_ENABLE_I18N
3142 re_free (mbcset);
3143#endif
3144 *err = REG_ESPACE;
3145 return NULL;
3146 }
3147
3148 token_len = peek_token_bracket (token, regexp, syntax);
3149 if (BE (token->type == END_OF_RE, 0))
3150 {
3151 *err = REG_BADPAT;
3152 goto parse_bracket_exp_free_return;
3153 }
3154 if (token->type == OP_NON_MATCH_LIST)
3155 {
3156#ifdef RE_ENABLE_I18N
3157 mbcset->non_match = 1;
3158#endif /* not RE_ENABLE_I18N */
3159 non_match = true;
3160 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3161 bitset_set (sbcset, '\n');
3162 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3163 token_len = peek_token_bracket (token, regexp, syntax);
3164 if (BE (token->type == END_OF_RE, 0))
3165 {
3166 *err = REG_BADPAT;
3167 goto parse_bracket_exp_free_return;
3168 }
3169 }
3170
3171 /* We treat the first ']' as a normal character. */
3172 if (token->type == OP_CLOSE_BRACKET)
3173 token->type = CHARACTER;
3174
3175 while (1)
3176 {
3177 bracket_elem_t start_elem, end_elem;
3178 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3179 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3180 reg_errcode_t ret;
3181 int token_len2 = 0;
3182 bool is_range_exp = false;
3183 re_token_t token2;
3184
3185 start_elem.opr.name = start_name_buf;
3186 start_elem.type = COLL_SYM;
3187 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3188 syntax, first_round);
3189 if (BE (ret != REG_NOERROR, 0))
3190 {
3191 *err = ret;
3192 goto parse_bracket_exp_free_return;
3193 }
3194 first_round = false;
3195
3196 /* Get information about the next token. We need it in any case. */
3197 token_len = peek_token_bracket (token, regexp, syntax);
3198
3199 /* Do not check for ranges if we know they are not allowed. */
3200 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3201 {
3202 if (BE (token->type == END_OF_RE, 0))
3203 {
3204 *err = REG_EBRACK;
3205 goto parse_bracket_exp_free_return;
3206 }
3207 if (token->type == OP_CHARSET_RANGE)
3208 {
3209 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3210 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3211 if (BE (token2.type == END_OF_RE, 0))
3212 {
3213 *err = REG_EBRACK;
3214 goto parse_bracket_exp_free_return;
3215 }
3216 if (token2.type == OP_CLOSE_BRACKET)
3217 {
3218 /* We treat the last '-' as a normal character. */
3219 re_string_skip_bytes (regexp, -token_len);
3220 token->type = CHARACTER;
3221 }
3222 else
3223 is_range_exp = true;
3224 }
3225 }
3226
3227 if (is_range_exp == true)
3228 {
3229 end_elem.opr.name = end_name_buf;
3230 end_elem.type = COLL_SYM;
3231 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3232 dfa, syntax, true);
3233 if (BE (ret != REG_NOERROR, 0))
3234 {
3235 *err = ret;
3236 goto parse_bracket_exp_free_return;
3237 }
3238
3239 token_len = peek_token_bracket (token, regexp, syntax);
3240
3241#ifdef _LIBC
3242 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3243 &start_elem, &end_elem);
3244#else
3245# ifdef RE_ENABLE_I18N
3246 *err = build_range_exp (syntax, sbcset,
3247 dfa->mb_cur_max > 1 ? mbcset : NULL,
3248 &range_alloc, &start_elem, &end_elem);
3249# else
3250 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3251# endif
3252#endif /* RE_ENABLE_I18N */
3253 if (BE (*err != REG_NOERROR, 0))
3254 goto parse_bracket_exp_free_return;
3255 }
3256 else
3257 {
3258 switch (start_elem.type)
3259 {
3260 case SB_CHAR:
3261 bitset_set (sbcset, start_elem.opr.ch);
3262 break;
3263#ifdef RE_ENABLE_I18N
3264 case MB_CHAR:
3265 /* Check whether the array has enough space. */
3266 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3267 {
3268 wchar_t *new_mbchars;
3269 /* Not enough, realloc it. */
3270 /* +1 in case of mbcset->nmbchars is 0. */
3271 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3272 /* Use realloc since array is NULL if *alloc == 0. */
3273 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3274 mbchar_alloc);
3275 if (BE (new_mbchars == NULL, 0))
3276 goto parse_bracket_exp_espace;
3277 mbcset->mbchars = new_mbchars;
3278 }
3279 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3280 break;
3281#endif /* RE_ENABLE_I18N */
3282 case EQUIV_CLASS:
3283 *err = build_equiv_class (sbcset,
3284#ifdef RE_ENABLE_I18N
3285 mbcset, &equiv_class_alloc,
3286#endif /* RE_ENABLE_I18N */
3287 start_elem.opr.name);
3288 if (BE (*err != REG_NOERROR, 0))
3289 goto parse_bracket_exp_free_return;
3290 break;
3291 case COLL_SYM:
3292 *err = build_collating_symbol (sbcset,
3293#ifdef RE_ENABLE_I18N
3294 mbcset, &coll_sym_alloc,
3295#endif /* RE_ENABLE_I18N */
3296 start_elem.opr.name);
3297 if (BE (*err != REG_NOERROR, 0))
3298 goto parse_bracket_exp_free_return;
3299 break;
3300 case CHAR_CLASS:
3301 *err = build_charclass (regexp->trans, sbcset,
3302#ifdef RE_ENABLE_I18N
3303 mbcset, &char_class_alloc,
3304#endif /* RE_ENABLE_I18N */
3305 (const char *) start_elem.opr.name,
3306 syntax);
3307 if (BE (*err != REG_NOERROR, 0))
3308 goto parse_bracket_exp_free_return;
3309 break;
3310 default:
3311 assert (0);
3312 break;
3313 }
3314 }
3315 if (BE (token->type == END_OF_RE, 0))
3316 {
3317 *err = REG_EBRACK;
3318 goto parse_bracket_exp_free_return;
3319 }
3320 if (token->type == OP_CLOSE_BRACKET)
3321 break;
3322 }
3323
3324 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3325
3326 /* If it is non-matching list. */
3327 if (non_match)
3328 bitset_not (sbcset);
3329
3330#ifdef RE_ENABLE_I18N
3331 /* Ensure only single byte characters are set. */
3332 if (dfa->mb_cur_max > 1)
3333 bitset_mask (sbcset, dfa->sb_char);
3334
3335 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3336 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3337 || mbcset->non_match)))
3338 {
3339 bin_tree_t *mbc_tree;
3340 int sbc_idx;
3341 /* Build a tree for complex bracket. */
3342 dfa->has_mb_node = 1;
3343 br_token.type = COMPLEX_BRACKET;
3344 br_token.opr.mbcset = mbcset;
3345 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3346 if (BE (mbc_tree == NULL, 0))
3347 goto parse_bracket_exp_espace;
3348 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3349 if (sbcset[sbc_idx])
3350 break;
3351 /* If there are no bits set in sbcset, there is no point
3352 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3353 if (sbc_idx < BITSET_WORDS)
3354 {
3355 /* Build a tree for simple bracket. */
3356 br_token.type = SIMPLE_BRACKET;
3357 br_token.opr.sbcset = sbcset;
3358 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3359 if (BE (work_tree == NULL, 0))
3360 goto parse_bracket_exp_espace;
3361
3362 /* Then join them by ALT node. */
3363 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3364 if (BE (work_tree == NULL, 0))
3365 goto parse_bracket_exp_espace;
3366 }
3367 else
3368 {
3369 re_free (sbcset);
3370 work_tree = mbc_tree;
3371 }
3372 }
3373 else
3374#endif /* not RE_ENABLE_I18N */
3375 {
3376#ifdef RE_ENABLE_I18N
3377 free_charset (mbcset);
3378#endif
3379 /* Build a tree for simple bracket. */
3380 br_token.type = SIMPLE_BRACKET;
3381 br_token.opr.sbcset = sbcset;
3382 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3383 if (BE (work_tree == NULL, 0))
3384 goto parse_bracket_exp_espace;
3385 }
3386 return work_tree;
3387
3388 parse_bracket_exp_espace:
3389 *err = REG_ESPACE;
3390 parse_bracket_exp_free_return:
3391 re_free (sbcset);
3392#ifdef RE_ENABLE_I18N
3393 free_charset (mbcset);
3394#endif /* RE_ENABLE_I18N */
3395 return NULL;
3396}
3397
3398/* Parse an element in the bracket expression. */
3399
3400static reg_errcode_t
3401parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3402 re_token_t *token, int token_len, re_dfa_t *dfa,
3403 reg_syntax_t syntax, bool accept_hyphen)
3404{
3405#ifdef RE_ENABLE_I18N
3406 int cur_char_size;
3407 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3408 if (cur_char_size > 1)
3409 {
3410 elem->type = MB_CHAR;
3411 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3412 re_string_skip_bytes (regexp, cur_char_size);
3413 return REG_NOERROR;
3414 }
3415#endif /* RE_ENABLE_I18N */
3416 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3417 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3418 || token->type == OP_OPEN_EQUIV_CLASS)
3419 return parse_bracket_symbol (elem, regexp, token);
3420 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3421 {
3422 /* A '-' must only appear as anything but a range indicator before
3423 the closing bracket. Everything else is an error. */
3424 re_token_t token2;
3425 (void) peek_token_bracket (&token2, regexp, syntax);
3426 if (token2.type != OP_CLOSE_BRACKET)
3427 /* The actual error value is not standardized since this whole
3428 case is undefined. But ERANGE makes good sense. */
3429 return REG_ERANGE;
3430 }
3431 elem->type = SB_CHAR;
3432 elem->opr.ch = token->opr.c;
3433 return REG_NOERROR;
3434}
3435
3436/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3437 such as [:<character_class>:], [.<collating_element>.], and
3438 [=<equivalent_class>=]. */
3439
3440static reg_errcode_t
3441parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3442 re_token_t *token)
3443{
3444 unsigned char ch, delim = token->opr.c;
3445 int i = 0;
3446 if (re_string_eoi(regexp))
3447 return REG_EBRACK;
3448 for (;; ++i)
3449 {
3450 if (i >= BRACKET_NAME_BUF_SIZE)
3451 return REG_EBRACK;
3452 if (token->type == OP_OPEN_CHAR_CLASS)
3453 ch = re_string_fetch_byte_case (regexp);
3454 else
3455 ch = re_string_fetch_byte (regexp);
3456 if (re_string_eoi(regexp))
3457 return REG_EBRACK;
3458 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3459 break;
3460 elem->opr.name[i] = ch;
3461 }
3462 re_string_skip_bytes (regexp, 1);
3463 elem->opr.name[i] = '\0';
3464 switch (token->type)
3465 {
3466 case OP_OPEN_COLL_ELEM:
3467 elem->type = COLL_SYM;
3468 break;
3469 case OP_OPEN_EQUIV_CLASS:
3470 elem->type = EQUIV_CLASS;
3471 break;
3472 case OP_OPEN_CHAR_CLASS:
3473 elem->type = CHAR_CLASS;
3474 break;
3475 default:
3476 break;
3477 }
3478 return REG_NOERROR;
3479}
3480
3481 /* Helper function for parse_bracket_exp.
3482 Build the equivalence class which is represented by NAME.
3483 The result are written to MBCSET and SBCSET.
3484 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3485 is a pointer argument since we may update it. */
3486
3487static reg_errcode_t
3488#ifdef RE_ENABLE_I18N
3489build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3490 Idx *equiv_class_alloc, const unsigned char *name)
3491#else /* not RE_ENABLE_I18N */
3492build_equiv_class (bitset_t sbcset, const unsigned char *name)
3493#endif /* not RE_ENABLE_I18N */
3494{
3495#ifdef _LIBC
3496 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3497 if (nrules != 0)
3498 {
3499 const int32_t *table, *indirect;
3500 const unsigned char *weights, *extra, *cp;
3501 unsigned char char_buf[2];
3502 int32_t idx1, idx2;
3503 unsigned int ch;
3504 size_t len;
3505 /* Calculate the index for equivalence class. */
3506 cp = name;
3507 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3508 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3509 _NL_COLLATE_WEIGHTMB);
3510 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3511 _NL_COLLATE_EXTRAMB);
3512 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3513 _NL_COLLATE_INDIRECTMB);
3514 idx1 = findidx (table, indirect, extra, &cp, -1);
3515 if (BE (idx1 == 0 || *cp != '\0', 0))
3516 /* This isn't a valid character. */
3517 return REG_ECOLLATE;
3518
3519 /* Build single byte matching table for this equivalence class. */
3520 len = weights[idx1 & 0xffffff];
3521 for (ch = 0; ch < SBC_MAX; ++ch)
3522 {
3523 char_buf[0] = ch;
3524 cp = char_buf;
3525 idx2 = findidx (table, indirect, extra, &cp, 1);
3526/*
3527 idx2 = table[ch];
3528*/
3529 if (idx2 == 0)
3530 /* This isn't a valid character. */
3531 continue;
3532 /* Compare only if the length matches and the collation rule
3533 index is the same. */
3534 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3535 {
3536 int cnt = 0;
3537
3538 while (cnt <= len &&
3539 weights[(idx1 & 0xffffff) + 1 + cnt]
3540 == weights[(idx2 & 0xffffff) + 1 + cnt])
3541 ++cnt;
3542
3543 if (cnt > len)
3544 bitset_set (sbcset, ch);
3545 }
3546 }
3547 /* Check whether the array has enough space. */
3548 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3549 {
3550 /* Not enough, realloc it. */
3551 /* +1 in case of mbcset->nequiv_classes is 0. */
3552 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3553 /* Use realloc since the array is NULL if *alloc == 0. */
3554 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3555 int32_t,
3556 new_equiv_class_alloc);
3557 if (BE (new_equiv_classes == NULL, 0))
3558 return REG_ESPACE;
3559 mbcset->equiv_classes = new_equiv_classes;
3560 *equiv_class_alloc = new_equiv_class_alloc;
3561 }
3562 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3563 }
3564 else
3565#endif /* _LIBC */
3566 {
3567 if (BE (strlen ((const char *) name) != 1, 0))
3568 return REG_ECOLLATE;
3569 bitset_set (sbcset, *name);
3570 }
3571 return REG_NOERROR;
3572}
3573
3574 /* Helper function for parse_bracket_exp.
3575 Build the character class which is represented by NAME.
3576 The result are written to MBCSET and SBCSET.
3577 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3578 is a pointer argument since we may update it. */
3579
3580static reg_errcode_t
3581#ifdef RE_ENABLE_I18N
3582build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3583 re_charset_t *mbcset, Idx *char_class_alloc,
3584 const char *class_name, reg_syntax_t syntax)
3585#else /* not RE_ENABLE_I18N */
3586build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3587 const char *class_name, reg_syntax_t syntax)
3588#endif /* not RE_ENABLE_I18N */
3589{
3590 int i;
3591 const char *name = class_name;
3592
3593 /* In case of REG_ICASE "upper" and "lower" match the both of
3594 upper and lower cases. */
3595 if ((syntax & RE_ICASE)
3596 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3597 name = "alpha";
3598
3599#ifdef RE_ENABLE_I18N
3600 /* Check the space of the arrays. */
3601 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3602 {
3603 /* Not enough, realloc it. */
3604 /* +1 in case of mbcset->nchar_classes is 0. */
3605 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3606 /* Use realloc since array is NULL if *alloc == 0. */
3607 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3608 new_char_class_alloc);
3609 if (BE (new_char_classes == NULL, 0))
3610 return REG_ESPACE;
3611 mbcset->char_classes = new_char_classes;
3612 *char_class_alloc = new_char_class_alloc;
3613 }
3614 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3615#endif /* RE_ENABLE_I18N */
3616
3617#define BUILD_CHARCLASS_LOOP(ctype_func) \
3618 do { \
3619 if (BE (trans != NULL, 0)) \
3620 { \
3621 for (i = 0; i < SBC_MAX; ++i) \
3622 if (ctype_func (i)) \
3623 bitset_set (sbcset, trans[i]); \
3624 } \
3625 else \
3626 { \
3627 for (i = 0; i < SBC_MAX; ++i) \
3628 if (ctype_func (i)) \
3629 bitset_set (sbcset, i); \
3630 } \
3631 } while (0)
3632
3633 if (strcmp (name, "alnum") == 0)
3634 BUILD_CHARCLASS_LOOP (isalnum);
3635 else if (strcmp (name, "cntrl") == 0)
3636 BUILD_CHARCLASS_LOOP (iscntrl);
3637 else if (strcmp (name, "lower") == 0)
3638 BUILD_CHARCLASS_LOOP (islower);
3639 else if (strcmp (name, "space") == 0)
3640 BUILD_CHARCLASS_LOOP (isspace);
3641 else if (strcmp (name, "alpha") == 0)
3642 BUILD_CHARCLASS_LOOP (isalpha);
3643 else if (strcmp (name, "digit") == 0)
3644 BUILD_CHARCLASS_LOOP (isdigit);
3645 else if (strcmp (name, "print") == 0)
3646 BUILD_CHARCLASS_LOOP (isprint);
3647 else if (strcmp (name, "upper") == 0)
3648 BUILD_CHARCLASS_LOOP (isupper);
3649 else if (strcmp (name, "blank") == 0)
3650 BUILD_CHARCLASS_LOOP (isblank);
3651 else if (strcmp (name, "graph") == 0)
3652 BUILD_CHARCLASS_LOOP (isgraph);
3653 else if (strcmp (name, "punct") == 0)
3654 BUILD_CHARCLASS_LOOP (ispunct);
3655 else if (strcmp (name, "xdigit") == 0)
3656 BUILD_CHARCLASS_LOOP (isxdigit);
3657 else
3658 return REG_ECTYPE;
3659
3660 return REG_NOERROR;
3661}
3662
3663static bin_tree_t *
3664build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3665 const char *class_name,
3666 const char *extra, bool non_match,
3667 reg_errcode_t *err)
3668{
3669 re_bitset_ptr_t sbcset;
3670#ifdef RE_ENABLE_I18N
3671 re_charset_t *mbcset;
3672 Idx alloc = 0;
3673#endif /* not RE_ENABLE_I18N */
3674 reg_errcode_t ret;
3675 re_token_t br_token;
3676 bin_tree_t *tree;
3677
3678 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3679 if (BE (sbcset == NULL, 0))
3680 {
3681 *err = REG_ESPACE;
3682 return NULL;
3683 }
3684#ifdef RE_ENABLE_I18N
3685 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3686 if (BE (mbcset == NULL, 0))
3687 {
3688 re_free (sbcset);
3689 *err = REG_ESPACE;
3690 return NULL;
3691 }
3692 mbcset->non_match = non_match;
3693#endif /* RE_ENABLE_I18N */
3694
3695 /* We don't care the syntax in this case. */
3696 ret = build_charclass (trans, sbcset,
3697#ifdef RE_ENABLE_I18N
3698 mbcset, &alloc,
3699#endif /* RE_ENABLE_I18N */
3700 class_name, 0);
3701
3702 if (BE (ret != REG_NOERROR, 0))
3703 {
3704 re_free (sbcset);
3705#ifdef RE_ENABLE_I18N
3706 free_charset (mbcset);
3707#endif /* RE_ENABLE_I18N */
3708 *err = ret;
3709 return NULL;
3710 }
3711 /* \w match '_' also. */
3712 for (; *extra; extra++)
3713 bitset_set (sbcset, *extra);
3714
3715 /* If it is non-matching list. */
3716 if (non_match)
3717 bitset_not (sbcset);
3718
3719#ifdef RE_ENABLE_I18N
3720 /* Ensure only single byte characters are set. */
3721 if (dfa->mb_cur_max > 1)
3722 bitset_mask (sbcset, dfa->sb_char);
3723#endif
3724
3725 /* Build a tree for simple bracket. */
3726#if defined GCC_LINT || defined lint
3727 memset (&br_token, 0, sizeof br_token);
3728#endif
3729 br_token.type = SIMPLE_BRACKET;
3730 br_token.opr.sbcset = sbcset;
3731 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3732 if (BE (tree == NULL, 0))
3733 goto build_word_op_espace;
3734
3735#ifdef RE_ENABLE_I18N
3736 if (dfa->mb_cur_max > 1)
3737 {
3738 bin_tree_t *mbc_tree;
3739 /* Build a tree for complex bracket. */
3740 br_token.type = COMPLEX_BRACKET;
3741 br_token.opr.mbcset = mbcset;
3742 dfa->has_mb_node = 1;
3743 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3744 if (BE (mbc_tree == NULL, 0))
3745 goto build_word_op_espace;
3746 /* Then join them by ALT node. */
3747 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3748 if (BE (mbc_tree != NULL, 1))
3749 return tree;
3750 }
3751 else
3752 {
3753 free_charset (mbcset);
3754 return tree;
3755 }
3756#else /* not RE_ENABLE_I18N */
3757 return tree;
3758#endif /* not RE_ENABLE_I18N */
3759
3760 build_word_op_espace:
3761 re_free (sbcset);
3762#ifdef RE_ENABLE_I18N
3763 free_charset (mbcset);
3764#endif /* RE_ENABLE_I18N */
3765 *err = REG_ESPACE;
3766 return NULL;
3767}
3768
3769/* This is intended for the expressions like "a{1,3}".
3770 Fetch a number from 'input', and return the number.
3771 Return -1 if the number field is empty like "{,1}".
3772 Return RE_DUP_MAX + 1 if the number field is too large.
3773 Return -2 if an error occurred. */
3774
3775static Idx
3776fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3777{
3778 Idx num = -1;
3779 unsigned char c;
3780 while (1)
3781 {
3782 fetch_token (token, input, syntax);
3783 c = token->opr.c;
3784 if (BE (token->type == END_OF_RE, 0))
3785 return -2;
3786 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3787 break;
3788 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3789 ? -2
3790 : num == -1
3791 ? c - '0'
3792 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3793 }
3794 return num;
3795}
3796
#ifdef RE_ENABLE_I18N
/* Release the multibyte character set CSET together with every array
   it owns.  CSET must not be NULL.  */

static void
free_charset (re_charset_t *cset)
{
  re_free (cset->mbchars);
# ifdef _LIBC
  re_free (cset->coll_syms);
  re_free (cset->equiv_classes);
# endif
  /* The range arrays are filled in by build_range_exp in both the
     _LIBC and the non-_LIBC builds, so they have to be released
     unconditionally; freeing them only under _LIBC leaked them in
     every other build.
     NOTE(review): the non-_LIBC build_range_exp frees freshly
     reallocated arrays on its allocation-failure path without storing
     them back into CSET; confirm that path cannot leave a stale pointer
     in range_starts/range_ends.  */
  re_free (cset->range_starts);
  re_free (cset->range_ends);
  re_free (cset->char_classes);
  re_free (cset);
}
#endif /* RE_ENABLE_I18N */
3812
3813/* Functions for binary tree operation. */
3814
3815/* Create a tree node. */
3816
3817static bin_tree_t *
3818create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3819 re_token_type_t type)
3820{
3821 re_token_t t;
3822#if defined GCC_LINT || defined lint
3823 memset (&t, 0, sizeof t);
3824#endif
3825 t.type = type;
3826 return create_token_tree (dfa, left, right, &t);
3827}
3828
3829static bin_tree_t *
3830create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3831 const re_token_t *token)
3832{
3833 bin_tree_t *tree;
3834 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3835 {
3836 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3837
3838 if (storage == NULL)
3839 return NULL;
3840 storage->next = dfa->str_tree_storage;
3841 dfa->str_tree_storage = storage;
3842 dfa->str_tree_storage_idx = 0;
3843 }
3844 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3845
3846 tree->parent = NULL;
3847 tree->left = left;
3848 tree->right = right;
3849 tree->token = *token;
3850 tree->token.duplicated = 0;
3851 tree->token.opt_subexp = 0;
3852 tree->first = NULL;
3853 tree->next = NULL;
3854 tree->node_idx = -1;
3855
3856 if (left != NULL)
3857 left->parent = tree;
3858 if (right != NULL)
3859 right->parent = tree;
3860 return tree;
3861}
3862
3863/* Mark the tree SRC as an optional subexpression.
3864 To be called from preorder or postorder. */
3865
3866static reg_errcode_t
3867mark_opt_subexp (void *extra, bin_tree_t *node)
3868{
3869 Idx idx = (uintptr_t) extra;
3870 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3871 node->token.opt_subexp = 1;
3872
3873 return REG_NOERROR;
3874}
3875
3876/* Free the allocated memory inside NODE. */
3877
3878static void
3879free_token (re_token_t *node)
3880{
3881#ifdef RE_ENABLE_I18N
3882 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3883 free_charset (node->opr.mbcset);
3884 else
3885#endif /* RE_ENABLE_I18N */
3886 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3887 re_free (node->opr.sbcset);
3888}
3889
3890/* Worker function for tree walking. Free the allocated memory inside NODE
3891 and its children. */
3892
3893static reg_errcode_t
3894free_tree (void *extra, bin_tree_t *node)
3895{
3896 free_token (&node->token);
3897 return REG_NOERROR;
3898}
3899
3900
3901/* Duplicate the node SRC, and return new node. This is a preorder
3902 visit similar to the one implemented by the generic visitor, but
3903 we need more infrastructure to maintain two parallel trees --- so,
3904 it's easier to duplicate. */
3905
3906static bin_tree_t *
3907duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3908{
3909 const bin_tree_t *node;
3910 bin_tree_t *dup_root;
3911 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3912
3913 for (node = root; ; )
3914 {
3915 /* Create a new tree and link it back to the current parent. */
3916 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3917 if (*p_new == NULL)
3918 return NULL;
3919 (*p_new)->parent = dup_node;
3920 (*p_new)->token.duplicated = 1;
3921 dup_node = *p_new;
3922
3923 /* Go to the left node, or up and to the right. */
3924 if (node->left)
3925 {
3926 node = node->left;
3927 p_new = &dup_node->left;
3928 }
3929 else
3930 {
3931 const bin_tree_t *prev = NULL;
3932 while (node->right == prev || node->right == NULL)
3933 {
3934 prev = node;
3935 node = node->parent;
3936 dup_node = dup_node->parent;
3937 if (!node)
3938 return dup_root;
3939 }
3940 node = node->right;
3941 p_new = &dup_node->right;
3942 }
3943 }
3944}
diff --git a/lib/regex.c b/lib/regex.c
new file mode 100644
index 00000000000..499e1f0e035
--- /dev/null
+++ b/lib/regex.c
@@ -0,0 +1,81 @@
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002-2018 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public
8 License as published by the Free Software Foundation; either
9 version 3 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
15
16 You should have received a copy of the GNU General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
19
20#ifndef _LIBC
21# include <config.h>
22
23# if (__GNUC__ == 4 && 6 <= __GNUC_MINOR__) || 4 < __GNUC__
24# pragma GCC diagnostic ignored "-Wsuggest-attribute=pure"
25# endif
26# if (__GNUC__ == 4 && 3 <= __GNUC_MINOR__) || 4 < __GNUC__
27# pragma GCC diagnostic ignored "-Wold-style-definition"
28# pragma GCC diagnostic ignored "-Wtype-limits"
29# endif
30#endif
31
32/* Make sure no one compiles this code with a C++ compiler. */
33#if defined __cplusplus && defined _LIBC
34# error "This is C code, use a C compiler"
35#endif
36
37#ifdef _LIBC
38/* We have to keep the namespace clean. */
39# define regfree(preg) __regfree (preg)
40# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
41# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
42# define regerror(errcode, preg, errbuf, errbuf_size) \
43 __regerror(errcode, preg, errbuf, errbuf_size)
44# define re_set_registers(bu, re, nu, st, en) \
45 __re_set_registers (bu, re, nu, st, en)
46# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
47 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
48# define re_match(bufp, string, size, pos, regs) \
49 __re_match (bufp, string, size, pos, regs)
50# define re_search(bufp, string, size, startpos, range, regs) \
51 __re_search (bufp, string, size, startpos, range, regs)
52# define re_compile_pattern(pattern, length, bufp) \
53 __re_compile_pattern (pattern, length, bufp)
54# define re_set_syntax(syntax) __re_set_syntax (syntax)
55# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
56 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
57# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
58
59# include "../locale/localeinfo.h"
60#endif
61
62/* On some systems, limits.h sets RE_DUP_MAX to a lower value than
63 GNU regex allows. Include it before <regex.h>, which correctly
64 #undefs RE_DUP_MAX and sets it to the right value. */
65#include <limits.h>
66
67#include <regex.h>
68#include "regex_internal.h"
69
70#include "regex_internal.c"
71#include "regcomp.c"
72#include "regexec.c"
73
74/* Binary backward compatibility. */
75#if _LIBC
76# include <shlib-compat.h>
77# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3)
78link_warning (re_max_failures, "the 're_max_failures' variable is obsolete and will go away.")
79int re_max_failures = 2000;
80# endif
81#endif
diff --git a/lib/regex.h b/lib/regex.h
new file mode 100644
index 00000000000..f2ac9507adb
--- /dev/null
+++ b/lib/regex.h
@@ -0,0 +1,658 @@
1/* Definitions for data structures and routines for the regular
2 expression library.
3 Copyright (C) 1985, 1989-2018 Free Software Foundation, Inc.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public
8 License as published by the Free Software Foundation; either
9 version 3 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
15
16 You should have received a copy of the GNU General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
19
20#ifndef _REGEX_H
21#define _REGEX_H 1
22
23#include <sys/types.h>
24
25/* Allow the use in C++ code. */
26#ifdef __cplusplus
27extern "C" {
28#endif
29
30/* Define __USE_GNU to declare GNU extensions that violate the
31 POSIX name space rules. */
32#ifdef _GNU_SOURCE
33# define __USE_GNU 1
34#endif
35
36#ifdef _REGEX_LARGE_OFFSETS
37
38/* Use types and values that are wide enough to represent signed and
39 unsigned byte offsets in memory. This currently works only when
40 the regex code is used outside of the GNU C library; it is not yet
41 supported within glibc itself, and glibc users should not define
42 _REGEX_LARGE_OFFSETS. */
43
44/* The type of object sizes. */
45typedef size_t __re_size_t;
46
47/* The type of object sizes, in places where the traditional code
48 uses unsigned long int. */
49typedef size_t __re_long_size_t;
50
51#else
52
53/* The traditional GNU regex implementation mishandles strings longer
54 than INT_MAX. */
55typedef unsigned int __re_size_t;
56typedef unsigned long int __re_long_size_t;
57
58#endif
59
60/* The following two types have to be signed and unsigned integer types
61 wide enough to hold the value of a pointer. For most ANSI compilers
62 ptrdiff_t and size_t should likely be OK. Still, the size of these two
63 types is 2 for Microsoft C. Ugh... */
64typedef long int s_reg_t;
65typedef unsigned long int active_reg_t;
66
67/* The following bits are used to determine the regexp syntax we
68 recognize. The set/not-set meanings are chosen so that Emacs syntax
69 remains the value 0. The bits are given in alphabetical order, and
70 the definitions shifted by one from the previous bit; thus, when we
71 add or remove a bit, only one other definition need change. */
72typedef unsigned long int reg_syntax_t;
73
74#ifdef __USE_GNU
75/* If this bit is not set, then \ inside a bracket expression is literal.
76 If set, then such a \ quotes the following character. */
77# define RE_BACKSLASH_ESCAPE_IN_LISTS ((unsigned long int) 1)
78
79/* If this bit is not set, then + and ? are operators, and \+ and \? are
80 literals.
81 If set, then \+ and \? are operators and + and ? are literals. */
82# define RE_BK_PLUS_QM (RE_BACKSLASH_ESCAPE_IN_LISTS << 1)
83
84/* If this bit is set, then character classes are supported. They are:
85 [:alpha:], [:upper:], [:lower:], [:digit:], [:alnum:], [:xdigit:],
86 [:space:], [:print:], [:punct:], [:graph:], and [:cntrl:].
87 If not set, then character classes are not supported. */
88# define RE_CHAR_CLASSES (RE_BK_PLUS_QM << 1)
89
90/* If this bit is set, then ^ and $ are always anchors (outside bracket
91 expressions, of course).
92 If this bit is not set, then it depends:
93 ^ is an anchor if it is at the beginning of a regular
94 expression or after an open-group or an alternation operator;
95 $ is an anchor if it is at the end of a regular expression, or
96 before a close-group or an alternation operator.
97
98 This bit could be (re)combined with RE_CONTEXT_INDEP_OPS, because
99 POSIX draft 11.2 says that * etc. in leading positions is undefined.
100 We already implemented a previous draft which made those constructs
101 invalid, though, so we haven't changed the code back. */
102# define RE_CONTEXT_INDEP_ANCHORS (RE_CHAR_CLASSES << 1)
103
104/* If this bit is set, then special characters are always special
105 regardless of where they are in the pattern.
106 If this bit is not set, then special characters are special only in
107 some contexts; otherwise they are ordinary. Specifically,
108 * + ? and intervals are only special when not after the beginning,
109 open-group, or alternation operator. */
110# define RE_CONTEXT_INDEP_OPS (RE_CONTEXT_INDEP_ANCHORS << 1)
111
112/* If this bit is set, then *, +, ?, and { cannot be first in an re or
113 immediately after an alternation or begin-group operator. */
114# define RE_CONTEXT_INVALID_OPS (RE_CONTEXT_INDEP_OPS << 1)
115
116/* If this bit is set, then . matches newline.
117 If not set, then it doesn't. */
118# define RE_DOT_NEWLINE (RE_CONTEXT_INVALID_OPS << 1)
119
120/* If this bit is set, then . doesn't match NUL.
121 If not set, then it does. */
122# define RE_DOT_NOT_NULL (RE_DOT_NEWLINE << 1)
123
124/* If this bit is set, nonmatching lists [^...] do not match newline.
125 If not set, they do. */
126# define RE_HAT_LISTS_NOT_NEWLINE (RE_DOT_NOT_NULL << 1)
127
128/* If this bit is set, either \{...\} or {...} defines an
129 interval, depending on RE_NO_BK_BRACES.
130 If not set, \{, \}, {, and } are literals. */
131# define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1)
132
133/* If this bit is set, +, ? and | aren't recognized as operators.
134 If not set, they are. */
135# define RE_LIMITED_OPS (RE_INTERVALS << 1)
136
137/* If this bit is set, newline is an alternation operator.
138 If not set, newline is literal. */
139# define RE_NEWLINE_ALT (RE_LIMITED_OPS << 1)
140
141/* If this bit is set, then '{...}' defines an interval, and \{ and \}
142 are literals.
143 If not set, then '\{...\}' defines an interval. */
144# define RE_NO_BK_BRACES (RE_NEWLINE_ALT << 1)
145
146/* If this bit is set, (...) defines a group, and \( and \) are literals.
147 If not set, \(...\) defines a group, and ( and ) are literals. */
148# define RE_NO_BK_PARENS (RE_NO_BK_BRACES << 1)
149
150/* If this bit is set, then \<digit> matches <digit>.
151 If not set, then \<digit> is a back-reference. */
152# define RE_NO_BK_REFS (RE_NO_BK_PARENS << 1)
153
154/* If this bit is set, then | is an alternation operator, and \| is literal.
155 If not set, then \| is an alternation operator, and | is literal. */
156# define RE_NO_BK_VBAR (RE_NO_BK_REFS << 1)
157
158/* If this bit is set, then an ending range point collating lower
159 than the starting range point, as in [z-a], is invalid.
160 If not set, then when the ending range point collates lower than the
161 starting range point, the range is ignored. */
162# define RE_NO_EMPTY_RANGES (RE_NO_BK_VBAR << 1)
163
164/* If this bit is set, then an unmatched ) is ordinary.
165 If not set, then an unmatched ) is invalid. */
166# define RE_UNMATCHED_RIGHT_PAREN_ORD (RE_NO_EMPTY_RANGES << 1)
167
168/* If this bit is set, succeed as soon as we match the whole pattern,
169 without further backtracking. */
170# define RE_NO_POSIX_BACKTRACKING (RE_UNMATCHED_RIGHT_PAREN_ORD << 1)
171
172/* If this bit is set, do not process the GNU regex operators.
173 If not set, then the GNU regex operators are recognized. */
174# define RE_NO_GNU_OPS (RE_NO_POSIX_BACKTRACKING << 1)
175
176/* If this bit is set, turn on internal regex debugging.
177 If not set, and debugging was on, turn it off.
178 This only works if regex.c is compiled -DDEBUG.
179 We define this bit always, so that all that's needed to turn on
180 debugging is to recompile regex.c; the calling code can always have
181 this bit set, and it won't affect anything in the normal case. */
182# define RE_DEBUG (RE_NO_GNU_OPS << 1)
183
184/* If this bit is set, a syntactically invalid interval is treated as
185 a string of ordinary characters. For example, the ERE 'a{1' is
186 treated as 'a\{1'. */
187# define RE_INVALID_INTERVAL_ORD (RE_DEBUG << 1)
188
189/* If this bit is set, then ignore case when matching.
190 If not set, then case is significant. */
191# define RE_ICASE (RE_INVALID_INTERVAL_ORD << 1)
192
193/* This bit is used internally like RE_CONTEXT_INDEP_ANCHORS but only
194 for ^, because it is difficult to scan the regex backwards to find
195 whether ^ should be special. */
196# define RE_CARET_ANCHORS_HERE (RE_ICASE << 1)
197
198/* If this bit is set, then \{ cannot be first in a regex or
199 immediately after an alternation, open-group or \} operator. */
200# define RE_CONTEXT_INVALID_DUP (RE_CARET_ANCHORS_HERE << 1)
201
202/* If this bit is set, then no_sub will be set to 1 during
203 re_compile_pattern. */
204# define RE_NO_SUB (RE_CONTEXT_INVALID_DUP << 1)
205#endif
206
207/* This global variable defines the particular regexp syntax to use (for
208 some interfaces). When a regexp is compiled, the syntax used is
209 stored in the pattern buffer, so changing this does not affect
210 already-compiled regexps. */
211extern reg_syntax_t re_syntax_options;
212
213#ifdef __USE_GNU
214/* Define combinations of the above bits for the standard possibilities.
215 (The [[[ comments delimit what gets put into the Texinfo file, so
216 don't delete them!) */
217/* [[[begin syntaxes]]] */
218# define RE_SYNTAX_EMACS 0
219
220# define RE_SYNTAX_AWK \
221 (RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DOT_NOT_NULL \
222 | RE_NO_BK_PARENS | RE_NO_BK_REFS \
223 | RE_NO_BK_VBAR | RE_NO_EMPTY_RANGES \
224 | RE_DOT_NEWLINE | RE_CONTEXT_INDEP_ANCHORS \
225 | RE_CHAR_CLASSES \
226 | RE_UNMATCHED_RIGHT_PAREN_ORD | RE_NO_GNU_OPS)
227
228# define RE_SYNTAX_GNU_AWK \
229 ((RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS \
230 | RE_INVALID_INTERVAL_ORD) \
231 & ~(RE_DOT_NOT_NULL | RE_CONTEXT_INDEP_OPS \
232 | RE_CONTEXT_INVALID_OPS ))
233
234# define RE_SYNTAX_POSIX_AWK \
235 (RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS \
236 | RE_INTERVALS | RE_NO_GNU_OPS \
237 | RE_INVALID_INTERVAL_ORD)
238
239# define RE_SYNTAX_GREP \
240 ((RE_SYNTAX_POSIX_BASIC | RE_NEWLINE_ALT) \
241 & ~(RE_CONTEXT_INVALID_DUP | RE_DOT_NOT_NULL))
242
243# define RE_SYNTAX_EGREP \
244 ((RE_SYNTAX_POSIX_EXTENDED | RE_INVALID_INTERVAL_ORD | RE_NEWLINE_ALT) \
245 & ~(RE_CONTEXT_INVALID_OPS | RE_DOT_NOT_NULL))
246
247/* POSIX grep -E behavior is no longer incompatible with GNU. */
248# define RE_SYNTAX_POSIX_EGREP \
249 RE_SYNTAX_EGREP
250
251/* P1003.2/D11.2, section 4.20.7.1, lines 5078ff. */
252# define RE_SYNTAX_ED RE_SYNTAX_POSIX_BASIC
253
254# define RE_SYNTAX_SED RE_SYNTAX_POSIX_BASIC
255
256/* Syntax bits common to both basic and extended POSIX regex syntax. */
257# define _RE_SYNTAX_POSIX_COMMON \
258 (RE_CHAR_CLASSES | RE_DOT_NEWLINE | RE_DOT_NOT_NULL \
259 | RE_INTERVALS | RE_NO_EMPTY_RANGES)
260
261# define RE_SYNTAX_POSIX_BASIC \
262 (_RE_SYNTAX_POSIX_COMMON | RE_BK_PLUS_QM | RE_CONTEXT_INVALID_DUP)
263
264/* Differs from ..._POSIX_BASIC only in that RE_BK_PLUS_QM becomes
265 RE_LIMITED_OPS, i.e., \? \+ \| are not recognized. Actually, this
266 isn't minimal, since other operators, such as \`, aren't disabled. */
267# define RE_SYNTAX_POSIX_MINIMAL_BASIC \
268 (_RE_SYNTAX_POSIX_COMMON | RE_LIMITED_OPS)
269
270# define RE_SYNTAX_POSIX_EXTENDED \
271 (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \
272 | RE_CONTEXT_INDEP_OPS | RE_NO_BK_BRACES \
273 | RE_NO_BK_PARENS | RE_NO_BK_VBAR \
274 | RE_CONTEXT_INVALID_OPS | RE_UNMATCHED_RIGHT_PAREN_ORD)
275
276/* Differs from ..._POSIX_EXTENDED in that RE_CONTEXT_INDEP_OPS is
277 removed and RE_NO_BK_REFS is added. */
278# define RE_SYNTAX_POSIX_MINIMAL_EXTENDED \
279 (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \
280 | RE_CONTEXT_INVALID_OPS | RE_NO_BK_BRACES \
281 | RE_NO_BK_PARENS | RE_NO_BK_REFS \
282 | RE_NO_BK_VBAR | RE_UNMATCHED_RIGHT_PAREN_ORD)
283/* [[[end syntaxes]]] */
284
285/* Maximum number of duplicates an interval can allow. POSIX-conforming
286 systems might define this in <limits.h>, but we want our
287 value, so remove any previous define. */
288# ifdef _REGEX_INCLUDE_LIMITS_H
289# include <limits.h>
290# endif
291# ifdef RE_DUP_MAX
292# undef RE_DUP_MAX
293# endif
294
295/* RE_DUP_MAX is 2**15 - 1 because an earlier implementation stored
296 the counter as a 2-byte signed integer. This is no longer true, so
297 RE_DUP_MAX could be increased to (INT_MAX / 10 - 1), or to
298 ((SIZE_MAX - 9) / 10) if _REGEX_LARGE_OFFSETS is defined.
299 However, there would be a huge performance problem if someone
300 actually used a pattern like a\{214748363\}, so RE_DUP_MAX retains
301 its historical value. */
302# define RE_DUP_MAX (0x7fff)
303#endif
304
305
306/* POSIX 'cflags' bits (i.e., information for 'regcomp'). */
307
308/* If this bit is set, then use extended regular expression syntax.
309 If not set, then use basic regular expression syntax. */
310#define REG_EXTENDED 1
311
312/* If this bit is set, then ignore case when matching.
313 If not set, then case is significant. */
314#define REG_ICASE (1 << 1)
315
316/* If this bit is set, then anchors match at newline
317 characters in the string.
318 If not set, then anchors do not match at newlines. */
319#define REG_NEWLINE (1 << 2)
320
321/* If this bit is set, then report only success or fail in regexec.
322 If not set, then returns differ between not matching and errors. */
323#define REG_NOSUB (1 << 3)
324
325
326/* POSIX 'eflags' bits (i.e., information for regexec). */
327
328/* If this bit is set, then the beginning-of-line operator doesn't match
329 the beginning of the string (presumably because it's not the
330 beginning of a line).
331 If not set, then the beginning-of-line operator does match the
332 beginning of the string. */
333#define REG_NOTBOL 1
334
335/* Like REG_NOTBOL, except for the end-of-line. */
336#define REG_NOTEOL (1 << 1)
337
338/* Use PMATCH[0] to delimit the start and end of the search in the
339 buffer. */
340#define REG_STARTEND (1 << 2)
341
342
343/* If any error codes are removed, changed, or added, update the
344 '__re_error_msgid' table in regcomp.c. */
345
346typedef enum
347{
348 _REG_ENOSYS = -1, /* This will never happen for this implementation. */
349 _REG_NOERROR = 0, /* Success. */
350 _REG_NOMATCH, /* Didn't find a match (for regexec). */
351
352 /* POSIX regcomp return error codes. (In the order listed in the
353 standard.) */
354 _REG_BADPAT, /* Invalid pattern. */
355 _REG_ECOLLATE, /* Invalid collating element. */
356 _REG_ECTYPE, /* Invalid character class name. */
357 _REG_EESCAPE, /* Trailing backslash. */
358 _REG_ESUBREG, /* Invalid back reference. */
359 _REG_EBRACK, /* Unmatched left bracket. */
360 _REG_EPAREN, /* Parenthesis imbalance. */
361 _REG_EBRACE, /* Unmatched \{. */
362 _REG_BADBR, /* Invalid contents of \{\}. */
363 _REG_ERANGE, /* Invalid range end. */
364 _REG_ESPACE, /* Ran out of memory. */
365 _REG_BADRPT, /* No preceding re for repetition op. */
366
367 /* Error codes we've added. */
368 _REG_EEND, /* Premature end. */
369 _REG_ESIZE, /* Too large (e.g., repeat count too large). */
370 _REG_ERPAREN /* Unmatched ) or \); not returned from regcomp. */
371} reg_errcode_t;
372
373#if defined _XOPEN_SOURCE || defined __USE_XOPEN2K
374# define REG_ENOSYS _REG_ENOSYS
375#endif
376#define REG_NOERROR _REG_NOERROR
377#define REG_NOMATCH _REG_NOMATCH
378#define REG_BADPAT _REG_BADPAT
379#define REG_ECOLLATE _REG_ECOLLATE
380#define REG_ECTYPE _REG_ECTYPE
381#define REG_EESCAPE _REG_EESCAPE
382#define REG_ESUBREG _REG_ESUBREG
383#define REG_EBRACK _REG_EBRACK
384#define REG_EPAREN _REG_EPAREN
385#define REG_EBRACE _REG_EBRACE
386#define REG_BADBR _REG_BADBR
387#define REG_ERANGE _REG_ERANGE
388#define REG_ESPACE _REG_ESPACE
389#define REG_BADRPT _REG_BADRPT
390#define REG_EEND _REG_EEND
391#define REG_ESIZE _REG_ESIZE
392#define REG_ERPAREN _REG_ERPAREN
393
394/* This data structure represents a compiled pattern. Before calling
395 the pattern compiler, the fields 'buffer', 'allocated', 'fastmap',
396 and 'translate' can be set. After the pattern has been compiled,
397 the fields 're_nsub', 'not_bol' and 'not_eol' are available. All
398 other fields are private to the regex routines. */
399
400#ifndef RE_TRANSLATE_TYPE
401# define __RE_TRANSLATE_TYPE unsigned char *
402# ifdef __USE_GNU
403# define RE_TRANSLATE_TYPE __RE_TRANSLATE_TYPE
404# endif
405#endif
406
407#ifdef __USE_GNU
408# define __REPB_PREFIX(name) name
409#else
410# define __REPB_PREFIX(name) __##name
411#endif
412
413struct re_pattern_buffer
414{
415 /* Space that holds the compiled pattern. The type
416 'struct re_dfa_t' is private and is not declared here. */
417 struct re_dfa_t *__REPB_PREFIX(buffer);
418
419 /* Number of bytes to which 'buffer' points. */
420 __re_long_size_t __REPB_PREFIX(allocated);
421
422 /* Number of bytes actually used in 'buffer'. */
423 __re_long_size_t __REPB_PREFIX(used);
424
425 /* Syntax setting with which the pattern was compiled. */
426 reg_syntax_t __REPB_PREFIX(syntax);
427
428 /* Pointer to a fastmap, if any, otherwise zero. re_search uses the
429 fastmap, if there is one, to skip over impossible starting points
430 for matches. */
431 char *__REPB_PREFIX(fastmap);
432
433 /* Either a translate table to apply to all characters before
434 comparing them, or zero for no translation. The translation is
435 applied to a pattern when it is compiled and to a string when it
436 is matched. */
437 __RE_TRANSLATE_TYPE __REPB_PREFIX(translate);
438
439 /* Number of subexpressions found by the compiler. */
440 size_t re_nsub;
441
442 /* Zero if this pattern cannot match the empty string, one else.
443 Well, in truth it's used only in 're_search_2', to see whether or
444 not we should use the fastmap, so we don't set this absolutely
445 perfectly; see 're_compile_fastmap' (the "duplicate" case). */
446 unsigned __REPB_PREFIX(can_be_null) : 1;
447
448 /* If REGS_UNALLOCATED, allocate space in the 'regs' structure
449 for 'max (RE_NREGS, re_nsub + 1)' groups.
450 If REGS_REALLOCATE, reallocate space if necessary.
451 If REGS_FIXED, use what's there. */
452#ifdef __USE_GNU
453# define REGS_UNALLOCATED 0
454# define REGS_REALLOCATE 1
455# define REGS_FIXED 2
456#endif
457 unsigned __REPB_PREFIX(regs_allocated) : 2;
458
459 /* Set to zero when 're_compile_pattern' compiles a pattern; set to
460 one by 're_compile_fastmap' if it updates the fastmap. */
461 unsigned __REPB_PREFIX(fastmap_accurate) : 1;
462
463 /* If set, 're_match_2' does not return information about
464 subexpressions. */
465 unsigned __REPB_PREFIX(no_sub) : 1;
466
467 /* If set, a beginning-of-line anchor doesn't match at the beginning
468 of the string. */
469 unsigned __REPB_PREFIX(not_bol) : 1;
470
471 /* Similarly for an end-of-line anchor. */
472 unsigned __REPB_PREFIX(not_eol) : 1;
473
474 /* If true, an anchor at a newline matches. */
475 unsigned __REPB_PREFIX(newline_anchor) : 1;
476};
477
478typedef struct re_pattern_buffer regex_t;
479
480/* Type for byte offsets within the string. POSIX mandates this. */
481#ifdef _REGEX_LARGE_OFFSETS
482/* POSIX 1003.1-2008 requires that regoff_t be at least as wide as
483 ptrdiff_t and ssize_t. We don't know of any hosts where ptrdiff_t
484 is wider than ssize_t, so ssize_t is safe. ptrdiff_t is not
485 visible here, so use ssize_t. */
486typedef ssize_t regoff_t;
487#else
488/* The traditional GNU regex implementation mishandles strings longer
489 than INT_MAX. */
490typedef int regoff_t;
491#endif
492
493
494#ifdef __USE_GNU
495/* This is the structure we store register match data in. See
496 regex.texinfo for a full description of what registers match. */
497struct re_registers
498{
499 __re_size_t num_regs;
500 regoff_t *start;
501 regoff_t *end;
502};
503
504
505/* If 'regs_allocated' is REGS_UNALLOCATED in the pattern buffer,
506 're_match_2' returns information about at least this many registers
507 the first time a 'regs' structure is passed. */
508# ifndef RE_NREGS
509# define RE_NREGS 30
510# endif
511#endif
512
513
514/* POSIX specification for registers. Aside from using different names than
515 're_registers', POSIX uses an array of structures, instead of a
516 structure of arrays. */
517typedef struct
518{
519 regoff_t rm_so; /* Byte offset from string's start to substring's start. */
520 regoff_t rm_eo; /* Byte offset from string's start to substring's end. */
521} regmatch_t;
522
523/* Declarations for routines. */
524
525#ifdef __USE_GNU
526/* Sets the current default syntax to SYNTAX, and return the old syntax.
527 You can also simply assign to the 're_syntax_options' variable. */
528extern reg_syntax_t re_set_syntax (reg_syntax_t __syntax);
529
530/* Compile the regular expression PATTERN, with length LENGTH
531 and syntax given by the global 're_syntax_options', into the buffer
532 BUFFER. Return NULL if successful, and an error string if not.
533
534 To free the allocated storage, you must call 'regfree' on BUFFER.
535 Note that the translate table must either have been initialized by
536 'regcomp', with a malloc'ed value, or set to NULL before calling
537 'regfree'. */
538extern const char *re_compile_pattern (const char *__pattern, size_t __length,
539 struct re_pattern_buffer *__buffer);
540
541
542/* Compile a fastmap for the compiled pattern in BUFFER; used to
543 accelerate searches. Return 0 if successful and -2 if there was an
544 internal error. */
545extern int re_compile_fastmap (struct re_pattern_buffer *__buffer);
546
547
548/* Search in the string STRING (with length LENGTH) for the pattern
549 compiled into BUFFER. Start searching at position START, for RANGE
550 characters. Return the starting position of the match, -1 for no
551 match, or -2 for an internal error. Also return register
552 information in REGS (if REGS is nonzero and BUFFER->no_sub is zero). */
553extern regoff_t re_search (struct re_pattern_buffer *__buffer,
554 const char *__String, regoff_t __length,
555 regoff_t __start, regoff_t __range,
556 struct re_registers *__regs);
557
558
559/* Like 're_search', but search in the concatenation of STRING1 and
560 STRING2. Also, stop searching at index START + STOP. */
561extern regoff_t re_search_2 (struct re_pattern_buffer *__buffer,
562 const char *__string1, regoff_t __length1,
563 const char *__string2, regoff_t __length2,
564 regoff_t __start, regoff_t __range,
565 struct re_registers *__regs,
566 regoff_t __stop);
567
568
569/* Like 're_search', but return how many characters in STRING the regexp
570 in BUFFER matched, starting at position START. */
571extern regoff_t re_match (struct re_pattern_buffer *__buffer,
572 const char *__String, regoff_t __length,
573 regoff_t __start, struct re_registers *__regs);
574
575
576/* Relates to 're_match' as 're_search_2' relates to 're_search'. */
577extern regoff_t re_match_2 (struct re_pattern_buffer *__buffer,
578 const char *__string1, regoff_t __length1,
579 const char *__string2, regoff_t __length2,
580 regoff_t __start, struct re_registers *__regs,
581 regoff_t __stop);
582
583
584/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
585 ENDS. Subsequent matches using BUFFER and REGS will use this memory
586 for recording register information. STARTS and ENDS must be
587 allocated with malloc, and must each be at least 'NUM_REGS * sizeof
588 (regoff_t)' bytes long.
589
590 If NUM_REGS == 0, then subsequent matches should allocate their own
591 register data.
592
593 Unless this function is called, the first search or match using
594 BUFFER will allocate its own register data, without
595 freeing the old data. */
596extern void re_set_registers (struct re_pattern_buffer *__buffer,
597 struct re_registers *__regs,
598 __re_size_t __num_regs,
599 regoff_t *__starts, regoff_t *__ends);
600#endif /* Use GNU */
601
602#if defined _REGEX_RE_COMP || (defined _LIBC && defined __USE_MISC)
603# ifndef _CRAY
604/* 4.2 bsd compatibility. */
605extern char *re_comp (const char *);
606extern int re_exec (const char *);
607# endif
608#endif
609
610/* For plain 'restrict', use glibc's __restrict if defined.
611 Otherwise, GCC 2.95 and later have "__restrict"; C99 compilers have
612 "restrict", and "configure" may have defined "restrict".
613 Other compilers use __restrict, __restrict__, and _Restrict, and
614 'configure' might #define 'restrict' to those words, so pick a
615 different name. */
616#ifndef _Restrict_
617# if defined __restrict || 2 < __GNUC__ + (95 <= __GNUC_MINOR__)
618# define _Restrict_ __restrict
619# elif 199901L <= __STDC_VERSION__ || defined restrict
620# define _Restrict_ restrict
621# else
622# define _Restrict_
623# endif
624#endif
625/* For [restrict], use glibc's __restrict_arr if available.
626 Otherwise, GCC 3.1 (not in C++ mode) and C99 support [restrict]. */
627#ifndef _Restrict_arr_
628# ifdef __restrict_arr
629# define _Restrict_arr_ __restrict_arr
630# elif ((199901L <= __STDC_VERSION__ || 3 < __GNUC__ + (1 <= __GNUC_MINOR__)) \
631 && !defined __GNUG__)
632# define _Restrict_arr_ _Restrict_
633# else
634# define _Restrict_arr_
635# endif
636#endif
637
638/* POSIX compatibility. */
639extern int regcomp (regex_t *_Restrict_ __preg,
640 const char *_Restrict_ __pattern,
641 int __cflags);
642
643extern int regexec (const regex_t *_Restrict_ __preg,
644 const char *_Restrict_ __String, size_t __nmatch,
645 regmatch_t __pmatch[_Restrict_arr_],
646 int __eflags);
647
648extern size_t regerror (int __errcode, const regex_t *_Restrict_ __preg,
649 char *_Restrict_ __errbuf, size_t __errbuf_size);
650
651extern void regfree (regex_t *__preg);
652
653
654#ifdef __cplusplus
655}
656#endif /* C++ */
657
658#endif /* regex.h */
diff --git a/lib/regex_internal.c b/lib/regex_internal.c
new file mode 100644
index 00000000000..32373565e6d
--- /dev/null
+++ b/lib/regex_internal.c
@@ -0,0 +1,1740 @@
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002-2018 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public
8 License as published by the Free Software Foundation; either
9 version 3 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
15
16 You should have received a copy of the GNU General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
19
/* Forward declarations of file-local (static) helpers, so that they
 can be called before the point where they are defined. */
20static void re_string_construct_common (const char *str, Idx len,
21 re_string_t *pstr,
22 RE_TRANSLATE_TYPE trans, bool icase,
23 const re_dfa_t *dfa);
24static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
25 const re_node_set *nodes,
26 re_hashval_t hash);
27static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
28 const re_node_set *nodes,
29 unsigned int context,
30 re_hashval_t hash);
31static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
32 Idx new_buf_len);
/* The wide-character builders exist only when RE_ENABLE_I18N is defined. */
33#ifdef RE_ENABLE_I18N
34static void build_wcs_buffer (re_string_t *pstr);
35static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr);
36#endif /* RE_ENABLE_I18N */
37static void build_upper_buffer (re_string_t *pstr);
38static void re_string_translate_buffer (re_string_t *pstr);
39static unsigned int re_string_context_at (const re_string_t *input, Idx idx,
40 int eflags) __attribute__ ((pure));
41
42/* Functions for string operation. */
43
44/* This function allocate the buffers. It is necessary to call
45 re_string_reconstruct before using the object. */
46
47static reg_errcode_t
48__attribute_warn_unused_result__
49re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
50 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
51{
52 reg_errcode_t ret;
53 Idx init_buf_len;
54
55 /* Ensure at least one character fits into the buffers. */
56 if (init_len < dfa->mb_cur_max)
57 init_len = dfa->mb_cur_max;
58 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
59 re_string_construct_common (str, len, pstr, trans, icase, dfa);
60
61 ret = re_string_realloc_buffers (pstr, init_buf_len);
62 if (BE (ret != REG_NOERROR, 0))
63 return ret;
64
65 pstr->word_char = dfa->word_char;
66 pstr->word_ops_used = dfa->word_ops_used;
67 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
68 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
69 pstr->valid_raw_len = pstr->valid_len;
70 return REG_NOERROR;
71}
72
73/* This function allocate the buffers, and initialize them. */
74
75static reg_errcode_t
76__attribute_warn_unused_result__
77re_string_construct (re_string_t *pstr, const char *str, Idx len,
78 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
79{
80 reg_errcode_t ret;
81 memset (pstr, '\0', sizeof (re_string_t));
82 re_string_construct_common (str, len, pstr, trans, icase, dfa);
83
84 if (len > 0)
85 {
86 ret = re_string_realloc_buffers (pstr, len + 1);
87 if (BE (ret != REG_NOERROR, 0))
88 return ret;
89 }
90 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
91
92 if (icase)
93 {
94#ifdef RE_ENABLE_I18N
95 if (dfa->mb_cur_max > 1)
96 {
97 while (1)
98 {
99 ret = build_wcs_upper_buffer (pstr);
100 if (BE (ret != REG_NOERROR, 0))
101 return ret;
102 if (pstr->valid_raw_len >= len)
103 break;
104 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
105 break;
106 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
107 if (BE (ret != REG_NOERROR, 0))
108 return ret;
109 }
110 }
111 else
112#endif /* RE_ENABLE_I18N */
113 build_upper_buffer (pstr);
114 }
115 else
116 {
117#ifdef RE_ENABLE_I18N
118 if (dfa->mb_cur_max > 1)
119 build_wcs_buffer (pstr);
120 else
121#endif /* RE_ENABLE_I18N */
122 {
123 if (trans != NULL)
124 re_string_translate_buffer (pstr);
125 else
126 {
127 pstr->valid_len = pstr->bufs_len;
128 pstr->valid_raw_len = pstr->bufs_len;
129 }
130 }
131 }
132
133 return REG_NOERROR;
134}
135
136/* Helper functions for re_string_allocate, and re_string_construct. */
137
138static reg_errcode_t
139__attribute_warn_unused_result__
140re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
141{
142#ifdef RE_ENABLE_I18N
143 if (pstr->mb_cur_max > 1)
144 {
145 wint_t *new_wcs;
146
147 /* Avoid overflow in realloc. */
148 const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
149 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_buf_len, 0))
150 return REG_ESPACE;
151
152 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
153 if (BE (new_wcs == NULL, 0))
154 return REG_ESPACE;
155 pstr->wcs = new_wcs;
156 if (pstr->offsets != NULL)
157 {
158 Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
159 if (BE (new_offsets == NULL, 0))
160 return REG_ESPACE;
161 pstr->offsets = new_offsets;
162 }
163 }
164#endif /* RE_ENABLE_I18N */
165 if (pstr->mbs_allocated)
166 {
167 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
168 new_buf_len);
169 if (BE (new_mbs == NULL, 0))
170 return REG_ESPACE;
171 pstr->mbs = new_mbs;
172 }
173 pstr->bufs_len = new_buf_len;
174 return REG_NOERROR;
175}
176
177
178static void
179re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
180 RE_TRANSLATE_TYPE trans, bool icase,
181 const re_dfa_t *dfa)
182{
183 pstr->raw_mbs = (const unsigned char *) str;
184 pstr->len = len;
185 pstr->raw_len = len;
186 pstr->trans = trans;
187 pstr->icase = icase;
188 pstr->mbs_allocated = (trans != NULL || icase);
189 pstr->mb_cur_max = dfa->mb_cur_max;
190 pstr->is_utf8 = dfa->is_utf8;
191 pstr->map_notascii = dfa->map_notascii;
192 pstr->stop = pstr->len;
193 pstr->raw_stop = pstr->stop;
194}
195
196#ifdef RE_ENABLE_I18N
197
198/* Build wide character buffer PSTR->WCS.
 If the byte sequence of the string is:
 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
 then the wide character buffer will be:
 <wc1> , WEOF , <wc2> , WEOF , <wc3>
 WEOF is used as padding; it marks a position that is not the
 first byte of a multibyte character.

 Note that this function assumes PSTR->VALID_LEN elements are already
 built and starts from PSTR->VALID_LEN.  On return, PSTR->VALID_LEN
 and PSTR->VALID_RAW_LEN are both set to the index at which building
 stopped. */
208
209static void
210build_wcs_buffer (re_string_t *pstr)
211{
212#ifdef _LIBC
213 unsigned char buf[MB_LEN_MAX];
214 assert (MB_LEN_MAX >= pstr->mb_cur_max);
215#else
216 unsigned char buf[64];
217#endif
218 mbstate_t prev_st;
219 Idx byte_idx, end_idx, remain_len;
220 size_t mbclen;
221
222 /* Build the buffers from pstr->valid_len to either pstr->len or
223 pstr->bufs_len, whichever is smaller. */
224 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
225 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
226 {
227 wchar_t wc;
228 const char *p;
229
230 remain_len = end_idx - byte_idx;
 /* Save the conversion state so that it can be restored if the
 conversion below fails or is incomplete. */
231 prev_st = pstr->cur_state;
232 /* Apply the translation if needed: the translated bytes go both
 into PSTR->MBS and into the scratch buffer BUF that is converted. */
233 if (BE (pstr->trans != NULL, 0))
234 {
235 int i, ch;
236
237 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
238 {
239 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
240 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
241 }
242 p = (const char *) buf;
243 }
244 else
245 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
246 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 /* (size_t) -2 is treated as a plain byte only when the buffers
 already cover the whole string (bufs_len >= len). */
247 if (BE (mbclen == (size_t) -1 || mbclen == 0
248 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len), 0))
249 {
250 /* We treat these cases as a singlebyte character. */
251 mbclen = 1;
252 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
253 if (BE (pstr->trans != NULL, 0))
254 wc = pstr->trans[wc];
255 pstr->cur_state = prev_st;
256 }
257 else if (BE (mbclen == (size_t) -2, 0))
258 {
259 /* The buffer doesn't have enough space; stop building here. */
260 pstr->cur_state = prev_st;
261 break;
262 }
263
264 /* Write the wide character at the position of its first byte. */
265 pstr->wcs[byte_idx++] = wc;
266 /* Write WEOF padding for the remaining MBCLEN - 1 bytes. */
267 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
268 pstr->wcs[byte_idx++] = WEOF;
269 }
270 pstr->valid_len = byte_idx;
271 pstr->valid_raw_len = byte_idx;
272}
273
274/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
 but for REG_ICASE: each character is converted with __towupper, and
 the upper-cased bytes are stored in PSTR->MBS.  When the upper-case
 form has a different byte length than the original, PSTR->OFFSETS is
 created to map buffer positions back to raw positions.
 Return REG_NOERROR, or REG_ESPACE if PSTR->OFFSETS cannot be
 allocated. */
276
277static reg_errcode_t
278__attribute_warn_unused_result__
279build_wcs_upper_buffer (re_string_t *pstr)
280{
281 mbstate_t prev_st;
282 Idx src_idx, byte_idx, end_idx, remain_len;
283 size_t mbclen;
284#ifdef _LIBC
285 char buf[MB_LEN_MAX];
286 assert (MB_LEN_MAX >= pstr->mb_cur_max);
287#else
288 char buf[64];
289#endif
290
291 byte_idx = pstr->valid_len;
292 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
293
294 /* The following optimization assumes that ASCII characters can be
295 mapped to wide characters with a simple cast.  It is used only
 while raw and buffer positions coincide (no translation table and
 no offsets table). */
296 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
297 {
298 while (byte_idx < end_idx)
299 {
300 wchar_t wc;
301
302 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
303 && mbsinit (&pstr->cur_state))
304 {
305 /* In case of a singlebyte character. */
306 pstr->mbs[byte_idx]
307 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
308 /* The next step uses the assumption that wchar_t is encoded
309 ASCII-safe: all ASCII values can be converted like this. */
310 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
311 ++byte_idx;
312 continue;
313 }
314
315 remain_len = end_idx - byte_idx;
316 prev_st = pstr->cur_state;
317 mbclen = __mbrtowc (&wc,
318 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
319 + byte_idx), remain_len, &pstr->cur_state);
320 if (BE (mbclen < (size_t) -2, 1))
321 {
322 wchar_t wcu = __towupper (wc);
323 if (wcu != wc)
324 {
325 size_t mbcdlen;
326
327 mbcdlen = __wcrtomb (buf, wcu, &prev_st);
328 if (BE (mbclen == mbcdlen, 1))
329 memcpy (pstr->mbs + byte_idx, buf, mbclen);
330 else
331 {
 /* The upper-case form has a different length, so the
 fast path no longer applies: continue in the general
 loop below, starting at this same position. */
332 src_idx = byte_idx;
333 goto offsets_needed;
334 }
335 }
336 else
337 memcpy (pstr->mbs + byte_idx,
338 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
339 pstr->wcs[byte_idx++] = wcu;
340 /* Write paddings. */
341 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
342 pstr->wcs[byte_idx++] = WEOF;
343 }
344 else if (mbclen == (size_t) -1 || mbclen == 0
345 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
346 {
347 /* It is an invalid character, an incomplete character
348 at the end of the string, or '\0'. Just use the byte. */
349 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
350 pstr->mbs[byte_idx] = ch;
351 /* And also cast it to wide char. */
352 pstr->wcs[byte_idx++] = (wchar_t) ch;
353 if (BE (mbclen == (size_t) -1, 0))
354 pstr->cur_state = prev_st;
355 }
356 else
357 {
358 /* The buffer doesn't have enough space; stop building here. */
359 pstr->cur_state = prev_st;
360 break;
361 }
362 }
363 pstr->valid_len = byte_idx;
364 pstr->valid_raw_len = byte_idx;
365 return REG_NOERROR;
366 }
367 else
 /* General path: SRC_IDX indexes the raw text and BYTE_IDX the
 buffers; the two may differ once a case conversion has changed
 a character's byte length. */
368 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
369 {
370 wchar_t wc;
371 const char *p;
 /* Also entered by goto from the fast path above. */
372 offsets_needed:
373 remain_len = end_idx - byte_idx;
374 prev_st = pstr->cur_state;
375 if (BE (pstr->trans != NULL, 0))
376 {
377 int i, ch;
378
379 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
380 {
381 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
382 buf[i] = pstr->trans[ch];
383 }
384 p = (const char *) buf;
385 }
386 else
387 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
388 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
389 if (BE (mbclen < (size_t) -2, 1))
390 {
391 wchar_t wcu = __towupper (wc);
392 if (wcu != wc)
393 {
394 size_t mbcdlen;
395
396 mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
397 if (BE (mbclen == mbcdlen, 1))
398 memcpy (pstr->mbs + byte_idx, buf, mbclen);
399 else if (mbcdlen != (size_t) -1)
400 {
401 size_t i;
402
 /* The converted character does not fit in the buffers:
 restore the state and stop. */
403 if (byte_idx + mbcdlen > pstr->bufs_len)
404 {
405 pstr->cur_state = prev_st;
406 break;
407 }
408
 /* Create the offsets table on first use. */
409 if (pstr->offsets == NULL)
410 {
411 pstr->offsets = re_malloc (Idx, pstr->bufs_len);
412
413 if (pstr->offsets == NULL)
414 return REG_ESPACE;
415 }
 /* Everything built so far maps one-to-one. */
416 if (!pstr->offsets_needed)
417 {
418 for (i = 0; i < (size_t) byte_idx; ++i)
419 pstr->offsets[i] = i;
420 pstr->offsets_needed = 1;
421 }
422
423 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
424 pstr->wcs[byte_idx] = wcu;
425 pstr->offsets[byte_idx] = src_idx;
 /* Map each extra output byte to a byte of the source
 character, clamping to its last byte, and pad WCS. */
426 for (i = 1; i < mbcdlen; ++i)
427 {
428 pstr->offsets[byte_idx + i]
429 = src_idx + (i < mbclen ? i : mbclen - 1);
430 pstr->wcs[byte_idx + i] = WEOF;
431 }
 /* The buffer-side length (and stop position, if it lies
 past this character) changes by the length difference. */
432 pstr->len += mbcdlen - mbclen;
433 if (pstr->raw_stop > src_idx)
434 pstr->stop += mbcdlen - mbclen;
435 end_idx = (pstr->bufs_len > pstr->len)
436 ? pstr->len : pstr->bufs_len;
437 byte_idx += mbcdlen;
438 src_idx += mbclen;
439 continue;
440 }
441 else
 /* __wcrtomb failed: keep the original bytes. */
442 memcpy (pstr->mbs + byte_idx, p, mbclen);
443 }
444 else
445 memcpy (pstr->mbs + byte_idx, p, mbclen);
446
447 if (BE (pstr->offsets_needed != 0, 0))
448 {
449 size_t i;
450 for (i = 0; i < mbclen; ++i)
451 pstr->offsets[byte_idx + i] = src_idx + i;
452 }
453 src_idx += mbclen;
454
455 pstr->wcs[byte_idx++] = wcu;
456 /* Write paddings. */
457 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
458 pstr->wcs[byte_idx++] = WEOF;
459 }
460 else if (mbclen == (size_t) -1 || mbclen == 0
461 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
462 {
463 /* It is an invalid character, an incomplete character when the
 buffers already cover the whole string, or '\0'. Just use
 the byte. */
464 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
465
466 if (BE (pstr->trans != NULL, 0))
467 ch = pstr->trans [ch];
468 pstr->mbs[byte_idx] = ch;
469
470 if (BE (pstr->offsets_needed != 0, 0))
471 pstr->offsets[byte_idx] = src_idx;
472 ++src_idx;
473
474 /* And also cast it to wide char. */
475 pstr->wcs[byte_idx++] = (wchar_t) ch;
476 if (BE (mbclen == (size_t) -1, 0))
477 pstr->cur_state = prev_st;
478 }
479 else
480 {
481 /* The buffer doesn't have enough space; stop building here. */
482 pstr->cur_state = prev_st;
483 break;
484 }
485 }
486 pstr->valid_len = byte_idx;
487 pstr->valid_raw_len = src_idx;
488 return REG_NOERROR;
489}
490
491/* Skip characters until the index becomes greater than NEW_RAW_IDX.
492 Return the index. */
493
494static Idx
495re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
496{
497 mbstate_t prev_st;
498 Idx rawbuf_idx;
499 size_t mbclen;
500 wint_t wc = WEOF;
501
502 /* Skip the characters which are not necessary to check. */
503 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
504 rawbuf_idx < new_raw_idx;)
505 {
506 wchar_t wc2;
507 Idx remain_len = pstr->raw_len - rawbuf_idx;
508 prev_st = pstr->cur_state;
509 mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
510 remain_len, &pstr->cur_state);
511 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
512 {
513 /* We treat these cases as a single byte character. */
514 if (mbclen == 0 || remain_len == 0)
515 wc = L'\0';
516 else
517 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
518 mbclen = 1;
519 pstr->cur_state = prev_st;
520 }
521 else
522 wc = wc2;
523 /* Then proceed the next character. */
524 rawbuf_idx += mbclen;
525 }
526 *last_wc = wc;
527 return rawbuf_idx;
528}
529#endif /* RE_ENABLE_I18N */
530
531/* Build the buffer PSTR->MBS, and apply the translation if we need.
532 This function is used in case of REG_ICASE. */
533
534static void
535build_upper_buffer (re_string_t *pstr)
536{
537 Idx char_idx, end_idx;
538 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
539
540 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
541 {
542 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
543 if (BE (pstr->trans != NULL, 0))
544 ch = pstr->trans[ch];
545 pstr->mbs[char_idx] = toupper (ch);
546 }
547 pstr->valid_len = char_idx;
548 pstr->valid_raw_len = char_idx;
549}
550
551/* Apply TRANS to the buffer in PSTR. */
552
553static void
554re_string_translate_buffer (re_string_t *pstr)
555{
556 Idx buf_idx, end_idx;
557 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
558
559 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
560 {
561 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
562 pstr->mbs[buf_idx] = pstr->trans[ch];
563 }
564
565 pstr->valid_len = buf_idx;
566 pstr->valid_raw_len = buf_idx;
567}
568
569/* This function reconstructs the buffers of PSTR so that they start
 at raw position IDX.
 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
 convert to upper case in case of REG_ICASE, apply translation.
 PSTR->TIP_CONTEXT is updated to describe the character just before
 the new start, and PSTR->CUR_IDX is reset to 0.
 Return REG_NOERROR, or the error from build_wcs_upper_buffer. */
572
573static reg_errcode_t
574__attribute_warn_unused_result__
575re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
576{
 /* Distance from the current buffer start to the requested start. */
577 Idx offset;
578
579 if (BE (pstr->raw_mbs_idx <= idx, 0))
580 offset = idx - pstr->raw_mbs_idx;
581 else
582 {
583 /* IDX lies before the current start: reset the buffer and begin
 again from raw position 0. */
584#ifdef RE_ENABLE_I18N
585 if (pstr->mb_cur_max > 1)
586 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
587#endif /* RE_ENABLE_I18N */
588 pstr->len = pstr->raw_len;
589 pstr->stop = pstr->raw_stop;
590 pstr->valid_len = 0;
591 pstr->raw_mbs_idx = 0;
592 pstr->valid_raw_len = 0;
593 pstr->offsets_needed = 0;
594 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
595 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
596 if (!pstr->mbs_allocated)
597 pstr->mbs = (unsigned char *) pstr->raw_mbs;
598 offset = idx;
599 }
600
601 if (BE (offset != 0, 1))
602 {
603 /* Should the already checked characters be kept? */
604 if (BE (offset < pstr->valid_raw_len, 1))
605 {
606 /* Yes, move them to the front of the buffer. */
607#ifdef RE_ENABLE_I18N
608 if (BE (pstr->offsets_needed, 0))
609 {
 /* Binary search PSTR->OFFSETS for the buffer position MID
 that corresponds to raw offset OFFSET. */
610 Idx low = 0, high = pstr->valid_len, mid;
611 do
612 {
613 mid = (high + low) / 2;
614 if (pstr->offsets[mid] > offset)
615 high = mid;
616 else if (pstr->offsets[mid] < offset)
617 low = mid + 1;
618 else
619 break;
620 }
621 while (low < high);
622 if (pstr->offsets[mid] < offset)
623 ++mid;
624 pstr->tip_context = re_string_context_at (pstr, mid - 1,
625 eflags);
626 /* This can be quite complicated, so handle specially
627 only the common and easy case where the character with
628 different length representation of lower and upper
629 case is present at or after offset. */
630 if (pstr->valid_len > offset
631 && mid == offset && pstr->offsets[mid] == offset)
632 {
633 memmove (pstr->wcs, pstr->wcs + offset,
634 (pstr->valid_len - offset) * sizeof (wint_t));
635 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
636 pstr->valid_len -= offset;
637 pstr->valid_raw_len -= offset;
638 for (low = 0; low < pstr->valid_len; low++)
639 pstr->offsets[low] = pstr->offsets[low + offset] - offset;
640 }
641 else
642 {
643 /* Otherwise, just find out how long the partial multibyte
644 character at offset is and fill it with WEOF/255. */
645 pstr->len = pstr->raw_len - idx + offset;
646 pstr->stop = pstr->raw_stop - idx + offset;
647 pstr->offsets_needed = 0;
648 while (mid > 0 && pstr->offsets[mid - 1] == offset)
649 --mid;
650 while (mid < pstr->valid_len)
651 if (pstr->wcs[mid] != WEOF)
652 break;
653 else
654 ++mid;
655 if (mid == pstr->valid_len)
656 pstr->valid_len = 0;
657 else
658 {
659 pstr->valid_len = pstr->offsets[mid] - offset;
660 if (pstr->valid_len)
661 {
662 for (low = 0; low < pstr->valid_len; ++low)
663 pstr->wcs[low] = WEOF;
664 memset (pstr->mbs, 255, pstr->valid_len);
665 }
666 }
667 pstr->valid_raw_len = pstr->valid_len;
668 }
669 }
670 else
671#endif
672 {
 /* Raw and buffer positions coincide: shift the valid part
 down by OFFSET elements. */
673 pstr->tip_context = re_string_context_at (pstr, offset - 1,
674 eflags);
675#ifdef RE_ENABLE_I18N
676 if (pstr->mb_cur_max > 1)
677 memmove (pstr->wcs, pstr->wcs + offset,
678 (pstr->valid_len - offset) * sizeof (wint_t));
679#endif /* RE_ENABLE_I18N */
680 if (BE (pstr->mbs_allocated, 0))
681 memmove (pstr->mbs, pstr->mbs + offset,
682 pstr->valid_len - offset);
683 pstr->valid_len -= offset;
684 pstr->valid_raw_len -= offset;
685#if defined DEBUG && DEBUG
686 assert (pstr->valid_len > 0);
687#endif
688 }
689 }
690 else
691 {
692#ifdef RE_ENABLE_I18N
693 /* No, skip all characters until IDX. */
694 Idx prev_valid_len = pstr->valid_len;
695
696 if (BE (pstr->offsets_needed, 0))
697 {
698 pstr->len = pstr->raw_len - idx + offset;
699 pstr->stop = pstr->raw_stop - idx + offset;
700 pstr->offsets_needed = 0;
701 }
702#endif
703 pstr->valid_len = 0;
704#ifdef RE_ENABLE_I18N
705 if (pstr->mb_cur_max > 1)
706 {
707 Idx wcs_idx;
 /* WC stays WEOF until the character preceding the new start
 has been determined. */
708 wint_t wc = WEOF;
709
710 if (pstr->is_utf8)
711 {
712 const unsigned char *raw, *p, *end;
713
714 /* Special case UTF-8. Multi-byte chars start with any
715 byte other than 0x80 - 0xbf.  Scan backwards from the
 byte before the new start, at most mb_cur_max bytes. */
716 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
717 end = raw + (offset - pstr->mb_cur_max);
718 if (end < pstr->raw_mbs)
719 end = pstr->raw_mbs;
720 p = raw + offset - 1;
721#ifdef _LIBC
722 /* We know the wchar_t encoding is UCS4, so for the simple
723 case, ASCII characters, skip the conversion step. */
724 if (isascii (*p) && BE (pstr->trans == NULL, 1))
725 {
726 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
727 /* pstr->valid_len = 0; */
728 wc = (wchar_t) *p;
729 }
730 else
731#endif
732 for (; p >= end; --p)
733 if ((*p & 0xc0) != 0x80)
734 {
735 mbstate_t cur_state;
736 wchar_t wc2;
737 Idx mlen = raw + pstr->len - p;
738 unsigned char buf[6];
739 size_t mbclen;
740
741 const unsigned char *pp = p;
742 if (BE (pstr->trans != NULL, 0))
743 {
744 int i = mlen < 6 ? mlen : 6;
745 while (--i >= 0)
746 buf[i] = pstr->trans[p[i]];
747 pp = buf;
748 }
749 /* XXX Don't use mbrtowc, we know which conversion
750 to use (UTF-8 -> UCS4). */
751 memset (&cur_state, 0, sizeof (cur_state));
752 mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
753 &cur_state);
 /* Accept the character only if it is valid and reaches
 at least up to the new start; any bytes of it at or
 after the new start become the initial valid
 (padding) part. */
754 if (raw + offset - p <= mbclen
755 && mbclen < (size_t) -2)
756 {
757 memset (&pstr->cur_state, '\0',
758 sizeof (mbstate_t));
759 pstr->valid_len = mbclen - (raw + offset - p);
760 wc = wc2;
761 }
762 break;
763 }
764 }
765
 /* Not determined above: convert forward from the end of the
 valid region up to IDX instead. */
766 if (wc == WEOF)
767 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
768 if (wc == WEOF)
769 pstr->tip_context
770 = re_string_context_at (pstr, prev_valid_len - 1, eflags);
771 else
772 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
773 && IS_WIDE_WORD_CHAR (wc))
774 ? CONTEXT_WORD
775 : ((IS_WIDE_NEWLINE (wc)
776 && pstr->newline_anchor)
777 ? CONTEXT_NEWLINE : 0));
 /* Bytes at the new start that belong to a character begun
 earlier are marked with WEOF in WCS and 255 in MBS. */
778 if (BE (pstr->valid_len, 0))
779 {
780 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
781 pstr->wcs[wcs_idx] = WEOF;
782 if (pstr->mbs_allocated)
783 memset (pstr->mbs, 255, pstr->valid_len);
784 }
785 pstr->valid_raw_len = pstr->valid_len;
786 }
787 else
788#endif /* RE_ENABLE_I18N */
789 {
 /* Single-byte case: the context comes from the (translated)
 byte just before the new start. */
790 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
791 pstr->valid_raw_len = 0;
792 if (pstr->trans)
793 c = pstr->trans[c];
794 pstr->tip_context = (bitset_contain (pstr->word_char, c)
795 ? CONTEXT_WORD
796 : ((IS_NEWLINE (c) && pstr->newline_anchor)
797 ? CONTEXT_NEWLINE : 0));
798 }
799 }
 /* When MBS aliases the raw string, simply advance the pointer. */
800 if (!BE (pstr->mbs_allocated, 0))
801 pstr->mbs += offset;
802 }
803 pstr->raw_mbs_idx = idx;
804 pstr->len -= offset;
805 pstr->stop -= offset;
806
807 /* Then build the buffers. */
808#ifdef RE_ENABLE_I18N
809 if (pstr->mb_cur_max > 1)
810 {
811 if (pstr->icase)
812 {
813 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
814 if (BE (ret != REG_NOERROR, 0))
815 return ret;
816 }
817 else
818 build_wcs_buffer (pstr);
819 }
820 else
821#endif /* RE_ENABLE_I18N */
822 if (BE (pstr->mbs_allocated, 0))
823 {
824 if (pstr->icase)
825 build_upper_buffer (pstr);
826 else if (pstr->trans != NULL)
827 re_string_translate_buffer (pstr);
828 }
829 else
 /* MBS aliases the raw string, so all of it is valid already. */
830 pstr->valid_len = pstr->len;
831
832 pstr->cur_idx = 0;
833 return REG_NOERROR;
834}
835
836static unsigned char
837__attribute__ ((pure))
838re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
839{
840 int ch;
841 Idx off;
842
843 /* Handle the common (easiest) cases first. */
844 if (BE (!pstr->mbs_allocated, 1))
845 return re_string_peek_byte (pstr, idx);
846
847#ifdef RE_ENABLE_I18N
848 if (pstr->mb_cur_max > 1
849 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
850 return re_string_peek_byte (pstr, idx);
851#endif
852
853 off = pstr->cur_idx + idx;
854#ifdef RE_ENABLE_I18N
855 if (pstr->offsets_needed)
856 off = pstr->offsets[off];
857#endif
858
859 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
860
861#ifdef RE_ENABLE_I18N
862 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
863 this function returns CAPITAL LETTER I instead of first byte of
864 DOTLESS SMALL LETTER I. The latter would confuse the parser,
865 since peek_byte_case doesn't advance cur_idx in any way. */
866 if (pstr->offsets_needed && !isascii (ch))
867 return re_string_peek_byte (pstr, idx);
868#endif
869
870 return ch;
871}
872
873static unsigned char
874re_string_fetch_byte_case (re_string_t *pstr)
875{
876 if (BE (!pstr->mbs_allocated, 1))
877 return re_string_fetch_byte (pstr);
878
879#ifdef RE_ENABLE_I18N
880 if (pstr->offsets_needed)
881 {
882 Idx off;
883 int ch;
884
885 /* For tr_TR.UTF-8 [[:islower:]] there is
886 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
887 in that case the whole multi-byte character and return
888 the original letter. On the other side, with
889 [[: DOTLESS SMALL LETTER I return [[:I, as doing
890 anything else would complicate things too much. */
891
892 if (!re_string_first_byte (pstr, pstr->cur_idx))
893 return re_string_fetch_byte (pstr);
894
895 off = pstr->offsets[pstr->cur_idx];
896 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
897
898 if (! isascii (ch))
899 return re_string_fetch_byte (pstr);
900
901 re_string_skip_bytes (pstr,
902 re_string_char_size_at (pstr, pstr->cur_idx));
903 return ch;
904 }
905#endif
906
907 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
908}
909
910static void
911re_string_destruct (re_string_t *pstr)
912{
913#ifdef RE_ENABLE_I18N
914 re_free (pstr->wcs);
915 re_free (pstr->offsets);
916#endif /* RE_ENABLE_I18N */
917 if (pstr->mbs_allocated)
918 re_free (pstr->mbs);
919}
920
921/* Return the context at IDX in INPUT. */
922
923static unsigned int
924re_string_context_at (const re_string_t *input, Idx idx, int eflags)
925{
926 int c;
927 if (BE (idx < 0, 0))
928 /* In this case, we use the value stored in input->tip_context,
929 since we can't know the character in input->mbs[-1] here. */
930 return input->tip_context;
931 if (BE (idx == input->len, 0))
932 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
933 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
934#ifdef RE_ENABLE_I18N
935 if (input->mb_cur_max > 1)
936 {
937 wint_t wc;
938 Idx wc_idx = idx;
939 while(input->wcs[wc_idx] == WEOF)
940 {
941#if defined DEBUG && DEBUG
942 /* It must not happen. */
943 assert (wc_idx >= 0);
944#endif
945 --wc_idx;
946 if (wc_idx < 0)
947 return input->tip_context;
948 }
949 wc = input->wcs[wc_idx];
950 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
951 return CONTEXT_WORD;
952 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
953 ? CONTEXT_NEWLINE : 0);
954 }
955 else
956#endif
957 {
958 c = re_string_byte_at (input, idx);
959 if (bitset_contain (input->word_char, c))
960 return CONTEXT_WORD;
961 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
962 }
963}
964
965/* Functions for set operation. */
966
967static reg_errcode_t
968__attribute_warn_unused_result__
969re_node_set_alloc (re_node_set *set, Idx size)
970{
971 set->alloc = size;
972 set->nelem = 0;
973 set->elems = re_malloc (Idx, size);
974 if (BE (set->elems == NULL, 0) && (MALLOC_0_IS_NONNULL || size != 0))
975 return REG_ESPACE;
976 return REG_NOERROR;
977}
978
979static reg_errcode_t
980__attribute_warn_unused_result__
981re_node_set_init_1 (re_node_set *set, Idx elem)
982{
983 set->alloc = 1;
984 set->nelem = 1;
985 set->elems = re_malloc (Idx, 1);
986 if (BE (set->elems == NULL, 0))
987 {
988 set->alloc = set->nelem = 0;
989 return REG_ESPACE;
990 }
991 set->elems[0] = elem;
992 return REG_NOERROR;
993}
994
995static reg_errcode_t
996__attribute_warn_unused_result__
997re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
998{
999 set->alloc = 2;
1000 set->elems = re_malloc (Idx, 2);
1001 if (BE (set->elems == NULL, 0))
1002 return REG_ESPACE;
1003 if (elem1 == elem2)
1004 {
1005 set->nelem = 1;
1006 set->elems[0] = elem1;
1007 }
1008 else
1009 {
1010 set->nelem = 2;
1011 if (elem1 < elem2)
1012 {
1013 set->elems[0] = elem1;
1014 set->elems[1] = elem2;
1015 }
1016 else
1017 {
1018 set->elems[0] = elem2;
1019 set->elems[1] = elem1;
1020 }
1021 }
1022 return REG_NOERROR;
1023}
1024
1025static reg_errcode_t
1026__attribute_warn_unused_result__
1027re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1028{
1029 dest->nelem = src->nelem;
1030 if (src->nelem > 0)
1031 {
1032 dest->alloc = dest->nelem;
1033 dest->elems = re_malloc (Idx, dest->alloc);
1034 if (BE (dest->elems == NULL, 0))
1035 {
1036 dest->alloc = dest->nelem = 0;
1037 return REG_ESPACE;
1038 }
1039 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1040 }
1041 else
1042 re_node_set_init_empty (dest);
1043 return REG_NOERROR;
1044}
1045
1046/* Calculate the intersection of the sets SRC1 and SRC2, and merge it
 into DEST.  The return value is REG_NOERROR on success, or REG_ESPACE
 if DEST cannot be enlarged.
 Note: We assume dest->elems is NULL, when dest->alloc is 0.
 The code walks all three element arrays from the top downwards
 using ordered comparisons. */
1049
1050static reg_errcode_t
1051__attribute_warn_unused_result__
1052re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1053 const re_node_set *src2)
1054{
1055 Idx i1, i2, is, id, delta, sbase;
 /* An empty operand means an empty intersection: nothing to add. */
1056 if (src1->nelem == 0 || src2->nelem == 0)
1057 return REG_NOERROR;
1058
1059 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1060 conservative estimate. */
1061 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1062 {
1063 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1064 Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1065 if (BE (new_elems == NULL, 0))
1066 return REG_ESPACE;
1067 dest->elems = new_elems;
1068 dest->alloc = new_alloc;
1069 }
1070
1071 /* Find the items in the intersection of SRC1 and SRC2, and copy
1072 into the top of DEST those that are not already in DEST itself.
 SBASE moves downwards as items are stored, so the staged items
 occupy dest->elems[sbase] up to the initial SBASE - 1. */
1073 sbase = dest->nelem + src1->nelem + src2->nelem;
1074 i1 = src1->nelem - 1;
1075 i2 = src2->nelem - 1;
1076 id = dest->nelem - 1;
1077 for (;;)
1078 {
1079 if (src1->elems[i1] == src2->elems[i2])
1080 {
1081 /* Try to find the item in DEST. Maybe we could binary search? */
1082 while (id >= 0 && dest->elems[id] > src1->elems[i1])
1083 --id;
1084
1085 if (id < 0 || dest->elems[id] != src1->elems[i1])
1086 dest->elems[--sbase] = src1->elems[i1];
1087
1088 if (--i1 < 0 || --i2 < 0)
1089 break;
1090 }
1091
1092 /* Lower the highest of the two items. */
1093 else if (src1->elems[i1] < src2->elems[i2])
1094 {
1095 if (--i2 < 0)
1096 break;
1097 }
1098 else
1099 {
1100 if (--i1 < 0)
1101 break;
1102 }
1103 }
1104
 /* ID: last original DEST element; IS: topmost staged item;
 DELTA: number of staged items still to be placed. */
1105 id = dest->nelem - 1;
1106 is = dest->nelem + src1->nelem + src2->nelem - 1;
1107 delta = is - sbase + 1;
1108
1109 /* Now copy. When DELTA becomes zero, the remaining
1110 DEST elements are already in place; this is more or
1111 less the same loop that is in re_node_set_merge. */
1112 dest->nelem += delta;
1113 if (delta > 0 && id >= 0)
1114 for (;;)
1115 {
1116 if (dest->elems[is] > dest->elems[id])
1117 {
1118 /* Copy from the top. */
1119 dest->elems[id + delta--] = dest->elems[is--];
1120 if (delta == 0)
1121 break;
1122 }
1123 else
1124 {
1125 /* Slide from the bottom. */
1126 dest->elems[id + delta] = dest->elems[id];
1127 if (--id < 0)
1128 break;
1129 }
1130 }
1131
1132 /* Copy the DELTA staged items that remain (none if DELTA is zero)
 to the bottom of DEST. */
1133 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1134
1135 return REG_NOERROR;
1136}
1137
1138/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1139 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1140
1141static reg_errcode_t
1142__attribute_warn_unused_result__
1143re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1144 const re_node_set *src2)
1145{
1146 Idx i1, i2, id;
1147 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1148 {
1149 dest->alloc = src1->nelem + src2->nelem;
1150 dest->elems = re_malloc (Idx, dest->alloc);
1151 if (BE (dest->elems == NULL, 0))
1152 return REG_ESPACE;
1153 }
1154 else
1155 {
1156 if (src1 != NULL && src1->nelem > 0)
1157 return re_node_set_init_copy (dest, src1);
1158 else if (src2 != NULL && src2->nelem > 0)
1159 return re_node_set_init_copy (dest, src2);
1160 else
1161 re_node_set_init_empty (dest);
1162 return REG_NOERROR;
1163 }
1164 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1165 {
1166 if (src1->elems[i1] > src2->elems[i2])
1167 {
1168 dest->elems[id++] = src2->elems[i2++];
1169 continue;
1170 }
1171 if (src1->elems[i1] == src2->elems[i2])
1172 ++i2;
1173 dest->elems[id++] = src1->elems[i1++];
1174 }
1175 if (i1 < src1->nelem)
1176 {
1177 memcpy (dest->elems + id, src1->elems + i1,
1178 (src1->nelem - i1) * sizeof (Idx));
1179 id += src1->nelem - i1;
1180 }
1181 else if (i2 < src2->nelem)
1182 {
1183 memcpy (dest->elems + id, src2->elems + i2,
1184 (src2->nelem - i2) * sizeof (Idx));
1185 id += src2->nelem - i2;
1186 }
1187 dest->nelem = id;
1188 return REG_NOERROR;
1189}
1190
1191/* Calculate the union set of the sets DEST and SRC. And store it to
1192 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
/* The merge is done in place inside DEST's buffer: SRC items missing
   from DEST are first staged in a scratch area at the top of the buffer
   (filled downwards from index dest->nelem + 2 * src->nelem), then
   interleaved with the existing DEST items from the high end down.
   The comparisons below walk both sets from their last element
   towards the first and rely on ascending order.
   Returns REG_ESPACE when the buffer cannot be grown; in that case
   DEST is left as it was.  */
1193
1194static reg_errcode_t
1195__attribute_warn_unused_result__
1196re_node_set_merge (re_node_set *dest, const re_node_set *src)
1197{
1198 Idx is, id, sbase, delta;
1199 if (src == NULL || src->nelem == 0)
1200 return REG_NOERROR;
/* Make sure the buffer has room for DEST's items plus the scratch
   area addressed below (up to index dest->nelem + 2 * src->nelem - 1).  */
1201 if (dest->alloc < 2 * src->nelem + dest->nelem)
1202 {
1203 Idx new_alloc = 2 * (src->nelem + dest->alloc);
1204 Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1205 if (BE (new_buffer == NULL, 0))
1206 return REG_ESPACE;
1207 dest->elems = new_buffer;
1208 dest->alloc = new_alloc;
1209 }
1210
/* Fast path: DEST is empty, so the union is simply a copy of SRC.  */
1211 if (BE (dest->nelem == 0, 0))
1212 {
1213 dest->nelem = src->nelem;
1214 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1215 return REG_NOERROR;
1216 }
1217
1218 /* Copy into the top of DEST the items of SRC that are not
1219 found in DEST. Maybe we could binary search in DEST? */
/* Here IS indexes SRC and ID indexes DEST, both moving downwards;
   SBASE is the lowest used slot of the scratch area.  */
1220 for (sbase = dest->nelem + 2 * src->nelem,
1221 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1222 {
1223 if (dest->elems[id] == src->elems[is])
1224 is--, id--;
1225 else if (dest->elems[id] < src->elems[is])
1226 dest->elems[--sbase] = src->elems[is--];
1227 else /* if (dest->elems[id] > src->elems[is]) */
1228 --id;
1229 }
1230
1231 if (is >= 0)
1232 {
1233 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1234 sbase -= is + 1;
1235 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1236 }
1237
/* From here on IS is reused as an index into the scratch area (its
   topmost slot) and DELTA is the number of staged items still to be
   placed.  DELTA == 0 means SRC contributed nothing new.  */
1238 id = dest->nelem - 1;
1239 is = dest->nelem + 2 * src->nelem - 1;
1240 delta = is - sbase + 1;
1241 if (delta == 0)
1242 return REG_NOERROR;
1243
1244 /* Now copy. When DELTA becomes zero, the remaining
1245 DEST elements are already in place. */
1246 dest->nelem += delta;
1247 for (;;)
1248 {
1249 if (dest->elems[is] > dest->elems[id])
1250 {
1251 /* Copy from the top. */
1252 dest->elems[id + delta--] = dest->elems[is--];
1253 if (delta == 0)
1254 break;
1255 }
1256 else
1257 {
1258 /* Slide from the bottom. */
1259 dest->elems[id + delta] = dest->elems[id];
1260 if (--id < 0)
1261 {
1262 /* Copy remaining SRC elements. */
1263 memcpy (dest->elems, dest->elems + sbase,
1264 delta * sizeof (Idx));
1265 break;
1266 }
1267 }
1268 }
1269
1270 return REG_NOERROR;
1271}
1272
1273/* Insert the new element ELEM to the re_node_set* SET.
1274 SET should not already have ELEM.
1275 Return true if successful. */
1276
1277static bool
1278__attribute_warn_unused_result__
1279re_node_set_insert (re_node_set *set, Idx elem)
1280{
1281 Idx idx;
1282 /* In case the set is empty. */
1283 if (set->alloc == 0)
1284 return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1285
1286 if (BE (set->nelem, 0) == 0)
1287 {
1288 /* We already guaranteed above that set->alloc != 0. */
1289 set->elems[0] = elem;
1290 ++set->nelem;
1291 return true;
1292 }
1293
1294 /* Realloc if we need. */
1295 if (set->alloc == set->nelem)
1296 {
1297 Idx *new_elems;
1298 set->alloc = set->alloc * 2;
1299 new_elems = re_realloc (set->elems, Idx, set->alloc);
1300 if (BE (new_elems == NULL, 0))
1301 return false;
1302 set->elems = new_elems;
1303 }
1304
1305 /* Move the elements which follows the new element. Test the
1306 first element separately to skip a check in the inner loop. */
1307 if (elem < set->elems[0])
1308 {
1309 idx = 0;
1310 for (idx = set->nelem; idx > 0; idx--)
1311 set->elems[idx] = set->elems[idx - 1];
1312 }
1313 else
1314 {
1315 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1316 set->elems[idx] = set->elems[idx - 1];
1317 }
1318
1319 /* Insert the new element. */
1320 set->elems[idx] = elem;
1321 ++set->nelem;
1322 return true;
1323}
1324
1325/* Insert the new element ELEM to the re_node_set* SET.
1326 SET should not already have any element greater than or equal to ELEM.
1327 Return true if successful. */
1328
1329static bool
1330__attribute_warn_unused_result__
1331re_node_set_insert_last (re_node_set *set, Idx elem)
1332{
1333 /* Realloc if we need. */
1334 if (set->alloc == set->nelem)
1335 {
1336 Idx *new_elems;
1337 set->alloc = (set->alloc + 1) * 2;
1338 new_elems = re_realloc (set->elems, Idx, set->alloc);
1339 if (BE (new_elems == NULL, 0))
1340 return false;
1341 set->elems = new_elems;
1342 }
1343
1344 /* Insert the new element. */
1345 set->elems[set->nelem++] = elem;
1346 return true;
1347}
1348
1349/* Compare two node sets SET1 and SET2.
1350 Return true if SET1 and SET2 are equivalent. */
1351
1352static bool
1353__attribute__ ((pure))
1354re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1355{
1356 Idx i;
1357 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1358 return false;
1359 for (i = set1->nelem ; --i >= 0 ; )
1360 if (set1->elems[i] != set2->elems[i])
1361 return false;
1362 return true;
1363}
1364
1365/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1366
1367static Idx
1368__attribute__ ((pure))
1369re_node_set_contains (const re_node_set *set, Idx elem)
1370{
1371 __re_size_t idx, right, mid;
1372 if (set->nelem <= 0)
1373 return 0;
1374
1375 /* Binary search the element. */
1376 idx = 0;
1377 right = set->nelem - 1;
1378 while (idx < right)
1379 {
1380 mid = (idx + right) / 2;
1381 if (set->elems[mid] < elem)
1382 idx = mid + 1;
1383 else
1384 right = mid;
1385 }
1386 return set->elems[idx] == elem ? idx + 1 : 0;
1387}
1388
1389static void
1390re_node_set_remove_at (re_node_set *set, Idx idx)
1391{
1392 if (idx < 0 || idx >= set->nelem)
1393 return;
1394 --set->nelem;
1395 for (; idx < set->nelem; idx++)
1396 set->elems[idx] = set->elems[idx + 1];
1397}
1398
1399
1400/* Add the token TOKEN to dfa->nodes, and return the index of the token.
1401 Or return -1 if an error occurred. */
1402
1403static Idx
1404re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1405{
1406 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1407 {
1408 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1409 Idx *new_nexts, *new_indices;
1410 re_node_set *new_edests, *new_eclosures;
1411 re_token_t *new_nodes;
1412
1413 /* Avoid overflows in realloc. */
1414 const size_t max_object_size = MAX (sizeof (re_token_t),
1415 MAX (sizeof (re_node_set),
1416 sizeof (Idx)));
1417 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_nodes_alloc, 0))
1418 return -1;
1419
1420 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1421 if (BE (new_nodes == NULL, 0))
1422 return -1;
1423 dfa->nodes = new_nodes;
1424 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1425 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1426 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1427 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1428 if (BE (new_nexts == NULL || new_indices == NULL
1429 || new_edests == NULL || new_eclosures == NULL, 0))
1430 {
1431 re_free (new_nexts);
1432 re_free (new_indices);
1433 re_free (new_edests);
1434 re_free (new_eclosures);
1435 return -1;
1436 }
1437 dfa->nexts = new_nexts;
1438 dfa->org_indices = new_indices;
1439 dfa->edests = new_edests;
1440 dfa->eclosures = new_eclosures;
1441 dfa->nodes_alloc = new_nodes_alloc;
1442 }
1443 dfa->nodes[dfa->nodes_len] = token;
1444 dfa->nodes[dfa->nodes_len].constraint = 0;
1445#ifdef RE_ENABLE_I18N
1446 dfa->nodes[dfa->nodes_len].accept_mb =
1447 ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
1448 || token.type == COMPLEX_BRACKET);
1449#endif
1450 dfa->nexts[dfa->nodes_len] = -1;
1451 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1452 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1453 return dfa->nodes_len++;
1454}
1455
1456static re_hashval_t
1457calc_state_hash (const re_node_set *nodes, unsigned int context)
1458{
1459 re_hashval_t hash = nodes->nelem + context;
1460 Idx i;
1461 for (i = 0 ; i < nodes->nelem ; i++)
1462 hash += nodes->elems[i];
1463 return hash;
1464}
1465
1466/* Search for the state whose node_set is equivalent to NODES.
1467 Return the pointer to the state, if we found it in the DFA.
1468 Otherwise create the new one and return it. In case of an error
1469 return NULL and set the error code in ERR.
1470 Note: - We assume NULL as the invalid state, then it is possible that
1471 return value is NULL and ERR is REG_NOERROR.
1472 - We never return non-NULL value in case of any errors, it is for
1473 optimization. */
1474
1475static re_dfastate_t *
1476__attribute_warn_unused_result__
1477re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1478 const re_node_set *nodes)
1479{
1480 re_hashval_t hash;
1481 re_dfastate_t *new_state;
1482 struct re_state_table_entry *spot;
1483 Idx i;
1484#if defined GCC_LINT || defined lint
1485 /* Suppress bogus uninitialized-variable warnings. */
1486 *err = REG_NOERROR;
1487#endif
1488 if (BE (nodes->nelem == 0, 0))
1489 {
1490 *err = REG_NOERROR;
1491 return NULL;
1492 }
1493 hash = calc_state_hash (nodes, 0);
1494 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1495
1496 for (i = 0 ; i < spot->num ; i++)
1497 {
1498 re_dfastate_t *state = spot->array[i];
1499 if (hash != state->hash)
1500 continue;
1501 if (re_node_set_compare (&state->nodes, nodes))
1502 return state;
1503 }
1504
1505 /* There are no appropriate state in the dfa, create the new one. */
1506 new_state = create_ci_newstate (dfa, nodes, hash);
1507 if (BE (new_state == NULL, 0))
1508 *err = REG_ESPACE;
1509
1510 return new_state;
1511}
1512
1513/* Search for the state whose node_set is equivalent to NODES and
1514 whose context is equivalent to CONTEXT.
1515 Return the pointer to the state, if we found it in the DFA.
1516 Otherwise create the new one and return it. In case of an error
1517 return NULL and set the error code in ERR.
1518 Note: - We assume NULL as the invalid state, then it is possible that
1519 return value is NULL and ERR is REG_NOERROR.
1520 - We never return non-NULL value in case of any errors, it is for
1521 optimization. */
1522
1523static re_dfastate_t *
1524__attribute_warn_unused_result__
1525re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1526 const re_node_set *nodes, unsigned int context)
1527{
1528 re_hashval_t hash;
1529 re_dfastate_t *new_state;
1530 struct re_state_table_entry *spot;
1531 Idx i;
1532#if defined GCC_LINT || defined lint
1533 /* Suppress bogus uninitialized-variable warnings. */
1534 *err = REG_NOERROR;
1535#endif
1536 if (nodes->nelem == 0)
1537 {
1538 *err = REG_NOERROR;
1539 return NULL;
1540 }
1541 hash = calc_state_hash (nodes, context);
1542 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1543
1544 for (i = 0 ; i < spot->num ; i++)
1545 {
1546 re_dfastate_t *state = spot->array[i];
1547 if (state->hash == hash
1548 && state->context == context
1549 && re_node_set_compare (state->entrance_nodes, nodes))
1550 return state;
1551 }
1552 /* There are no appropriate state in 'dfa', create the new one. */
1553 new_state = create_cd_newstate (dfa, nodes, context, hash);
1554 if (BE (new_state == NULL, 0))
1555 *err = REG_ESPACE;
1556
1557 return new_state;
1558}
1559
1560/* Finish initialization of the new state NEWSTATE, and using its hash value
1561 HASH put in the appropriate bucket of DFA's state table. Return value
1562 indicates the error code if failed. */
1563
1564static reg_errcode_t
1565__attribute_warn_unused_result__
1566register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1567 re_hashval_t hash)
1568{
1569 struct re_state_table_entry *spot;
1570 reg_errcode_t err;
1571 Idx i;
1572
1573 newstate->hash = hash;
1574 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1575 if (BE (err != REG_NOERROR, 0))
1576 return REG_ESPACE;
1577 for (i = 0; i < newstate->nodes.nelem; i++)
1578 {
1579 Idx elem = newstate->nodes.elems[i];
1580 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1581 if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1582 return REG_ESPACE;
1583 }
1584
1585 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1586 if (BE (spot->alloc <= spot->num, 0))
1587 {
1588 Idx new_alloc = 2 * spot->num + 2;
1589 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1590 new_alloc);
1591 if (BE (new_array == NULL, 0))
1592 return REG_ESPACE;
1593 spot->array = new_array;
1594 spot->alloc = new_alloc;
1595 }
1596 spot->array[spot->num++] = newstate;
1597 return REG_NOERROR;
1598}
1599
1600static void
1601free_state (re_dfastate_t *state)
1602{
1603 re_node_set_free (&state->non_eps_nodes);
1604 re_node_set_free (&state->inveclosure);
1605 if (state->entrance_nodes != &state->nodes)
1606 {
1607 re_node_set_free (state->entrance_nodes);
1608 re_free (state->entrance_nodes);
1609 }
1610 re_node_set_free (&state->nodes);
1611 re_free (state->word_trtable);
1612 re_free (state->trtable);
1613 re_free (state);
1614}
1615
1616/* Create the new state which is independent of contexts.
1617 Return the new state if succeeded, otherwise return NULL. */
1618
1619static re_dfastate_t *
1620__attribute_warn_unused_result__
1621create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1622 re_hashval_t hash)
1623{
1624 Idx i;
1625 reg_errcode_t err;
1626 re_dfastate_t *newstate;
1627
1628 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1629 if (BE (newstate == NULL, 0))
1630 return NULL;
1631 err = re_node_set_init_copy (&newstate->nodes, nodes);
1632 if (BE (err != REG_NOERROR, 0))
1633 {
1634 re_free (newstate);
1635 return NULL;
1636 }
1637
1638 newstate->entrance_nodes = &newstate->nodes;
1639 for (i = 0 ; i < nodes->nelem ; i++)
1640 {
1641 re_token_t *node = dfa->nodes + nodes->elems[i];
1642 re_token_type_t type = node->type;
1643 if (type == CHARACTER && !node->constraint)
1644 continue;
1645#ifdef RE_ENABLE_I18N
1646 newstate->accept_mb |= node->accept_mb;
1647#endif /* RE_ENABLE_I18N */
1648
1649 /* If the state has the halt node, the state is a halt state. */
1650 if (type == END_OF_RE)
1651 newstate->halt = 1;
1652 else if (type == OP_BACK_REF)
1653 newstate->has_backref = 1;
1654 else if (type == ANCHOR || node->constraint)
1655 newstate->has_constraint = 1;
1656 }
1657 err = register_state (dfa, newstate, hash);
1658 if (BE (err != REG_NOERROR, 0))
1659 {
1660 free_state (newstate);
1661 newstate = NULL;
1662 }
1663 return newstate;
1664}
1665
1666/* Create the new state which is depend on the context CONTEXT.
1667 Return the new state if succeeded, otherwise return NULL. */
1668
1669static re_dfastate_t *
1670__attribute_warn_unused_result__
1671create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1672 unsigned int context, re_hashval_t hash)
1673{
1674 Idx i, nctx_nodes = 0;
1675 reg_errcode_t err;
1676 re_dfastate_t *newstate;
1677
1678 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1679 if (BE (newstate == NULL, 0))
1680 return NULL;
1681 err = re_node_set_init_copy (&newstate->nodes, nodes);
1682 if (BE (err != REG_NOERROR, 0))
1683 {
1684 re_free (newstate);
1685 return NULL;
1686 }
1687
1688 newstate->context = context;
1689 newstate->entrance_nodes = &newstate->nodes;
1690
1691 for (i = 0 ; i < nodes->nelem ; i++)
1692 {
1693 re_token_t *node = dfa->nodes + nodes->elems[i];
1694 re_token_type_t type = node->type;
1695 unsigned int constraint = node->constraint;
1696
1697 if (type == CHARACTER && !constraint)
1698 continue;
1699#ifdef RE_ENABLE_I18N
1700 newstate->accept_mb |= node->accept_mb;
1701#endif /* RE_ENABLE_I18N */
1702
1703 /* If the state has the halt node, the state is a halt state. */
1704 if (type == END_OF_RE)
1705 newstate->halt = 1;
1706 else if (type == OP_BACK_REF)
1707 newstate->has_backref = 1;
1708
1709 if (constraint)
1710 {
1711 if (newstate->entrance_nodes == &newstate->nodes)
1712 {
1713 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1714 if (BE (newstate->entrance_nodes == NULL, 0))
1715 {
1716 free_state (newstate);
1717 return NULL;
1718 }
1719 if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1720 != REG_NOERROR)
1721 return NULL;
1722 nctx_nodes = 0;
1723 newstate->has_constraint = 1;
1724 }
1725
1726 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1727 {
1728 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1729 ++nctx_nodes;
1730 }
1731 }
1732 }
1733 err = register_state (dfa, newstate, hash);
1734 if (BE (err != REG_NOERROR, 0))
1735 {
1736 free_state (newstate);
1737 newstate = NULL;
1738 }
1739 return newstate;
1740}
diff --git a/lib/regex_internal.h b/lib/regex_internal.h
new file mode 100644
index 00000000000..7bbe802bc53
--- /dev/null
+++ b/lib/regex_internal.h
@@ -0,0 +1,911 @@
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002-2018 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public
8 License as published by the Free Software Foundation; either
9 version 3 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
15
16 You should have received a copy of the GNU General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
19
20#ifndef _REGEX_INTERNAL_H
21#define _REGEX_INTERNAL_H 1
22
23#include <assert.h>
24#include <ctype.h>
25#include <stdio.h>
26#include <stdlib.h>
27#include <string.h>
28
29#include <langinfo.h>
30#include <locale.h>
31#include <wchar.h>
32#include <wctype.h>
33#include <stdbool.h>
34#include <stdint.h>
35
36/* Properties of integers. Although Gnulib has intprops.h, glibc does
37 without for now. */
38#ifndef _LIBC
39# include "intprops.h"
40#else
41/* True if the real type T is signed. */
42# define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
43
44/* True if adding the nonnegative Idx values A and B would overflow.
45 If false, set *R to A + B. A, B, and R may be evaluated more than
46 once, or zero times. Although this is not a full implementation of
47 Gnulib INT_ADD_WRAPV, it is good enough for glibc regex code.
48 FIXME: This implementation is a fragile stopgap, and this file would
49 be simpler and more robust if intprops.h were migrated into glibc. */
50# define INT_ADD_WRAPV(a, b, r) \
51 (IDX_MAX - (a) < (b) ? true : (*(r) = (a) + (b), false))
52#endif
53
54#ifdef _LIBC
55# include <libc-lock.h>
56# define lock_define(name) __libc_lock_define (, name)
57# define lock_init(lock) (__libc_lock_init (lock), 0)
58# define lock_fini(lock) ((void) 0)
59# define lock_lock(lock) __libc_lock_lock (lock)
60# define lock_unlock(lock) __libc_lock_unlock (lock)
61#elif defined GNULIB_LOCK && !defined USE_UNLOCKED_IO
62# include "glthread/lock.h"
63 /* Use gl_lock_define if empty macro arguments are known to work.
64 Otherwise, fall back on less-portable substitutes. */
65# if ((defined __GNUC__ && !defined __STRICT_ANSI__) \
66 || (defined __STDC_VERSION__ && 199901L <= __STDC_VERSION__))
67# define lock_define(name) gl_lock_define (, name)
68# elif USE_POSIX_THREADS
69# define lock_define(name) pthread_mutex_t name;
70# elif USE_PTH_THREADS
71# define lock_define(name) pth_mutex_t name;
72# elif USE_SOLARIS_THREADS
73# define lock_define(name) mutex_t name;
74# elif USE_WINDOWS_THREADS
75# define lock_define(name) gl_lock_t name;
76# else
77# define lock_define(name)
78# endif
79# define lock_init(lock) glthread_lock_init (&(lock))
80# define lock_fini(lock) glthread_lock_destroy (&(lock))
81# define lock_lock(lock) glthread_lock_lock (&(lock))
82# define lock_unlock(lock) glthread_lock_unlock (&(lock))
83#elif defined GNULIB_PTHREAD && !defined USE_UNLOCKED_IO
84# include <pthread.h>
85# define lock_define(name) pthread_mutex_t name;
86# define lock_init(lock) pthread_mutex_init (&(lock), 0)
87# define lock_fini(lock) pthread_mutex_destroy (&(lock))
88# define lock_lock(lock) pthread_mutex_lock (&(lock))
89# define lock_unlock(lock) pthread_mutex_unlock (&(lock))
90#else
91# define lock_define(name)
92# define lock_init(lock) 0
93# define lock_fini(lock) ((void) 0)
94 /* The 'dfa' avoids an "unused variable 'dfa'" warning from GCC. */
95# define lock_lock(lock) ((void) dfa)
96# define lock_unlock(lock) ((void) 0)
97#endif
98
99/* In case that the system doesn't have isblank(). */
100#if !defined _LIBC && ! (defined isblank || (HAVE_ISBLANK && HAVE_DECL_ISBLANK))
101# define isblank(ch) ((ch) == ' ' || (ch) == '\t')
102#endif
103
104#ifdef _LIBC
105# ifndef _RE_DEFINE_LOCALE_FUNCTIONS
106# define _RE_DEFINE_LOCALE_FUNCTIONS 1
107# include <locale/localeinfo.h>
108# include <locale/coll-lookup.h>
109# endif
110#endif
111
112/* This is for other GNU distributions with internationalized messages. */
113#if (HAVE_LIBINTL_H && ENABLE_NLS) || defined _LIBC
114# include <libintl.h>
115# ifdef _LIBC
116# undef gettext
117# define gettext(msgid) \
118 __dcgettext (_libc_intl_domainname, msgid, LC_MESSAGES)
119# endif
120#else
121# undef gettext
122# define gettext(msgid) (msgid)
123#endif
124
125#ifndef gettext_noop
126/* This define is so xgettext can find the internationalizable
127 strings. */
128# define gettext_noop(String) String
129#endif
130
131#if (defined MB_CUR_MAX && HAVE_WCTYPE_H && HAVE_ISWCTYPE) || _LIBC
132# define RE_ENABLE_I18N
133#endif
134
135#define BE(expr, val) __builtin_expect (expr, val)
136
137/* Number of ASCII characters. */
138#define ASCII_CHARS 0x80
139
140/* Number of single byte characters. */
141#define SBC_MAX (UCHAR_MAX + 1)
142
143#define COLL_ELEM_LEN_MAX 8
144
145/* The character which represents newline. */
146#define NEWLINE_CHAR '\n'
147#define WIDE_NEWLINE_CHAR L'\n'
148
149/* Rename to standard API for using out of glibc. */
150#ifndef _LIBC
151# undef __wctype
152# undef __iswctype
153# define __wctype wctype
154# define __iswalnum iswalnum
155# define __iswctype iswctype
156# define __towlower towlower
157# define __towupper towupper
158# define __btowc btowc
159# define __mbrtowc mbrtowc
160# define __wcrtomb wcrtomb
161# define __regfree regfree
162# define attribute_hidden
163#endif /* not _LIBC */
164
165#if __GNUC__ < 3 + (__GNUC_MINOR__ < 1)
166# define __attribute__(arg)
167#endif
168
169#ifndef SSIZE_MAX
170# define SSIZE_MAX ((ssize_t) (SIZE_MAX / 2))
171#endif
172
173/* The type of indexes into strings. This is signed, not size_t,
174 since the API requires indexes to fit in regoff_t anyway, and using
175 signed integers makes the code a bit smaller and presumably faster.
176 The traditional GNU regex implementation uses int for indexes.
177 The POSIX-compatible implementation uses a possibly-wider type.
178 The name 'Idx' is three letters to minimize the hassle of
179 reindenting a lot of regex code that formerly used 'int'. */
180typedef regoff_t Idx;
181#ifdef _REGEX_LARGE_OFFSETS
182# define IDX_MAX SSIZE_MAX
183#else
184# define IDX_MAX INT_MAX
185#endif
186
187/* A hash value, suitable for computing hash tables. */
188typedef __re_size_t re_hashval_t;
189
190/* An integer used to represent a set of bits. It must be unsigned,
191 and must be at least as wide as unsigned int. */
192typedef unsigned long int bitset_word_t;
193/* All bits set in a bitset_word_t. */
194#define BITSET_WORD_MAX ULONG_MAX
195
196/* Number of bits in a bitset_word_t. For portability to hosts with
197 padding bits, do not use '(sizeof (bitset_word_t) * CHAR_BIT)';
198 instead, deduce it directly from BITSET_WORD_MAX. Avoid
199 greater-than-32-bit integers and unconditional shifts by more than
200 31 bits, as they're not portable. */
201#if BITSET_WORD_MAX == 0xffffffffUL
202# define BITSET_WORD_BITS 32
203#elif BITSET_WORD_MAX >> 31 >> 4 == 1
204# define BITSET_WORD_BITS 36
205#elif BITSET_WORD_MAX >> 31 >> 16 == 1
206# define BITSET_WORD_BITS 48
207#elif BITSET_WORD_MAX >> 31 >> 28 == 1
208# define BITSET_WORD_BITS 60
209#elif BITSET_WORD_MAX >> 31 >> 31 >> 1 == 1
210# define BITSET_WORD_BITS 64
211#elif BITSET_WORD_MAX >> 31 >> 31 >> 9 == 1
212# define BITSET_WORD_BITS 72
213#elif BITSET_WORD_MAX >> 31 >> 31 >> 31 >> 31 >> 3 == 1
214# define BITSET_WORD_BITS 128
215#elif BITSET_WORD_MAX >> 31 >> 31 >> 31 >> 31 >> 31 >> 31 >> 31 >> 31 >> 7 == 1
216# define BITSET_WORD_BITS 256
217#elif BITSET_WORD_MAX >> 31 >> 31 >> 31 >> 31 >> 31 >> 31 >> 31 >> 31 >> 7 > 1
218# define BITSET_WORD_BITS 257 /* any value > SBC_MAX will do here */
219# if BITSET_WORD_BITS <= SBC_MAX
220# error "Invalid SBC_MAX"
221# endif
222#else
223# error "Add case for new bitset_word_t size"
224#endif
225
226/* Number of bitset_word_t values in a bitset_t. */
227#define BITSET_WORDS ((SBC_MAX + BITSET_WORD_BITS - 1) / BITSET_WORD_BITS)
228
229typedef bitset_word_t bitset_t[BITSET_WORDS];
230typedef bitset_word_t *re_bitset_ptr_t;
231typedef const bitset_word_t *re_const_bitset_ptr_t;
232
233#define PREV_WORD_CONSTRAINT 0x0001
234#define PREV_NOTWORD_CONSTRAINT 0x0002
235#define NEXT_WORD_CONSTRAINT 0x0004
236#define NEXT_NOTWORD_CONSTRAINT 0x0008
237#define PREV_NEWLINE_CONSTRAINT 0x0010
238#define NEXT_NEWLINE_CONSTRAINT 0x0020
239#define PREV_BEGBUF_CONSTRAINT 0x0040
240#define NEXT_ENDBUF_CONSTRAINT 0x0080
241#define WORD_DELIM_CONSTRAINT 0x0100
242#define NOT_WORD_DELIM_CONSTRAINT 0x0200
243
/* Named context kinds, each defined as one of the *_CONSTRAINT bit
   flags above or as the OR of a PREV_* flag with a NEXT_* flag.
   A value of this type is stored in re_token_t's opr.ctx_type for
   ANCHOR tokens.  */
244typedef enum
245{
246 INSIDE_WORD = PREV_WORD_CONSTRAINT | NEXT_WORD_CONSTRAINT,
247 WORD_FIRST = PREV_NOTWORD_CONSTRAINT | NEXT_WORD_CONSTRAINT,
248 WORD_LAST = PREV_WORD_CONSTRAINT | NEXT_NOTWORD_CONSTRAINT,
249 INSIDE_NOTWORD = PREV_NOTWORD_CONSTRAINT | NEXT_NOTWORD_CONSTRAINT,
250 LINE_FIRST = PREV_NEWLINE_CONSTRAINT,
251 LINE_LAST = NEXT_NEWLINE_CONSTRAINT,
252 BUF_FIRST = PREV_BEGBUF_CONSTRAINT,
253 BUF_LAST = NEXT_ENDBUF_CONSTRAINT,
254 WORD_DELIM = WORD_DELIM_CONSTRAINT,
255 NOT_WORD_DELIM = NOT_WORD_DELIM_CONSTRAINT
256} re_context_type;
257
/* A growable set of node indexes.  The re_node_set_* functions keep
   ELEMS in ascending order (re_node_set_contains binary-searches it).  */
258typedef struct
259{
/* Number of slots allocated for ELEMS; 0 means no storage yet.  */
260 Idx alloc;
/* Number of slots of ELEMS currently in use.  */
261 Idx nelem;
/* The elements themselves.  */
262 Idx *elems;
263} re_node_set;
264
265typedef enum
266{
267 NON_TYPE = 0,
268
269 /* Node type, These are used by token, node, tree. */
270 CHARACTER = 1,
271 END_OF_RE = 2,
272 SIMPLE_BRACKET = 3,
273 OP_BACK_REF = 4,
274 OP_PERIOD = 5,
275#ifdef RE_ENABLE_I18N
276 COMPLEX_BRACKET = 6,
277 OP_UTF8_PERIOD = 7,
278#endif /* RE_ENABLE_I18N */
279
280 /* We define EPSILON_BIT as a macro so that OP_OPEN_SUBEXP is used
281 when the debugger shows values of this enum type. */
282#define EPSILON_BIT 8
283 OP_OPEN_SUBEXP = EPSILON_BIT | 0,
284 OP_CLOSE_SUBEXP = EPSILON_BIT | 1,
285 OP_ALT = EPSILON_BIT | 2,
286 OP_DUP_ASTERISK = EPSILON_BIT | 3,
287 ANCHOR = EPSILON_BIT | 4,
288
289 /* Tree type, these are used only by tree. */
290 CONCAT = 16,
291 SUBEXP = 17,
292
293 /* Token type, these are used only by token. */
294 OP_DUP_PLUS = 18,
295 OP_DUP_QUESTION,
296 OP_OPEN_BRACKET,
297 OP_CLOSE_BRACKET,
298 OP_CHARSET_RANGE,
299 OP_OPEN_DUP_NUM,
300 OP_CLOSE_DUP_NUM,
301 OP_NON_MATCH_LIST,
302 OP_OPEN_COLL_ELEM,
303 OP_CLOSE_COLL_ELEM,
304 OP_OPEN_EQUIV_CLASS,
305 OP_CLOSE_EQUIV_CLASS,
306 OP_OPEN_CHAR_CLASS,
307 OP_CLOSE_CHAR_CLASS,
308 OP_WORD,
309 OP_NOTWORD,
310 OP_SPACE,
311 OP_NOTSPACE,
312 BACK_SLASH
313
314} re_token_type_t;
315
316#ifdef RE_ENABLE_I18N
317typedef struct
318{
319 /* Multibyte characters. */
320 wchar_t *mbchars;
321
322 /* Collating symbols. */
323# ifdef _LIBC
324 int32_t *coll_syms;
325# endif
326
327 /* Equivalence classes. */
328# ifdef _LIBC
329 int32_t *equiv_classes;
330# endif
331
332 /* Range expressions. */
333# ifdef _LIBC
334 uint32_t *range_starts;
335 uint32_t *range_ends;
336# else /* not _LIBC */
337 wchar_t *range_starts;
338 wchar_t *range_ends;
339# endif /* not _LIBC */
340
341 /* Character classes. */
342 wctype_t *char_classes;
343
344 /* If this character set is the non-matching list. */
345 unsigned int non_match : 1;
346
347 /* # of multibyte characters. */
348 Idx nmbchars;
349
350 /* # of collating symbols. */
351 Idx ncoll_syms;
352
353 /* # of equivalence classes. */
354 Idx nequiv_classes;
355
356 /* # of range expressions. */
357 Idx nranges;
358
359 /* # of character classes. */
360 Idx nchar_classes;
361} re_charset_t;
362#endif /* RE_ENABLE_I18N */
363
/* One token of the pattern; re_dfa_add_node also stores values of this
   type as the elements of dfa->nodes.  TYPE tells which member of the
   OPR union, if any, is meaningful.  */
364typedef struct
365{
366 union
367 {
368 unsigned char c; /* for CHARACTER */
369 re_bitset_ptr_t sbcset; /* for SIMPLE_BRACKET */
370#ifdef RE_ENABLE_I18N
371 re_charset_t *mbcset; /* for COMPLEX_BRACKET */
372#endif /* RE_ENABLE_I18N */
373 Idx idx; /* for BACK_REF */
374 re_context_type ctx_type; /* for ANCHOR */
375 } opr;
/* With GCC outside strict ANSI mode TYPE is packed into an 8-bit
   bit-field; otherwise it is a plain enum member.  */
376#if __GNUC__ >= 2 && !defined __STRICT_ANSI__
377 re_token_type_t type : 8;
378#else
379 re_token_type_t type;
380#endif
/* Ten bits wide, matching the ten *_CONSTRAINT flags (0x0001 through
   0x0200).  re_dfa_add_node resets it to 0 for every node it adds.  */
381 unsigned int constraint : 10; /* context constraint */
382 unsigned int duplicated : 1;
383 unsigned int opt_subexp : 1;
384#ifdef RE_ENABLE_I18N
/* Set by re_dfa_add_node for OP_PERIOD when dfa->mb_cur_max > 1 and
   for COMPLEX_BRACKET.  */
385 unsigned int accept_mb : 1;
386 /* These 2 bits can be moved into the union if needed (e.g. if running out
387 of bits; move opr.c to opr.c.c and move the flags to opr.c.flags). */
388 unsigned int mb_partial : 1;
389#endif
390 unsigned int word_char : 1;
391} re_token_t;
392
393#define IS_EPSILON_NODE(type) ((type) & EPSILON_BIT)
394
/* A string together with the buffers, lengths and cursor used while
   scanning it.  */
395struct re_string_t
396{
397 /* Indicate the raw buffer which is the original string passed as an
398 argument of regexec(), re_search(), etc. */
399 const unsigned char *raw_mbs;
400 /* Store the multibyte string. In case of "case insensitive mode" like
401 REG_ICASE, upper cases of the string are stored, otherwise MBS points
402 to the same address as RAW_MBS. */
403 unsigned char *mbs;
404#ifdef RE_ENABLE_I18N
405 /* Store the wide character string which is corresponding to MBS. */
406 wint_t *wcs;
407 Idx *offsets;
408 mbstate_t cur_state;
409#endif
410 /* Index in RAW_MBS. Each character mbs[i] corresponds to
411 raw_mbs[raw_mbs_idx + i]. */
412 Idx raw_mbs_idx;
413 /* The length of the valid characters in the buffers. */
414 Idx valid_len;
415 /* The corresponding number of bytes in raw_mbs array. */
416 Idx valid_raw_len;
417 /* The length of the buffers MBS and WCS. */
418 Idx bufs_len;
419 /* The index in MBS, which is updated by re_string_fetch_byte. */
420 Idx cur_idx;
421 /* Length of the RAW_MBS array. */
422 Idx raw_len;
423 /* This is RAW_LEN - RAW_MBS_IDX + VALID_LEN - VALID_RAW_LEN. */
424 Idx len;
425 /* End of the buffer may be shorter than its length in the cases such
426 as re_match_2, re_search_2. Then, we use STOP for end of the buffer
427 instead of LEN. */
428 Idx raw_stop;
429 /* This is RAW_STOP - RAW_MBS_IDX adjusted through OFFSETS. */
430 Idx stop;
431
432 /* The context of mbs[0]. We store the context independently, since
433 the context of mbs[0] may be different from raw_mbs[0], which is
434 the beginning of the input string. */
435 unsigned int tip_context;
436 /* The translation passed as a part of an argument of re_compile_pattern. */
437 RE_TRANSLATE_TYPE trans;
438 /* Copy of re_dfa_t's word_char. */
439 re_const_bitset_ptr_t word_char;
440 /* True if REG_ICASE. */
441 unsigned char icase;
   /* NOTE(review): the flags below share names with re_dfa_t members and
      are presumably copies of them -- confirm against the initializer.  */
442 unsigned char is_utf8;
443 unsigned char map_notascii;
444 unsigned char mbs_allocated;
445 unsigned char offsets_needed;
446 unsigned char newline_anchor;
447 unsigned char word_ops_used;
448 int mb_cur_max;
449};
450typedef struct re_string_t re_string_t;
451
452
/* struct re_dfa_t is defined further down; declare the name now so that
   earlier declarations (e.g. re_match_context_t) can point to it.  */
453struct re_dfa_t;
454typedef struct re_dfa_t re_dfa_t;
455
456#ifndef _LIBC
/* When not building as part of glibc, IS_IN (libc) is always false.  */
457# define IS_IN(libc) false
458#endif
459
/* Accessors for re_string_t.  PSTR is a pointer to the string object;
   IDX and OFFSET are indexes into, or distances within, its MBS/WCS
   buffers.  Arguments used inside arithmetic are parenthesized so that
   an expression argument (for example 'c ? a : b') is evaluated as a
   unit.  */

/* The byte OFFSET positions past the current index; does not advance.  */
#define re_string_peek_byte(pstr, offset) \
  ((pstr)->mbs[(pstr)->cur_idx + (offset)])
/* The byte at the current index; advances the index by one.  */
#define re_string_fetch_byte(pstr) \
  ((pstr)->mbs[(pstr)->cur_idx++])
/* True if IDX is the end of the valid data or WCS[IDX] is not WEOF.  */
#define re_string_first_byte(pstr, idx) \
  ((idx) == (pstr)->valid_len || (pstr)->wcs[idx] != WEOF)
/* True if neither WCS[IDX] nor its successor (if any) is WEOF.  */
#define re_string_is_single_byte_char(pstr, idx) \
  ((pstr)->wcs[idx] != WEOF && ((pstr)->valid_len == (idx) + 1 \
                                || (pstr)->wcs[(idx) + 1] != WEOF))
/* True once the current index has reached STOP.  */
#define re_string_eoi(pstr) ((pstr)->stop <= (pstr)->cur_idx)
#define re_string_cur_idx(pstr) ((pstr)->cur_idx)
#define re_string_get_buffer(pstr) ((pstr)->mbs)
#define re_string_length(pstr) ((pstr)->len)
#define re_string_byte_at(pstr,idx) ((pstr)->mbs[idx])
#define re_string_skip_bytes(pstr,idx) ((pstr)->cur_idx += (idx))
#define re_string_set_index(pstr,idx) ((pstr)->cur_idx = (idx))
476
/* Stack/heap allocation configuration and small generic helpers.  */
477#if defined _LIBC || HAVE_ALLOCA
478# include <alloca.h>
479#endif
480
481#ifndef _LIBC
482# if HAVE_ALLOCA
483/* The OS usually guarantees only one guard page at the bottom of the stack,
484 and a page size can be as small as 4096 bytes. So we cannot safely
485 allocate anything larger than 4096 bytes. Also care for the possibility
486 of a few compiler-allocated temporary stack slots. */
487# define __libc_use_alloca(n) ((n) < 4032)
488# else
489/* alloca is implemented with malloc, so just use malloc. */
490# define __libc_use_alloca(n) 0
491# undef alloca
492# define alloca(n) malloc (n)
493# endif
494#endif
495
/* MALLOC_0_IS_NONNULL is 1 inside glibc; elsewhere it defaults to 0
   unless the build already defined it.  */
496#ifdef _LIBC
497# define MALLOC_0_IS_NONNULL 1
498#elif !defined MALLOC_0_IS_NONNULL
499# define MALLOC_0_IS_NONNULL 0
500#endif
501
/* MAX and MIN evaluate one of their arguments twice.  */
502#ifndef MAX
503# define MAX(a,b) ((a) < (b) ? (b) : (a))
504#endif
505#ifndef MIN
506# define MIN(a,b) ((a) < (b) ? (a) : (b))
507#endif
508
/* Typed wrappers around malloc/realloc/free: allocate room for N objects
   of type T.  NOTE(review): N * sizeof (T) is not checked for overflow
   here; callers are presumably expected to bound N -- confirm.  */
509#define re_malloc(t,n) ((t *) malloc ((n) * sizeof (t)))
510#define re_realloc(p,t,n) ((t *) realloc (p, (n) * sizeof (t)))
511#define re_free(p) free (p)
512
/* A binary tree node carrying a re_token_t, with parent/child links and
   additional FIRST/NEXT links.  */
513struct bin_tree_t
514{
515 struct bin_tree_t *parent;
516 struct bin_tree_t *left;
517 struct bin_tree_t *right;
518 struct bin_tree_t *first;
519 struct bin_tree_t *next;
520
521 re_token_t token;
522
523 /* 'node_idx' is the index in dfa->nodes, if 'type' == 0.
524 Otherwise 'type' indicates the type of this node. */
525 Idx node_idx;
526};
527typedef struct bin_tree_t bin_tree_t;
528
/* Number of bin_tree_t elements per storage chunk: as many as fit in
   1024 bytes once room for one pointer is set aside.  */
529#define BIN_TREE_STORAGE_SIZE \
530 ((1024 - sizeof (void *)) / sizeof (bin_tree_t))
531
/* A chunk of BIN_TREE_STORAGE_SIZE tree nodes, chained through NEXT.  */
532struct bin_tree_storage_t
533{
534 struct bin_tree_storage_t *next;
535 bin_tree_t data[BIN_TREE_STORAGE_SIZE];
536};
537typedef struct bin_tree_storage_t bin_tree_storage_t;
538
/* Context flags; a context value is a bitwise OR of these.  */
#define CONTEXT_WORD 1
#define CONTEXT_NEWLINE (CONTEXT_WORD << 1)
#define CONTEXT_BEGBUF (CONTEXT_NEWLINE << 1)
#define CONTEXT_ENDBUF (CONTEXT_BEGBUF << 1)

/* Tests on a context value C.  */
#define IS_WORD_CONTEXT(c) ((c) & CONTEXT_WORD)
#define IS_NEWLINE_CONTEXT(c) ((c) & CONTEXT_NEWLINE)
#define IS_BEGBUF_CONTEXT(c) ((c) & CONTEXT_BEGBUF)
#define IS_ENDBUF_CONTEXT(c) ((c) & CONTEXT_ENDBUF)
#define IS_ORDINARY_CONTEXT(c) ((c) == 0)

/* Character classification used to derive contexts.  */
#define IS_WORD_CHAR(ch) (isalnum (ch) || (ch) == '_')
#define IS_NEWLINE(ch) ((ch) == NEWLINE_CHAR)
#define IS_WIDE_WORD_CHAR(ch) (__iswalnum (ch) || (ch) == L'_')
#define IS_WIDE_NEWLINE(ch) ((ch) == WIDE_NEWLINE_CHAR)

/* True if CONSTRAINT contains a PREV_* requirement that CONTEXT does not
   meet.  CONSTRAINT is parenthesized in every clause, matching
   NOT_SATISFY_NEXT_CONSTRAINT, so an expression argument such as
   'a | b' is masked as a whole.  */
#define NOT_SATISFY_PREV_CONSTRAINT(constraint,context) \
 ((((constraint) & PREV_WORD_CONSTRAINT) && !IS_WORD_CONTEXT (context)) \
  || (((constraint) & PREV_NOTWORD_CONSTRAINT) && IS_WORD_CONTEXT (context)) \
  || (((constraint) & PREV_NEWLINE_CONSTRAINT) && !IS_NEWLINE_CONTEXT (context)) \
  || (((constraint) & PREV_BEGBUF_CONSTRAINT) && !IS_BEGBUF_CONTEXT (context)))

/* True if CONSTRAINT contains a NEXT_* requirement that CONTEXT does not
   meet.  */
#define NOT_SATISFY_NEXT_CONSTRAINT(constraint,context) \
 ((((constraint) & NEXT_WORD_CONSTRAINT) && !IS_WORD_CONTEXT (context)) \
  || (((constraint) & NEXT_NOTWORD_CONSTRAINT) && IS_WORD_CONTEXT (context)) \
  || (((constraint) & NEXT_NEWLINE_CONSTRAINT) && !IS_NEWLINE_CONTEXT (context)) \
  || (((constraint) & NEXT_ENDBUF_CONSTRAINT) && !IS_ENDBUF_CONTEXT (context)))
566
/* One state of the DFA.  */
567struct re_dfastate_t
568{
569 re_hashval_t hash;
570 re_node_set nodes;
571 re_node_set non_eps_nodes;
572 re_node_set inveclosure;
573 re_node_set *entrance_nodes;
574 struct re_dfastate_t **trtable, **word_trtable;
   /* NOTE(review): four bits wide, presumably a mask of the CONTEXT_*
      flags -- confirm.  */
575 unsigned int context : 4;
576 unsigned int halt : 1;
577 /* If this state can accept "multi byte".
578 Note that we refer to multibyte characters, and multi character
579 collating elements as "multi byte". */
580 unsigned int accept_mb : 1;
581 /* If this state has backreference node(s). */
582 unsigned int has_backref : 1;
583 unsigned int has_constraint : 1;
584};
585typedef struct re_dfastate_t re_dfastate_t;
586
/* An array of DFA state pointers.  NOTE(review): NUM and ALLOC are
   presumably the used and allocated lengths of ARRAY -- confirm.  */
587struct re_state_table_entry
588{
589 Idx num;
590 Idx alloc;
591 re_dfastate_t **array;
592};
593
594/* Array type used in re_sub_match_last_t and re_sub_match_top_t. */
595
596typedef struct
597{
598 Idx next_idx;
599 Idx alloc;
600 re_dfastate_t **array;
601} state_array_t;
602
603/* Store information about the node NODE whose type is OP_CLOSE_SUBEXP. */
604
605typedef struct
606{
607 Idx node;
608 Idx str_idx; /* The position NODE match at. */
609 state_array_t path;
610} re_sub_match_last_t;
611
612/* Store information about the node NODE whose type is OP_OPEN_SUBEXP.
613 And information about the node, whose type is OP_CLOSE_SUBEXP,
614 corresponding to NODE is stored in LASTS. */
615
616typedef struct
617{
618 Idx str_idx;
619 Idx node;
620 state_array_t *path;
621 Idx alasts; /* Allocation size of LASTS. */
622 Idx nlasts; /* The number of LASTS. */
623 re_sub_match_last_t **lasts;
624} re_sub_match_top_t;
625
/* One entry of the back reference cache kept in re_match_context_t
   (its bkref_ents member is an array of these).  */
626struct re_backref_cache_entry
627{
628 Idx node;
629 Idx str_idx;
630 Idx subexp_from;
631 Idx subexp_to;
632 char more;
633 char unused;
634 unsigned short int eps_reachable_subexps_map;
635};
636
/* Working state for one matching run.  */
637typedef struct
638{
639 /* The string object corresponding to the input string. */
640 re_string_t input;
   /* The DFA pointer member is itself const only inside glibc or when
      compiling as C99 or later.  */
641#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
642 const re_dfa_t *const dfa;
643#else
644 const re_dfa_t *dfa;
645#endif
646 /* EFLAGS of the argument of regexec. */
647 int eflags;
648 /* Where the matching ends. */
649 Idx match_last;
650 Idx last_node;
651 /* The state log used by the matcher. */
652 re_dfastate_t **state_log;
653 Idx state_log_top;
654 /* Back reference cache. */
655 Idx nbkref_ents;
656 Idx abkref_ents;
657 struct re_backref_cache_entry *bkref_ents;
658 int max_mb_elem_len;
659 Idx nsub_tops;
660 Idx asub_tops;
661 re_sub_match_top_t **sub_tops;
662} re_match_context_t;
663
/* NOTE(review): presumably the state used while sifting the state log
   (see the sift_* functions in regexec.c) -- confirm.  */
664typedef struct
665{
666 re_dfastate_t **sifted_states;
667 re_dfastate_t **limited_states;
668 Idx last_node;
669 Idx last_str_idx;
670 re_node_set limits;
671} re_sift_context_t;
672
/* One entry of a struct re_fail_stack_t.  */
673struct re_fail_stack_ent_t
674{
675 Idx idx;
676 Idx node;
677 regmatch_t *regs;
678 re_node_set eps_via_nodes;
679};
680
/* A stack of re_fail_stack_ent_t.  NOTE(review): NUM and ALLOC are
   presumably the used and allocated lengths of STACK -- confirm.  */
681struct re_fail_stack_t
682{
683 Idx num;
684 Idx alloc;
685 struct re_fail_stack_ent_t *stack;
686};
687
/* The compiled form of a pattern; regexec.c reads it through the BUFFER
   member of regex_t / struct re_pattern_buffer.  */
688struct re_dfa_t
689{
690 re_token_t *nodes;
691 size_t nodes_alloc;
692 size_t nodes_len;
693 Idx *nexts;
694 Idx *org_indices;
695 re_node_set *edests;
696 re_node_set *eclosures;
697 re_node_set *inveclosures;
698 struct re_state_table_entry *state_table;
699 re_dfastate_t *init_state;
700 re_dfastate_t *init_state_word;
701 re_dfastate_t *init_state_nl;
702 re_dfastate_t *init_state_begbuf;
703 bin_tree_t *str_tree;
704 bin_tree_storage_t *str_tree_storage;
705 re_bitset_ptr_t sb_char;
706 int str_tree_storage_idx;
707
708 /* The number of subexpressions, 're_nsub', is in regex_t. */
709 re_hashval_t state_hash_mask;
710 Idx init_node;
711 Idx nbackref; /* The number of backreferences in this dfa. */
712
713 /* Bitmap expressing which backreference is used. */
714 bitset_word_t used_bkref_map;
715 bitset_word_t completed_bkref_map;
716
717 unsigned int has_plural_match : 1;
718 /* If this dfa has "multibyte node", which is a backreference or
719 a node which can accept multibyte character or multi character
720 collating element. */
721 unsigned int has_mb_node : 1;
722 unsigned int is_utf8 : 1;
723 unsigned int map_notascii : 1;
724 unsigned int word_ops_used : 1;
725 int mb_cur_max;
726 bitset_t word_char;
727 reg_syntax_t syntax;
728 Idx *subexp_map;
729#ifdef DEBUG
730 char* re_str;
731#endif
   /* lock_define is a macro defined elsewhere; note there is no trailing
      semicolon here.  The member is used as dfa->lock with
      lock_lock/lock_unlock in regexec.c.  */
732 lock_define (lock)
733};
734
/* Convenience operations on re_node_set.  */
/* Zero the whole re_node_set object SET points to.  */
735#define re_node_set_init_empty(set) memset (set, '\0', sizeof (re_node_set))
/* Remove ID from SET: the result of re_node_set_contains, minus one, is
   passed to re_node_set_remove_at as the position.  */
736#define re_node_set_remove(set,id) \
737 (re_node_set_remove_at (set, re_node_set_contains (set, id) - 1))
/* Reset the element count only; the storage is left as is.  */
738#define re_node_set_empty(p) ((p)->nelem = 0)
/* Free the element array only, not the set object itself.  */
739#define re_node_set_free(set) re_free ((set)->elems)
740
741
/* Kinds of bracket_elem_t; stored in its TYPE member.  */
742typedef enum
743{
744 SB_CHAR,
745 MB_CHAR,
746 EQUIV_CLASS,
747 COLL_SYM,
748 CHAR_CLASS
749} bracket_elem_type;
750
/* A tagged union: TYPE tells which member of OPR is meaningful.
   NOTE(review): presumably CH for SB_CHAR, WCH for MB_CHAR and NAME for
   the remaining kinds -- confirm against the bracket parser.  */
751typedef struct
752{
753 bracket_elem_type type;
754 union
755 {
756 unsigned char ch;
757 unsigned char *name;
758 wchar_t wch;
759 } opr;
760} bracket_elem_t;
761
762
763/* Functions for bitset_t operation. */
764
765static inline void
766bitset_set (bitset_t set, Idx i)
767{
768 set[i / BITSET_WORD_BITS] |= (bitset_word_t) 1 << i % BITSET_WORD_BITS;
769}
770
771static inline void
772bitset_clear (bitset_t set, Idx i)
773{
774 set[i / BITSET_WORD_BITS] &= ~ ((bitset_word_t) 1 << i % BITSET_WORD_BITS);
775}
776
777static inline bool
778bitset_contain (const bitset_t set, Idx i)
779{
780 return (set[i / BITSET_WORD_BITS] >> i % BITSET_WORD_BITS) & 1;
781}
782
783static inline void
784bitset_empty (bitset_t set)
785{
786 memset (set, '\0', sizeof (bitset_t));
787}
788
789static inline void
790bitset_set_all (bitset_t set)
791{
792 memset (set, -1, sizeof (bitset_word_t) * (SBC_MAX / BITSET_WORD_BITS));
793 if (SBC_MAX % BITSET_WORD_BITS != 0)
794 set[BITSET_WORDS - 1] =
795 ((bitset_word_t) 1 << SBC_MAX % BITSET_WORD_BITS) - 1;
796}
797
798static inline void
799bitset_copy (bitset_t dest, const bitset_t src)
800{
801 memcpy (dest, src, sizeof (bitset_t));
802}
803
804static inline void
805bitset_not (bitset_t set)
806{
807 int bitset_i;
808 for (bitset_i = 0; bitset_i < SBC_MAX / BITSET_WORD_BITS; ++bitset_i)
809 set[bitset_i] = ~set[bitset_i];
810 if (SBC_MAX % BITSET_WORD_BITS != 0)
811 set[BITSET_WORDS - 1] =
812 ((((bitset_word_t) 1 << SBC_MAX % BITSET_WORD_BITS) - 1)
813 & ~set[BITSET_WORDS - 1]);
814}
815
816static inline void
817bitset_merge (bitset_t dest, const bitset_t src)
818{
819 int bitset_i;
820 for (bitset_i = 0; bitset_i < BITSET_WORDS; ++bitset_i)
821 dest[bitset_i] |= src[bitset_i];
822}
823
824static inline void
825bitset_mask (bitset_t dest, const bitset_t src)
826{
827 int bitset_i;
828 for (bitset_i = 0; bitset_i < BITSET_WORDS; ++bitset_i)
829 dest[bitset_i] &= src[bitset_i];
830}
831
832#ifdef RE_ENABLE_I18N
833/* Functions for re_string. */
834static int
835__attribute__ ((pure, unused))
836re_string_char_size_at (const re_string_t *pstr, Idx idx)
837{
838 int byte_idx;
839 if (pstr->mb_cur_max == 1)
840 return 1;
841 for (byte_idx = 1; idx + byte_idx < pstr->valid_len; ++byte_idx)
842 if (pstr->wcs[idx + byte_idx] != WEOF)
843 break;
844 return byte_idx;
845}
846
847static wint_t
848__attribute__ ((pure, unused))
849re_string_wchar_at (const re_string_t *pstr, Idx idx)
850{
851 if (pstr->mb_cur_max == 1)
852 return (wint_t) pstr->mbs[idx];
853 return (wint_t) pstr->wcs[idx];
854}
855
856# ifdef _LIBC
857# include <locale/weight.h>
858# endif
859
/* Return the byte length of the element of PSTR starting at IDX.
   Outside glibc, or when the LC_COLLATE rule count is zero, this is
   always 1.  Otherwise findidx advances a pointer over the element and
   the distance it moved is returned.  */
860static int
861__attribute__ ((pure, unused))
862re_string_elem_size_at (const re_string_t *pstr, Idx idx)
863{
864# ifdef _LIBC
865 const unsigned char *p, *extra;
866 const int32_t *table, *indirect;
867 uint_fast32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
868
869 if (nrules != 0)
870 {
871 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
872 extra = (const unsigned char *)
873 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
874 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
875 _NL_COLLATE_INDIRECTMB);
876 p = pstr->mbs + idx;
   /* findidx receives &P and the number of bytes left after IDX.  */
877 findidx (table, indirect, extra, &p, pstr->len - idx);
878 return p - pstr->mbs - idx;
879 }
   /* The 'else' belongs to the _LIBC-only 'if' above; without _LIBC only
      the 'return 1' below remains.  */
880 else
881# endif /* _LIBC */
882 return 1;
883}
884#endif /* RE_ENABLE_I18N */
885
/* __GNUC_PREREQ (MAJ, MIN): nonzero if the compiler reports a GNU C
   version of at least MAJ.MIN; always 0 when __GNUC__ or __GNUC_MINOR__
   is not defined.  */
886#ifndef __GNUC_PREREQ
887# if defined __GNUC__ && defined __GNUC_MINOR__
888# define __GNUC_PREREQ(maj, min) \
889 ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min))
890# else
891# define __GNUC_PREREQ(maj, min) 0
892# endif
893#endif
894
/* Expands to the warn_unused_result attribute for GNU C 3.4 and later,
   and to nothing otherwise.  NOTE(review): only the first branch
   undefines a previous definition.  */
895#if __GNUC_PREREQ (3,4)
896# undef __attribute_warn_unused_result__
897# define __attribute_warn_unused_result__ \
898 __attribute__ ((__warn_unused_result__))
899#else
900# define __attribute_warn_unused_result__ /* empty */
901#endif
902
/* FALLTHROUGH: the fallthrough attribute for GNU C 7 and later, a no-op
   expression otherwise; an existing definition is kept.  */
903#ifndef FALLTHROUGH
904# if __GNUC__ < 7
905# define FALLTHROUGH ((void) 0)
906# else
907# define FALLTHROUGH __attribute__ ((__fallthrough__))
908# endif
909#endif
910
911#endif /* _REGEX_INTERNAL_H */
diff --git a/lib/regexec.c b/lib/regexec.c
new file mode 100644
index 00000000000..65913111644
--- /dev/null
+++ b/lib/regexec.c
@@ -0,0 +1,4324 @@
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002-2018 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public
8 License as published by the Free Software Foundation; either
9 version 3 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
15
16 You should have received a copy of the GNU General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
19
20static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
21 Idx n);
22static void match_ctx_clean (re_match_context_t *mctx);
23static void match_ctx_free (re_match_context_t *cache);
24static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
25 Idx str_idx, Idx from, Idx to);
26static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx);
27static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
28 Idx str_idx);
29static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
30 Idx node, Idx str_idx);
31static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
32 re_dfastate_t **limited_sts, Idx last_node,
33 Idx last_str_idx);
34static reg_errcode_t re_search_internal (const regex_t *preg,
35 const char *string, Idx length,
36 Idx start, Idx last_start, Idx stop,
37 size_t nmatch, regmatch_t pmatch[],
38 int eflags);
39static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
40 const char *string1, Idx length1,
41 const char *string2, Idx length2,
42 Idx start, regoff_t range,
43 struct re_registers *regs,
44 Idx stop, bool ret_len);
45static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
46 const char *string, Idx length, Idx start,
47 regoff_t range, Idx stop,
48 struct re_registers *regs,
49 bool ret_len);
50static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
51 Idx nregs, int regs_allocated);
52static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx);
53static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
54 Idx *p_match_first);
55static Idx check_halt_state_context (const re_match_context_t *mctx,
56 const re_dfastate_t *state, Idx idx);
57static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
58 regmatch_t *prev_idx_match, Idx cur_node,
59 Idx cur_idx, Idx nmatch);
60static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
61 Idx str_idx, Idx dest_node, Idx nregs,
62 regmatch_t *regs,
63 re_node_set *eps_via_nodes);
64static reg_errcode_t set_regs (const regex_t *preg,
65 const re_match_context_t *mctx,
66 size_t nmatch, regmatch_t *pmatch,
67 bool fl_backtrack);
68static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs);
69
70#ifdef RE_ENABLE_I18N
71static int sift_states_iter_mb (const re_match_context_t *mctx,
72 re_sift_context_t *sctx,
73 Idx node_idx, Idx str_idx, Idx max_str_idx);
74#endif /* RE_ENABLE_I18N */
75static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
76 re_sift_context_t *sctx);
77static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
78 re_sift_context_t *sctx, Idx str_idx,
79 re_node_set *cur_dest);
80static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
81 re_sift_context_t *sctx,
82 Idx str_idx,
83 re_node_set *dest_nodes);
84static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
85 re_node_set *dest_nodes,
86 const re_node_set *candidates);
87static bool check_dst_limits (const re_match_context_t *mctx,
88 const re_node_set *limits,
89 Idx dst_node, Idx dst_idx, Idx src_node,
90 Idx src_idx);
91static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
92 int boundaries, Idx subexp_idx,
93 Idx from_node, Idx bkref_idx);
94static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
95 Idx limit, Idx subexp_idx,
96 Idx node, Idx str_idx,
97 Idx bkref_idx);
98static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
99 re_node_set *dest_nodes,
100 const re_node_set *candidates,
101 re_node_set *limits,
102 struct re_backref_cache_entry *bkref_ents,
103 Idx str_idx);
104static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
105 re_sift_context_t *sctx,
106 Idx str_idx, const re_node_set *candidates);
107static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
108 re_dfastate_t **dst,
109 re_dfastate_t **src, Idx num);
110static re_dfastate_t *find_recover_state (reg_errcode_t *err,
111 re_match_context_t *mctx);
112static re_dfastate_t *transit_state (reg_errcode_t *err,
113 re_match_context_t *mctx,
114 re_dfastate_t *state);
115static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
116 re_match_context_t *mctx,
117 re_dfastate_t *next_state);
118static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
119 re_node_set *cur_nodes,
120 Idx str_idx);
121#if 0
122static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
123 re_match_context_t *mctx,
124 re_dfastate_t *pstate);
125#endif
126#ifdef RE_ENABLE_I18N
127static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
128 re_dfastate_t *pstate);
129#endif /* RE_ENABLE_I18N */
130static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
131 const re_node_set *nodes);
132static reg_errcode_t get_subexp (re_match_context_t *mctx,
133 Idx bkref_node, Idx bkref_str_idx);
134static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
135 const re_sub_match_top_t *sub_top,
136 re_sub_match_last_t *sub_last,
137 Idx bkref_node, Idx bkref_str);
138static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
139 Idx subexp_idx, int type);
140static reg_errcode_t check_arrival (re_match_context_t *mctx,
141 state_array_t *path, Idx top_node,
142 Idx top_str, Idx last_node, Idx last_str,
143 int type);
144static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
145 Idx str_idx,
146 re_node_set *cur_nodes,
147 re_node_set *next_nodes);
148static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
149 re_node_set *cur_nodes,
150 Idx ex_subexp, int type);
151static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
152 re_node_set *dst_nodes,
153 Idx target, Idx ex_subexp,
154 int type);
155static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
156 re_node_set *cur_nodes, Idx cur_str,
157 Idx subexp_num, int type);
158static bool build_trtable (const re_dfa_t *dfa, re_dfastate_t *state);
159#ifdef RE_ENABLE_I18N
160static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
161 const re_string_t *input, Idx idx);
162# ifdef _LIBC
163static unsigned int find_collation_sequence_value (const unsigned char *mbs,
164 size_t name_len);
165# endif /* _LIBC */
166#endif /* RE_ENABLE_I18N */
167static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
168 const re_dfastate_t *state,
169 re_node_set *states_node,
170 bitset_t *states_ch);
171static bool check_node_accept (const re_match_context_t *mctx,
172 const re_token_t *node, Idx idx);
173static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len);
174
175/* Entry point for POSIX code. */
176
177/* regexec searches for a given pattern, specified by PREG, in the
178 string STRING.
179
180 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
181 'regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
182 least NMATCH elements, and we set them to the offsets of the
183 corresponding matched substrings.
184
185 EFLAGS specifies "execution flags" which affect matching: if
186 REG_NOTBOL is set, then ^ does not match at the beginning of the
187 string; if REG_NOTEOL is set, then $ does not match at the end.
188
189 We return 0 if we find a match and REG_NOMATCH if not. */
190
191int
192regexec (const regex_t *_Restrict_ preg, const char *_Restrict_ string,
193 size_t nmatch, regmatch_t pmatch[], int eflags)
194{
195 reg_errcode_t err;
196 Idx start, length;
197 re_dfa_t *dfa = preg->buffer;
198
199 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
200 return REG_BADPAT;
201
202 if (eflags & REG_STARTEND)
203 {
204 start = pmatch[0].rm_so;
205 length = pmatch[0].rm_eo;
206 }
207 else
208 {
209 start = 0;
210 length = strlen (string);
211 }
212
213 lock_lock (dfa->lock);
214 if (preg->no_sub)
215 err = re_search_internal (preg, string, length, start, length,
216 length, 0, NULL, eflags);
217 else
218 err = re_search_internal (preg, string, length, start, length,
219 length, nmatch, pmatch, eflags);
220 lock_unlock (dfa->lock);
221 return err != REG_NOERROR;
222}
223
/* glibc-only symbol plumbing for regexec.  */
224#ifdef _LIBC
225libc_hidden_def (__regexec)
226
227# include <shlib-compat.h>
228versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
229
230# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
231__typeof__ (__regexec) __compat_regexec;
232
/* Compatibility entry point bound to the GLIBC_2_0 version of regexec:
   it passes only REG_NOTBOL and REG_NOTEOL through, dropping every other
   bit of EFLAGS (REG_STARTEND included).  */
233int
234attribute_compat_text_section
235__compat_regexec (const regex_t *_Restrict_ preg,
236 const char *_Restrict_ string, size_t nmatch,
237 regmatch_t pmatch[], int eflags)
238{
239 return regexec (preg, string, nmatch, pmatch,
240 eflags & (REG_NOTBOL | REG_NOTEOL));
241}
242compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
243# endif
244#endif
245
246/* Entry points for GNU code. */
247
248/* re_match, re_search, re_match_2, re_search_2
249
250 The former two functions operate on STRING with length LENGTH,
251 while the later two operate on concatenation of STRING1 and STRING2
252 with lengths LENGTH1 and LENGTH2, respectively.
253
254 re_match() matches the compiled pattern in BUFP against the string,
255 starting at index START.
256
257 re_search() first tries matching at index START, then it tries to match
258 starting from index START + 1, and so on. The last start position tried
259 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
260 way as re_match().)
261
262 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
263 the first STOP characters of the concatenation of the strings should be
264 concerned.
265
266 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
267 and all groups are stored in REGS. (For the "_2" variants, the offsets are
268 computed relative to the concatenation, not relative to the individual
269 strings.)
270
271 On success, re_match* functions return the length of the match, re_search*
272 return the position of the start of the match. Return value -1 means no
273 match was found and -2 indicates an internal error. */
274
275regoff_t
276re_match (struct re_pattern_buffer *bufp, const char *string, Idx length,
277 Idx start, struct re_registers *regs)
278{
279 return re_search_stub (bufp, string, length, start, 0, length, regs, true);
280}
281#ifdef _LIBC
282weak_alias (__re_match, re_match)
283#endif
284
285regoff_t
286re_search (struct re_pattern_buffer *bufp, const char *string, Idx length,
287 Idx start, regoff_t range, struct re_registers *regs)
288{
289 return re_search_stub (bufp, string, length, start, range, length, regs,
290 false);
291}
292#ifdef _LIBC
293weak_alias (__re_search, re_search)
294#endif
295
296regoff_t
297re_match_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
298 const char *string2, Idx length2, Idx start,
299 struct re_registers *regs, Idx stop)
300{
301 return re_search_2_stub (bufp, string1, length1, string2, length2,
302 start, 0, regs, stop, true);
303}
304#ifdef _LIBC
305weak_alias (__re_match_2, re_match_2)
306#endif
307
308regoff_t
309re_search_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
310 const char *string2, Idx length2, Idx start, regoff_t range,
311 struct re_registers *regs, Idx stop)
312{
313 return re_search_2_stub (bufp, string1, length1, string2, length2,
314 start, range, regs, stop, false);
315}
316#ifdef _LIBC
317weak_alias (__re_search_2, re_search_2)
318#endif
319
320static regoff_t
321re_search_2_stub (struct re_pattern_buffer *bufp, const char *string1,
322 Idx length1, const char *string2, Idx length2, Idx start,
323 regoff_t range, struct re_registers *regs,
324 Idx stop, bool ret_len)
325{
326 const char *str;
327 regoff_t rval;
328 Idx len;
329 char *s = NULL;
330
331 if (BE ((length1 < 0 || length2 < 0 || stop < 0
332 || INT_ADD_WRAPV (length1, length2, &len)),
333 0))
334 return -2;
335
336 /* Concatenate the strings. */
337 if (length2 > 0)
338 if (length1 > 0)
339 {
340 s = re_malloc (char, len);
341
342 if (BE (s == NULL, 0))
343 return -2;
344#ifdef _LIBC
345 memcpy (__mempcpy (s, string1, length1), string2, length2);
346#else
347 memcpy (s, string1, length1);
348 memcpy (s + length1, string2, length2);
349#endif
350 str = s;
351 }
352 else
353 str = string2;
354 else
355 str = string1;
356
357 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
358 ret_len);
359 re_free (s);
360 return rval;
361}
362
363/* The parameters have the same meaning as those of re_search.
364 Additional parameters:
365 If RET_LEN is true the length of the match is returned (re_match style);
366 otherwise the position of the match is returned. */
367
368static regoff_t
369re_search_stub (struct re_pattern_buffer *bufp, const char *string, Idx length,
370 Idx start, regoff_t range, Idx stop, struct re_registers *regs,
371 bool ret_len)
372{
373 reg_errcode_t result;
374 regmatch_t *pmatch;
375 Idx nregs;
376 regoff_t rval;
377 int eflags = 0;
378 re_dfa_t *dfa = bufp->buffer;
379 Idx last_start = start + range;
380
381 /* Check for out-of-range. */
382 if (BE (start < 0 || start > length, 0))
383 return -1;
384 if (BE (length < last_start || (0 <= range && last_start < start), 0))
385 last_start = length;
386 else if (BE (last_start < 0 || (range < 0 && start <= last_start), 0))
387 last_start = 0;
388
389 lock_lock (dfa->lock);
390
391 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
392 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
393
394 /* Compile fastmap if we haven't yet. */
395 if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate)
396 re_compile_fastmap (bufp);
397
398 if (BE (bufp->no_sub, 0))
399 regs = NULL;
400
401 /* We need at least 1 register. */
402 if (regs == NULL)
403 nregs = 1;
404 else if (BE (bufp->regs_allocated == REGS_FIXED
405 && regs->num_regs <= bufp->re_nsub, 0))
406 {
407 nregs = regs->num_regs;
408 if (BE (nregs < 1, 0))
409 {
410 /* Nothing can be copied to regs. */
411 regs = NULL;
412 nregs = 1;
413 }
414 }
415 else
416 nregs = bufp->re_nsub + 1;
417 pmatch = re_malloc (regmatch_t, nregs);
418 if (BE (pmatch == NULL, 0))
419 {
420 rval = -2;
421 goto out;
422 }
423
424 result = re_search_internal (bufp, string, length, start, last_start, stop,
425 nregs, pmatch, eflags);
426
427 rval = 0;
428
429 /* I hope we needn't fill their regs with -1's when no match was found. */
430 if (result != REG_NOERROR)
431 rval = result == REG_NOMATCH ? -1 : -2;
432 else if (regs != NULL)
433 {
434 /* If caller wants register contents data back, copy them. */
435 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
436 bufp->regs_allocated);
437 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
438 rval = -2;
439 }
440
441 if (BE (rval == 0, 1))
442 {
443 if (ret_len)
444 {
445 assert (pmatch[0].rm_so == start);
446 rval = pmatch[0].rm_eo - start;
447 }
448 else
449 rval = pmatch[0].rm_so;
450 }
451 re_free (pmatch);
452 out:
453 lock_unlock (dfa->lock);
454 return rval;
455}
456
457static unsigned
458re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
459 int regs_allocated)
460{
461 int rval = REGS_REALLOCATE;
462 Idx i;
463 Idx need_regs = nregs + 1;
464 /* We need one extra element beyond 'num_regs' for the '-1' marker GNU code
465 uses. */
466
467 /* Have the register data arrays been allocated? */
468 if (regs_allocated == REGS_UNALLOCATED)
469 { /* No. So allocate them with malloc. */
470 regs->start = re_malloc (regoff_t, need_regs);
471 if (BE (regs->start == NULL, 0))
472 return REGS_UNALLOCATED;
473 regs->end = re_malloc (regoff_t, need_regs);
474 if (BE (regs->end == NULL, 0))
475 {
476 re_free (regs->start);
477 return REGS_UNALLOCATED;
478 }
479 regs->num_regs = need_regs;
480 }
481 else if (regs_allocated == REGS_REALLOCATE)
482 { /* Yes. If we need more elements than were already
483 allocated, reallocate them. If we need fewer, just
484 leave it alone. */
485 if (BE (need_regs > regs->num_regs, 0))
486 {
487 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
488 regoff_t *new_end;
489 if (BE (new_start == NULL, 0))
490 return REGS_UNALLOCATED;
491 new_end = re_realloc (regs->end, regoff_t, need_regs);
492 if (BE (new_end == NULL, 0))
493 {
494 re_free (new_start);
495 return REGS_UNALLOCATED;
496 }
497 regs->start = new_start;
498 regs->end = new_end;
499 regs->num_regs = need_regs;
500 }
501 }
502 else
503 {
504 assert (regs_allocated == REGS_FIXED);
505 /* This function may not be called with REGS_FIXED and nregs too big. */
506 assert (regs->num_regs >= nregs);
507 rval = REGS_FIXED;
508 }
509
510 /* Copy the regs. */
511 for (i = 0; i < nregs; ++i)
512 {
513 regs->start[i] = pmatch[i].rm_so;
514 regs->end[i] = pmatch[i].rm_eo;
515 }
516 for ( ; i < regs->num_regs; ++i)
517 regs->start[i] = regs->end[i] = -1;
518
519 return rval;
520}
521
/* Tell the matcher to record register data for BUFP in caller-supplied
   storage.

   When NUM_REGS is nonzero, REGS is set to describe NUM_REGS registers
   kept in STARTS and ENDS, and BUFP is marked REGS_REALLOCATE so that
   subsequent matches using BUFP and REGS record into that memory.
   STARTS and ENDS must be allocated with malloc and must each be at
   least NUM_REGS * sizeof (regoff_t) bytes long.

   When NUM_REGS is zero, REGS is cleared and BUFP is marked
   REGS_UNALLOCATED, so subsequent matches allocate their own register
   data.

   Unless this function is called, the first search or match using
   BUFP allocates its own register data, without freeing the old
   data.  */

void
re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
                  __re_size_t num_regs, regoff_t *starts, regoff_t *ends)
{
  if (num_regs == 0)
    {
      /* No caller storage: let the matcher allocate on demand.  */
      bufp->regs_allocated = REGS_UNALLOCATED;
      regs->num_regs = 0;
      regs->end = NULL;
      regs->start = NULL;
      return;
    }

  bufp->regs_allocated = REGS_REALLOCATE;
  regs->num_regs = num_regs;
  regs->start = starts;
  regs->end = ends;
}
#ifdef _LIBC
weak_alias (__re_set_registers, re_set_registers)
#endif
556
/* Entry point compatible with the 4.2 BSD regex library.  It is only
   defined when specifically requested (_REGEX_RE_COMP) or inside libc.  */

#if defined _REGEX_RE_COMP || defined _LIBC
/* Match S against the pattern stored in re_comp_buf.
   Return 1 if regexec reports success (returns 0), and 0 otherwise.  */
int
# ifdef _LIBC
weak_function
# endif
re_exec (const char *s)
{
  int status = regexec (&re_comp_buf, s, 0, NULL, 0);
  return status == 0;
}
#endif /* _REGEX_RE_COMP */
570
571/* Internal entry point. */
572
573/* Searches for a compiled pattern PREG in the string STRING, whose
574 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
575 meaning as with regexec. LAST_START is START + RANGE, where
576 START and RANGE have the same meaning as with re_search.
577 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
578 otherwise return the error code.
579 Note: We assume front end functions already check ranges.
580 (0 <= LAST_START && LAST_START <= LENGTH) */
581
582static reg_errcode_t
583__attribute_warn_unused_result__
584re_search_internal (const regex_t *preg, const char *string, Idx length,
585 Idx start, Idx last_start, Idx stop, size_t nmatch,
586 regmatch_t pmatch[], int eflags)
587{
588 reg_errcode_t err;
589 const re_dfa_t *dfa = preg->buffer;
590 Idx left_lim, right_lim;
591 int incr;
592 bool fl_longest_match;
593 int match_kind;
594 Idx match_first;
595 Idx match_last = -1;
596 Idx extra_nmatch;
597 bool sb;
598 int ch;
599#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
600 re_match_context_t mctx = { .dfa = dfa };
601#else
602 re_match_context_t mctx;
603#endif
604 char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
605 && start != last_start && !preg->can_be_null)
606 ? preg->fastmap : NULL);
607 RE_TRANSLATE_TYPE t = preg->translate;
608
609#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
610 memset (&mctx, '\0', sizeof (re_match_context_t));
611 mctx.dfa = dfa;
612#endif
613
614 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
615 nmatch -= extra_nmatch;
616
617 /* Check if the DFA haven't been compiled. */
618 if (BE (preg->used == 0 || dfa->init_state == NULL
619 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
620 || dfa->init_state_begbuf == NULL, 0))
621 return REG_NOMATCH;
622
623#ifdef DEBUG
624 /* We assume front-end functions already check them. */
625 assert (0 <= last_start && last_start <= length);
626#endif
627
628 /* If initial states with non-begbuf contexts have no elements,
629 the regex must be anchored. If preg->newline_anchor is set,
630 we'll never use init_state_nl, so do not check it. */
631 if (dfa->init_state->nodes.nelem == 0
632 && dfa->init_state_word->nodes.nelem == 0
633 && (dfa->init_state_nl->nodes.nelem == 0
634 || !preg->newline_anchor))
635 {
636 if (start != 0 && last_start != 0)
637 return REG_NOMATCH;
638 start = last_start = 0;
639 }
640
641 /* We must check the longest matching, if nmatch > 0. */
642 fl_longest_match = (nmatch != 0 || dfa->nbackref);
643
644 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
645 preg->translate, (preg->syntax & RE_ICASE) != 0,
646 dfa);
647 if (BE (err != REG_NOERROR, 0))
648 goto free_return;
649 mctx.input.stop = stop;
650 mctx.input.raw_stop = stop;
651 mctx.input.newline_anchor = preg->newline_anchor;
652
653 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
654 if (BE (err != REG_NOERROR, 0))
655 goto free_return;
656
657 /* We will log all the DFA states through which the dfa pass,
658 if nmatch > 1, or this dfa has "multibyte node", which is a
659 back-reference or a node which can accept multibyte character or
660 multi character collating element. */
661 if (nmatch > 1 || dfa->has_mb_node)
662 {
663 /* Avoid overflow. */
664 if (BE ((MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
665 <= mctx.input.bufs_len), 0))
666 {
667 err = REG_ESPACE;
668 goto free_return;
669 }
670
671 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
672 if (BE (mctx.state_log == NULL, 0))
673 {
674 err = REG_ESPACE;
675 goto free_return;
676 }
677 }
678 else
679 mctx.state_log = NULL;
680
681 match_first = start;
682 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
683 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
684
685 /* Check incrementally whether the input string matches. */
686 incr = (last_start < start) ? -1 : 1;
687 left_lim = (last_start < start) ? last_start : start;
688 right_lim = (last_start < start) ? start : last_start;
689 sb = dfa->mb_cur_max == 1;
690 match_kind =
691 (fastmap
692 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
693 | (start <= last_start ? 2 : 0)
694 | (t != NULL ? 1 : 0))
695 : 8);
696
697 for (;; match_first += incr)
698 {
699 err = REG_NOMATCH;
700 if (match_first < left_lim || right_lim < match_first)
701 goto free_return;
702
703 /* Advance as rapidly as possible through the string, until we
704 find a plausible place to start matching. This may be done
705 with varying efficiency, so there are various possibilities:
706 only the most common of them are specialized, in order to
707 save on code size. We use a switch statement for speed. */
708 switch (match_kind)
709 {
710 case 8:
711 /* No fastmap. */
712 break;
713
714 case 7:
715 /* Fastmap with single-byte translation, match forward. */
716 while (BE (match_first < right_lim, 1)
717 && !fastmap[t[(unsigned char) string[match_first]]])
718 ++match_first;
719 goto forward_match_found_start_or_reached_end;
720
721 case 6:
722 /* Fastmap without translation, match forward. */
723 while (BE (match_first < right_lim, 1)
724 && !fastmap[(unsigned char) string[match_first]])
725 ++match_first;
726
727 forward_match_found_start_or_reached_end:
728 if (BE (match_first == right_lim, 0))
729 {
730 ch = match_first >= length
731 ? 0 : (unsigned char) string[match_first];
732 if (!fastmap[t ? t[ch] : ch])
733 goto free_return;
734 }
735 break;
736
737 case 4:
738 case 5:
739 /* Fastmap without multi-byte translation, match backwards. */
740 while (match_first >= left_lim)
741 {
742 ch = match_first >= length
743 ? 0 : (unsigned char) string[match_first];
744 if (fastmap[t ? t[ch] : ch])
745 break;
746 --match_first;
747 }
748 if (match_first < left_lim)
749 goto free_return;
750 break;
751
752 default:
753 /* In this case, we can't determine easily the current byte,
754 since it might be a component byte of a multibyte
755 character. Then we use the constructed buffer instead. */
756 for (;;)
757 {
758 /* If MATCH_FIRST is out of the valid range, reconstruct the
759 buffers. */
760 __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
761 if (BE (offset >= (__re_size_t) mctx.input.valid_raw_len, 0))
762 {
763 err = re_string_reconstruct (&mctx.input, match_first,
764 eflags);
765 if (BE (err != REG_NOERROR, 0))
766 goto free_return;
767
768 offset = match_first - mctx.input.raw_mbs_idx;
769 }
770 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
771 Note that MATCH_FIRST must not be smaller than 0. */
772 ch = (match_first >= length
773 ? 0 : re_string_byte_at (&mctx.input, offset));
774 if (fastmap[ch])
775 break;
776 match_first += incr;
777 if (match_first < left_lim || match_first > right_lim)
778 {
779 err = REG_NOMATCH;
780 goto free_return;
781 }
782 }
783 break;
784 }
785
786 /* Reconstruct the buffers so that the matcher can assume that
787 the matching starts from the beginning of the buffer. */
788 err = re_string_reconstruct (&mctx.input, match_first, eflags);
789 if (BE (err != REG_NOERROR, 0))
790 goto free_return;
791
792#ifdef RE_ENABLE_I18N
793 /* Don't consider this char as a possible match start if it part,
794 yet isn't the head, of a multibyte character. */
795 if (!sb && !re_string_first_byte (&mctx.input, 0))
796 continue;
797#endif
798
799 /* It seems to be appropriate one, then use the matcher. */
800 /* We assume that the matching starts from 0. */
801 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
802 match_last = check_matching (&mctx, fl_longest_match,
803 start <= last_start ? &match_first : NULL);
804 if (match_last != -1)
805 {
806 if (BE (match_last == -2, 0))
807 {
808 err = REG_ESPACE;
809 goto free_return;
810 }
811 else
812 {
813 mctx.match_last = match_last;
814 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
815 {
816 re_dfastate_t *pstate = mctx.state_log[match_last];
817 mctx.last_node = check_halt_state_context (&mctx, pstate,
818 match_last);
819 }
820 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
821 || dfa->nbackref)
822 {
823 err = prune_impossible_nodes (&mctx);
824 if (err == REG_NOERROR)
825 break;
826 if (BE (err != REG_NOMATCH, 0))
827 goto free_return;
828 match_last = -1;
829 }
830 else
831 break; /* We found a match. */
832 }
833 }
834
835 match_ctx_clean (&mctx);
836 }
837
838#ifdef DEBUG
839 assert (match_last != -1);
840 assert (err == REG_NOERROR);
841#endif
842
843 /* Set pmatch[] if we need. */
844 if (nmatch > 0)
845 {
846 Idx reg_idx;
847
848 /* Initialize registers. */
849 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
850 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
851
852 /* Set the points where matching start/end. */
853 pmatch[0].rm_so = 0;
854 pmatch[0].rm_eo = mctx.match_last;
855 /* FIXME: This function should fail if mctx.match_last exceeds
856 the maximum possible regoff_t value. We need a new error
857 code REG_OVERFLOW. */
858
859 if (!preg->no_sub && nmatch > 1)
860 {
861 err = set_regs (preg, &mctx, nmatch, pmatch,
862 dfa->has_plural_match && dfa->nbackref > 0);
863 if (BE (err != REG_NOERROR, 0))
864 goto free_return;
865 }
866
867 /* At last, add the offset to each register, since we slid
868 the buffers so that we could assume that the matching starts
869 from 0. */
870 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
871 if (pmatch[reg_idx].rm_so != -1)
872 {
873#ifdef RE_ENABLE_I18N
874 if (BE (mctx.input.offsets_needed != 0, 0))
875 {
876 pmatch[reg_idx].rm_so =
877 (pmatch[reg_idx].rm_so == mctx.input.valid_len
878 ? mctx.input.valid_raw_len
879 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
880 pmatch[reg_idx].rm_eo =
881 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
882 ? mctx.input.valid_raw_len
883 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
884 }
885#else
886 assert (mctx.input.offsets_needed == 0);
887#endif
888 pmatch[reg_idx].rm_so += match_first;
889 pmatch[reg_idx].rm_eo += match_first;
890 }
891 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
892 {
893 pmatch[nmatch + reg_idx].rm_so = -1;
894 pmatch[nmatch + reg_idx].rm_eo = -1;
895 }
896
897 if (dfa->subexp_map)
898 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
899 if (dfa->subexp_map[reg_idx] != reg_idx)
900 {
901 pmatch[reg_idx + 1].rm_so
902 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
903 pmatch[reg_idx + 1].rm_eo
904 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
905 }
906 }
907
908 free_return:
909 re_free (mctx.state_log);
910 if (dfa->nbackref)
911 match_ctx_free (&mctx);
912 re_string_destruct (&mctx.input);
913 return err;
914}
915
916static reg_errcode_t
917__attribute_warn_unused_result__
918prune_impossible_nodes (re_match_context_t *mctx)
919{
920 const re_dfa_t *const dfa = mctx->dfa;
921 Idx halt_node, match_last;
922 reg_errcode_t ret;
923 re_dfastate_t **sifted_states;
924 re_dfastate_t **lim_states = NULL;
925 re_sift_context_t sctx;
926#ifdef DEBUG
927 assert (mctx->state_log != NULL);
928#endif
929 match_last = mctx->match_last;
930 halt_node = mctx->last_node;
931
932 /* Avoid overflow. */
933 if (BE (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) <= match_last, 0))
934 return REG_ESPACE;
935
936 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
937 if (BE (sifted_states == NULL, 0))
938 {
939 ret = REG_ESPACE;
940 goto free_return;
941 }
942 if (dfa->nbackref)
943 {
944 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
945 if (BE (lim_states == NULL, 0))
946 {
947 ret = REG_ESPACE;
948 goto free_return;
949 }
950 while (1)
951 {
952 memset (lim_states, '\0',
953 sizeof (re_dfastate_t *) * (match_last + 1));
954 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
955 match_last);
956 ret = sift_states_backward (mctx, &sctx);
957 re_node_set_free (&sctx.limits);
958 if (BE (ret != REG_NOERROR, 0))
959 goto free_return;
960 if (sifted_states[0] != NULL || lim_states[0] != NULL)
961 break;
962 do
963 {
964 --match_last;
965 if (match_last < 0)
966 {
967 ret = REG_NOMATCH;
968 goto free_return;
969 }
970 } while (mctx->state_log[match_last] == NULL
971 || !mctx->state_log[match_last]->halt);
972 halt_node = check_halt_state_context (mctx,
973 mctx->state_log[match_last],
974 match_last);
975 }
976 ret = merge_state_array (dfa, sifted_states, lim_states,
977 match_last + 1);
978 re_free (lim_states);
979 lim_states = NULL;
980 if (BE (ret != REG_NOERROR, 0))
981 goto free_return;
982 }
983 else
984 {
985 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
986 ret = sift_states_backward (mctx, &sctx);
987 re_node_set_free (&sctx.limits);
988 if (BE (ret != REG_NOERROR, 0))
989 goto free_return;
990 if (sifted_states[0] == NULL)
991 {
992 ret = REG_NOMATCH;
993 goto free_return;
994 }
995 }
996 re_free (mctx->state_log);
997 mctx->state_log = sifted_states;
998 sifted_states = NULL;
999 mctx->last_node = halt_node;
1000 mctx->match_last = match_last;
1001 ret = REG_NOERROR;
1002 free_return:
1003 re_free (sifted_states);
1004 re_free (lim_states);
1005 return ret;
1006}
1007
1008/* Acquire an initial state and return it.
1009 We must select appropriate initial state depending on the context,
1010 since initial states may have constraints like "\<", "^", etc.. */
1011
1012static inline re_dfastate_t *
1013__attribute__ ((always_inline))
1014acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1015 Idx idx)
1016{
1017 const re_dfa_t *const dfa = mctx->dfa;
1018 if (dfa->init_state->has_constraint)
1019 {
1020 unsigned int context;
1021 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1022 if (IS_WORD_CONTEXT (context))
1023 return dfa->init_state_word;
1024 else if (IS_ORDINARY_CONTEXT (context))
1025 return dfa->init_state;
1026 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1027 return dfa->init_state_begbuf;
1028 else if (IS_NEWLINE_CONTEXT (context))
1029 return dfa->init_state_nl;
1030 else if (IS_BEGBUF_CONTEXT (context))
1031 {
1032 /* It is relatively rare case, then calculate on demand. */
1033 return re_acquire_state_context (err, dfa,
1034 dfa->init_state->entrance_nodes,
1035 context);
1036 }
1037 else
1038 /* Must not happen? */
1039 return dfa->init_state;
1040 }
1041 else
1042 return dfa->init_state;
1043}
1044
1045/* Check whether the regular expression match input string INPUT or not,
1046 and return the index where the matching end. Return -1 if
1047 there is no match, and return -2 in case of an error.
1048 FL_LONGEST_MATCH means we want the POSIX longest matching.
1049 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1050 next place where we may want to try matching.
1051 Note that the matcher assumes that the matching starts from the current
1052 index of the buffer. */
1053
1054static Idx
1055__attribute_warn_unused_result__
1056check_matching (re_match_context_t *mctx, bool fl_longest_match,
1057 Idx *p_match_first)
1058{
1059 const re_dfa_t *const dfa = mctx->dfa;
1060 reg_errcode_t err;
1061 Idx match = 0;
1062 Idx match_last = -1;
1063 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1064 re_dfastate_t *cur_state;
1065 bool at_init_state = p_match_first != NULL;
1066 Idx next_start_idx = cur_str_idx;
1067
1068 err = REG_NOERROR;
1069 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1070 /* An initial state must not be NULL (invalid). */
1071 if (BE (cur_state == NULL, 0))
1072 {
1073 assert (err == REG_ESPACE);
1074 return -2;
1075 }
1076
1077 if (mctx->state_log != NULL)
1078 {
1079 mctx->state_log[cur_str_idx] = cur_state;
1080
1081 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1082 later. E.g. Processing back references. */
1083 if (BE (dfa->nbackref, 0))
1084 {
1085 at_init_state = false;
1086 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1087 if (BE (err != REG_NOERROR, 0))
1088 return err;
1089
1090 if (cur_state->has_backref)
1091 {
1092 err = transit_state_bkref (mctx, &cur_state->nodes);
1093 if (BE (err != REG_NOERROR, 0))
1094 return err;
1095 }
1096 }
1097 }
1098
1099 /* If the RE accepts NULL string. */
1100 if (BE (cur_state->halt, 0))
1101 {
1102 if (!cur_state->has_constraint
1103 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1104 {
1105 if (!fl_longest_match)
1106 return cur_str_idx;
1107 else
1108 {
1109 match_last = cur_str_idx;
1110 match = 1;
1111 }
1112 }
1113 }
1114
1115 while (!re_string_eoi (&mctx->input))
1116 {
1117 re_dfastate_t *old_state = cur_state;
1118 Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1119
1120 if ((BE (next_char_idx >= mctx->input.bufs_len, 0)
1121 && mctx->input.bufs_len < mctx->input.len)
1122 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1123 && mctx->input.valid_len < mctx->input.len))
1124 {
1125 err = extend_buffers (mctx, next_char_idx + 1);
1126 if (BE (err != REG_NOERROR, 0))
1127 {
1128 assert (err == REG_ESPACE);
1129 return -2;
1130 }
1131 }
1132
1133 cur_state = transit_state (&err, mctx, cur_state);
1134 if (mctx->state_log != NULL)
1135 cur_state = merge_state_with_log (&err, mctx, cur_state);
1136
1137 if (cur_state == NULL)
1138 {
1139 /* Reached the invalid state or an error. Try to recover a valid
1140 state using the state log, if available and if we have not
1141 already found a valid (even if not the longest) match. */
1142 if (BE (err != REG_NOERROR, 0))
1143 return -2;
1144
1145 if (mctx->state_log == NULL
1146 || (match && !fl_longest_match)
1147 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1148 break;
1149 }
1150
1151 if (BE (at_init_state, 0))
1152 {
1153 if (old_state == cur_state)
1154 next_start_idx = next_char_idx;
1155 else
1156 at_init_state = false;
1157 }
1158
1159 if (cur_state->halt)
1160 {
1161 /* Reached a halt state.
1162 Check the halt state can satisfy the current context. */
1163 if (!cur_state->has_constraint
1164 || check_halt_state_context (mctx, cur_state,
1165 re_string_cur_idx (&mctx->input)))
1166 {
1167 /* We found an appropriate halt state. */
1168 match_last = re_string_cur_idx (&mctx->input);
1169 match = 1;
1170
1171 /* We found a match, do not modify match_first below. */
1172 p_match_first = NULL;
1173 if (!fl_longest_match)
1174 break;
1175 }
1176 }
1177 }
1178
1179 if (p_match_first)
1180 *p_match_first += next_start_idx;
1181
1182 return match_last;
1183}
1184
1185/* Check NODE match the current context. */
1186
1187static bool
1188check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1189{
1190 re_token_type_t type = dfa->nodes[node].type;
1191 unsigned int constraint = dfa->nodes[node].constraint;
1192 if (type != END_OF_RE)
1193 return false;
1194 if (!constraint)
1195 return true;
1196 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1197 return false;
1198 return true;
1199}
1200
1201/* Check the halt state STATE match the current context.
1202 Return 0 if not match, if the node, STATE has, is a halt node and
1203 match the context, return the node. */
1204
1205static Idx
1206check_halt_state_context (const re_match_context_t *mctx,
1207 const re_dfastate_t *state, Idx idx)
1208{
1209 Idx i;
1210 unsigned int context;
1211#ifdef DEBUG
1212 assert (state->halt);
1213#endif
1214 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1215 for (i = 0; i < state->nodes.nelem; ++i)
1216 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1217 return state->nodes.elems[i];
1218 return 0;
1219}
1220
1221/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1222 corresponding to the DFA).
1223 Return the destination node, and update EPS_VIA_NODES;
1224 return -1 in case of errors. */
1225
1226static Idx
1227proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
1228 Idx *pidx, Idx node, re_node_set *eps_via_nodes,
1229 struct re_fail_stack_t *fs)
1230{
1231 const re_dfa_t *const dfa = mctx->dfa;
1232 Idx i;
1233 bool ok;
1234 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1235 {
1236 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1237 re_node_set *edests = &dfa->edests[node];
1238 Idx dest_node;
1239 ok = re_node_set_insert (eps_via_nodes, node);
1240 if (BE (! ok, 0))
1241 return -2;
1242 /* Pick up a valid destination, or return -1 if none
1243 is found. */
1244 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1245 {
1246 Idx candidate = edests->elems[i];
1247 if (!re_node_set_contains (cur_nodes, candidate))
1248 continue;
1249 if (dest_node == -1)
1250 dest_node = candidate;
1251
1252 else
1253 {
1254 /* In order to avoid infinite loop like "(a*)*", return the second
1255 epsilon-transition if the first was already considered. */
1256 if (re_node_set_contains (eps_via_nodes, dest_node))
1257 return candidate;
1258
1259 /* Otherwise, push the second epsilon-transition on the fail stack. */
1260 else if (fs != NULL
1261 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1262 eps_via_nodes))
1263 return -2;
1264
1265 /* We know we are going to exit. */
1266 break;
1267 }
1268 }
1269 return dest_node;
1270 }
1271 else
1272 {
1273 Idx naccepted = 0;
1274 re_token_type_t type = dfa->nodes[node].type;
1275
1276#ifdef RE_ENABLE_I18N
1277 if (dfa->nodes[node].accept_mb)
1278 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1279 else
1280#endif /* RE_ENABLE_I18N */
1281 if (type == OP_BACK_REF)
1282 {
1283 Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1284 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1285 if (fs != NULL)
1286 {
1287 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1288 return -1;
1289 else if (naccepted)
1290 {
1291 char *buf = (char *) re_string_get_buffer (&mctx->input);
1292 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1293 naccepted) != 0)
1294 return -1;
1295 }
1296 }
1297
1298 if (naccepted == 0)
1299 {
1300 Idx dest_node;
1301 ok = re_node_set_insert (eps_via_nodes, node);
1302 if (BE (! ok, 0))
1303 return -2;
1304 dest_node = dfa->edests[node].elems[0];
1305 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1306 dest_node))
1307 return dest_node;
1308 }
1309 }
1310
1311 if (naccepted != 0
1312 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1313 {
1314 Idx dest_node = dfa->nexts[node];
1315 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1316 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1317 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1318 dest_node)))
1319 return -1;
1320 re_node_set_empty (eps_via_nodes);
1321 return dest_node;
1322 }
1323 }
1324 return -1;
1325}
1326
1327static reg_errcode_t
1328__attribute_warn_unused_result__
1329push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1330 Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1331{
1332 reg_errcode_t err;
1333 Idx num = fs->num++;
1334 if (fs->num == fs->alloc)
1335 {
1336 struct re_fail_stack_ent_t *new_array;
1337 new_array = re_realloc (fs->stack, struct re_fail_stack_ent_t,
1338 fs->alloc * 2);
1339 if (new_array == NULL)
1340 return REG_ESPACE;
1341 fs->alloc *= 2;
1342 fs->stack = new_array;
1343 }
1344 fs->stack[num].idx = str_idx;
1345 fs->stack[num].node = dest_node;
1346 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1347 if (fs->stack[num].regs == NULL)
1348 return REG_ESPACE;
1349 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1350 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1351 return err;
1352}
1353
1354static Idx
1355pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
1356 regmatch_t *regs, re_node_set *eps_via_nodes)
1357{
1358 Idx num = --fs->num;
1359 assert (num >= 0);
1360 *pidx = fs->stack[num].idx;
1361 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1362 re_node_set_free (eps_via_nodes);
1363 re_free (fs->stack[num].regs);
1364 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1365 return fs->stack[num].node;
1366}
1367
1368/* Set the positions where the subexpressions are starts/ends to registers
1369 PMATCH.
1370 Note: We assume that pmatch[0] is already set, and
1371 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1372
1373static reg_errcode_t
1374__attribute_warn_unused_result__
1375set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1376 regmatch_t *pmatch, bool fl_backtrack)
1377{
1378 const re_dfa_t *dfa = preg->buffer;
1379 Idx idx, cur_node;
1380 re_node_set eps_via_nodes;
1381 struct re_fail_stack_t *fs;
1382 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1383 regmatch_t *prev_idx_match;
1384 bool prev_idx_match_malloced = false;
1385
1386#ifdef DEBUG
1387 assert (nmatch > 1);
1388 assert (mctx->state_log != NULL);
1389#endif
1390 if (fl_backtrack)
1391 {
1392 fs = &fs_body;
1393 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1394 if (fs->stack == NULL)
1395 return REG_ESPACE;
1396 }
1397 else
1398 fs = NULL;
1399
1400 cur_node = dfa->init_node;
1401 re_node_set_init_empty (&eps_via_nodes);
1402
1403 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1404 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1405 else
1406 {
1407 prev_idx_match = re_malloc (regmatch_t, nmatch);
1408 if (prev_idx_match == NULL)
1409 {
1410 free_fail_stack_return (fs);
1411 return REG_ESPACE;
1412 }
1413 prev_idx_match_malloced = true;
1414 }
1415 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1416
1417 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1418 {
1419 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1420
1421 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1422 {
1423 Idx reg_idx;
1424 if (fs)
1425 {
1426 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1427 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1428 break;
1429 if (reg_idx == nmatch)
1430 {
1431 re_node_set_free (&eps_via_nodes);
1432 if (prev_idx_match_malloced)
1433 re_free (prev_idx_match);
1434 return free_fail_stack_return (fs);
1435 }
1436 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1437 &eps_via_nodes);
1438 }
1439 else
1440 {
1441 re_node_set_free (&eps_via_nodes);
1442 if (prev_idx_match_malloced)
1443 re_free (prev_idx_match);
1444 return REG_NOERROR;
1445 }
1446 }
1447
1448 /* Proceed to next node. */
1449 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1450 &eps_via_nodes, fs);
1451
1452 if (BE (cur_node < 0, 0))
1453 {
1454 if (BE (cur_node == -2, 0))
1455 {
1456 re_node_set_free (&eps_via_nodes);
1457 if (prev_idx_match_malloced)
1458 re_free (prev_idx_match);
1459 free_fail_stack_return (fs);
1460 return REG_ESPACE;
1461 }
1462 if (fs)
1463 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1464 &eps_via_nodes);
1465 else
1466 {
1467 re_node_set_free (&eps_via_nodes);
1468 if (prev_idx_match_malloced)
1469 re_free (prev_idx_match);
1470 return REG_NOMATCH;
1471 }
1472 }
1473 }
1474 re_node_set_free (&eps_via_nodes);
1475 if (prev_idx_match_malloced)
1476 re_free (prev_idx_match);
1477 return free_fail_stack_return (fs);
1478}
1479
1480static reg_errcode_t
1481free_fail_stack_return (struct re_fail_stack_t *fs)
1482{
1483 if (fs)
1484 {
1485 Idx fs_idx;
1486 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1487 {
1488 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1489 re_free (fs->stack[fs_idx].regs);
1490 }
1491 re_free (fs->stack);
1492 }
1493 return REG_NOERROR;
1494}
1495
1496static void
1497update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1498 regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch)
1499{
1500 int type = dfa->nodes[cur_node].type;
1501 if (type == OP_OPEN_SUBEXP)
1502 {
1503 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1504
1505 /* We are at the first node of this sub expression. */
1506 if (reg_num < nmatch)
1507 {
1508 pmatch[reg_num].rm_so = cur_idx;
1509 pmatch[reg_num].rm_eo = -1;
1510 }
1511 }
1512 else if (type == OP_CLOSE_SUBEXP)
1513 {
1514 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1515 if (reg_num < nmatch)
1516 {
1517 /* We are at the last node of this sub expression. */
1518 if (pmatch[reg_num].rm_so < cur_idx)
1519 {
1520 pmatch[reg_num].rm_eo = cur_idx;
1521 /* This is a non-empty match or we are not inside an optional
1522 subexpression. Accept this right away. */
1523 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1524 }
1525 else
1526 {
1527 if (dfa->nodes[cur_node].opt_subexp
1528 && prev_idx_match[reg_num].rm_so != -1)
1529 /* We transited through an empty match for an optional
1530 subexpression, like (a?)*, and this is not the subexp's
1531 first match. Copy back the old content of the registers
1532 so that matches of an inner subexpression are undone as
1533 well, like in ((a?))*. */
1534 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1535 else
1536 /* We completed a subexpression, but it may be part of
1537 an optional one, so do not update PREV_IDX_MATCH. */
1538 pmatch[reg_num].rm_eo = cur_idx;
1539 }
1540 }
1541 }
1542}
1543
1544/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1545 and sift the nodes in each states according to the following rules.
1546 Updated state_log will be wrote to STATE_LOG.
1547
1548 Rules: We throw away the Node 'a' in the STATE_LOG[STR_IDX] if...
1549 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1550 If 'a' isn't the LAST_NODE and 'a' can't epsilon transit to
1551 the LAST_NODE, we throw away the node 'a'.
1552 2. When 0 <= STR_IDX < MATCH_LAST and 'a' accepts
1553 string 's' and transit to 'b':
1554 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1555 away the node 'a'.
1556 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1557 thrown away, we throw away the node 'a'.
1558 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1559 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1560 node 'a'.
1561 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1562 we throw away the node 'a'. */
1563
1564#define STATE_NODE_CONTAINS(state,node) \
1565 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1566
1567static reg_errcode_t
1568sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1569{
1570 reg_errcode_t err;
1571 int null_cnt = 0;
1572 Idx str_idx = sctx->last_str_idx;
1573 re_node_set cur_dest;
1574
1575#ifdef DEBUG
1576 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1577#endif
1578
1579 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1580 transit to the last_node and the last_node itself. */
1581 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1582 if (BE (err != REG_NOERROR, 0))
1583 return err;
1584 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1585 if (BE (err != REG_NOERROR, 0))
1586 goto free_return;
1587
1588 /* Then check each states in the state_log. */
1589 while (str_idx > 0)
1590 {
1591 /* Update counters. */
1592 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1593 if (null_cnt > mctx->max_mb_elem_len)
1594 {
1595 memset (sctx->sifted_states, '\0',
1596 sizeof (re_dfastate_t *) * str_idx);
1597 re_node_set_free (&cur_dest);
1598 return REG_NOERROR;
1599 }
1600 re_node_set_empty (&cur_dest);
1601 --str_idx;
1602
1603 if (mctx->state_log[str_idx])
1604 {
1605 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1606 if (BE (err != REG_NOERROR, 0))
1607 goto free_return;
1608 }
1609
1610 /* Add all the nodes which satisfy the following conditions:
1611 - It can epsilon transit to a node in CUR_DEST.
1612 - It is in CUR_SRC.
1613 And update state_log. */
1614 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1615 if (BE (err != REG_NOERROR, 0))
1616 goto free_return;
1617 }
1618 err = REG_NOERROR;
1619 free_return:
1620 re_node_set_free (&cur_dest);
1621 return err;
1622}
1623
1624static reg_errcode_t
1625__attribute_warn_unused_result__
1626build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1627 Idx str_idx, re_node_set *cur_dest)
1628{
1629 const re_dfa_t *const dfa = mctx->dfa;
1630 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1631 Idx i;
1632
1633 /* Then build the next sifted state.
1634 We build the next sifted state on 'cur_dest', and update
1635 'sifted_states[str_idx]' with 'cur_dest'.
1636 Note:
1637 'cur_dest' is the sifted state from 'state_log[str_idx + 1]'.
1638 'cur_src' points the node_set of the old 'state_log[str_idx]'
1639 (with the epsilon nodes pre-filtered out). */
1640 for (i = 0; i < cur_src->nelem; i++)
1641 {
1642 Idx prev_node = cur_src->elems[i];
1643 int naccepted = 0;
1644 bool ok;
1645
1646#ifdef DEBUG
1647 re_token_type_t type = dfa->nodes[prev_node].type;
1648 assert (!IS_EPSILON_NODE (type));
1649#endif
1650#ifdef RE_ENABLE_I18N
1651 /* If the node may accept "multi byte". */
1652 if (dfa->nodes[prev_node].accept_mb)
1653 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1654 str_idx, sctx->last_str_idx);
1655#endif /* RE_ENABLE_I18N */
1656
1657 /* We don't check backreferences here.
1658 See update_cur_sifted_state(). */
1659 if (!naccepted
1660 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1661 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1662 dfa->nexts[prev_node]))
1663 naccepted = 1;
1664
1665 if (naccepted == 0)
1666 continue;
1667
1668 if (sctx->limits.nelem)
1669 {
1670 Idx to_idx = str_idx + naccepted;
1671 if (check_dst_limits (mctx, &sctx->limits,
1672 dfa->nexts[prev_node], to_idx,
1673 prev_node, str_idx))
1674 continue;
1675 }
1676 ok = re_node_set_insert (cur_dest, prev_node);
1677 if (BE (! ok, 0))
1678 return REG_ESPACE;
1679 }
1680
1681 return REG_NOERROR;
1682}
1683
1684/* Helper functions. */
1685
1686static reg_errcode_t
1687clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1688{
1689 Idx top = mctx->state_log_top;
1690
1691 if ((next_state_log_idx >= mctx->input.bufs_len
1692 && mctx->input.bufs_len < mctx->input.len)
1693 || (next_state_log_idx >= mctx->input.valid_len
1694 && mctx->input.valid_len < mctx->input.len))
1695 {
1696 reg_errcode_t err;
1697 err = extend_buffers (mctx, next_state_log_idx + 1);
1698 if (BE (err != REG_NOERROR, 0))
1699 return err;
1700 }
1701
1702 if (top < next_state_log_idx)
1703 {
1704 memset (mctx->state_log + top + 1, '\0',
1705 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1706 mctx->state_log_top = next_state_log_idx;
1707 }
1708 return REG_NOERROR;
1709}
1710
1711static reg_errcode_t
1712merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1713 re_dfastate_t **src, Idx num)
1714{
1715 Idx st_idx;
1716 reg_errcode_t err;
1717 for (st_idx = 0; st_idx < num; ++st_idx)
1718 {
1719 if (dst[st_idx] == NULL)
1720 dst[st_idx] = src[st_idx];
1721 else if (src[st_idx] != NULL)
1722 {
1723 re_node_set merged_set;
1724 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1725 &src[st_idx]->nodes);
1726 if (BE (err != REG_NOERROR, 0))
1727 return err;
1728 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1729 re_node_set_free (&merged_set);
1730 if (BE (err != REG_NOERROR, 0))
1731 return err;
1732 }
1733 }
1734 return REG_NOERROR;
1735}
1736
1737static reg_errcode_t
1738update_cur_sifted_state (const re_match_context_t *mctx,
1739 re_sift_context_t *sctx, Idx str_idx,
1740 re_node_set *dest_nodes)
1741{
1742 const re_dfa_t *const dfa = mctx->dfa;
1743 reg_errcode_t err = REG_NOERROR;
1744 const re_node_set *candidates;
1745 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1746 : &mctx->state_log[str_idx]->nodes);
1747
1748 if (dest_nodes->nelem == 0)
1749 sctx->sifted_states[str_idx] = NULL;
1750 else
1751 {
1752 if (candidates)
1753 {
1754 /* At first, add the nodes which can epsilon transit to a node in
1755 DEST_NODE. */
1756 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1757 if (BE (err != REG_NOERROR, 0))
1758 return err;
1759
1760 /* Then, check the limitations in the current sift_context. */
1761 if (sctx->limits.nelem)
1762 {
1763 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1764 mctx->bkref_ents, str_idx);
1765 if (BE (err != REG_NOERROR, 0))
1766 return err;
1767 }
1768 }
1769
1770 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1771 if (BE (err != REG_NOERROR, 0))
1772 return err;
1773 }
1774
1775 if (candidates && mctx->state_log[str_idx]->has_backref)
1776 {
1777 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1778 if (BE (err != REG_NOERROR, 0))
1779 return err;
1780 }
1781 return REG_NOERROR;
1782}
1783
1784static reg_errcode_t
1785__attribute_warn_unused_result__
1786add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1787 const re_node_set *candidates)
1788{
1789 reg_errcode_t err = REG_NOERROR;
1790 Idx i;
1791
1792 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1793 if (BE (err != REG_NOERROR, 0))
1794 return err;
1795
1796 if (!state->inveclosure.alloc)
1797 {
1798 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1799 if (BE (err != REG_NOERROR, 0))
1800 return REG_ESPACE;
1801 for (i = 0; i < dest_nodes->nelem; i++)
1802 {
1803 err = re_node_set_merge (&state->inveclosure,
1804 dfa->inveclosures + dest_nodes->elems[i]);
1805 if (BE (err != REG_NOERROR, 0))
1806 return REG_ESPACE;
1807 }
1808 }
1809 return re_node_set_add_intersect (dest_nodes, candidates,
1810 &state->inveclosure);
1811}
1812
1813static reg_errcode_t
1814sub_epsilon_src_nodes (const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1815 const re_node_set *candidates)
1816{
1817 Idx ecl_idx;
1818 reg_errcode_t err;
1819 re_node_set *inv_eclosure = dfa->inveclosures + node;
1820 re_node_set except_nodes;
1821 re_node_set_init_empty (&except_nodes);
1822 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1823 {
1824 Idx cur_node = inv_eclosure->elems[ecl_idx];
1825 if (cur_node == node)
1826 continue;
1827 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1828 {
1829 Idx edst1 = dfa->edests[cur_node].elems[0];
1830 Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1831 ? dfa->edests[cur_node].elems[1] : -1);
1832 if ((!re_node_set_contains (inv_eclosure, edst1)
1833 && re_node_set_contains (dest_nodes, edst1))
1834 || (edst2 > 0
1835 && !re_node_set_contains (inv_eclosure, edst2)
1836 && re_node_set_contains (dest_nodes, edst2)))
1837 {
1838 err = re_node_set_add_intersect (&except_nodes, candidates,
1839 dfa->inveclosures + cur_node);
1840 if (BE (err != REG_NOERROR, 0))
1841 {
1842 re_node_set_free (&except_nodes);
1843 return err;
1844 }
1845 }
1846 }
1847 }
1848 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1849 {
1850 Idx cur_node = inv_eclosure->elems[ecl_idx];
1851 if (!re_node_set_contains (&except_nodes, cur_node))
1852 {
1853 Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1854 re_node_set_remove_at (dest_nodes, idx);
1855 }
1856 }
1857 re_node_set_free (&except_nodes);
1858 return REG_NOERROR;
1859}
1860
1861static bool
1862check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1863 Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1864{
1865 const re_dfa_t *const dfa = mctx->dfa;
1866 Idx lim_idx, src_pos, dst_pos;
1867
1868 Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1869 Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1870 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1871 {
1872 Idx subexp_idx;
1873 struct re_backref_cache_entry *ent;
1874 ent = mctx->bkref_ents + limits->elems[lim_idx];
1875 subexp_idx = dfa->nodes[ent->node].opr.idx;
1876
1877 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1878 subexp_idx, dst_node, dst_idx,
1879 dst_bkref_idx);
1880 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1881 subexp_idx, src_node, src_idx,
1882 src_bkref_idx);
1883
1884 /* In case of:
1885 <src> <dst> ( <subexp> )
1886 ( <subexp> ) <src> <dst>
1887 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1888 if (src_pos == dst_pos)
1889 continue; /* This is unrelated limitation. */
1890 else
1891 return true;
1892 }
1893 return false;
1894}
1895
1896static int
1897check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1898 Idx subexp_idx, Idx from_node, Idx bkref_idx)
1899{
1900 const re_dfa_t *const dfa = mctx->dfa;
1901 const re_node_set *eclosures = dfa->eclosures + from_node;
1902 Idx node_idx;
1903
1904 /* Else, we are on the boundary: examine the nodes on the epsilon
1905 closure. */
1906 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1907 {
1908 Idx node = eclosures->elems[node_idx];
1909 switch (dfa->nodes[node].type)
1910 {
1911 case OP_BACK_REF:
1912 if (bkref_idx != -1)
1913 {
1914 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1915 do
1916 {
1917 Idx dst;
1918 int cpos;
1919
1920 if (ent->node != node)
1921 continue;
1922
1923 if (subexp_idx < BITSET_WORD_BITS
1924 && !(ent->eps_reachable_subexps_map
1925 & ((bitset_word_t) 1 << subexp_idx)))
1926 continue;
1927
1928 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1929 OP_CLOSE_SUBEXP cases below. But, if the
1930 destination node is the same node as the source
1931 node, don't recurse because it would cause an
1932 infinite loop: a regex that exhibits this behavior
1933 is ()\1*\1* */
1934 dst = dfa->edests[node].elems[0];
1935 if (dst == from_node)
1936 {
1937 if (boundaries & 1)
1938 return -1;
1939 else /* if (boundaries & 2) */
1940 return 0;
1941 }
1942
1943 cpos =
1944 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1945 dst, bkref_idx);
1946 if (cpos == -1 /* && (boundaries & 1) */)
1947 return -1;
1948 if (cpos == 0 && (boundaries & 2))
1949 return 0;
1950
1951 if (subexp_idx < BITSET_WORD_BITS)
1952 ent->eps_reachable_subexps_map
1953 &= ~((bitset_word_t) 1 << subexp_idx);
1954 }
1955 while (ent++->more);
1956 }
1957 break;
1958
1959 case OP_OPEN_SUBEXP:
1960 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1961 return -1;
1962 break;
1963
1964 case OP_CLOSE_SUBEXP:
1965 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1966 return 0;
1967 break;
1968
1969 default:
1970 break;
1971 }
1972 }
1973
1974 return (boundaries & 2) ? 1 : 0;
1975}
1976
1977static int
1978check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit,
1979 Idx subexp_idx, Idx from_node, Idx str_idx,
1980 Idx bkref_idx)
1981{
1982 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1983 int boundaries;
1984
1985 /* If we are outside the range of the subexpression, return -1 or 1. */
1986 if (str_idx < lim->subexp_from)
1987 return -1;
1988
1989 if (lim->subexp_to < str_idx)
1990 return 1;
1991
1992 /* If we are within the subexpression, return 0. */
1993 boundaries = (str_idx == lim->subexp_from);
1994 boundaries |= (str_idx == lim->subexp_to) << 1;
1995 if (boundaries == 0)
1996 return 0;
1997
1998 /* Else, examine epsilon closure. */
1999 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2000 from_node, bkref_idx);
2001}
2002
2003/* Check the limitations of sub expressions LIMITS, and remove the nodes
2004 which are against limitations from DEST_NODES. */
2005
2006static reg_errcode_t
2007check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2008 const re_node_set *candidates, re_node_set *limits,
2009 struct re_backref_cache_entry *bkref_ents, Idx str_idx)
2010{
2011 reg_errcode_t err;
2012 Idx node_idx, lim_idx;
2013
2014 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2015 {
2016 Idx subexp_idx;
2017 struct re_backref_cache_entry *ent;
2018 ent = bkref_ents + limits->elems[lim_idx];
2019
2020 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2021 continue; /* This is unrelated limitation. */
2022
2023 subexp_idx = dfa->nodes[ent->node].opr.idx;
2024 if (ent->subexp_to == str_idx)
2025 {
2026 Idx ops_node = -1;
2027 Idx cls_node = -1;
2028 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2029 {
2030 Idx node = dest_nodes->elems[node_idx];
2031 re_token_type_t type = dfa->nodes[node].type;
2032 if (type == OP_OPEN_SUBEXP
2033 && subexp_idx == dfa->nodes[node].opr.idx)
2034 ops_node = node;
2035 else if (type == OP_CLOSE_SUBEXP
2036 && subexp_idx == dfa->nodes[node].opr.idx)
2037 cls_node = node;
2038 }
2039
2040 /* Check the limitation of the open subexpression. */
2041 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2042 if (ops_node >= 0)
2043 {
2044 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2045 candidates);
2046 if (BE (err != REG_NOERROR, 0))
2047 return err;
2048 }
2049
2050 /* Check the limitation of the close subexpression. */
2051 if (cls_node >= 0)
2052 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2053 {
2054 Idx node = dest_nodes->elems[node_idx];
2055 if (!re_node_set_contains (dfa->inveclosures + node,
2056 cls_node)
2057 && !re_node_set_contains (dfa->eclosures + node,
2058 cls_node))
2059 {
2060 /* It is against this limitation.
2061 Remove it form the current sifted state. */
2062 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2063 candidates);
2064 if (BE (err != REG_NOERROR, 0))
2065 return err;
2066 --node_idx;
2067 }
2068 }
2069 }
2070 else /* (ent->subexp_to != str_idx) */
2071 {
2072 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2073 {
2074 Idx node = dest_nodes->elems[node_idx];
2075 re_token_type_t type = dfa->nodes[node].type;
2076 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2077 {
2078 if (subexp_idx != dfa->nodes[node].opr.idx)
2079 continue;
2080 /* It is against this limitation.
2081 Remove it form the current sifted state. */
2082 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2083 candidates);
2084 if (BE (err != REG_NOERROR, 0))
2085 return err;
2086 }
2087 }
2088 }
2089 }
2090 return REG_NOERROR;
2091}
2092
2093static reg_errcode_t
2094__attribute_warn_unused_result__
2095sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2096 Idx str_idx, const re_node_set *candidates)
2097{
2098 const re_dfa_t *const dfa = mctx->dfa;
2099 reg_errcode_t err;
2100 Idx node_idx, node;
2101 re_sift_context_t local_sctx;
2102 Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2103
2104 if (first_idx == -1)
2105 return REG_NOERROR;
2106
2107 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2108
2109 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2110 {
2111 Idx enabled_idx;
2112 re_token_type_t type;
2113 struct re_backref_cache_entry *entry;
2114 node = candidates->elems[node_idx];
2115 type = dfa->nodes[node].type;
2116 /* Avoid infinite loop for the REs like "()\1+". */
2117 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2118 continue;
2119 if (type != OP_BACK_REF)
2120 continue;
2121
2122 entry = mctx->bkref_ents + first_idx;
2123 enabled_idx = first_idx;
2124 do
2125 {
2126 Idx subexp_len;
2127 Idx to_idx;
2128 Idx dst_node;
2129 bool ok;
2130 re_dfastate_t *cur_state;
2131
2132 if (entry->node != node)
2133 continue;
2134 subexp_len = entry->subexp_to - entry->subexp_from;
2135 to_idx = str_idx + subexp_len;
2136 dst_node = (subexp_len ? dfa->nexts[node]
2137 : dfa->edests[node].elems[0]);
2138
2139 if (to_idx > sctx->last_str_idx
2140 || sctx->sifted_states[to_idx] == NULL
2141 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2142 || check_dst_limits (mctx, &sctx->limits, node,
2143 str_idx, dst_node, to_idx))
2144 continue;
2145
2146 if (local_sctx.sifted_states == NULL)
2147 {
2148 local_sctx = *sctx;
2149 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2150 if (BE (err != REG_NOERROR, 0))
2151 goto free_return;
2152 }
2153 local_sctx.last_node = node;
2154 local_sctx.last_str_idx = str_idx;
2155 ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2156 if (BE (! ok, 0))
2157 {
2158 err = REG_ESPACE;
2159 goto free_return;
2160 }
2161 cur_state = local_sctx.sifted_states[str_idx];
2162 err = sift_states_backward (mctx, &local_sctx);
2163 if (BE (err != REG_NOERROR, 0))
2164 goto free_return;
2165 if (sctx->limited_states != NULL)
2166 {
2167 err = merge_state_array (dfa, sctx->limited_states,
2168 local_sctx.sifted_states,
2169 str_idx + 1);
2170 if (BE (err != REG_NOERROR, 0))
2171 goto free_return;
2172 }
2173 local_sctx.sifted_states[str_idx] = cur_state;
2174 re_node_set_remove (&local_sctx.limits, enabled_idx);
2175
2176 /* mctx->bkref_ents may have changed, reload the pointer. */
2177 entry = mctx->bkref_ents + enabled_idx;
2178 }
2179 while (enabled_idx++, entry++->more);
2180 }
2181 err = REG_NOERROR;
2182 free_return:
2183 if (local_sctx.sifted_states != NULL)
2184 {
2185 re_node_set_free (&local_sctx.limits);
2186 }
2187
2188 return err;
2189}
2190
2191
#ifdef RE_ENABLE_I18N
/* Return the number of bytes node NODE_IDX accepts at STR_IDX as
   reported by check_node_accept_bytes().  The result is forced to 0
   when the match ends at or before MAX_STR_IDX but the node's successor
   is missing from the sifted state at the end position.  */
static int
sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
                     Idx node_idx, Idx str_idx, Idx max_str_idx)
{
  const re_dfa_t *const dfa = mctx->dfa;
  int len = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);

  /* Nothing accepted: hand the value back unchanged.  */
  if (len <= 0)
    return len;

  /* The successor is only looked up when the end position does not
     exceed MAX_STR_IDX.  */
  if (str_idx + len > max_str_idx)
    return len;

  /* The destination survived sifting, so the node really accepts LEN
     bytes.  */
  if (STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + len],
                           dfa->nexts[node_idx]))
    return len;

  /* The destination was thrown away, so treat the node as not
     accepting.  */
  return 0;
}
#endif /* RE_ENABLE_I18N */
2213
2214
2215/* Functions for state transition. */
2216
2217/* Return the next state to which the current state STATE will transit by
2218 accepting the current input byte, and update STATE_LOG if necessary.
2219 If STATE can accept a multibyte char/collating element/back reference
2220 update the destination of STATE_LOG. */
2221
2222static re_dfastate_t *
2223__attribute_warn_unused_result__
2224transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2225 re_dfastate_t *state)
2226{
2227 re_dfastate_t **trtable;
2228 unsigned char ch;
2229
2230#ifdef RE_ENABLE_I18N
2231 /* If the current state can accept multibyte. */
2232 if (BE (state->accept_mb, 0))
2233 {
2234 *err = transit_state_mb (mctx, state);
2235 if (BE (*err != REG_NOERROR, 0))
2236 return NULL;
2237 }
2238#endif /* RE_ENABLE_I18N */
2239
2240 /* Then decide the next state with the single byte. */
2241#if 0
2242 if (0)
2243 /* don't use transition table */
2244 return transit_state_sb (err, mctx, state);
2245#endif
2246
2247 /* Use transition table */
2248 ch = re_string_fetch_byte (&mctx->input);
2249 for (;;)
2250 {
2251 trtable = state->trtable;
2252 if (BE (trtable != NULL, 1))
2253 return trtable[ch];
2254
2255 trtable = state->word_trtable;
2256 if (BE (trtable != NULL, 1))
2257 {
2258 unsigned int context;
2259 context
2260 = re_string_context_at (&mctx->input,
2261 re_string_cur_idx (&mctx->input) - 1,
2262 mctx->eflags);
2263 if (IS_WORD_CONTEXT (context))
2264 return trtable[ch + SBC_MAX];
2265 else
2266 return trtable[ch];
2267 }
2268
2269 if (!build_trtable (mctx->dfa, state))
2270 {
2271 *err = REG_ESPACE;
2272 return NULL;
2273 }
2274
2275 /* Retry, we now have a transition table. */
2276 }
2277}
2278
2279/* Update the state_log if we need */
2280static re_dfastate_t *
2281merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2282 re_dfastate_t *next_state)
2283{
2284 const re_dfa_t *const dfa = mctx->dfa;
2285 Idx cur_idx = re_string_cur_idx (&mctx->input);
2286
2287 if (cur_idx > mctx->state_log_top)
2288 {
2289 mctx->state_log[cur_idx] = next_state;
2290 mctx->state_log_top = cur_idx;
2291 }
2292 else if (mctx->state_log[cur_idx] == 0)
2293 {
2294 mctx->state_log[cur_idx] = next_state;
2295 }
2296 else
2297 {
2298 re_dfastate_t *pstate;
2299 unsigned int context;
2300 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2301 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2302 the destination of a multibyte char/collating element/
2303 back reference. Then the next state is the union set of
2304 these destinations and the results of the transition table. */
2305 pstate = mctx->state_log[cur_idx];
2306 log_nodes = pstate->entrance_nodes;
2307 if (next_state != NULL)
2308 {
2309 table_nodes = next_state->entrance_nodes;
2310 *err = re_node_set_init_union (&next_nodes, table_nodes,
2311 log_nodes);
2312 if (BE (*err != REG_NOERROR, 0))
2313 return NULL;
2314 }
2315 else
2316 next_nodes = *log_nodes;
2317 /* Note: We already add the nodes of the initial state,
2318 then we don't need to add them here. */
2319
2320 context = re_string_context_at (&mctx->input,
2321 re_string_cur_idx (&mctx->input) - 1,
2322 mctx->eflags);
2323 next_state = mctx->state_log[cur_idx]
2324 = re_acquire_state_context (err, dfa, &next_nodes, context);
2325 /* We don't need to check errors here, since the return value of
2326 this function is next_state and ERR is already set. */
2327
2328 if (table_nodes != NULL)
2329 re_node_set_free (&next_nodes);
2330 }
2331
2332 if (BE (dfa->nbackref, 0) && next_state != NULL)
2333 {
2334 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2335 later. We must check them here, since the back references in the
2336 next state might use them. */
2337 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2338 cur_idx);
2339 if (BE (*err != REG_NOERROR, 0))
2340 return NULL;
2341
2342 /* If the next state has back references. */
2343 if (next_state->has_backref)
2344 {
2345 *err = transit_state_bkref (mctx, &next_state->nodes);
2346 if (BE (*err != REG_NOERROR, 0))
2347 return NULL;
2348 next_state = mctx->state_log[cur_idx];
2349 }
2350 }
2351
2352 return next_state;
2353}
2354
2355/* Skip bytes in the input that correspond to part of a
2356 multi-byte match, then look in the log for a state
2357 from which to restart matching. */
2358static re_dfastate_t *
2359find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2360{
2361 re_dfastate_t *cur_state;
2362 do
2363 {
2364 Idx max = mctx->state_log_top;
2365 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2366
2367 do
2368 {
2369 if (++cur_str_idx > max)
2370 return NULL;
2371 re_string_skip_bytes (&mctx->input, 1);
2372 }
2373 while (mctx->state_log[cur_str_idx] == NULL);
2374
2375 cur_state = merge_state_with_log (err, mctx, NULL);
2376 }
2377 while (*err == REG_NOERROR && cur_state == NULL);
2378 return cur_state;
2379}
2380
2381/* Helper functions for transit_state. */
2382
2383/* From the node set CUR_NODES, pick up the nodes whose types are
2384 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2385 expression. And register them to use them later for evaluating the
2386 corresponding back references. */
2387
2388static reg_errcode_t
2389check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2390 Idx str_idx)
2391{
2392 const re_dfa_t *const dfa = mctx->dfa;
2393 Idx node_idx;
2394 reg_errcode_t err;
2395
2396 /* TODO: This isn't efficient.
2397 Because there might be more than one nodes whose types are
2398 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2399 nodes.
2400 E.g. RE: (a){2} */
2401 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2402 {
2403 Idx node = cur_nodes->elems[node_idx];
2404 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2405 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2406 && (dfa->used_bkref_map
2407 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2408 {
2409 err = match_ctx_add_subtop (mctx, node, str_idx);
2410 if (BE (err != REG_NOERROR, 0))
2411 return err;
2412 }
2413 }
2414 return REG_NOERROR;
2415}
2416
#if 0
/* Compute, without a transition table, the state that STATE moves to
   on the current input byte, and advance the input by one byte.
   Returns NULL with *ERR set on failure.  (Compiled out.)  */

static re_dfastate_t *
transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
                  re_dfastate_t *state)
{
  const re_dfa_t *const dfa = mctx->dfa;
  Idx pos = re_string_cur_idx (&mctx->input);
  re_node_set reached;
  re_dfastate_t *result;
  unsigned int context;
  Idx k;

  *err = re_node_set_alloc (&reached, state->nodes.nelem + 1);
  if (BE (*err != REG_NOERROR, 0))
    return NULL;

  /* Gather the epsilon closures of the successors of every node that
     accepts the input at POS.  */
  for (k = 0; k < state->nodes.nelem; ++k)
    {
      Idx node = state->nodes.elems[k];

      if (!check_node_accept (mctx, dfa->nodes + node, pos))
        continue;

      *err = re_node_set_merge (&reached,
                                dfa->eclosures + dfa->nexts[node]);
      if (BE (*err != REG_NOERROR, 0))
        {
          re_node_set_free (&reached);
          return NULL;
        }
    }

  context = re_string_context_at (&mctx->input, pos, mctx->eflags);
  result = re_acquire_state_context (err, dfa, &reached, context);
  /* No error check is needed here: the return value of this function
     is RESULT and ERR is already set.  */

  re_node_set_free (&reached);
  re_string_skip_bytes (&mctx->input, 1);
  return result;
}
#endif
2458
#ifdef RE_ENABLE_I18N
/* For every node of PSTATE flagged accept_mb that accepts input at the
   current position, add the epsilon closure of its successor to the
   state log at the position where the accepted bytes end.  Also keeps
   MCTX->max_mb_elem_len at the largest accepted length seen.  */
static reg_errcode_t
transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
{
  const re_dfa_t *const dfa = mctx->dfa;
  reg_errcode_t err;
  Idx k;

  for (k = 0; k < pstate->nodes.nelem; ++k)
    {
      Idx node = pstate->nodes.elems[k];
      const re_token_t *tok = dfa->nodes + node;
      re_node_set merged;
      re_node_set *closure;
      re_dfastate_t *logged;
      unsigned int context;
      Idx dest_idx;
      int len;

      if (!tok->accept_mb)
        continue;

      /* Respect any constraint attached to the node.  */
      if (tok->constraint)
        {
          context = re_string_context_at (&mctx->input,
                                          re_string_cur_idx (&mctx->input),
                                          mctx->eflags);
          if (NOT_SATISFY_NEXT_CONSTRAINT (tok->constraint, context))
            continue;
        }

      /* How many bytes can the node accept here?  */
      len = check_node_accept_bytes (dfa, node, &mctx->input,
                                     re_string_cur_idx (&mctx->input));
      if (len == 0)
        continue;

      /* The node accepts LEN bytes.  */
      dest_idx = re_string_cur_idx (&mctx->input) + len;
      if (mctx->max_mb_elem_len < len)
        mctx->max_mb_elem_len = len;

      err = clean_state_log_if_needed (mctx, dest_idx);
      if (BE (err != REG_NOERROR, 0))
        return err;
#ifdef DEBUG
      assert (dfa->nexts[node] != -1);
#endif
      closure = dfa->eclosures + dfa->nexts[node];

      logged = mctx->state_log[dest_idx];
      if (logged != NULL)
        {
          err = re_node_set_init_union (&merged, logged->entrance_nodes,
                                        closure);
          if (BE (err != REG_NOERROR, 0))
            return err;
        }
      else
        /* Shallow copy that borrows CLOSURE's storage; it must not be
           freed below.  */
        merged = *closure;

      context = re_string_context_at (&mctx->input, dest_idx - 1,
                                      mctx->eflags);
      mctx->state_log[dest_idx]
        = re_acquire_state_context (&err, dfa, &merged, context);

      /* Free MERGED only when it was built by the union above.  */
      if (logged != NULL)
        re_node_set_free (&merged);

      if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
        return err;
    }
  return REG_NOERROR;
}
#endif /* RE_ENABLE_I18N */
2529
2530static reg_errcode_t
2531transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2532{
2533 const re_dfa_t *const dfa = mctx->dfa;
2534 reg_errcode_t err;
2535 Idx i;
2536 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2537
2538 for (i = 0; i < nodes->nelem; ++i)
2539 {
2540 Idx dest_str_idx, prev_nelem, bkc_idx;
2541 Idx node_idx = nodes->elems[i];
2542 unsigned int context;
2543 const re_token_t *node = dfa->nodes + node_idx;
2544 re_node_set *new_dest_nodes;
2545
2546 /* Check whether 'node' is a backreference or not. */
2547 if (node->type != OP_BACK_REF)
2548 continue;
2549
2550 if (node->constraint)
2551 {
2552 context = re_string_context_at (&mctx->input, cur_str_idx,
2553 mctx->eflags);
2554 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2555 continue;
2556 }
2557
2558 /* 'node' is a backreference.
2559 Check the substring which the substring matched. */
2560 bkc_idx = mctx->nbkref_ents;
2561 err = get_subexp (mctx, node_idx, cur_str_idx);
2562 if (BE (err != REG_NOERROR, 0))
2563 goto free_return;
2564
2565 /* And add the epsilon closures (which is 'new_dest_nodes') of
2566 the backreference to appropriate state_log. */
2567#ifdef DEBUG
2568 assert (dfa->nexts[node_idx] != -1);
2569#endif
2570 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2571 {
2572 Idx subexp_len;
2573 re_dfastate_t *dest_state;
2574 struct re_backref_cache_entry *bkref_ent;
2575 bkref_ent = mctx->bkref_ents + bkc_idx;
2576 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2577 continue;
2578 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2579 new_dest_nodes = (subexp_len == 0
2580 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2581 : dfa->eclosures + dfa->nexts[node_idx]);
2582 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2583 - bkref_ent->subexp_from);
2584 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2585 mctx->eflags);
2586 dest_state = mctx->state_log[dest_str_idx];
2587 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2588 : mctx->state_log[cur_str_idx]->nodes.nelem);
2589 /* Add 'new_dest_node' to state_log. */
2590 if (dest_state == NULL)
2591 {
2592 mctx->state_log[dest_str_idx]
2593 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2594 context);
2595 if (BE (mctx->state_log[dest_str_idx] == NULL
2596 && err != REG_NOERROR, 0))
2597 goto free_return;
2598 }
2599 else
2600 {
2601 re_node_set dest_nodes;
2602 err = re_node_set_init_union (&dest_nodes,
2603 dest_state->entrance_nodes,
2604 new_dest_nodes);
2605 if (BE (err != REG_NOERROR, 0))
2606 {
2607 re_node_set_free (&dest_nodes);
2608 goto free_return;
2609 }
2610 mctx->state_log[dest_str_idx]
2611 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2612 re_node_set_free (&dest_nodes);
2613 if (BE (mctx->state_log[dest_str_idx] == NULL
2614 && err != REG_NOERROR, 0))
2615 goto free_return;
2616 }
2617 /* We need to check recursively if the backreference can epsilon
2618 transit. */
2619 if (subexp_len == 0
2620 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2621 {
2622 err = check_subexp_matching_top (mctx, new_dest_nodes,
2623 cur_str_idx);
2624 if (BE (err != REG_NOERROR, 0))
2625 goto free_return;
2626 err = transit_state_bkref (mctx, new_dest_nodes);
2627 if (BE (err != REG_NOERROR, 0))
2628 goto free_return;
2629 }
2630 }
2631 }
2632 err = REG_NOERROR;
2633 free_return:
2634 return err;
2635}
2636
2637/* Enumerate all the candidates which the backreference BKREF_NODE can match
2638 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2639 Note that we might collect inappropriate candidates here.
2640 However, the cost of checking them strictly here is too high, then we
2641 delay these checking for prune_impossible_nodes(). */
2642
2643static reg_errcode_t
2644__attribute_warn_unused_result__
2645get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2646{
2647 const re_dfa_t *const dfa = mctx->dfa;
2648 Idx subexp_num, sub_top_idx;
2649 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2650 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2651 Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2652 if (cache_idx != -1)
2653 {
2654 const struct re_backref_cache_entry *entry
2655 = mctx->bkref_ents + cache_idx;
2656 do
2657 if (entry->node == bkref_node)
2658 return REG_NOERROR; /* We already checked it. */
2659 while (entry++->more);
2660 }
2661
2662 subexp_num = dfa->nodes[bkref_node].opr.idx;
2663
2664 /* For each sub expression */
2665 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2666 {
2667 reg_errcode_t err;
2668 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2669 re_sub_match_last_t *sub_last;
2670 Idx sub_last_idx, sl_str, bkref_str_off;
2671
2672 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2673 continue; /* It isn't related. */
2674
2675 sl_str = sub_top->str_idx;
2676 bkref_str_off = bkref_str_idx;
2677 /* At first, check the last node of sub expressions we already
2678 evaluated. */
2679 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2680 {
2681 regoff_t sl_str_diff;
2682 sub_last = sub_top->lasts[sub_last_idx];
2683 sl_str_diff = sub_last->str_idx - sl_str;
2684 /* The matched string by the sub expression match with the substring
2685 at the back reference? */
2686 if (sl_str_diff > 0)
2687 {
2688 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2689 {
2690 /* Not enough chars for a successful match. */
2691 if (bkref_str_off + sl_str_diff > mctx->input.len)
2692 break;
2693
2694 err = clean_state_log_if_needed (mctx,
2695 bkref_str_off
2696 + sl_str_diff);
2697 if (BE (err != REG_NOERROR, 0))
2698 return err;
2699 buf = (const char *) re_string_get_buffer (&mctx->input);
2700 }
2701 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2702 /* We don't need to search this sub expression any more. */
2703 break;
2704 }
2705 bkref_str_off += sl_str_diff;
2706 sl_str += sl_str_diff;
2707 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2708 bkref_str_idx);
2709
2710 /* Reload buf, since the preceding call might have reallocated
2711 the buffer. */
2712 buf = (const char *) re_string_get_buffer (&mctx->input);
2713
2714 if (err == REG_NOMATCH)
2715 continue;
2716 if (BE (err != REG_NOERROR, 0))
2717 return err;
2718 }
2719
2720 if (sub_last_idx < sub_top->nlasts)
2721 continue;
2722 if (sub_last_idx > 0)
2723 ++sl_str;
2724 /* Then, search for the other last nodes of the sub expression. */
2725 for (; sl_str <= bkref_str_idx; ++sl_str)
2726 {
2727 Idx cls_node;
2728 regoff_t sl_str_off;
2729 const re_node_set *nodes;
2730 sl_str_off = sl_str - sub_top->str_idx;
2731 /* The matched string by the sub expression match with the substring
2732 at the back reference? */
2733 if (sl_str_off > 0)
2734 {
2735 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2736 {
2737 /* If we are at the end of the input, we cannot match. */
2738 if (bkref_str_off >= mctx->input.len)
2739 break;
2740
2741 err = extend_buffers (mctx, bkref_str_off + 1);
2742 if (BE (err != REG_NOERROR, 0))
2743 return err;
2744
2745 buf = (const char *) re_string_get_buffer (&mctx->input);
2746 }
2747 if (buf [bkref_str_off++] != buf[sl_str - 1])
2748 break; /* We don't need to search this sub expression
2749 any more. */
2750 }
2751 if (mctx->state_log[sl_str] == NULL)
2752 continue;
2753 /* Does this state have a ')' of the sub expression? */
2754 nodes = &mctx->state_log[sl_str]->nodes;
2755 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2756 OP_CLOSE_SUBEXP);
2757 if (cls_node == -1)
2758 continue; /* No. */
2759 if (sub_top->path == NULL)
2760 {
2761 sub_top->path = calloc (sizeof (state_array_t),
2762 sl_str - sub_top->str_idx + 1);
2763 if (sub_top->path == NULL)
2764 return REG_ESPACE;
2765 }
2766 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2767 in the current context? */
2768 err = check_arrival (mctx, sub_top->path, sub_top->node,
2769 sub_top->str_idx, cls_node, sl_str,
2770 OP_CLOSE_SUBEXP);
2771 if (err == REG_NOMATCH)
2772 continue;
2773 if (BE (err != REG_NOERROR, 0))
2774 return err;
2775 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2776 if (BE (sub_last == NULL, 0))
2777 return REG_ESPACE;
2778 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2779 bkref_str_idx);
2780 if (err == REG_NOMATCH)
2781 continue;
2782 }
2783 }
2784 return REG_NOERROR;
2785}
2786
2787/* Helper functions for get_subexp(). */
2788
2789/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2790 If it can arrive, register the sub expression expressed with SUB_TOP
2791 and SUB_LAST. */
2792
2793static reg_errcode_t
2794get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2795 re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2796{
2797 reg_errcode_t err;
2798 Idx to_idx;
2799 /* Can the subexpression arrive the back reference? */
2800 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2801 sub_last->str_idx, bkref_node, bkref_str,
2802 OP_OPEN_SUBEXP);
2803 if (err != REG_NOERROR)
2804 return err;
2805 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2806 sub_last->str_idx);
2807 if (BE (err != REG_NOERROR, 0))
2808 return err;
2809 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2810 return clean_state_log_if_needed (mctx, to_idx);
2811}
2812
2813/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2814 Search '(' if FL_OPEN, or search ')' otherwise.
2815 TODO: This function isn't efficient...
2816 Because there might be more than one nodes whose types are
2817 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2818 nodes.
2819 E.g. RE: (a){2} */
2820
2821static Idx
2822find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2823 Idx subexp_idx, int type)
2824{
2825 Idx cls_idx;
2826 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2827 {
2828 Idx cls_node = nodes->elems[cls_idx];
2829 const re_token_t *node = dfa->nodes + cls_node;
2830 if (node->type == type
2831 && node->opr.idx == subexp_idx)
2832 return cls_node;
2833 }
2834 return -1;
2835}
2836
2837/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2838 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2839 heavily reused.
2840 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2841
2842static reg_errcode_t
2843__attribute_warn_unused_result__
2844check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node,
2845 Idx top_str, Idx last_node, Idx last_str, int type)
2846{
2847 const re_dfa_t *const dfa = mctx->dfa;
2848 reg_errcode_t err = REG_NOERROR;
2849 Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2850 re_dfastate_t *cur_state = NULL;
2851 re_node_set *cur_nodes, next_nodes;
2852 re_dfastate_t **backup_state_log;
2853 unsigned int context;
2854
2855 subexp_num = dfa->nodes[top_node].opr.idx;
2856 /* Extend the buffer if we need. */
2857 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2858 {
2859 re_dfastate_t **new_array;
2860 Idx old_alloc = path->alloc;
2861 Idx incr_alloc = last_str + mctx->max_mb_elem_len + 1;
2862 Idx new_alloc;
2863 if (BE (IDX_MAX - old_alloc < incr_alloc, 0))
2864 return REG_ESPACE;
2865 new_alloc = old_alloc + incr_alloc;
2866 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc, 0))
2867 return REG_ESPACE;
2868 new_array = re_realloc (path->array, re_dfastate_t *, new_alloc);
2869 if (BE (new_array == NULL, 0))
2870 return REG_ESPACE;
2871 path->array = new_array;
2872 path->alloc = new_alloc;
2873 memset (new_array + old_alloc, '\0',
2874 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2875 }
2876
2877 str_idx = path->next_idx ? path->next_idx : top_str;
2878
2879 /* Temporary modify MCTX. */
2880 backup_state_log = mctx->state_log;
2881 backup_cur_idx = mctx->input.cur_idx;
2882 mctx->state_log = path->array;
2883 mctx->input.cur_idx = str_idx;
2884
2885 /* Setup initial node set. */
2886 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2887 if (str_idx == top_str)
2888 {
2889 err = re_node_set_init_1 (&next_nodes, top_node);
2890 if (BE (err != REG_NOERROR, 0))
2891 return err;
2892 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2893 if (BE (err != REG_NOERROR, 0))
2894 {
2895 re_node_set_free (&next_nodes);
2896 return err;
2897 }
2898 }
2899 else
2900 {
2901 cur_state = mctx->state_log[str_idx];
2902 if (cur_state && cur_state->has_backref)
2903 {
2904 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2905 if (BE (err != REG_NOERROR, 0))
2906 return err;
2907 }
2908 else
2909 re_node_set_init_empty (&next_nodes);
2910 }
2911 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2912 {
2913 if (next_nodes.nelem)
2914 {
2915 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2916 subexp_num, type);
2917 if (BE (err != REG_NOERROR, 0))
2918 {
2919 re_node_set_free (&next_nodes);
2920 return err;
2921 }
2922 }
2923 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2924 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2925 {
2926 re_node_set_free (&next_nodes);
2927 return err;
2928 }
2929 mctx->state_log[str_idx] = cur_state;
2930 }
2931
2932 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2933 {
2934 re_node_set_empty (&next_nodes);
2935 if (mctx->state_log[str_idx + 1])
2936 {
2937 err = re_node_set_merge (&next_nodes,
2938 &mctx->state_log[str_idx + 1]->nodes);
2939 if (BE (err != REG_NOERROR, 0))
2940 {
2941 re_node_set_free (&next_nodes);
2942 return err;
2943 }
2944 }
2945 if (cur_state)
2946 {
2947 err = check_arrival_add_next_nodes (mctx, str_idx,
2948 &cur_state->non_eps_nodes,
2949 &next_nodes);
2950 if (BE (err != REG_NOERROR, 0))
2951 {
2952 re_node_set_free (&next_nodes);
2953 return err;
2954 }
2955 }
2956 ++str_idx;
2957 if (next_nodes.nelem)
2958 {
2959 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2960 if (BE (err != REG_NOERROR, 0))
2961 {
2962 re_node_set_free (&next_nodes);
2963 return err;
2964 }
2965 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2966 subexp_num, type);
2967 if (BE (err != REG_NOERROR, 0))
2968 {
2969 re_node_set_free (&next_nodes);
2970 return err;
2971 }
2972 }
2973 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2974 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2975 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2976 {
2977 re_node_set_free (&next_nodes);
2978 return err;
2979 }
2980 mctx->state_log[str_idx] = cur_state;
2981 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2982 }
2983 re_node_set_free (&next_nodes);
2984 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2985 : &mctx->state_log[last_str]->nodes);
2986 path->next_idx = str_idx;
2987
2988 /* Fix MCTX. */
2989 mctx->state_log = backup_state_log;
2990 mctx->input.cur_idx = backup_cur_idx;
2991
2992 /* Then check the current node set has the node LAST_NODE. */
2993 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2994 return REG_NOERROR;
2995
2996 return REG_NOMATCH;
2997}
2998
2999/* Helper functions for check_arrival. */
3000
3001/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3002 to NEXT_NODES.
3003 TODO: This function is similar to the functions transit_state*(),
3004 however this function has many additional works.
3005 Can't we unify them? */
3006
3007static reg_errcode_t
3008__attribute_warn_unused_result__
3009check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
3010 re_node_set *cur_nodes, re_node_set *next_nodes)
3011{
3012 const re_dfa_t *const dfa = mctx->dfa;
3013 bool ok;
3014 Idx cur_idx;
3015#ifdef RE_ENABLE_I18N
3016 reg_errcode_t err = REG_NOERROR;
3017#endif
3018 re_node_set union_set;
3019 re_node_set_init_empty (&union_set);
3020 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3021 {
3022 int naccepted = 0;
3023 Idx cur_node = cur_nodes->elems[cur_idx];
3024#ifdef DEBUG
3025 re_token_type_t type = dfa->nodes[cur_node].type;
3026 assert (!IS_EPSILON_NODE (type));
3027#endif
3028#ifdef RE_ENABLE_I18N
3029 /* If the node may accept "multi byte". */
3030 if (dfa->nodes[cur_node].accept_mb)
3031 {
3032 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3033 str_idx);
3034 if (naccepted > 1)
3035 {
3036 re_dfastate_t *dest_state;
3037 Idx next_node = dfa->nexts[cur_node];
3038 Idx next_idx = str_idx + naccepted;
3039 dest_state = mctx->state_log[next_idx];
3040 re_node_set_empty (&union_set);
3041 if (dest_state)
3042 {
3043 err = re_node_set_merge (&union_set, &dest_state->nodes);
3044 if (BE (err != REG_NOERROR, 0))
3045 {
3046 re_node_set_free (&union_set);
3047 return err;
3048 }
3049 }
3050 ok = re_node_set_insert (&union_set, next_node);
3051 if (BE (! ok, 0))
3052 {
3053 re_node_set_free (&union_set);
3054 return REG_ESPACE;
3055 }
3056 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3057 &union_set);
3058 if (BE (mctx->state_log[next_idx] == NULL
3059 && err != REG_NOERROR, 0))
3060 {
3061 re_node_set_free (&union_set);
3062 return err;
3063 }
3064 }
3065 }
3066#endif /* RE_ENABLE_I18N */
3067 if (naccepted
3068 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3069 {
3070 ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3071 if (BE (! ok, 0))
3072 {
3073 re_node_set_free (&union_set);
3074 return REG_ESPACE;
3075 }
3076 }
3077 }
3078 re_node_set_free (&union_set);
3079 return REG_NOERROR;
3080}
3081
3082/* For all the nodes in CUR_NODES, add the epsilon closures of them to
3083 CUR_NODES, however exclude the nodes which are:
3084 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3085 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3086*/
3087
3088static reg_errcode_t
3089check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3090 Idx ex_subexp, int type)
3091{
3092 reg_errcode_t err;
3093 Idx idx, outside_node;
3094 re_node_set new_nodes;
3095#ifdef DEBUG
3096 assert (cur_nodes->nelem);
3097#endif
3098 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3099 if (BE (err != REG_NOERROR, 0))
3100 return err;
3101 /* Create a new node set NEW_NODES with the nodes which are epsilon
3102 closures of the node in CUR_NODES. */
3103
3104 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3105 {
3106 Idx cur_node = cur_nodes->elems[idx];
3107 const re_node_set *eclosure = dfa->eclosures + cur_node;
3108 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3109 if (outside_node == -1)
3110 {
3111 /* There are no problematic nodes, just merge them. */
3112 err = re_node_set_merge (&new_nodes, eclosure);
3113 if (BE (err != REG_NOERROR, 0))
3114 {
3115 re_node_set_free (&new_nodes);
3116 return err;
3117 }
3118 }
3119 else
3120 {
3121 /* There are problematic nodes, re-calculate incrementally. */
3122 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3123 ex_subexp, type);
3124 if (BE (err != REG_NOERROR, 0))
3125 {
3126 re_node_set_free (&new_nodes);
3127 return err;
3128 }
3129 }
3130 }
3131 re_node_set_free (cur_nodes);
3132 *cur_nodes = new_nodes;
3133 return REG_NOERROR;
3134}
3135
3136/* Helper function for check_arrival_expand_ecl.
3137 Check incrementally the epsilon closure of TARGET, and if it isn't
3138 problematic append it to DST_NODES. */
3139
3140static reg_errcode_t
3141__attribute_warn_unused_result__
3142check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3143 Idx target, Idx ex_subexp, int type)
3144{
3145 Idx cur_node;
3146 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3147 {
3148 bool ok;
3149
3150 if (dfa->nodes[cur_node].type == type
3151 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3152 {
3153 if (type == OP_CLOSE_SUBEXP)
3154 {
3155 ok = re_node_set_insert (dst_nodes, cur_node);
3156 if (BE (! ok, 0))
3157 return REG_ESPACE;
3158 }
3159 break;
3160 }
3161 ok = re_node_set_insert (dst_nodes, cur_node);
3162 if (BE (! ok, 0))
3163 return REG_ESPACE;
3164 if (dfa->edests[cur_node].nelem == 0)
3165 break;
3166 if (dfa->edests[cur_node].nelem == 2)
3167 {
3168 reg_errcode_t err;
3169 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3170 dfa->edests[cur_node].elems[1],
3171 ex_subexp, type);
3172 if (BE (err != REG_NOERROR, 0))
3173 return err;
3174 }
3175 cur_node = dfa->edests[cur_node].elems[0];
3176 }
3177 return REG_NOERROR;
3178}
3179
3180
3181/* For all the back references in the current state, calculate the
3182 destination of the back references by the appropriate entry
3183 in MCTX->BKREF_ENTS. */
3184
3185static reg_errcode_t
3186__attribute_warn_unused_result__
3187expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3188 Idx cur_str, Idx subexp_num, int type)
3189{
3190 const re_dfa_t *const dfa = mctx->dfa;
3191 reg_errcode_t err;
3192 Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3193 struct re_backref_cache_entry *ent;
3194
3195 if (cache_idx_start == -1)
3196 return REG_NOERROR;
3197
3198 restart:
3199 ent = mctx->bkref_ents + cache_idx_start;
3200 do
3201 {
3202 Idx to_idx, next_node;
3203
3204 /* Is this entry ENT is appropriate? */
3205 if (!re_node_set_contains (cur_nodes, ent->node))
3206 continue; /* No. */
3207
3208 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3209 /* Calculate the destination of the back reference, and append it
3210 to MCTX->STATE_LOG. */
3211 if (to_idx == cur_str)
3212 {
3213 /* The backreference did epsilon transit, we must re-check all the
3214 node in the current state. */
3215 re_node_set new_dests;
3216 reg_errcode_t err2, err3;
3217 next_node = dfa->edests[ent->node].elems[0];
3218 if (re_node_set_contains (cur_nodes, next_node))
3219 continue;
3220 err = re_node_set_init_1 (&new_dests, next_node);
3221 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3222 err3 = re_node_set_merge (cur_nodes, &new_dests);
3223 re_node_set_free (&new_dests);
3224 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3225 || err3 != REG_NOERROR, 0))
3226 {
3227 err = (err != REG_NOERROR ? err
3228 : (err2 != REG_NOERROR ? err2 : err3));
3229 return err;
3230 }
3231 /* TODO: It is still inefficient... */
3232 goto restart;
3233 }
3234 else
3235 {
3236 re_node_set union_set;
3237 next_node = dfa->nexts[ent->node];
3238 if (mctx->state_log[to_idx])
3239 {
3240 bool ok;
3241 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3242 next_node))
3243 continue;
3244 err = re_node_set_init_copy (&union_set,
3245 &mctx->state_log[to_idx]->nodes);
3246 ok = re_node_set_insert (&union_set, next_node);
3247 if (BE (err != REG_NOERROR || ! ok, 0))
3248 {
3249 re_node_set_free (&union_set);
3250 err = err != REG_NOERROR ? err : REG_ESPACE;
3251 return err;
3252 }
3253 }
3254 else
3255 {
3256 err = re_node_set_init_1 (&union_set, next_node);
3257 if (BE (err != REG_NOERROR, 0))
3258 return err;
3259 }
3260 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3261 re_node_set_free (&union_set);
3262 if (BE (mctx->state_log[to_idx] == NULL
3263 && err != REG_NOERROR, 0))
3264 return err;
3265 }
3266 }
3267 while (ent++->more);
3268 return REG_NOERROR;
3269}
3270
/* Build the transition table for STATE and store it in STATE->trtable
   or, when separate word / non-word tables are needed,
   STATE->word_trtable.
   Return true if successful, false on failure.  On failure any temporary
   storage obtained here is released.  */

static bool
build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
{
  reg_errcode_t err;
  Idx i, j;
  int ch;
  bool need_word_trtable = false;
  bitset_word_t elem, mask;
  /* These flags record whether the corresponding buffer came from the
     heap (and must be freed) rather than from alloca.  */
  bool dests_node_malloced = false;
  bool dest_states_malloced = false;
  Idx ndests; /* Number of the destination states from 'state'.  */
  re_dfastate_t **trtable;
  re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
  re_node_set follows, *dests_node;
  bitset_t *dests_ch;
  bitset_t acceptable;

  /* Both work arrays are obtained with a single allocation.  */
  struct dests_alloc
  {
    re_node_set dests_node[SBC_MAX];
    bitset_t dests_ch[SBC_MAX];
  } *dests_alloc;

  /* We build DFA states which corresponds to the destination nodes
     from 'state'.  'dests_node[i]' represents the nodes which i-th
     destination state contains, and 'dests_ch[i]' represents the
     characters which i-th destination state accepts.  */
  if (__libc_use_alloca (sizeof (struct dests_alloc)))
    dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
  else
    {
      dests_alloc = re_malloc (struct dests_alloc, 1);
      if (BE (dests_alloc == NULL, 0))
        return false;
      dests_node_malloced = true;
    }
  dests_node = dests_alloc->dests_node;
  dests_ch = dests_alloc->dests_ch;

  /* Initialize transition table.  */
  state->word_trtable = state->trtable = NULL;

  /* At first, group all nodes belonging to 'state' into several
     destinations.  */
  ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
  if (BE (ndests <= 0, 0))
    {
      if (dests_node_malloced)
        re_free (dests_alloc);
      /* Return false in case of an error, true otherwise.  A count of
         zero is not an error: the state simply gets an all-NULL table.  */
      if (ndests == 0)
        {
          state->trtable = (re_dfastate_t **)
            calloc (sizeof (re_dfastate_t *), SBC_MAX);
          if (BE (state->trtable == NULL, 0))
            return false;
          return true;
        }
      return false;
    }

  err = re_node_set_alloc (&follows, ndests + 1);
  if (BE (err != REG_NOERROR, 0))
    goto out_free;

  /* Avoid arithmetic overflow in size calculation.  */
  if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
            / (3 * sizeof (re_dfastate_t *)))
           < ndests),
          0))
    goto out_free;

  /* One array holds three consecutive runs of NDESTS entries: the plain
     destination states, then the CONTEXT_WORD variants, then the
     CONTEXT_NEWLINE variants.  */
  if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
                         + ndests * 3 * sizeof (re_dfastate_t *)))
    dest_states = (re_dfastate_t **)
      alloca (ndests * 3 * sizeof (re_dfastate_t *));
  else
    {
      dest_states = re_malloc (re_dfastate_t *, ndests * 3);
      if (BE (dest_states == NULL, 0))
        {
          /* Shared failure exit.  It is also reached by 'goto' from
             before and after this point, which is why every buffer is
             guarded by its own "malloced" flag.  */
out_free:
          if (dest_states_malloced)
            re_free (dest_states);
          re_node_set_free (&follows);
          for (i = 0; i < ndests; ++i)
            re_node_set_free (dests_node + i);
          if (dests_node_malloced)
            re_free (dests_alloc);
          return false;
        }
      dest_states_malloced = true;
    }
  dest_states_word = dest_states + ndests;
  dest_states_nl = dest_states_word + ndests;
  bitset_empty (acceptable);

  /* Then build the states for all destinations.  */
  for (i = 0; i < ndests; ++i)
    {
      Idx next_node;
      re_node_set_empty (&follows);
      /* Merge the follows of this destination states.  */
      for (j = 0; j < dests_node[i].nelem; ++j)
        {
          next_node = dfa->nexts[dests_node[i].elems[j]];
          if (next_node != -1)
            {
              err = re_node_set_merge (&follows, dfa->eclosures + next_node);
              if (BE (err != REG_NOERROR, 0))
                goto out_free;
            }
        }
      dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
      if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
        goto out_free;
      /* If the new state has context constraint,
         build appropriate states for these contexts.  */
      if (dest_states[i]->has_constraint)
        {
          dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
                                                          CONTEXT_WORD);
          if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
            goto out_free;

          /* A distinct word-context state in a multi-byte locale means
             one table indexed by byte is not enough.  */
          if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
            need_word_trtable = true;

          dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
                                                        CONTEXT_NEWLINE);
          if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
            goto out_free;
        }
      else
        {
          dest_states_word[i] = dest_states[i];
          dest_states_nl[i] = dest_states[i];
        }
      /* Accumulate every byte some destination accepts.  */
      bitset_merge (acceptable, dests_ch[i]);
    }

  if (!BE (need_word_trtable, 0))
    {
      /* We don't care about whether the following character is a word
         character, or we are in a single-byte character set so we can
         discern by looking at the character code: allocate a single
         SBC_MAX-entry transition table.  */
      trtable = state->trtable =
        (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
      if (BE (trtable == NULL, 0))
        goto out_free;

      /* For all characters ch...: walk the set bits of each bitset word;
         MASK tracks the bit under inspection and CH the byte value it
         stands for.  */
      for (i = 0; i < BITSET_WORDS; ++i)
        for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
             elem;
             mask <<= 1, elem >>= 1, ++ch)
          if (BE (elem & 1, 0))
            {
              /* There must be exactly one destination which accepts
                 character ch.  See group_nodes_into_DFAstates.  */
              for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
                ;

              /* j-th destination accepts the character ch; pick the
                 word-context state when ch is a word character.  */
              if (dfa->word_char[i] & mask)
                trtable[ch] = dest_states_word[j];
              else
                trtable[ch] = dest_states[j];
            }
    }
  else
    {
      /* We care about whether the following character is a word
         character, and we are in a multi-byte character set: discern
         by looking at the character code: build two SBC_MAX-entry
         transition tables, one starting at trtable[0] and one
         starting at trtable[SBC_MAX].  */
      trtable = state->word_trtable =
        (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
      if (BE (trtable == NULL, 0))
        goto out_free;

      /* For all characters ch...:  */
      for (i = 0; i < BITSET_WORDS; ++i)
        for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
             elem;
             mask <<= 1, elem >>= 1, ++ch)
          if (BE (elem & 1, 0))
            {
              /* There must be exactly one destination which accepts
                 character ch.  See group_nodes_into_DFAstates.  */
              for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
                ;

              /* j-th destination accepts the character ch: the first
                 table gets the plain state, the second the word-context
                 state.  */
              trtable[ch] = dest_states[j];
              trtable[ch + SBC_MAX] = dest_states_word[j];
            }
    }

  /* new line */
  if (bitset_contain (acceptable, NEWLINE_CHAR))
    {
      /* The current state accepts newline character.  */
      for (j = 0; j < ndests; ++j)
        if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
          {
            /* j-th destination accepts newline character: override the
               entries written above with the newline-context state.  */
            trtable[NEWLINE_CHAR] = dest_states_nl[j];
            if (need_word_trtable)
              trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
            /* There must be only one destination which accepts
               newline.  See group_nodes_into_DFAstates.  */
            break;
          }
    }

  /* Success: release the temporaries (heap-allocated ones only).  */
  if (dest_states_malloced)
    re_free (dest_states);

  re_node_set_free (&follows);
  for (i = 0; i < ndests; ++i)
    re_node_set_free (dests_node + i);

  if (dests_node_malloced)
    re_free (dests_alloc);

  return true;
}
3504
/* Group all nodes belonging to STATE into several destinations.
   Then for all destinations, set the nodes belonging to the destination
   to DESTS_NODE[i] and set the characters accepted by the destination
   to DESTS_CH[i].  The character sets of different destinations are kept
   disjoint by splitting an existing destination whenever a new node's
   set only partly overlaps it.
   Return the number of destinations, or -1 on failure; on failure the
   node sets created so far have been freed.  */

static Idx
group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
                            re_node_set *dests_node, bitset_t *dests_ch)
{
  reg_errcode_t err;
  bool ok;
  Idx i, j, k;
  Idx ndests; /* Number of the destinations from 'state'.  */
  bitset_t accepts; /* Characters a node can accept.  */
  const re_node_set *cur_nodes = &state->nodes;
  /* 'accepts' is cleared only here and on the exits of the loop body
     below; every path that finishes a node leaves it empty again for
     the next one.  */
  bitset_empty (accepts);
  ndests = 0;

  /* For all the nodes belonging to 'state',  */
  for (i = 0; i < cur_nodes->nelem; ++i)
    {
      re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
      re_token_type_t type = node->type;
      unsigned int constraint = node->constraint;

      /* Enumerate all single byte character this node can accept.  */
      if (type == CHARACTER)
        bitset_set (accepts, node->opr.c);
      else if (type == SIMPLE_BRACKET)
        {
          bitset_merge (accepts, node->opr.sbcset);
        }
      else if (type == OP_PERIOD)
        {
#ifdef RE_ENABLE_I18N
          if (dfa->mb_cur_max > 1)
            bitset_merge (accepts, dfa->sb_char);
          else
#endif
            bitset_set_all (accepts);
          /* Honor the syntax bits controlling what '.' may match.  */
          if (!(dfa->syntax & RE_DOT_NEWLINE))
            bitset_clear (accepts, '\n');
          if (dfa->syntax & RE_DOT_NOT_NULL)
            bitset_clear (accepts, '\0');
        }
#ifdef RE_ENABLE_I18N
      else if (type == OP_UTF8_PERIOD)
        {
          /* Set the bits for the first ASCII_CHARS byte values, with a
             plain memset when that range fills whole bitset words.  */
          if (ASCII_CHARS % BITSET_WORD_BITS == 0)
            memset (accepts, -1, ASCII_CHARS / CHAR_BIT);
          else
            bitset_merge (accepts, utf8_sb_map);
          if (!(dfa->syntax & RE_DOT_NEWLINE))
            bitset_clear (accepts, '\n');
          if (dfa->syntax & RE_DOT_NOT_NULL)
            bitset_clear (accepts, '\0');
        }
#endif
      else
        /* Any other node type contributes no destination here.  */
        continue;

      /* Check the 'accepts' and sift out the characters which do not
         match in the node's context.  */
      if (constraint)
        {
          if (constraint & NEXT_NEWLINE_CONSTRAINT)
            {
              /* Only a newline can satisfy this constraint.  */
              bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
              bitset_empty (accepts);
              if (accepts_newline)
                bitset_set (accepts, NEWLINE_CHAR);
              else
                continue;
            }
          if (constraint & NEXT_ENDBUF_CONSTRAINT)
            {
              /* No character can satisfy this constraint.  */
              bitset_empty (accepts);
              continue;
            }

          if (constraint & NEXT_WORD_CONSTRAINT)
            {
              bitset_word_t any_set = 0;
              if (type == CHARACTER && !node->word_char)
                {
                  bitset_empty (accepts);
                  continue;
                }
              /* Keep only word characters (in the multi-byte case, also
                 the bytes outside dfa->sb_char); 'any_set' ends up
                 non-zero iff something is left.  */
#ifdef RE_ENABLE_I18N
              if (dfa->mb_cur_max > 1)
                for (j = 0; j < BITSET_WORDS; ++j)
                  any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
              else
#endif
                for (j = 0; j < BITSET_WORDS; ++j)
                  any_set |= (accepts[j] &= dfa->word_char[j]);
              if (!any_set)
                continue;
            }
          if (constraint & NEXT_NOTWORD_CONSTRAINT)
            {
              bitset_word_t any_set = 0;
              if (type == CHARACTER && node->word_char)
                {
                  bitset_empty (accepts);
                  continue;
                }
              /* Drop the word characters (in the multi-byte case, only
                 those that are also in dfa->sb_char).  */
#ifdef RE_ENABLE_I18N
              if (dfa->mb_cur_max > 1)
                for (j = 0; j < BITSET_WORDS; ++j)
                  any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
              else
#endif
                for (j = 0; j < BITSET_WORDS; ++j)
                  any_set |= (accepts[j] &= ~dfa->word_char[j]);
              if (!any_set)
                continue;
            }
        }

      /* Then divide 'accepts' into DFA states, or create a new
         state.  Above, we make sure that accepts is not empty.  */
      for (j = 0; j < ndests; ++j)
        {
          bitset_t intersec; /* Intersection sets, see below.  */
          bitset_t remains;
          /* Flags, see below.  */
          bitset_word_t has_intersec, not_subset, not_consumed;

          /* Optimization, skip if this state doesn't accept the character.  */
          if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
            continue;

          /* Enumerate the intersection set of this state and 'accepts'.  */
          has_intersec = 0;
          for (k = 0; k < BITSET_WORDS; ++k)
            has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
          /* And skip if the intersection set is empty.  */
          if (!has_intersec)
            continue;

          /* Then check if this state is a subset of 'accepts'.
             'remains' gets the part of the destination outside
             'accepts'; 'accepts' itself is reduced to the part not
             covered by this destination.  */
          not_subset = not_consumed = 0;
          for (k = 0; k < BITSET_WORDS; ++k)
            {
              not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
              not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
            }

          /* If this state isn't a subset of 'accepts', create a
             new group state, which has the 'remains'.  The j-th
             destination shrinks to the intersection.  */
          if (not_subset)
            {
              bitset_copy (dests_ch[ndests], remains);
              bitset_copy (dests_ch[j], intersec);
              err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
              if (BE (err != REG_NOERROR, 0))
                goto error_return;
              ++ndests;
            }

          /* Put the position in the current group.  */
          ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
          if (BE (! ok, 0))
            goto error_return;

          /* If all characters are consumed, go to next node.  */
          if (!not_consumed)
            break;
        }
      /* Some characters remain, create a new group.  */
      if (j == ndests)
        {
          bitset_copy (dests_ch[ndests], accepts);
          err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
          if (BE (err != REG_NOERROR, 0))
            goto error_return;
          ++ndests;
          bitset_empty (accepts);
        }
    }
  return ndests;
 error_return:
  /* Free the node sets built so far and report the failure as -1.  */
  for (j = 0; j < ndests; ++j)
    re_node_set_free (dests_node + j);
  return -1;
}
3692
3693#ifdef RE_ENABLE_I18N
3694/* Check how many bytes the node 'dfa->nodes[node_idx]' accepts.
3695 Return the number of the bytes the node accepts.
3696 STR_IDX is the current index of the input string.
3697
3698 This function handles the nodes which can accept one character, or
3699 one collating element like '.', '[a-z]', opposite to the other nodes
3700 can only accept one byte. */
3701
3702# ifdef _LIBC
3703# include <locale/weight.h>
3704# endif
3705
 /* A return value of 0 means "not accepted by this function": every
    rejection path below returns 0, and the OP_PERIOD and bracket paths
    also return 0 when the item at STR_IDX is only one byte long.  Node
    types other than OP_UTF8_PERIOD, OP_PERIOD and COMPLEX_BRACKET fall
    through to the final 'return 0'.  */
3706static int
3707check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
3708 const re_string_t *input, Idx str_idx)
3709{
3710 const re_token_t *node = dfa->nodes + node_idx;
3711 int char_len, elem_len;
3712 Idx i;
3713
 /* OP_UTF8_PERIOD: validate the byte sequence by hand instead of asking
    the string object for the character size.  */
3714 if (BE (node->type == OP_UTF8_PERIOD, 0))
3715 {
3716 unsigned char c = re_string_byte_at (input, str_idx), d;
 /* First bytes below 0xc2 are never accepted here.  */
3717 if (BE (c < 0xc2, 1))
3718 return 0;
3719
 /* At least one byte must follow the first one.  */
3720 if (str_idx + 2 > input->len)
3721 return 0;
3722
3723 d = re_string_byte_at (input, str_idx + 1);
 /* First byte in 0xc2..0xdf: two-byte form, accepted iff the second
    byte lies in 0x80..0xbf.  */
3724 if (c < 0xe0)
3725 return (d < 0x80 || d > 0xbf) ? 0 : 2;
 /* Longer forms: each branch records the total length implied by the
    first byte and rejects a second byte that is too small for the
    lowest first byte of that length (presumably to refuse overlong
    encodings -- NOTE(review): confirm).  */
3726 else if (c < 0xf0)
3727 {
3728 char_len = 3;
3729 if (c == 0xe0 && d < 0xa0)
3730 return 0;
3731 }
3732 else if (c < 0xf8)
3733 {
3734 char_len = 4;
3735 if (c == 0xf0 && d < 0x90)
3736 return 0;
3737 }
3738 else if (c < 0xfc)
3739 {
3740 char_len = 5;
3741 if (c == 0xf8 && d < 0x88)
3742 return 0;
3743 }
3744 else if (c < 0xfe)
3745 {
3746 char_len = 6;
3747 if (c == 0xfc && d < 0x84)
3748 return 0;
3749 }
 /* First bytes 0xfe and 0xff are rejected.  */
3750 else
3751 return 0;
3752
 /* The whole sequence must fit inside the input.  */
3753 if (str_idx + char_len > input->len)
3754 return 0;
3755
 /* Every byte after the first must lie in 0x80..0xbf.  */
3756 for (i = 1; i < char_len; ++i)
3757 {
3758 d = re_string_byte_at (input, str_idx + i);
3759 if (d < 0x80 || d > 0xbf)
3760 return 0;
3761 }
3762 return char_len;
3763 }
3764
3765 char_len = re_string_char_size_at (input, str_idx);
3766 if (node->type == OP_PERIOD)
3767 {
 /* Only characters longer than one byte are accepted on this path.  */
3768 if (char_len <= 1)
3769 return 0;
3770 /* FIXME: I don't think this if is needed, as both '\n'
3771 and '\0' are char_len == 1. */
3772 /* '.' accepts any one character except the following two cases. */
3773 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3774 re_string_byte_at (input, str_idx) == '\n') ||
3775 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3776 re_string_byte_at (input, str_idx) == '\0'))
3777 return 0;
3778 return char_len;
3779 }
3780
 /* Reject when both the collating element and the character are at most
    one byte long, or when the character size is 0.  */
3781 elem_len = re_string_elem_size_at (input, str_idx);
3782 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3783 return 0;
3784
3785 if (node->type == COMPLEX_BRACKET)
3786 {
3787 const re_charset_t *cset = node->opr.mbcset;
3788# ifdef _LIBC
3789 const unsigned char *pin
3790 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3791 Idx j;
3792 uint32_t nrules;
3793# endif /* _LIBC */
 /* MATCH_LEN stays 0 until some member of the set matches.  The wide
    character is fetched only when a test below will actually use it.  */
3794 int match_len = 0;
3795 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3796 ? re_string_wchar_at (input, str_idx) : 0);
3797
3798 /* match with multibyte character? */
3799 for (i = 0; i < cset->nmbchars; ++i)
3800 if (wc == cset->mbchars[i])
3801 {
3802 match_len = char_len;
3803 goto check_node_accept_bytes_match;
3804 }
3805 /* match with character_class? */
3806 for (i = 0; i < cset->nchar_classes; ++i)
3807 {
3808 wctype_t wt = cset->char_classes[i];
3809 if (__iswctype (wc, wt))
3810 {
3811 match_len = char_len;
3812 goto check_node_accept_bytes_match;
3813 }
3814 }
3815
3816# ifdef _LIBC
 /* With collation rules present (nrules != 0) the collating-symbol,
    range and equivalence-class tests use the LC_COLLATE tables;
    otherwise only the plain wide-character range test in the 'else'
    branch below runs.  */
3817 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3818 if (nrules != 0)
3819 {
3820 unsigned int in_collseq = 0;
3821 const int32_t *table, *indirect;
3822 const unsigned char *weights, *extra;
3823 const char *collseqwc;
3824
3825 /* match with collating_symbol? */
3826 if (cset->ncoll_syms)
3827 extra = (const unsigned char *)
3828 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3829 for (i = 0; i < cset->ncoll_syms; ++i)
3830 {
 /* The first byte at COLL_SYM is the element's length; its bytes
    follow from COLL_SYM[1].  */
3831 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3832 /* Compare the length of input collating element and
3833 the length of current collating element. */
3834 if (*coll_sym != elem_len)
3835 continue;
3836 /* Compare each byte. */
3837 for (j = 0; j < *coll_sym; j++)
3838 if (pin[j] != coll_sym[1 + j])
3839 break;
3840 if (j == *coll_sym)
3841 {
3842 /* Match if every byte is equal. */
3843 match_len = j;
3844 goto check_node_accept_bytes_match;
3845 }
3846 }
3847
 /* Compute the input's collation sequence value once: from the wide
    character when the element is no longer than the character,
    otherwise from the element's bytes.  */
3848 if (cset->nranges)
3849 {
3850 if (elem_len <= char_len)
3851 {
3852 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3853 in_collseq = __collseq_table_lookup (collseqwc, wc);
3854 }
3855 else
3856 in_collseq = find_collation_sequence_value (pin, elem_len);
3857 }
3858 /* match with range expression? */
3859 /* FIXME: Implement rational ranges here, too. */
3860 for (i = 0; i < cset->nranges; ++i)
3861 if (cset->range_starts[i] <= in_collseq
3862 && in_collseq <= cset->range_ends[i])
3863 {
3864 match_len = elem_len;
3865 goto check_node_accept_bytes_match;
3866 }
3867
3868 /* match with equivalence_class? */
3869 if (cset->nequiv_classes)
3870 {
3871 const unsigned char *cp = pin;
3872 table = (const int32_t *)
3873 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3874 weights = (const unsigned char *)
3875 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3876 extra = (const unsigned char *)
3877 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3878 indirect = (const int32_t *)
3879 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
 /* The value from findidx is split into a rule number (bits above
    bit 23) and an index into WEIGHTS (low 24 bits).  */
3880 int32_t idx = findidx (table, indirect, extra, &cp, elem_len);
3881 int32_t rule = idx >> 24;
3882 idx &= 0xffffff;
3883 if (idx > 0)
3884 {
 /* A class matches when its rule, weight length and weight bytes all
    equal those of the input element.  */
3885 size_t weight_len = weights[idx];
3886 for (i = 0; i < cset->nequiv_classes; ++i)
3887 {
3888 int32_t equiv_class_idx = cset->equiv_classes[i];
3889 int32_t equiv_class_rule = equiv_class_idx >> 24;
3890 equiv_class_idx &= 0xffffff;
3891 if (weights[equiv_class_idx] == weight_len
3892 && equiv_class_rule == rule
3893 && memcmp (weights + idx + 1,
3894 weights + equiv_class_idx + 1,
3895 weight_len) == 0)
3896 {
3897 match_len = elem_len;
3898 goto check_node_accept_bytes_match;
3899 }
3900 }
3901 }
3902 }
3903 }
3904 else
3905# endif /* _LIBC */
3906 {
3907 /* match with range expression? */
3908 for (i = 0; i < cset->nranges; ++i)
3909 {
3910 if (cset->range_starts[i] <= wc && wc <= cset->range_ends[i])
3911 {
3912 match_len = char_len;
3913 goto check_node_accept_bytes_match;
3914 }
3915 }
3916 }
 /* Reached both by the gotos above (MATCH_LEN set) and by falling
    through with MATCH_LEN still 0.  For a set flagged non_match the
    outcome is inverted: a hit yields 0, and a miss yields the larger of
    the element and character lengths.  */
3917 check_node_accept_bytes_match:
3918 if (!cset->non_match)
3919 return match_len;
3920 else
3921 {
3922 if (match_len > 0)
3923 return 0;
3924 else
3925 return (elem_len > char_len) ? elem_len : char_len;
3926 }
3927 }
3928 return 0;
3929}
3930
3931# ifdef _LIBC
/* Return the collation sequence value for the MBS_LEN bytes at MBS.
   With no collation rules, a single byte is looked up directly in the
   _NL_COLLATE_COLLSEQMB table and any other length yields UINT_MAX.
   With rules, the _NL_COLLATE_SYMB_EXTRAMB table is scanned for an entry
   whose byte sequence equals MBS; UINT_MAX is returned if none is
   found.  */
3932static unsigned int
3933find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3934{
3935 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3936 if (nrules == 0)
3937 {
3938 if (mbs_len == 1)
3939 {
3940 /* No valid character. Match it as a single byte character. */
3941 const unsigned char *collseq = (const unsigned char *)
3942 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3943 return collseq[mbs[0]];
3944 }
3945 return UINT_MAX;
3946 }
3947 else
3948 {
3949 int32_t idx;
3950 const unsigned char *extra = (const unsigned char *)
3951 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
 /* The table size is taken as the distance from this locale item to
    the next one.  */
3952 int32_t extrasize = (const unsigned char *)
3953 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3954
 /* Walk the variable-length entries; IDX is a byte offset into EXTRA
    and is advanced field by field inside the loop body.  */
3955 for (idx = 0; idx < extrasize;)
3956 {
3957 int mbs_cnt;
3958 bool found = false;
3959 int32_t elem_mbs_len;
3960 /* Skip the name of the collating element. */
3961 idx = idx + extra[idx] + 1;
3962 elem_mbs_len = extra[idx++];
3963 if (mbs_len == elem_mbs_len)
3964 {
3965 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3966 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3967 break;
3968 if (mbs_cnt == elem_mbs_len)
3969 /* Found the entry. */
3970 found = true;
3971 }
3972 /* Skip the byte sequence of the collating element. */
3973 idx += elem_mbs_len;
3974 /* Adjust for the alignment. */
3975 idx = (idx + 3) & ~3;
3976 /* Skip the collation sequence value. */
3977 idx += sizeof (uint32_t);
3978 /* Skip the wide char sequence of the collating element. */
3979 idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
3980 /* If we found the entry, return the sequence value. */
3981 if (found)
3982 return *(uint32_t *) (extra + idx);
3983 /* Skip the collation sequence value. */
3984 idx += sizeof (uint32_t);
3985 }
3986 return UINT_MAX;
3987 }
3988}
3989# endif /* _LIBC */
3990#endif /* RE_ENABLE_I18N */
3991
3992/* Check whether the node accepts the byte which is IDX-th
3993 byte of the INPUT. */
3994
3995static bool
3996check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
3997 Idx idx)
3998{
3999 unsigned char ch;
4000 ch = re_string_byte_at (&mctx->input, idx);
4001 switch (node->type)
4002 {
4003 case CHARACTER:
4004 if (node->opr.c != ch)
4005 return false;
4006 break;
4007
4008 case SIMPLE_BRACKET:
4009 if (!bitset_contain (node->opr.sbcset, ch))
4010 return false;
4011 break;
4012
4013#ifdef RE_ENABLE_I18N
4014 case OP_UTF8_PERIOD:
4015 if (ch >= ASCII_CHARS)
4016 return false;
4017 FALLTHROUGH;
4018#endif
4019 case OP_PERIOD:
4020 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4021 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4022 return false;
4023 break;
4024
4025 default:
4026 return false;
4027 }
4028
4029 if (node->constraint)
4030 {
4031 /* The node has constraints. Check whether the current context
4032 satisfies the constraints. */
4033 unsigned int context = re_string_context_at (&mctx->input, idx,
4034 mctx->eflags);
4035 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4036 return false;
4037 }
4038
4039 return true;
4040}
4041
4042/* Extend the buffers, if the buffers have run out. */
4043
4044static reg_errcode_t
4045__attribute_warn_unused_result__
4046extend_buffers (re_match_context_t *mctx, int min_len)
4047{
4048 reg_errcode_t ret;
4049 re_string_t *pstr = &mctx->input;
4050
4051 /* Avoid overflow. */
4052 if (BE (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) / 2
4053 <= pstr->bufs_len, 0))
4054 return REG_ESPACE;
4055
4056 /* Double the lengths of the buffers, but allocate at least MIN_LEN. */
4057 ret = re_string_realloc_buffers (pstr,
4058 MAX (min_len,
4059 MIN (pstr->len, pstr->bufs_len * 2)));
4060 if (BE (ret != REG_NOERROR, 0))
4061 return ret;
4062
4063 if (mctx->state_log != NULL)
4064 {
4065 /* And double the length of state_log. */
4066 /* XXX We have no indication of the size of this buffer. If this
4067 allocation fail we have no indication that the state_log array
4068 does not have the right size. */
4069 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4070 pstr->bufs_len + 1);
4071 if (BE (new_array == NULL, 0))
4072 return REG_ESPACE;
4073 mctx->state_log = new_array;
4074 }
4075
4076 /* Then reconstruct the buffers. */
4077 if (pstr->icase)
4078 {
4079#ifdef RE_ENABLE_I18N
4080 if (pstr->mb_cur_max > 1)
4081 {
4082 ret = build_wcs_upper_buffer (pstr);
4083 if (BE (ret != REG_NOERROR, 0))
4084 return ret;
4085 }
4086 else
4087#endif /* RE_ENABLE_I18N */
4088 build_upper_buffer (pstr);
4089 }
4090 else
4091 {
4092#ifdef RE_ENABLE_I18N
4093 if (pstr->mb_cur_max > 1)
4094 build_wcs_buffer (pstr);
4095 else
4096#endif /* RE_ENABLE_I18N */
4097 {
4098 if (pstr->trans != NULL)
4099 re_string_translate_buffer (pstr);
4100 }
4101 }
4102 return REG_NOERROR;
4103}
4104
4105
4106/* Functions for matching context. */
4107
4108/* Initialize MCTX. */
4109
4110static reg_errcode_t
4111__attribute_warn_unused_result__
4112match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4113{
4114 mctx->eflags = eflags;
4115 mctx->match_last = -1;
4116 if (n > 0)
4117 {
4118 /* Avoid overflow. */
4119 size_t max_object_size =
4120 MAX (sizeof (struct re_backref_cache_entry),
4121 sizeof (re_sub_match_top_t *));
4122 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < n, 0))
4123 return REG_ESPACE;
4124
4125 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4126 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4127 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4128 return REG_ESPACE;
4129 }
4130 /* Already zero-ed by the caller.
4131 else
4132 mctx->bkref_ents = NULL;
4133 mctx->nbkref_ents = 0;
4134 mctx->nsub_tops = 0; */
4135 mctx->abkref_ents = n;
4136 mctx->max_mb_elem_len = 1;
4137 mctx->asub_tops = n;
4138 return REG_NOERROR;
4139}
4140
4141/* Clean the entries which depend on the current input in MCTX.
4142 This function must be invoked when the matcher changes the start index
4143 of the input, or changes the input string. */
4144
4145static void
4146match_ctx_clean (re_match_context_t *mctx)
4147{
4148 Idx st_idx;
4149 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4150 {
4151 Idx sl_idx;
4152 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4153 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4154 {
4155 re_sub_match_last_t *last = top->lasts[sl_idx];
4156 re_free (last->path.array);
4157 re_free (last);
4158 }
4159 re_free (top->lasts);
4160 if (top->path)
4161 {
4162 re_free (top->path->array);
4163 re_free (top->path);
4164 }
4165 re_free (top);
4166 }
4167
4168 mctx->nsub_tops = 0;
4169 mctx->nbkref_ents = 0;
4170}
4171
4172/* Free all the memory associated with MCTX. */
4173
4174static void
4175match_ctx_free (re_match_context_t *mctx)
4176{
4177 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4178 match_ctx_clean (mctx);
4179 re_free (mctx->sub_tops);
4180 re_free (mctx->bkref_ents);
4181}
4182
4183/* Add a new backreference entry to MCTX.
4184 Note that we assume that caller never call this function with duplicate
4185 entry, and call with STR_IDX which isn't smaller than any existing entry.
4186*/
4187
4188static reg_errcode_t
4189__attribute_warn_unused_result__
4190match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from,
4191 Idx to)
4192{
4193 if (mctx->nbkref_ents >= mctx->abkref_ents)
4194 {
4195 struct re_backref_cache_entry* new_entry;
4196 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4197 mctx->abkref_ents * 2);
4198 if (BE (new_entry == NULL, 0))
4199 {
4200 re_free (mctx->bkref_ents);
4201 return REG_ESPACE;
4202 }
4203 mctx->bkref_ents = new_entry;
4204 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4205 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4206 mctx->abkref_ents *= 2;
4207 }
4208 if (mctx->nbkref_ents > 0
4209 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4210 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4211
4212 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4213 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4214 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4215 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4216
4217 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4218 If bit N is clear, means that this entry won't epsilon-transition to
4219 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4220 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4221 such node.
4222
4223 A backreference does not epsilon-transition unless it is empty, so set
4224 to all zeros if FROM != TO. */
4225 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4226 = (from == to ? -1 : 0);
4227
4228 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4229 if (mctx->max_mb_elem_len < to - from)
4230 mctx->max_mb_elem_len = to - from;
4231 return REG_NOERROR;
4232}
4233
4234/* Return the first entry with the same str_idx, or -1 if none is
4235 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4236
4237static Idx
4238search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4239{
4240 Idx left, right, mid, last;
4241 last = right = mctx->nbkref_ents;
4242 for (left = 0; left < right;)
4243 {
4244 mid = (left + right) / 2;
4245 if (mctx->bkref_ents[mid].str_idx < str_idx)
4246 left = mid + 1;
4247 else
4248 right = mid;
4249 }
4250 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4251 return left;
4252 else
4253 return -1;
4254}
4255
4256/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4257 at STR_IDX. */
4258
4259static reg_errcode_t
4260__attribute_warn_unused_result__
4261match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4262{
4263#ifdef DEBUG
4264 assert (mctx->sub_tops != NULL);
4265 assert (mctx->asub_tops > 0);
4266#endif
4267 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4268 {
4269 Idx new_asub_tops = mctx->asub_tops * 2;
4270 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4271 re_sub_match_top_t *,
4272 new_asub_tops);
4273 if (BE (new_array == NULL, 0))
4274 return REG_ESPACE;
4275 mctx->sub_tops = new_array;
4276 mctx->asub_tops = new_asub_tops;
4277 }
4278 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4279 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4280 return REG_ESPACE;
4281 mctx->sub_tops[mctx->nsub_tops]->node = node;
4282 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4283 return REG_NOERROR;
4284}
4285
4286/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4287 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4288
4289static re_sub_match_last_t *
4290match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4291{
4292 re_sub_match_last_t *new_entry;
4293 if (BE (subtop->nlasts == subtop->alasts, 0))
4294 {
4295 Idx new_alasts = 2 * subtop->alasts + 1;
4296 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4297 re_sub_match_last_t *,
4298 new_alasts);
4299 if (BE (new_array == NULL, 0))
4300 return NULL;
4301 subtop->lasts = new_array;
4302 subtop->alasts = new_alasts;
4303 }
4304 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4305 if (BE (new_entry != NULL, 1))
4306 {
4307 subtop->lasts[subtop->nlasts] = new_entry;
4308 new_entry->node = node;
4309 new_entry->str_idx = str_idx;
4310 ++subtop->nlasts;
4311 }
4312 return new_entry;
4313}
4314
4315static void
4316sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4317 re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx)
4318{
4319 sctx->sifted_states = sifted_sts;
4320 sctx->limited_states = limited_sts;
4321 sctx->last_node = last_node;
4322 sctx->last_str_idx = last_str_idx;
4323 re_node_set_init_empty (&sctx->limits);
4324}
diff --git a/m4/builtin-expect.m4 b/m4/builtin-expect.m4
new file mode 100644
index 00000000000..a1eaf965b45
--- /dev/null
+++ b/m4/builtin-expect.m4
@@ -0,0 +1,49 @@
1dnl Check for __builtin_expect.
2
3dnl Copyright 2016-2018 Free Software Foundation, Inc.
4dnl This file is free software; the Free Software Foundation
5dnl gives unlimited permission to copy and/or distribute it,
6dnl with or without modifications, as long as this notice is preserved.
7
8dnl Written by Paul Eggert.
9
dnl gl___BUILTIN_EXPECT
dnl -------------------
dnl Caches in gl_cv___builtin_expect whether a program that calls
dnl __builtin_expect links: "yes" when it links with no header,
dnl "in <builtins.h>" when it links only after including <builtins.h>,
dnl and "no" otherwise.  HAVE___BUILTIN_EXPECT is defined to 1 or 2 for
dnl the first two outcomes and left undefined for the third, in which
dnl case the verbatim config.h text below defines __builtin_expect(e, c)
dnl as plain (e).
10AC_DEFUN([gl___BUILTIN_EXPECT],
11[
12 AC_CACHE_CHECK([for __builtin_expect],
13 [gl_cv___builtin_expect],
14 [AC_LINK_IFELSE(
15 [AC_LANG_SOURCE([[
16 int
17 main (int argc, char **argv)
18 {
19 argc = __builtin_expect (argc, 100);
20 return argv[argc != 100][0];
21 }]])],
22 [gl_cv___builtin_expect=yes],
23 [AC_LINK_IFELSE(
24 [AC_LANG_SOURCE([[
25 #include <builtins.h>
26 int
27 main (int argc, char **argv)
28 {
29 argc = __builtin_expect (argc, 100);
30 return argv[argc != 100][0];
31 }]])],
32 [gl_cv___builtin_expect="in <builtins.h>"],
33 [gl_cv___builtin_expect=no])])])
34 if test "$gl_cv___builtin_expect" = yes; then
35 AC_DEFINE([HAVE___BUILTIN_EXPECT], [1])
36 elif test "$gl_cv___builtin_expect" = "in <builtins.h>"; then
37 AC_DEFINE([HAVE___BUILTIN_EXPECT], [2])
38 fi
39 AH_VERBATIM([HAVE___BUILTIN_EXPECT],
40 [/* Define to 1 if the compiler supports __builtin_expect,
41 and to 2 if <builtins.h> does. */
42#undef HAVE___BUILTIN_EXPECT
43#ifndef HAVE___BUILTIN_EXPECT
44# define __builtin_expect(e, c) (e)
45#elif HAVE___BUILTIN_EXPECT == 2
46# include <builtins.h>
47#endif
48 ])
49])
diff --git a/m4/eealloc.m4 b/m4/eealloc.m4
new file mode 100644
index 00000000000..a5a4e267d8e
--- /dev/null
+++ b/m4/eealloc.m4
@@ -0,0 +1,31 @@
1# eealloc.m4 serial 3
2dnl Copyright (C) 2003, 2009-2018 Free Software Foundation, Inc.
3dnl This file is free software; the Free Software Foundation
4dnl gives unlimited permission to copy and/or distribute it,
5dnl with or without modifications, as long as this notice is preserved.
6
dnl gl_EEALLOC
dnl ----------
dnl Umbrella macro: requires both gl_EEMALLOC and gl_EEREALLOC and does
dnl nothing else itself.
7AC_DEFUN([gl_EEALLOC],
8[
9 AC_REQUIRE([gl_EEMALLOC])
10 AC_REQUIRE([gl_EEREALLOC])
11])
12
dnl gl_EEMALLOC
dnl -----------
dnl Sets gl_cv_func_malloc_0_nonnull to 1 or 0 according to which branch
dnl of _AC_FUNC_MALLOC_IF is taken, then defines MALLOC_0_IS_NONNULL to
dnl that value, so the symbol is always defined, as either 1 or 0.
13AC_DEFUN([gl_EEMALLOC],
14[
15 _AC_FUNC_MALLOC_IF(
16 [gl_cv_func_malloc_0_nonnull=1],
17 [gl_cv_func_malloc_0_nonnull=0])
18 AC_DEFINE_UNQUOTED([MALLOC_0_IS_NONNULL], [$gl_cv_func_malloc_0_nonnull],
19 [If malloc(0) is != NULL, define this to 1. Otherwise define this
20 to 0.])
21])
22
dnl gl_EEREALLOC
dnl ------------
dnl Sets gl_cv_func_realloc_0_nonnull to 1 or 0 according to which branch
dnl of _AC_FUNC_REALLOC_IF is taken, then defines REALLOC_0_IS_NONNULL to
dnl that value, so the symbol is always defined, as either 1 or 0.
23AC_DEFUN([gl_EEREALLOC],
24[
25 _AC_FUNC_REALLOC_IF(
26 [gl_cv_func_realloc_0_nonnull=1],
27 [gl_cv_func_realloc_0_nonnull=0])
28 AC_DEFINE_UNQUOTED([REALLOC_0_IS_NONNULL], [$gl_cv_func_realloc_0_nonnull],
29 [If realloc(NULL,0) is != NULL, define this to 1. Otherwise define this
30 to 0.])
31])
diff --git a/m4/glibc21.m4 b/m4/glibc21.m4
new file mode 100644
index 00000000000..126aa1a959e
--- /dev/null
+++ b/m4/glibc21.m4
@@ -0,0 +1,34 @@
1# glibc21.m4 serial 5
2dnl Copyright (C) 2000-2002, 2004, 2008, 2010-2018 Free Software Foundation,
3dnl Inc.
4dnl This file is free software; the Free Software Foundation
5dnl gives unlimited permission to copy and/or distribute it,
6dnl with or without modifications, as long as this notice is preserved.
7
8# Test for the GNU C Library, version 2.1 or newer, or uClibc.
9# From Bruno Haible.
10
dnl gl_GLIBC21
dnl ----------
dnl Preprocesses a snippet that includes <features.h> and emits the word
dnl "Lucky" when __GNU_LIBRARY__ is defined with a glibc version of 2.1
dnl or later, or when __UCLIBC__ is defined.  The cached result
dnl ac_cv_gnu_library_2_1 is "yes" if that word appears in the
dnl preprocessor output and "no" otherwise; it is copied into the
dnl substituted variable GLIBC21.
11AC_DEFUN([gl_GLIBC21],
12 [
13 AC_CACHE_CHECK([whether we are using the GNU C Library >= 2.1 or uClibc],
14 [ac_cv_gnu_library_2_1],
15 [AC_EGREP_CPP([Lucky],
16 [
17#include <features.h>
18#ifdef __GNU_LIBRARY__
19 #if (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 1) || (__GLIBC__ > 2)
20 Lucky GNU user
21 #endif
22#endif
23#ifdef __UCLIBC__
24 Lucky user
25#endif
26 ],
27 [ac_cv_gnu_library_2_1=yes],
28 [ac_cv_gnu_library_2_1=no])
29 ]
30 )
31 AC_SUBST([GLIBC21])
32 GLIBC21="$ac_cv_gnu_library_2_1"
33 ]
34)
diff --git a/m4/gnulib-comp.m4 b/m4/gnulib-comp.m4
index 494c77c7c4e..61aabaa3427 100644
--- a/m4/gnulib-comp.m4
+++ b/m4/gnulib-comp.m4
@@ -48,6 +48,7 @@ AC_DEFUN([gl_EARLY],
48 # Code from module allocator: 48 # Code from module allocator:
49 # Code from module at-internal: 49 # Code from module at-internal:
50 # Code from module binary-io: 50 # Code from module binary-io:
51 # Code from module builtin-expect:
51 # Code from module byteswap: 52 # Code from module byteswap:
52 # Code from module c-ctype: 53 # Code from module c-ctype:
53 # Code from module c-strcase: 54 # Code from module c-strcase:
@@ -129,6 +130,7 @@ AC_DEFUN([gl_EARLY],
129 # Code from module qcopy-acl: 130 # Code from module qcopy-acl:
130 # Code from module readlink: 131 # Code from module readlink:
131 # Code from module readlinkat: 132 # Code from module readlinkat:
133 # Code from module regex:
132 # Code from module root-uid: 134 # Code from module root-uid:
133 # Code from module sig2str: 135 # Code from module sig2str:
134 # Code from module signal-h: 136 # Code from module signal-h:
@@ -358,6 +360,11 @@ AC_DEFUN([gl_INIT],
358 AC_LIBOBJ([readlinkat]) 360 AC_LIBOBJ([readlinkat])
359 fi 361 fi
360 gl_UNISTD_MODULE_INDICATOR([readlinkat]) 362 gl_UNISTD_MODULE_INDICATOR([readlinkat])
363 gl_REGEX
364 if test $ac_use_included_regex = yes; then
365 AC_LIBOBJ([regex])
366 gl_PREREQ_REGEX
367 fi
361 gl_FUNC_SIG2STR 368 gl_FUNC_SIG2STR
362 if test $ac_cv_func_sig2str = no; then 369 if test $ac_cv_func_sig2str = no; then
363 AC_LIBOBJ([sig2str]) 370 AC_LIBOBJ([sig2str])
@@ -425,6 +432,7 @@ AC_DEFUN([gl_INIT],
425 gl_UTIMENS 432 gl_UTIMENS
426 AC_C_VARARRAYS 433 AC_C_VARARRAYS
427 gl_gnulib_enabled_260941c0e5dc67ec9e87d1fb321c300b=false 434 gl_gnulib_enabled_260941c0e5dc67ec9e87d1fb321c300b=false
435 gl_gnulib_enabled_37f71b604aa9c54446783d80f42fe547=false
428 gl_gnulib_enabled_cloexec=false 436 gl_gnulib_enabled_cloexec=false
429 gl_gnulib_enabled_dirfd=false 437 gl_gnulib_enabled_dirfd=false
430 gl_gnulib_enabled_dosname=false 438 gl_gnulib_enabled_dosname=false
@@ -448,6 +456,13 @@ AC_DEFUN([gl_INIT],
448 func_gl_gnulib_m4code_open 456 func_gl_gnulib_m4code_open
449 fi 457 fi
450 } 458 }
459 func_gl_gnulib_m4code_37f71b604aa9c54446783d80f42fe547 ()
460 {
461 if ! $gl_gnulib_enabled_37f71b604aa9c54446783d80f42fe547; then
462 gl___BUILTIN_EXPECT
463 gl_gnulib_enabled_37f71b604aa9c54446783d80f42fe547=true
464 fi
465 }
451 func_gl_gnulib_m4code_cloexec () 466 func_gl_gnulib_m4code_cloexec ()
452 { 467 {
453 if ! $gl_gnulib_enabled_cloexec; then 468 if ! $gl_gnulib_enabled_cloexec; then
@@ -651,6 +666,9 @@ AC_DEFUN([gl_INIT],
651 if test $HAVE_READLINKAT = 0; then 666 if test $HAVE_READLINKAT = 0; then
652 func_gl_gnulib_m4code_03e0aaad4cb89ca757653bd367a6ccb7 667 func_gl_gnulib_m4code_03e0aaad4cb89ca757653bd367a6ccb7
653 fi 668 fi
669 if test $ac_use_included_regex = yes; then
670 func_gl_gnulib_m4code_37f71b604aa9c54446783d80f42fe547
671 fi
654 if { test $HAVE_DECL_STRTOIMAX = 0 || test $REPLACE_STRTOIMAX = 1; } && test $ac_cv_type_long_long_int = yes; then 672 if { test $HAVE_DECL_STRTOIMAX = 0 || test $REPLACE_STRTOIMAX = 1; } && test $ac_cv_type_long_long_int = yes; then
655 func_gl_gnulib_m4code_strtoll 673 func_gl_gnulib_m4code_strtoll
656 fi 674 fi
@@ -659,6 +677,7 @@ AC_DEFUN([gl_INIT],
659 fi 677 fi
660 m4_pattern_allow([^gl_GNULIB_ENABLED_]) 678 m4_pattern_allow([^gl_GNULIB_ENABLED_])
661 AM_CONDITIONAL([gl_GNULIB_ENABLED_260941c0e5dc67ec9e87d1fb321c300b], [$gl_gnulib_enabled_260941c0e5dc67ec9e87d1fb321c300b]) 679 AM_CONDITIONAL([gl_GNULIB_ENABLED_260941c0e5dc67ec9e87d1fb321c300b], [$gl_gnulib_enabled_260941c0e5dc67ec9e87d1fb321c300b])
680 AM_CONDITIONAL([gl_GNULIB_ENABLED_37f71b604aa9c54446783d80f42fe547], [$gl_gnulib_enabled_37f71b604aa9c54446783d80f42fe547])
662 AM_CONDITIONAL([gl_GNULIB_ENABLED_cloexec], [$gl_gnulib_enabled_cloexec]) 681 AM_CONDITIONAL([gl_GNULIB_ENABLED_cloexec], [$gl_gnulib_enabled_cloexec])
663 AM_CONDITIONAL([gl_GNULIB_ENABLED_dirfd], [$gl_gnulib_enabled_dirfd]) 682 AM_CONDITIONAL([gl_GNULIB_ENABLED_dirfd], [$gl_gnulib_enabled_dirfd])
664 AM_CONDITIONAL([gl_GNULIB_ENABLED_dosname], [$gl_gnulib_enabled_dosname]) 683 AM_CONDITIONAL([gl_GNULIB_ENABLED_dosname], [$gl_gnulib_enabled_dosname])
@@ -924,6 +943,12 @@ AC_DEFUN([gl_FILE_LIST], [
924 lib/qcopy-acl.c 943 lib/qcopy-acl.c
925 lib/readlink.c 944 lib/readlink.c
926 lib/readlinkat.c 945 lib/readlinkat.c
946 lib/regcomp.c
947 lib/regex.c
948 lib/regex.h
949 lib/regex_internal.c
950 lib/regex_internal.h
951 lib/regexec.c
927 lib/root-uid.h 952 lib/root-uid.h
928 lib/set-permissions.c 953 lib/set-permissions.c
929 lib/sha1.c 954 lib/sha1.c
@@ -980,6 +1005,7 @@ AC_DEFUN([gl_FILE_LIST], [
980 m4/absolute-header.m4 1005 m4/absolute-header.m4
981 m4/acl.m4 1006 m4/acl.m4
982 m4/alloca.m4 1007 m4/alloca.m4
1008 m4/builtin-expect.m4
983 m4/byteswap.m4 1009 m4/byteswap.m4
984 m4/c-strtod.m4 1010 m4/c-strtod.m4
985 m4/clock_time.m4 1011 m4/clock_time.m4
@@ -991,6 +1017,7 @@ AC_DEFUN([gl_FILE_LIST], [
991 m4/dirent_h.m4 1017 m4/dirent_h.m4
992 m4/dirfd.m4 1018 m4/dirfd.m4
993 m4/dup2.m4 1019 m4/dup2.m4
1020 m4/eealloc.m4
994 m4/environ.m4 1021 m4/environ.m4
995 m4/errno_h.m4 1022 m4/errno_h.m4
996 m4/euidaccess.m4 1023 m4/euidaccess.m4
@@ -1018,6 +1045,7 @@ AC_DEFUN([gl_FILE_LIST], [
1018 m4/gettime.m4 1045 m4/gettime.m4
1019 m4/gettimeofday.m4 1046 m4/gettimeofday.m4
1020 m4/gl-openssl.m4 1047 m4/gl-openssl.m4
1048 m4/glibc21.m4
1021 m4/gnulib-common.m4 1049 m4/gnulib-common.m4
1022 m4/group-member.m4 1050 m4/group-member.m4
1023 m4/ieee754-h.m4 1051 m4/ieee754-h.m4
@@ -1030,6 +1058,7 @@ AC_DEFUN([gl_FILE_LIST], [
1030 m4/lstat.m4 1058 m4/lstat.m4
1031 m4/manywarnings-c++.m4 1059 m4/manywarnings-c++.m4
1032 m4/manywarnings.m4 1060 m4/manywarnings.m4
1061 m4/mbstate_t.m4
1033 m4/md5.m4 1062 m4/md5.m4
1034 m4/memrchr.m4 1063 m4/memrchr.m4
1035 m4/minmax.m4 1064 m4/minmax.m4
@@ -1048,6 +1077,7 @@ AC_DEFUN([gl_FILE_LIST], [
1048 m4/putenv.m4 1077 m4/putenv.m4
1049 m4/readlink.m4 1078 m4/readlink.m4
1050 m4/readlinkat.m4 1079 m4/readlinkat.m4
1080 m4/regex.m4
1051 m4/sha1.m4 1081 m4/sha1.m4
1052 m4/sha256.m4 1082 m4/sha256.m4
1053 m4/sha512.m4 1083 m4/sha512.m4
diff --git a/m4/mbstate_t.m4 b/m4/mbstate_t.m4
new file mode 100644
index 00000000000..004aa0d17c8
--- /dev/null
+++ b/m4/mbstate_t.m4
@@ -0,0 +1,41 @@
1# mbstate_t.m4 serial 13
2dnl Copyright (C) 2000-2002, 2008-2018 Free Software Foundation, Inc.
3dnl This file is free software; the Free Software Foundation
4dnl gives unlimited permission to copy and/or distribute it,
5dnl with or without modifications, as long as this notice is preserved.
6
7# From Paul Eggert.
8
9# BeOS 5 has <wchar.h> but does not define mbstate_t,
10# so you can't declare an object of that type.
11# Check for this incompatibility with Standard C.
12
13# AC_TYPE_MBSTATE_T
14# -----------------
dnl Compiles a program that declares an object of type mbstate_t after
dnl including the default headers plus <stddef.h>, <stdio.h>, <time.h>
dnl and <wchar.h>.  The result is cached in ac_cv_type_mbstate_t.  On
dnl success HAVE_MBSTATE_T is defined to 1; on failure mbstate_t itself
dnl is defined as a macro expanding to int.
15AC_DEFUN([AC_TYPE_MBSTATE_T],
16[
17 AC_REQUIRE([AC_USE_SYSTEM_EXTENSIONS]) dnl for HP-UX 11.11
18
19 AC_CACHE_CHECK([for mbstate_t], [ac_cv_type_mbstate_t],
20 [AC_COMPILE_IFELSE(
21 [AC_LANG_PROGRAM(
22 [AC_INCLUDES_DEFAULT[
23/* Tru64 with Desktop Toolkit C has a bug: <stdio.h> must be included before
24 <wchar.h>.
25 BSD/OS 4.0.1 has a bug: <stddef.h>, <stdio.h> and <time.h> must be
26 included before <wchar.h>. */
27#include <stddef.h>
28#include <stdio.h>
29#include <time.h>
30#include <wchar.h>]],
31 [[mbstate_t x; return sizeof x;]])],
32 [ac_cv_type_mbstate_t=yes],
33 [ac_cv_type_mbstate_t=no])])
34 if test $ac_cv_type_mbstate_t = yes; then
35 AC_DEFINE([HAVE_MBSTATE_T], [1],
36 [Define to 1 if <wchar.h> declares mbstate_t.])
37 else
38 AC_DEFINE([mbstate_t], [int],
39 [Define to a type if <wchar.h> does not define.])
40 fi
41])
diff --git a/m4/regex.m4 b/m4/regex.m4
new file mode 100644
index 00000000000..055d71b5aaa
--- /dev/null
+++ b/m4/regex.m4
@@ -0,0 +1,300 @@
1# serial 67
2
3# Copyright (C) 1996-2001, 2003-2018 Free Software Foundation, Inc.
4#
5# This file is free software; the Free Software Foundation
6# gives unlimited permission to copy and/or distribute it,
7# with or without modifications, as long as this notice is preserved.
8
9dnl Initially derived from code in GNU grep.
10dnl Mostly written by Jim Meyering.
11
12AC_PREREQ([2.50])
13
14AC_DEFUN([gl_REGEX],
15[
16 AC_REQUIRE([AC_CANONICAL_HOST]) dnl for cross-compiles
17 AC_ARG_WITH([included-regex],
18 [AS_HELP_STRING([--without-included-regex],
19 [don't compile regex; this is the default on systems
20 with recent-enough versions of the GNU C Library
21 (use with caution on other systems).])])
22
23 case $with_included_regex in #(
24 yes|no) ac_use_included_regex=$with_included_regex
25 ;;
26 '')
27 # If the system regex support is good enough that it passes the
28 # following run test, then default to *not* using the included regex.c.
29 # If cross compiling, assume the test would fail and use the included
30 # regex.c.
31 AC_CHECK_DECLS_ONCE([alarm])
32 AC_CHECK_HEADERS_ONCE([malloc.h])
33 AC_CACHE_CHECK([for working re_compile_pattern],
34 [gl_cv_func_re_compile_pattern_working],
35 [AC_RUN_IFELSE(
36 [AC_LANG_PROGRAM(
37 [[#include <regex.h>
38
39 #include <locale.h>
40 #include <limits.h>
41 #include <string.h>
42
43 #if defined M_CHECK_ACTION || HAVE_DECL_ALARM
44 # include <signal.h>
45 # include <unistd.h>
46 #endif
47
48 #if HAVE_MALLOC_H
49 # include <malloc.h>
50 #endif
51
52 #ifdef M_CHECK_ACTION
53 /* Exit with distinguishable exit code. */
54 static void sigabrt_no_core (int sig) { raise (SIGTERM); }
55 #endif
56 ]],
57 [[int result = 0;
58 static struct re_pattern_buffer regex;
59 unsigned char folded_chars[UCHAR_MAX + 1];
60 int i;
61 const char *s;
62 struct re_registers regs;
63
64 /* Some builds of glibc go into an infinite loop on this
65 test. Use alarm to force death, and mallopt to avoid
66 malloc recursion in diagnosing the corrupted heap. */
67#if HAVE_DECL_ALARM
68 signal (SIGALRM, SIG_DFL);
69 alarm (2);
70#endif
71#ifdef M_CHECK_ACTION
72 signal (SIGABRT, sigabrt_no_core);
73 mallopt (M_CHECK_ACTION, 2);
74#endif
75
76 if (setlocale (LC_ALL, "en_US.UTF-8"))
77 {
78 {
79 /* https://sourceware.org/ml/libc-hacker/2006-09/msg00008.html
80 This test needs valgrind to catch the bug on Debian
81 GNU/Linux 3.1 x86, but it might catch the bug better
82 on other platforms and it shouldn't hurt to try the
83 test here. */
84 static char const pat[] = "insert into";
85 static char const data[] =
86 "\xFF\0\x12\xA2\xAA\xC4\xB1,K\x12\xC4\xB1*\xACK";
87 re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE
88 | RE_ICASE);
89 memset (&regex, 0, sizeof regex);
90 s = re_compile_pattern (pat, sizeof pat - 1, &regex);
91 if (s)
92 result |= 1;
93 else if (re_search (&regex, data, sizeof data - 1,
94 0, sizeof data - 1, &regs)
95 != -1)
96 result |= 1;
97 regfree (&regex);
98 }
99
100 {
101 /* This test is from glibc bug 15078.
102 The test case is from Andreas Schwab in
103 <https://sourceware.org/ml/libc-alpha/2013-01/msg00967.html>.
104 */
105 static char const pat[] = "[^x]x";
106 static char const data[] =
107 /* <U1000><U103B><U103D><U1014><U103A><U102F><U1015><U103A> */
108 "\xe1\x80\x80"
109 "\xe1\x80\xbb"
110 "\xe1\x80\xbd"
111 "\xe1\x80\x94"
112 "\xe1\x80\xba"
113 "\xe1\x80\xaf"
114 "\xe1\x80\x95"
115 "\xe1\x80\xba"
116 "x";
117 re_set_syntax (0);
118 memset (&regex, 0, sizeof regex);
119 s = re_compile_pattern (pat, sizeof pat - 1, &regex);
120 if (s)
121 result |= 1;
122 else
123 {
124 i = re_search (&regex, data, sizeof data - 1,
125 0, sizeof data - 1, 0);
126 if (i != 0 && i != 21)
127 result |= 1;
128 }
129 regfree (&regex);
130 }
131
132 if (! setlocale (LC_ALL, "C"))
133 return 1;
134 }
135
136 /* This test is from glibc bug 3957, reported by Andrew Mackey. */
137 re_set_syntax (RE_SYNTAX_EGREP | RE_HAT_LISTS_NOT_NEWLINE);
138 memset (&regex, 0, sizeof regex);
139 s = re_compile_pattern ("a[^x]b", 6, &regex);
140 if (s)
141 result |= 2;
142 /* This should fail, but succeeds for glibc-2.5. */
143 else if (re_search (&regex, "a\nb", 3, 0, 3, &regs) != -1)
144 result |= 2;
145
146 /* This regular expression is from Spencer ere test number 75
147 in grep-2.3. */
148 re_set_syntax (RE_SYNTAX_POSIX_EGREP);
149 memset (&regex, 0, sizeof regex);
150 for (i = 0; i <= UCHAR_MAX; i++)
151 folded_chars[i] = i;
152 regex.translate = folded_chars;
153 s = re_compile_pattern ("a[[:@:>@:]]b\n", 11, &regex);
154 /* This should fail with _Invalid character class name_ error. */
155 if (!s)
156 result |= 4;
157
158 /* Ensure that [b-a] is diagnosed as invalid, when
159 using RE_NO_EMPTY_RANGES. */
160 re_set_syntax (RE_SYNTAX_POSIX_EGREP | RE_NO_EMPTY_RANGES);
161 memset (&regex, 0, sizeof regex);
162 s = re_compile_pattern ("a[b-a]", 6, &regex);
163 if (s == 0)
164 result |= 8;
165
166 /* This should succeed, but does not for glibc-2.1.3. */
167 memset (&regex, 0, sizeof regex);
168 s = re_compile_pattern ("{1", 2, &regex);
169 if (s)
170 result |= 8;
171
172 /* The following example is derived from a problem report
173 against gawk from Jorge Stolfi <stolfi@ic.unicamp.br>. */
174 memset (&regex, 0, sizeof regex);
175 s = re_compile_pattern ("[an\371]*n", 7, &regex);
176 if (s)
177 result |= 8;
178 /* This should match, but does not for glibc-2.2.1. */
179 else if (re_match (&regex, "an", 2, 0, &regs) != 2)
180 result |= 8;
181
182 memset (&regex, 0, sizeof regex);
183 s = re_compile_pattern ("x", 1, &regex);
184 if (s)
185 result |= 8;
186 /* glibc-2.2.93 does not work with a negative RANGE argument. */
187 else if (re_search (&regex, "wxy", 3, 2, -2, &regs) != 1)
188 result |= 8;
189
190 /* The version of regex.c in older versions of gnulib
191 ignored RE_ICASE. Detect that problem too. */
192 re_set_syntax (RE_SYNTAX_EMACS | RE_ICASE);
193 memset (&regex, 0, sizeof regex);
194 s = re_compile_pattern ("x", 1, &regex);
195 if (s)
196 result |= 16;
197 else if (re_search (&regex, "WXY", 3, 0, 3, &regs) < 0)
198 result |= 16;
199
200 /* Catch a bug reported by Vin Shelton in
201 https://lists.gnu.org/r/bug-coreutils/2007-06/msg00089.html
202 */
203 re_set_syntax (RE_SYNTAX_POSIX_BASIC
204 & ~RE_CONTEXT_INVALID_DUP
205 & ~RE_NO_EMPTY_RANGES);
206 memset (&regex, 0, sizeof regex);
207 s = re_compile_pattern ("[[:alnum:]_-]\\\\+$", 16, &regex);
208 if (s)
209 result |= 32;
210
211 /* REG_STARTEND was added to glibc on 2004-01-15.
212 Reject older versions. */
213 if (! REG_STARTEND)
214 result |= 64;
215
216#if 0
217 /* It would be nice to reject hosts whose regoff_t values are too
218 narrow (including glibc on hosts with 64-bit ptrdiff_t and
219 32-bit int), but we should wait until glibc implements this
220 feature. Otherwise, support for equivalence classes and
221 multibyte collation symbols would always be broken except
222 when compiling --without-included-regex. */
223 if (sizeof (regoff_t) < sizeof (ptrdiff_t)
224 || sizeof (regoff_t) < sizeof (ssize_t))
225 result |= 64;
226#endif
227
228 return result;
229 ]])],
230 [gl_cv_func_re_compile_pattern_working=yes],
231 [gl_cv_func_re_compile_pattern_working=no],
232 [case "$host_os" in
233 # Guess no on native Windows.
234 mingw*) gl_cv_func_re_compile_pattern_working="guessing no" ;;
235 # Otherwise, assume it is not working.
236 *) gl_cv_func_re_compile_pattern_working="guessing no" ;;
237 esac
238 ])
239 ])
240 case "$gl_cv_func_re_compile_pattern_working" in #(
241 *yes) ac_use_included_regex=no;; #(
242 *no) ac_use_included_regex=yes;;
243 esac
244 ;;
245 *) AC_MSG_ERROR([Invalid value for --with-included-regex: $with_included_regex])
246 ;;
247 esac
248
249 if test $ac_use_included_regex = yes; then
250 AC_DEFINE([_REGEX_INCLUDE_LIMITS_H], [1],
251 [Define if you want <regex.h> to include <limits.h>, so that it
252 consistently overrides <limits.h>'s RE_DUP_MAX.])
253 AC_DEFINE([_REGEX_LARGE_OFFSETS], [1],
254 [Define if you want regoff_t to be at least as wide as POSIX requires.])
255 AC_DEFINE([re_syntax_options], [rpl_re_syntax_options],
256 [Define to rpl_re_syntax_options if the replacement should be used.])
257 AC_DEFINE([re_set_syntax], [rpl_re_set_syntax],
258 [Define to rpl_re_set_syntax if the replacement should be used.])
259 AC_DEFINE([re_compile_pattern], [rpl_re_compile_pattern],
260 [Define to rpl_re_compile_pattern if the replacement should be used.])
261 AC_DEFINE([re_compile_fastmap], [rpl_re_compile_fastmap],
262 [Define to rpl_re_compile_fastmap if the replacement should be used.])
263 AC_DEFINE([re_search], [rpl_re_search],
264 [Define to rpl_re_search if the replacement should be used.])
265 AC_DEFINE([re_search_2], [rpl_re_search_2],
266 [Define to rpl_re_search_2 if the replacement should be used.])
267 AC_DEFINE([re_match], [rpl_re_match],
268 [Define to rpl_re_match if the replacement should be used.])
269 AC_DEFINE([re_match_2], [rpl_re_match_2],
270 [Define to rpl_re_match_2 if the replacement should be used.])
271 AC_DEFINE([re_set_registers], [rpl_re_set_registers],
272 [Define to rpl_re_set_registers if the replacement should be used.])
273 AC_DEFINE([re_comp], [rpl_re_comp],
274 [Define to rpl_re_comp if the replacement should be used.])
275 AC_DEFINE([re_exec], [rpl_re_exec],
276 [Define to rpl_re_exec if the replacement should be used.])
277 AC_DEFINE([regcomp], [rpl_regcomp],
278 [Define to rpl_regcomp if the replacement should be used.])
279 AC_DEFINE([regexec], [rpl_regexec],
280 [Define to rpl_regexec if the replacement should be used.])
281 AC_DEFINE([regerror], [rpl_regerror],
282 [Define to rpl_regerror if the replacement should be used.])
283 AC_DEFINE([regfree], [rpl_regfree],
284 [Define to rpl_regfree if the replacement should be used.])
285 fi
286])
287
288# Prerequisites of lib/regex.c and lib/regex_internal.c.
289AC_DEFUN([gl_PREREQ_REGEX],
290[
291 AC_REQUIRE([AC_USE_SYSTEM_EXTENSIONS])
292 AC_REQUIRE([AC_C_INLINE])
293 AC_REQUIRE([AC_C_RESTRICT])
294 AC_REQUIRE([AC_TYPE_MBSTATE_T])
295 AC_REQUIRE([gl_EEMALLOC])
296 AC_REQUIRE([gl_GLIBC21])
297 AC_CHECK_HEADERS([libintl.h])
298 AC_CHECK_FUNCS_ONCE([isblank iswctype])
299 AC_CHECK_DECLS([isblank], [], [], [[#include <ctype.h>]])
300])
diff --git a/src/conf_post.h b/src/conf_post.h
index 8d56f0b4905..97582984378 100644
--- a/src/conf_post.h
+++ b/src/conf_post.h
@@ -202,13 +202,6 @@ extern void _DebPrint (const char *fmt, ...);
202#endif 202#endif
203#endif 203#endif
204 204
205#ifdef emacs /* Don't do this for lib-src. */
206/* Tell regex-emacs.c to use a type compatible with Emacs. */
207#define RE_TRANSLATE_TYPE Lisp_Object
208#define RE_TRANSLATE(TBL, C) char_table_translate (TBL, C)
209#define RE_TRANSLATE_P(TBL) (!EQ (TBL, make_number (0)))
210#endif
211
212/* Tell time_rz.c to use Emacs's getter and setter for TZ. 205/* Tell time_rz.c to use Emacs's getter and setter for TZ.
213 Only Emacs uses time_rz so this is OK. */ 206 Only Emacs uses time_rz so this is OK. */
214#define getenv_TZ emacs_getenv_TZ 207#define getenv_TZ emacs_getenv_TZ
diff --git a/src/regex-emacs.h b/src/regex-emacs.h
index cb6dd76ed3e..9a6214af98c 100644
--- a/src/regex-emacs.h
+++ b/src/regex-emacs.h
@@ -219,7 +219,7 @@ extern ptrdiff_t emacs_re_safe_alloca;
219 ((RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DEBUG) \ 219 ((RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DEBUG) \
220 & ~(RE_DOT_NOT_NULL | RE_INTERVALS | RE_CONTEXT_INDEP_OPS)) 220 & ~(RE_DOT_NOT_NULL | RE_INTERVALS | RE_CONTEXT_INDEP_OPS))
221 221
222#define RE_SYNTAX_POSIX_AWK \ 222#define RE_SYNTAX_POSIX_AWK \
223 (RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS \ 223 (RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS \
224 | RE_INTERVALS | RE_NO_GNU_OPS) 224 | RE_INTERVALS | RE_NO_GNU_OPS)
225 225
@@ -350,6 +350,11 @@ typedef enum
350 REG_ESIZEBR /* n or m too big in \{n,m\} */ 350 REG_ESIZEBR /* n or m too big in \{n,m\} */
351} reg_errcode_t; 351} reg_errcode_t;
352 352
353/* Use a type compatible with Emacs. */
354#define RE_TRANSLATE_TYPE Lisp_Object
355#define RE_TRANSLATE(TBL, C) char_table_translate (TBL, C)
356#define RE_TRANSLATE_P(TBL) (!EQ (TBL, make_number (0)))
357
353/* This data structure represents a compiled pattern. Before calling 358/* This data structure represents a compiled pattern. Before calling
354 the pattern compiler, the fields `buffer', `allocated', `fastmap', 359 the pattern compiler, the fields `buffer', `allocated', `fastmap',
355 `translate', and `no_sub' can be set. After the pattern has been 360 `translate', and `no_sub' can be set. After the pattern has been