aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTim J. Robbins <tjr@FreeBSD.org>2005-05-10 10:39:53 +0000
committerTim J. Robbins <tjr@FreeBSD.org>2005-05-10 10:39:53 +0000
commitaaf58402c2e3af89f4127955d311864b809afecb (patch)
tree6151dd33a8707f801a30a662b4297d080f778984
parent1a9b678f8d1e27a79844701bb26f43a43485888e (diff)
downloadsrc-aaf58402c2e3af89f4127955d311864b809afecb.tar.gz
src-aaf58402c2e3af89f4127955d311864b809afecb.zip
Import of regex bits from fedora-glibc-2_3_4-21 tag in glibc CVS.vendor/libregex/fedora-glibc-2.3.4-21
Notes
Notes: svn path=/vendor/libregex/dist/; revision=146040 svn path=/vendor/libregex/fedora-glibc-2.3.4-21/; revision=146042; tag=vendor/libregex/fedora-glibc-2.3.4-21
-rw-r--r--gnu/lib/libregex/posix/regex.h67
-rw-r--r--gnu/lib/libregex/re_comp.h26
-rw-r--r--gnu/lib/libregex/regcomp.c3924
-rw-r--r--gnu/lib/libregex/regex.c7893
-rw-r--r--gnu/lib/libregex/regex_internal.c1674
-rw-r--r--gnu/lib/libregex/regex_internal.h798
-rw-r--r--gnu/lib/libregex/regexec.c4327
7 files changed, 10847 insertions, 7862 deletions
diff --git a/gnu/lib/libregex/posix/regex.h b/gnu/lib/libregex/posix/regex.h
index 63c2fef6967b..b2d9a62fec97 100644
--- a/gnu/lib/libregex/posix/regex.h
+++ b/gnu/lib/libregex/posix/regex.h
@@ -1,28 +1,29 @@
/* Definitions for data structures and routines for the regular
- expression library, version 0.12.
- Copyright (C) 1985,1989-1993,1995-1998, 2000 Free Software Foundation, Inc.
-
- This file is part of the GNU C Library. Its master source is NOT part of
- the C library, however. The master source lives in /gd/gnu/lib.
+ expression library.
+ Copyright (C) 1985,1989-93,1995-98,2000,2001,2002,2003
+ Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Library General Public License as
- published by the Free Software Foundation; either version 2 of the
- License, or (at your option) any later version.
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Library General Public License for more details.
+ Lesser General Public License for more details.
- You should have received a copy of the GNU Library General Public
- License along with the GNU C Library; see the file COPYING.LIB. If not,
- write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- Boston, MA 02111-1307, USA. */
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
#ifndef _REGEX_H
#define _REGEX_H 1
+#include <sys/types.h>
+
/* Allow the use in C++ code. */
#ifdef __cplusplus
extern "C" {
@@ -165,6 +166,23 @@ typedef unsigned long int reg_syntax_t;
treated as 'a\{1'. */
#define RE_INVALID_INTERVAL_ORD (RE_DEBUG << 1)
+/* If this bit is set, then ignore case when matching.
+ If not set, then case is significant. */
+#define RE_ICASE (RE_INVALID_INTERVAL_ORD << 1)
+
+/* This bit is used internally like RE_CONTEXT_INDEP_ANCHORS but only
+ for ^, because it is difficult to scan the regex backwards to find
+ whether ^ should be special. */
+#define RE_CARET_ANCHORS_HERE (RE_ICASE << 1)
+
+/* If this bit is set, then \{ cannot be first in a BRE or
+ immediately after an alternation or begin-group operator. */
+#define RE_CONTEXT_INVALID_DUP (RE_CARET_ANCHORS_HERE << 1)
+
+/* If this bit is set, then no_sub will be set to 1 during
+ re_compile_pattern. */
+#define RE_NO_SUB (RE_CONTEXT_INVALID_DUP << 1)
+
/* This global variable defines the particular regexp syntax to use (for
some interfaces). When a regexp is compiled, the syntax used is
stored in the pattern buffer, so changing this does not affect
@@ -186,7 +204,8 @@ extern reg_syntax_t re_syntax_options;
#define RE_SYNTAX_GNU_AWK \
((RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DEBUG) \
- & ~(RE_DOT_NOT_NULL | RE_INTERVALS | RE_CONTEXT_INDEP_OPS))
+ & ~(RE_DOT_NOT_NULL | RE_INTERVALS | RE_CONTEXT_INDEP_OPS \
+ | RE_CONTEXT_INVALID_OPS ))
#define RE_SYNTAX_POSIX_AWK \
(RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS \
@@ -218,7 +237,7 @@ extern reg_syntax_t re_syntax_options;
| RE_INTERVALS | RE_NO_EMPTY_RANGES)
#define RE_SYNTAX_POSIX_BASIC \
- (_RE_SYNTAX_POSIX_COMMON | RE_BK_PLUS_QM)
+ (_RE_SYNTAX_POSIX_COMMON | RE_BK_PLUS_QM | RE_CONTEXT_INVALID_DUP)
/* Differs from ..._POSIX_BASIC only in that RE_BK_PLUS_QM becomes
RE_LIMITED_OPS, i.e., \? \+ \| are not recognized. Actually, this
@@ -283,6 +302,10 @@ extern reg_syntax_t re_syntax_options;
/* Like REG_NOTBOL, except for the end-of-line. */
#define REG_NOTEOL (1 << 1)
+/* Use PMATCH[0] to delimit the start and end of the search in the
+ buffer. */
+#define REG_STARTEND (1 << 2)
+
/* If any error codes are removed, changed, or added, update the
`re_error_msg' table in regex.c. */
@@ -298,7 +321,7 @@ typedef enum
/* POSIX regcomp return error codes. (In the order listed in the
standard.) */
REG_BADPAT, /* Invalid pattern. */
- REG_ECOLLATE, /* Not implemented. */
+ REG_ECOLLATE, /* Invalid collating element. */
REG_ECTYPE, /* Invalid character class name. */
REG_EESCAPE, /* Trailing backslash. */
REG_ESUBREG, /* Invalid back reference. */
@@ -530,10 +553,14 @@ extern int re_exec _RE_ARGS ((const char *));
# endif
# endif
#endif
-/* For now unconditionally define __restrict_arr to expand to nothing.
- Ideally we would have a test for the compiler which allows defining
- it to restrict. */
-#define __restrict_arr
+/* gcc 3.1 and up support the [restrict] syntax. */
+#ifndef __restrict_arr
+# if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)
+# define __restrict_arr __restrict
+# else
+# define __restrict_arr
+# endif
+#endif
/* POSIX compatibility. */
extern int regcomp _RE_ARGS ((regex_t *__restrict __preg,
diff --git a/gnu/lib/libregex/re_comp.h b/gnu/lib/libregex/re_comp.h
new file mode 100644
index 000000000000..49114479c13f
--- /dev/null
+++ b/gnu/lib/libregex/re_comp.h
@@ -0,0 +1,26 @@
+/* Copyright (C) 1996 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#ifndef _RE_COMP_H
+#define _RE_COMP_H 1
+
+/* This is only a wrapper around the <regex.h> file. XPG4.2 mentions
+ this name. */
+#include <regex.h>
+
+#endif /* re_comp.h */
diff --git a/gnu/lib/libregex/regcomp.c b/gnu/lib/libregex/regcomp.c
new file mode 100644
index 000000000000..68e2bdab92d1
--- /dev/null
+++ b/gnu/lib/libregex/regcomp.c
@@ -0,0 +1,3924 @@
+/* Extended regular expression matching and search library.
+ Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
+ int length, reg_syntax_t syntax);
+static void re_compile_fastmap_iter (regex_t *bufp,
+ const re_dfastate_t *init_state,
+ char *fastmap);
+static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
+static void init_word_char (re_dfa_t *dfa);
+#ifdef RE_ENABLE_I18N
+static void free_charset (re_charset_t *cset);
+#endif /* RE_ENABLE_I18N */
+static void free_workarea_compile (regex_t *preg);
+/* Build the initial DFA states once the NFA has been analyzed.  */
+static reg_errcode_t create_initial_state (re_dfa_t *dfa);
+#ifdef RE_ENABLE_I18N
+static void optimize_utf8 (re_dfa_t *dfa);
+#endif
+static reg_errcode_t analyze (regex_t *preg);
+static reg_errcode_t preorder (bin_tree_t *root,
+ reg_errcode_t (fn (void *, bin_tree_t *)),
+ void *extra);
+static reg_errcode_t postorder (bin_tree_t *root,
+ reg_errcode_t (fn (void *, bin_tree_t *)),
+ void *extra);
+static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
+static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
+static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
+ bin_tree_t *node);
+static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
+static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
+static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
+static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
+ int top_clone_node, int root_node,
+ unsigned int constraint);
+static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
+ unsigned int constraint);
+static int search_duplicated_node (re_dfa_t *dfa, int org_node,
+ unsigned int constraint);
+static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
+static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
+ int node, int root);
+static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
+static int fetch_number (re_string_t *input, re_token_t *token,
+ reg_syntax_t syntax);
+static void fetch_token (re_token_t *result, re_string_t *input,
+ reg_syntax_t syntax);
+static int peek_token (re_token_t *token, re_string_t *input,
+ reg_syntax_t syntax);
+static int peek_token_bracket (re_token_t *token, re_string_t *input,
+ reg_syntax_t syntax);
+static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
+ reg_syntax_t syntax, reg_errcode_t *err);
+static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
+ re_token_t *token, reg_syntax_t syntax,
+ int nest, reg_errcode_t *err);
+static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
+ re_token_t *token, reg_syntax_t syntax,
+ int nest, reg_errcode_t *err);
+static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
+ re_token_t *token, reg_syntax_t syntax,
+ int nest, reg_errcode_t *err);
+static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
+ re_token_t *token, reg_syntax_t syntax,
+ int nest, reg_errcode_t *err);
+static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
+ re_dfa_t *dfa, re_token_t *token,
+ reg_syntax_t syntax, reg_errcode_t *err);
+static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
+ re_token_t *token, reg_syntax_t syntax,
+ reg_errcode_t *err);
+static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
+ re_string_t *regexp,
+ re_token_t *token, int token_len,
+ re_dfa_t *dfa,
+ reg_syntax_t syntax,
+ int accept_hyphen);
+static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
+ re_string_t *regexp,
+ re_token_t *token);
+#ifndef _LIBC
+# ifdef RE_ENABLE_I18N
+static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
+ re_charset_t *mbcset, int *range_alloc,
+ bracket_elem_t *start_elem,
+ bracket_elem_t *end_elem);
+static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
+ re_charset_t *mbcset,
+ int *coll_sym_alloc,
+ const unsigned char *name);
+# else /* not RE_ENABLE_I18N */
+static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
+ bracket_elem_t *start_elem,
+ bracket_elem_t *end_elem);
+static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
+ const unsigned char *name);
+# endif /* not RE_ENABLE_I18N */
+#endif /* not _LIBC */
+#ifdef RE_ENABLE_I18N
+static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
+ re_charset_t *mbcset,
+ int *equiv_class_alloc,
+ const unsigned char *name);
+static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
+ re_bitset_ptr_t sbcset,
+ re_charset_t *mbcset,
+ int *char_class_alloc,
+ const unsigned char *class_name,
+ reg_syntax_t syntax);
+#else /* not RE_ENABLE_I18N */
+static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
+ const unsigned char *name);
+static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
+ re_bitset_ptr_t sbcset,
+ const unsigned char *class_name,
+ reg_syntax_t syntax);
+#endif /* not RE_ENABLE_I18N */
+static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
+ unsigned RE_TRANSLATE_TYPE trans,
+ const unsigned char *class_name,
+ const unsigned char *extra,
+ int non_match, reg_errcode_t *err);
+static bin_tree_t *create_tree (re_dfa_t *dfa,
+ bin_tree_t *left, bin_tree_t *right,
+ re_token_type_t type);
+static bin_tree_t *create_token_tree (re_dfa_t *dfa,
+ bin_tree_t *left, bin_tree_t *right,
+ const re_token_t *token);
+static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
+static void free_token (re_token_t *node);
+static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
+static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
+
+/* This table gives an error message for each of the error codes listed
+ in regex.h. Obviously the order here has to be same as there.
+ POSIX doesn't require that we do anything for REG_NOERROR,
+ but why not be nice?
+
+ All messages are stored back to back in one char array, each followed
+ by a NUL byte. Every REG_xxx_IDX macro is the byte offset of its
+ message inside the array: it is the previous offset plus the sizeof
+ of the previous message literal, which counts that message's NUL.
+ The literal given to sizeof must therefore be kept identical to the
+ literal given to gettext_noop on the preceding entry. */
+
+const char __re_error_msgid[] attribute_hidden =
+ {
+#define REG_NOERROR_IDX 0
+ gettext_noop ("Success") /* REG_NOERROR */
+ "\0"
+#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
+ gettext_noop ("No match") /* REG_NOMATCH */
+ "\0"
+#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
+ gettext_noop ("Invalid regular expression") /* REG_BADPAT */
+ "\0"
+#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
+ gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
+ "\0"
+#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
+ gettext_noop ("Invalid character class name") /* REG_ECTYPE */
+ "\0"
+#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
+ gettext_noop ("Trailing backslash") /* REG_EESCAPE */
+ "\0"
+#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
+ gettext_noop ("Invalid back reference") /* REG_ESUBREG */
+ "\0"
+#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
+ gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
+ "\0"
+#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
+ gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
+ "\0"
+#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
+ gettext_noop ("Unmatched \\{") /* REG_EBRACE */
+ "\0"
+#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
+ gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
+ "\0"
+#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
+ gettext_noop ("Invalid range end") /* REG_ERANGE */
+ "\0"
+#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
+ gettext_noop ("Memory exhausted") /* REG_ESPACE */
+ "\0"
+#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
+ gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
+ "\0"
+#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
+ gettext_noop ("Premature end of regular expression") /* REG_EEND */
+ "\0"
+#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
+ gettext_noop ("Regular expression too big") /* REG_ESIZE */
+ "\0"
+#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
+ gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
+ };
+
+const size_t __re_error_msgid_idx[] attribute_hidden = /* Indexed by error code; each entry is an offset into __re_error_msgid. */
+ {
+ REG_NOERROR_IDX,
+ REG_NOMATCH_IDX,
+ REG_BADPAT_IDX,
+ REG_ECOLLATE_IDX,
+ REG_ECTYPE_IDX,
+ REG_EESCAPE_IDX,
+ REG_ESUBREG_IDX,
+ REG_EBRACK_IDX,
+ REG_EPAREN_IDX,
+ REG_EBRACE_IDX,
+ REG_BADBR_IDX,
+ REG_ERANGE_IDX,
+ REG_ESPACE_IDX,
+ REG_BADRPT_IDX,
+ REG_EEND_IDX,
+ REG_ESIZE_IDX,
+ REG_ERPAREN_IDX
+ };
+
+/* Entry points for GNU code. */
+
+/* GNU-style compiler entry point.  Compile the LENGTH bytes at PATTERN
+   into BUFP using the syntax currently held in `re_syntax_options'.
+   The result is NULL on success, otherwise a (translated) message
+   describing the error.
+
+   BUFP's `allocated' (and perhaps `buffer') and `translate' fields are
+   assumed to be set by the caller before the call.  */
+
+const char *
+re_compile_pattern (pattern, length, bufp)
+     const char *pattern;
+     size_t length;
+     struct re_pattern_buffer *bufp;
+{
+  reg_errcode_t status;
+
+  /* GNU callers ask for register information by handing a non-null
+     REGS to re_match and friends, so no_sub stays clear unless the
+     syntax explicitly carries RE_NO_SUB.  */
+  bufp->no_sub = (re_syntax_options & RE_NO_SUB) != 0;
+
+  /* Anchors also match at newlines.  */
+  bufp->newline_anchor = 1;
+
+  status = re_compile_internal (bufp, pattern, length, re_syntax_options);
+  if (status != 0)
+    return gettext (__re_error_msgid + __re_error_msgid_idx[(int) status]);
+
+  return NULL;
+}
+#ifdef _LIBC
+weak_alias (__re_compile_pattern, re_compile_pattern)
+#endif
+
+/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
+ also be assigned to arbitrarily: each pattern buffer stores its own
+ syntax, so it can be changed between regex compilations.
+ In this file it is read by re_compile_pattern and re_comp, which pass
+ its value to re_compile_internal as the syntax of the pattern. */
+/* This has no initializer because initialized variables in Emacs
+ become read-only after dumping. */
+reg_syntax_t re_syntax_options;
+
+
+/* Replace the global regexp syntax with SYNTAX, a bit mask made of the
+   syntax bits defined in regex.h.  Different utilities historically
+   use different, incompatible syntaxes; this is how a caller selects
+   one.  The value that was in effect before the call is returned.  */
+
+reg_syntax_t
+re_set_syntax (syntax)
+     reg_syntax_t syntax;
+{
+  reg_syntax_t previous = re_syntax_options;
+
+  re_syntax_options = syntax;
+
+  return previous;
+}
+#ifdef _LIBC
+weak_alias (__re_set_syntax, re_set_syntax)
+#endif
+
+/* Rebuild BUFP->fastmap from the initial states of the compiled DFA
+   stored in BUFP->buffer and mark the map as accurate.  Always
+   returns 0.  */
+int
+re_compile_fastmap (bufp)
+     struct re_pattern_buffer *bufp;
+{
+  re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
+  char *map = bufp->fastmap;
+
+  /* Start from an empty map; the helper only ever sets entries.  */
+  memset (map, '\0', sizeof (char) * SBC_MAX);
+
+  re_compile_fastmap_iter (bufp, dfa->init_state, map);
+
+  /* The context-specific initial states are visited only when they
+     are distinct from the plain initial state handled above.  */
+  if (dfa->init_state_word != dfa->init_state)
+    re_compile_fastmap_iter (bufp, dfa->init_state_word, map);
+  if (dfa->init_state_nl != dfa->init_state)
+    re_compile_fastmap_iter (bufp, dfa->init_state_nl, map);
+  if (dfa->init_state_begbuf != dfa->init_state)
+    re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, map);
+
+  bufp->fastmap_accurate = 1;
+
+  return 0;
+}
+#ifdef _LIBC
+weak_alias (__re_compile_fastmap, re_compile_fastmap)
+#endif
+
+/* Mark byte CH in FASTMAP.  When ICASE is nonzero, the entry for
+   tolower (CH) is marked as well.  */
+static inline void
+__attribute ((always_inline))
+re_set_fastmap (char *fastmap, int icase, int ch)
+{
+  fastmap[ch] = 1;
+
+  if (!icase)
+    return;
+
+  fastmap[tolower (ch)] = 1;
+}
+
+/* Helper function for re_compile_fastmap.
+   Compile fastmap for the initial_state INIT_STATE.
+
+   Every node contained in INIT_STATE is inspected and FASTMAP is
+   updated according to the node type:
+     CHARACTER        the node's byte (plus, for RE_ICASE, its lower-case
+                      form -- directly in single-byte locales, via
+                      mbrtowc/towlower/wcrtomb in multibyte locales);
+     SIMPLE_BRACKET   every byte whose bit is set in the node's bitset;
+     COMPLEX_BRACKET  the first byte of each explicit multibyte
+                      character, plus a conservative set of bytes when
+                      the set uses ranges, classes, collating symbols,
+                      equivalence classes or negation;
+     OP_PERIOD, OP_UTF8_PERIOD, END_OF_RE
+                      every byte is marked and the scan stops; for
+                      END_OF_RE, BUFP->can_be_null is also set.
+   Entries are only ever set, never cleared, so the function can be
+   called for several initial states on the same FASTMAP.  */
+
+static void
+re_compile_fastmap_iter (bufp, init_state, fastmap)
+     regex_t *bufp;
+     const re_dfastate_t *init_state;
+     char *fastmap;
+{
+  re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
+  int node_cnt;
+  /* Let re_set_fastmap fold case only in single-byte locales; the
+     multibyte paths below do their own folding and pass 0.  */
+  int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
+  for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
+    {
+      int node = init_state->nodes.elems[node_cnt];
+      re_token_type_t type = dfa->nodes[node].type;
+
+      if (type == CHARACTER)
+        {
+          re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
+#ifdef RE_ENABLE_I18N
+          if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
+            {
+              unsigned char *buf = alloca (dfa->mb_cur_max), *p;
+              wchar_t wc;
+              mbstate_t state;
+
+              /* Gather this byte and the following mb_partial bytes
+                 into BUF so the whole character can be converted.  */
+              p = buf;
+              *p++ = dfa->nodes[node].opr.c;
+              while (++node < dfa->nodes_len
+                     && dfa->nodes[node].type == CHARACTER
+                     && dfa->nodes[node].mb_partial)
+                *p++ = dfa->nodes[node].opr.c;
+              memset (&state, 0, sizeof (state));
+              /* Only the first byte of the lower-case form matters
+                 for the fastmap.  */
+              if (mbrtowc (&wc, (const char *) buf, p - buf,
+                           &state) == p - buf
+                  && (__wcrtomb ((char *) buf, towlower (wc), &state)
+                      != (size_t) -1))
+                re_set_fastmap (fastmap, 0, buf[0]);
+            }
+#endif
+        }
+      else if (type == SIMPLE_BRACKET)
+        {
+          int i, j, ch;
+          for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
+            for (j = 0; j < UINT_BITS; ++j, ++ch)
+              /* The mask must be unsigned: J runs up to the top bit of
+                 the word, and shifting a signed 1 into the sign bit
+                 is undefined behavior.  */
+              if (dfa->nodes[node].opr.sbcset[i] & (1u << j))
+                re_set_fastmap (fastmap, icase, ch);
+        }
+#ifdef RE_ENABLE_I18N
+      else if (type == COMPLEX_BRACKET)
+        {
+          int i;
+          re_charset_t *cset = dfa->nodes[node].opr.mbcset;
+          if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
+              || cset->nranges || cset->nchar_classes)
+            {
+# ifdef _LIBC
+              if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
+                {
+                  /* In this case we want to catch the bytes which are
+                     the first byte of any collation elements.
+                     e.g. In da_DK, we want to catch 'a' since "aa"
+                     is a valid collation element, and don't catch
+                     'b' since 'b' is the only collation element
+                     which starts from 'b'.  */
+                  int j, ch;
+                  const int32_t *table = (const int32_t *)
+                    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
+                  for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
+                    for (j = 0; j < UINT_BITS; ++j, ++ch)
+                      if (table[ch] < 0)
+                        re_set_fastmap (fastmap, icase, ch);
+                }
+# else
+              /* Outside libc: mark every byte that is not a complete
+                 character by itself (__btowc yields WEOF).  */
+              if (dfa->mb_cur_max > 1)
+                for (i = 0; i < SBC_MAX; ++i)
+                  if (__btowc (i) == WEOF)
+                    re_set_fastmap (fastmap, icase, i);
+# endif /* not _LIBC */
+            }
+          /* Explicitly listed multibyte characters: mark the first
+             byte of each, and of its lower-case form for RE_ICASE.  */
+          for (i = 0; i < cset->nmbchars; ++i)
+            {
+              char buf[256];
+              mbstate_t state;
+              memset (&state, '\0', sizeof (state));
+              if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
+                re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
+              if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
+                {
+                  if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
+                      != (size_t) -1)
+                    re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
+                }
+            }
+        }
+#endif /* RE_ENABLE_I18N */
+      else if (type == OP_PERIOD
+#ifdef RE_ENABLE_I18N
+               || type == OP_UTF8_PERIOD
+#endif /* RE_ENABLE_I18N */
+               || type == END_OF_RE)
+        {
+          /* Any byte may start a match here, so no further node can
+             add information: fill the map and stop.  */
+          memset (fastmap, '\1', sizeof (char) * SBC_MAX);
+          if (type == END_OF_RE)
+            bufp->can_be_null = 1;
+          return;
+        }
+    }
+}
+
+/* Entry point for POSIX code. */
+/* regcomp takes a regular expression as a string and compiles it.
+
+ PREG is a regex_t *. We do not expect any fields to be initialized,
+ since POSIX says we shouldn't. Thus, we set
+
+ `buffer' to the compiled pattern;
+ `used' to the length of the compiled pattern;
+ `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
+ REG_EXTENDED bit in CFLAGS is set; otherwise, to
+ RE_SYNTAX_POSIX_BASIC;
+ `newline_anchor' to REG_NEWLINE being set in CFLAGS;
+ `fastmap' to an allocated space for the fastmap;
+ `fastmap_accurate' to zero;
+ `re_nsub' to the number of subexpressions in PATTERN.
+
+ PATTERN is the address of the pattern string.
+
+ CFLAGS is a series of bits which affect compilation.
+
+ If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
+ use POSIX basic syntax.
+
+ If REG_NEWLINE is set, then . and [^...] don't match newline.
+ Also, regexec will try a match beginning after every newline.
+
+ If REG_ICASE is set, then we consider upper- and lowercase
+ versions of letters to be equivalent when matching.
+
+ If REG_NOSUB is set, then when PREG is passed to regexec, that
+ routine will report only success or failure, and nothing about the
+ registers.
+
+ It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
+ the return codes and their meanings.) */
+
+int
+regcomp (preg, pattern, cflags)
+     regex_t *__restrict preg;
+     const char *__restrict pattern;
+     int cflags;
+{
+  reg_errcode_t err;
+  reg_syntax_t syntax;
+
+  /* Derive the syntax bits from CFLAGS.  */
+  if (cflags & REG_EXTENDED)
+    syntax = RE_SYNTAX_POSIX_EXTENDED;
+  else
+    syntax = RE_SYNTAX_POSIX_BASIC;
+
+  if (cflags & REG_ICASE)
+    syntax |= RE_ICASE;
+
+  if (cflags & REG_NEWLINE)
+    {
+      /* With REG_NEWLINE neither . nor [^...] matches a newline.  */
+      syntax &= ~RE_DOT_NEWLINE;
+      syntax |= RE_HAT_LISTS_NOT_NEWLINE;
+    }
+
+  /* PREG arrives uninitialized: start from an empty pattern buffer.  */
+  preg->buffer = NULL;
+  preg->allocated = 0;
+  preg->used = 0;
+
+  /* Try to allocate space for the fastmap.  */
+  preg->fastmap = re_malloc (char, SBC_MAX);
+  if (BE (preg->fastmap == NULL, 0))
+    return REG_ESPACE;
+
+  /* REG_NEWLINE also changes the matching behavior of anchors.  */
+  preg->newline_anchor = (cflags & REG_NEWLINE) != 0;
+  preg->no_sub = (cflags & REG_NOSUB) != 0;
+  preg->translate = NULL;
+
+  err = re_compile_internal (preg, pattern, strlen (pattern), syntax);
+
+  /* POSIX has a single code, REG_EPAREN, for both an unmatched
+     open-group and an unmatched close-group.  */
+  if (err == REG_ERPAREN)
+    err = REG_EPAREN;
+
+  if (BE (err != REG_NOERROR, 0))
+    {
+      /* Compilation failed: drop the fastmap allocated above.  */
+      re_free (preg->fastmap);
+      preg->fastmap = NULL;
+      return (int) err;
+    }
+
+  /* Compute the fastmap now, since regexec cannot modify the pattern
+     buffer.  The call's return value is deliberately ignored.  */
+  (void) re_compile_fastmap (preg);
+
+  return (int) err;
+}
+#ifdef _LIBC
+weak_alias (__regcomp, regcomp)
+#endif
+
+/* Copy the message for ERRCODE (as returned by regcomp or regexec)
+   into ERRBUF, which holds ERRBUF_SIZE bytes, truncating and
+   NUL-terminating it when it does not fit.  Nothing is written when
+   ERRBUF_SIZE is 0.  The return value is the size needed for the
+   whole message including its terminating NUL.  PREG is not used.
+   An ERRCODE outside the message table aborts the program.  */
+
+size_t
+regerror (errcode, preg, errbuf, errbuf_size)
+     int errcode;
+     const regex_t *preg;
+     char *errbuf;
+     size_t errbuf_size;
+{
+  const int n_codes = (int) (sizeof (__re_error_msgid_idx)
+                             / sizeof (__re_error_msgid_idx[0]));
+  const char *text;
+  size_t needed;
+
+  if (BE (errcode < 0 || errcode >= n_codes, 0))
+    /* Only codes produced by this library are valid here; anything
+       else is a bug in the program, so dump core.  */
+    abort ();
+
+  text = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
+  needed = strlen (text) + 1;	/* Includes the null.  */
+
+  if (BE (errbuf_size == 0, 0))
+    return needed;
+
+  if (BE (needed > errbuf_size, 0))
+    {
+      /* Too long: keep what fits and terminate it.  */
+      memcpy (errbuf, text, errbuf_size - 1);
+      errbuf[errbuf_size - 1] = '\0';
+    }
+  else
+    memcpy (errbuf, text, needed);
+
+  return needed;
+}
+#ifdef _LIBC
+weak_alias (__regerror, regerror)
+#endif
+
+
+#ifdef RE_ENABLE_I18N
+/* This static array is used for the map to single-byte characters when
+ UTF-8 is used. Otherwise we would allocate memory just to initialize
+ it the same all the time. UTF-8 is the preferred encoding so this is
+ a worthwhile optimization.
+ Because the array is static it must never be freed: free_dfa_content
+ compares dfa->sb_char against it and skips the free when they are
+ the same object. */
+static const bitset utf8_sb_map =
+{
+ /* Set the first 128 bits. Four all-ones 32-bit words are given
+ explicitly; any further elements are implicitly zero. */
+# if UINT_MAX == 0xffffffff
+ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
+# else
+# error "Add case for new unsigned int size"
+# endif
+};
+#endif
+
+
+/* Release every allocation hanging off DFA (tokens, per-node sets,
+   the state hash table and its states, auxiliary maps) and finally
+   DFA itself.  Array pointers are checked for NULL before their
+   elements are visited; the arrays themselves are handed to re_free
+   unconditionally.  */
+static void
+free_dfa_content (re_dfa_t *dfa)
+{
+  int idx, k;
+
+  /* Node tokens.  */
+  if (dfa->nodes != NULL)
+    for (idx = 0; idx < dfa->nodes_len; ++idx)
+      free_token (&dfa->nodes[idx]);
+  re_free (dfa->nexts);
+
+  /* Per-node sets; any of the three arrays may be absent.  */
+  for (idx = 0; idx < dfa->nodes_len; ++idx)
+    {
+      if (dfa->eclosures != NULL)
+        re_node_set_free (&dfa->eclosures[idx]);
+      if (dfa->inveclosures != NULL)
+        re_node_set_free (&dfa->inveclosures[idx]);
+      if (dfa->edests != NULL)
+        re_node_set_free (&dfa->edests[idx]);
+    }
+  re_free (dfa->edests);
+  re_free (dfa->eclosures);
+  re_free (dfa->inveclosures);
+  re_free (dfa->nodes);
+
+  /* State hash table: slots 0 .. state_hash_mask inclusive.  */
+  if (dfa->state_table != NULL)
+    for (idx = 0; idx <= dfa->state_hash_mask; ++idx)
+      {
+        struct re_state_table_entry *bucket = &dfa->state_table[idx];
+
+        for (k = 0; k < bucket->num; ++k)
+          free_state (bucket->array[k]);
+        re_free (bucket->array);
+      }
+  re_free (dfa->state_table);
+
+#ifdef RE_ENABLE_I18N
+  /* The shared static UTF-8 map is not heap memory.  */
+  if (dfa->sb_char != utf8_sb_map)
+    re_free (dfa->sb_char);
+#endif
+  re_free (dfa->subexp_map);
+#ifdef DEBUG
+  re_free (dfa->re_str);
+#endif
+
+  re_free (dfa);
+}
+
+
+/* Release the compiled pattern, the fastmap and the translate table
+   owned by PREG, and reset the corresponding fields so PREG no longer
+   points at freed memory.  */
+
+void
+regfree (preg)
+     regex_t *preg;
+{
+  /* The compiled pattern, when there is one, is a re_dfa_t.  */
+  if (BE (preg->buffer != NULL, 1))
+    free_dfa_content ((re_dfa_t *) preg->buffer);
+  preg->buffer = NULL;
+  preg->allocated = 0;
+
+  re_free (preg->fastmap);
+  preg->fastmap = NULL;
+
+  re_free (preg->translate);
+  preg->translate = NULL;
+}
+#ifdef _LIBC
+weak_alias (__regfree, regfree)
+#endif
+
+/* Entry points compatible with 4.2 BSD regex library. We don't define
+ them unless specifically requested. */
+
+#if defined _REGEX_RE_COMP || defined _LIBC
+
+/* BSD has one and only one pattern buffer. */
+static struct re_pattern_buffer re_comp_buf;
+
+char *
+# ifdef _LIBC
+/* Weak in libc: a POSIX program may define its own function with this
+   name, as long as it does not use ours, and still link against
+   regcomp/regexec above.  */
+weak_function
+# endif
+re_comp (s)
+     const char *s;
+{
+  reg_errcode_t status;
+  char *saved_map;
+
+  /* A null S is valid only once a pattern has been compiled.  */
+  if (s == NULL)
+    {
+      if (re_comp_buf.buffer == NULL)
+        return gettext ("No previous regular expression");
+      return 0;
+    }
+
+  /* Throw away the previous pattern but hold on to its fastmap
+     allocation so it can be reused.  */
+  if (re_comp_buf.buffer != NULL)
+    {
+      saved_map = re_comp_buf.fastmap;
+      re_comp_buf.fastmap = NULL;
+      __regfree (&re_comp_buf);
+      memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
+      re_comp_buf.fastmap = saved_map;
+    }
+
+  if (re_comp_buf.fastmap == NULL)
+    {
+      re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
+      if (re_comp_buf.fastmap == NULL)
+        return (char *) gettext (__re_error_msgid
+                                 + __re_error_msgid_idx[(int) REG_ESPACE]);
+    }
+
+  /* `re_exec' always passes NULL for the `regs' argument, so the
+     pattern buffer fields that affect it need no initialization.  */
+
+  /* Match anchors at newlines.  */
+  re_comp_buf.newline_anchor = 1;
+
+  status = re_compile_internal (&re_comp_buf, s, strlen (s),
+                                re_syntax_options);
+  if (status != 0)
+    /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
+    return (char *) gettext (__re_error_msgid
+                             + __re_error_msgid_idx[(int) status]);
+
+  return NULL;
+}
+
+#ifdef _LIBC
+libc_freeres_fn (free_mem)
+{
+ __regfree (&re_comp_buf);
+}
+#endif
+
+#endif /* _REGEX_RE_COMP */
+
+/* Internal entry point.
+ Compile the regular expression PATTERN, whose length is LENGTH.
+ SYNTAX indicates the regular expression's syntax. */
+
+/* Steps: reset PREG's bookkeeping fields, make sure PREG->buffer holds
+   a re_dfa_t, initialize it, build the input string object, parse the
+   pattern into a tree, analyze the tree into an NFA, and create the
+   initial DFA state.  Returns REG_NOERROR on success.  On any failure
+   after the DFA has been obtained, the DFA is freed and PREG->buffer
+   and PREG->allocated are reset before the error code is returned.  */
+static reg_errcode_t
+re_compile_internal (preg, pattern, length, syntax)
+     regex_t *preg;
+     const char * pattern;
+     int length;
+     reg_syntax_t syntax;
+{
+  reg_errcode_t err = REG_NOERROR;
+  re_dfa_t *dfa;
+  re_string_t regexp;
+
+  /* Initialize the pattern buffer.  */
+  preg->fastmap_accurate = 0;
+  preg->syntax = syntax;
+  preg->not_bol = preg->not_eol = 0;
+  preg->used = 0;
+  preg->re_nsub = 0;
+  preg->can_be_null = 0;
+  preg->regs_allocated = REGS_UNALLOCATED;
+
+  /* Initialize the dfa.  */
+  dfa = (re_dfa_t *) preg->buffer;
+  if (BE (preg->allocated < sizeof (re_dfa_t), 0))
+    {
+      /* If zero allocated, but buffer is non-null, try to realloc
+         enough space.  This loses if buffer's address is bogus, but
+         that is the user's responsibility.  If ->buffer is NULL this
+         is a simple allocation.  */
+      dfa = re_realloc (preg->buffer, re_dfa_t, 1);
+      if (dfa == NULL)
+        return REG_ESPACE;
+      preg->allocated = sizeof (re_dfa_t);
+      preg->buffer = (unsigned char *) dfa;
+    }
+  preg->used = sizeof (re_dfa_t);
+
+  err = init_dfa (dfa, length);
+  if (BE (err != REG_NOERROR, 0))
+    {
+      free_dfa_content (dfa);
+      preg->buffer = NULL;
+      preg->allocated = 0;
+      return err;
+    }
+#ifdef DEBUG
+  /* Keep a NUL-terminated copy of the pattern for debugging.  The
+     allocation is checked, and exactly LENGTH bytes are copied:
+     PATTERN is not required to be NUL-terminated (re_compile_pattern
+     passes an explicit length), so PATTERN[LENGTH] must not be read.  */
+  dfa->re_str = re_malloc (char, length + 1);
+  if (BE (dfa->re_str == NULL, 0))
+    {
+      free_dfa_content (dfa);
+      preg->buffer = NULL;
+      preg->allocated = 0;
+      return REG_ESPACE;
+    }
+  memcpy (dfa->re_str, pattern, length);
+  dfa->re_str[length] = '\0';
+#endif
+
+  err = re_string_construct (&regexp, pattern, length, preg->translate,
+                             syntax & RE_ICASE, dfa);
+  if (BE (err != REG_NOERROR, 0))
+    {
+      /* Shared failure exit; also the target of the gotos below.  */
+    re_compile_internal_free_return:
+      free_workarea_compile (preg);
+      re_string_destruct (&regexp);
+      free_dfa_content (dfa);
+      preg->buffer = NULL;
+      preg->allocated = 0;
+      return err;
+    }
+
+  /* Parse the regular expression, and build a structure tree.  */
+  preg->re_nsub = 0;
+  dfa->str_tree = parse (&regexp, preg, syntax, &err);
+  if (BE (dfa->str_tree == NULL, 0))
+    goto re_compile_internal_free_return;
+
+  /* Analyze the tree and create the nfa.  */
+  err = analyze (preg);
+  if (BE (err != REG_NOERROR, 0))
+    goto re_compile_internal_free_return;
+
+#ifdef RE_ENABLE_I18N
+  /* If possible, do searching in single byte encoding to speed things up.  */
+  if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
+    optimize_utf8 (dfa);
+#endif
+
+  /* Then create the initial state of the dfa.  */
+  err = create_initial_state (dfa);
+
+  /* Release work areas.  */
+  free_workarea_compile (preg);
+  re_string_destruct (&regexp);
+
+  if (BE (err != REG_NOERROR, 0))
+    {
+      free_dfa_content (dfa);
+      preg->buffer = NULL;
+      preg->allocated = 0;
+    }
+
+  return err;
+}
+
+/* Initialize DFA.  We use the length of the regular expression PAT_LEN
+   as the initial length of some arrays.
+
+   Returns REG_NOERROR on success, or REG_ESPACE when one of the initial
+   allocations fails.  Nothing is released here on failure; the caller
+   visible in this file (re_compile_internal) calls free_dfa_content.  */
+
+static reg_errcode_t
+init_dfa (dfa, pat_len)
+     re_dfa_t *dfa;
+     int pat_len;
+{
+  int table_size;
+#ifndef _LIBC
+  char *codeset_name;
+#endif
+
+  memset (dfa, '\0', sizeof (re_dfa_t));
+
+  /* Force allocation of str_tree_storage the first time.  */
+  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
+
+  /* The results of this allocation and of the state table allocation
+     below are checked together at the end of the function.  */
+  dfa->nodes_alloc = pat_len + 1;
+  dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
+
+  dfa->states_alloc = pat_len + 1;
+
+  /* table_size = the smallest power of two greater than pat_len.  */
+  for (table_size = 1; table_size > 0; table_size <<= 1)
+    if (table_size > pat_len)
+      break;
+
+  dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
+  dfa->state_hash_mask = table_size - 1;
+
+  dfa->mb_cur_max = MB_CUR_MAX;
+#ifdef _LIBC
+  if (dfa->mb_cur_max == 6
+      && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
+    dfa->is_utf8 = 1;
+  dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
+                       != 0);
+#else
+# ifdef HAVE_LANGINFO_CODESET
+  codeset_name = nl_langinfo (CODESET);
+# else
+  /* No nl_langinfo: take the part after '.' of the first non-empty
+     locale environment variable as the codeset name.  */
+  codeset_name = getenv ("LC_ALL");
+  if (codeset_name == NULL || codeset_name[0] == '\0')
+    codeset_name = getenv ("LC_CTYPE");
+  if (codeset_name == NULL || codeset_name[0] == '\0')
+    codeset_name = getenv ("LANG");
+  if (codeset_name == NULL)
+    codeset_name = "";
+  else if (strchr (codeset_name, '.') != NULL)
+    codeset_name = strchr (codeset_name, '.') + 1;
+# endif
+
+  if (strcasecmp (codeset_name, "UTF-8") == 0
+      || strcasecmp (codeset_name, "UTF8") == 0)
+    dfa->is_utf8 = 1;
+
+  /* We check exhaustively in the loop below if this charset is a
+     superset of ASCII.  */
+  dfa->map_notascii = 0;
+#endif
+
+#ifdef RE_ENABLE_I18N
+  if (dfa->mb_cur_max > 1)
+    {
+      if (dfa->is_utf8)
+        dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
+      else
+        {
+          int i, j, ch;
+
+          dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
+          if (BE (dfa->sb_char == NULL, 0))
+            return REG_ESPACE;
+
+          /* Clear all bits, then set those corresponding to single
+             byte chars.  */
+          bitset_empty (dfa->sb_char);
+
+          for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
+            for (j = 0; j < UINT_BITS; ++j, ++ch)
+              {
+                wchar_t wch = __btowc (ch);
+                /* Shift an unsigned one: `1 << j' would overflow a
+                   signed int when J is the top bit of the word.  */
+                if (wch != WEOF)
+                  dfa->sb_char[i] |= 1u << j;
+# ifndef _LIBC
+                if (isascii (ch) && wch != (wchar_t) ch)
+                  dfa->map_notascii = 1;
+# endif
+              }
+        }
+    }
+#endif
+
+  if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
+    return REG_ESPACE;
+  return REG_NOERROR;
+}
+
+/* Initialize the WORD_CHAR table, which records which characters are
+   "word" characters.  Here "word" means a word-constituent character as
+   used by operators like "\<", "\>", etc.: anything for which isalnum is
+   true, plus '_'.  Also sets DFA->word_ops_used.  */
+
+static void
+init_word_char (dfa)
+     re_dfa_t *dfa;
+{
+  int i, j, ch;
+  dfa->word_ops_used = 1;
+  for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
+    for (j = 0; j < UINT_BITS; ++j, ++ch)
+      if (isalnum (ch) || ch == '_')
+        /* Shift an unsigned one: `1 << j' would overflow a signed int
+           when J is the top bit of the word.  */
+        dfa->word_char[i] |= 1u << j;
+}
+
+/* Release the storage that is needed only while a pattern is being
+   compiled: the chain of parse-tree storage blocks and the org_indices
+   array.  The corresponding DFA fields are reset afterwards.  */
+
+static void
+free_workarea_compile (preg)
+     regex_t *preg;
+{
+  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
+  bin_tree_storage_t *block = dfa->str_tree_storage;
+
+  /* Walk the singly linked list, remembering the successor before the
+     current block is freed.  */
+  while (block != NULL)
+    {
+      bin_tree_storage_t *following = block->next;
+      re_free (block);
+      block = following;
+    }
+
+  dfa->str_tree_storage = NULL;
+  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
+  dfa->str_tree = NULL;
+
+  re_free (dfa->org_indices);
+  dfa->org_indices = NULL;
+}
+
+/* Create initial states for all contexts.
+
+   Builds the node set of the initial state from the epsilon closure of
+   the first node of the expression, then acquires DFA->init_state and,
+   when that state has constraints, the word / newline / buffer-start
+   variants.  Returns REG_NOERROR on success; on failure returns
+   REG_ESPACE or the error reported by the helper that failed.  The
+   temporary node set is released on every path.  */
+
+static reg_errcode_t
+create_initial_state (dfa)
+     re_dfa_t *dfa;
+{
+  int first, i;
+  reg_errcode_t err;
+  re_node_set init_nodes;
+
+  /* Initial states have the epsilon closure of the node which is
+     the first node of the regular expression.  */
+  first = dfa->str_tree->first->node_idx;
+  dfa->init_node = first;
+  err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
+  if (BE (err != REG_NOERROR, 0))
+    return err;
+
+  /* The back-references which are in initial states can epsilon transit,
+     since in this case all of the subexpressions can be null.
+     Then we add epsilon closures of the nodes which are the next nodes of
+     the back-references.  */
+  if (dfa->nbackref > 0)
+    for (i = 0; i < init_nodes.nelem; ++i)
+      {
+        int node_idx = init_nodes.elems[i];
+        re_token_type_t type = dfa->nodes[node_idx].type;
+        int clexp_idx;
+        int dest_idx;
+
+        if (type != OP_BACK_REF)
+          continue;
+
+        /* Look for the OP_CLOSE_SUBEXP of the referenced group inside
+           the initial node set.  */
+        for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
+          {
+            re_token_t *clexp_node;
+            clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
+            if (clexp_node->type == OP_CLOSE_SUBEXP
+                && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
+              break;
+          }
+        if (clexp_idx == init_nodes.nelem)
+          continue;
+
+        dest_idx = dfa->edests[node_idx].elems[0];
+        if (!re_node_set_contains (&init_nodes, dest_idx))
+          {
+            /* A failed merge used to go unnoticed; report it.  */
+            if (BE (re_node_set_merge (&init_nodes,
+                                       dfa->eclosures + dest_idx)
+                    != REG_NOERROR, 0))
+              {
+                re_node_set_free (&init_nodes);
+                return REG_ESPACE;
+              }
+            /* The set changed, so rescan it.  */
+            i = 0;
+          }
+      }
+
+  /* It must be the first time to invoke acquire_state.  */
+  dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
+  /* We don't check ERR here, since the initial state must not be NULL.  */
+  if (BE (dfa->init_state == NULL, 0))
+    {
+      re_node_set_free (&init_nodes);
+      return err;
+    }
+  if (dfa->init_state->has_constraint)
+    {
+      dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
+                                                       CONTEXT_WORD);
+      dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
+                                                     CONTEXT_NEWLINE);
+      dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
+                                                         &init_nodes,
+                                                         CONTEXT_NEWLINE
+                                                         | CONTEXT_BEGBUF);
+      if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
+              || dfa->init_state_begbuf == NULL, 0))
+        {
+          re_node_set_free (&init_nodes);
+          return err;
+        }
+    }
+  else
+    dfa->init_state_word = dfa->init_state_nl
+      = dfa->init_state_begbuf = dfa->init_state;
+
+  re_node_set_free (&init_nodes);
+  return REG_NOERROR;
+}
+
+#ifdef RE_ENABLE_I18N
+/* Try to switch a UTF-8 DFA to byte-wise searching.  When every node
+   can be handled that way, DFA->mb_cur_max becomes 1, is_utf8 is
+   cleared and the affected nodes are adjusted; otherwise the DFA is
+   left as it is.  */
+
+static void
+optimize_utf8 (dfa)
+     re_dfa_t *dfa;
+{
+  int idx, word;
+  int saw_mb_char = 0, saw_period = 0;
+
+  /* First pass: give up as soon as a node is found that cannot be
+     handled in a single byte locale.  */
+  for (idx = 0; idx < dfa->nodes_len; ++idx)
+    {
+      re_token_t *tok = dfa->nodes + idx;
+
+      switch (tok->type)
+        {
+        case CHARACTER:
+          if (tok->opr.c >= 0x80)
+            saw_mb_char = 1;
+          break;
+
+        case ANCHOR:
+          switch (tok->opr.idx)
+            {
+            case LINE_FIRST:
+            case LINE_LAST:
+            case BUF_FIRST:
+            case BUF_LAST:
+              break;
+            default:
+              /* Word anchors etc. cannot be handled.  */
+              return;
+            }
+          break;
+
+        case OP_PERIOD:
+          saw_period = 1;
+          break;
+
+        case OP_BACK_REF:
+        case OP_ALT:
+        case END_OF_RE:
+        case OP_DUP_ASTERISK:
+        case OP_OPEN_SUBEXP:
+        case OP_CLOSE_SUBEXP:
+          break;
+
+        case COMPLEX_BRACKET:
+          return;
+
+        case SIMPLE_BRACKET:
+          /* Just double check: no bit at or above 0x80 may be set.  */
+          for (word = 0x80 / UINT_BITS; word < BITSET_UINTS; ++word)
+            if (tok->opr.sbcset[word])
+              return;
+          break;
+
+        default:
+          abort ();
+        }
+    }
+
+  /* Second pass: patch the nodes that need it.  */
+  if (saw_mb_char || saw_period)
+    {
+      for (idx = 0; idx < dfa->nodes_len; ++idx)
+        {
+          re_token_t *tok = dfa->nodes + idx;
+
+          if (tok->type == CHARACTER && tok->opr.c >= 0x80)
+            tok->mb_partial = 0;
+          else if (tok->type == OP_PERIOD)
+            tok->type = OP_UTF8_PERIOD;
+        }
+    }
+
+  /* The search can be in single byte locale.  */
+  dfa->mb_cur_max = 1;
+  dfa->is_utf8 = 0;
+  dfa->has_mb_node = dfa->nbackref > 0 || saw_period;
+}
+#endif
+
+/* Analyze the structure tree held in PREG's DFA and compute "first",
+   "next", "edest", "eclosure" and, when required, "inveclosure".
+   Returns REG_NOERROR, REG_ESPACE on allocation failure, or the error
+   of the pass that failed.  */
+
+static reg_errcode_t
+analyze (preg)
+     regex_t *preg;
+{
+  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
+  reg_errcode_t ret;
+
+  /* Allocate the per-node arrays.  */
+  dfa->nexts = re_malloc (int, dfa->nodes_alloc);
+  dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
+  dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
+  dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
+  if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
+          || dfa->eclosures == NULL, 0))
+    return REG_ESPACE;
+
+  /* The subexpression map is optional: when it cannot be allocated the
+     optimization pass is simply skipped.  */
+  dfa->subexp_map = re_malloc (int, preg->re_nsub);
+  if (dfa->subexp_map != NULL)
+    {
+      int sub;
+
+      /* Start from the identity mapping.  */
+      for (sub = 0; sub < preg->re_nsub; sub++)
+        dfa->subexp_map[sub] = sub;
+
+      preorder (dfa->str_tree, optimize_subexps, dfa);
+
+      /* Find the first entry the pass changed, if any.  */
+      sub = 0;
+      while (sub < preg->re_nsub && dfa->subexp_map[sub] == sub)
+        sub++;
+
+      /* Still the identity: the map carries no information.  */
+      if (sub == preg->re_nsub)
+        {
+          free (dfa->subexp_map);
+          dfa->subexp_map = NULL;
+        }
+    }
+
+  ret = postorder (dfa->str_tree, lower_subexps, preg);
+  if (BE (ret != REG_NOERROR, 0))
+    return ret;
+
+  ret = postorder (dfa->str_tree, calc_first, dfa);
+  if (BE (ret != REG_NOERROR, 0))
+    return ret;
+
+  preorder (dfa->str_tree, calc_next, dfa);
+
+  ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
+  if (BE (ret != REG_NOERROR, 0))
+    return ret;
+
+  ret = calc_eclosure (dfa);
+  if (BE (ret != REG_NOERROR, 0))
+    return ret;
+
+  /* We only need this during the prune_impossible_nodes pass in regexec.c;
+     skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
+  if (!((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
+        || dfa->nbackref))
+    return ret;
+
+  dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
+  if (BE (dfa->inveclosures == NULL, 0))
+    return REG_ESPACE;
+
+  return calc_inveclosure (dfa);
+}
+
+/* Our parse trees are very unbalanced, so we cannot use a stack to
+ implement parse tree visits. Instead, we use parent pointers and
+ some hairy code in these two functions. */
+/* Visit the tree rooted at ROOT in postorder (children before their
+ parent), calling FN (EXTRA, node) on every node. The walk stops and
+ returns FN's value as soon as FN returns something other than
+ REG_NOERROR. Otherwise REG_NOERROR is returned right after a node
+ whose parent pointer is NULL has been visited. */
+static reg_errcode_t
+postorder (root, fn, extra)
+ bin_tree_t *root;
+ reg_errcode_t (fn (void *, bin_tree_t *));
+ void *extra;
+{
+ bin_tree_t *node, *prev;
+
+ for (node = root; ; )
+ {
+ /* Descend down the tree, preferably to the left (or to the right
+ if that's the only child). */
+ while (node->left || node->right)
+ if (node->left)
+ node = node->left;
+ else
+ node = node->right;
+
+ /* NODE is now a leaf. Visit it, then keep visiting ancestors for
+ as long as their subtrees are complete. */
+ do
+ {
+ reg_errcode_t err = fn (extra, node);
+ if (BE (err != REG_NOERROR, 0))
+ return err;
+ if (node->parent == NULL)
+ return REG_NOERROR;
+ prev = node;
+ node = node->parent;
+ }
+ /* Go up while we have a node that is reached from the right. */
+ while (node->right == prev || node->right == NULL);
+ /* We came up from the left child and a right child exists: its
+ subtree is visited next, before NODE itself. */
+ node = node->right;
+ }
+}
+
+/* Visit the tree rooted at ROOT in preorder (a node before its
+ children), calling FN (EXTRA, node) on every node. Returns the first
+ value other than REG_NOERROR that FN returns, or REG_NOERROR once the
+ walk has climbed past a node whose parent pointer is NULL. */
+static reg_errcode_t
+preorder (root, fn, extra)
+ bin_tree_t *root;
+ reg_errcode_t (fn (void *, bin_tree_t *));
+ void *extra;
+{
+ bin_tree_t *node;
+
+ for (node = root; ; )
+ {
+ reg_errcode_t err = fn (extra, node);
+ if (BE (err != REG_NOERROR, 0))
+ return err;
+
+ /* Go to the left node, or up and to the right. */
+ if (node->left)
+ node = node->left;
+ else
+ {
+ /* PREV is the child we last climbed out of; NULL on the first
+ test so that NODE's own right child is considered. */
+ bin_tree_t *prev = NULL;
+ while (node->right == prev || node->right == NULL)
+ {
+ prev = node;
+ node = node->parent;
+ if (!node)
+ return REG_NOERROR;
+ }
+ node = node->right;
+ }
+ }
+}
+
+/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
+   re_search_internal to map the inner one's opr.idx to this one's.  Adjust
+   backreferences as well.  Requires a preorder visit.
+
+   EXTRA is the re_dfa_t being built and NODE the tree node being visited.
+   The SUBEXP branch dereferences DFA->subexp_map without testing it;
+   the caller visible in this file (analyze) only runs this pass after
+   allocating the map.  Always returns REG_NOERROR.  */
+static reg_errcode_t
+optimize_subexps (extra, node)
+     void *extra;
+     bin_tree_t *node;
+{
+  re_dfa_t *dfa = (re_dfa_t *) extra;
+
+  if (node->token.type == OP_BACK_REF && dfa->subexp_map)
+    {
+      /* Redirect the back-reference through the map and record that
+         the resulting group is referenced.  */
+      int idx = node->token.opr.idx;
+      node->token.opr.idx = dfa->subexp_map[idx];
+      dfa->used_bkref_map |= 1u << node->token.opr.idx;
+    }
+
+  else if (node->token.type == SUBEXP
+           && node->left && node->left->token.type == SUBEXP)
+    {
+      int other_idx = node->left->token.opr.idx;
+
+      /* Splice the inner SUBEXP out of the tree.  */
+      node->left = node->left->left;
+      if (node->left)
+        node->left->parent = node;
+
+      dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
+      /* Shift an unsigned one: OTHER_IDX may be the top bit of the
+         word, where `1 << other_idx' overflows a signed int.  */
+      if (other_idx < 8 * sizeof (dfa->used_bkref_map))
+        dfa->used_bkref_map &= ~(1u << other_idx);
+    }
+
+  return REG_NOERROR;
+}
+
+/* Lowering pass: replace each SUBEXP child of NODE by the concatenation
+   built by lower_subexp (OP_OPEN_SUBEXP, the body of the SUBEXP if any,
+   and OP_CLOSE_SUBEXP).  EXTRA is the regex_t being compiled.  Returns
+   the error code left by lower_subexp, REG_NOERROR if none.  */
+static reg_errcode_t
+lower_subexps (extra, node)
+     void *extra;
+     bin_tree_t *node;
+{
+  regex_t *preg = (regex_t *) extra;
+  reg_errcode_t err = REG_NOERROR;
+  bin_tree_t *child;
+
+  child = node->left;
+  if (child != NULL && child->token.type == SUBEXP)
+    {
+      node->left = lower_subexp (&err, preg, child);
+      if (node->left != NULL)
+        node->left->parent = node;
+    }
+
+  child = node->right;
+  if (child != NULL && child->token.type == SUBEXP)
+    {
+      node->right = lower_subexp (&err, preg, child);
+      if (node->right != NULL)
+        node->right->parent = node;
+    }
+
+  return err;
+}
+
+/* Lower the single SUBEXP node NODE.  Returns the tree that replaces it:
+   either the body alone, when the group does not have to be recorded,
+   or CONCAT (OP_OPEN_SUBEXP, CONCAT (body, OP_CLOSE_SUBEXP)).  On
+   allocation failure *ERR is set to REG_ESPACE and NULL is returned.  */
+static bin_tree_t *
+lower_subexp (err, preg, node)
+     reg_errcode_t *err;
+     regex_t *preg;
+     bin_tree_t *node;
+{
+  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
+  bin_tree_t *body = node->left;
+  bin_tree_t *op, *cls, *tree1, *tree;
+
+  if (preg->no_sub
+      /* We do not optimize empty subexpressions, because otherwise we may
+         have bad CONCAT nodes with NULL children.  This is obviously not
+         very common, so we do not lose much.  An example that triggers
+         this case is the sed "script" /\(\)/x.  */
+      && node->left != NULL
+      /* Shift an unsigned one: the index may be the top bit of the
+         word, where `1 << idx' overflows a signed int.  */
+      && (node->token.opr.idx >= 8 * sizeof (dfa->used_bkref_map)
+          || !(dfa->used_bkref_map & (1u << node->token.opr.idx))))
+    return node->left;
+
+  /* Convert the SUBEXP node to the concatenation of an
+     OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
+  op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
+  cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
+  tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
+  tree = create_tree (dfa, op, tree1, CONCAT);
+  if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
+    {
+      *err = REG_ESPACE;
+      return NULL;
+    }
+
+  op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
+  op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
+  return tree;
+}
+
+/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
+   nodes.  Requires a postorder visit.  EXTRA is the re_dfa_t.  Returns
+   REG_ESPACE when a node cannot be added, REG_NOERROR otherwise.  */
+static reg_errcode_t
+calc_first (extra, node)
+     void *extra;
+     bin_tree_t *node;
+{
+  re_dfa_t *dfa = (re_dfa_t *) extra;
+
+  if (node->token.type != CONCAT)
+    {
+      /* Every non-CONCAT tree node gets an automaton node of its own
+         and is its own FIRST.  */
+      node->first = node;
+      node->node_idx = re_dfa_add_node (dfa, node->token);
+      if (BE (node->node_idx == -1, 0))
+        return REG_ESPACE;
+      return REG_NOERROR;
+    }
+
+  /* A concatenation starts where its left operand starts.  */
+  node->first = node->left->first;
+  node->node_idx = node->left->node_idx;
+  return REG_NOERROR;
+}
+
+/* Pass 2: compute NEXT on the tree.  Preorder visit.  EXTRA is not
+   used.  Always returns REG_NOERROR.  */
+static reg_errcode_t
+calc_next (extra, node)
+     void *extra;
+     bin_tree_t *node;
+{
+  re_token_type_t kind = node->token.type;
+
+  if (kind == OP_DUP_ASTERISK)
+    {
+      /* The repeated operand leads back to the star itself.  */
+      node->left->next = node;
+    }
+  else if (kind == CONCAT)
+    {
+      node->left->next = node->right->first;
+      node->right->next = node->next;
+    }
+  else
+    {
+      /* Any other node hands its own NEXT down to its children.  */
+      if (node->left)
+        node->left->next = node->next;
+      if (node->right)
+        node->right->next = node->next;
+    }
+
+  return REG_NOERROR;
+}
+
+/* Pass 3: link all DFA nodes to their NEXT node (any order will do).
+   EXTRA is the re_dfa_t.  Fills DFA->edests for epsilon nodes and
+   DFA->nexts for the others.  Returns the error of the node set
+   initialization that failed, or REG_NOERROR.  */
+static reg_errcode_t
+link_nfa_nodes (extra, node)
+     void *extra;
+     bin_tree_t *node;
+{
+  re_dfa_t *dfa = (re_dfa_t *) extra;
+  int idx = node->node_idx;
+  reg_errcode_t err = REG_NOERROR;
+
+  switch (node->token.type)
+    {
+    case CONCAT:
+      break;
+
+    case END_OF_RE:
+      assert (node->next == NULL);
+      break;
+
+    case OP_DUP_ASTERISK:
+    case OP_ALT:
+      {
+        /* Two epsilon destinations; a missing operand falls through
+           to the NEXT node.  */
+        int left, right;
+        dfa->has_plural_match = 1;
+        if (node->left != NULL)
+          left = node->left->first->node_idx;
+        else
+          left = node->next->node_idx;
+        if (node->right != NULL)
+          right = node->right->first->node_idx;
+        else
+          right = node->next->node_idx;
+        assert (left > -1);
+        assert (right > -1);
+        err = re_node_set_init_2 (dfa->edests + idx, left, right);
+      }
+      break;
+
+    case ANCHOR:
+    case OP_OPEN_SUBEXP:
+    case OP_CLOSE_SUBEXP:
+      err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
+      break;
+
+    case OP_BACK_REF:
+      /* A back-reference gets both a NEXT and an epsilon destination.
+         Propagate a failure of the node set initialization, as the
+         other cases do, instead of dropping it.  */
+      dfa->nexts[idx] = node->next->node_idx;
+      err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
+      break;
+
+    default:
+      assert (!IS_EPSILON_NODE (node->token.type));
+      dfa->nexts[idx] = node->next->node_idx;
+      break;
+    }
+
+  return err;
+}
+
+/* Duplicate the epsilon closure of the node ROOT_NODE.
+ Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
+ to their own constraint.
+
+ The walk starts at the original node TOP_ORG_NODE, whose copy is
+ TOP_CLONE_NODE, and follows the epsilon destinations of the original
+ nodes while filling in edests/nexts of the copies. Reaching ROOT_NODE
+ again through an ANCHOR is treated as a loop and ends the walk.
+ Returns REG_NOERROR on success, REG_ESPACE when a node set insertion
+ fails, or the error returned by duplicate_node. */
+
+static reg_errcode_t
+duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
+ init_constraint)
+ re_dfa_t *dfa;
+ int top_org_node, top_clone_node, root_node;
+ unsigned int init_constraint;
+{
+ reg_errcode_t err;
+ int org_node, clone_node, ret;
+ unsigned int constraint = init_constraint;
+ /* Each iteration handles one (original, copy) pair and then moves on
+ to the last destination pair it created. */
+ for (org_node = top_org_node, clone_node = top_clone_node;;)
+ {
+ int org_dest, clone_dest;
+ if (dfa->nodes[org_node].type == OP_BACK_REF)
+ {
+ /* If the back reference epsilon-transit, its destination must
+ also have the constraint. Then duplicate the epsilon closure
+ of the destination of the back reference, and store it in
+ edests of the back reference. */
+ org_dest = dfa->nexts[org_node];
+ re_node_set_empty (dfa->edests + clone_node);
+ err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
+ if (BE (err != REG_NOERROR, 0))
+ return err;
+ dfa->nexts[clone_node] = dfa->nexts[org_node];
+ ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
+ if (BE (ret < 0, 0))
+ return REG_ESPACE;
+ }
+ else if (dfa->edests[org_node].nelem == 0)
+ {
+ /* In case of the node can't epsilon-transit, don't duplicate the
+ destination and store the original destination as the
+ destination of the node. */
+ dfa->nexts[clone_node] = dfa->nexts[org_node];
+ break;
+ }
+ else if (dfa->edests[org_node].nelem == 1)
+ {
+ /* In case of the node can epsilon-transit, and it has only one
+ destination. */
+ org_dest = dfa->edests[org_node].elems[0];
+ re_node_set_empty (dfa->edests + clone_node);
+ if (dfa->nodes[org_node].type == ANCHOR)
+ {
+ /* In case of the node has another constraint, append it. */
+ if (org_node == root_node && clone_node != org_node)
+ {
+ /* ...but if the node is root_node itself, it means the
+ epsilon closure have a loop, then tie it to the
+ destination of the root_node. */
+ ret = re_node_set_insert (dfa->edests + clone_node,
+ org_dest);
+ if (BE (ret < 0, 0))
+ return REG_ESPACE;
+ break;
+ }
+ constraint |= dfa->nodes[org_node].opr.ctx_type;
+ }
+ err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
+ if (BE (err != REG_NOERROR, 0))
+ return err;
+ ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
+ if (BE (ret < 0, 0))
+ return REG_ESPACE;
+ }
+ else /* dfa->edests[org_node].nelem == 2 */
+ {
+ /* In case of the node can epsilon-transit, and it has two
+ destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
+ org_dest = dfa->edests[org_node].elems[0];
+ re_node_set_empty (dfa->edests + clone_node);
+ /* Search for a duplicated node which satisfies the constraint. */
+ clone_dest = search_duplicated_node (dfa, org_dest, constraint);
+ if (clone_dest == -1)
+ {
+ /* There are no such a duplicated node, create a new one. */
+ err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
+ if (BE (err != REG_NOERROR, 0))
+ return err;
+ ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
+ if (BE (ret < 0, 0))
+ return REG_ESPACE;
+ /* The first destination is handled recursively; the second
+ one below is continued by this loop. */
+ err = duplicate_node_closure (dfa, org_dest, clone_dest,
+ root_node, constraint);
+ if (BE (err != REG_NOERROR, 0))
+ return err;
+ }
+ else
+ {
+ /* There are a duplicated node which satisfy the constraint,
+ use it to avoid infinite loop. */
+ ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
+ if (BE (ret < 0, 0))
+ return REG_ESPACE;
+ }
+
+ org_dest = dfa->edests[org_node].elems[1];
+ err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
+ if (BE (err != REG_NOERROR, 0))
+ return err;
+ ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
+ if (BE (ret < 0, 0))
+ return REG_ESPACE;
+ }
+ /* Continue with the destination pair that was set up last. */
+ org_node = org_dest;
+ clone_node = clone_dest;
+ }
+ return REG_NOERROR;
+}
+
+/* Search for a node which is duplicated from the node ORG_NODE, and
+   satisfies the constraint CONSTRAINT.  Only the run of duplicated
+   nodes at the end of DFA->nodes is examined, from the last node
+   downwards; node 0 is never considered.  Returns the index of the
+   matching node, or -1 if there is none.  */
+
+static int
+search_duplicated_node (dfa, org_node, constraint)
+     re_dfa_t *dfa;
+     int org_node;
+     unsigned int constraint;
+{
+  int idx;
+  /* Test the index before using it, so that DFA->nodes is never read
+     at index -1 when the node array is empty.  */
+  for (idx = dfa->nodes_len - 1; idx > 0 && dfa->nodes[idx].duplicated; --idx)
+    {
+      if (org_node == dfa->org_indices[idx]
+          && constraint == dfa->nodes[idx].constraint)
+        return idx; /* Found.  */
+    }
+  return -1; /* Not found.  */
+}
+
+/* Append to DFA a copy of the node whose index is ORG_IDX and give the
+   copy the constraint CONSTRAINT (combined with the node's own context
+   type when it is an ANCHOR).  The index of the copy is stored in
+   *NEW_IDX and REG_NOERROR is returned; REG_ESPACE is returned when the
+   node cannot be added.  */
+
+static reg_errcode_t
+duplicate_node (new_idx, dfa, org_idx, constraint)
+     re_dfa_t *dfa;
+     int *new_idx, org_idx;
+     unsigned int constraint;
+{
+  re_token_t *copy;
+  int copy_idx;
+
+  copy_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
+  if (BE (copy_idx == -1, 0))
+    return REG_ESPACE;
+
+  /* Take the pointer only now, after the node has been added.  */
+  copy = dfa->nodes + copy_idx;
+  copy->constraint = constraint;
+  if (dfa->nodes[org_idx].type == ANCHOR)
+    copy->constraint |= dfa->nodes[org_idx].opr.ctx_type;
+  copy->duplicated = 1;
+
+  /* Remember which node the copy was made from.  */
+  dfa->org_indices[copy_idx] = org_idx;
+  *new_idx = copy_idx;
+  return REG_NOERROR;
+}
+
+/* Build DFA->inveclosures from DFA->eclosures: node SRC is appended to
+   the inverse closure of every node found in SRC's epsilon closure.
+   Returns REG_ESPACE when an insertion fails, REG_NOERROR otherwise.  */
+static reg_errcode_t
+calc_inveclosure (dfa)
+     re_dfa_t *dfa;
+{
+  int src, pos;
+
+  /* Start with every inverse closure empty.  */
+  for (pos = 0; pos < dfa->nodes_len; ++pos)
+    re_node_set_init_empty (dfa->inveclosures + pos);
+
+  for (src = 0; src < dfa->nodes_len; ++src)
+    {
+      int *members = dfa->eclosures[src].elems;
+
+      for (pos = 0; pos < dfa->eclosures[src].nelem; ++pos)
+        {
+          int status = re_node_set_insert_last (dfa->inveclosures
+                                                + members[pos], src);
+          if (BE (status == -1, 0))
+            return REG_ESPACE;
+        }
+    }
+
+  return REG_NOERROR;
+}
+
+/* Calculate "eclosure" for all the node in DFA.
+
+ The node array is scanned repeatedly: a pass that leaves at least one
+ closure uncomputed sets INCOMPLETE, and the scan starts again from
+ node 0 until a whole pass completes without doing so. Returns
+ REG_NOERROR, or the error reported by calc_eclosure_iter. */
+
+static reg_errcode_t
+calc_eclosure (dfa)
+ re_dfa_t *dfa;
+{
+ int node_idx, incomplete;
+#ifdef DEBUG
+ assert (dfa->nodes_len > 0);
+#endif
+ incomplete = 0;
+ /* For each nodes, calculate epsilon closure. */
+ for (node_idx = 0; ; ++node_idx)
+ {
+ reg_errcode_t err;
+ re_node_set eclosure_elem;
+ /* End of a pass: stop if everything is computed, else rescan. */
+ if (node_idx == dfa->nodes_len)
+ {
+ if (!incomplete)
+ break;
+ incomplete = 0;
+ node_idx = 0;
+ }
+
+#ifdef DEBUG
+ assert (dfa->eclosures[node_idx].nelem != -1);
+#endif
+
+ /* If we have already calculated, skip it. */
+ if (dfa->eclosures[node_idx].nelem != 0)
+ continue;
+ /* Calculate epsilon closure of `node_idx'. */
+ err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
+ if (BE (err != REG_NOERROR, 0))
+ return err;
+
+ /* The closure was not stored for this node: release the returned
+ set and remember that another pass is needed. */
+ if (dfa->eclosures[node_idx].nelem == 0)
+ {
+ incomplete = 1;
+ re_node_set_free (&eclosure_elem);
+ }
+ }
+ return REG_NOERROR;
+}
+
+/* Calculate epsilon closure of NODE.
+
+   The closure is returned in *NEW_SET.  It is also stored in
+   DFA->eclosures[NODE], unless it is incomplete and ROOT is zero; in
+   that case DFA->eclosures[NODE].nelem is reset to 0 so that the node
+   is computed again later.  While the computation is in progress
+   DFA->eclosures[NODE].nelem is -1, which is how loops are detected.
+   Returns REG_NOERROR on success, REG_ESPACE when a node set operation
+   fails, or the error of the helper that failed.  */
+
+static reg_errcode_t
+calc_eclosure_iter (new_set, dfa, node, root)
+     re_node_set *new_set;
+     re_dfa_t *dfa;
+     int node, root;
+{
+  reg_errcode_t err;
+  unsigned int constraint;
+  int i, incomplete;
+  re_node_set eclosure;
+  incomplete = 0;
+  err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
+  if (BE (err != REG_NOERROR, 0))
+    return err;
+
+  /* This indicates that we are calculating this node now.
+     We reference this value to avoid infinite loop.  */
+  dfa->eclosures[node].nelem = -1;
+
+  constraint = ((dfa->nodes[node].type == ANCHOR)
+                ? dfa->nodes[node].opr.ctx_type : 0);
+  /* If the current node has constraints, duplicate all nodes.
+     Since they must inherit the constraints.  */
+  if (constraint
+      && dfa->edests[node].nelem
+      && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
+    {
+      err = duplicate_node_closure (dfa, node, node, node, constraint);
+      if (BE (err != REG_NOERROR, 0))
+        return err;
+    }
+
+  /* Expand each epsilon destination nodes.  */
+  if (IS_EPSILON_NODE (dfa->nodes[node].type))
+    for (i = 0; i < dfa->edests[node].nelem; ++i)
+      {
+        re_node_set eclosure_elem;
+        int edest = dfa->edests[node].elems[i];
+        /* If calculating the epsilon closure of `edest' is in progress,
+           return intermediate result.  */
+        if (dfa->eclosures[edest].nelem == -1)
+          {
+            incomplete = 1;
+            continue;
+          }
+        /* If we haven't calculated the epsilon closure of `edest' yet,
+           calculate now.  Otherwise use calculated epsilon closure.  */
+        if (dfa->eclosures[edest].nelem == 0)
+          {
+            err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
+            if (BE (err != REG_NOERROR, 0))
+              return err;
+          }
+        else
+          eclosure_elem = dfa->eclosures[edest];
+        /* Merge the epsilon closure of `edest'.  A failed merge would
+           silently produce a truncated closure, so report it.  */
+        if (BE (re_node_set_merge (&eclosure, &eclosure_elem)
+                != REG_NOERROR, 0))
+          return REG_ESPACE;
+        /* If the epsilon closure of `edest' is incomplete,
+           the epsilon closure of this node is also incomplete.  */
+        if (dfa->eclosures[edest].nelem == 0)
+          {
+            incomplete = 1;
+            re_node_set_free (&eclosure_elem);
+          }
+      }
+
+  /* Epsilon closures include itself.  A negative result means the
+     insertion failed, as for the other callers in this file.  */
+  if (BE (re_node_set_insert (&eclosure, node) < 0, 0))
+    return REG_ESPACE;
+  if (incomplete && !root)
+    dfa->eclosures[node].nelem = 0;
+  else
+    dfa->eclosures[node] = eclosure;
+  *new_set = eclosure;
+  return REG_NOERROR;
+}
+
+/* Functions for token which are used in the parser.  */
+
+/* Read the next token of INPUT into RESULT and advance INPUT past it.
+   SYNTAX selects the syntax used to classify the token.
+   We must not use this function inside bracket expressions.  */
+
+static void
+fetch_token (result, input, syntax)
+     re_token_t *result;
+     re_string_t *input;
+     reg_syntax_t syntax;
+{
+  /* peek_token reports how many bytes the token occupies.  */
+  int consumed = peek_token (result, input, syntax);
+
+  re_string_skip_bytes (input, consumed);
+}
+
+/* Peek a token from INPUT, and return the length of the token.
+ We must not use this function inside bracket expressions.
+
+ The token is stored in *TOKEN; SYNTAX decides which characters and
+ backslash sequences act as operators. The return value is 0 at the
+ end of the input, 2 for a backslash followed by another byte, and 1
+ in every other case. */
+
+static int
+peek_token (token, input, syntax)
+ re_token_t *token;
+ re_string_t *input;
+ reg_syntax_t syntax;
+{
+ unsigned char c;
+
+ if (re_string_eoi (input))
+ {
+ token->type = END_OF_RE;
+ return 0;
+ }
+
+ c = re_string_peek_byte (input, 0);
+ token->opr.c = c;
+
+ token->word_char = 0;
+#ifdef RE_ENABLE_I18N
+ token->mb_partial = 0;
+ /* A byte that is not the first byte of a character is returned as a
+ plain CHARACTER flagged mb_partial. */
+ if (input->mb_cur_max > 1 &&
+ !re_string_first_byte (input, re_string_cur_idx (input)))
+ {
+ token->type = CHARACTER;
+ token->mb_partial = 1;
+ return 1;
+ }
+#endif
+ if (c == '\\')
+ {
+ unsigned char c2;
+ /* A backslash that is the last byte of the input. */
+ if (re_string_cur_idx (input) + 1 >= re_string_length (input))
+ {
+ token->type = BACK_SLASH;
+ return 1;
+ }
+
+ /* Default for an escaped byte: an ordinary CHARACTER. The switch
+ below overrides the type for the operators SYNTAX enables. */
+ c2 = re_string_peek_byte_case (input, 1);
+ token->opr.c = c2;
+ token->type = CHARACTER;
+#ifdef RE_ENABLE_I18N
+ if (input->mb_cur_max > 1)
+ {
+ wint_t wc = re_string_wchar_at (input,
+ re_string_cur_idx (input) + 1);
+ token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
+ }
+ else
+#endif
+ token->word_char = IS_WORD_CHAR (c2) != 0;
+
+ switch (c2)
+ {
+ case '|':
+ if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
+ token->type = OP_ALT;
+ break;
+ case '1': case '2': case '3': case '4': case '5':
+ case '6': case '7': case '8': case '9':
+ if (!(syntax & RE_NO_BK_REFS))
+ {
+ /* Back-reference indices are zero based: "\1" is index 0. */
+ token->type = OP_BACK_REF;
+ token->opr.idx = c2 - '1';
+ }
+ break;
+ case '<':
+ if (!(syntax & RE_NO_GNU_OPS))
+ {
+ token->type = ANCHOR;
+ token->opr.ctx_type = WORD_FIRST;
+ }
+ break;
+ case '>':
+ if (!(syntax & RE_NO_GNU_OPS))
+ {
+ token->type = ANCHOR;
+ token->opr.ctx_type = WORD_LAST;
+ }
+ break;
+ case 'b':
+ if (!(syntax & RE_NO_GNU_OPS))
+ {
+ token->type = ANCHOR;
+ token->opr.ctx_type = WORD_DELIM;
+ }
+ break;
+ case 'B':
+ if (!(syntax & RE_NO_GNU_OPS))
+ {
+ token->type = ANCHOR;
+ token->opr.ctx_type = NOT_WORD_DELIM;
+ }
+ break;
+ case 'w':
+ if (!(syntax & RE_NO_GNU_OPS))
+ token->type = OP_WORD;
+ break;
+ case 'W':
+ if (!(syntax & RE_NO_GNU_OPS))
+ token->type = OP_NOTWORD;
+ break;
+ case 's':
+ if (!(syntax & RE_NO_GNU_OPS))
+ token->type = OP_SPACE;
+ break;
+ case 'S':
+ if (!(syntax & RE_NO_GNU_OPS))
+ token->type = OP_NOTSPACE;
+ break;
+ case '`':
+ if (!(syntax & RE_NO_GNU_OPS))
+ {
+ token->type = ANCHOR;
+ token->opr.ctx_type = BUF_FIRST;
+ }
+ break;
+ case '\'':
+ if (!(syntax & RE_NO_GNU_OPS))
+ {
+ token->type = ANCHOR;
+ token->opr.ctx_type = BUF_LAST;
+ }
+ break;
+ case '(':
+ if (!(syntax & RE_NO_BK_PARENS))
+ token->type = OP_OPEN_SUBEXP;
+ break;
+ case ')':
+ if (!(syntax & RE_NO_BK_PARENS))
+ token->type = OP_CLOSE_SUBEXP;
+ break;
+ case '+':
+ if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
+ token->type = OP_DUP_PLUS;
+ break;
+ case '?':
+ if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
+ token->type = OP_DUP_QUESTION;
+ break;
+ case '{':
+ if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
+ token->type = OP_OPEN_DUP_NUM;
+ break;
+ case '}':
+ if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
+ token->type = OP_CLOSE_DUP_NUM;
+ break;
+ default:
+ break;
+ }
+ return 2;
+ }
+
+ /* Not a backslash: an ordinary CHARACTER unless the switch below
+ recognizes an unescaped operator. */
+ token->type = CHARACTER;
+#ifdef RE_ENABLE_I18N
+ if (input->mb_cur_max > 1)
+ {
+ wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
+ token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
+ }
+ else
+#endif
+ token->word_char = IS_WORD_CHAR (token->opr.c);
+
+ switch (c)
+ {
+ case '\n':
+ if (syntax & RE_NEWLINE_ALT)
+ token->type = OP_ALT;
+ break;
+ case '|':
+ if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
+ token->type = OP_ALT;
+ break;
+ case '*':
+ token->type = OP_DUP_ASTERISK;
+ break;
+ case '+':
+ if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
+ token->type = OP_DUP_PLUS;
+ break;
+ case '?':
+ if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
+ token->type = OP_DUP_QUESTION;
+ break;
+ case '{':
+ if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
+ token->type = OP_OPEN_DUP_NUM;
+ break;
+ case '}':
+ if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
+ token->type = OP_CLOSE_DUP_NUM;
+ break;
+ case '(':
+ if (syntax & RE_NO_BK_PARENS)
+ token->type = OP_OPEN_SUBEXP;
+ break;
+ case ')':
+ if (syntax & RE_NO_BK_PARENS)
+ token->type = OP_CLOSE_SUBEXP;
+ break;
+ case '[':
+ token->type = OP_OPEN_BRACKET;
+ break;
+ case '.':
+ token->type = OP_PERIOD;
+ break;
+ case '^':
+ /* Away from the start of the pattern, and unless SYNTAX makes
+ '^' an anchor everywhere, it is an anchor only right after a
+ newline that acts as an alternation. */
+ if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
+ re_string_cur_idx (input) != 0)
+ {
+ char prev = re_string_peek_byte (input, -1);
+ if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
+ break;
+ }
+ token->type = ANCHOR;
+ token->opr.ctx_type = LINE_FIRST;
+ break;
+ case '$':
+ /* Before the end of the pattern, and unless SYNTAX makes '$' an
+ anchor everywhere, it is an anchor only when the token after
+ it is an alternation or closes a group. The input position
+ is moved forward for the look-ahead and then restored. */
+ if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
+ re_string_cur_idx (input) + 1 != re_string_length (input))
+ {
+ re_token_t next;
+ re_string_skip_bytes (input, 1);
+ peek_token (&next, input, syntax);
+ re_string_skip_bytes (input, -1);
+ if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
+ break;
+ }
+ token->type = ANCHOR;
+ token->opr.ctx_type = LINE_LAST;
+ break;
+ default:
+ break;
+ }
+ return 1;
+}
+
+/* Examine the next token of INPUT while inside a bracket expression and
+   describe it in TOKEN.  Return the number of bytes the token occupies
+   (0 at the end of the input).  This function must only be used inside
+   bracket expressions.  Note that for a backslash escape INPUT is moved
+   past the backslash and the returned length covers the escaped byte.  */
+
+static int
+peek_token_bracket (token, input, syntax)
+     re_token_t *token;
+     re_string_t *input;
+     reg_syntax_t syntax;
+{
+  unsigned char cur;
+  int has_next;
+
+  if (re_string_eoi (input))
+    {
+      token->type = END_OF_RE;
+      return 0;
+    }
+
+  cur = re_string_peek_byte (input, 0);
+  token->opr.c = cur;
+
+#ifdef RE_ENABLE_I18N
+  /* A byte that does not start a multibyte character is always literal.  */
+  if (input->mb_cur_max > 1
+      && !re_string_first_byte (input, re_string_cur_idx (input)))
+    {
+      token->type = CHARACTER;
+      return 1;
+    }
+#endif /* RE_ENABLE_I18N */
+
+  /* Is there at least one more byte after the current one?  */
+  has_next = re_string_cur_idx (input) + 1 < re_string_length (input);
+
+  if (cur == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && has_next)
+    {
+      /* In this case, '\' escapes the character that follows it.  */
+      re_string_skip_bytes (input, 1);
+      token->opr.c = re_string_peek_byte (input, 0);
+      token->type = CHARACTER;
+      return 1;
+    }
+
+  if (cur == '[')
+    {
+      /* '[' is special inside a bracket expression: it may open
+         "[.", "[=" or "[:".  */
+      unsigned char next = has_next ? re_string_peek_byte (input, 1) : 0;
+
+      if (next == '.')
+        token->type = OP_OPEN_COLL_ELEM;
+      else if (next == '=')
+        token->type = OP_OPEN_EQUIV_CLASS;
+      else if (next == ':' && (syntax & RE_CHAR_CLASSES))
+        token->type = OP_OPEN_CHAR_CLASS;
+      else
+        {
+          /* Just an ordinary '['.  */
+          token->type = CHARACTER;
+          token->opr.c = cur;
+          return 1;
+        }
+      token->opr.c = next;
+      return 2;
+    }
+
+  if (cur == '-')
+    token->type = OP_CHARSET_RANGE;
+  else if (cur == ']')
+    token->type = OP_CLOSE_BRACKET;
+  else if (cur == '^')
+    token->type = OP_NON_MATCH_LIST;
+  else
+    token->type = CHARACTER;
+  return 1;
+}
+
+/* Functions for parser. */
+
+/* Entry point of the parser.
+   Parse the regular expression REGEXP and return the structure tree.
+   If an error occurs, *ERR is set to the error code and NULL is returned.
+   This function builds the following tree from regular expression <reg_exp>:
+           CAT
+           / \
+          /   \
+   <reg_exp>  EOR
+
+   CAT means concatenation.
+   EOR means end of regular expression.
+   When parse_reg_exp yields no tree, the result is the EOR node alone.  */
+
+static bin_tree_t *
+parse (regexp, preg, syntax, err)
+     re_string_t *regexp;
+     regex_t *preg;
+     reg_syntax_t syntax;
+     reg_errcode_t *err;
+{
+  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
+  re_token_t lookahead;
+  bin_tree_t *body, *end_marker, *result;
+
+  dfa->syntax = syntax;
+
+  /* The first token is fetched with RE_CARET_ANCHORS_HERE added, so a
+     leading '^' is taken as an anchor.  */
+  fetch_token (&lookahead, regexp, syntax | RE_CARET_ANCHORS_HERE);
+  body = parse_reg_exp (regexp, preg, &lookahead, syntax, 0, err);
+  if (BE (*err != REG_NOERROR && body == NULL, 0))
+    return NULL;
+
+  end_marker = create_tree (dfa, NULL, NULL, END_OF_RE);
+  result = (body == NULL
+            ? end_marker
+            : create_tree (dfa, body, end_marker, CONCAT));
+  if (BE (end_marker == NULL || result == NULL, 0))
+    {
+      *err = REG_ESPACE;
+      return NULL;
+    }
+  return result;
+}
+
+/* Parse a sequence of branches separated by the alternation operator.
+   This function builds the following tree from the regular expression
+   <branch1>|<branch2>:
+           ALT
+           / \
+          /   \
+   <branch1> <branch2>
+
+   ALT means alternative, which represents the operator `|'.
+   An alternative that is empty (followed directly by another `|', the
+   end of the pattern, or a closing parenthesis when NEST is nonzero) is
+   represented by a NULL right child.  On error NULL is returned and
+   *ERR is set.  */
+
+static bin_tree_t *
+parse_reg_exp (regexp, preg, token, syntax, nest, err)
+     re_string_t *regexp;
+     regex_t *preg;
+     re_token_t *token;
+     reg_syntax_t syntax;
+     int nest;
+     reg_errcode_t *err;
+{
+  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
+  bin_tree_t *tree;
+
+  tree = parse_branch (regexp, preg, token, syntax, nest, err);
+  if (BE (*err != REG_NOERROR && tree == NULL, 0))
+    return NULL;
+
+  while (token->type == OP_ALT)
+    {
+      bin_tree_t *rhs = NULL;
+      int rhs_is_empty;
+
+      fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
+      rhs_is_empty = (token->type == OP_ALT
+                      || token->type == END_OF_RE
+                      || (nest != 0 && token->type == OP_CLOSE_SUBEXP));
+      if (!rhs_is_empty)
+        {
+          rhs = parse_branch (regexp, preg, token, syntax, nest, err);
+          if (BE (*err != REG_NOERROR && rhs == NULL, 0))
+            return NULL;
+        }
+
+      tree = create_tree (dfa, tree, rhs, OP_ALT);
+      if (BE (tree == NULL, 0))
+        {
+          *err = REG_ESPACE;
+          return NULL;
+        }
+    }
+  return tree;
+}
+
+/* Parse a run of consecutive expressions (one branch of an alternation).
+   This function builds the following tree from the regular expression
+   <exp1><exp2>:
+        CAT
+        / \
+       /   \
+   <exp1> <exp2>
+
+   CAT means concatenation.
+   Parsing stops at `|', at the end of the pattern, or at a closing
+   parenthesis when NEST is nonzero.  Expressions that produce no tree
+   are simply left out.  On error NULL is returned and *ERR is set.  */
+
+static bin_tree_t *
+parse_branch (regexp, preg, token, syntax, nest, err)
+     re_string_t *regexp;
+     regex_t *preg;
+     re_token_t *token;
+     reg_syntax_t syntax;
+     int nest;
+     reg_errcode_t *err;
+{
+  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
+  bin_tree_t *tree, *next_exp;
+
+  tree = parse_expression (regexp, preg, token, syntax, nest, err);
+  if (BE (*err != REG_NOERROR && tree == NULL, 0))
+    return NULL;
+
+  for (;;)
+    {
+      if (token->type == OP_ALT || token->type == END_OF_RE
+          || (nest != 0 && token->type == OP_CLOSE_SUBEXP))
+        break;
+
+      next_exp = parse_expression (regexp, preg, token, syntax, nest, err);
+      if (BE (*err != REG_NOERROR && next_exp == NULL, 0))
+        return NULL;
+
+      if (tree == NULL)
+        {
+          /* Nothing accumulated so far; start with this expression.  */
+          tree = next_exp;
+        }
+      else if (next_exp != NULL)
+        {
+          tree = create_tree (dfa, tree, next_exp, CONCAT);
+          if (tree == NULL)
+            {
+              *err = REG_ESPACE;
+              return NULL;
+            }
+        }
+      /* Otherwise next_exp == NULL, we don't need to create new tree.  */
+    }
+  return tree;
+}
+
+/* Parse a single expression: one atom (character, subexpression, bracket
+ expression, back reference, anchor, character class shorthand, ...)
+ followed by any repetition operators, and return its tree. For
+ example, this function builds the following tree from the regular
+ expression a*:
+ *
+ |
+ a
+
+ TOKEN is the current token on entry and is advanced past everything
+ that was consumed. NEST is forwarded (incremented) to parse_sub_exp.
+ On error NULL is returned with *ERR set. NULL is also returned, with
+ *ERR untouched, when TOKEN is OP_ALT or END_OF_RE (nothing to parse)
+ or has an unexpected type.
+*/
+
+static bin_tree_t *
+parse_expression (regexp, preg, token, syntax, nest, err)
+ re_string_t *regexp;
+ regex_t *preg;
+ re_token_t *token;
+ reg_syntax_t syntax;
+ int nest;
+ reg_errcode_t *err;
+{
+ re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
+ bin_tree_t *tree;
+ switch (token->type)
+ {
+ case CHARACTER:
+ tree = create_token_tree (dfa, NULL, NULL, token);
+ if (BE (tree == NULL, 0))
+ {
+ *err = REG_ESPACE;
+ return NULL;
+ }
+#ifdef RE_ENABLE_I18N
+ if (dfa->mb_cur_max > 1)
+ {
+ while (!re_string_eoi (regexp)
+ && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
+ {
+ bin_tree_t *mbc_remain; /* Node for one trailing byte of the multibyte character. */
+ fetch_token (token, regexp, syntax);
+ mbc_remain = create_token_tree (dfa, NULL, NULL, token);
+ tree = create_tree (dfa, tree, mbc_remain, CONCAT);
+ if (BE (mbc_remain == NULL || tree == NULL, 0))
+ {
+ *err = REG_ESPACE;
+ return NULL;
+ }
+ }
+ }
+#endif
+ break;
+ case OP_OPEN_SUBEXP:
+ tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
+ if (BE (*err != REG_NOERROR && tree == NULL, 0))
+ return NULL;
+ break;
+ case OP_OPEN_BRACKET:
+ tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
+ if (BE (*err != REG_NOERROR && tree == NULL, 0))
+ return NULL;
+ break;
+ case OP_BACK_REF:
+ if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1)) /* Referenced group is not yet recorded in completed_bkref_map. */
+ {
+ *err = REG_ESUBREG;
+ return NULL;
+ }
+ dfa->used_bkref_map |= 1 << token->opr.idx;
+ tree = create_token_tree (dfa, NULL, NULL, token);
+ if (BE (tree == NULL, 0))
+ {
+ *err = REG_ESPACE;
+ return NULL;
+ }
+ ++dfa->nbackref;
+ dfa->has_mb_node = 1;
+ break;
+ case OP_OPEN_DUP_NUM:
+ if (syntax & RE_CONTEXT_INVALID_DUP)
+ {
+ *err = REG_BADRPT;
+ return NULL;
+ }
+ /* FALLTHROUGH */
+ case OP_DUP_ASTERISK:
+ case OP_DUP_PLUS:
+ case OP_DUP_QUESTION:
+ if (syntax & RE_CONTEXT_INVALID_OPS)
+ {
+ *err = REG_BADRPT;
+ return NULL;
+ }
+ else if (syntax & RE_CONTEXT_INDEP_OPS)
+ {
+ fetch_token (token, regexp, syntax); /* Drop the operator and parse what follows it. */
+ return parse_expression (regexp, preg, token, syntax, nest, err);
+ }
+ /* else fall through */
+ case OP_CLOSE_SUBEXP:
+ if ((token->type == OP_CLOSE_SUBEXP) &&
+ !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
+ {
+ *err = REG_ERPAREN;
+ return NULL;
+ }
+ /* else fall through */
+ case OP_CLOSE_DUP_NUM:
+ /* We treat it as a normal character. */
+
+ /* Then we can treat these characters as normal characters. */
+ token->type = CHARACTER;
+ /* mb_partial and word_char bits should be initialized already
+ by peek_token. */
+ tree = create_token_tree (dfa, NULL, NULL, token);
+ if (BE (tree == NULL, 0))
+ {
+ *err = REG_ESPACE;
+ return NULL;
+ }
+ break;
+ case ANCHOR:
+ if ((token->opr.ctx_type
+ & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
+ && dfa->word_ops_used == 0)
+ init_word_char (dfa);
+ if (token->opr.ctx_type == WORD_DELIM
+ || token->opr.ctx_type == NOT_WORD_DELIM)
+ {
+ bin_tree_t *tree_first, *tree_last; /* The two alternatives joined by OP_ALT below. */
+ if (token->opr.ctx_type == WORD_DELIM)
+ {
+ token->opr.ctx_type = WORD_FIRST;
+ tree_first = create_token_tree (dfa, NULL, NULL, token);
+ token->opr.ctx_type = WORD_LAST;
+ }
+ else
+ {
+ token->opr.ctx_type = INSIDE_WORD;
+ tree_first = create_token_tree (dfa, NULL, NULL, token);
+ token->opr.ctx_type = INSIDE_NOTWORD;
+ }
+ tree_last = create_token_tree (dfa, NULL, NULL, token);
+ tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
+ if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
+ {
+ *err = REG_ESPACE;
+ return NULL;
+ }
+ }
+ else
+ {
+ tree = create_token_tree (dfa, NULL, NULL, token);
+ if (BE (tree == NULL, 0))
+ {
+ *err = REG_ESPACE;
+ return NULL;
+ }
+ }
+ /* We must return here, since ANCHORs can't be followed
+ by repetition operators.
+ eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
+ it must not be "<ANCHOR(^)><REPEAT(*)>". */
+ fetch_token (token, regexp, syntax);
+ return tree;
+ case OP_PERIOD:
+ tree = create_token_tree (dfa, NULL, NULL, token);
+ if (BE (tree == NULL, 0))
+ {
+ *err = REG_ESPACE;
+ return NULL;
+ }
+ if (dfa->mb_cur_max > 1)
+ dfa->has_mb_node = 1;
+ break;
+ case OP_WORD:
+ case OP_NOTWORD:
+ tree = build_charclass_op (dfa, regexp->trans,
+ (const unsigned char *) "alnum",
+ (const unsigned char *) "_",
+ token->type == OP_NOTWORD, err);
+ if (BE (*err != REG_NOERROR && tree == NULL, 0))
+ return NULL;
+ break;
+ case OP_SPACE:
+ case OP_NOTSPACE:
+ tree = build_charclass_op (dfa, regexp->trans,
+ (const unsigned char *) "space",
+ (const unsigned char *) "",
+ token->type == OP_NOTSPACE, err);
+ if (BE (*err != REG_NOERROR && tree == NULL, 0))
+ return NULL;
+ break;
+ case OP_ALT:
+ case END_OF_RE:
+ return NULL; /* Nothing to parse; *ERR is left untouched. */
+ case BACK_SLASH:
+ *err = REG_EESCAPE;
+ return NULL;
+ default:
+ /* Must not happen? */
+#ifdef DEBUG
+ assert (0);
+#endif
+ return NULL;
+ }
+ fetch_token (token, regexp, syntax); /* Advance to the token that follows the atom. */
+
+ while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
+ || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
+ {
+ tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
+ if (BE (*err != REG_NOERROR && tree == NULL, 0))
+ return NULL;
+ /* In BRE consecutive duplications are not allowed. */
+ if ((syntax & RE_CONTEXT_INVALID_DUP)
+ && (token->type == OP_DUP_ASTERISK
+ || token->type == OP_OPEN_DUP_NUM))
+ {
+ *err = REG_BADRPT;
+ return NULL;
+ }
+ }
+
+ return tree;
+}
+
+/* Parse a parenthesized subexpression whose opening token is the current
+   TOKEN.  This function builds the following tree from the regular
+   expression (<reg_exp>):
+         SUBEXP
+            |
+        <reg_exp>
+
+   The group is numbered with the value PREG->re_nsub had on entry, and
+   re_nsub is incremented.  An empty group "()" gets a SUBEXP node with a
+   NULL child.  On return TOKEN is still the closing parenthesis; the
+   caller is expected to step past it.
+
+   Returns the SUBEXP node, or NULL with *ERR set: REG_EPAREN when the
+   closing parenthesis is missing, REG_ESPACE when out of memory, or
+   whatever error the nested parse reported.  */
+
+static bin_tree_t *
+parse_sub_exp (regexp, preg, token, syntax, nest, err)
+     re_string_t *regexp;
+     regex_t *preg;
+     re_token_t *token;
+     reg_syntax_t syntax;
+     int nest;
+     reg_errcode_t *err;
+{
+  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
+  bin_tree_t *tree;
+  size_t cur_nsub;
+  cur_nsub = preg->re_nsub++;
+
+  fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
+
+  /* The subexpression may be a null string.  */
+  if (token->type == OP_CLOSE_SUBEXP)
+    tree = NULL;
+  else
+    {
+      tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
+      if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
+        *err = REG_EPAREN;
+      if (BE (*err != REG_NOERROR, 0))
+        return NULL;
+    }
+
+  /* Only the first nine groups can be named by a back reference
+     (\1 .. \9), so only those need a bit in the map.  Guarding the shift
+     also avoids undefined behaviour: shifting an int by its width or
+     more is undefined, which a pattern with many groups would trigger.  */
+  if (cur_nsub <= '9' - '1')
+    dfa->completed_bkref_map |= 1 << cur_nsub;
+
+  tree = create_tree (dfa, tree, NULL, SUBEXP);
+  if (BE (tree == NULL, 0))
+    {
+      *err = REG_ESPACE;
+      return NULL;
+    }
+  tree->token.opr.idx = cur_nsub;
+  return tree;
+}
+
+/* Parse a repetition operator ("*", "+", "?" or an interval such as
+   "{1,3}") whose first token is in TOKEN, and apply it to ELEM.
+
+   ELEM   is the tree of the expression being repeated (may be NULL).
+   REGEXP is the pattern being read; TOKEN is advanced past the operator.
+   DFA    owns the tree nodes that are created.
+   SYNTAX selects, among other things, whether an invalid interval is an
+          error or is re-read as ordinary characters
+          (RE_INVALID_INTERVAL_ORD).
+
+   Returns the tree for the repeated expression.  Returns ELEM unchanged
+   when an invalid interval was rolled back to a literal '{'.  Returns
+   NULL with *ERR untouched when ELEM is NULL or the repetition is
+   exactly zero times ("{0}", "{0,0}", "{,0}").  Returns NULL with *ERR
+   set to REG_BADBR, REG_EBRACE or REG_ESPACE on error.  */
+
+static bin_tree_t *
+parse_dup_op (elem, regexp, dfa, token, syntax, err)
+     bin_tree_t *elem;
+     re_string_t *regexp;
+     re_dfa_t *dfa;
+     re_token_t *token;
+     reg_syntax_t syntax;
+     reg_errcode_t *err;
+{
+  bin_tree_t *tree = NULL, *old_tree = NULL;
+  int i, start, end, start_idx = re_string_cur_idx (regexp);
+  re_token_t start_token = *token;
+
+  if (token->type == OP_OPEN_DUP_NUM)
+    {
+      end = 0;
+      start = fetch_number (regexp, token, syntax);
+      if (start == -1)
+        {
+          if (token->type == CHARACTER && token->opr.c == ',')
+            start = 0; /* We treat "{,m}" as "{0,m}".  */
+          else
+            {
+              *err = REG_BADBR; /* <re>{} is invalid.  */
+              return NULL;
+            }
+        }
+      if (BE (start != -2, 1))
+        {
+          /* We treat "{n}" as "{n,n}".  */
+          end = ((token->type == OP_CLOSE_DUP_NUM) ? start
+                 : ((token->type == CHARACTER && token->opr.c == ',')
+                    ? fetch_number (regexp, token, syntax) : -2));
+        }
+      if (BE (start == -2 || end == -2, 0))
+        {
+          /* Invalid sequence.  */
+          if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
+            {
+              if (token->type == END_OF_RE)
+                *err = REG_EBRACE;
+              else
+                *err = REG_BADBR;
+
+              return NULL;
+            }
+
+          /* If the syntax bit is set, rollback.  */
+          re_string_set_index (regexp, start_idx);
+          *token = start_token;
+          token->type = CHARACTER;
+          /* mb_partial and word_char bits should be already initialized by
+             peek_token.  */
+          return elem;
+        }
+
+      if (BE (end != -1 && start > end, 0))
+        {
+          /* First number greater than second.  */
+          *err = REG_BADBR;
+          return NULL;
+        }
+    }
+  else
+    {
+      start = (token->type == OP_DUP_PLUS) ? 1 : 0;
+      end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
+    }
+
+  fetch_token (token, regexp, syntax);
+
+  if (BE (elem == NULL, 0))
+    return NULL;
+  if (BE (start == 0 && end == 0, 0))
+    {
+      /* Zero repetitions: the expression disappears altogether.  */
+      postorder (elem, free_tree, NULL);
+      return NULL;
+    }
+
+  /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
+  if (BE (start > 0, 0))
+    {
+      tree = elem;
+      for (i = 2; i <= start; ++i)
+        {
+          elem = duplicate_tree (elem, dfa);
+          tree = create_tree (dfa, tree, elem, CONCAT);
+          if (BE (elem == NULL || tree == NULL, 0))
+            goto parse_dup_op_espace;
+        }
+
+      if (start == end)
+        return tree;
+
+      /* Duplicate ELEM before it is marked optional.  The copy is
+         dereferenced just below, so a failed duplication must be
+         reported here rather than crash.  */
+      elem = duplicate_tree (elem, dfa);
+      if (BE (elem == NULL, 0))
+        goto parse_dup_op_espace;
+      old_tree = tree;
+    }
+  else
+    old_tree = NULL;
+
+  if (elem->token.type == SUBEXP)
+    postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
+
+  tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
+  if (BE (tree == NULL, 0))
+    goto parse_dup_op_espace;
+
+  /* This loop is actually executed only when end != -1,
+     to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
+     already created the start+1-th copy.  */
+  for (i = start + 2; i <= end; ++i)
+    {
+      elem = duplicate_tree (elem, dfa);
+      tree = create_tree (dfa, tree, elem, CONCAT);
+      if (BE (elem == NULL || tree == NULL, 0))
+        goto parse_dup_op_espace;
+
+      tree = create_tree (dfa, tree, NULL, OP_ALT);
+      if (BE (tree == NULL, 0))
+        goto parse_dup_op_espace;
+    }
+
+  if (old_tree)
+    {
+      /* Prepend the mandatory copies.  A NULL result here must be
+         reported: callers treat NULL with *ERR unset as "no expression",
+         which would silently drop the whole repetition.  */
+      tree = create_tree (dfa, old_tree, tree, CONCAT);
+      if (BE (tree == NULL, 0))
+        goto parse_dup_op_espace;
+    }
+
+  return tree;
+
+ parse_dup_op_espace:
+  *err = REG_ESPACE;
+  return NULL;
+}
+
+/* Size of the names for collating symbol/equivalence_class/character_class.
+ I'm not sure, but maybe enough. */
+#define BRACKET_NAME_BUF_SIZE 32
+
+#ifndef _LIBC
+/* Local function for parse_bracket_exp only used in case of NOT _LIBC.
+   Build the range expression which starts from START_ELEM, and ends
+   at END_ELEM.  The results are written to MBCSET and SBCSET.
+   RANGE_ALLOC is the allocated size of mbcset->range_starts and
+   mbcset->range_ends; it is a pointer argument since we may
+   update it.
+
+   Returns REG_NOERROR on success, REG_ERANGE when an end point is an
+   equivalence class or character class or when the range is reversed,
+   REG_ECOLLATE when an end point is a multi-character collating symbol
+   or has no wide character equivalent, and REG_ESPACE when memory runs
+   out.  */
+
+static reg_errcode_t
+# ifdef RE_ENABLE_I18N
+build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
+     re_charset_t *mbcset;
+     int *range_alloc;
+# else /* not RE_ENABLE_I18N */
+build_range_exp (sbcset, start_elem, end_elem)
+# endif /* not RE_ENABLE_I18N */
+     re_bitset_ptr_t sbcset;
+     bracket_elem_t *start_elem, *end_elem;
+{
+  unsigned int start_ch, end_ch;
+  /* Equivalence Classes and Character Classes can't be a range start/end.  */
+  if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
+          || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
+          0))
+    return REG_ERANGE;
+
+  /* We can handle no multi character collating elements without libc
+     support.  */
+  if (BE ((start_elem->type == COLL_SYM
+           && strlen ((char *) start_elem->opr.name) > 1)
+          || (end_elem->type == COLL_SYM
+              && strlen ((char *) end_elem->opr.name) > 1), 0))
+    return REG_ECOLLATE;
+
+  /* Single byte value of each end point; 0 when the end point is
+     neither a single byte character nor a collating symbol.  */
+  start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
+              : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
+                 : 0));
+  end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
+            : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
+               : 0));
+
+# ifdef RE_ENABLE_I18N
+  {
+    wchar_t wc, start_wc, end_wc;
+    /* Three one-character strings for wcscoll: the range start at
+       index 0, the candidate at index 2 and the range end at index 4.  */
+    wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
+
+    start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
+                ? __btowc (start_ch) : start_elem->opr.wch);
+    end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
+              ? __btowc (end_ch) : end_elem->opr.wch);
+    if (start_wc == WEOF || end_wc == WEOF)
+      return REG_ECOLLATE;
+    cmp_buf[0] = start_wc;
+    cmp_buf[4] = end_wc;
+    if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
+      return REG_ERANGE;
+
+    /* Got valid collation sequence values, add them as a new entry.
+       However, for !_LIBC we have no collation elements: if the
+       character set is single byte, the single byte character set
+       that we build below suffices.  parse_bracket_exp passes
+       no MBCSET if dfa->mb_cur_max == 1.  */
+    if (mbcset)
+      {
+        /* Check the space of the arrays.  */
+        if (BE (*range_alloc == mbcset->nranges, 0))
+          {
+            /* There is not enough space, need realloc.  */
+            wchar_t *new_array_start, *new_array_end;
+            int new_nranges;
+
+            /* +1 in case of mbcset->nranges is 0.  */
+            new_nranges = 2 * mbcset->nranges + 1;
+
+            /* Use realloc since mbcset->range_starts and mbcset->range_ends
+               are NULL if *range_alloc == 0.  Each array is stored back
+               as soon as it has been resized: a successful realloc may
+               free the old block, so if the second call failed MBCSET
+               would otherwise keep a dangling range_starts pointer.  */
+            new_array_start = re_realloc (mbcset->range_starts, wchar_t,
+                                          new_nranges);
+            if (BE (new_array_start == NULL, 0))
+              return REG_ESPACE;
+            mbcset->range_starts = new_array_start;
+
+            new_array_end = re_realloc (mbcset->range_ends, wchar_t,
+                                        new_nranges);
+            if (BE (new_array_end == NULL, 0))
+              return REG_ESPACE;
+            mbcset->range_ends = new_array_end;
+
+            *range_alloc = new_nranges;
+          }
+
+        mbcset->range_starts[mbcset->nranges] = start_wc;
+        mbcset->range_ends[mbcset->nranges++] = end_wc;
+      }
+
+    /* Build the table for single byte characters.  */
+    for (wc = 0; wc < SBC_MAX; ++wc)
+      {
+        cmp_buf[2] = wc;
+        if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
+            && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
+          bitset_set (sbcset, wc);
+      }
+  }
+# else /* not RE_ENABLE_I18N */
+  {
+    unsigned int ch;
+    if (start_ch > end_ch)
+      return REG_ERANGE;
+    /* Build the table for single byte characters.  */
+    for (ch = 0; ch < SBC_MAX; ++ch)
+      if (start_ch <= ch && ch <= end_ch)
+        bitset_set (sbcset, ch);
+  }
+# endif /* not RE_ENABLE_I18N */
+  return REG_NOERROR;
+}
+#endif /* not _LIBC */
+
+#ifndef _LIBC
+/* Helper function for parse_bracket_exp only used in case of NOT _LIBC.
+   Build the collating element which is represented by NAME.
+   Without libc support only one-byte names are accepted: the byte is
+   added to SBCSET and REG_NOERROR is returned; any other name yields
+   REG_ECOLLATE.  MBCSET and COLL_SYM_ALLOC (the allocated size of
+   mbcset->coll_sym) are accepted for interface parity but are not used
+   by this variant.  */
+
+static reg_errcode_t
+# ifdef RE_ENABLE_I18N
+build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
+     re_charset_t *mbcset;
+     int *coll_sym_alloc;
+# else /* not RE_ENABLE_I18N */
+build_collating_symbol (sbcset, name)
+# endif /* not RE_ENABLE_I18N */
+     re_bitset_ptr_t sbcset;
+     const unsigned char *name;
+{
+  size_t len = strlen ((const char *) name);
+
+  /* Multi-character (or empty) collating symbols are not supported.  */
+  if (BE (len != 1, 0))
+    return REG_ECOLLATE;
+
+  bitset_set (sbcset, name[0]);
+  return REG_NOERROR;
+}
+#endif /* not _LIBC */
+
+/* This function parse bracket expression like "[abc]", "[a-c]",
+ "[[.a-a.]]" etc. */
+
+static bin_tree_t *
+parse_bracket_exp (regexp, dfa, token, syntax, err)
+ re_string_t *regexp;
+ re_dfa_t *dfa;
+ re_token_t *token;
+ reg_syntax_t syntax;
+ reg_errcode_t *err;
+{
+#ifdef _LIBC
+ const unsigned char *collseqmb;
+ const char *collseqwc;
+ uint32_t nrules;
+ int32_t table_size;
+ const int32_t *symb_table;
+ const unsigned char *extra;
+
+ /* Local function for parse_bracket_exp used in _LIBC environment.
+ Seek the collating symbol entry corresponding to NAME, which is
+ NAME_LEN bytes long.
+ Return the index of the symbol in the SYMB_TABLE. The returned index
+ either designates the matching entry or a slot whose hash field
+ (symb_table[2 * index]) is 0, which means NAME was not found; callers
+ tell the two apart by testing that field.
+ NOTE(review): the probe below adds SECOND without any wrap-around or
+ bounds check, and SECOND is 0 when HASH is a multiple of
+ table_size - 2 -- confirm that the table layout guarantees the loop
+ terminates inside the table. */
+
+ auto inline int32_t
+ __attribute ((always_inline))
+ seek_collating_symbol_entry (name, name_len)
+ const unsigned char *name;
+ size_t name_len;
+ {
+ int32_t hash = elem_hash ((const char *) name, name_len);
+ int32_t elem = hash % table_size;
+ int32_t second = hash % (table_size - 2);
+ while (symb_table[2 * elem] != 0)
+ {
+ /* First compare the hashing value. */
+ if (symb_table[2 * elem] == hash
+ /* Compare the length of the name. */
+ && name_len == extra[symb_table[2 * elem + 1]]
+ /* Compare the name. */
+ && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
+ name_len) == 0)
+ {
+ /* Yep, this is the entry. */
+ break;
+ }
+
+ /* Next entry. */
+ elem += second;
+ }
+ return elem;
+ }
+
+ /* Local function for parse_bracket_exp used in _LIBC environment.
+ Look up the collation sequence value of BR_ELEM, which may be a
+ single byte character (SB_CHAR), a wide character (MB_CHAR) or a
+ collating symbol (COLL_SYM).
+ Return the value if succeeded, UINT_MAX otherwise (any other element
+ type, or a collating symbol that is neither in the symbol table nor
+ exactly one byte long). */
+
+ auto inline unsigned int
+ __attribute ((always_inline))
+ lookup_collation_sequence_value (br_elem)
+ bracket_elem_t *br_elem;
+ {
+ if (br_elem->type == SB_CHAR)
+ {
+ /*
+ if (MB_CUR_MAX == 1)
+ */
+ if (nrules == 0)
+ return collseqmb[br_elem->opr.ch];
+ else
+ {
+ wint_t wc = __btowc (br_elem->opr.ch);
+ return __collseq_table_lookup (collseqwc, wc);
+ }
+ }
+ else if (br_elem->type == MB_CHAR)
+ {
+ return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
+ }
+ else if (br_elem->type == COLL_SYM)
+ {
+ size_t sym_name_len = strlen ((char *) br_elem->opr.name);
+ if (nrules != 0)
+ {
+ int32_t elem, idx;
+ elem = seek_collating_symbol_entry (br_elem->opr.name,
+ sym_name_len);
+ if (symb_table[2 * elem] != 0)
+ {
+ /* We found the entry. */
+ idx = symb_table[2 * elem + 1];
+ /* Skip the name of collating element name. */
+ idx += 1 + extra[idx];
+ /* Skip the byte sequence of the collating element. */
+ idx += 1 + extra[idx];
+ /* Adjust for the alignment. */
+ idx = (idx + 3) & ~3;
+ /* Skip the multibyte collation sequence value. */
+ idx += sizeof (unsigned int);
+ /* Skip the wide char sequence of the collating element. */
+ idx += sizeof (unsigned int) *
+ (1 + *(unsigned int *) (extra + idx));
+ /* Return the collation sequence value. */
+ return *(unsigned int *) (extra + idx);
+ }
+ else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
+ {
+ /* No valid character. Match it as a single byte
+ character. */
+ return collseqmb[br_elem->opr.name[0]];
+ }
+ }
+ else if (sym_name_len == 1)
+ return collseqmb[br_elem->opr.name[0]];
+ }
+ return UINT_MAX; /* Unsupported element type or unknown symbol. */
+ }
+
+  /* Local function for parse_bracket_exp used in _LIBC environment.
+     Build the range expression which starts from START_ELEM, and ends
+     at END_ELEM.  The results are written to MBCSET and SBCSET.
+     RANGE_ALLOC is the allocated size of mbcset->range_starts and
+     mbcset->range_ends; it is a pointer argument since we may
+     update it.
+
+     Returns REG_NOERROR on success, REG_ERANGE when an end point is an
+     equivalence class or character class, or when the range is reversed
+     and RE_NO_EMPTY_RANGES is set, REG_ECOLLATE when an end point has no
+     collation sequence value, and REG_ESPACE when memory runs out.  */
+
+  auto inline reg_errcode_t
+  __attribute ((always_inline))
+  build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
+         re_charset_t *mbcset;
+         int *range_alloc;
+         re_bitset_ptr_t sbcset;
+         bracket_elem_t *start_elem, *end_elem;
+    {
+      unsigned int ch;
+      uint32_t start_collseq;
+      uint32_t end_collseq;
+
+      /* Equivalence Classes and Character Classes can't be a range
+         start/end.  */
+      if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
+              || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
+              0))
+        return REG_ERANGE;
+
+      start_collseq = lookup_collation_sequence_value (start_elem);
+      end_collseq = lookup_collation_sequence_value (end_elem);
+      /* Check start/end collation sequence values.  */
+      if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
+        return REG_ECOLLATE;
+      if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
+        return REG_ERANGE;
+
+      /* Got valid collation sequence values, add them as a new entry.
+         However, if we have no collation elements, and the character set
+         is single byte, the single byte character set that we
+         build below suffices.  */
+      if (nrules > 0 || dfa->mb_cur_max > 1)
+        {
+          /* Check the space of the arrays.  */
+          if (BE (*range_alloc == mbcset->nranges, 0))
+            {
+              /* There is not enough space, need realloc.  */
+              uint32_t *new_array_start;
+              uint32_t *new_array_end;
+              int new_nranges;
+
+              /* +1 in case of mbcset->nranges is 0.  */
+              new_nranges = 2 * mbcset->nranges + 1;
+
+              /* Store each array back as soon as it has been resized:
+                 a successful realloc may free the old block, so if the
+                 second call failed MBCSET would otherwise keep a
+                 dangling range_starts pointer.  */
+              new_array_start = re_realloc (mbcset->range_starts, uint32_t,
+                                            new_nranges);
+              if (BE (new_array_start == NULL, 0))
+                return REG_ESPACE;
+              mbcset->range_starts = new_array_start;
+
+              new_array_end = re_realloc (mbcset->range_ends, uint32_t,
+                                          new_nranges);
+              if (BE (new_array_end == NULL, 0))
+                return REG_ESPACE;
+              mbcset->range_ends = new_array_end;
+
+              *range_alloc = new_nranges;
+            }
+
+          mbcset->range_starts[mbcset->nranges] = start_collseq;
+          mbcset->range_ends[mbcset->nranges++] = end_collseq;
+        }
+
+      /* Build the table for single byte characters.  */
+      for (ch = 0; ch < SBC_MAX; ch++)
+        {
+          uint32_t ch_collseq;
+          /*
+          if (MB_CUR_MAX == 1)
+          */
+          if (nrules == 0)
+            ch_collseq = collseqmb[ch];
+          else
+            ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
+          if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
+            bitset_set (sbcset, ch);
+        }
+      return REG_NOERROR;
+    }
+
+ /* Local function for parse_bracket_exp used in _LIBC environment.
+ Build the collating element which is represented by NAME.
+ The results are written to MBCSET and SBCSET.
+ COLL_SYM_ALLOC is the allocated size of mbcset->coll_syms; it is a
+ pointer argument since we may update it.
+ Returns REG_NOERROR on success, REG_ECOLLATE when NAME is neither a
+ known collating symbol nor a single byte, and REG_ESPACE when the
+ coll_syms array cannot be grown. */
+
+ auto inline reg_errcode_t
+ __attribute ((always_inline))
+ build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
+ re_charset_t *mbcset;
+ int *coll_sym_alloc;
+ re_bitset_ptr_t sbcset;
+ const unsigned char *name;
+ {
+ int32_t elem, idx;
+ size_t name_len = strlen ((const char *) name);
+ if (nrules != 0)
+ {
+ elem = seek_collating_symbol_entry (name, name_len);
+ if (symb_table[2 * elem] != 0)
+ {
+ /* We found the entry. */
+ idx = symb_table[2 * elem + 1];
+ /* Skip the name of collating element name. */
+ idx += 1 + extra[idx];
+ }
+ else if (symb_table[2 * elem] == 0 && name_len == 1)
+ {
+ /* No valid character, treat it as a normal
+ character. */
+ bitset_set (sbcset, name[0]);
+ return REG_NOERROR;
+ }
+ else
+ return REG_ECOLLATE;
+
+ /* Got valid collation sequence, add it as a new entry. */
+ /* Check the space of the arrays. */
+ if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
+ {
+ /* Not enough, realloc it. */
+ /* +1 in case of mbcset->ncoll_syms is 0. */
+ int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
+ /* Use realloc since mbcset->coll_syms is NULL
+ if *alloc == 0. */
+ int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
+ new_coll_sym_alloc);
+ if (BE (new_coll_syms == NULL, 0))
+ return REG_ESPACE;
+ mbcset->coll_syms = new_coll_syms;
+ *coll_sym_alloc = new_coll_sym_alloc;
+ }
+ mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
+ return REG_NOERROR;
+ }
+ else
+ {
+ if (BE (name_len != 1, 0))
+ return REG_ECOLLATE;
+ else
+ {
+ bitset_set (sbcset, name[0]);
+ return REG_NOERROR;
+ }
+ }
+ }
+#endif
+
+ re_token_t br_token;
+ re_bitset_ptr_t sbcset;
+#ifdef RE_ENABLE_I18N
+ re_charset_t *mbcset;
+ int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
+ int equiv_class_alloc = 0, char_class_alloc = 0;
+#endif /* not RE_ENABLE_I18N */
+ int non_match = 0;
+ bin_tree_t *work_tree;
+ int token_len;
+ int first_round = 1;
+#ifdef _LIBC
+ collseqmb = (const unsigned char *)
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
+ nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
+ if (nrules)
+ {
+ /*
+ if (MB_CUR_MAX > 1)
+ */
+ collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
+ table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
+ symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
+ _NL_COLLATE_SYMB_TABLEMB);
+ extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
+ _NL_COLLATE_SYMB_EXTRAMB);
+ }
+#endif
+ sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
+#ifdef RE_ENABLE_I18N
+ mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
+#endif /* RE_ENABLE_I18N */
+#ifdef RE_ENABLE_I18N
+ if (BE (sbcset == NULL || mbcset == NULL, 0))
+#else
+ if (BE (sbcset == NULL, 0))
+#endif /* RE_ENABLE_I18N */
+ {
+ *err = REG_ESPACE;
+ return NULL;
+ }
+
+ token_len = peek_token_bracket (token, regexp, syntax);
+ if (BE (token->type == END_OF_RE, 0))
+ {
+ *err = REG_BADPAT;
+ goto parse_bracket_exp_free_return;
+ }
+ if (token->type == OP_NON_MATCH_LIST)
+ {
+#ifdef RE_ENABLE_I18N
+ mbcset->non_match = 1;
+#endif /* not RE_ENABLE_I18N */
+ non_match = 1;
+ if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
+ bitset_set (sbcset, '\0');
+ re_string_skip_bytes (regexp, token_len); /* Skip a token. */
+ token_len = peek_token_bracket (token, regexp, syntax);
+ if (BE (token->type == END_OF_RE, 0))
+ {
+ *err = REG_BADPAT;
+ goto parse_bracket_exp_free_return;
+ }
+ }
+
+ /* We treat the first ']' as a normal character. */
+ if (token->type == OP_CLOSE_BRACKET)
+ token->type = CHARACTER;
+
+ while (1)
+ {
+ bracket_elem_t start_elem, end_elem;
+ unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
+ unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
+ reg_errcode_t ret;
+ int token_len2 = 0, is_range_exp = 0;
+ re_token_t token2;
+
+ start_elem.opr.name = start_name_buf;
+ ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
+ syntax, first_round);
+ if (BE (ret != REG_NOERROR, 0))
+ {
+ *err = ret;
+ goto parse_bracket_exp_free_return;
+ }
+ first_round = 0;
+
+ /* Get information about the next token. We need it in any case. */
+ token_len = peek_token_bracket (token, regexp, syntax);
+
+ /* Do not check for ranges if we know they are not allowed. */
+ if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
+ {
+ if (BE (token->type == END_OF_RE, 0))
+ {
+ *err = REG_EBRACK;
+ goto parse_bracket_exp_free_return;
+ }
+ if (token->type == OP_CHARSET_RANGE)
+ {
+ re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
+ token_len2 = peek_token_bracket (&token2, regexp, syntax);
+ if (BE (token2.type == END_OF_RE, 0))
+ {
+ *err = REG_EBRACK;
+ goto parse_bracket_exp_free_return;
+ }
+ if (token2.type == OP_CLOSE_BRACKET)
+ {
+ /* We treat the last '-' as a normal character. */
+ re_string_skip_bytes (regexp, -token_len);
+ token->type = CHARACTER;
+ }
+ else
+ is_range_exp = 1;
+ }
+ }
+
+ if (is_range_exp == 1)
+ {
+ end_elem.opr.name = end_name_buf;
+ ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
+ dfa, syntax, 1);
+ if (BE (ret != REG_NOERROR, 0))
+ {
+ *err = ret;
+ goto parse_bracket_exp_free_return;
+ }
+
+ token_len = peek_token_bracket (token, regexp, syntax);
+
+#ifdef _LIBC
+ *err = build_range_exp (sbcset, mbcset, &range_alloc,
+ &start_elem, &end_elem);
+#else
+# ifdef RE_ENABLE_I18N
+ *err = build_range_exp (sbcset,
+ dfa->mb_cur_max > 1 ? mbcset : NULL,
+ &range_alloc, &start_elem, &end_elem);
+# else
+ *err = build_range_exp (sbcset, &start_elem, &end_elem);
+# endif
+#endif /* RE_ENABLE_I18N */
+ if (BE (*err != REG_NOERROR, 0))
+ goto parse_bracket_exp_free_return;
+ }
+ else
+ {
+ switch (start_elem.type)
+ {
+ case SB_CHAR:
+ bitset_set (sbcset, start_elem.opr.ch);
+ break;
+#ifdef RE_ENABLE_I18N
+ case MB_CHAR:
+ /* Check whether the array has enough space. */
+ if (BE (mbchar_alloc == mbcset->nmbchars, 0))
+ {
+ wchar_t *new_mbchars;
+ /* Not enough, realloc it. */
+ /* +1 in case of mbcset->nmbchars is 0. */
+ mbchar_alloc = 2 * mbcset->nmbchars + 1;
+ /* Use realloc since array is NULL if *alloc == 0. */
+ new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
+ mbchar_alloc);
+ if (BE (new_mbchars == NULL, 0))
+ goto parse_bracket_exp_espace;
+ mbcset->mbchars = new_mbchars;
+ }
+ mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
+ break;
+#endif /* RE_ENABLE_I18N */
+ case EQUIV_CLASS:
+ *err = build_equiv_class (sbcset,
+#ifdef RE_ENABLE_I18N
+ mbcset, &equiv_class_alloc,
+#endif /* RE_ENABLE_I18N */
+ start_elem.opr.name);
+ if (BE (*err != REG_NOERROR, 0))
+ goto parse_bracket_exp_free_return;
+ break;
+ case COLL_SYM:
+ *err = build_collating_symbol (sbcset,
+#ifdef RE_ENABLE_I18N
+ mbcset, &coll_sym_alloc,
+#endif /* RE_ENABLE_I18N */
+ start_elem.opr.name);
+ if (BE (*err != REG_NOERROR, 0))
+ goto parse_bracket_exp_free_return;
+ break;
+ case CHAR_CLASS:
+ *err = build_charclass (regexp->trans, sbcset,
+#ifdef RE_ENABLE_I18N
+ mbcset, &char_class_alloc,
+#endif /* RE_ENABLE_I18N */
+ start_elem.opr.name, syntax);
+ if (BE (*err != REG_NOERROR, 0))
+ goto parse_bracket_exp_free_return;
+ break;
+ default:
+ assert (0);
+ break;
+ }
+ }
+ if (BE (token->type == END_OF_RE, 0))
+ {
+ *err = REG_EBRACK;
+ goto parse_bracket_exp_free_return;
+ }
+ if (token->type == OP_CLOSE_BRACKET)
+ break;
+ }
+
+ re_string_skip_bytes (regexp, token_len); /* Skip a token. */
+
+ /* If it is non-matching list. */
+ if (non_match)
+ bitset_not (sbcset);
+
+#ifdef RE_ENABLE_I18N
+ /* Ensure only single byte characters are set. */
+ if (dfa->mb_cur_max > 1)
+ bitset_mask (sbcset, dfa->sb_char);
+
+ if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
+ || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
+ || mbcset->non_match)))
+ {
+ bin_tree_t *mbc_tree;
+ int sbc_idx;
+ /* Build a tree for complex bracket. */
+ dfa->has_mb_node = 1;
+ br_token.type = COMPLEX_BRACKET;
+ br_token.opr.mbcset = mbcset;
+ mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
+ if (BE (mbc_tree == NULL, 0))
+ goto parse_bracket_exp_espace;
+ for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
+ if (sbcset[sbc_idx])
+ break;
+ /* If there are no bits set in sbcset, there is no point
+ of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
+ if (sbc_idx < BITSET_UINTS)
+ {
+ /* Build a tree for simple bracket. */
+ br_token.type = SIMPLE_BRACKET;
+ br_token.opr.sbcset = sbcset;
+ work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
+ if (BE (work_tree == NULL, 0))
+ goto parse_bracket_exp_espace;
+
+ /* Then join them by ALT node. */
+ work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
+ if (BE (work_tree == NULL, 0))
+ goto parse_bracket_exp_espace;
+ }
+ else
+ {
+ re_free (sbcset);
+ work_tree = mbc_tree;
+ }
+ }
+ else
+#endif /* not RE_ENABLE_I18N */
+ {
+#ifdef RE_ENABLE_I18N
+ free_charset (mbcset);
+#endif
+ /* Build a tree for simple bracket. */
+ br_token.type = SIMPLE_BRACKET;
+ br_token.opr.sbcset = sbcset;
+ work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
+ if (BE (work_tree == NULL, 0))
+ goto parse_bracket_exp_espace;
+ }
+ return work_tree;
+
+ parse_bracket_exp_espace:
+ *err = REG_ESPACE;
+ parse_bracket_exp_free_return:
+ re_free (sbcset);
+#ifdef RE_ENABLE_I18N
+ free_charset (mbcset);
+#endif /* RE_ENABLE_I18N */
+ return NULL;
+}
+
+/* Parse a single element of a bracket expression and store it in ELEM.
+   TOKEN, which is TOKEN_LEN bytes long, is the token found at the
+   current position of REGEXP.  With RE_ENABLE_I18N, a character that is
+   more than one byte long is consumed whole and reported as MB_CHAR.
+   Otherwise TOKEN is skipped and then either handed over to
+   parse_bracket_symbol (collating element, character class and
+   equivalence class openers) or stored as SB_CHAR.  Unless ACCEPT_HYPHEN
+   is nonzero, a range token that is not directly followed by the
+   closing bracket makes the function return REG_ERANGE.  */
+
+static reg_errcode_t
+parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
+                       accept_hyphen)
+     bracket_elem_t *elem;
+     re_string_t *regexp;
+     re_token_t *token;
+     int token_len;
+     re_dfa_t *dfa;
+     reg_syntax_t syntax;
+     int accept_hyphen;
+{
+#ifdef RE_ENABLE_I18N
+  int mb_len = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
+  if (mb_len > 1)
+    {
+      /* Multibyte character: take it as it is, ignoring TOKEN.  */
+      elem->type = MB_CHAR;
+      elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
+      re_string_skip_bytes (regexp, mb_len);
+      return REG_NOERROR;
+    }
+#endif /* RE_ENABLE_I18N */
+
+  /* Step over the token itself.  */
+  re_string_skip_bytes (regexp, token_len);
+
+  switch (token->type)
+    {
+    case OP_OPEN_COLL_ELEM:
+    case OP_OPEN_CHAR_CLASS:
+    case OP_OPEN_EQUIV_CLASS:
+      return parse_bracket_symbol (elem, regexp, token);
+
+    case OP_CHARSET_RANGE:
+      if (!accept_hyphen)
+        {
+          /* A '-' that is not a range indicator may only appear right
+             before the closing bracket.  The error value for anything
+             else is not standardized since the whole case is undefined,
+             but ERANGE makes good sense.  */
+          re_token_t following;
+          (void) peek_token_bracket (&following, regexp, syntax);
+          if (following.type != OP_CLOSE_BRACKET)
+            return REG_ERANGE;
+        }
+      break;
+
+    default:
+      break;
+    }
+
+  /* Everything else is an ordinary single byte character.  */
+  elem->type = SB_CHAR;
+  elem->opr.ch = token->opr.c;
+  return REG_NOERROR;
+}
+
+/* Parse the name of a bracket symbol, i.e. the text inside
+   [:<character_class>:], [.<collating_element>.] or
+   [=<equivalent_class>=].  TOKEN is the opening token; its opr.c is the
+   delimiter that, followed by ']', ends the symbol.  The name is copied
+   into ELEM->opr.name and NUL-terminated, and ELEM->type is set from
+   TOKEN->type.  Returns REG_EBRACK when the input ends before the symbol
+   is closed or when the name, terminator included, does not fit in
+   BRACKET_NAME_BUF_SIZE bytes.  */
+
+static reg_errcode_t
+parse_bracket_symbol (elem, regexp, token)
+     bracket_elem_t *elem;
+     re_string_t *regexp;
+     re_token_t *token;
+{
+  const unsigned char delim = token->opr.c;
+  const int is_char_class = (token->type == OP_OPEN_CHAR_CLASS);
+  int len;
+
+  if (re_string_eoi(regexp))
+    return REG_EBRACK;
+
+  for (len = 0; len < BRACKET_NAME_BUF_SIZE; ++len)
+    {
+      unsigned char ch;
+
+      /* Character class names are read with the _case fetcher.  */
+      if (is_char_class)
+        ch = re_string_fetch_byte_case (regexp);
+      else
+        ch = re_string_fetch_byte (regexp);
+
+      /* At least the closing ']' must still follow.  */
+      if (re_string_eoi(regexp))
+        return REG_EBRACK;
+
+      if (ch != delim || re_string_peek_byte (regexp, 0) != ']')
+        {
+          /* Still inside the name.  */
+          elem->opr.name[len] = ch;
+          continue;
+        }
+
+      /* Closing delimiter seen: skip the ']' and finish the element.  */
+      re_string_skip_bytes (regexp, 1);
+      elem->opr.name[len] = '\0';
+      if (token->type == OP_OPEN_COLL_ELEM)
+        elem->type = COLL_SYM;
+      else if (token->type == OP_OPEN_EQUIV_CLASS)
+        elem->type = EQUIV_CLASS;
+      else if (token->type == OP_OPEN_CHAR_CLASS)
+        elem->type = CHAR_CLASS;
+      return REG_NOERROR;
+    }
+
+  /* The name is too long for the buffer.  */
+  return REG_EBRACK;
+}
+
+ /* Helper function for parse_bracket_exp.
+ Build the equivalence class which is represented by NAME.
+ The results are written to MBCSET and SBCSET.
+ EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes;
+ it is a pointer argument since we may update it.
+
+ In _LIBC builds where the LC_COLLATE locale has collation rules, every
+ single byte whose collation weights equal those of NAME is added to
+ SBCSET, and the collation index of NAME is appended to
+ MBCSET->equiv_classes.  Otherwise NAME must be exactly one byte long
+ and only that byte is added to SBCSET.
+
+ Returns REG_ECOLLATE if NAME is not acceptable, REG_ESPACE if
+ MBCSET->equiv_classes cannot be grown, REG_NOERROR otherwise. */
+
+static reg_errcode_t
+#ifdef RE_ENABLE_I18N
+build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
+ re_charset_t *mbcset;
+ int *equiv_class_alloc;
+#else /* not RE_ENABLE_I18N */
+build_equiv_class (sbcset, name)
+#endif /* not RE_ENABLE_I18N */
+ re_bitset_ptr_t sbcset;
+ const unsigned char *name;
+{
+#if defined _LIBC
+ uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
+ if (nrules != 0)
+ {
+ const int32_t *table, *indirect;
+ const unsigned char *weights, *extra, *cp;
+ unsigned char char_buf[2];
+ int32_t idx1, idx2;
+ unsigned int ch;
+ size_t len;
+ /* This #include defines a local function! */
+ /* NOTE(review): presumably that function is findidx, called below, and
+ it reads the TABLE, INDIRECT and EXTRA locals set up here -- confirm
+ against locale/weight.h. */
+# include <locale/weight.h>
+ /* Calculate the index for equivalence class. */
+ cp = name;
+ table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
+ weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
+ _NL_COLLATE_WEIGHTMB);
+ extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
+ _NL_COLLATE_EXTRAMB);
+ indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
+ _NL_COLLATE_INDIRECTMB);
+ idx1 = findidx (&cp);
+ /* Reject NAME if no index was found for it, or if CP was not advanced
+ to the end of NAME. */
+ if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
+ /* This isn't a valid character. */
+ return REG_ECOLLATE;
+
+ /* Build single byte matching table for this equivalence class. */
+ char_buf[1] = (unsigned char) '\0';
+ /* WEIGHTS[IDX1] is used as the length of the weight data of NAME. */
+ len = weights[idx1];
+ for (ch = 0; ch < SBC_MAX; ++ch)
+ {
+ /* Look up the one-byte, NUL-terminated string made of CH. */
+ char_buf[0] = ch;
+ cp = char_buf;
+ idx2 = findidx (&cp);
+/*
+ idx2 = table[ch];
+*/
+ if (idx2 == 0)
+ /* This isn't a valid character. */
+ continue;
+ /* CH is added only when its weight data has the same length as that
+ of NAME and the bytes following the two length bytes compare
+ equal for indices 0 through LEN inclusive. */
+ if (len == weights[idx2])
+ {
+ int cnt = 0;
+ while (cnt <= len &&
+ weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
+ ++cnt;
+
+ if (cnt > len)
+ bitset_set (sbcset, ch);
+ }
+ }
+ /* Check whether the array has enough space. */
+ if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
+ {
+ /* Not enough, realloc it. */
+ /* +1 in case of mbcset->nequiv_classes is 0. */
+ int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
+ /* Use realloc since the array is NULL if *alloc == 0. */
+ int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
+ int32_t,
+ new_equiv_class_alloc);
+ /* On failure the old array and *EQUIV_CLASS_ALLOC are left as they
+ were. */
+ if (BE (new_equiv_classes == NULL, 0))
+ return REG_ESPACE;
+ mbcset->equiv_classes = new_equiv_classes;
+ *equiv_class_alloc = new_equiv_class_alloc;
+ }
+ /* Record the collation index of NAME in the multibyte set. */
+ mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
+ }
+ else
+#endif /* _LIBC */
+ {
+ /* No collation rules in use: only a single byte name is accepted, and
+ the class consists of that byte alone. */
+ if (BE (strlen ((const char *) name) != 1, 0))
+ return REG_ECOLLATE;
+ bitset_set (sbcset, *name);
+ }
+ return REG_NOERROR;
+}
+
+/* Helper function for parse_bracket_exp.
+   Add the members of the character class called CLASS_NAME to SBCSET,
+   mapping each member through TRANS when TRANS is non-null.  With
+   RE_ENABLE_I18N the class is also appended to MBCSET->char_classes;
+   *CHAR_CLASS_ALLOC is the allocated size of that array and is updated
+   here whenever the array has to grow, which is why it is passed by
+   pointer.
+   Returns REG_ESPACE if the array cannot be grown, REG_ECTYPE if
+   CLASS_NAME is none of the names handled below, REG_NOERROR otherwise.  */
+
+static reg_errcode_t
+#ifdef RE_ENABLE_I18N
+build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
+     re_charset_t *mbcset;
+     int *char_class_alloc;
+#else /* not RE_ENABLE_I18N */
+build_charclass (trans, sbcset, class_name, syntax)
+#endif /* not RE_ENABLE_I18N */
+     unsigned RE_TRANSLATE_TYPE trans;
+     re_bitset_ptr_t sbcset;
+     const unsigned char *class_name;
+     reg_syntax_t syntax;
+{
+  int i;
+  const char *name = (const char *) class_name;
+
+  /* With RE_ICASE, "upper" and "lower" both have to match letters of
+     either case, so treat them as "alpha".  */
+  if (syntax & RE_ICASE)
+    {
+      if (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0)
+        name = "alpha";
+    }
+
+#ifdef RE_ENABLE_I18N
+  /* Make room for one more entry if the array is full.  */
+  if (BE (mbcset->nchar_classes == *char_class_alloc, 0))
+    {
+      /* Double the size; the +1 covers an array that is still empty.
+         realloc is used because the array is NULL in that case.  */
+      int grown_alloc = 2 * mbcset->nchar_classes + 1;
+      wctype_t *grown = re_realloc (mbcset->char_classes, wctype_t,
+                                    grown_alloc);
+      if (BE (grown == NULL, 0))
+        return REG_ESPACE;
+      *char_class_alloc = grown_alloc;
+      mbcset->char_classes = grown;
+    }
+  mbcset->char_classes[mbcset->nchar_classes] = __wctype (name);
+  mbcset->nchar_classes++;
+#endif /* RE_ENABLE_I18N */
+
+  /* Set the bit of every single byte value accepted by CTYPE_FUNC,
+     after translating it through TRANS if there is one.  */
+#define BUILD_CHARCLASS_LOOP(ctype_func) \
+    for (i = 0; i < SBC_MAX; ++i) \
+      { \
+        if (ctype_func (i)) \
+          { \
+            int ch = trans ? trans[i] : i; \
+            bitset_set (sbcset, ch); \
+          } \
+      }
+
+  /* The names are mutually exclusive, so the order of the comparisons
+     does not matter.  */
+  if (strcmp (name, "alnum") == 0)
+    BUILD_CHARCLASS_LOOP (isalnum)
+  else if (strcmp (name, "alpha") == 0)
+    BUILD_CHARCLASS_LOOP (isalpha)
+  else if (strcmp (name, "blank") == 0)
+    BUILD_CHARCLASS_LOOP (isblank)
+  else if (strcmp (name, "cntrl") == 0)
+    BUILD_CHARCLASS_LOOP (iscntrl)
+  else if (strcmp (name, "digit") == 0)
+    BUILD_CHARCLASS_LOOP (isdigit)
+  else if (strcmp (name, "graph") == 0)
+    BUILD_CHARCLASS_LOOP (isgraph)
+  else if (strcmp (name, "lower") == 0)
+    BUILD_CHARCLASS_LOOP (islower)
+  else if (strcmp (name, "print") == 0)
+    BUILD_CHARCLASS_LOOP (isprint)
+  else if (strcmp (name, "punct") == 0)
+    BUILD_CHARCLASS_LOOP (ispunct)
+  else if (strcmp (name, "space") == 0)
+    BUILD_CHARCLASS_LOOP (isspace)
+  else if (strcmp (name, "upper") == 0)
+    BUILD_CHARCLASS_LOOP (isupper)
+  else if (strcmp (name, "xdigit") == 0)
+    BUILD_CHARCLASS_LOOP (isxdigit)
+  else
+    return REG_ECTYPE;
+
+  return REG_NOERROR;
+}
+
+/* Build the tree for an operator that stands for a character class plus
+   some extra characters (the comment on the EXTRA loop below gives \w,
+   which also matches '_', as the example).  The set consists of the
+   members of class CLASS_NAME, mapped through TRANS when non-null, plus
+   every byte of the NUL-terminated string EXTRA; it is complemented
+   when NON_MATCH is nonzero.
+
+   The result is a SIMPLE_BRACKET node, joined by an OP_ALT node with a
+   COMPLEX_BRACKET node when RE_ENABLE_I18N is defined and
+   DFA->mb_cur_max is greater than 1.
+
+   Returns the tree on success.  On failure returns NULL, stores the
+   reason in *ERR (REG_ESPACE, or whatever build_charclass reported) and
+   releases both character sets.  */
+
+static bin_tree_t *
+build_charclass_op (dfa, trans, class_name, extra, non_match, err)
+     re_dfa_t *dfa;
+     unsigned RE_TRANSLATE_TYPE trans;
+     const unsigned char *class_name;
+     const unsigned char *extra;
+     int non_match;
+     reg_errcode_t *err;
+{
+  re_bitset_ptr_t sbcset;
+#ifdef RE_ENABLE_I18N
+  re_charset_t *mbcset;
+  int alloc = 0;
+#endif /* RE_ENABLE_I18N */
+  reg_errcode_t ret;
+  re_token_t br_token;
+  bin_tree_t *tree;
+
+  sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
+#ifdef RE_ENABLE_I18N
+  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
+#endif /* RE_ENABLE_I18N */
+
+#ifdef RE_ENABLE_I18N
+  if (BE (sbcset == NULL || mbcset == NULL, 0))
+#else /* not RE_ENABLE_I18N */
+  if (BE (sbcset == NULL, 0))
+#endif /* not RE_ENABLE_I18N */
+    {
+      /* Only one of the two allocations may have failed, so release the
+         other one instead of leaking it.  Freeing NULL is a no-op, and
+         MBCSET owns no arrays yet, so a plain re_free is enough.  */
+      re_free (sbcset);
+#ifdef RE_ENABLE_I18N
+      re_free (mbcset);
+#endif /* RE_ENABLE_I18N */
+      *err = REG_ESPACE;
+      return NULL;
+    }
+
+#ifdef RE_ENABLE_I18N
+  if (non_match)
+    mbcset->non_match = 1;
+#endif /* RE_ENABLE_I18N */
+
+  /* We don't care the syntax in this case.  */
+  ret = build_charclass (trans, sbcset,
+#ifdef RE_ENABLE_I18N
+                         mbcset, &alloc,
+#endif /* RE_ENABLE_I18N */
+                         class_name, 0);
+
+  if (BE (ret != REG_NOERROR, 0))
+    {
+      re_free (sbcset);
+#ifdef RE_ENABLE_I18N
+      free_charset (mbcset);
+#endif /* RE_ENABLE_I18N */
+      *err = ret;
+      return NULL;
+    }
+  /* \w match '_' also.  */
+  for (; *extra; extra++)
+    bitset_set (sbcset, *extra);
+
+  /* If it is non-matching list.  */
+  if (non_match)
+    bitset_not (sbcset);
+
+#ifdef RE_ENABLE_I18N
+  /* Ensure only single byte characters are set.  */
+  if (dfa->mb_cur_max > 1)
+    bitset_mask (sbcset, dfa->sb_char);
+#endif
+
+  /* Build a tree for simple bracket.  */
+  br_token.type = SIMPLE_BRACKET;
+  br_token.opr.sbcset = sbcset;
+  tree = create_token_tree (dfa, NULL, NULL, &br_token);
+  if (BE (tree == NULL, 0))
+    goto build_word_op_espace;
+
+#ifdef RE_ENABLE_I18N
+  if (dfa->mb_cur_max > 1)
+    {
+      bin_tree_t *mbc_tree;
+      /* Build a tree for complex bracket.  */
+      br_token.type = COMPLEX_BRACKET;
+      br_token.opr.mbcset = mbcset;
+      dfa->has_mb_node = 1;
+      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
+      if (BE (mbc_tree == NULL, 0))
+        goto build_word_op_espace;
+      /* Then join them by ALT node.  It is the result of create_tree
+         that has to be checked here: MBC_TREE is known to be non-null
+         at this point, and testing it instead would hand a NULL tree
+         back to the caller without setting *ERR.  */
+      tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
+      if (BE (tree == NULL, 0))
+        goto build_word_op_espace;
+    }
+  else
+    /* The multibyte set is not needed in a single byte locale.  */
+    free_charset (mbcset);
+#endif /* RE_ENABLE_I18N */
+  return tree;
+
+ build_word_op_espace:
+  re_free (sbcset);
+#ifdef RE_ENABLE_I18N
+  free_charset (mbcset);
+#endif /* RE_ENABLE_I18N */
+  *err = REG_ESPACE;
+  return NULL;
+}
+
+/* This is intended for the expressions like "a{1,3}".
+   Fetch tokens from `input' until the token that closes the number (the
+   closing token of the interval, or a ','), and return the decimal
+   number they spell.
+   Return -1 if the number field is empty, like in "{,1}".
+   Return -2 if an error occurred: the input ended, a token was not a
+   digit, or the number exceeded RE_DUP_MAX.  After such an error the
+   remaining tokens up to the closing one are still consumed.  */
+
+static int
+fetch_number (input, token, syntax)
+     re_string_t *input;
+     re_token_t *token;
+     reg_syntax_t syntax;
+{
+  int value = -1;
+
+  for (;;)
+    {
+      unsigned char c;
+
+      fetch_token (token, input, syntax);
+      c = token->opr.c;
+      if (BE (token->type == END_OF_RE, 0))
+        return -2;
+      if (token->type == OP_CLOSE_DUP_NUM || c == ',')
+        return value;
+
+      if (value == -2 || token->type != CHARACTER || c < '0' || c > '9')
+        /* An earlier error sticks; a non-digit is a new one.  */
+        value = -2;
+      else if (value == -1)
+        /* First digit.  */
+        value = c - '0';
+      else
+        value = value * 10 + c - '0';
+
+      if (value > RE_DUP_MAX)
+        value = -2;
+    }
+}
+
+#ifdef RE_ENABLE_I18N
+/* Release CSET together with the arrays it points to.  The collating
+   symbol, equivalence class and range arrays are released in _LIBC
+   builds only.  */
+static void
+free_charset (re_charset_t *cset)
+{
+# ifdef _LIBC
+  re_free (cset->range_ends);
+  re_free (cset->range_starts);
+  re_free (cset->equiv_classes);
+  re_free (cset->coll_syms);
+# endif
+  re_free (cset->char_classes);
+  re_free (cset->mbchars);
+  /* The structure itself goes last.  */
+  re_free (cset);
+}
+#endif /* RE_ENABLE_I18N */
+
+/* Functions for binary tree operation. */
+
+/* Create a tree node of type TYPE whose children are LEFT and RIGHT.
+   Only the type of the node's token is filled in here; the node itself
+   comes from create_token_tree, whose result is returned unchanged.  */
+
+static bin_tree_t *
+create_tree (dfa, left, right, type)
+     re_dfa_t *dfa;
+     bin_tree_t *left;
+     bin_tree_t *right;
+     re_token_type_t type;
+{
+  re_token_t node_token;
+
+  node_token.type = type;
+  return create_token_tree (dfa, left, right, &node_token);
+}
+
+/* Create a tree node that carries a copy of *TOKEN and has LEFT and
+   RIGHT (either of which may be NULL) as its children.  Nodes are
+   handed out one after another from blocks of BIN_TREE_STORAGE_SIZE
+   entries chained on DFA->str_tree_storage; a new block is allocated
+   once the current one is used up.  Returns NULL if that allocation
+   fails.  */
+
+static bin_tree_t *
+create_token_tree (dfa, left, right, token)
+     re_dfa_t *dfa;
+     bin_tree_t *left;
+     bin_tree_t *right;
+     const re_token_t *token;
+{
+  bin_tree_t *node;
+
+  if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
+    {
+      /* The current block is exhausted: put a new one in front of the
+         chain and start handing out its entries.  */
+      bin_tree_storage_t *block = re_malloc (bin_tree_storage_t, 1);
+
+      if (block == NULL)
+        return NULL;
+      block->next = dfa->str_tree_storage;
+      dfa->str_tree_storage_idx = 0;
+      dfa->str_tree_storage = block;
+    }
+  node = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx];
+  dfa->str_tree_storage_idx++;
+
+  /* Copy the token, then clear the flags describing the new node.  */
+  node->token = *token;
+  node->token.duplicated = 0;
+  node->token.opt_subexp = 0;
+
+  node->parent = NULL;
+  node->first = NULL;
+  node->next = NULL;
+  node->node_idx = -1;
+
+  /* Attach the children and make them point back to the new node.  */
+  node->left = left;
+  if (left != NULL)
+    left->parent = node;
+  node->right = right;
+  if (right != NULL)
+    right->parent = node;
+
+  return node;
+}
+
+/* Tree walking callback: if NODE is the SUBEXP node whose index is the
+   integer passed in EXTRA, flag its token as an optional subexpression.
+   To be called from preorder or postorder.  Always returns
+   REG_NOERROR.  */
+
+static reg_errcode_t
+mark_opt_subexp (extra, node)
+     void *extra;
+     bin_tree_t *node;
+{
+  /* EXTRA is not a real pointer, it carries the subexpression index.  */
+  const int wanted_idx = (int) (long) extra;
+
+  if (node->token.type != SUBEXP)
+    return REG_NOERROR;
+
+  if (node->token.opr.idx == wanted_idx)
+    node->token.opt_subexp = 1;
+  return REG_NOERROR;
+}
+
+/* Free the character set that NODE refers to, if any: the multibyte set
+   of a COMPLEX_BRACKET token (RE_ENABLE_I18N only) or the bitset of a
+   SIMPLE_BRACKET token.  Tokens flagged as duplicated are left alone.  */
+
+static void
+free_token (re_token_t *node)
+{
+  if (node->duplicated != 0)
+    return;
+
+#ifdef RE_ENABLE_I18N
+  if (node->type == COMPLEX_BRACKET)
+    {
+      free_charset (node->opr.mbcset);
+      return;
+    }
+#endif /* RE_ENABLE_I18N */
+
+  if (node->type == SIMPLE_BRACKET)
+    re_free (node->opr.sbcset);
+}
+
+/* Worker function for tree walking.  Free the memory allocated inside
+   the token of NODE; the walker is what applies this to the children as
+   well.  EXTRA is not used.  Always returns REG_NOERROR.  */
+
+static reg_errcode_t
+free_tree (void *extra, bin_tree_t *node)
+{
+  re_token_t *tok = &node->token;
+
+  (void) extra;
+  free_token (tok);
+  return REG_NOERROR;
+}
+
+
+/* Duplicate the tree rooted at ROOT, and return the root of the copy, or
+ NULL if a node could not be created.  Every node of the copy gets a
+ copy of the corresponding original token with token.duplicated set
+ to 1.  This is a preorder
+ visit similar to the one implemented by the generic visitor, but
+ we need more infrastructure to maintain two parallel trees --- so,
+ it's easier to duplicate.
+
+ NOTE(review): the walk only stops after climbing to a node whose
+ parent is NULL, so this looks like it assumes ROOT itself has no
+ parent -- confirm against the callers. */
+
+static bin_tree_t *
+duplicate_tree (root, dfa)
+ const bin_tree_t *root;
+ re_dfa_t *dfa;
+{
+ const bin_tree_t *node;
+ bin_tree_t *dup_root;
+ /* P_NEW is the slot that receives the next copied node (first the
+ result itself, later the left or right link of a copied parent).
+ DUP_NODE is the node that becomes the parent of the next copy; it
+ starts out as the parent of ROOT. */
+ bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
+
+ /* NODE walks the original tree while DUP_NODE follows it in the copy. */
+ for (node = root; ; )
+ {
+ /* Create a new tree and link it back to the current parent. */
+ *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
+ if (*p_new == NULL)
+ return NULL;
+ (*p_new)->parent = dup_node;
+ /* Mark the copy as a duplicate. */
+ (*p_new)->token.duplicated = 1;
+ dup_node = *p_new;
+
+ /* Go to the left node, or up and to the right. */
+ if (node->left)
+ {
+ node = node->left;
+ p_new = &dup_node->left;
+ }
+ else
+ {
+ /* Climb while there is no right subtree left to visit: either the
+ node has none, or it is the one we have just come up from. */
+ const bin_tree_t *prev = NULL;
+ while (node->right == prev || node->right == NULL)
+ {
+ prev = node;
+ node = node->parent;
+ dup_node = dup_node->parent;
+ /* Climbed past the top of the original tree: everything has
+ been copied. */
+ if (!node)
+ return dup_root;
+ }
+ /* Continue with the unvisited right subtree; its copy goes into
+ the right link of the matching copied node. */
+ node = node->right;
+ p_new = &dup_node->right;
+ }
+ }
+}
diff --git a/gnu/lib/libregex/regex.c b/gnu/lib/libregex/regex.c
index 358071b0dc78..7a4f304cddc5 100644
--- a/gnu/lib/libregex/regex.c
+++ b/gnu/lib/libregex/regex.c
@@ -1,85 +1,51 @@
-/* Extended regular expression matching and search library,
- version 0.12.
- (Implements POSIX draft P1003.2/D11.2, except for some of the
- internationalization features.)
- Copyright (C) 1993-1999, 2000, 2001 Free Software Foundation, Inc.
+/* Extended regular expression matching and search library.
+ Copyright (C) 2002, 2003 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Library General Public License as
- published by the Free Software Foundation; either version 2 of the
- License, or (at your option) any later version.
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Library General Public License for more details.
+ Lesser General Public License for more details.
- You should have received a copy of the GNU Library General Public
- License along with the GNU C Library; see the file COPYING.LIB. If not,
- write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- Boston, MA 02111-1307, USA. */
-
-/* AIX requires this to be the first thing in the file. */
-#if defined _AIX && !defined REGEX_MALLOC
- #pragma alloca
-#endif
-
-#undef _GNU_SOURCE
-#define _GNU_SOURCE
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
#ifdef HAVE_CONFIG_H
-# include <config.h>
+#include "config.h"
#endif
-#ifndef PARAMS
-# if defined __GNUC__ || (defined __STDC__ && __STDC__)
-# define PARAMS(args) args
-# else
-# define PARAMS(args) ()
-# endif /* GCC. */
-#endif /* Not PARAMS. */
-
-#if defined STDC_HEADERS && !defined emacs
-# include <stddef.h>
+#ifdef _AIX
+#pragma alloca
#else
-/* We need this for `regex.h', and perhaps for the Emacs include files. */
-# include <sys/types.h>
-#endif
-
-#define WIDE_CHAR_SUPPORT (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC)
-
-/* For platform which support the ISO C amendement 1 functionality we
- support user defined character classes. */
-#if defined _LIBC || WIDE_CHAR_SUPPORT
-/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
-# include <wchar.h>
-# include <wctype.h>
+# ifndef allocax /* predefined by HP cc +Olibcalls */
+# ifdef __GNUC__
+# define alloca(size) __builtin_alloca (size)
+# else
+# if HAVE_ALLOCA_H
+# include <alloca.h>
+# else
+# ifdef __hpux
+ void *alloca ();
+# else
+# if !defined __OS2__ && !defined WIN32
+ char *alloca ();
+# else
+# include <malloc.h> /* OS/2 defines alloca in here */
+# endif
+# endif
+# endif
+# endif
+# endif
#endif
-/* This is for multi byte string support. */
-#ifdef MBS_SUPPORT
-# define CHAR_TYPE wchar_t
-# define US_CHAR_TYPE wchar_t/* unsigned character type */
-# define COMPILED_BUFFER_VAR wc_buffer
-# define OFFSET_ADDRESS_SIZE 1 /* the size which STORE_NUMBER macro use */
-# define CHAR_CLASS_SIZE ((__alignof__(wctype_t)+sizeof(wctype_t))/sizeof(CHAR_TYPE)+1)
-# define PUT_CHAR(c) \
- do { \
- if (MB_CUR_MAX == 1) \
- putchar (c); \
- else \
- printf ("%C", (wint_t) c); /* Should we use wide stream?? */ \
- } while (0)
-# define TRUE 1
-# define FALSE 0
-#else
-# define CHAR_TYPE char
-# define US_CHAR_TYPE unsigned char /* unsigned character type */
-# define COMPILED_BUFFER_VAR bufp->buffer
-# define OFFSET_ADDRESS_SIZE 2
-# define PUT_CHAR(c) putchar (c)
-#endif /* MBS_SUPPORT */
-
#ifdef _LIBC
/* We have to keep the namespace clean. */
# define regfree(preg) __regfree (preg)
@@ -102,7787 +68,30 @@
__re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
-# define btowc __btowc
-
-/* We are also using some library internals. */
-# include <locale/localeinfo.h>
-# include <locale/elem-hash.h>
-# include <langinfo.h>
-# include <locale/coll-lookup.h>
-#endif
-
-/* This is for other GNU distributions with internationalized messages. */
-#if HAVE_LIBINTL_H || defined _LIBC
-# include <libintl.h>
-# ifdef _LIBC
-# undef gettext
-# define gettext(msgid) __dcgettext ("libc", msgid, LC_MESSAGES)
-# endif
-#else
-# define gettext(msgid) (msgid)
-#endif
-
-#ifndef gettext_noop
-/* This define is so xgettext can find the internationalizable
- strings. */
-# define gettext_noop(String) String
+# include "../locale/localeinfo.h"
#endif
-/* The `emacs' switch turns on certain matching commands
- that make sense only in Emacs. */
-#ifdef emacs
-
-# include "lisp.h"
-# include "buffer.h"
-# include "syntax.h"
-
-#else /* not emacs */
-
-/* If we are not linking with Emacs proper,
- we can't use the relocating allocator
- even if config.h says that we can. */
-# undef REL_ALLOC
-
-# if defined STDC_HEADERS || defined _LIBC
-# include <stdlib.h>
-# else
-char *malloc ();
-char *realloc ();
-# endif
-
-/* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
- If nothing else has been done, use the method below. */
-# ifdef INHIBIT_STRING_HEADER
-# if !(defined HAVE_BZERO && defined HAVE_BCOPY)
-# if !defined bzero && !defined bcopy
-# undef INHIBIT_STRING_HEADER
-# endif
-# endif
-# endif
-
-/* This is the normal way of making sure we have a bcopy and a bzero.
- This is used in most programs--a few other programs avoid this
- by defining INHIBIT_STRING_HEADER. */
-# ifndef INHIBIT_STRING_HEADER
-# if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
-# include <string.h>
-# ifndef bzero
-# ifndef _LIBC
-# define bzero(s, n) (memset (s, '\0', n), (s))
-# else
-# define bzero(s, n) __bzero (s, n)
-# endif
-# endif
-# else
-# include <strings.h>
-# ifndef memcmp
-# define memcmp(s1, s2, n) bcmp (s1, s2, n)
-# endif
-# ifndef memcpy
-# define memcpy(d, s, n) (bcopy (s, d, n), (d))
-# endif
-# endif
-# endif
+/* POSIX says that <sys/types.h> must be included (by the caller) before
+ <regex.h>. */
+#include <sys/types.h>
-/* Define the syntax stuff for \<, \>, etc. */
+/* On some systems, limits.h sets RE_DUP_MAX to a lower value than
+ GNU regex allows. Include it before <regex.h>, which correctly
+ #undefs RE_DUP_MAX and sets it to the right value. */
+#include <limits.h>
-/* This must be nonzero for the wordchar and notwordchar pattern
- commands in re_match_2. */
-# ifndef Sword
-# define Sword 1
-# endif
-
-# ifdef SWITCH_ENUM_BUG
-# define SWITCH_ENUM_CAST(x) ((int)(x))
-# else
-# define SWITCH_ENUM_CAST(x) (x)
-# endif
-
-#endif /* not emacs */
-
-#if defined _LIBC || HAVE_LIMITS_H
-# include <limits.h>
-#endif
-
-#ifndef MB_LEN_MAX
-# define MB_LEN_MAX 1
-#endif
-
-/* Get the interface, including the syntax bits. */
#include <regex.h>
+#include "regex_internal.h"
-/* isalpha etc. are used for the character classes. */
-#include <ctype.h>
-
-/* Jim Meyering writes:
-
- "... Some ctype macros are valid only for character codes that
- isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
- using /bin/cc or gcc but without giving an ansi option). So, all
- ctype uses should be through macros like ISPRINT... If
- STDC_HEADERS is defined, then autoconf has verified that the ctype
- macros don't need to be guarded with references to isascii. ...
- Defining isascii to 1 should let any compiler worth its salt
- eliminate the && through constant folding."
- Solaris defines some of these symbols so we must undefine them first. */
-
-#undef ISASCII
-#if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
-# define ISASCII(c) 1
-#else
-# define ISASCII(c) isascii(c)
-#endif
-
-#ifdef isblank
-# define ISBLANK(c) (ISASCII (c) && isblank (c))
-#else
-# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
-#endif
-#ifdef isgraph
-# define ISGRAPH(c) (ISASCII (c) && isgraph (c))
-#else
-# define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
-#endif
-
-#undef ISPRINT
-#define ISPRINT(c) (ISASCII (c) && isprint (c))
-#define ISDIGIT(c) (ISASCII (c) && isdigit (c))
-#define ISALNUM(c) (ISASCII (c) && isalnum (c))
-#define ISALPHA(c) (ISASCII (c) && isalpha (c))
-#define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
-#define ISLOWER(c) (ISASCII (c) && islower (c))
-#define ISPUNCT(c) (ISASCII (c) && ispunct (c))
-#define ISSPACE(c) (ISASCII (c) && isspace (c))
-#define ISUPPER(c) (ISASCII (c) && isupper (c))
-#define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
-
-#ifdef _tolower
-# define TOLOWER(c) _tolower(c)
-#else
-# define TOLOWER(c) tolower(c)
-#endif
-
-#ifndef NULL
-# define NULL (void *)0
-#endif
-
-/* We remove any previous definition of `SIGN_EXTEND_CHAR',
- since ours (we hope) works properly with all combinations of
- machines, compilers, `char' and `unsigned char' argument types.
- (Per Bothner suggested the basic approach.) */
-#undef SIGN_EXTEND_CHAR
-#if __STDC__
-# define SIGN_EXTEND_CHAR(c) ((signed char) (c))
-#else /* not __STDC__ */
-/* As in Harbison and Steele. */
-# define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
-#endif
-
-#ifndef emacs
-/* How many characters in the character set. */
-# define CHAR_SET_SIZE 256
-
-# ifdef SYNTAX_TABLE
-
-extern char *re_syntax_table;
-
-# else /* not SYNTAX_TABLE */
-
-static char re_syntax_table[CHAR_SET_SIZE];
-
-static void init_syntax_once PARAMS ((void));
-
-static void
-init_syntax_once () /* Fill re_syntax_table on the first call: Sword for alphanumerics and '_', zero for everything else. */
-{
- register int c;
- static int done = 0; /* Set at the end of the first call so later calls return immediately. */
-
- if (done)
- return;
- bzero (re_syntax_table, sizeof re_syntax_table); /* Start with every entry cleared. */
-
- for (c = 0; c < CHAR_SET_SIZE; ++c)
- if (ISALNUM (c)) /* ISALNUM is isalnum guarded by ISASCII. */
- re_syntax_table[c] = Sword;
-
- re_syntax_table['_'] = Sword; /* Underscore also counts as a word constituent. */
-
- done = 1;
-}
-
-# endif /* not SYNTAX_TABLE */
-
-# define SYNTAX(c) re_syntax_table[(unsigned char) (c)]
-
-#endif /* emacs */
-
-/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
- use `alloca' instead of `malloc'. This is because using malloc in
- re_search* or re_match* could cause memory leaks when C-g is used in
- Emacs; also, malloc is slower and causes storage fragmentation. On
- the other hand, malloc is more portable, and easier to debug.
-
- Because we sometimes use alloca, some routines have to be macros,
- not functions -- `alloca'-allocated space disappears at the end of the
- function it is called in. */
-
-#ifdef REGEX_MALLOC
-
-# define REGEX_ALLOCATE malloc
-# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
-# define REGEX_FREE free
-
-#else /* not REGEX_MALLOC */
-
-/* Emacs already defines alloca, sometimes. */
-# ifndef alloca
-
-/* Make alloca work the best possible way. */
-# ifdef __GNUC__
-# define alloca __builtin_alloca
-# else /* not __GNUC__ */
-# if HAVE_ALLOCA_H
-# include <alloca.h>
-# endif /* HAVE_ALLOCA_H */
-# endif /* not __GNUC__ */
-
-# endif /* not alloca */
-
-# define REGEX_ALLOCATE alloca
-
-/* Assumes a `char *destination' variable. */
-# define REGEX_REALLOCATE(source, osize, nsize) \
- (destination = (char *) alloca (nsize), \
- memcpy (destination, source, osize))
-
-/* No need to do anything to free, after alloca. */
-# define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
-
-#endif /* not REGEX_MALLOC */
-
-/* Define how to allocate the failure stack. */
-
-#if defined REL_ALLOC && defined REGEX_MALLOC
-
-# define REGEX_ALLOCATE_STACK(size) \
- r_alloc (&failure_stack_ptr, (size))
-# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
- r_re_alloc (&failure_stack_ptr, (nsize))
-# define REGEX_FREE_STACK(ptr) \
- r_alloc_free (&failure_stack_ptr)
-
-#else /* not using relocating allocator */
-
-# ifdef REGEX_MALLOC
-
-# define REGEX_ALLOCATE_STACK malloc
-# define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
-# define REGEX_FREE_STACK free
-
-# else /* not REGEX_MALLOC */
-
-# define REGEX_ALLOCATE_STACK alloca
-
-# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
- REGEX_REALLOCATE (source, osize, nsize)
-/* No need to explicitly free anything. */
-# define REGEX_FREE_STACK(arg)
-
-# endif /* not REGEX_MALLOC */
-#endif /* not using relocating allocator */
-
-
-/* True if `size1' is non-NULL and PTR is pointing anywhere inside
- `string1' or just past its end. This works if PTR is NULL, which is
- a good thing. */
-#define FIRST_STRING_P(ptr) \
- (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
-
-/* (Re)Allocate N items of type T using malloc, or fail. */
-#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
-#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
-#define RETALLOC_IF(addr, n, t) \
- if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
-#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
-
-#define BYTEWIDTH 8 /* In bits. */
-
-#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
-
-#undef MAX
-#undef MIN
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-
-typedef char boolean;
-#define false 0
-#define true 1
-
-static int re_match_2_internal PARAMS ((struct re_pattern_buffer *bufp,
- const char *string1, int size1,
- const char *string2, int size2,
- int pos,
- struct re_registers *regs,
- int stop));
-
-/* These are the command codes that appear in compiled regular
- expressions. Some opcodes are followed by argument bytes. A
- command code can specify any interpretation whatsoever for its
- arguments. Zero bytes may appear in the compiled regular expression. */
-
-typedef enum
-{
- no_op = 0,
-
- /* Succeed right away--no more backtracking. */
- succeed,
-
- /* Followed by one byte giving n, then by n literal bytes. */
- exactn,
-
-#ifdef MBS_SUPPORT
- /* Same as exactn, but contains binary data. */
- exactn_bin,
-#endif
-
- /* Matches any (more or less) character. */
- anychar,
-
- /* Matches any one char belonging to specified set. First
- following byte is number of bitmap bytes. Then come bytes
- for a bitmap saying which chars are in. Bits in each byte
- are ordered low-bit-first. A character is in the set if its
- bit is 1. A character too large to have a bit in the map is
- automatically not in the set. */
- /* ifdef MBS_SUPPORT, following element is length of character
- classes, length of collating symbols, length of equivalence
- classes, length of character ranges, and length of characters.
- Next, character class element, collating symbols elements,
- equivalence class elements, range elements, and character
- elements follow.
- See regex_compile function. */
- charset,
-
- /* Same parameters as charset, but match any character that is
- not one of those specified. */
- charset_not,
-
- /* Start remembering the text that is matched, for storing in a
- register. Followed by one byte with the register number, in
- the range 0 to one less than the pattern buffer's re_nsub
- field. Then followed by one byte with the number of groups
- inner to this one. (This last has to be part of the
- start_memory only because we need it in the on_failure_jump
- of re_match_2.) */
- start_memory,
-
- /* Stop remembering the text that is matched and store it in a
- memory register. Followed by one byte with the register
- number, in the range 0 to one less than `re_nsub' in the
- pattern buffer, and one byte with the number of inner groups,
- just like `start_memory'. (We need the number of inner
- groups here because we don't have any easy way of finding the
- corresponding start_memory when we're at a stop_memory.) */
- stop_memory,
-
- /* Match a duplicate of something remembered. Followed by one
- byte containing the register number. */
- duplicate,
-
- /* Fail unless at beginning of line. */
- begline,
-
- /* Fail unless at end of line. */
- endline,
-
- /* Succeeds if at beginning of buffer (if emacs) or at beginning
- of string to be matched (if not). */
- begbuf,
-
- /* Analogously, for end of buffer/string. */
- endbuf,
-
- /* Followed by two byte relative address to which to jump. */
- jump,
-
- /* Same as jump, but marks the end of an alternative. */
- jump_past_alt,
-
- /* Followed by two-byte relative address of place to resume at
- in case of failure. */
- /* ifdef MBS_SUPPORT, the size of address is 1. */
- on_failure_jump,
-
- /* Like on_failure_jump, but pushes a placeholder instead of the
- current string position when executed. */
- on_failure_keep_string_jump,
-
- /* Throw away latest failure point and then jump to following
- two-byte relative address. */
- /* ifdef MBS_SUPPORT, the size of address is 1. */
- pop_failure_jump,
-
- /* Change to pop_failure_jump if we know we won't have to backtrack
- to match; otherwise change to jump. This is used to jump
- back to the beginning of a repeat. If what follows this jump
- clearly won't match what the repeat does, such that we can be
- sure that there is no use backtracking out of repetitions
- already matched, then we change it to a pop_failure_jump.
- Followed by two-byte address. */
- /* ifdef MBS_SUPPORT, the size of address is 1. */
- maybe_pop_jump,
-
- /* Jump to following two-byte address, and push a dummy failure
- point. This failure point will be thrown away if an attempt
- is made to use it for a failure. A `+' construct makes this
- before the first repeat. Also used as an intermediary kind
- of jump when compiling an alternative. */
- /* ifdef MBS_SUPPORT, the size of address is 1. */
- dummy_failure_jump,
-
- /* Push a dummy failure point and continue. Used at the end of
- alternatives. */
- push_dummy_failure,
-
- /* Followed by two-byte relative address and two-byte number n.
- After matching N times, jump to the address upon failure. */
- /* ifdef MBS_SUPPORT, the size of address is 1. */
- succeed_n,
-
- /* Followed by two-byte relative address, and two-byte number n.
- Jump to the address N times, then fail. */
- /* ifdef MBS_SUPPORT, the size of address is 1. */
- jump_n,
-
- /* Set the following two-byte relative address to the
- subsequent two-byte number. The address *includes* the two
- bytes of number. */
- /* ifdef MBS_SUPPORT, the size of address is 1. */
- set_number_at,
-
- wordchar, /* Matches any word-constituent character. */
- notwordchar, /* Matches any char that is not a word-constituent. */
-
- wordbeg, /* Succeeds if at word beginning. */
- wordend, /* Succeeds if at word end. */
-
- wordbound, /* Succeeds if at a word boundary. */
- notwordbound /* Succeeds if not at a word boundary. */
-
-#ifdef emacs
- ,before_dot, /* Succeeds if before point. */
- at_dot, /* Succeeds if at point. */
- after_dot, /* Succeeds if after point. */
-
- /* Matches any character whose syntax is specified. Followed by
- a byte which contains a syntax code, e.g., Sword. */
- syntaxspec,
-
- /* Matches any character whose syntax is not that specified. */
- notsyntaxspec
-#endif /* emacs */
-} re_opcode_t;
-
-/* Common operations on the compiled pattern. */
-
-/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
-/* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */
-
-#ifdef MBS_SUPPORT
-# define STORE_NUMBER(destination, number) \
- do { \
- *(destination) = (US_CHAR_TYPE)(number); \
- } while (0)
-#else
-# define STORE_NUMBER(destination, number) \
- do { \
- (destination)[0] = (number) & 0377; \
- (destination)[1] = (number) >> 8; \
- } while (0)
-#endif /* MBS_SUPPORT */
-
-/* Same as STORE_NUMBER, except increment DESTINATION to
- the byte after where the number is stored. Therefore, DESTINATION
- must be an lvalue. */
-/* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */
-
-#define STORE_NUMBER_AND_INCR(destination, number) \
- do { \
- STORE_NUMBER (destination, number); \
- (destination) += OFFSET_ADDRESS_SIZE; \
- } while (0)
-
-/* Put into DESTINATION a number stored in two contiguous bytes starting
- at SOURCE. */
-/* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */
-
-#ifdef MBS_SUPPORT
-# define EXTRACT_NUMBER(destination, source) \
- do { \
- (destination) = *(source); \
- } while (0)
-#else
-# define EXTRACT_NUMBER(destination, source) \
- do { \
- (destination) = *(source) & 0377; \
- (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
- } while (0)
-#endif
-
-#ifdef DEBUG
-static void extract_number _RE_ARGS ((int *dest, US_CHAR_TYPE *source));
-static void
-extract_number (dest, source) /* Function form of EXTRACT_NUMBER: store the number found at SOURCE into *DEST. */
- int *dest;
- US_CHAR_TYPE *source;
-{
-#ifdef MBS_SUPPORT
- *dest = *source; /* The number occupies a single element. */
-#else
- int temp = SIGN_EXTEND_CHAR (*(source + 1)); /* High byte, sign-extended. */
- *dest = *source & 0377; /* Low byte. */
- *dest += temp << 8; /* NOTE(review): temp is negative when the high byte has its top bit set; left-shifting a negative int is undefined in ISO C. */
-#endif
-}
-
-# ifndef EXTRACT_MACROS /* To debug the macros. */
-# undef EXTRACT_NUMBER
-# define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
-# endif /* not EXTRACT_MACROS */
-
-#endif /* DEBUG */
-
-/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
- SOURCE must be an lvalue. */
-
-#define EXTRACT_NUMBER_AND_INCR(destination, source) \
- do { \
- EXTRACT_NUMBER (destination, source); \
- (source) += OFFSET_ADDRESS_SIZE; \
- } while (0)
-
-#ifdef DEBUG
-static void extract_number_and_incr _RE_ARGS ((int *destination,
- US_CHAR_TYPE **source));
-static void
-extract_number_and_incr (destination, source) /* Function form of EXTRACT_NUMBER_AND_INCR. */
- int *destination;
- US_CHAR_TYPE **source;
-{
- extract_number (destination, *source); /* Read the number at the current position. */
- *source += OFFSET_ADDRESS_SIZE; /* Advance the caller's pointer past the number. */
-}
-
-# ifndef EXTRACT_MACROS
-# undef EXTRACT_NUMBER_AND_INCR
-# define EXTRACT_NUMBER_AND_INCR(dest, src) \
- extract_number_and_incr (&dest, &src)
-# endif /* not EXTRACT_MACROS */
-
-#endif /* DEBUG */
-
-/* If DEBUG is defined, Regex prints many voluminous messages about what
- it is doing (if the variable `debug' is nonzero). If linked with the
- main program in `iregex.c', you can enter patterns and strings
- interactively. And if linked with the main program in `main.c' and
- the other test files, you can run the already-written tests. */
-
-#ifdef DEBUG
-
-/* We use standard I/O for debugging. */
-# include <stdio.h>
-
-/* It is useful to test things that ``must'' be true when debugging. */
-# include <assert.h>
-
-static int debug;
-
-# define DEBUG_STATEMENT(e) e
-# define DEBUG_PRINT1(x) if (debug) printf (x)
-# define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
-# define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
-# define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
-# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
- if (debug) print_partial_compiled_pattern (s, e)
-# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
- if (debug) print_double_string (w, s1, sz1, s2, sz2)
-
-
-/* Print the fastmap in human-readable form: each run of consecutive nonzero entries is printed as its first character, or as FIRST-LAST when the run is longer than one entry, followed by a newline. */
-
-void
-print_fastmap (fastmap)
- char *fastmap;
-{
- unsigned was_a_range = 0; /* Nonzero once the current run extends past its first entry. */
- unsigned i = 0; /* Index of the next fastmap entry to examine. */
+#include "regex_internal.c"
+#include "regcomp.c"
+#include "regexec.c"
- while (i < (1 << BYTEWIDTH))
- {
- if (fastmap[i++])
- {
- was_a_range = 0;
- putchar (i - 1); /* First character of the run (i was already advanced). */
- while (i < (1 << BYTEWIDTH) && fastmap[i])
- {
- was_a_range = 1;
- i++;
- }
- if (was_a_range)
- {
- printf ("-");
- putchar (i - 1); /* Last character of the run. */
- }
- }
- }
- putchar ('\n');
-}
-
-
-/* Print a compiled pattern string in human-readable form, one command per line, starting at
- the START pointer into it and ending just before the pointer END. Prints "(null)" if START is NULL. */
-
-void
-print_partial_compiled_pattern (start, end)
- US_CHAR_TYPE *start;
- US_CHAR_TYPE *end;
-{
- int mcnt, mcnt2;
- US_CHAR_TYPE *p1;
- US_CHAR_TYPE *p = start;
- US_CHAR_TYPE *pend = end;
-
- if (start == NULL)
- {
- printf ("(null)\n");
- return;
- }
-
- /* Loop over pattern commands. */
- while (p < pend)
- {
-#ifdef _LIBC
- printf ("%td:\t", p - start);
-#else
- printf ("%ld:\t", (long int) (p - start));
-#endif
-
- switch ((re_opcode_t) *p++)
- {
- case no_op:
- printf ("/no_op");
- break;
-
- case exactn:
- mcnt = *p++;
- printf ("/exactn/%d", mcnt);
- do
- {
- putchar ('/');
- PUT_CHAR (*p++);
- }
- while (--mcnt); /* NOTE(review): relies on the stored count being nonzero; a zero count would not stop after zero iterations. */
- break;
-
-#ifdef MBS_SUPPORT
- case exactn_bin:
- mcnt = *p++;
- printf ("/exactn_bin/%d", mcnt);
- do
- {
- printf("/%lx", (long int) *p++);
- }
- while (--mcnt);
- break;
-#endif /* MBS_SUPPORT */
-
- case start_memory:
- mcnt = *p++;
- printf ("/start_memory/%d/%ld", mcnt, (long int) *p++);
- break;
-
- case stop_memory:
- mcnt = *p++;
- printf ("/stop_memory/%d/%ld", mcnt, (long int) *p++);
- break;
-
- case duplicate:
- printf ("/duplicate/%ld", (long int) *p++);
- break;
-
- case anychar:
- printf ("/anychar");
- break;
-
- case charset:
- case charset_not:
- {
-#ifdef MBS_SUPPORT
- int i, length;
- wchar_t *workp = p;
- printf ("/charset [%s",
- (re_opcode_t) *(workp - 1) == charset_not ? "^" : "");
- p += 5;
- length = *workp++; /* the length of char_classes */
- for (i=0 ; i<length ; i++)
- printf("[:%lx:]", (long int) *p++);
- length = *workp++; /* the length of collating_symbol */
- for (i=0 ; i<length ;)
- {
- printf("[.");
- while(*p != 0)
- PUT_CHAR((i++,*p++));
- i++,p++;
- printf(".]");
- }
- length = *workp++; /* the length of equivalence_class */
- for (i=0 ; i<length ;)
- {
- printf("[=");
- while(*p != 0)
- PUT_CHAR((i++,*p++));
- i++,p++;
- printf("=]");
- }
- length = *workp++; /* the length of char_range */
- for (i=0 ; i<length ; i++)
- {
- wchar_t range_start = *p++;
- wchar_t range_end = *p++;
- if (MB_CUR_MAX == 1)
- printf("%c-%c", (char) range_start, (char) range_end);
- else
- printf("%C-%C", (wint_t) range_start, (wint_t) range_end);
- }
- length = *workp++; /* the length of char */
- for (i=0 ; i<length ; i++)
- if (MB_CUR_MAX == 1)
- putchar (*p++);
- else
- printf("%C", (wint_t) *p++);
- putchar (']');
-#else
- register int c, last = -100;
- register int in_range = 0;
-
- printf ("/charset [%s",
- (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
-
- assert (p + *p < pend);
-
- for (c = 0; c < 256; c++)
- if (c / 8 < *p
- && (p[1 + (c/8)] & (1 << (c % 8))))
- {
- /* Are we starting a range? */
- if (last + 1 == c && ! in_range)
- {
- putchar ('-');
- in_range = 1;
- }
- /* Have we broken a range? */
- else if (last + 1 != c && in_range)
- {
- putchar (last);
- in_range = 0;
- }
-
- if (! in_range)
- putchar (c);
-
- last = c;
- }
-
- if (in_range)
- putchar (last);
-
- putchar (']');
-
- p += 1 + *p; /* Skip the bitmap length byte and the bitmap itself. */
-#endif /* MBS_SUPPORT */
- }
- break;
-
- case begline:
- printf ("/begline");
- break;
-
- case endline:
- printf ("/endline");
- break;
-
- case on_failure_jump:
- extract_number_and_incr (&mcnt, &p);
-#ifdef _LIBC
- printf ("/on_failure_jump to %td", p + mcnt - start);
-#else
- printf ("/on_failure_jump to %ld", (long int) (p + mcnt - start));
-#endif
- break;
-
- case on_failure_keep_string_jump:
- extract_number_and_incr (&mcnt, &p);
-#ifdef _LIBC
- printf ("/on_failure_keep_string_jump to %td", p + mcnt - start);
-#else
- printf ("/on_failure_keep_string_jump to %ld",
- (long int) (p + mcnt - start));
-#endif
- break;
-
- case dummy_failure_jump:
- extract_number_and_incr (&mcnt, &p);
-#ifdef _LIBC
- printf ("/dummy_failure_jump to %td", p + mcnt - start);
-#else
- printf ("/dummy_failure_jump to %ld", (long int) (p + mcnt - start));
-#endif
- break;
-
- case push_dummy_failure:
- printf ("/push_dummy_failure");
- break;
-
- case maybe_pop_jump:
- extract_number_and_incr (&mcnt, &p);
-#ifdef _LIBC
- printf ("/maybe_pop_jump to %td", p + mcnt - start);
-#else
- printf ("/maybe_pop_jump to %ld", (long int) (p + mcnt - start));
-#endif
- break;
-
- case pop_failure_jump:
- extract_number_and_incr (&mcnt, &p);
-#ifdef _LIBC
- printf ("/pop_failure_jump to %td", p + mcnt - start);
-#else
- printf ("/pop_failure_jump to %ld", (long int) (p + mcnt - start));
-#endif
- break;
-
- case jump_past_alt:
- extract_number_and_incr (&mcnt, &p);
-#ifdef _LIBC
- printf ("/jump_past_alt to %td", p + mcnt - start);
-#else
- printf ("/jump_past_alt to %ld", (long int) (p + mcnt - start));
-#endif
- break;
-
- case jump:
- extract_number_and_incr (&mcnt, &p);
-#ifdef _LIBC
- printf ("/jump to %td", p + mcnt - start);
-#else
- printf ("/jump to %ld", (long int) (p + mcnt - start));
-#endif
- break;
-
- case succeed_n:
- extract_number_and_incr (&mcnt, &p);
- p1 = p + mcnt;
- extract_number_and_incr (&mcnt2, &p);
-#ifdef _LIBC
- printf ("/succeed_n to %td, %d times", p1 - start, mcnt2);
-#else
- printf ("/succeed_n to %ld, %d times",
- (long int) (p1 - start), mcnt2);
-#endif
- break;
-
- case jump_n:
- extract_number_and_incr (&mcnt, &p);
- p1 = p + mcnt;
- extract_number_and_incr (&mcnt2, &p);
- printf ("/jump_n to %ld, %d times", (long int) (p1 - start), mcnt2); /* p1 - start is a pointer difference, so it is cast and printed as long rather than passed to %d. */
- break;
-
- case set_number_at:
- extract_number_and_incr (&mcnt, &p);
- p1 = p + mcnt;
- extract_number_and_incr (&mcnt2, &p);
-#ifdef _LIBC
- printf ("/set_number_at location %td to %d", p1 - start, mcnt2);
-#else
- printf ("/set_number_at location %ld to %d",
- (long int) (p1 - start), mcnt2);
-#endif
- break;
-
- case wordbound:
- printf ("/wordbound");
- break;
-
- case notwordbound:
- printf ("/notwordbound");
- break;
-
- case wordbeg:
- printf ("/wordbeg");
- break;
-
- case wordend:
- printf ("/wordend");
- break;
-
-# ifdef emacs
- case before_dot:
- printf ("/before_dot");
- break;
-
- case at_dot:
- printf ("/at_dot");
- break;
-
- case after_dot:
- printf ("/after_dot");
- break;
-
- case syntaxspec:
- printf ("/syntaxspec");
- mcnt = *p++;
- printf ("/%d", mcnt);
- break;
-
- case notsyntaxspec:
- printf ("/notsyntaxspec");
- mcnt = *p++;
- printf ("/%d", mcnt);
- break;
-# endif /* emacs */
-
- case wordchar:
- printf ("/wordchar");
- break;
-
- case notwordchar:
- printf ("/notwordchar");
- break;
-
- case begbuf:
- printf ("/begbuf");
- break;
-
- case endbuf:
- printf ("/endbuf");
- break;
-
- default:
- printf ("?%ld", (long int) *(p-1)); /* Unknown opcode: print its numeric value. */
- }
-
- putchar ('\n');
- }
-
-#ifdef _LIBC
- printf ("%td:\tend of pattern.\n", p - start);
-#else
- printf ("%ld:\tend of pattern.\n", (long int) (p - start));
-#endif
-}
-
-
-void
-print_compiled_pattern (bufp)
- struct re_pattern_buffer *bufp;
-{
- US_CHAR_TYPE *buffer = (US_CHAR_TYPE*) bufp->buffer;
-
- print_partial_compiled_pattern (buffer, buffer
- + bufp->used / sizeof(US_CHAR_TYPE));
- printf ("%ld bytes used/%ld bytes allocated.\n",
- bufp->used, bufp->allocated);
-
- if (bufp->fastmap_accurate && bufp->fastmap)
- {
- printf ("fastmap: ");
- print_fastmap (bufp->fastmap);
- }
-
-#ifdef _LIBC
- printf ("re_nsub: %Zd\t", bufp->re_nsub);
-#else
- printf ("re_nsub: %ld\t", (long int) bufp->re_nsub);
-#endif
- printf ("regs_alloc: %d\t", bufp->regs_allocated);
- printf ("can_be_null: %d\t", bufp->can_be_null);
- printf ("newline_anchor: %d\n", bufp->newline_anchor);
- printf ("no_sub: %d\t", bufp->no_sub);
- printf ("not_bol: %d\t", bufp->not_bol);
- printf ("not_eol: %d\t", bufp->not_eol);
- printf ("syntax: %lx\n", bufp->syntax);
- /* Perhaps we should print the translate table? */
-}
-
-
-void
-print_double_string (where, string1, size1, string2, size2) /* Print the text from WHERE to the end of the STRING1/STRING2 pair, or "(null)" if WHERE is NULL. */
- const CHAR_TYPE *where;
- const CHAR_TYPE *string1;
- const CHAR_TYPE *string2;
- int size1;
- int size2;
-{
- int this_char;
-
- if (where == NULL)
- printf ("(null)");
- else
- {
- if (FIRST_STRING_P (where)) /* WHERE lies inside STRING1 or just past its end. */
- {
- for (this_char = where - string1; this_char < size1; this_char++)
- PUT_CHAR (string1[this_char]);
-
- where = string2; /* Continue from the start of STRING2. */
- }
-
- for (this_char = where - string2; this_char < size2; this_char++)
- PUT_CHAR (string2[this_char]);
- }
-}
-
-void
-printchar (c)
- int c;
-{
- putc (c, stderr);
-}
-
-#else /* not DEBUG */
-
-# undef assert
-# define assert(e)
-
-# define DEBUG_STATEMENT(e)
-# define DEBUG_PRINT1(x)
-# define DEBUG_PRINT2(x1, x2)
-# define DEBUG_PRINT3(x1, x2, x3)
-# define DEBUG_PRINT4(x1, x2, x3, x4)
-# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
-# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
-
-#endif /* not DEBUG */
-
-#ifdef MBS_SUPPORT
-/* Convert a multibyte string to a wide character string.
- Write the correspondences between them to offset_buffer (see below)
- and write whether each wchar_t is binary data to is_binary.
- Invalid multibyte sequences are treated as binary data.
- We assume offset_buffer and is_binary have already been allocated
- with enough space. Returns the number of wide characters stored. */
-
-static size_t convert_mbs_to_wcs (CHAR_TYPE *dest, const unsigned char* src,
- size_t len, int *offset_buffer,
- char *is_binary);
-static size_t
-convert_mbs_to_wcs (dest, src, len, offset_buffer, is_binary)
- CHAR_TYPE *dest;
- const unsigned char* src;
- size_t len; /* the length of the multibyte string. */
-
- /* It holds the correspondences between src (char string) and
- dest (wchar_t string) for optimization.
- e.g. src = "xxxyzz"
- dest = {'X', 'Y', 'Z'}
- (each "xxx", "y" and "zz" represent one multibyte character
- corresponding to 'X', 'Y' and 'Z'.)
- offset_buffer = {0, 0+3("xxx"), 0+3+1("y"), 0+3+1+2("zz")}
- = {0, 3, 4, 6}
- */
- int *offset_buffer;
- char *is_binary;
-{
- wchar_t *pdest = dest;
- const unsigned char *psrc = src;
- size_t wc_count = 0;
-
- if (MB_CUR_MAX == 1)
- { /* We don't need conversion. */
- for ( ; wc_count < len ; ++wc_count)
- {
- *pdest++ = *psrc++;
- is_binary[wc_count] = FALSE;
- offset_buffer[wc_count] = wc_count;
- }
- offset_buffer[wc_count] = wc_count; /* Final entry: the offset just past the last character. */
- }
- else
- {
- /* We need conversion. */
- mbstate_t mbs;
- int consumed;
- size_t mb_remain = len;
- size_t mb_count = 0;
-
- /* Initialize the conversion state. */
- memset (&mbs, 0, sizeof (mbstate_t));
-
- offset_buffer[0] = 0;
- for( ; mb_remain > 0 ; ++wc_count, ++pdest, mb_remain -= consumed,
- psrc += consumed)
- {
- consumed = mbrtowc (pdest, psrc, mb_remain, &mbs);
-
- if (consumed <= 0)
- /* Failed to convert (or mbrtowc returned 0); maybe src contains
- binary data. So we consume 1 byte manually. */
- {
- *pdest = *psrc;
- consumed = 1;
- is_binary[wc_count] = TRUE;
- }
- else
- is_binary[wc_count] = FALSE;
- /* In sjis encoding, we use the yen sign as the escape character in
- place of the reverse solidus. So we convert 0x5c (yen sign in
- sjis) not to 0xa5 (yen sign in UCS2) but to 0x5c (reverse
- solidus in UCS2). */
- if (consumed == 1 && (int) *psrc == 0x5c && (int) *pdest == 0xa5)
- *pdest = (wchar_t) *psrc;
-
- offset_buffer[wc_count + 1] = mb_count += consumed; /* Byte offset at which the next character starts. */
- }
- }
-
- return wc_count;
-}
-
-#endif /* MBS_SUPPORT */
-
-/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
- also be assigned to arbitrarily: each pattern buffer stores its own
- syntax, so it can be changed between regex compilations. */
-/* This has no initializer because initialized variables in Emacs
- become read-only after dumping. */
-reg_syntax_t re_syntax_options;
-
-
-/* Specify the precise syntax of regexps for compilation. This provides
- for compatibility for various utilities which historically have
- different, incompatible syntaxes.
-
- The argument SYNTAX is a bit mask comprised of the various bits
- defined in regex.h. We return the old syntax. */
-
-reg_syntax_t
-re_set_syntax (syntax)
- reg_syntax_t syntax;
-{
- reg_syntax_t ret = re_syntax_options;
-
- re_syntax_options = syntax;
-#ifdef DEBUG
- if (syntax & RE_DEBUG)
- debug = 1;
- else if (debug) /* was on but now is not */
- debug = 0;
-#endif /* DEBUG */
- return ret;
-}
-#ifdef _LIBC
-weak_alias (__re_set_syntax, re_set_syntax)
-#endif
-
-/* This table gives an error message for each of the error codes listed
- in regex.h. Obviously the order here has to be same as there.
- POSIX doesn't require that we do anything for REG_NOERROR,
- but why not be nice? */
-
-static const char re_error_msgid[] =
- {
-#define REG_NOERROR_IDX 0
- gettext_noop ("Success") /* REG_NOERROR */
- "\0"
-#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
- gettext_noop ("No match") /* REG_NOMATCH */
- "\0"
-#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
- gettext_noop ("Invalid regular expression") /* REG_BADPAT */
- "\0"
-#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
- gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
- "\0"
-#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
- gettext_noop ("Invalid character class name") /* REG_ECTYPE */
- "\0"
-#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
- gettext_noop ("Trailing backslash") /* REG_EESCAPE */
- "\0"
-#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
- gettext_noop ("Invalid back reference") /* REG_ESUBREG */
- "\0"
-#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
- gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
- "\0"
-#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
- gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
- "\0"
-#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
- gettext_noop ("Unmatched \\{") /* REG_EBRACE */
- "\0"
-#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
- gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
- "\0"
-#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
- gettext_noop ("Invalid range end") /* REG_ERANGE */
- "\0"
-#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
- gettext_noop ("Memory exhausted") /* REG_ESPACE */
- "\0"
-#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
- gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
- "\0"
-#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
- gettext_noop ("Premature end of regular expression") /* REG_EEND */
- "\0"
-#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
- gettext_noop ("Regular expression too big") /* REG_ESIZE */
- "\0"
-#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
- gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
- };
-
-static const size_t re_error_msgid_idx[] =
- {
- REG_NOERROR_IDX,
- REG_NOMATCH_IDX,
- REG_BADPAT_IDX,
- REG_ECOLLATE_IDX,
- REG_ECTYPE_IDX,
- REG_EESCAPE_IDX,
- REG_ESUBREG_IDX,
- REG_EBRACK_IDX,
- REG_EPAREN_IDX,
- REG_EBRACE_IDX,
- REG_BADBR_IDX,
- REG_ERANGE_IDX,
- REG_ESPACE_IDX,
- REG_BADRPT_IDX,
- REG_EEND_IDX,
- REG_ESIZE_IDX,
- REG_ERPAREN_IDX
- };
-
-/* Avoiding alloca during matching, to placate r_alloc. */
-
-/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
- searching and matching functions should not call alloca. On some
- systems, alloca is implemented in terms of malloc, and if we're
- using the relocating allocator routines, then malloc could cause a
- relocation, which might (if the strings being searched are in the
- ralloc heap) shift the data out from underneath the regexp
- routines.
-
- Here's another reason to avoid allocation: Emacs
- processes input from X in a signal handler; processing X input may
- call malloc; if input arrives while a matching routine is calling
- malloc, then we're scrod. But Emacs can't just block input while
- calling matching routines; then we don't notice interrupts when
- they come in. So, Emacs blocks input around all regexp calls
- except the matching calls, which it leaves unprotected, in the
- faith that they will not malloc. */
-
-/* Normally, this is fine. */
-#define MATCH_MAY_ALLOCATE
-
-/* When using GNU C, we are not REALLY using the C alloca, no matter
- what config.h may say. So don't take precautions for it. */
-#ifdef __GNUC__
-# undef C_ALLOCA
-#endif
-
-/* The match routines may not allocate if (1) they would do it with malloc
- and (2) it's not safe for them to use malloc.
- Note that if REL_ALLOC is defined, matching would not use malloc for the
- failure stack, but we would still use it for the register vectors;
- so REL_ALLOC should not affect this. */
-#if (defined C_ALLOCA || defined REGEX_MALLOC) && defined emacs
-# undef MATCH_MAY_ALLOCATE
-#endif
-
-
-/* Failure stack declarations and macros; both re_compile_fastmap and
- re_match_2 use a failure stack. These have to be macros because of
- REGEX_ALLOCATE_STACK. */
-
-
-/* Number of failure points for which to initially allocate space
- when matching. If this number is exceeded, we allocate more
- space, so it is not a hard limit. */
-#ifndef INIT_FAILURE_ALLOC
-# define INIT_FAILURE_ALLOC 5
-#endif
-
-/* Roughly the maximum number of failure points on the stack. It would be
- exactly that if we always used MAX_FAILURE_ITEMS items each time we failed.
- This is a variable only so users of regex can assign to it; we never
- change it ourselves. */
-
-#ifdef INT_IS_16BIT
-
-# if defined MATCH_MAY_ALLOCATE
-/* 4400 was enough to cause a crash on Alpha OSF/1,
- whose default stack limit is 2mb. */
-long int re_max_failures = 4000;
-# else
-long int re_max_failures = 2000;
-# endif
-
-union fail_stack_elt
-{
- US_CHAR_TYPE *pointer;
- long int integer;
-};
-
-typedef union fail_stack_elt fail_stack_elt_t;
-
-typedef struct
-{
- fail_stack_elt_t *stack;
- unsigned long int size;
- unsigned long int avail; /* Offset of next open position. */
-} fail_stack_type;
-
-#else /* not INT_IS_16BIT */
-
-# if defined MATCH_MAY_ALLOCATE
-/* 4400 was enough to cause a crash on Alpha OSF/1,
- whose default stack limit is 2mb. */
-int re_max_failures = 4000;
-# else
+/* Binary backward compatibility. */
+#if _LIBC
+# include <shlib-compat.h>
+# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3)
+link_warning (re_max_failures, "the 're_max_failures' variable is obsolete and will go away.")
int re_max_failures = 2000;
# endif
-
-union fail_stack_elt
-{
- US_CHAR_TYPE *pointer;
- int integer;
-};
-
-typedef union fail_stack_elt fail_stack_elt_t;
-
-typedef struct
-{
- fail_stack_elt_t *stack;
- unsigned size;
- unsigned avail; /* Offset of next open position. */
-} fail_stack_type;
-
-#endif /* INT_IS_16BIT */
-
-#define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
-#define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
-#define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
-
-
-/* Define macros to initialize and free the failure stack.
- Do `return -2' if the alloc fails. */
-
-#ifdef MATCH_MAY_ALLOCATE
-# define INIT_FAIL_STACK() \
- do { \
- fail_stack.stack = (fail_stack_elt_t *) \
- REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \
- \
- if (fail_stack.stack == NULL) \
- return -2; \
- \
- fail_stack.size = INIT_FAILURE_ALLOC; \
- fail_stack.avail = 0; \
- } while (0)
-
-# define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
-#else
-# define INIT_FAIL_STACK() \
- do { \
- fail_stack.avail = 0; \
- } while (0)
-
-# define RESET_FAIL_STACK()
-#endif
-
-
-/* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
-
- Return 1 if succeeds, and 0 if either ran out of memory
- allocating space for it or it was already too large.
-
- REGEX_REALLOCATE_STACK requires `destination' be declared. */
-
-#define DOUBLE_FAIL_STACK(fail_stack) \
- ((fail_stack).size > (unsigned) (re_max_failures * MAX_FAILURE_ITEMS) \
- ? 0 \
- : ((fail_stack).stack = (fail_stack_elt_t *) \
- REGEX_REALLOCATE_STACK ((fail_stack).stack, \
- (fail_stack).size * sizeof (fail_stack_elt_t), \
- ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
- \
- (fail_stack).stack == NULL \
- ? 0 \
- : ((fail_stack).size <<= 1, \
- 1)))
-
-
-/* Push pointer POINTER on FAIL_STACK.
- Return 1 if was able to do so and 0 if ran out of memory allocating
- space to do so. */
-#define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
- ((FAIL_STACK_FULL () \
- && !DOUBLE_FAIL_STACK (FAIL_STACK)) \
- ? 0 \
- : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
- 1))
-
-/* Push a pointer value onto the failure stack.
- Assumes the variable `fail_stack'. Probably should only
- be called from within `PUSH_FAILURE_POINT'. */
-#define PUSH_FAILURE_POINTER(item) \
- fail_stack.stack[fail_stack.avail++].pointer = (US_CHAR_TYPE *) (item)
-
-/* This pushes an integer-valued item onto the failure stack.
- Assumes the variable `fail_stack'. Probably should only
- be called from within `PUSH_FAILURE_POINT'. */
-#define PUSH_FAILURE_INT(item) \
- fail_stack.stack[fail_stack.avail++].integer = (item)
-
-/* Push a fail_stack_elt_t value onto the failure stack.
- Assumes the variable `fail_stack'. Probably should only
- be called from within `PUSH_FAILURE_POINT'. */
-#define PUSH_FAILURE_ELT(item) \
- fail_stack.stack[fail_stack.avail++] = (item)
-
-/* These three POP... operations complement the three PUSH... operations.
- All assume that `fail_stack' is nonempty. */
-#define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
-#define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
-#define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
-
-/* Used to omit pushing failure point id's when we're not debugging. */
-#ifdef DEBUG
-# define DEBUG_PUSH PUSH_FAILURE_INT
-# define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
-#else
-# define DEBUG_PUSH(item)
-# define DEBUG_POP(item_addr)
-#endif
-
-
-/* Push the information about the state we will need
- if we ever fail back to it.
-
- Requires variables fail_stack, regstart, regend, reg_info, and
- num_regs_pushed be declared. DOUBLE_FAIL_STACK requires `destination'
- be declared.
-
- Does `return FAILURE_CODE' if runs out of memory. */
-
-#define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
- do { \
- char *destination; \
- /* Must be int, so when we don't save any registers, the arithmetic \
- of 0 + -1 isn't done as unsigned. */ \
- /* Can't be int, since there is not a shred of a guarantee that int \
- is wide enough to hold a value of something to which pointer can \
- be assigned */ \
- active_reg_t this_reg; \
- \
- DEBUG_STATEMENT (failure_id++); \
- DEBUG_STATEMENT (nfailure_points_pushed++); \
- DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
- DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\
- DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
- \
- DEBUG_PRINT2 (" slots needed: %ld\n", NUM_FAILURE_ITEMS); \
- DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \
- \
- /* Ensure we have enough space allocated for what we will push. */ \
- while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
- { \
- if (!DOUBLE_FAIL_STACK (fail_stack)) \
- return failure_code; \
- \
- DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \
- (fail_stack).size); \
- DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
- } \
- \
- /* Push the info, starting with the registers. */ \
- DEBUG_PRINT1 ("\n"); \
- \
- if (1) \
- for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
- this_reg++) \
- { \
- DEBUG_PRINT2 (" Pushing reg: %lu\n", this_reg); \
- DEBUG_STATEMENT (num_regs_pushed++); \
- \
- DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \
- PUSH_FAILURE_POINTER (regstart[this_reg]); \
- \
- DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \
- PUSH_FAILURE_POINTER (regend[this_reg]); \
- \
- DEBUG_PRINT2 (" info: %p\n ", \
- reg_info[this_reg].word.pointer); \
- DEBUG_PRINT2 (" match_null=%d", \
- REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
- DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
- DEBUG_PRINT2 (" matched_something=%d", \
- MATCHED_SOMETHING (reg_info[this_reg])); \
- DEBUG_PRINT2 (" ever_matched=%d", \
- EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
- DEBUG_PRINT1 ("\n"); \
- PUSH_FAILURE_ELT (reg_info[this_reg].word); \
- } \
- \
- DEBUG_PRINT2 (" Pushing low active reg: %ld\n", lowest_active_reg);\
- PUSH_FAILURE_INT (lowest_active_reg); \
- \
- DEBUG_PRINT2 (" Pushing high active reg: %ld\n", highest_active_reg);\
- PUSH_FAILURE_INT (highest_active_reg); \
- \
- DEBUG_PRINT2 (" Pushing pattern %p:\n", pattern_place); \
- DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
- PUSH_FAILURE_POINTER (pattern_place); \
- \
- DEBUG_PRINT2 (" Pushing string %p: `", string_place); \
- DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
- size2); \
- DEBUG_PRINT1 ("'\n"); \
- PUSH_FAILURE_POINTER (string_place); \
- \
- DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
- DEBUG_PUSH (failure_id); \
- } while (0)
-
-/* This is the number of items that are pushed and popped on the stack
- for each register. */
-#define NUM_REG_ITEMS 3
-
-/* Individual items aside from the registers. */
-#ifdef DEBUG
-# define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
-#else
-# define NUM_NONREG_ITEMS 4
-#endif
-
-/* We push at most this many items on the stack. */
-/* We used to use (num_regs - 1), which is the number of registers
- this regexp will save; but that was changed to 5
- to avoid stack overflow for a regexp with lots of parens. */
-#define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
-
-/* We actually push this many items. */
-#define NUM_FAILURE_ITEMS \
- (((0 \
- ? 0 : highest_active_reg - lowest_active_reg + 1) \
- * NUM_REG_ITEMS) \
- + NUM_NONREG_ITEMS)
-
-/* How many items can still be added to the stack without overflowing it. */
-#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
-
-
-/* Pops what PUSH_FAIL_STACK pushes.
-
- We restore into the parameters, all of which should be lvalues:
- STR -- the saved data position.
- PAT -- the saved pattern position.
- LOW_REG, HIGH_REG -- the highest and lowest active registers.
- REGSTART, REGEND -- arrays of string positions.
- REG_INFO -- array of information about each subexpression.
-
- Also assumes the variables `fail_stack' and (if debugging), `bufp',
- `pend', `string1', `size1', `string2', and `size2'. */
-#define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\
-{ \
- DEBUG_STATEMENT (unsigned failure_id;) \
- active_reg_t this_reg; \
- const US_CHAR_TYPE *string_temp; \
- \
- assert (!FAIL_STACK_EMPTY ()); \
- \
- /* Remove failure points and point to how many regs pushed. */ \
- DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
- DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
- DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
- \
- assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
- \
- DEBUG_POP (&failure_id); \
- DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \
- \
- /* If the saved string location is NULL, it came from an \
- on_failure_keep_string_jump opcode, and we want to throw away the \
- saved NULL, thus retaining our current position in the string. */ \
- string_temp = POP_FAILURE_POINTER (); \
- if (string_temp != NULL) \
- str = (const CHAR_TYPE *) string_temp; \
- \
- DEBUG_PRINT2 (" Popping string %p: `", str); \
- DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
- DEBUG_PRINT1 ("'\n"); \
- \
- pat = (US_CHAR_TYPE *) POP_FAILURE_POINTER (); \
- DEBUG_PRINT2 (" Popping pattern %p:\n", pat); \
- DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
- \
- /* Restore register info. */ \
- high_reg = (active_reg_t) POP_FAILURE_INT (); \
- DEBUG_PRINT2 (" Popping high active reg: %ld\n", high_reg); \
- \
- low_reg = (active_reg_t) POP_FAILURE_INT (); \
- DEBUG_PRINT2 (" Popping low active reg: %ld\n", low_reg); \
- \
- if (1) \
- for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
- { \
- DEBUG_PRINT2 (" Popping reg: %ld\n", this_reg); \
- \
- reg_info[this_reg].word = POP_FAILURE_ELT (); \
- DEBUG_PRINT2 (" info: %p\n", \
- reg_info[this_reg].word.pointer); \
- \
- regend[this_reg] = (const CHAR_TYPE *) POP_FAILURE_POINTER (); \
- DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \
- \
- regstart[this_reg] = (const CHAR_TYPE *) POP_FAILURE_POINTER ();\
- DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \
- } \
- else \
- { \
- for (this_reg = highest_active_reg; this_reg > high_reg; this_reg--) \
- { \
- reg_info[this_reg].word.integer = 0; \
- regend[this_reg] = 0; \
- regstart[this_reg] = 0; \
- } \
- highest_active_reg = high_reg; \
- } \
- \
- set_regs_matched_done = 0; \
- DEBUG_STATEMENT (nfailure_points_popped++); \
-} /* POP_FAILURE_POINT */
-
-
-/* Structure for per-register (a.k.a. per-group) information.
- Other register information, such as the
- starting and ending positions (which are addresses), and the list of
- inner groups (which is a bits list) are maintained in separate
- variables.
-
- We are making a (strictly speaking) nonportable assumption here: that
- the compiler will pack our bit fields into something that fits into
- the type of `word', i.e., is something that fits into one item on the
- failure stack. */
-
-
-/* Declarations and macros for re_match_2. */
-
-typedef union
-{
- fail_stack_elt_t word;
- struct
- {
- /* This field is one if this group can match the empty string,
- zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
-#define MATCH_NULL_UNSET_VALUE 3
- unsigned match_null_string_p : 2;
- unsigned is_active : 1;
- unsigned matched_something : 1;
- unsigned ever_matched_something : 1;
- } bits;
-} register_info_type;
-
-#define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
-#define IS_ACTIVE(R) ((R).bits.is_active)
-#define MATCHED_SOMETHING(R) ((R).bits.matched_something)
-#define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
-
-
-/* Call this when have matched a real character; it sets `matched' flags
- for the subexpressions which we are currently inside. Also records
- that those subexprs have matched. */
-#define SET_REGS_MATCHED() \
- do \
- { \
- if (!set_regs_matched_done) \
- { \
- active_reg_t r; \
- set_regs_matched_done = 1; \
- for (r = lowest_active_reg; r <= highest_active_reg; r++) \
- { \
- MATCHED_SOMETHING (reg_info[r]) \
- = EVER_MATCHED_SOMETHING (reg_info[r]) \
- = 1; \
- } \
- } \
- } \
- while (0)
-
-/* Registers are set to a sentinel when they haven't yet matched. */
-static CHAR_TYPE reg_unset_dummy;
-#define REG_UNSET_VALUE (&reg_unset_dummy)
-#define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
-
-/* Subroutine declarations and macros for regex_compile. */
-
-static reg_errcode_t regex_compile _RE_ARGS ((const char *pattern, size_t size,
- reg_syntax_t syntax,
- struct re_pattern_buffer *bufp));
-static void store_op1 _RE_ARGS ((re_opcode_t op, US_CHAR_TYPE *loc, int arg));
-static void store_op2 _RE_ARGS ((re_opcode_t op, US_CHAR_TYPE *loc,
- int arg1, int arg2));
-static void insert_op1 _RE_ARGS ((re_opcode_t op, US_CHAR_TYPE *loc,
- int arg, US_CHAR_TYPE *end));
-static void insert_op2 _RE_ARGS ((re_opcode_t op, US_CHAR_TYPE *loc,
- int arg1, int arg2, US_CHAR_TYPE *end));
-static boolean at_begline_loc_p _RE_ARGS ((const CHAR_TYPE *pattern,
- const CHAR_TYPE *p,
- reg_syntax_t syntax));
-static boolean at_endline_loc_p _RE_ARGS ((const CHAR_TYPE *p,
- const CHAR_TYPE *pend,
- reg_syntax_t syntax));
-#ifdef MBS_SUPPORT
-static reg_errcode_t compile_range _RE_ARGS ((CHAR_TYPE range_start,
- const CHAR_TYPE **p_ptr,
- const CHAR_TYPE *pend,
- char *translate,
- reg_syntax_t syntax,
- US_CHAR_TYPE *b,
- CHAR_TYPE *char_set));
-static void insert_space _RE_ARGS ((int num, CHAR_TYPE *loc, CHAR_TYPE *end));
-#else
-static reg_errcode_t compile_range _RE_ARGS ((unsigned int range_start,
- const CHAR_TYPE **p_ptr,
- const CHAR_TYPE *pend,
- char *translate,
- reg_syntax_t syntax,
- US_CHAR_TYPE *b));
-#endif /* MBS_SUPPORT */
-
-/* Fetch the next character in the uncompiled pattern---translating it
- if necessary. Also cast from a signed character in the constant
- string passed to us by the user to an unsigned char that we can use
- as an array index (in, e.g., `translate'). */
-/* ifdef MBS_SUPPORT, we translate only if character <= 0xff,
- because it is impossible to allocate 4GB array for some encodings
- which have 4 byte character_set like UCS4. */
-#ifndef PATFETCH
-# ifdef MBS_SUPPORT
-# define PATFETCH(c) \
- do {if (p == pend) return REG_EEND; \
- c = (US_CHAR_TYPE) *p++; \
- if (translate && (c <= 0xff)) c = (US_CHAR_TYPE) translate[c]; \
- } while (0)
-# else
-# define PATFETCH(c) \
- do {if (p == pend) return REG_EEND; \
- c = (unsigned char) *p++; \
- if (translate) c = (unsigned char) translate[c]; \
- } while (0)
-# endif /* MBS_SUPPORT */
-#endif
-
-/* Fetch the next character in the uncompiled pattern, with no
- translation. */
-#define PATFETCH_RAW(c) \
- do {if (p == pend) return REG_EEND; \
- c = (US_CHAR_TYPE) *p++; \
- } while (0)
-
-/* Go backwards one character in the pattern. */
-#define PATUNFETCH p--
-
-
-/* If `translate' is non-null, return translate[D], else just D. We
- cast the subscript to translate because some data is declared as
- `char *', to avoid warnings when a string constant is passed. But
- when we use a character as a subscript we must make it unsigned. */
-/* ifdef MBS_SUPPORT, we translate only if character <= 0xff,
- because it is impossible to allocate 4GB array for some encodings
- which have 4 byte character_set like UCS4. */
-#ifndef TRANSLATE
-# ifdef MBS_SUPPORT
-# define TRANSLATE(d) \
- ((translate && ((US_CHAR_TYPE) (d)) <= 0xff) \
- ? (char) translate[(unsigned char) (d)] : (d))
-#else
-# define TRANSLATE(d) \
- (translate ? (char) translate[(unsigned char) (d)] : (d))
-# endif /* MBS_SUPPORT */
-#endif
-
-
-/* Macros for outputting the compiled pattern into `buffer'. */
-
-/* If the buffer isn't allocated when it comes in, use this. */
-#define INIT_BUF_SIZE (32 * sizeof(US_CHAR_TYPE))
-
-/* Make sure we have at least N more bytes of space in buffer. */
-#ifdef MBS_SUPPORT
-# define GET_BUFFER_SPACE(n) \
- while (((unsigned long)b - (unsigned long)COMPILED_BUFFER_VAR \
- + (n)*sizeof(CHAR_TYPE)) > bufp->allocated) \
- EXTEND_BUFFER ()
-#else
-# define GET_BUFFER_SPACE(n) \
- while ((unsigned long) (b - bufp->buffer + (n)) > bufp->allocated) \
- EXTEND_BUFFER ()
-#endif /* MBS_SUPPORT */
-
-/* Make sure we have one more byte of buffer space and then add C to it. */
-#define BUF_PUSH(c) \
- do { \
- GET_BUFFER_SPACE (1); \
- *b++ = (US_CHAR_TYPE) (c); \
- } while (0)
-
-
-/* Ensure we have two more bytes of buffer space and then append C1 and C2. */
-#define BUF_PUSH_2(c1, c2) \
- do { \
- GET_BUFFER_SPACE (2); \
- *b++ = (US_CHAR_TYPE) (c1); \
- *b++ = (US_CHAR_TYPE) (c2); \
- } while (0)
-
-
-/* As with BUF_PUSH_2, except for three bytes. */
-#define BUF_PUSH_3(c1, c2, c3) \
- do { \
- GET_BUFFER_SPACE (3); \
- *b++ = (US_CHAR_TYPE) (c1); \
- *b++ = (US_CHAR_TYPE) (c2); \
- *b++ = (US_CHAR_TYPE) (c3); \
- } while (0)
-
-/* Store a jump with opcode OP at LOC to location TO. We store a
- relative address offset by the three bytes the jump itself occupies. */
-#define STORE_JUMP(op, loc, to) \
- store_op1 (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)))
-
-/* Likewise, for a two-argument jump. */
-#define STORE_JUMP2(op, loc, to, arg) \
- store_op2 (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)), arg)
-
-/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
-#define INSERT_JUMP(op, loc, to) \
- insert_op1 (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)), b)
-
-/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
-#define INSERT_JUMP2(op, loc, to, arg) \
- insert_op2 (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)),\
- arg, b)
-
-
-/* This is not an arbitrary limit: the arguments which represent offsets
- into the pattern are two bytes long. So if 2^16 bytes turns out to
- be too small, many things would have to change. */
-/* Any other compiler which, like MSC, has allocation limit below 2^16
- bytes will have to use approach similar to what was done below for
- MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up
- reallocating to 0 bytes. Such thing is not going to work too well.
- You have been warned!! */
-#if defined _MSC_VER && !defined WIN32
-/* Microsoft C 16-bit versions limit malloc to approx 65512 bytes.
- The REALLOC define eliminates a flurry of conversion warnings,
- but is not required. */
-# define MAX_BUF_SIZE 65500L
-# define REALLOC(p,s) realloc ((p), (size_t) (s))
-#else
-# define MAX_BUF_SIZE (1L << 16)
-# define REALLOC(p,s) realloc ((p), (s))
-#endif
-
-/* Extend the buffer by twice its current size via realloc and
- reset the pointers that pointed into the old block to point to the
- correct places in the new one. If extending the buffer results in it
- being larger than MAX_BUF_SIZE, then flag memory exhausted. */
-#if __BOUNDED_POINTERS__
-# define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
-# define MOVE_BUFFER_POINTER(P) \
- (__ptrlow (P) += incr, SET_HIGH_BOUND (P), __ptrvalue (P) += incr)
-# define ELSE_EXTEND_BUFFER_HIGH_BOUND \
- else \
- { \
- SET_HIGH_BOUND (b); \
- SET_HIGH_BOUND (begalt); \
- if (fixup_alt_jump) \
- SET_HIGH_BOUND (fixup_alt_jump); \
- if (laststart) \
- SET_HIGH_BOUND (laststart); \
- if (pending_exact) \
- SET_HIGH_BOUND (pending_exact); \
- }
-#else
-# define MOVE_BUFFER_POINTER(P) (P) += incr
-# define ELSE_EXTEND_BUFFER_HIGH_BOUND
-#endif
-
-#ifdef MBS_SUPPORT
-# define EXTEND_BUFFER() \
- do { \
- US_CHAR_TYPE *old_buffer = COMPILED_BUFFER_VAR; \
- int wchar_count; \
- if (bufp->allocated + sizeof(US_CHAR_TYPE) > MAX_BUF_SIZE) \
- return REG_ESIZE; \
- bufp->allocated <<= 1; \
- if (bufp->allocated > MAX_BUF_SIZE) \
- bufp->allocated = MAX_BUF_SIZE; \
- /* How many characters the new buffer can have? */ \
- wchar_count = bufp->allocated / sizeof(US_CHAR_TYPE); \
- if (wchar_count == 0) wchar_count = 1; \
- /* Truncate the buffer to CHAR_TYPE align. */ \
- bufp->allocated = wchar_count * sizeof(US_CHAR_TYPE); \
- RETALLOC (COMPILED_BUFFER_VAR, wchar_count, US_CHAR_TYPE); \
- bufp->buffer = (char*)COMPILED_BUFFER_VAR; \
- if (COMPILED_BUFFER_VAR == NULL) \
- return REG_ESPACE; \
- /* If the buffer moved, move all the pointers into it. */ \
- if (old_buffer != COMPILED_BUFFER_VAR) \
- { \
- int incr = COMPILED_BUFFER_VAR - old_buffer; \
- MOVE_BUFFER_POINTER (b); \
- MOVE_BUFFER_POINTER (begalt); \
- if (fixup_alt_jump) \
- MOVE_BUFFER_POINTER (fixup_alt_jump); \
- if (laststart) \
- MOVE_BUFFER_POINTER (laststart); \
- if (pending_exact) \
- MOVE_BUFFER_POINTER (pending_exact); \
- } \
- ELSE_EXTEND_BUFFER_HIGH_BOUND \
- } while (0)
-#else
-# define EXTEND_BUFFER() \
- do { \
- US_CHAR_TYPE *old_buffer = COMPILED_BUFFER_VAR; \
- if (bufp->allocated == MAX_BUF_SIZE) \
- return REG_ESIZE; \
- bufp->allocated <<= 1; \
- if (bufp->allocated > MAX_BUF_SIZE) \
- bufp->allocated = MAX_BUF_SIZE; \
- bufp->buffer = (US_CHAR_TYPE *) REALLOC (COMPILED_BUFFER_VAR, \
- bufp->allocated); \
- if (COMPILED_BUFFER_VAR == NULL) \
- return REG_ESPACE; \
- /* If the buffer moved, move all the pointers into it. */ \
- if (old_buffer != COMPILED_BUFFER_VAR) \
- { \
- int incr = COMPILED_BUFFER_VAR - old_buffer; \
- MOVE_BUFFER_POINTER (b); \
- MOVE_BUFFER_POINTER (begalt); \
- if (fixup_alt_jump) \
- MOVE_BUFFER_POINTER (fixup_alt_jump); \
- if (laststart) \
- MOVE_BUFFER_POINTER (laststart); \
- if (pending_exact) \
- MOVE_BUFFER_POINTER (pending_exact); \
- } \
- ELSE_EXTEND_BUFFER_HIGH_BOUND \
- } while (0)
-#endif /* MBS_SUPPORT */
-
-/* Since we have one byte reserved for the register number argument to
- {start,stop}_memory, the maximum number of groups we can report
- things about is what fits in that byte. */
-#define MAX_REGNUM 255
-
-/* But patterns can have more than `MAX_REGNUM' registers. We just
- ignore the excess. */
-typedef unsigned regnum_t;
-
-
-/* Macros for the compile stack. */
-
-/* Since offsets can go either forwards or backwards, this type needs to
- be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
-/* int may be not enough when sizeof(int) == 2. */
-typedef long pattern_offset_t;
-
-typedef struct
-{
- pattern_offset_t begalt_offset;
- pattern_offset_t fixup_alt_jump;
- pattern_offset_t inner_group_offset;
- pattern_offset_t laststart_offset;
- regnum_t regnum;
-} compile_stack_elt_t;
-
-
-typedef struct
-{
- compile_stack_elt_t *stack;
- unsigned size;
- unsigned avail; /* Offset of next open position. */
-} compile_stack_type;
-
-
-#define INIT_COMPILE_STACK_SIZE 32
-
-#define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
-#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
-
-/* The next available element. */
-#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
-
-
-/* Set the bit for character C in a list. */
-#define SET_LIST_BIT(c) \
- (b[((unsigned char) (c)) / BYTEWIDTH] \
- |= 1 << (((unsigned char) c) % BYTEWIDTH))
-
-
-/* Get the next unsigned number in the uncompiled pattern. */
-#define GET_UNSIGNED_NUMBER(num) \
- { \
- while (p != pend) \
- { \
- PATFETCH (c); \
- if (! ('0' <= c && c <= '9')) \
- break; \
- if (num <= RE_DUP_MAX) \
- { \
- if (num < 0) \
- num = 0; \
- num = num * 10 + c - '0'; \
- } \
- } \
- }
-
-#if defined _LIBC || WIDE_CHAR_SUPPORT
-/* The GNU C library provides support for user-defined character classes
- and the functions from ISO C amendement 1. */
-# ifdef CHARCLASS_NAME_MAX
-# define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX
-# else
-/* This shouldn't happen but some implementation might still have this
- problem. Use a reasonable default value. */
-# define CHAR_CLASS_MAX_LENGTH 256
-# endif
-
-# ifdef _LIBC
-# define IS_CHAR_CLASS(string) __wctype (string)
-# else
-# define IS_CHAR_CLASS(string) wctype (string)
-# endif
-#else
-# define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
-
-# define IS_CHAR_CLASS(string) \
- (STREQ (string, "alpha") || STREQ (string, "upper") \
- || STREQ (string, "lower") || STREQ (string, "digit") \
- || STREQ (string, "alnum") || STREQ (string, "xdigit") \
- || STREQ (string, "space") || STREQ (string, "print") \
- || STREQ (string, "punct") || STREQ (string, "graph") \
- || STREQ (string, "cntrl") || STREQ (string, "blank"))
#endif
-
-#ifndef MATCH_MAY_ALLOCATE
-
-/* If we cannot allocate large objects within re_match_2_internal,
- we make the fail stack and register vectors global.
- The fail stack, we grow to the maximum size when a regexp
- is compiled.
- The register vectors, we adjust in size each time we
- compile a regexp, according to the number of registers it needs. */
-
-static fail_stack_type fail_stack;
-
-/* Size with which the following vectors are currently allocated.
- That is so we can make them bigger as needed,
- but never make them smaller. */
-static int regs_allocated_size;
-
-static const char ** regstart, ** regend;
-static const char ** old_regstart, ** old_regend;
-static const char **best_regstart, **best_regend;
-static register_info_type *reg_info;
-static const char **reg_dummy;
-static register_info_type *reg_info_dummy;
-
-/* Make the register vectors big enough for NUM_REGS registers,
- but don't make them smaller. */
-
-static
-regex_grow_registers (num_regs)
- int num_regs;
-{
- if (num_regs > regs_allocated_size)
- {
- RETALLOC_IF (regstart, num_regs, const char *);
- RETALLOC_IF (regend, num_regs, const char *);
- RETALLOC_IF (old_regstart, num_regs, const char *);
- RETALLOC_IF (old_regend, num_regs, const char *);
- RETALLOC_IF (best_regstart, num_regs, const char *);
- RETALLOC_IF (best_regend, num_regs, const char *);
- RETALLOC_IF (reg_info, num_regs, register_info_type);
- RETALLOC_IF (reg_dummy, num_regs, const char *);
- RETALLOC_IF (reg_info_dummy, num_regs, register_info_type);
-
- regs_allocated_size = num_regs;
- }
-}
-
-#endif /* not MATCH_MAY_ALLOCATE */
-
-static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
- compile_stack,
- regnum_t regnum));
-
-/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
- Returns one of error codes defined in `regex.h', or zero for success.
-
- Assumes the `allocated' (and perhaps `buffer') and `translate'
- fields are set in BUFP on entry.
-
- If it succeeds, results are put in BUFP (if it returns an error, the
- contents of BUFP are undefined):
- `buffer' is the compiled pattern;
- `syntax' is set to SYNTAX;
- `used' is set to the length of the compiled pattern;
- `fastmap_accurate' is zero;
- `re_nsub' is the number of subexpressions in PATTERN;
- `not_bol' and `not_eol' are zero;
-
- The `fastmap' and `newline_anchor' fields are neither
- examined nor set. */
-
-/* Return, freeing storage we allocated. */
-#ifdef MBS_SUPPORT
-# define FREE_STACK_RETURN(value) \
- return (free(pattern), free(mbs_offset), free(is_binary), free (compile_stack.stack), value)
-#else
-# define FREE_STACK_RETURN(value) \
- return (free (compile_stack.stack), value)
-#endif /* MBS_SUPPORT */
-
-static reg_errcode_t
-#ifdef MBS_SUPPORT
-regex_compile (cpattern, csize, syntax, bufp)
- const char *cpattern;
- size_t csize;
-#else
-regex_compile (pattern, size, syntax, bufp)
- const char *pattern;
- size_t size;
-#endif /* MBS_SUPPORT */
- reg_syntax_t syntax;
- struct re_pattern_buffer *bufp;
-{
- /* We fetch characters from PATTERN here. Even though PATTERN is
- `char *' (i.e., signed), we declare these variables as unsigned, so
- they can be reliably used as array indices. */
- register US_CHAR_TYPE c, c1;
-
-#ifdef MBS_SUPPORT
- /* A temporary space to keep wchar_t pattern and compiled pattern. */
- CHAR_TYPE *pattern, *COMPILED_BUFFER_VAR;
- size_t size;
- /* offset buffer for optimizatoin. See convert_mbs_to_wc. */
- int *mbs_offset = NULL;
- /* It hold whether each wchar_t is binary data or not. */
- char *is_binary = NULL;
- /* A flag whether exactn is handling binary data or not. */
- char is_exactn_bin = FALSE;
-#endif /* MBS_SUPPORT */
-
- /* A random temporary spot in PATTERN. */
- const CHAR_TYPE *p1;
-
- /* Points to the end of the buffer, where we should append. */
- register US_CHAR_TYPE *b;
-
- /* Keeps track of unclosed groups. */
- compile_stack_type compile_stack;
-
- /* Points to the current (ending) position in the pattern. */
-#ifdef MBS_SUPPORT
- const CHAR_TYPE *p;
- const CHAR_TYPE *pend;
-#else
- const CHAR_TYPE *p = pattern;
- const CHAR_TYPE *pend = pattern + size;
-#endif /* MBS_SUPPORT */
-
- /* How to translate the characters in the pattern. */
- RE_TRANSLATE_TYPE translate = bufp->translate;
-
- /* Address of the count-byte of the most recently inserted `exactn'
- command. This makes it possible to tell if a new exact-match
- character can be added to that command or if the character requires
- a new `exactn' command. */
- US_CHAR_TYPE *pending_exact = 0;
-
- /* Address of start of the most recently finished expression.
- This tells, e.g., postfix * where to find the start of its
- operand. Reset at the beginning of groups and alternatives. */
- US_CHAR_TYPE *laststart = 0;
-
- /* Address of beginning of regexp, or inside of last group. */
- US_CHAR_TYPE *begalt;
-
- /* Address of the place where a forward jump should go to the end of
- the containing expression. Each alternative of an `or' -- except the
- last -- ends with a forward jump of this sort. */
- US_CHAR_TYPE *fixup_alt_jump = 0;
-
- /* Counts open-groups as they are encountered. Remembered for the
- matching close-group on the compile stack, so the same register
- number is put in the stop_memory as the start_memory. */
- regnum_t regnum = 0;
-
-#ifdef MBS_SUPPORT
- /* Initialize the wchar_t PATTERN and offset_buffer. */
- p = pend = pattern = TALLOC(csize + 1, CHAR_TYPE);
- p[csize] = L'\0'; /* sentinel */
- mbs_offset = TALLOC(csize + 1, int);
- is_binary = TALLOC(csize + 1, char);
- if (pattern == NULL || mbs_offset == NULL || is_binary == NULL)
- {
- if (pattern) free(pattern);
- if (mbs_offset) free(mbs_offset);
- if (is_binary) free(is_binary);
- return REG_ESPACE;
- }
- size = convert_mbs_to_wcs(pattern, cpattern, csize, mbs_offset, is_binary);
- pend = p + size;
- if (size < 0)
- {
- if (pattern) free(pattern);
- if (mbs_offset) free(mbs_offset);
- if (is_binary) free(is_binary);
- return REG_BADPAT;
- }
-#endif
-
-#ifdef DEBUG
- DEBUG_PRINT1 ("\nCompiling pattern: ");
- if (debug)
- {
- unsigned debug_count;
-
- for (debug_count = 0; debug_count < size; debug_count++)
- PUT_CHAR (pattern[debug_count]);
- putchar ('\n');
- }
-#endif /* DEBUG */
-
- /* Initialize the compile stack. */
- compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
- if (compile_stack.stack == NULL)
- {
-#ifdef MBS_SUPPORT
- if (pattern) free(pattern);
- if (mbs_offset) free(mbs_offset);
- if (is_binary) free(is_binary);
-#endif
- return REG_ESPACE;
- }
-
- compile_stack.size = INIT_COMPILE_STACK_SIZE;
- compile_stack.avail = 0;
-
- /* Initialize the pattern buffer. */
- bufp->syntax = syntax;
- bufp->fastmap_accurate = 0;
- bufp->not_bol = bufp->not_eol = 0;
-
- /* Set `used' to zero, so that if we return an error, the pattern
- printer (for debugging) will think there's no pattern. We reset it
- at the end. */
- bufp->used = 0;
-
- /* Always count groups, whether or not bufp->no_sub is set. */
- bufp->re_nsub = 0;
-
-#if !defined emacs && !defined SYNTAX_TABLE
- /* Initialize the syntax table. */
- init_syntax_once ();
-#endif
-
- if (bufp->allocated == 0)
- {
- if (bufp->buffer)
- { /* If zero allocated, but buffer is non-null, try to realloc
- enough space. This loses if buffer's address is bogus, but
- that is the user's responsibility. */
-#ifdef MBS_SUPPORT
- /* Free bufp->buffer and allocate an array for wchar_t pattern
- buffer. */
- free(bufp->buffer);
- COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE/sizeof(US_CHAR_TYPE),
- US_CHAR_TYPE);
-#else
- RETALLOC (COMPILED_BUFFER_VAR, INIT_BUF_SIZE, US_CHAR_TYPE);
-#endif /* MBS_SUPPORT */
- }
- else
- { /* Caller did not allocate a buffer. Do it for them. */
- COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE / sizeof(US_CHAR_TYPE),
- US_CHAR_TYPE);
- }
-
- if (!COMPILED_BUFFER_VAR) FREE_STACK_RETURN (REG_ESPACE);
-#ifdef MBS_SUPPORT
- bufp->buffer = (char*)COMPILED_BUFFER_VAR;
-#endif /* MBS_SUPPORT */
- bufp->allocated = INIT_BUF_SIZE;
- }
-#ifdef MBS_SUPPORT
- else
- COMPILED_BUFFER_VAR = (US_CHAR_TYPE*) bufp->buffer;
-#endif
-
- begalt = b = COMPILED_BUFFER_VAR;
-
- /* Loop through the uncompiled pattern until we're at the end. */
- while (p != pend)
- {
- PATFETCH (c);
-
- switch (c)
- {
- case '^':
- {
- if ( /* If at start of pattern, it's an operator. */
- p == pattern + 1
- /* If context independent, it's an operator. */
- || syntax & RE_CONTEXT_INDEP_ANCHORS
- /* Otherwise, depends on what's come before. */
- || at_begline_loc_p (pattern, p, syntax))
- BUF_PUSH (begline);
- else
- goto normal_char;
- }
- break;
-
-
- case '$':
- {
- if ( /* If at end of pattern, it's an operator. */
- p == pend
- /* If context independent, it's an operator. */
- || syntax & RE_CONTEXT_INDEP_ANCHORS
- /* Otherwise, depends on what's next. */
- || at_endline_loc_p (p, pend, syntax))
- BUF_PUSH (endline);
- else
- goto normal_char;
- }
- break;
-
-
- case '+':
- case '?':
- if ((syntax & RE_BK_PLUS_QM)
- || (syntax & RE_LIMITED_OPS))
- goto normal_char;
- handle_plus:
- case '*':
- /* If there is no previous pattern... */
- if (!laststart)
- {
- if (syntax & RE_CONTEXT_INVALID_OPS)
- FREE_STACK_RETURN (REG_BADRPT);
- else if (!(syntax & RE_CONTEXT_INDEP_OPS))
- goto normal_char;
- }
-
- {
- /* Are we optimizing this jump? */
- boolean keep_string_p = false;
-
- /* 1 means zero (many) matches is allowed. */
- char zero_times_ok = 0, many_times_ok = 0;
-
- /* If there is a sequence of repetition chars, collapse it
- down to just one (the right one). We can't combine
- interval operators with these because of, e.g., `a{2}*',
- which should only match an even number of `a's. */
-
- for (;;)
- {
- zero_times_ok |= c != '+';
- many_times_ok |= c != '?';
-
- if (p == pend)
- break;
-
- PATFETCH (c);
-
- if (c == '*'
- || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
- ;
-
- else if (syntax & RE_BK_PLUS_QM && c == '\\')
- {
- if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
-
- PATFETCH (c1);
- if (!(c1 == '+' || c1 == '?'))
- {
- PATUNFETCH;
- PATUNFETCH;
- break;
- }
-
- c = c1;
- }
- else
- {
- PATUNFETCH;
- break;
- }
-
- /* If we get here, we found another repeat character. */
- }
-
- /* Star, etc. applied to an empty pattern is equivalent
- to an empty pattern. */
- if (!laststart)
- break;
-
- /* Now we know whether or not zero matches is allowed
- and also whether or not two or more matches is allowed. */
- if (many_times_ok)
- { /* More than one repetition is allowed, so put in at the
- end a backward relative jump from `b' to before the next
- jump we're going to put in below (which jumps from
- laststart to after this jump).
-
- But if we are at the `*' in the exact sequence `.*\n',
- insert an unconditional jump backwards to the .,
- instead of the beginning of the loop. This way we only
- push a failure point once, instead of every time
- through the loop. */
- assert (p - 1 > pattern);
-
- /* Allocate the space for the jump. */
- GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
-
- /* We know we are not at the first character of the pattern,
- because laststart was nonzero. And we've already
- incremented `p', by the way, to be the character after
- the `*'. Do we have to do something analogous here
- for null bytes, because of RE_DOT_NOT_NULL? */
- if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
- && zero_times_ok
- && p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
- && !(syntax & RE_DOT_NEWLINE))
- { /* We have .*\n. */
- STORE_JUMP (jump, b, laststart);
- keep_string_p = true;
- }
- else
- /* Anything else. */
- STORE_JUMP (maybe_pop_jump, b, laststart -
- (1 + OFFSET_ADDRESS_SIZE));
-
- /* We've added more stuff to the buffer. */
- b += 1 + OFFSET_ADDRESS_SIZE;
- }
-
- /* On failure, jump from laststart to b + 3, which will be the
- end of the buffer after this jump is inserted. */
- /* ifdef MBS_SUPPORT, 'b + 1 + OFFSET_ADDRESS_SIZE' instead of
- 'b + 3'. */
- GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
- INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
- : on_failure_jump,
- laststart, b + 1 + OFFSET_ADDRESS_SIZE);
- pending_exact = 0;
- b += 1 + OFFSET_ADDRESS_SIZE;
-
- if (!zero_times_ok)
- {
- /* At least one repetition is required, so insert a
- `dummy_failure_jump' before the initial
- `on_failure_jump' instruction of the loop. This
- effects a skip over that instruction the first time
- we hit that loop. */
- GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
- INSERT_JUMP (dummy_failure_jump, laststart, laststart +
- 2 + 2 * OFFSET_ADDRESS_SIZE);
- b += 1 + OFFSET_ADDRESS_SIZE;
- }
- }
- break;
-
-
- case '.':
- laststart = b;
- BUF_PUSH (anychar);
- break;
-
-
- case '[':
- {
- boolean had_char_class = false;
-#ifdef MBS_SUPPORT
- CHAR_TYPE range_start = 0xffffffff;
-#else
- unsigned int range_start = 0xffffffff;
-#endif
- if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
-
-#ifdef MBS_SUPPORT
- /* We assume a charset(_not) structure as a wchar_t array.
- charset[0] = (re_opcode_t) charset(_not)
- charset[1] = l (= length of char_classes)
- charset[2] = m (= length of collating_symbols)
- charset[3] = n (= length of equivalence_classes)
- charset[4] = o (= length of char_ranges)
- charset[5] = p (= length of chars)
-
- charset[6] = char_class (wctype_t)
- charset[6+CHAR_CLASS_SIZE] = char_class (wctype_t)
- ...
- charset[l+5] = char_class (wctype_t)
-
- charset[l+6] = collating_symbol (wchar_t)
- ...
- charset[l+m+5] = collating_symbol (wchar_t)
- ifdef _LIBC we use the index if
- _NL_COLLATE_SYMB_EXTRAMB instead of
- wchar_t string.
-
- charset[l+m+6] = equivalence_classes (wchar_t)
- ...
- charset[l+m+n+5] = equivalence_classes (wchar_t)
- ifdef _LIBC we use the index in
- _NL_COLLATE_WEIGHT instead of
- wchar_t string.
-
- charset[l+m+n+6] = range_start
- charset[l+m+n+7] = range_end
- ...
- charset[l+m+n+2o+4] = range_start
- charset[l+m+n+2o+5] = range_end
- ifdef _LIBC we use the value looked up
- in _NL_COLLATE_COLLSEQ instead of
- wchar_t character.
-
- charset[l+m+n+2o+6] = char
- ...
- charset[l+m+n+2o+p+5] = char
-
- */
-
- /* We need at least 6 spaces: the opcode, the length of
- char_classes, the length of collating_symbols, the length of
- equivalence_classes, the length of char_ranges, the length of
- chars. */
- GET_BUFFER_SPACE (6);
-
- /* Save b as laststart. And We use laststart as the pointer
- to the first element of the charset here.
- In other words, laststart[i] indicates charset[i]. */
- laststart = b;
-
- /* We test `*p == '^' twice, instead of using an if
- statement, so we only need one BUF_PUSH. */
- BUF_PUSH (*p == '^' ? charset_not : charset);
- if (*p == '^')
- p++;
-
- /* Push the length of char_classes, the length of
- collating_symbols, the length of equivalence_classes, the
- length of char_ranges and the length of chars. */
- BUF_PUSH_3 (0, 0, 0);
- BUF_PUSH_2 (0, 0);
-
- /* Remember the first position in the bracket expression. */
- p1 = p;
-
- /* charset_not matches newline according to a syntax bit. */
- if ((re_opcode_t) b[-6] == charset_not
- && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
- {
- BUF_PUSH('\n');
- laststart[5]++; /* Update the length of characters */
- }
-
- /* Read in characters and ranges, setting map bits. */
- for (;;)
- {
- if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
-
- PATFETCH (c);
-
- /* \ might escape characters inside [...] and [^...]. */
- if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
- {
- if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
-
- PATFETCH (c1);
- BUF_PUSH(c1);
- laststart[5]++; /* Update the length of chars */
- range_start = c1;
- continue;
- }
-
- /* Could be the end of the bracket expression. If it's
- not (i.e., when the bracket expression is `[]' so
- far), the ']' character bit gets set way below. */
- if (c == ']' && p != p1 + 1)
- break;
-
- /* Look ahead to see if it's a range when the last thing
- was a character class. */
- if (had_char_class && c == '-' && *p != ']')
- FREE_STACK_RETURN (REG_ERANGE);
-
- /* Look ahead to see if it's a range when the last thing
- was a character: if this is a hyphen not at the
- beginning or the end of a list, then it's the range
- operator. */
- if (c == '-'
- && !(p - 2 >= pattern && p[-2] == '[')
- && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
- && *p != ']')
- {
- reg_errcode_t ret;
- /* Allocate the space for range_start and range_end. */
- GET_BUFFER_SPACE (2);
- /* Update the pointer to indicate end of buffer. */
- b += 2;
- ret = compile_range (range_start, &p, pend, translate,
- syntax, b, laststart);
- if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
- range_start = 0xffffffff;
- }
- else if (p[0] == '-' && p[1] != ']')
- { /* This handles ranges made up of characters only. */
- reg_errcode_t ret;
-
- /* Move past the `-'. */
- PATFETCH (c1);
- /* Allocate the space for range_start and range_end. */
- GET_BUFFER_SPACE (2);
- /* Update the pointer to indicate end of buffer. */
- b += 2;
- ret = compile_range (c, &p, pend, translate, syntax, b,
- laststart);
- if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
- range_start = 0xffffffff;
- }
-
- /* See if we're at the beginning of a possible character
- class. */
- else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
- { /* Leave room for the null. */
- char str[CHAR_CLASS_MAX_LENGTH + 1];
-
- PATFETCH (c);
- c1 = 0;
-
- /* If pattern is `[[:'. */
- if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
-
- for (;;)
- {
- PATFETCH (c);
- if ((c == ':' && *p == ']') || p == pend)
- break;
- if (c1 < CHAR_CLASS_MAX_LENGTH)
- str[c1++] = c;
- else
- /* This is in any case an invalid class name. */
- str[0] = '\0';
- }
- str[c1] = '\0';
-
- /* If isn't a word bracketed by `[:' and `:]':
- undo the ending character, the letters, and leave
- the leading `:' and `[' (but store them as character). */
- if (c == ':' && *p == ']')
- {
- wctype_t wt;
- uintptr_t alignedp;
-
- /* Query the character class as wctype_t. */
- wt = IS_CHAR_CLASS (str);
- if (wt == 0)
- FREE_STACK_RETURN (REG_ECTYPE);
-
- /* Throw away the ] at the end of the character
- class. */
- PATFETCH (c);
-
- if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
-
- /* Allocate the space for character class. */
- GET_BUFFER_SPACE(CHAR_CLASS_SIZE);
- /* Update the pointer to indicate end of buffer. */
- b += CHAR_CLASS_SIZE;
- /* Move data which follow character classes
- not to violate the data. */
- insert_space(CHAR_CLASS_SIZE,
- laststart + 6 + laststart[1],
- b - 1);
- alignedp = ((uintptr_t)(laststart + 6 + laststart[1])
- + __alignof__(wctype_t) - 1)
- & ~(uintptr_t)(__alignof__(wctype_t) - 1);
- /* Store the character class. */
- *((wctype_t*)alignedp) = wt;
- /* Update length of char_classes */
- laststart[1] += CHAR_CLASS_SIZE;
-
- had_char_class = true;
- }
- else
- {
- c1++;
- while (c1--)
- PATUNFETCH;
- BUF_PUSH ('[');
- BUF_PUSH (':');
- laststart[5] += 2; /* Update the length of characters */
- range_start = ':';
- had_char_class = false;
- }
- }
- else if (syntax & RE_CHAR_CLASSES && c == '[' && (*p == '='
- || *p == '.'))
- {
- CHAR_TYPE str[128]; /* Should be large enough. */
- CHAR_TYPE delim = *p; /* '=' or '.' */
-# ifdef _LIBC
- uint32_t nrules =
- _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
-# endif
- PATFETCH (c);
- c1 = 0;
-
- /* If pattern is `[[=' or '[[.'. */
- if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
-
- for (;;)
- {
- PATFETCH (c);
- if ((c == delim && *p == ']') || p == pend)
- break;
- if (c1 < sizeof (str) - 1)
- str[c1++] = c;
- else
- /* This is in any case an invalid class name. */
- str[0] = '\0';
- }
- str[c1] = '\0';
-
- if (c == delim && *p == ']' && str[0] != '\0')
- {
- unsigned int i, offset;
- /* If we have no collation data we use the default
- collation in which each character is in a class
- by itself. It also means that ASCII is the
- character set and therefore we cannot have character
- with more than one byte in the multibyte
- representation. */
-
- /* If not defined _LIBC, we push the name and
- `\0' for the sake of matching performance. */
- int datasize = c1 + 1;
-
-# ifdef _LIBC
- int32_t idx = 0;
- if (nrules == 0)
-# endif
- {
- if (c1 != 1)
- FREE_STACK_RETURN (REG_ECOLLATE);
- }
-# ifdef _LIBC
- else
- {
- const int32_t *table;
- const int32_t *weights;
- const int32_t *extra;
- const int32_t *indirect;
- wint_t *cp;
-
- /* This #include defines a local function! */
-# include <locale/weightwc.h>
-
- if(delim == '=')
- {
- /* We push the index for equivalence class. */
- cp = (wint_t*)str;
-
- table = (const int32_t *)
- _NL_CURRENT (LC_COLLATE,
- _NL_COLLATE_TABLEWC);
- weights = (const int32_t *)
- _NL_CURRENT (LC_COLLATE,
- _NL_COLLATE_WEIGHTWC);
- extra = (const int32_t *)
- _NL_CURRENT (LC_COLLATE,
- _NL_COLLATE_EXTRAWC);
- indirect = (const int32_t *)
- _NL_CURRENT (LC_COLLATE,
- _NL_COLLATE_INDIRECTWC);
-
- idx = findidx ((const wint_t**)&cp);
- if (idx == 0 || cp < (wint_t*) str + c1)
- /* This is no valid character. */
- FREE_STACK_RETURN (REG_ECOLLATE);
-
- str[0] = (wchar_t)idx;
- }
- else /* delim == '.' */
- {
- /* We push collation sequence value
- for collating symbol. */
- int32_t table_size;
- const int32_t *symb_table;
- const unsigned char *extra;
- int32_t idx;
- int32_t elem;
- int32_t second;
- int32_t hash;
- char char_str[c1];
-
- /* We have to convert the name to a single-byte
- string. This is possible since the names
- consist of ASCII characters and the internal
- representation is UCS4. */
- for (i = 0; i < c1; ++i)
- char_str[i] = str[i];
-
- table_size =
- _NL_CURRENT_WORD (LC_COLLATE,
- _NL_COLLATE_SYMB_HASH_SIZEMB);
- symb_table = (const int32_t *)
- _NL_CURRENT (LC_COLLATE,
- _NL_COLLATE_SYMB_TABLEMB);
- extra = (const unsigned char *)
- _NL_CURRENT (LC_COLLATE,
- _NL_COLLATE_SYMB_EXTRAMB);
-
- /* Locate the character in the hashing table. */
- hash = elem_hash (char_str, c1);
-
- idx = 0;
- elem = hash % table_size;
- second = hash % (table_size - 2);
- while (symb_table[2 * elem] != 0)
- {
- /* First compare the hashing value. */
- if (symb_table[2 * elem] == hash
- && c1 == extra[symb_table[2 * elem + 1]]
- && memcmp (str,
- &extra[symb_table[2 * elem + 1]
- + 1], c1) == 0)
- {
- /* Yep, this is the entry. */
- idx = symb_table[2 * elem + 1];
- idx += 1 + extra[idx];
- break;
- }
-
- /* Next entry. */
- elem += second;
- }
-
- if (symb_table[2 * elem] != 0)
- {
- /* Compute the index of the byte sequence
- in the table. */
- idx += 1 + extra[idx];
- /* Adjust for the alignment. */
- idx = (idx + 3) & ~4;
-
- str[0] = (wchar_t) idx + 4;
- }
- else if (symb_table[2 * elem] == 0 && c1 == 1)
- {
- /* No valid character. Match it as a
- single byte character. */
- had_char_class = false;
- BUF_PUSH(str[0]);
- /* Update the length of characters */
- laststart[5]++;
- range_start = str[0];
-
- /* Throw away the ] at the end of the
- collating symbol. */
- PATFETCH (c);
- /* exit from the switch block. */
- continue;
- }
- else
- FREE_STACK_RETURN (REG_ECOLLATE);
- }
- datasize = 1;
- }
-# endif
- /* Throw away the ] at the end of the equivalence
- class (or collating symbol). */
- PATFETCH (c);
-
- /* Allocate the space for the equivalence class
- (or collating symbol) (and '\0' if needed). */
- GET_BUFFER_SPACE(datasize);
- /* Update the pointer to indicate end of buffer. */
- b += datasize;
-
- if (delim == '=')
- { /* equivalence class */
- /* Calculate the offset of char_ranges,
- which is next to equivalence_classes. */
- offset = laststart[1] + laststart[2]
- + laststart[3] +6;
- /* Insert space. */
- insert_space(datasize, laststart + offset, b - 1);
-
- /* Write the equivalence_class and \0. */
- for (i = 0 ; i < datasize ; i++)
- laststart[offset + i] = str[i];
-
- /* Update the length of equivalence_classes. */
- laststart[3] += datasize;
- had_char_class = true;
- }
- else /* delim == '.' */
- { /* collating symbol */
- /* Calculate the offset of the equivalence_classes,
- which is next to collating_symbols. */
- offset = laststart[1] + laststart[2] + 6;
- /* Insert space and write the collationg_symbol
- and \0. */
- insert_space(datasize, laststart + offset, b-1);
- for (i = 0 ; i < datasize ; i++)
- laststart[offset + i] = str[i];
-
- /* In re_match_2_internal if range_start < -1, we
- assume -range_start is the offset of the
- collating symbol which is specified as
- the character of the range start. So we assign
- -(laststart[1] + laststart[2] + 6) to
- range_start. */
- range_start = -(laststart[1] + laststart[2] + 6);
- /* Update the length of collating_symbol. */
- laststart[2] += datasize;
- had_char_class = false;
- }
- }
- else
- {
- c1++;
- while (c1--)
- PATUNFETCH;
- BUF_PUSH ('[');
- BUF_PUSH (delim);
- laststart[5] += 2; /* Update the length of characters */
- range_start = delim;
- had_char_class = false;
- }
- }
- else
- {
- had_char_class = false;
- BUF_PUSH(c);
- laststart[5]++; /* Update the length of characters */
- range_start = c;
- }
- }
-
-#else /* not MBS_SUPPORT */
- /* Ensure that we have enough space to push a charset: the
- opcode, the length count, and the bitset; 34 bytes in all. */
- GET_BUFFER_SPACE (34);
-
- laststart = b;
-
- /* We test `*p == '^' twice, instead of using an if
- statement, so we only need one BUF_PUSH. */
- BUF_PUSH (*p == '^' ? charset_not : charset);
- if (*p == '^')
- p++;
-
- /* Remember the first position in the bracket expression. */
- p1 = p;
-
- /* Push the number of bytes in the bitmap. */
- BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
-
- /* Clear the whole map. */
- bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
-
- /* charset_not matches newline according to a syntax bit. */
- if ((re_opcode_t) b[-2] == charset_not
- && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
- SET_LIST_BIT ('\n');
-
- /* Read in characters and ranges, setting map bits. */
- for (;;)
- {
- if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
-
- PATFETCH (c);
-
- /* \ might escape characters inside [...] and [^...]. */
- if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
- {
- if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
-
- PATFETCH (c1);
- SET_LIST_BIT (c1);
- range_start = c1;
- continue;
- }
-
- /* Could be the end of the bracket expression. If it's
- not (i.e., when the bracket expression is `[]' so
- far), the ']' character bit gets set way below. */
- if (c == ']' && p != p1 + 1)
- break;
-
- /* Look ahead to see if it's a range when the last thing
- was a character class. */
- if (had_char_class && c == '-' && *p != ']')
- FREE_STACK_RETURN (REG_ERANGE);
-
- /* Look ahead to see if it's a range when the last thing
- was a character: if this is a hyphen not at the
- beginning or the end of a list, then it's the range
- operator. */
- if (c == '-'
- && !(p - 2 >= pattern && p[-2] == '[')
- && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
- && *p != ']')
- {
- reg_errcode_t ret
- = compile_range (range_start, &p, pend, translate,
- syntax, b);
- if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
- range_start = 0xffffffff;
- }
-
- else if (p[0] == '-' && p[1] != ']')
- { /* This handles ranges made up of characters only. */
- reg_errcode_t ret;
-
- /* Move past the `-'. */
- PATFETCH (c1);
-
- ret = compile_range (c, &p, pend, translate, syntax, b);
- if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
- range_start = 0xffffffff;
- }
-
- /* See if we're at the beginning of a possible character
- class. */
-
- else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
- { /* Leave room for the null. */
- char str[CHAR_CLASS_MAX_LENGTH + 1];
-
- PATFETCH (c);
- c1 = 0;
-
- /* If pattern is `[[:'. */
- if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
-
- for (;;)
- {
- PATFETCH (c);
- if ((c == ':' && *p == ']') || p == pend)
- break;
- if (c1 < CHAR_CLASS_MAX_LENGTH)
- str[c1++] = c;
- else
- /* This is in any case an invalid class name. */
- str[0] = '\0';
- }
- str[c1] = '\0';
-
- /* If isn't a word bracketed by `[:' and `:]':
- undo the ending character, the letters, and leave
- the leading `:' and `[' (but set bits for them). */
- if (c == ':' && *p == ']')
- {
-# if defined _LIBC || WIDE_CHAR_SUPPORT
- boolean is_lower = STREQ (str, "lower");
- boolean is_upper = STREQ (str, "upper");
- wctype_t wt;
- int ch;
-
- wt = IS_CHAR_CLASS (str);
- if (wt == 0)
- FREE_STACK_RETURN (REG_ECTYPE);
-
- /* Throw away the ] at the end of the character
- class. */
- PATFETCH (c);
-
- if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
-
- for (ch = 0; ch < 1 << BYTEWIDTH; ++ch)
- {
-# ifdef _LIBC
- if (__iswctype (__btowc (ch), wt))
- SET_LIST_BIT (ch);
-# else
- if (iswctype (btowc (ch), wt))
- SET_LIST_BIT (ch);
-# endif
-
- if (translate && (is_upper || is_lower)
- && (ISUPPER (ch) || ISLOWER (ch)))
- SET_LIST_BIT (ch);
- }
-
- had_char_class = true;
-# else
- int ch;
- boolean is_alnum = STREQ (str, "alnum");
- boolean is_alpha = STREQ (str, "alpha");
- boolean is_blank = STREQ (str, "blank");
- boolean is_cntrl = STREQ (str, "cntrl");
- boolean is_digit = STREQ (str, "digit");
- boolean is_graph = STREQ (str, "graph");
- boolean is_lower = STREQ (str, "lower");
- boolean is_print = STREQ (str, "print");
- boolean is_punct = STREQ (str, "punct");
- boolean is_space = STREQ (str, "space");
- boolean is_upper = STREQ (str, "upper");
- boolean is_xdigit = STREQ (str, "xdigit");
-
- if (!IS_CHAR_CLASS (str))
- FREE_STACK_RETURN (REG_ECTYPE);
-
- /* Throw away the ] at the end of the character
- class. */
- PATFETCH (c);
-
- if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
-
- for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
- {
- /* This was split into 3 if's to
- avoid an arbitrary limit in some compiler. */
- if ( (is_alnum && ISALNUM (ch))
- || (is_alpha && ISALPHA (ch))
- || (is_blank && ISBLANK (ch))
- || (is_cntrl && ISCNTRL (ch)))
- SET_LIST_BIT (ch);
- if ( (is_digit && ISDIGIT (ch))
- || (is_graph && ISGRAPH (ch))
- || (is_lower && ISLOWER (ch))
- || (is_print && ISPRINT (ch)))
- SET_LIST_BIT (ch);
- if ( (is_punct && ISPUNCT (ch))
- || (is_space && ISSPACE (ch))
- || (is_upper && ISUPPER (ch))
- || (is_xdigit && ISXDIGIT (ch)))
- SET_LIST_BIT (ch);
- if ( translate && (is_upper || is_lower)
- && (ISUPPER (ch) || ISLOWER (ch)))
- SET_LIST_BIT (ch);
- }
- had_char_class = true;
-# endif /* libc || wctype.h */
- }
- else
- {
- c1++;
- while (c1--)
- PATUNFETCH;
- SET_LIST_BIT ('[');
- SET_LIST_BIT (':');
- range_start = ':';
- had_char_class = false;
- }
- }
- else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '=')
- {
- unsigned char str[MB_LEN_MAX + 1];
-# ifdef _LIBC
- uint32_t nrules =
- _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
-# endif
-
- PATFETCH (c);
- c1 = 0;
-
- /* If pattern is `[[='. */
- if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
-
- for (;;)
- {
- PATFETCH (c);
- if ((c == '=' && *p == ']') || p == pend)
- break;
- if (c1 < MB_LEN_MAX)
- str[c1++] = c;
- else
- /* This is in any case an invalid class name. */
- str[0] = '\0';
- }
- str[c1] = '\0';
-
- if (c == '=' && *p == ']' && str[0] != '\0')
- {
- /* If we have no collation data we use the default
- collation in which each character is in a class
- by itself. It also means that ASCII is the
- character set and therefore we cannot have character
- with more than one byte in the multibyte
- representation. */
-# ifdef _LIBC
- if (nrules == 0)
-# endif
- {
- if (c1 != 1)
- FREE_STACK_RETURN (REG_ECOLLATE);
-
- /* Throw away the ] at the end of the equivalence
- class. */
- PATFETCH (c);
-
- /* Set the bit for the character. */
- SET_LIST_BIT (str[0]);
- }
-# ifdef _LIBC
- else
- {
- /* Try to match the byte sequence in `str' against
- those known to the collate implementation.
- First find out whether the bytes in `str' are
- actually from exactly one character. */
- const int32_t *table;
- const unsigned char *weights;
- const unsigned char *extra;
- const int32_t *indirect;
- int32_t idx;
- const unsigned char *cp = str;
- int ch;
-
- /* This #include defines a local function! */
-# include <locale/weight.h>
-
- table = (const int32_t *)
- _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
- weights = (const unsigned char *)
- _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
- extra = (const unsigned char *)
- _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
- indirect = (const int32_t *)
- _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
-
- idx = findidx (&cp);
- if (idx == 0 || cp < str + c1)
- /* This is no valid character. */
- FREE_STACK_RETURN (REG_ECOLLATE);
-
- /* Throw away the ] at the end of the equivalence
- class. */
- PATFETCH (c);
-
- /* Now we have to go throught the whole table
- and find all characters which have the same
- first level weight.
-
- XXX Note that this is not entirely correct.
- we would have to match multibyte sequences
- but this is not possible with the current
- implementation. */
- for (ch = 1; ch < 256; ++ch)
- /* XXX This test would have to be changed if we
- would allow matching multibyte sequences. */
- if (table[ch] > 0)
- {
- int32_t idx2 = table[ch];
- size_t len = weights[idx2];
-
- /* Test whether the lenghts match. */
- if (weights[idx] == len)
- {
- /* They do. New compare the bytes of
- the weight. */
- size_t cnt = 0;
-
- while (cnt < len
- && (weights[idx + 1 + cnt]
- == weights[idx2 + 1 + cnt]))
- ++cnt;
-
- if (cnt == len)
- /* They match. Mark the character as
- acceptable. */
- SET_LIST_BIT (ch);
- }
- }
- }
-# endif
- had_char_class = true;
- }
- else
- {
- c1++;
- while (c1--)
- PATUNFETCH;
- SET_LIST_BIT ('[');
- SET_LIST_BIT ('=');
- range_start = '=';
- had_char_class = false;
- }
- }
- else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '.')
- {
- unsigned char str[128]; /* Should be large enough. */
-# ifdef _LIBC
- uint32_t nrules =
- _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
-# endif
-
- PATFETCH (c);
- c1 = 0;
-
- /* If pattern is `[[.'. */
- if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
-
- for (;;)
- {
- PATFETCH (c);
- if ((c == '.' && *p == ']') || p == pend)
- break;
- if (c1 < sizeof (str))
- str[c1++] = c;
- else
- /* This is in any case an invalid class name. */
- str[0] = '\0';
- }
- str[c1] = '\0';
-
- if (c == '.' && *p == ']' && str[0] != '\0')
- {
- /* If we have no collation data we use the default
- collation in which each character is the name
- for its own class which contains only the one
- character. It also means that ASCII is the
- character set and therefore we cannot have character
- with more than one byte in the multibyte
- representation. */
-# ifdef _LIBC
- if (nrules == 0)
-# endif
- {
- if (c1 != 1)
- FREE_STACK_RETURN (REG_ECOLLATE);
-
- /* Throw away the ] at the end of the equivalence
- class. */
- PATFETCH (c);
-
- /* Set the bit for the character. */
- SET_LIST_BIT (str[0]);
- range_start = ((const unsigned char *) str)[0];
- }
-# ifdef _LIBC
- else
- {
- /* Try to match the byte sequence in `str' against
- those known to the collate implementation.
- First find out whether the bytes in `str' are
- actually from exactly one character. */
- int32_t table_size;
- const int32_t *symb_table;
- const unsigned char *extra;
- int32_t idx;
- int32_t elem;
- int32_t second;
- int32_t hash;
-
- table_size =
- _NL_CURRENT_WORD (LC_COLLATE,
- _NL_COLLATE_SYMB_HASH_SIZEMB);
- symb_table = (const int32_t *)
- _NL_CURRENT (LC_COLLATE,
- _NL_COLLATE_SYMB_TABLEMB);
- extra = (const unsigned char *)
- _NL_CURRENT (LC_COLLATE,
- _NL_COLLATE_SYMB_EXTRAMB);
-
- /* Locate the character in the hashing table. */
- hash = elem_hash (str, c1);
-
- idx = 0;
- elem = hash % table_size;
- second = hash % (table_size - 2);
- while (symb_table[2 * elem] != 0)
- {
- /* First compare the hashing value. */
- if (symb_table[2 * elem] == hash
- && c1 == extra[symb_table[2 * elem + 1]]
- && memcmp (str,
- &extra[symb_table[2 * elem + 1]
- + 1],
- c1) == 0)
- {
- /* Yep, this is the entry. */
- idx = symb_table[2 * elem + 1];
- idx += 1 + extra[idx];
- break;
- }
-
- /* Next entry. */
- elem += second;
- }
-
- if (symb_table[2 * elem] == 0)
- /* This is no valid character. */
- FREE_STACK_RETURN (REG_ECOLLATE);
-
- /* Throw away the ] at the end of the equivalence
- class. */
- PATFETCH (c);
-
- /* Now add the multibyte character(s) we found
- to the accept list.
-
- XXX Note that this is not entirely correct.
- we would have to match multibyte sequences
- but this is not possible with the current
- implementation. Also, we have to match
- collating symbols, which expand to more than
- one file, as a whole and not allow the
- individual bytes. */
- c1 = extra[idx++];
- if (c1 == 1)
- range_start = extra[idx];
- while (c1-- > 0)
- {
- SET_LIST_BIT (extra[idx]);
- ++idx;
- }
- }
-# endif
- had_char_class = false;
- }
- else
- {
- c1++;
- while (c1--)
- PATUNFETCH;
- SET_LIST_BIT ('[');
- SET_LIST_BIT ('.');
- range_start = '.';
- had_char_class = false;
- }
- }
- else
- {
- had_char_class = false;
- SET_LIST_BIT (c);
- range_start = c;
- }
- }
-
- /* Discard any (non)matching list bytes that are all 0 at the
- end of the map. Decrease the map-length byte too. */
- while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
- b[-1]--;
- b += b[-1];
-#endif /* MBS_SUPPORT */
- }
- break;
-
-
- case '(':
- if (syntax & RE_NO_BK_PARENS)
- goto handle_open;
- else
- goto normal_char;
-
-
- case ')':
- if (syntax & RE_NO_BK_PARENS)
- goto handle_close;
- else
- goto normal_char;
-
-
- case '\n':
- if (syntax & RE_NEWLINE_ALT)
- goto handle_alt;
- else
- goto normal_char;
-
-
- case '|':
- if (syntax & RE_NO_BK_VBAR)
- goto handle_alt;
- else
- goto normal_char;
-
-
- case '{':
- if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
- goto handle_interval;
- else
- goto normal_char;
-
-
- case '\\':
- if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
-
- /* Do not translate the character after the \, so that we can
- distinguish, e.g., \B from \b, even if we normally would
- translate, e.g., B to b. */
- PATFETCH_RAW (c);
-
- switch (c)
- {
- case '(':
- if (syntax & RE_NO_BK_PARENS)
- goto normal_backslash;
-
- handle_open:
- bufp->re_nsub++;
- regnum++;
-
- if (COMPILE_STACK_FULL)
- {
- RETALLOC (compile_stack.stack, compile_stack.size << 1,
- compile_stack_elt_t);
- if (compile_stack.stack == NULL) return REG_ESPACE;
-
- compile_stack.size <<= 1;
- }
-
- /* These are the values to restore when we hit end of this
- group. They are all relative offsets, so that if the
- whole pattern moves because of realloc, they will still
- be valid. */
- COMPILE_STACK_TOP.begalt_offset = begalt - COMPILED_BUFFER_VAR;
- COMPILE_STACK_TOP.fixup_alt_jump
- = fixup_alt_jump ? fixup_alt_jump - COMPILED_BUFFER_VAR + 1 : 0;
- COMPILE_STACK_TOP.laststart_offset = b - COMPILED_BUFFER_VAR;
- COMPILE_STACK_TOP.regnum = regnum;
-
- /* We will eventually replace the 0 with the number of
- groups inner to this one. But do not push a
- start_memory for groups beyond the last one we can
- represent in the compiled pattern. */
- if (regnum <= MAX_REGNUM)
- {
- COMPILE_STACK_TOP.inner_group_offset = b
- - COMPILED_BUFFER_VAR + 2;
- BUF_PUSH_3 (start_memory, regnum, 0);
- }
-
- compile_stack.avail++;
-
- fixup_alt_jump = 0;
- laststart = 0;
- begalt = b;
- /* If we've reached MAX_REGNUM groups, then this open
- won't actually generate any code, so we'll have to
- clear pending_exact explicitly. */
- pending_exact = 0;
- break;
-
-
- case ')':
- if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
-
- if (COMPILE_STACK_EMPTY)
- {
- if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
- goto normal_backslash;
- else
- FREE_STACK_RETURN (REG_ERPAREN);
- }
-
- handle_close:
- if (fixup_alt_jump)
- { /* Push a dummy failure point at the end of the
- alternative for a possible future
- `pop_failure_jump' to pop. See comments at
- `push_dummy_failure' in `re_match_2'. */
- BUF_PUSH (push_dummy_failure);
-
- /* We allocated space for this jump when we assigned
- to `fixup_alt_jump', in the `handle_alt' case below. */
- STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1);
- }
-
- /* See similar code for backslashed left paren above. */
- if (COMPILE_STACK_EMPTY)
- {
- if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
- goto normal_char;
- else
- FREE_STACK_RETURN (REG_ERPAREN);
- }
-
- /* Since we just checked for an empty stack above, this
- ``can't happen''. */
- assert (compile_stack.avail != 0);
- {
- /* We don't just want to restore into `regnum', because
- later groups should continue to be numbered higher,
- as in `(ab)c(de)' -- the second group is #2. */
- regnum_t this_group_regnum;
-
- compile_stack.avail--;
- begalt = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.begalt_offset;
- fixup_alt_jump
- = COMPILE_STACK_TOP.fixup_alt_jump
- ? COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.fixup_alt_jump - 1
- : 0;
- laststart = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.laststart_offset;
- this_group_regnum = COMPILE_STACK_TOP.regnum;
- /* If we've reached MAX_REGNUM groups, then this open
- won't actually generate any code, so we'll have to
- clear pending_exact explicitly. */
- pending_exact = 0;
-
- /* We're at the end of the group, so now we know how many
- groups were inside this one. */
- if (this_group_regnum <= MAX_REGNUM)
- {
- US_CHAR_TYPE *inner_group_loc
- = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.inner_group_offset;
-
- *inner_group_loc = regnum - this_group_regnum;
- BUF_PUSH_3 (stop_memory, this_group_regnum,
- regnum - this_group_regnum);
- }
- }
- break;
-
-
- case '|': /* `\|'. */
- if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
- goto normal_backslash;
- handle_alt:
- if (syntax & RE_LIMITED_OPS)
- goto normal_char;
-
- /* Insert before the previous alternative a jump which
- jumps to this alternative if the former fails. */
- GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
- INSERT_JUMP (on_failure_jump, begalt,
- b + 2 + 2 * OFFSET_ADDRESS_SIZE);
- pending_exact = 0;
- b += 1 + OFFSET_ADDRESS_SIZE;
-
- /* The alternative before this one has a jump after it
- which gets executed if it gets matched. Adjust that
- jump so it will jump to this alternative's analogous
- jump (put in below, which in turn will jump to the next
- (if any) alternative's such jump, etc.). The last such
- jump jumps to the correct final destination. A picture:
- _____ _____
- | | | |
- | v | v
- a | b | c
-
- If we are at `b', then fixup_alt_jump right now points to a
- three-byte space after `a'. We'll put in the jump, set
- fixup_alt_jump to right after `b', and leave behind three
- bytes which we'll fill in when we get to after `c'. */
-
- if (fixup_alt_jump)
- STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
-
- /* Mark and leave space for a jump after this alternative,
- to be filled in later either by next alternative or
- when know we're at the end of a series of alternatives. */
- fixup_alt_jump = b;
- GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
- b += 1 + OFFSET_ADDRESS_SIZE;
-
- laststart = 0;
- begalt = b;
- break;
-
-
- case '{':
- /* If \{ is a literal. */
- if (!(syntax & RE_INTERVALS)
- /* If we're at `\{' and it's not the open-interval
- operator. */
- || (syntax & RE_NO_BK_BRACES))
- goto normal_backslash;
-
- handle_interval:
- {
- /* If got here, then the syntax allows intervals. */
-
- /* At least (most) this many matches must be made. */
- int lower_bound = -1, upper_bound = -1;
-
- /* Place in the uncompiled pattern (i.e., just after
- the '{') to go back to if the interval is invalid. */
- const CHAR_TYPE *beg_interval = p;
-
- if (p == pend)
- goto invalid_interval;
-
- GET_UNSIGNED_NUMBER (lower_bound);
-
- if (c == ',')
- {
- GET_UNSIGNED_NUMBER (upper_bound);
- if (upper_bound < 0)
- upper_bound = RE_DUP_MAX;
- }
- else
- /* Interval such as `{1}' => match exactly once. */
- upper_bound = lower_bound;
-
- if (! (0 <= lower_bound && lower_bound <= upper_bound))
- goto invalid_interval;
-
- if (!(syntax & RE_NO_BK_BRACES))
- {
- if (c != '\\' || p == pend)
- goto invalid_interval;
- PATFETCH (c);
- }
-
- if (c != '}')
- goto invalid_interval;
-
- /* If it's invalid to have no preceding re. */
- if (!laststart)
- {
- if (syntax & RE_CONTEXT_INVALID_OPS
- && !(syntax & RE_INVALID_INTERVAL_ORD))
- FREE_STACK_RETURN (REG_BADRPT);
- else if (syntax & RE_CONTEXT_INDEP_OPS)
- laststart = b;
- else
- goto unfetch_interval;
- }
-
- /* We just parsed a valid interval. */
-
- if (RE_DUP_MAX < upper_bound)
- FREE_STACK_RETURN (REG_BADBR);
-
- /* If the upper bound is zero, don't want to succeed at
- all; jump from `laststart' to `b + 3', which will be
- the end of the buffer after we insert the jump. */
- /* ifdef MBS_SUPPORT, 'b + 1 + OFFSET_ADDRESS_SIZE'
- instead of 'b + 3'. */
- if (upper_bound == 0)
- {
- GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
- INSERT_JUMP (jump, laststart, b + 1
- + OFFSET_ADDRESS_SIZE);
- b += 1 + OFFSET_ADDRESS_SIZE;
- }
-
- /* Otherwise, we have a nontrivial interval. When
- we're all done, the pattern will look like:
- set_number_at <jump count> <upper bound>
- set_number_at <succeed_n count> <lower bound>
- succeed_n <after jump addr> <succeed_n count>
- <body of loop>
- jump_n <succeed_n addr> <jump count>
- (The upper bound and `jump_n' are omitted if
- `upper_bound' is 1, though.) */
- else
- { /* If the upper bound is > 1, we need to insert
- more at the end of the loop. */
- unsigned nbytes = 2 + 4 * OFFSET_ADDRESS_SIZE +
- (upper_bound > 1) * (2 + 4 * OFFSET_ADDRESS_SIZE);
-
- GET_BUFFER_SPACE (nbytes);
-
- /* Initialize lower bound of the `succeed_n', even
- though it will be set during matching by its
- attendant `set_number_at' (inserted next),
- because `re_compile_fastmap' needs to know.
- Jump to the `jump_n' we might insert below. */
- INSERT_JUMP2 (succeed_n, laststart,
- b + 1 + 2 * OFFSET_ADDRESS_SIZE
- + (upper_bound > 1) * (1 + 2 * OFFSET_ADDRESS_SIZE)
- , lower_bound);
- b += 1 + 2 * OFFSET_ADDRESS_SIZE;
-
- /* Code to initialize the lower bound. Insert
- before the `succeed_n'. The `5' is the last two
- bytes of this `set_number_at', plus 3 bytes of
- the following `succeed_n'. */
- /* ifdef MBS_SUPPORT, The '1+2*OFFSET_ADDRESS_SIZE'
- is the 'set_number_at', plus '1+OFFSET_ADDRESS_SIZE'
- of the following `succeed_n'. */
- insert_op2 (set_number_at, laststart, 1
- + 2 * OFFSET_ADDRESS_SIZE, lower_bound, b);
- b += 1 + 2 * OFFSET_ADDRESS_SIZE;
-
- if (upper_bound > 1)
- { /* More than one repetition is allowed, so
- append a backward jump to the `succeed_n'
- that starts this interval.
-
- When we've reached this during matching,
- we'll have matched the interval once, so
- jump back only `upper_bound - 1' times. */
- STORE_JUMP2 (jump_n, b, laststart
- + 2 * OFFSET_ADDRESS_SIZE + 1,
- upper_bound - 1);
- b += 1 + 2 * OFFSET_ADDRESS_SIZE;
-
- /* The location we want to set is the second
- parameter of the `jump_n'; that is `b-2' as
- an absolute address. `laststart' will be
- the `set_number_at' we're about to insert;
- `laststart+3' the number to set, the source
- for the relative address. But we are
- inserting into the middle of the pattern --
- so everything is getting moved up by 5.
- Conclusion: (b - 2) - (laststart + 3) + 5,
- i.e., b - laststart.
-
- We insert this at the beginning of the loop
- so that if we fail during matching, we'll
- reinitialize the bounds. */
- insert_op2 (set_number_at, laststart, b - laststart,
- upper_bound - 1, b);
- b += 1 + 2 * OFFSET_ADDRESS_SIZE;
- }
- }
- pending_exact = 0;
- break;
-
- invalid_interval:
- if (!(syntax & RE_INVALID_INTERVAL_ORD))
- FREE_STACK_RETURN (p == pend ? REG_EBRACE : REG_BADBR);
- unfetch_interval:
- /* Match the characters as literals. */
- p = beg_interval;
- c = '{';
- if (syntax & RE_NO_BK_BRACES)
- goto normal_char;
- else
- goto normal_backslash;
- }
-
-#ifdef emacs
- /* There is no way to specify the before_dot and after_dot
- operators. rms says this is ok. --karl */
- case '=':
- BUF_PUSH (at_dot);
- break;
-
- case 's':
- laststart = b;
- PATFETCH (c);
- BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
- break;
-
- case 'S':
- laststart = b;
- PATFETCH (c);
- BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
- break;
-#endif /* emacs */
-
-
- case 'w':
- if (syntax & RE_NO_GNU_OPS)
- goto normal_char;
- laststart = b;
- BUF_PUSH (wordchar);
- break;
-
-
- case 'W':
- if (syntax & RE_NO_GNU_OPS)
- goto normal_char;
- laststart = b;
- BUF_PUSH (notwordchar);
- break;
-
-
- case '<':
- if (syntax & RE_NO_GNU_OPS)
- goto normal_char;
- BUF_PUSH (wordbeg);
- break;
-
- case '>':
- if (syntax & RE_NO_GNU_OPS)
- goto normal_char;
- BUF_PUSH (wordend);
- break;
-
- case 'b':
- if (syntax & RE_NO_GNU_OPS)
- goto normal_char;
- BUF_PUSH (wordbound);
- break;
-
- case 'B':
- if (syntax & RE_NO_GNU_OPS)
- goto normal_char;
- BUF_PUSH (notwordbound);
- break;
-
- case '`':
- if (syntax & RE_NO_GNU_OPS)
- goto normal_char;
- BUF_PUSH (begbuf);
- break;
-
- case '\'':
- if (syntax & RE_NO_GNU_OPS)
- goto normal_char;
- BUF_PUSH (endbuf);
- break;
-
- case '1': case '2': case '3': case '4': case '5':
- case '6': case '7': case '8': case '9':
- if (syntax & RE_NO_BK_REFS)
- goto normal_char;
-
- c1 = c - '0';
-
- if (c1 > regnum)
- FREE_STACK_RETURN (REG_ESUBREG);
-
- /* Can't back reference to a subexpression if inside of it. */
- if (group_in_compile_stack (compile_stack, (regnum_t) c1))
- goto normal_char;
-
- laststart = b;
- BUF_PUSH_2 (duplicate, c1);
- break;
-
-
- case '+':
- case '?':
- if (syntax & RE_BK_PLUS_QM)
- goto handle_plus;
- else
- goto normal_backslash;
-
- default:
- normal_backslash:
- /* You might think it would be useful for \ to mean
- not to translate; but if we don't translate it
- it will never match anything. */
- c = TRANSLATE (c);
- goto normal_char;
- }
- break;
-
-
- default:
- /* Expects the character in `c'. */
- normal_char:
- /* If no exactn currently being built. */
- if (!pending_exact
-#ifdef MBS_SUPPORT
- /* If last exactn handle binary(or character) and
- new exactn handle character(or binary). */
- || is_exactn_bin != is_binary[p - 1 - pattern]
-#endif /* MBS_SUPPORT */
-
- /* If last exactn not at current position. */
- || pending_exact + *pending_exact + 1 != b
-
- /* We have only one byte following the exactn for the count. */
- || *pending_exact == (1 << BYTEWIDTH) - 1
-
- /* If followed by a repetition operator. */
- || *p == '*' || *p == '^'
- || ((syntax & RE_BK_PLUS_QM)
- ? *p == '\\' && (p[1] == '+' || p[1] == '?')
- : (*p == '+' || *p == '?'))
- || ((syntax & RE_INTERVALS)
- && ((syntax & RE_NO_BK_BRACES)
- ? *p == '{'
- : (p[0] == '\\' && p[1] == '{'))))
- {
- /* Start building a new exactn. */
-
- laststart = b;
-
-#ifdef MBS_SUPPORT
- /* Is this exactn binary data or character? */
- is_exactn_bin = is_binary[p - 1 - pattern];
- if (is_exactn_bin)
- BUF_PUSH_2 (exactn_bin, 0);
- else
- BUF_PUSH_2 (exactn, 0);
-#else
- BUF_PUSH_2 (exactn, 0);
-#endif /* MBS_SUPPORT */
- pending_exact = b - 1;
- }
-
- BUF_PUSH (c);
- (*pending_exact)++;
- break;
- } /* switch (c) */
- } /* while p != pend */
-
-
- /* Through the pattern now. */
-
- if (fixup_alt_jump)
- STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
-
- if (!COMPILE_STACK_EMPTY)
- FREE_STACK_RETURN (REG_EPAREN);
-
- /* If we don't want backtracking, force success
- the first time we reach the end of the compiled pattern. */
- if (syntax & RE_NO_POSIX_BACKTRACKING)
- BUF_PUSH (succeed);
-
-#ifdef MBS_SUPPORT
- free (pattern);
- free (mbs_offset);
- free (is_binary);
-#endif
- free (compile_stack.stack);
-
- /* We have succeeded; set the length of the buffer. */
-#ifdef MBS_SUPPORT
- bufp->used = (uintptr_t) b - (uintptr_t) COMPILED_BUFFER_VAR;
-#else
- bufp->used = b - bufp->buffer;
-#endif
-
-#ifdef DEBUG
- if (debug)
- {
- DEBUG_PRINT1 ("\nCompiled pattern: \n");
- print_compiled_pattern (bufp);
- }
-#endif /* DEBUG */
-
-#ifndef MATCH_MAY_ALLOCATE
- /* Initialize the failure stack to the largest possible stack. This
- isn't necessary unless we're trying to avoid calling alloca in
- the search and match routines. */
- {
- int num_regs = bufp->re_nsub + 1;
-
- /* Since DOUBLE_FAIL_STACK refuses to double only if the current size
- is strictly greater than re_max_failures, the largest possible stack
- is 2 * re_max_failures failure points. */
- if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS))
- {
- fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS);
-
-# ifdef emacs
- if (! fail_stack.stack)
- fail_stack.stack
- = (fail_stack_elt_t *) xmalloc (fail_stack.size
- * sizeof (fail_stack_elt_t));
- else
- fail_stack.stack
- = (fail_stack_elt_t *) xrealloc (fail_stack.stack,
- (fail_stack.size
- * sizeof (fail_stack_elt_t)));
-# else /* not emacs */
- if (! fail_stack.stack)
- fail_stack.stack
- = (fail_stack_elt_t *) malloc (fail_stack.size
- * sizeof (fail_stack_elt_t));
- else
- fail_stack.stack
- = (fail_stack_elt_t *) realloc (fail_stack.stack,
- (fail_stack.size
- * sizeof (fail_stack_elt_t)));
-# endif /* not emacs */
- }
-
- regex_grow_registers (num_regs);
- }
-#endif /* not MATCH_MAY_ALLOCATE */
-
- return REG_NOERROR;
-} /* regex_compile */
-
-/* Subroutines for `regex_compile'. */
-
-/* Store OP at LOC followed by two-byte integer parameter ARG. */
-/* ifdef MBS_SUPPORT, integer parameter is 1 wchar_t. */
-
-static void
-store_op1 (op, loc, arg)
- re_opcode_t op;
- US_CHAR_TYPE *loc;
- int arg;
-{
- *loc = (US_CHAR_TYPE) op;
- STORE_NUMBER (loc + 1, arg);
-}
-
-
-/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
-/* ifdef MBS_SUPPORT, integer parameter is 1 wchar_t. */
-
-static void
-store_op2 (op, loc, arg1, arg2)
- re_opcode_t op;
- US_CHAR_TYPE *loc;
- int arg1, arg2;
-{
- *loc = (US_CHAR_TYPE) op;
- STORE_NUMBER (loc + 1, arg1);
- STORE_NUMBER (loc + 1 + OFFSET_ADDRESS_SIZE, arg2);
-}
-
-
-/* Copy the bytes from LOC to END to open up three bytes of space at LOC
- for OP followed by two-byte integer parameter ARG. */
-/* ifdef MBS_SUPPORT, integer parameter is 1 wchar_t. */
-
-static void
-insert_op1 (op, loc, arg, end)
- re_opcode_t op;
- US_CHAR_TYPE *loc;
- int arg;
- US_CHAR_TYPE *end;
-{
- register US_CHAR_TYPE *pfrom = end;
- register US_CHAR_TYPE *pto = end + 1 + OFFSET_ADDRESS_SIZE;
-
- while (pfrom != loc)
- *--pto = *--pfrom;
-
- store_op1 (op, loc, arg);
-}
-
-
-/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
-/* ifdef MBS_SUPPORT, integer parameter is 1 wchar_t. */
-
-static void
-insert_op2 (op, loc, arg1, arg2, end)
- re_opcode_t op;
- US_CHAR_TYPE *loc;
- int arg1, arg2;
- US_CHAR_TYPE *end;
-{
- register US_CHAR_TYPE *pfrom = end;
- register US_CHAR_TYPE *pto = end + 1 + 2 * OFFSET_ADDRESS_SIZE;
-
- while (pfrom != loc)
- *--pto = *--pfrom;
-
- store_op2 (op, loc, arg1, arg2);
-}
-
-
-/* P points to just after a ^ in PATTERN. Return true if that ^ comes
- after an alternative or a begin-subexpression. We assume there is at
- least one character before the ^. */
-
-static boolean
-at_begline_loc_p (pattern, p, syntax)
- const CHAR_TYPE *pattern, *p;
- reg_syntax_t syntax;
-{
- const CHAR_TYPE *prev = p - 2;
- boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
-
- return
- /* After a subexpression? */
- (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
- /* After an alternative? */
- || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
-}
-
-
-/* The dual of at_begline_loc_p. This one is for $. We assume there is
- at least one character after the $, i.e., `P < PEND'. */
-
-static boolean
-at_endline_loc_p (p, pend, syntax)
- const CHAR_TYPE *p, *pend;
- reg_syntax_t syntax;
-{
- const CHAR_TYPE *next = p;
- boolean next_backslash = *next == '\\';
- const CHAR_TYPE *next_next = p + 1 < pend ? p + 1 : 0;
-
- return
- /* Before a subexpression? */
- (syntax & RE_NO_BK_PARENS ? *next == ')'
- : next_backslash && next_next && *next_next == ')')
- /* Before an alternative? */
- || (syntax & RE_NO_BK_VBAR ? *next == '|'
- : next_backslash && next_next && *next_next == '|');
-}
-
-
-/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
- false if it's not. */
-
-static boolean
-group_in_compile_stack (compile_stack, regnum)
- compile_stack_type compile_stack;
- regnum_t regnum;
-{
- int this_element;
-
- for (this_element = compile_stack.avail - 1;
- this_element >= 0;
- this_element--)
- if (compile_stack.stack[this_element].regnum == regnum)
- return true;
-
- return false;
-}
-
-#ifdef MBS_SUPPORT
-/* This insert space, which size is "num", into the pattern at "loc".
- "end" must point the end of the allocated buffer. */
-static void
-insert_space (num, loc, end)
- int num;
- CHAR_TYPE *loc;
- CHAR_TYPE *end;
-{
- register CHAR_TYPE *pto = end;
- register CHAR_TYPE *pfrom = end - num;
-
- while (pfrom >= loc)
- *pto-- = *pfrom--;
-}
-#endif /* MBS_SUPPORT */
-
-#ifdef MBS_SUPPORT
-static reg_errcode_t
-compile_range (range_start_char, p_ptr, pend, translate, syntax, b,
- char_set)
- CHAR_TYPE range_start_char;
- const CHAR_TYPE **p_ptr, *pend;
- CHAR_TYPE *char_set, *b;
- RE_TRANSLATE_TYPE translate;
- reg_syntax_t syntax;
-{
- const CHAR_TYPE *p = *p_ptr;
- CHAR_TYPE range_start, range_end;
- reg_errcode_t ret;
-# ifdef _LIBC
- uint32_t nrules;
- uint32_t start_val, end_val;
-# endif
- if (p == pend)
- return REG_ERANGE;
-
-# ifdef _LIBC
- nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
- if (nrules != 0)
- {
- const char *collseq = (const char *) _NL_CURRENT(LC_COLLATE,
- _NL_COLLATE_COLLSEQWC);
- const unsigned char *extra = (const unsigned char *)
- _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
-
- if (range_start_char < -1)
- {
- /* range_start is a collating symbol. */
- int32_t *wextra;
- /* Retreive the index and get collation sequence value. */
- wextra = (int32_t*)(extra + char_set[-range_start_char]);
- start_val = wextra[1 + *wextra];
- }
- else
- start_val = collseq_table_lookup(collseq, TRANSLATE(range_start_char));
-
- end_val = collseq_table_lookup (collseq, TRANSLATE (p[0]));
-
- /* Report an error if the range is empty and the syntax prohibits
- this. */
- ret = ((syntax & RE_NO_EMPTY_RANGES)
- && (start_val > end_val))? REG_ERANGE : REG_NOERROR;
-
- /* Insert space to the end of the char_ranges. */
- insert_space(2, b - char_set[5] - 2, b - 1);
- *(b - char_set[5] - 2) = (wchar_t)start_val;
- *(b - char_set[5] - 1) = (wchar_t)end_val;
- char_set[4]++; /* ranges_index */
- }
- else
-# endif
- {
- range_start = (range_start_char >= 0)? TRANSLATE (range_start_char):
- range_start_char;
- range_end = TRANSLATE (p[0]);
- /* Report an error if the range is empty and the syntax prohibits
- this. */
- ret = ((syntax & RE_NO_EMPTY_RANGES)
- && (range_start > range_end))? REG_ERANGE : REG_NOERROR;
-
- /* Insert space to the end of the char_ranges. */
- insert_space(2, b - char_set[5] - 2, b - 1);
- *(b - char_set[5] - 2) = range_start;
- *(b - char_set[5] - 1) = range_end;
- char_set[4]++; /* ranges_index */
- }
- /* Have to increment the pointer into the pattern string, so the
- caller isn't still at the ending character. */
- (*p_ptr)++;
-
- return ret;
-}
-#else
-/* Read the ending character of a range (in a bracket expression) from the
- uncompiled pattern *P_PTR (which ends at PEND). We assume the
- starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
- Then we set the translation of all bits between the starting and
- ending characters (inclusive) in the compiled pattern B.
-
- Return an error code.
-
- We use these short variable names so we can use the same macros as
- `regex_compile' itself. */
-
-static reg_errcode_t
-compile_range (range_start_char, p_ptr, pend, translate, syntax, b)
- unsigned int range_start_char;
- const char **p_ptr, *pend;
- RE_TRANSLATE_TYPE translate;
- reg_syntax_t syntax;
- unsigned char *b;
-{
- unsigned this_char;
- const char *p = *p_ptr;
- reg_errcode_t ret;
-# if _LIBC
- const unsigned char *collseq;
- unsigned int start_colseq;
- unsigned int end_colseq;
-# else
- unsigned end_char;
-# endif
-
- if (p == pend)
- return REG_ERANGE;
-
- /* Have to increment the pointer into the pattern string, so the
- caller isn't still at the ending character. */
- (*p_ptr)++;
-
- /* Report an error if the range is empty and the syntax prohibits this. */
- ret = syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
-
-# if _LIBC
- collseq = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
- _NL_COLLATE_COLLSEQMB);
-
- start_colseq = collseq[(unsigned char) TRANSLATE (range_start_char)];
- end_colseq = collseq[(unsigned char) TRANSLATE (p[0])];
- for (this_char = 0; this_char <= (unsigned char) -1; ++this_char)
- {
- unsigned int this_colseq = collseq[(unsigned char) TRANSLATE (this_char)];
-
- if (start_colseq <= this_colseq && this_colseq <= end_colseq)
- {
- SET_LIST_BIT (TRANSLATE (this_char));
- ret = REG_NOERROR;
- }
- }
-# else
- /* Here we see why `this_char' has to be larger than an `unsigned
- char' -- we would otherwise go into an infinite loop, since all
- characters <= 0xff. */
- range_start_char = TRANSLATE (range_start_char);
- /* TRANSLATE(p[0]) is casted to char (not unsigned char) in TRANSLATE,
- and some compilers cast it to int implicitly, so following for_loop
- may fall to (almost) infinite loop.
- e.g. If translate[p[0]] = 0xff, end_char may equals to 0xffffffff.
- To avoid this, we cast p[0] to unsigned int and truncate it. */
- end_char = ((unsigned)TRANSLATE(p[0]) & ((1 << BYTEWIDTH) - 1));
-
- for (this_char = range_start_char; this_char <= end_char; ++this_char)
- {
- SET_LIST_BIT (TRANSLATE (this_char));
- ret = REG_NOERROR;
- }
-# endif
-
- return ret;
-}
-#endif /* MBS_SUPPORT */
-
-/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
- BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
- characters can start a string that matches the pattern. This fastmap
- is used by re_search to skip quickly over impossible starting points.
-
- The caller must supply the address of a (1 << BYTEWIDTH)-byte data
- area as BUFP->fastmap.
-
- We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
- the pattern buffer.
-
- Returns 0 if we succeed, -2 if an internal error. */
-
-#ifdef MBS_SUPPORT
-/* local function for re_compile_fastmap.
- truncate wchar_t character to char. */
-static unsigned char truncate_wchar (CHAR_TYPE c);
-
-static unsigned char
-truncate_wchar (c)
- CHAR_TYPE c;
-{
- unsigned char buf[MB_LEN_MAX];
- int retval = wctomb(buf, c);
- return retval > 0 ? buf[0] : (unsigned char)c;
-}
-#endif /* MBS_SUPPORT */
-
-int
-re_compile_fastmap (bufp)
- struct re_pattern_buffer *bufp;
-{
- int j, k;
-#ifdef MATCH_MAY_ALLOCATE
- fail_stack_type fail_stack;
-#endif
-#ifndef REGEX_MALLOC
- char *destination;
-#endif
-
- register char *fastmap = bufp->fastmap;
-
-#ifdef MBS_SUPPORT
- /* We need to cast pattern to (wchar_t*), because we casted this compiled
- pattern to (char*) in regex_compile. */
- US_CHAR_TYPE *pattern = (US_CHAR_TYPE*)bufp->buffer;
- register US_CHAR_TYPE *pend = (US_CHAR_TYPE*) (bufp->buffer + bufp->used);
-#else
- US_CHAR_TYPE *pattern = bufp->buffer;
- register US_CHAR_TYPE *pend = pattern + bufp->used;
-#endif /* MBS_SUPPORT */
- US_CHAR_TYPE *p = pattern;
-
-#ifdef REL_ALLOC
- /* This holds the pointer to the failure stack, when
- it is allocated relocatably. */
- fail_stack_elt_t *failure_stack_ptr;
-#endif
-
- /* Assume that each path through the pattern can be null until
- proven otherwise. We set this false at the bottom of switch
- statement, to which we get only if a particular path doesn't
- match the empty string. */
- boolean path_can_be_null = true;
-
- /* We aren't doing a `succeed_n' to begin with. */
- boolean succeed_n_p = false;
-
- assert (fastmap != NULL && p != NULL);
-
- INIT_FAIL_STACK ();
- bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
- bufp->fastmap_accurate = 1; /* It will be when we're done. */
- bufp->can_be_null = 0;
-
- while (1)
- {
- if (p == pend || *p == succeed)
- {
- /* We have reached the (effective) end of pattern. */
- if (!FAIL_STACK_EMPTY ())
- {
- bufp->can_be_null |= path_can_be_null;
-
- /* Reset for next path. */
- path_can_be_null = true;
-
- p = fail_stack.stack[--fail_stack.avail].pointer;
-
- continue;
- }
- else
- break;
- }
-
- /* We should never be about to go beyond the end of the pattern. */
- assert (p < pend);
-
- switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
- {
-
- /* I guess the idea here is to simply not bother with a fastmap
- if a backreference is used, since it's too hard to figure out
- the fastmap for the corresponding group. Setting
- `can_be_null' stops `re_search_2' from using the fastmap, so
- that is all we do. */
- case duplicate:
- bufp->can_be_null = 1;
- goto done;
-
-
- /* Following are the cases which match a character. These end
- with `break'. */
-
-#ifdef MBS_SUPPORT
- case exactn:
- fastmap[truncate_wchar(p[1])] = 1;
- break;
- case exactn_bin:
- fastmap[p[1]] = 1;
- break;
-#else
- case exactn:
- fastmap[p[1]] = 1;
- break;
-#endif /* MBS_SUPPORT */
-
-
-#ifdef MBS_SUPPORT
- /* It is hard to distinguish fastmap from (multi byte) characters
- which depends on current locale. */
- case charset:
- case charset_not:
- case wordchar:
- case notwordchar:
- bufp->can_be_null = 1;
- goto done;
-#else
- case charset:
- for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
- if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
- fastmap[j] = 1;
- break;
-
-
- case charset_not:
- /* Chars beyond end of map must be allowed. */
- for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
- fastmap[j] = 1;
-
- for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
- if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
- fastmap[j] = 1;
- break;
-
-
- case wordchar:
- for (j = 0; j < (1 << BYTEWIDTH); j++)
- if (SYNTAX (j) == Sword)
- fastmap[j] = 1;
- break;
-
-
- case notwordchar:
- for (j = 0; j < (1 << BYTEWIDTH); j++)
- if (SYNTAX (j) != Sword)
- fastmap[j] = 1;
- break;
-#endif
-
- case anychar:
- {
- int fastmap_newline = fastmap['\n'];
-
- /* `.' matches anything ... */
- for (j = 0; j < (1 << BYTEWIDTH); j++)
- fastmap[j] = 1;
-
- /* ... except perhaps newline. */
- if (!(bufp->syntax & RE_DOT_NEWLINE))
- fastmap['\n'] = fastmap_newline;
-
- /* Return if we have already set `can_be_null'; if we have,
- then the fastmap is irrelevant. Something's wrong here. */
- else if (bufp->can_be_null)
- goto done;
-
- /* Otherwise, have to check alternative paths. */
- break;
- }
-
-#ifdef emacs
- case syntaxspec:
- k = *p++;
- for (j = 0; j < (1 << BYTEWIDTH); j++)
- if (SYNTAX (j) == (enum syntaxcode) k)
- fastmap[j] = 1;
- break;
-
-
- case notsyntaxspec:
- k = *p++;
- for (j = 0; j < (1 << BYTEWIDTH); j++)
- if (SYNTAX (j) != (enum syntaxcode) k)
- fastmap[j] = 1;
- break;
-
-
- /* All cases after this match the empty string. These end with
- `continue'. */
-
-
- case before_dot:
- case at_dot:
- case after_dot:
- continue;
-#endif /* emacs */
-
-
- case no_op:
- case begline:
- case endline:
- case begbuf:
- case endbuf:
- case wordbound:
- case notwordbound:
- case wordbeg:
- case wordend:
- case push_dummy_failure:
- continue;
-
-
- case jump_n:
- case pop_failure_jump:
- case maybe_pop_jump:
- case jump:
- case jump_past_alt:
- case dummy_failure_jump:
- EXTRACT_NUMBER_AND_INCR (j, p);
- p += j;
- if (j > 0)
- continue;
-
- /* Jump backward implies we just went through the body of a
- loop and matched nothing. Opcode jumped to should be
- `on_failure_jump' or `succeed_n'. Just treat it like an
- ordinary jump. For a * loop, it has pushed its failure
- point already; if so, discard that as redundant. */
- if ((re_opcode_t) *p != on_failure_jump
- && (re_opcode_t) *p != succeed_n)
- continue;
-
- p++;
- EXTRACT_NUMBER_AND_INCR (j, p);
- p += j;
-
- /* If what's on the stack is where we are now, pop it. */
- if (!FAIL_STACK_EMPTY ()
- && fail_stack.stack[fail_stack.avail - 1].pointer == p)
- fail_stack.avail--;
-
- continue;
-
-
- case on_failure_jump:
- case on_failure_keep_string_jump:
- handle_on_failure_jump:
- EXTRACT_NUMBER_AND_INCR (j, p);
-
- /* For some patterns, e.g., `(a?)?', `p+j' here points to the
- end of the pattern. We don't want to push such a point,
- since when we restore it above, entering the switch will
- increment `p' past the end of the pattern. We don't need
- to push such a point since we obviously won't find any more
- fastmap entries beyond `pend'. Such a pattern can match
- the null string, though. */
- if (p + j < pend)
- {
- if (!PUSH_PATTERN_OP (p + j, fail_stack))
- {
- RESET_FAIL_STACK ();
- return -2;
- }
- }
- else
- bufp->can_be_null = 1;
-
- if (succeed_n_p)
- {
- EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
- succeed_n_p = false;
- }
-
- continue;
-
-
- case succeed_n:
- /* Get to the number of times to succeed. */
- p += OFFSET_ADDRESS_SIZE;
-
- /* Increment p past the n for when k != 0. */
- EXTRACT_NUMBER_AND_INCR (k, p);
- if (k == 0)
- {
- p -= 2 * OFFSET_ADDRESS_SIZE;
- succeed_n_p = true; /* Spaghetti code alert. */
- goto handle_on_failure_jump;
- }
- continue;
-
-
- case set_number_at:
- p += 2 * OFFSET_ADDRESS_SIZE;
- continue;
-
-
- case start_memory:
- case stop_memory:
- p += 2;
- continue;
-
-
- default:
- abort (); /* We have listed all the cases. */
- } /* switch *p++ */
-
- /* Getting here means we have found the possible starting
- characters for one path of the pattern -- and that the empty
- string does not match. We need not follow this path further.
- Instead, look at the next alternative (remembered on the
- stack), or quit if no more. The test at the top of the loop
- does these things. */
- path_can_be_null = false;
- p = pend;
- } /* while p */
-
- /* Set `can_be_null' for the last path (also the first path, if the
- pattern is empty). */
- bufp->can_be_null |= path_can_be_null;
-
- done:
- RESET_FAIL_STACK ();
- return 0;
-} /* re_compile_fastmap */
-#ifdef _LIBC
-weak_alias (__re_compile_fastmap, re_compile_fastmap)
-#endif
-
-/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
- ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
- this memory for recording register information. STARTS and ENDS
- must be allocated using the malloc library routine, and must each
- be at least NUM_REGS * sizeof (regoff_t) bytes long.
-
- If NUM_REGS == 0, then subsequent matches should allocate their own
- register data.
-
- Unless this function is called, the first search or match using
- PATTERN_BUFFER will allocate its own register data, without
- freeing the old data. */
-
-void
-re_set_registers (bufp, regs, num_regs, starts, ends)
- struct re_pattern_buffer *bufp;
- struct re_registers *regs;
- unsigned num_regs;
- regoff_t *starts, *ends;
-{
- if (num_regs)
- {
- bufp->regs_allocated = REGS_REALLOCATE;
- regs->num_regs = num_regs;
- regs->start = starts;
- regs->end = ends;
- }
- else
- {
- bufp->regs_allocated = REGS_UNALLOCATED;
- regs->num_regs = 0;
- regs->start = regs->end = (regoff_t *) 0;
- }
-}
-#ifdef _LIBC
-weak_alias (__re_set_registers, re_set_registers)
-#endif
-
-/* Searching routines. */
-
-/* Like re_search_2, below, but only one string is specified, and
- doesn't let you say where to stop matching. */
-
-int
-re_search (bufp, string, size, startpos, range, regs)
- struct re_pattern_buffer *bufp;
- const char *string;
- int size, startpos, range;
- struct re_registers *regs;
-{
- return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
- regs, size);
-}
-#ifdef _LIBC
-weak_alias (__re_search, re_search)
-#endif
-
-
-/* Using the compiled pattern in BUFP->buffer, first tries to match the
- virtual concatenation of STRING1 and STRING2, starting first at index
- STARTPOS, then at STARTPOS + 1, and so on.
-
- STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
-
- RANGE is how far to scan while trying to match. RANGE = 0 means try
- only at STARTPOS; in general, the last start tried is STARTPOS +
- RANGE.
-
- In REGS, return the indices of the virtual concatenation of STRING1
- and STRING2 that matched the entire BUFP->buffer and its contained
- subexpressions.
-
- Do not consider matching one past the index STOP in the virtual
- concatenation of STRING1 and STRING2.
-
- We return either the position in the strings at which the match was
- found, -1 if no match, or -2 if error (such as failure
- stack overflow). */
-
-int
-re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
- struct re_pattern_buffer *bufp;
- const char *string1, *string2;
- int size1, size2;
- int startpos;
- int range;
- struct re_registers *regs;
- int stop;
-{
- int val;
- register char *fastmap = bufp->fastmap;
- register RE_TRANSLATE_TYPE translate = bufp->translate;
- int total_size = size1 + size2;
- int endpos = startpos + range;
-
- /* Check for out-of-range STARTPOS. */
- if (startpos < 0 || startpos > total_size)
- return -1;
-
- /* Fix up RANGE if it might eventually take us outside
- the virtual concatenation of STRING1 and STRING2.
- Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
- if (endpos < 0)
- range = 0 - startpos;
- else if (endpos > total_size)
- range = total_size - startpos;
-
- /* If the search isn't to be a backwards one, don't waste time in a
- search for a pattern that must be anchored. */
- if (bufp->used > 0 && range > 0
- && ((re_opcode_t) bufp->buffer[0] == begbuf
- /* `begline' is like `begbuf' if it cannot match at newlines. */
- || ((re_opcode_t) bufp->buffer[0] == begline
- && !bufp->newline_anchor)))
- {
- if (startpos > 0)
- return -1;
- else
- range = 1;
- }
-
-#ifdef emacs
- /* In a forward search for something that starts with \=.
- don't keep searching past point. */
- if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
- {
- range = PT - startpos;
- if (range <= 0)
- return -1;
- }
-#endif /* emacs */
-
- /* Update the fastmap now if not correct already. */
- if (fastmap && !bufp->fastmap_accurate)
- if (re_compile_fastmap (bufp) == -2)
- return -2;
-
- /* Loop through the string, looking for a place to start matching. */
- for (;;)
- {
- /* If a fastmap is supplied, skip quickly over characters that
- cannot be the start of a match. If the pattern can match the
- null string, however, we don't need to skip characters; we want
- the first null string. */
- if (fastmap && startpos < total_size && !bufp->can_be_null)
- {
- if (range > 0) /* Searching forwards. */
- {
- register const char *d;
- register int lim = 0;
- int irange = range;
-
- if (startpos < size1 && startpos + range >= size1)
- lim = range - (size1 - startpos);
-
- d = (startpos >= size1 ? string2 - size1 : string1) + startpos;
-
- /* Written out as an if-else to avoid testing `translate'
- inside the loop. */
- if (translate)
- while (range > lim
- && !fastmap[(unsigned char)
- translate[(unsigned char) *d++]])
- range--;
- else
- while (range > lim && !fastmap[(unsigned char) *d++])
- range--;
-
- startpos += irange - range;
- }
- else /* Searching backwards. */
- {
- register CHAR_TYPE c = (size1 == 0 || startpos >= size1
- ? string2[startpos - size1]
- : string1[startpos]);
-
- if (!fastmap[(unsigned char) TRANSLATE (c)])
- goto advance;
- }
- }
-
- /* If can't match the null string, and that's all we have left, fail. */
- if (range >= 0 && startpos == total_size && fastmap
- && !bufp->can_be_null)
- return -1;
-
- val = re_match_2_internal (bufp, string1, size1, string2, size2,
- startpos, regs, stop);
-#ifndef REGEX_MALLOC
-# ifdef C_ALLOCA
- alloca (0);
-# endif
-#endif
-
- if (val >= 0)
- return startpos;
-
- if (val == -2)
- return -2;
-
- advance:
- if (!range)
- break;
- else if (range > 0)
- {
- range--;
- startpos++;
- }
- else
- {
- range++;
- startpos--;
- }
- }
- return -1;
-} /* re_search_2 */
-#ifdef _LIBC
-weak_alias (__re_search_2, re_search_2)
-#endif
-
-#ifdef MBS_SUPPORT
-/* This converts PTR, a pointer into one of the search wchar_t strings
- `string1' and `string2' into an multibyte string offset from the
- beginning of that string. We use mbs_offset to optimize.
- See convert_mbs_to_wcs. */
-# define POINTER_TO_OFFSET(ptr) \
- (FIRST_STRING_P (ptr) \
- ? ((regoff_t)(mbs_offset1 != NULL? mbs_offset1[(ptr)-string1] : 0)) \
- : ((regoff_t)((mbs_offset2 != NULL? mbs_offset2[(ptr)-string2] : 0) \
- + csize1)))
-#else
-/* This converts PTR, a pointer into one of the search strings `string1'
- and `string2' into an offset from the beginning of that string. */
-# define POINTER_TO_OFFSET(ptr) \
- (FIRST_STRING_P (ptr) \
- ? ((regoff_t) ((ptr) - string1)) \
- : ((regoff_t) ((ptr) - string2 + size1)))
-#endif /* MBS_SUPPORT */
-
-/* Macros for dealing with the split strings in re_match_2. */
-
-#define MATCHING_IN_FIRST_STRING (dend == end_match_1)
-
-/* Call before fetching a character with *d. This switches over to
- string2 if necessary. */
-#define PREFETCH() \
- while (d == dend) \
- { \
- /* End of string2 => fail. */ \
- if (dend == end_match_2) \
- goto fail; \
- /* End of string1 => advance to string2. */ \
- d = string2; \
- dend = end_match_2; \
- }
-
-
-/* Test if at very beginning or at very end of the virtual concatenation
- of `string1' and `string2'. If only one string, it's `string2'. */
-#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
-#define AT_STRINGS_END(d) ((d) == end2)
-
-
-/* Test if D points to a character which is word-constituent. We have
- two special cases to check for: if past the end of string1, look at
- the first character in string2; and if before the beginning of
- string2, look at the last character in string1. */
-#ifdef MBS_SUPPORT
-/* Use internationalized API instead of SYNTAX. */
-# define WORDCHAR_P(d) \
- (iswalnum ((wint_t)((d) == end1 ? *string2 \
- : (d) == string2 - 1 ? *(end1 - 1) : *(d))) != 0)
-#else
-# define WORDCHAR_P(d) \
- (SYNTAX ((d) == end1 ? *string2 \
- : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
- == Sword)
-#endif /* MBS_SUPPORT */
-
-/* Disabled due to a compiler bug -- see comment at case wordbound */
-#if 0
-/* Test if the character before D and the one at D differ with respect
- to being word-constituent. */
-#define AT_WORD_BOUNDARY(d) \
- (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
- || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
-#endif
-
-/* Free everything we malloc. */
-#ifdef MATCH_MAY_ALLOCATE
-# define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL
-# ifdef MBS_SUPPORT
-# define FREE_VARIABLES() \
- do { \
- REGEX_FREE_STACK (fail_stack.stack); \
- FREE_VAR (regstart); \
- FREE_VAR (regend); \
- FREE_VAR (old_regstart); \
- FREE_VAR (old_regend); \
- FREE_VAR (best_regstart); \
- FREE_VAR (best_regend); \
- FREE_VAR (reg_info); \
- FREE_VAR (reg_dummy); \
- FREE_VAR (reg_info_dummy); \
- FREE_VAR (string1); \
- FREE_VAR (string2); \
- FREE_VAR (mbs_offset1); \
- FREE_VAR (mbs_offset2); \
- } while (0)
-# else /* not MBS_SUPPORT */
-# define FREE_VARIABLES() \
- do { \
- REGEX_FREE_STACK (fail_stack.stack); \
- FREE_VAR (regstart); \
- FREE_VAR (regend); \
- FREE_VAR (old_regstart); \
- FREE_VAR (old_regend); \
- FREE_VAR (best_regstart); \
- FREE_VAR (best_regend); \
- FREE_VAR (reg_info); \
- FREE_VAR (reg_dummy); \
- FREE_VAR (reg_info_dummy); \
- } while (0)
-# endif /* MBS_SUPPORT */
-#else
-# define FREE_VAR(var) if (var) free (var); var = NULL
-# ifdef MBS_SUPPORT
-# define FREE_VARIABLES() \
- do { \
- FREE_VAR (string1); \
- FREE_VAR (string2); \
- FREE_VAR (mbs_offset1); \
- FREE_VAR (mbs_offset2); \
- } while (0)
-# else
-# define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
-# endif /* MBS_SUPPORT */
-#endif /* not MATCH_MAY_ALLOCATE */
-
-/* These values must meet several constraints. They must not be valid
- register values; since we have a limit of 255 registers (because
- we use only one byte in the pattern for the register number), we can
- use numbers larger than 255. They must differ by 1, because of
- NUM_FAILURE_ITEMS above. And the value for the lowest register must
- be larger than the value for the highest register, so we do not try
- to actually save any registers when none are active. */
-#define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
-#define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
-
-/* Matching routines. */
-
-#ifndef emacs /* Emacs never uses this. */
-/* re_match is like re_match_2 except it takes only a single string. */
-
-int
-re_match (bufp, string, size, pos, regs)
- struct re_pattern_buffer *bufp;
- const char *string;
- int size, pos;
- struct re_registers *regs;
-{
- int result = re_match_2_internal (bufp, NULL, 0, string, size,
- pos, regs, size);
-# ifndef REGEX_MALLOC
-# ifdef C_ALLOCA
- alloca (0);
-# endif
-# endif
- return result;
-}
-# ifdef _LIBC
-weak_alias (__re_match, re_match)
-# endif
-#endif /* not emacs */
-
-static boolean group_match_null_string_p _RE_ARGS ((US_CHAR_TYPE **p,
- US_CHAR_TYPE *end,
- register_info_type *reg_info));
-static boolean alt_match_null_string_p _RE_ARGS ((US_CHAR_TYPE *p,
- US_CHAR_TYPE *end,
- register_info_type *reg_info));
-static boolean common_op_match_null_string_p _RE_ARGS ((US_CHAR_TYPE **p,
- US_CHAR_TYPE *end,
- register_info_type *reg_info));
-static int bcmp_translate _RE_ARGS ((const CHAR_TYPE *s1, const CHAR_TYPE *s2,
- int len, char *translate));
-
-/* re_match_2 matches the compiled pattern in BUFP against the
- the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
- and SIZE2, respectively). We start matching at POS, and stop
- matching at STOP.
-
- If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
- store offsets for the substring each group matched in REGS. See the
- documentation for exactly how many groups we fill.
-
- We return -1 if no match, -2 if an internal error (such as the
- failure stack overflowing). Otherwise, we return the length of the
- matched substring. */
-
-int
-re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
- struct re_pattern_buffer *bufp;
- const char *string1, *string2;
- int size1, size2;
- int pos;
- struct re_registers *regs;
- int stop;
-{
- int result = re_match_2_internal (bufp, string1, size1, string2, size2,
- pos, regs, stop);
-#ifndef REGEX_MALLOC
-# ifdef C_ALLOCA
- alloca (0);
-# endif
-#endif
- return result;
-}
-#ifdef _LIBC
-weak_alias (__re_match_2, re_match_2)
-#endif
-
-#ifdef MBS_SUPPORT
-
-static int count_mbs_length PARAMS ((int *, int));
-
-/* This check the substring (from 0, to length) of the multibyte string,
- to which offset_buffer correspond. And count how many wchar_t_characters
- the substring occupy. We use offset_buffer to optimization.
- See convert_mbs_to_wcs. */
-
-static int
-count_mbs_length(offset_buffer, length)
- int *offset_buffer;
- int length;
-{
- int wcs_size;
-
- /* Check whether the size is valid. */
- if (length < 0)
- return -1;
-
- if (offset_buffer == NULL)
- return 0;
-
- for (wcs_size = 0 ; offset_buffer[wcs_size] != -1 ; wcs_size++)
- {
- if (offset_buffer[wcs_size] == length)
- return wcs_size;
- if (offset_buffer[wcs_size] > length)
- /* It is a fragment of a wide character. */
- return -1;
- }
-
- /* We reached at the sentinel. */
- return -1;
-}
-#endif /* MBS_SUPPORT */
-
-/* This is a separate function so that we can force an alloca cleanup
- afterwards. */
-static int
-#ifdef MBS_SUPPORT
-re_match_2_internal (bufp, cstring1, csize1, cstring2, csize2, pos, regs, stop)
- struct re_pattern_buffer *bufp;
- const char *cstring1, *cstring2;
- int csize1, csize2;
-#else
-re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
- struct re_pattern_buffer *bufp;
- const char *string1, *string2;
- int size1, size2;
-#endif
- int pos;
- struct re_registers *regs;
- int stop;
-{
- /* General temporaries. */
- int mcnt;
- US_CHAR_TYPE *p1;
-#ifdef MBS_SUPPORT
- /* We need wchar_t* buffers correspond to string1, string2. */
- CHAR_TYPE *string1 = NULL, *string2 = NULL;
- /* We need the size of wchar_t buffers correspond to csize1, csize2. */
- int size1 = 0, size2 = 0;
- /* offset buffer for optimizatoin. See convert_mbs_to_wc. */
- int *mbs_offset1 = NULL, *mbs_offset2 = NULL;
- /* They hold whether each wchar_t is binary data or not. */
- char *is_binary = NULL;
-#endif /* MBS_SUPPORT */
-
- /* Just past the end of the corresponding string. */
- const CHAR_TYPE *end1, *end2;
-
- /* Pointers into string1 and string2, just past the last characters in
- each to consider matching. */
- const CHAR_TYPE *end_match_1, *end_match_2;
-
- /* Where we are in the data, and the end of the current string. */
- const CHAR_TYPE *d, *dend;
-
- /* Where we are in the pattern, and the end of the pattern. */
-#ifdef MBS_SUPPORT
- US_CHAR_TYPE *pattern, *p;
- register US_CHAR_TYPE *pend;
-#else
- US_CHAR_TYPE *p = bufp->buffer;
- register US_CHAR_TYPE *pend = p + bufp->used;
-#endif /* MBS_SUPPORT */
-
- /* Mark the opcode just after a start_memory, so we can test for an
- empty subpattern when we get to the stop_memory. */
- US_CHAR_TYPE *just_past_start_mem = 0;
-
- /* We use this to map every character in the string. */
- RE_TRANSLATE_TYPE translate = bufp->translate;
-
- /* Failure point stack. Each place that can handle a failure further
- down the line pushes a failure point on this stack. It consists of
- restart, regend, and reg_info for all registers corresponding to
- the subexpressions we're currently inside, plus the number of such
- registers, and, finally, two char *'s. The first char * is where
- to resume scanning the pattern; the second one is where to resume
- scanning the strings. If the latter is zero, the failure point is
- a ``dummy''; if a failure happens and the failure point is a dummy,
- it gets discarded and the next next one is tried. */
-#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
- fail_stack_type fail_stack;
-#endif
-#ifdef DEBUG
- static unsigned failure_id;
- unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
-#endif
-
-#ifdef REL_ALLOC
- /* This holds the pointer to the failure stack, when
- it is allocated relocatably. */
- fail_stack_elt_t *failure_stack_ptr;
-#endif
-
- /* We fill all the registers internally, independent of what we
- return, for use in backreferences. The number here includes
- an element for register zero. */
- size_t num_regs = bufp->re_nsub + 1;
-
- /* The currently active registers. */
- active_reg_t lowest_active_reg = NO_LOWEST_ACTIVE_REG;
- active_reg_t highest_active_reg = NO_HIGHEST_ACTIVE_REG;
-
- /* Information on the contents of registers. These are pointers into
- the input strings; they record just what was matched (on this
- attempt) by a subexpression part of the pattern, that is, the
- regnum-th regstart pointer points to where in the pattern we began
- matching and the regnum-th regend points to right after where we
- stopped matching the regnum-th subexpression. (The zeroth register
- keeps track of what the whole pattern matches.) */
-#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
- const CHAR_TYPE **regstart, **regend;
-#endif
-
- /* If a group that's operated upon by a repetition operator fails to
- match anything, then the register for its start will need to be
- restored because it will have been set to wherever in the string we
- are when we last see its open-group operator. Similarly for a
- register's end. */
-#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
- const CHAR_TYPE **old_regstart, **old_regend;
-#endif
-
- /* The is_active field of reg_info helps us keep track of which (possibly
- nested) subexpressions we are currently in. The matched_something
- field of reg_info[reg_num] helps us tell whether or not we have
- matched any of the pattern so far this time through the reg_num-th
- subexpression. These two fields get reset each time through any
- loop their register is in. */
-#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
- register_info_type *reg_info;
-#endif
-
- /* The following record the register info as found in the above
- variables when we find a match better than any we've seen before.
- This happens as we backtrack through the failure points, which in
- turn happens only if we have not yet matched the entire string. */
- unsigned best_regs_set = false;
-#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
- const CHAR_TYPE **best_regstart, **best_regend;
-#endif
-
- /* Logically, this is `best_regend[0]'. But we don't want to have to
- allocate space for that if we're not allocating space for anything
- else (see below). Also, we never need info about register 0 for
- any of the other register vectors, and it seems rather a kludge to
- treat `best_regend' differently than the rest. So we keep track of
- the end of the best match so far in a separate variable. We
- initialize this to NULL so that when we backtrack the first time
- and need to test it, it's not garbage. */
- const CHAR_TYPE *match_end = NULL;
-
- /* This helps SET_REGS_MATCHED avoid doing redundant work. */
- int set_regs_matched_done = 0;
-
- /* Used when we pop values we don't care about. */
-#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
- const CHAR_TYPE **reg_dummy;
- register_info_type *reg_info_dummy;
-#endif
-
-#ifdef DEBUG
- /* Counts the total number of registers pushed. */
- unsigned num_regs_pushed = 0;
-#endif
-
- DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
-
- INIT_FAIL_STACK ();
-
-#ifdef MATCH_MAY_ALLOCATE
- /* Do not bother to initialize all the register variables if there are
- no groups in the pattern, as it takes a fair amount of time. If
- there are groups, we include space for register 0 (the whole
- pattern), even though we never use it, since it simplifies the
- array indexing. We should fix this. */
- if (bufp->re_nsub)
- {
- regstart = REGEX_TALLOC (num_regs, const CHAR_TYPE *);
- regend = REGEX_TALLOC (num_regs, const CHAR_TYPE *);
- old_regstart = REGEX_TALLOC (num_regs, const CHAR_TYPE *);
- old_regend = REGEX_TALLOC (num_regs, const CHAR_TYPE *);
- best_regstart = REGEX_TALLOC (num_regs, const CHAR_TYPE *);
- best_regend = REGEX_TALLOC (num_regs, const CHAR_TYPE *);
- reg_info = REGEX_TALLOC (num_regs, register_info_type);
- reg_dummy = REGEX_TALLOC (num_regs, const CHAR_TYPE *);
- reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
-
- if (!(regstart && regend && old_regstart && old_regend && reg_info
- && best_regstart && best_regend && reg_dummy && reg_info_dummy))
- {
- FREE_VARIABLES ();
- return -2;
- }
- }
- else
- {
- /* We must initialize all our variables to NULL, so that
- `FREE_VARIABLES' doesn't try to free them. */
- regstart = regend = old_regstart = old_regend = best_regstart
- = best_regend = reg_dummy = NULL;
- reg_info = reg_info_dummy = (register_info_type *) NULL;
- }
-#endif /* MATCH_MAY_ALLOCATE */
-
- /* The starting position is bogus. */
-#ifdef MBS_SUPPORT
- if (pos < 0 || pos > csize1 + csize2)
-#else
- if (pos < 0 || pos > size1 + size2)
-#endif
- {
- FREE_VARIABLES ();
- return -1;
- }
-
-#ifdef MBS_SUPPORT
- /* Allocate wchar_t array for string1 and string2 and
- fill them with converted string. */
- if (csize1 != 0)
- {
- string1 = REGEX_TALLOC (csize1 + 1, CHAR_TYPE);
- mbs_offset1 = REGEX_TALLOC (csize1 + 1, int);
- is_binary = REGEX_TALLOC (csize1 + 1, char);
- if (!string1 || !mbs_offset1 || !is_binary)
- {
- FREE_VAR (string1);
- FREE_VAR (mbs_offset1);
- FREE_VAR (is_binary);
- return -2;
- }
- size1 = convert_mbs_to_wcs(string1, cstring1, csize1,
- mbs_offset1, is_binary);
- string1[size1] = L'\0'; /* for a sentinel */
- FREE_VAR (is_binary);
- }
- if (csize2 != 0)
- {
- string2 = REGEX_TALLOC (csize2 + 1, CHAR_TYPE);
- mbs_offset2 = REGEX_TALLOC (csize2 + 1, int);
- is_binary = REGEX_TALLOC (csize2 + 1, char);
- if (!string2 || !mbs_offset2 || !is_binary)
- {
- FREE_VAR (string1);
- FREE_VAR (mbs_offset1);
- FREE_VAR (string2);
- FREE_VAR (mbs_offset2);
- FREE_VAR (is_binary);
- return -2;
- }
- size2 = convert_mbs_to_wcs(string2, cstring2, csize2,
- mbs_offset2, is_binary);
- string2[size2] = L'\0'; /* for a sentinel */
- FREE_VAR (is_binary);
- }
-
- /* We need to cast pattern to (wchar_t*), because we casted this compiled
- pattern to (char*) in regex_compile. */
- p = pattern = (CHAR_TYPE*)bufp->buffer;
- pend = (CHAR_TYPE*)(bufp->buffer + bufp->used);
-
-#endif /* MBS_SUPPORT */
-
- /* Initialize subexpression text positions to -1 to mark ones that no
- start_memory/stop_memory has been seen for. Also initialize the
- register information struct. */
- for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
- {
- regstart[mcnt] = regend[mcnt]
- = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
-
- REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
- IS_ACTIVE (reg_info[mcnt]) = 0;
- MATCHED_SOMETHING (reg_info[mcnt]) = 0;
- EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
- }
-
- /* We move `string1' into `string2' if the latter's empty -- but not if
- `string1' is null. */
- if (size2 == 0 && string1 != NULL)
- {
- string2 = string1;
- size2 = size1;
- string1 = 0;
- size1 = 0;
- }
- end1 = string1 + size1;
- end2 = string2 + size2;
-
- /* Compute where to stop matching, within the two strings. */
-#ifdef MBS_SUPPORT
- if (stop <= csize1)
- {
- mcnt = count_mbs_length(mbs_offset1, stop);
- end_match_1 = string1 + mcnt;
- end_match_2 = string2;
- }
- else
- {
- end_match_1 = end1;
- mcnt = count_mbs_length(mbs_offset2, stop-csize1);
- end_match_2 = string2 + mcnt;
- }
- if (mcnt < 0)
- { /* count_mbs_length return error. */
- FREE_VARIABLES ();
- return -1;
- }
-#else
- if (stop <= size1)
- {
- end_match_1 = string1 + stop;
- end_match_2 = string2;
- }
- else
- {
- end_match_1 = end1;
- end_match_2 = string2 + stop - size1;
- }
-#endif /* MBS_SUPPORT */
-
- /* `p' scans through the pattern as `d' scans through the data.
- `dend' is the end of the input string that `d' points within. `d'
- is advanced into the following input string whenever necessary, but
- this happens before fetching; therefore, at the beginning of the
- loop, `d' can be pointing at the end of a string, but it cannot
- equal `string2'. */
-#ifdef MBS_SUPPORT
- if (size1 > 0 && pos <= csize1)
- {
- mcnt = count_mbs_length(mbs_offset1, pos);
- d = string1 + mcnt;
- dend = end_match_1;
- }
- else
- {
- mcnt = count_mbs_length(mbs_offset2, pos-csize1);
- d = string2 + mcnt;
- dend = end_match_2;
- }
-
- if (mcnt < 0)
- { /* count_mbs_length return error. */
- FREE_VARIABLES ();
- return -1;
- }
-#else
- if (size1 > 0 && pos <= size1)
- {
- d = string1 + pos;
- dend = end_match_1;
- }
- else
- {
- d = string2 + pos - size1;
- dend = end_match_2;
- }
-#endif /* MBS_SUPPORT */
-
- DEBUG_PRINT1 ("The compiled pattern is:\n");
- DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
- DEBUG_PRINT1 ("The string to match is: `");
- DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
- DEBUG_PRINT1 ("'\n");
-
- /* This loops over pattern commands. It exits by returning from the
- function if the match is complete, or it drops through if the match
- fails at this starting point in the input data. */
- for (;;)
- {
-#ifdef _LIBC
- DEBUG_PRINT2 ("\n%p: ", p);
-#else
- DEBUG_PRINT2 ("\n0x%x: ", p);
-#endif
-
- if (p == pend)
- { /* End of pattern means we might have succeeded. */
- DEBUG_PRINT1 ("end of pattern ... ");
-
- /* If we haven't matched the entire string, and we want the
- longest match, try backtracking. */
- if (d != end_match_2)
- {
- /* 1 if this match ends in the same string (string1 or string2)
- as the best previous match. */
- boolean same_str_p = (FIRST_STRING_P (match_end)
- == MATCHING_IN_FIRST_STRING);
- /* 1 if this match is the best seen so far. */
- boolean best_match_p;
-
- /* AIX compiler got confused when this was combined
- with the previous declaration. */
- if (same_str_p)
- best_match_p = d > match_end;
- else
- best_match_p = !MATCHING_IN_FIRST_STRING;
-
- DEBUG_PRINT1 ("backtracking.\n");
-
- if (!FAIL_STACK_EMPTY ())
- { /* More failure points to try. */
-
- /* If exceeds best match so far, save it. */
- if (!best_regs_set || best_match_p)
- {
- best_regs_set = true;
- match_end = d;
-
- DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
-
- for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
- {
- best_regstart[mcnt] = regstart[mcnt];
- best_regend[mcnt] = regend[mcnt];
- }
- }
- goto fail;
- }
-
- /* If no failure points, don't restore garbage. And if
- last match is real best match, don't restore second
- best one. */
- else if (best_regs_set && !best_match_p)
- {
- restore_best_regs:
- /* Restore best match. It may happen that `dend ==
- end_match_1' while the restored d is in string2.
- For example, the pattern `x.*y.*z' against the
- strings `x-' and `y-z-', if the two strings are
- not consecutive in memory. */
- DEBUG_PRINT1 ("Restoring best registers.\n");
-
- d = match_end;
- dend = ((d >= string1 && d <= end1)
- ? end_match_1 : end_match_2);
-
- for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
- {
- regstart[mcnt] = best_regstart[mcnt];
- regend[mcnt] = best_regend[mcnt];
- }
- }
- } /* d != end_match_2 */
-
- succeed_label:
- DEBUG_PRINT1 ("Accepting match.\n");
- /* If caller wants register contents data back, do it. */
- if (regs && !bufp->no_sub)
- {
- /* Have the register data arrays been allocated? */
- if (bufp->regs_allocated == REGS_UNALLOCATED)
- { /* No. So allocate them with malloc. We need one
- extra element beyond `num_regs' for the `-1' marker
- GNU code uses. */
- regs->num_regs = MAX (RE_NREGS, num_regs + 1);
- regs->start = TALLOC (regs->num_regs, regoff_t);
- regs->end = TALLOC (regs->num_regs, regoff_t);
- if (regs->start == NULL || regs->end == NULL)
- {
- FREE_VARIABLES ();
- return -2;
- }
- bufp->regs_allocated = REGS_REALLOCATE;
- }
- else if (bufp->regs_allocated == REGS_REALLOCATE)
- { /* Yes. If we need more elements than were already
- allocated, reallocate them. If we need fewer, just
- leave it alone. */
- if (regs->num_regs < num_regs + 1)
- {
- regs->num_regs = num_regs + 1;
- RETALLOC (regs->start, regs->num_regs, regoff_t);
- RETALLOC (regs->end, regs->num_regs, regoff_t);
- if (regs->start == NULL || regs->end == NULL)
- {
- FREE_VARIABLES ();
- return -2;
- }
- }
- }
- else
- {
- /* These braces fend off a "empty body in an else-statement"
- warning under GCC when assert expands to nothing. */
- assert (bufp->regs_allocated == REGS_FIXED);
- }
-
- /* Convert the pointer data in `regstart' and `regend' to
- indices. Register zero has to be set differently,
- since we haven't kept track of any info for it. */
- if (regs->num_regs > 0)
- {
- regs->start[0] = pos;
-#ifdef MBS_SUPPORT
- if (MATCHING_IN_FIRST_STRING)
- regs->end[0] = mbs_offset1 != NULL ?
- mbs_offset1[d-string1] : 0;
- else
- regs->end[0] = csize1 + (mbs_offset2 != NULL ?
- mbs_offset2[d-string2] : 0);
-#else
- regs->end[0] = (MATCHING_IN_FIRST_STRING
- ? ((regoff_t) (d - string1))
- : ((regoff_t) (d - string2 + size1)));
-#endif /* MBS_SUPPORT */
- }
-
- /* Go through the first `min (num_regs, regs->num_regs)'
- registers, since that is all we initialized. */
- for (mcnt = 1; (unsigned) mcnt < MIN (num_regs, regs->num_regs);
- mcnt++)
- {
- if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
- regs->start[mcnt] = regs->end[mcnt] = -1;
- else
- {
- regs->start[mcnt]
- = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
- regs->end[mcnt]
- = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
- }
- }
-
- /* If the regs structure we return has more elements than
- were in the pattern, set the extra elements to -1. If
- we (re)allocated the registers, this is the case,
- because we always allocate enough to have at least one
- -1 at the end. */
- for (mcnt = num_regs; (unsigned) mcnt < regs->num_regs; mcnt++)
- regs->start[mcnt] = regs->end[mcnt] = -1;
- } /* regs && !bufp->no_sub */
-
- DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
- nfailure_points_pushed, nfailure_points_popped,
- nfailure_points_pushed - nfailure_points_popped);
- DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
-
-#ifdef MBS_SUPPORT
- if (MATCHING_IN_FIRST_STRING)
- mcnt = mbs_offset1 != NULL ? mbs_offset1[d-string1] : 0;
- else
- mcnt = (mbs_offset2 != NULL ? mbs_offset2[d-string2] : 0) +
- csize1;
- mcnt -= pos;
-#else
- mcnt = d - pos - (MATCHING_IN_FIRST_STRING
- ? string1
- : string2 - size1);
-#endif /* MBS_SUPPORT */
-
- DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
-
- FREE_VARIABLES ();
- return mcnt;
- }
-
- /* Otherwise match next pattern command. */
- switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
- {
- /* Ignore these. Used to ignore the n of succeed_n's which
- currently have n == 0. */
- case no_op:
- DEBUG_PRINT1 ("EXECUTING no_op.\n");
- break;
-
- case succeed:
- DEBUG_PRINT1 ("EXECUTING succeed.\n");
- goto succeed_label;
-
- /* Match the next n pattern characters exactly. The following
- byte in the pattern defines n, and the n bytes after that
- are the characters to match. */
- case exactn:
-#ifdef MBS_SUPPORT
- case exactn_bin:
-#endif
- mcnt = *p++;
- DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
-
- /* This is written out as an if-else so we don't waste time
- testing `translate' inside the loop. */
- if (translate)
- {
- do
- {
- PREFETCH ();
-#ifdef MBS_SUPPORT
- if (*d <= 0xff)
- {
- if ((US_CHAR_TYPE) translate[(unsigned char) *d++]
- != (US_CHAR_TYPE) *p++)
- goto fail;
- }
- else
- {
- if (*d++ != (CHAR_TYPE) *p++)
- goto fail;
- }
-#else
- if ((US_CHAR_TYPE) translate[(unsigned char) *d++]
- != (US_CHAR_TYPE) *p++)
- goto fail;
-#endif /* MBS_SUPPORT */
- }
- while (--mcnt);
- }
- else
- {
- do
- {
- PREFETCH ();
- if (*d++ != (CHAR_TYPE) *p++) goto fail;
- }
- while (--mcnt);
- }
- SET_REGS_MATCHED ();
- break;
-
-
- /* Match any character except possibly a newline or a null. */
- case anychar:
- DEBUG_PRINT1 ("EXECUTING anychar.\n");
-
- PREFETCH ();
-
- if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
- || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
- goto fail;
-
- SET_REGS_MATCHED ();
- DEBUG_PRINT2 (" Matched `%ld'.\n", (long int) *d);
- d++;
- break;
-
-
- case charset:
- case charset_not:
- {
- register US_CHAR_TYPE c;
-#ifdef MBS_SUPPORT
- unsigned int i, char_class_length, coll_symbol_length,
- equiv_class_length, ranges_length, chars_length, length;
- CHAR_TYPE *workp, *workp2, *charset_top;
-#define WORK_BUFFER_SIZE 128
- CHAR_TYPE str_buf[WORK_BUFFER_SIZE];
-# ifdef _LIBC
- uint32_t nrules;
-# endif /* _LIBC */
-#endif /* MBS_SUPPORT */
- boolean not = (re_opcode_t) *(p - 1) == charset_not;
-
- DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
- PREFETCH ();
- c = TRANSLATE (*d); /* The character to match. */
-#ifdef MBS_SUPPORT
-# ifdef _LIBC
- nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
-# endif /* _LIBC */
- charset_top = p - 1;
- char_class_length = *p++;
- coll_symbol_length = *p++;
- equiv_class_length = *p++;
- ranges_length = *p++;
- chars_length = *p++;
- /* p points charset[6], so the address of the next instruction
- (charset[l+m+n+2o+k+p']) equals p[l+m+n+2*o+p'],
- where l=length of char_classes, m=length of collating_symbol,
- n=equivalence_class, o=length of char_range,
- p'=length of character. */
- workp = p;
- /* Update p to indicate the next instruction. */
- p += char_class_length + coll_symbol_length+ equiv_class_length +
- 2*ranges_length + chars_length;
-
- /* match with char_class? */
- for (i = 0; i < char_class_length ; i += CHAR_CLASS_SIZE)
- {
- wctype_t wctype;
- uintptr_t alignedp = ((uintptr_t)workp
- + __alignof__(wctype_t) - 1)
- & ~(uintptr_t)(__alignof__(wctype_t) - 1);
- wctype = *((wctype_t*)alignedp);
- workp += CHAR_CLASS_SIZE;
- if (iswctype((wint_t)c, wctype))
- goto char_set_matched;
- }
-
- /* match with collating_symbol? */
-# ifdef _LIBC
- if (nrules != 0)
- {
- const unsigned char *extra = (const unsigned char *)
- _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
-
- for (workp2 = workp + coll_symbol_length ; workp < workp2 ;
- workp++)
- {
- int32_t *wextra;
- wextra = (int32_t*)(extra + *workp++);
- for (i = 0; i < *wextra; ++i)
- if (TRANSLATE(d[i]) != wextra[1 + i])
- break;
-
- if (i == *wextra)
- {
- /* Update d, however d will be incremented at
- char_set_matched:, we decrement d here. */
- d += i - 1;
- goto char_set_matched;
- }
- }
- }
- else /* (nrules == 0) */
-# endif
- /* If we can't look up collation data, we use wcscoll
- instead. */
- {
- for (workp2 = workp + coll_symbol_length ; workp < workp2 ;)
- {
- const CHAR_TYPE *backup_d = d, *backup_dend = dend;
- length = wcslen(workp);
-
- /* If wcscoll(the collating symbol, whole string) > 0,
- any substring of the string never match with the
- collating symbol. */
- if (wcscoll(workp, d) > 0)
- {
- workp += length + 1;
- continue;
- }
-
- /* First, we compare the collating symbol with
- the first character of the string.
- If it don't match, we add the next character to
- the compare buffer in turn. */
- for (i = 0 ; i < WORK_BUFFER_SIZE-1 ; i++, d++)
- {
- int match;
- if (d == dend)
- {
- if (dend == end_match_2)
- break;
- d = string2;
- dend = end_match_2;
- }
-
- /* add next character to the compare buffer. */
- str_buf[i] = TRANSLATE(*d);
- str_buf[i+1] = '\0';
-
- match = wcscoll(workp, str_buf);
- if (match == 0)
- goto char_set_matched;
-
- if (match < 0)
- /* (str_buf > workp) indicate (str_buf + X > workp),
- because for all X (str_buf + X > str_buf).
- So we don't need continue this loop. */
- break;
-
- /* Otherwise(str_buf < workp),
- (str_buf+next_character) may equals (workp).
- So we continue this loop. */
- }
- /* not matched */
- d = backup_d;
- dend = backup_dend;
- workp += length + 1;
- }
- }
- /* match with equivalence_class? */
-# ifdef _LIBC
- if (nrules != 0)
- {
- const CHAR_TYPE *backup_d = d, *backup_dend = dend;
- /* Try to match the equivalence class against
- those known to the collate implementation. */
- const int32_t *table;
- const int32_t *weights;
- const int32_t *extra;
- const int32_t *indirect;
- int32_t idx, idx2;
- wint_t *cp;
- size_t len;
-
- /* This #include defines a local function! */
-# include <locale/weightwc.h>
-
- table = (const int32_t *)
- _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEWC);
- weights = (const wint_t *)
- _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTWC);
- extra = (const wint_t *)
- _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAWC);
- indirect = (const int32_t *)
- _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTWC);
-
- /* Write 1 collating element to str_buf, and
- get its index. */
- idx2 = 0;
-
- for (i = 0 ; idx2 == 0 && i < WORK_BUFFER_SIZE - 1; i++)
- {
- cp = (wint_t*)str_buf;
- if (d == dend)
- {
- if (dend == end_match_2)
- break;
- d = string2;
- dend = end_match_2;
- }
- str_buf[i] = TRANSLATE(*(d+i));
- str_buf[i+1] = '\0'; /* sentinel */
- idx2 = findidx ((const wint_t**)&cp);
- }
-
- /* Update d, however d will be incremented at
- char_set_matched:, we decrement d here. */
- d = backup_d + ((wchar_t*)cp - (wchar_t*)str_buf - 1);
- if (d >= dend)
- {
- if (dend == end_match_2)
- d = dend;
- else
- {
- d = string2;
- dend = end_match_2;
- }
- }
-
- len = weights[idx2];
-
- for (workp2 = workp + equiv_class_length ; workp < workp2 ;
- workp++)
- {
- idx = (int32_t)*workp;
- /* We already checked idx != 0 in regex_compile. */
-
- if (idx2 != 0 && len == weights[idx])
- {
- int cnt = 0;
- while (cnt < len && (weights[idx + 1 + cnt]
- == weights[idx2 + 1 + cnt]))
- ++cnt;
-
- if (cnt == len)
- goto char_set_matched;
- }
- }
- /* not matched */
- d = backup_d;
- dend = backup_dend;
- }
- else /* (nrules == 0) */
-# endif
- /* If we can't look up collation data, we use wcscoll
- instead. */
- {
- for (workp2 = workp + equiv_class_length ; workp < workp2 ;)
- {
- const CHAR_TYPE *backup_d = d, *backup_dend = dend;
- length = wcslen(workp);
-
- /* If wcscoll(the collating symbol, whole string) > 0,
- any substring of the string never match with the
- collating symbol. */
- if (wcscoll(workp, d) > 0)
- {
- workp += length + 1;
- break;
- }
-
- /* First, we compare the equivalence class with
- the first character of the string.
- If it don't match, we add the next character to
- the compare buffer in turn. */
- for (i = 0 ; i < WORK_BUFFER_SIZE - 1 ; i++, d++)
- {
- int match;
- if (d == dend)
- {
- if (dend == end_match_2)
- break;
- d = string2;
- dend = end_match_2;
- }
-
- /* add next character to the compare buffer. */
- str_buf[i] = TRANSLATE(*d);
- str_buf[i+1] = '\0';
-
- match = wcscoll(workp, str_buf);
-
- if (match == 0)
- goto char_set_matched;
-
- if (match < 0)
- /* (str_buf > workp) indicate (str_buf + X > workp),
- because for all X (str_buf + X > str_buf).
- So we don't need continue this loop. */
- break;
-
- /* Otherwise(str_buf < workp),
- (str_buf+next_character) may equals (workp).
- So we continue this loop. */
- }
- /* not matched */
- d = backup_d;
- dend = backup_dend;
- workp += length + 1;
- }
- }
-
- /* match with char_range? */
-#ifdef _LIBC
- if (nrules != 0)
- {
- uint32_t collseqval;
- const char *collseq = (const char *)
- _NL_CURRENT(LC_COLLATE, _NL_COLLATE_COLLSEQWC);
-
- collseqval = collseq_table_lookup (collseq, c);
-
- for (; workp < p - chars_length ;)
- {
- uint32_t start_val, end_val;
-
- /* We already compute the collation sequence value
- of the characters (or collating symbols). */
- start_val = (uint32_t) *workp++; /* range_start */
- end_val = (uint32_t) *workp++; /* range_end */
-
- if (start_val <= collseqval && collseqval <= end_val)
- goto char_set_matched;
- }
- }
- else
-#endif
- {
- /* We set range_start_char at str_buf[0], range_end_char
- at str_buf[4], and compared char at str_buf[2]. */
- str_buf[1] = 0;
- str_buf[2] = c;
- str_buf[3] = 0;
- str_buf[5] = 0;
- for (; workp < p - chars_length ;)
- {
- wchar_t *range_start_char, *range_end_char;
-
- /* match if (range_start_char <= c <= range_end_char). */
-
- /* If range_start(or end) < 0, we assume -range_start(end)
- is the offset of the collating symbol which is specified
- as the character of the range start(end). */
-
- /* range_start */
- if (*workp < 0)
- range_start_char = charset_top - (*workp++);
- else
- {
- str_buf[0] = *workp++;
- range_start_char = str_buf;
- }
-
- /* range_end */
- if (*workp < 0)
- range_end_char = charset_top - (*workp++);
- else
- {
- str_buf[4] = *workp++;
- range_end_char = str_buf + 4;
- }
-
- if (wcscoll(range_start_char, str_buf+2) <= 0 &&
- wcscoll(str_buf+2, range_end_char) <= 0)
-
- goto char_set_matched;
- }
- }
-
- /* match with char? */
- for (; workp < p ; workp++)
- if (c == *workp)
- goto char_set_matched;
-
- not = !not;
-
- char_set_matched:
- if (not) goto fail;
-#else
- /* Cast to `unsigned' instead of `unsigned char' in case the
- bit list is a full 32 bytes long. */
- if (c < (unsigned) (*p * BYTEWIDTH)
- && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
- not = !not;
-
- p += 1 + *p;
-
- if (!not) goto fail;
-#undef WORK_BUFFER_SIZE
-#endif /* MBS_SUPPORT */
- SET_REGS_MATCHED ();
- d++;
- break;
- }
-
-
- /* The beginning of a group is represented by start_memory.
- The arguments are the register number in the next byte, and the
- number of groups inner to this one in the next. The text
- matched within the group is recorded (in the internal
- registers data structure) under the register number. */
- case start_memory:
- DEBUG_PRINT3 ("EXECUTING start_memory %ld (%ld):\n",
- (long int) *p, (long int) p[1]);
-
- /* Find out if this group can match the empty string. */
- p1 = p; /* To send to group_match_null_string_p. */
-
- if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
- REG_MATCH_NULL_STRING_P (reg_info[*p])
- = group_match_null_string_p (&p1, pend, reg_info);
-
- /* Save the position in the string where we were the last time
- we were at this open-group operator in case the group is
- operated upon by a repetition operator, e.g., with `(a*)*b'
- against `ab'; then we want to ignore where we are now in
- the string in case this attempt to match fails. */
- old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
- ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
- : regstart[*p];
- DEBUG_PRINT2 (" old_regstart: %d\n",
- POINTER_TO_OFFSET (old_regstart[*p]));
-
- regstart[*p] = d;
- DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
-
- IS_ACTIVE (reg_info[*p]) = 1;
- MATCHED_SOMETHING (reg_info[*p]) = 0;
-
- /* Clear this whenever we change the register activity status. */
- set_regs_matched_done = 0;
-
- /* This is the new highest active register. */
- highest_active_reg = *p;
-
- /* If nothing was active before, this is the new lowest active
- register. */
- if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
- lowest_active_reg = *p;
-
- /* Move past the register number and inner group count. */
- p += 2;
- just_past_start_mem = p;
-
- break;
-
-
- /* The stop_memory opcode represents the end of a group. Its
- arguments are the same as start_memory's: the register
- number, and the number of inner groups. */
- case stop_memory:
- DEBUG_PRINT3 ("EXECUTING stop_memory %ld (%ld):\n",
- (long int) *p, (long int) p[1]);
-
- /* We need to save the string position the last time we were at
- this close-group operator in case the group is operated
- upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
- against `aba'; then we want to ignore where we are now in
- the string in case this attempt to match fails. */
- old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
- ? REG_UNSET (regend[*p]) ? d : regend[*p]
- : regend[*p];
- DEBUG_PRINT2 (" old_regend: %d\n",
- POINTER_TO_OFFSET (old_regend[*p]));
-
- regend[*p] = d;
- DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
-
- /* This register isn't active anymore. */
- IS_ACTIVE (reg_info[*p]) = 0;
-
- /* Clear this whenever we change the register activity status. */
- set_regs_matched_done = 0;
-
- /* If this was the only register active, nothing is active
- anymore. */
- if (lowest_active_reg == highest_active_reg)
- {
- lowest_active_reg = NO_LOWEST_ACTIVE_REG;
- highest_active_reg = NO_HIGHEST_ACTIVE_REG;
- }
- else
- { /* We must scan for the new highest active register, since
- it isn't necessarily one less than now: consider
- (a(b)c(d(e)f)g). When group 3 ends, after the f), the
- new highest active register is 1. */
- US_CHAR_TYPE r = *p - 1;
- while (r > 0 && !IS_ACTIVE (reg_info[r]))
- r--;
-
- /* If we end up at register zero, that means that we saved
- the registers as the result of an `on_failure_jump', not
- a `start_memory', and we jumped to past the innermost
- `stop_memory'. For example, in ((.)*) we save
- registers 1 and 2 as a result of the *, but when we pop
- back to the second ), we are at the stop_memory 1.
- Thus, nothing is active. */
- if (r == 0)
- {
- lowest_active_reg = NO_LOWEST_ACTIVE_REG;
- highest_active_reg = NO_HIGHEST_ACTIVE_REG;
- }
- else
- highest_active_reg = r;
- }
-
- /* If just failed to match something this time around with a
- group that's operated on by a repetition operator, try to
- force exit from the ``loop'', and restore the register
- information for this group that we had before trying this
- last match. */
- if ((!MATCHED_SOMETHING (reg_info[*p])
- || just_past_start_mem == p - 1)
- && (p + 2) < pend)
- {
- boolean is_a_jump_n = false;
-
- p1 = p + 2;
- mcnt = 0;
- switch ((re_opcode_t) *p1++)
- {
- case jump_n:
- is_a_jump_n = true;
- case pop_failure_jump:
- case maybe_pop_jump:
- case jump:
- case dummy_failure_jump:
- EXTRACT_NUMBER_AND_INCR (mcnt, p1);
- if (is_a_jump_n)
- p1 += OFFSET_ADDRESS_SIZE;
- break;
-
- default:
- /* do nothing */ ;
- }
- p1 += mcnt;
-
- /* If the next operation is a jump backwards in the pattern
- to an on_failure_jump right before the start_memory
- corresponding to this stop_memory, exit from the loop
- by forcing a failure after pushing on the stack the
- on_failure_jump's jump in the pattern, and d. */
- if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
- && (re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == start_memory
- && p1[2+OFFSET_ADDRESS_SIZE] == *p)
- {
- /* If this group ever matched anything, then restore
- what its registers were before trying this last
- failed match, e.g., with `(a*)*b' against `ab' for
- regstart[1], and, e.g., with `((a*)*(b*)*)*'
- against `aba' for regend[3].
-
- Also restore the registers for inner groups for,
- e.g., `((a*)(b*))*' against `aba' (register 3 would
- otherwise get trashed). */
-
- if (EVER_MATCHED_SOMETHING (reg_info[*p]))
- {
- unsigned r;
-
- EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
-
- /* Restore this and inner groups' (if any) registers. */
- for (r = *p; r < (unsigned) *p + (unsigned) *(p + 1);
- r++)
- {
- regstart[r] = old_regstart[r];
-
- /* xx why this test? */
- if (old_regend[r] >= regstart[r])
- regend[r] = old_regend[r];
- }
- }
- p1++;
- EXTRACT_NUMBER_AND_INCR (mcnt, p1);
- PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
-
- goto fail;
- }
- }
-
- /* Move past the register number and the inner group count. */
- p += 2;
- break;
-
-
- /* \<digit> has been turned into a `duplicate' command which is
- followed by the numeric value of <digit> as the register number. */
- case duplicate:
- {
- register const CHAR_TYPE *d2, *dend2;
- int regno = *p++; /* Get which register to match against. */
- DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
-
- /* Can't back reference a group which we've never matched. */
- if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
- goto fail;
-
- /* Where in input to try to start matching. */
- d2 = regstart[regno];
-
- /* Where to stop matching; if both the place to start and
- the place to stop matching are in the same string, then
- set to the place to stop, otherwise, for now have to use
- the end of the first string. */
-
- dend2 = ((FIRST_STRING_P (regstart[regno])
- == FIRST_STRING_P (regend[regno]))
- ? regend[regno] : end_match_1);
- for (;;)
- {
- /* If necessary, advance to next segment in register
- contents. */
- while (d2 == dend2)
- {
- if (dend2 == end_match_2) break;
- if (dend2 == regend[regno]) break;
-
- /* End of string1 => advance to string2. */
- d2 = string2;
- dend2 = regend[regno];
- }
- /* At end of register contents => success */
- if (d2 == dend2) break;
-
- /* If necessary, advance to next segment in data. */
- PREFETCH ();
-
- /* How many characters left in this segment to match. */
- mcnt = dend - d;
-
- /* Want how many consecutive characters we can match in
- one shot, so, if necessary, adjust the count. */
- if (mcnt > dend2 - d2)
- mcnt = dend2 - d2;
-
- /* Compare that many; failure if mismatch, else move
- past them. */
- if (translate
- ? bcmp_translate (d, d2, mcnt, translate)
- : memcmp (d, d2, mcnt*sizeof(US_CHAR_TYPE)))
- goto fail;
- d += mcnt, d2 += mcnt;
-
- /* Do this because we've match some characters. */
- SET_REGS_MATCHED ();
- }
- }
- break;
-
-
- /* begline matches the empty string at the beginning of the string
- (unless `not_bol' is set in `bufp'), and, if
- `newline_anchor' is set, after newlines. */
- case begline:
- DEBUG_PRINT1 ("EXECUTING begline.\n");
-
- if (AT_STRINGS_BEG (d))
- {
- if (!bufp->not_bol) break;
- }
- else if (d[-1] == '\n' && bufp->newline_anchor)
- {
- break;
- }
- /* In all other cases, we fail. */
- goto fail;
-
-
- /* endline is the dual of begline. */
- case endline:
- DEBUG_PRINT1 ("EXECUTING endline.\n");
-
- if (AT_STRINGS_END (d))
- {
- if (!bufp->not_eol) break;
- }
-
- /* We have to ``prefetch'' the next character. */
- else if ((d == end1 ? *string2 : *d) == '\n'
- && bufp->newline_anchor)
- {
- break;
- }
- goto fail;
-
-
- /* Match at the very beginning of the data. */
- case begbuf:
- DEBUG_PRINT1 ("EXECUTING begbuf.\n");
- if (AT_STRINGS_BEG (d))
- break;
- goto fail;
-
-
- /* Match at the very end of the data. */
- case endbuf:
- DEBUG_PRINT1 ("EXECUTING endbuf.\n");
- if (AT_STRINGS_END (d))
- break;
- goto fail;
-
-
- /* on_failure_keep_string_jump is used to optimize `.*\n'. It
- pushes NULL as the value for the string on the stack. Then
- `pop_failure_point' will keep the current value for the
- string, instead of restoring it. To see why, consider
- matching `foo\nbar' against `.*\n'. The .* matches the foo;
- then the . fails against the \n. But the next thing we want
- to do is match the \n against the \n; if we restored the
- string value, we would be back at the foo.
-
- Because this is used only in specific cases, we don't need to
- check all the things that `on_failure_jump' does, to make
- sure the right things get saved on the stack. Hence we don't
- share its code. The only reason to push anything on the
- stack at all is that otherwise we would have to change
- `anychar's code to do something besides goto fail in this
- case; that seems worse than this. */
- case on_failure_keep_string_jump:
- DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
-
- EXTRACT_NUMBER_AND_INCR (mcnt, p);
-#ifdef _LIBC
- DEBUG_PRINT3 (" %d (to %p):\n", mcnt, p + mcnt);
-#else
- DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt);
-#endif
-
- PUSH_FAILURE_POINT (p + mcnt, NULL, -2);
- break;
-
-
- /* Uses of on_failure_jump:
-
- Each alternative starts with an on_failure_jump that points
- to the beginning of the next alternative. Each alternative
- except the last ends with a jump that in effect jumps past
- the rest of the alternatives. (They really jump to the
- ending jump of the following alternative, because tensioning
- these jumps is a hassle.)
-
- Repeats start with an on_failure_jump that points past both
- the repetition text and either the following jump or
- pop_failure_jump back to this on_failure_jump. */
- case on_failure_jump:
- on_failure:
- DEBUG_PRINT1 ("EXECUTING on_failure_jump");
-
- EXTRACT_NUMBER_AND_INCR (mcnt, p);
-#ifdef _LIBC
- DEBUG_PRINT3 (" %d (to %p)", mcnt, p + mcnt);
-#else
- DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt);
-#endif
-
- /* If this on_failure_jump comes right before a group (i.e.,
- the original * applied to a group), save the information
- for that group and all inner ones, so that if we fail back
- to this point, the group's information will be correct.
- For example, in \(a*\)*\1, we need the preceding group,
- and in \(zz\(a*\)b*\)\2, we need the inner group. */
-
- /* We can't use `p' to check ahead because we push
- a failure point to `p + mcnt' after we do this. */
- p1 = p;
-
- /* We need to skip no_op's before we look for the
- start_memory in case this on_failure_jump is happening as
- the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
- against aba. */
- while (p1 < pend && (re_opcode_t) *p1 == no_op)
- p1++;
-
- if (p1 < pend && (re_opcode_t) *p1 == start_memory)
- {
- /* We have a new highest active register now. This will
- get reset at the start_memory we are about to get to,
- but we will have saved all the registers relevant to
- this repetition op, as described above. */
- highest_active_reg = *(p1 + 1) + *(p1 + 2);
- if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
- lowest_active_reg = *(p1 + 1);
- }
-
- DEBUG_PRINT1 (":\n");
- PUSH_FAILURE_POINT (p + mcnt, d, -2);
- break;
-
-
- /* A smart repeat ends with `maybe_pop_jump'.
- We change it to either `pop_failure_jump' or `jump'. */
- case maybe_pop_jump:
- EXTRACT_NUMBER_AND_INCR (mcnt, p);
- DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
- {
- register US_CHAR_TYPE *p2 = p;
-
- /* Compare the beginning of the repeat with what in the
- pattern follows its end. If we can establish that there
- is nothing that they would both match, i.e., that we
- would have to backtrack because of (as in, e.g., `a*a')
- then we can change to pop_failure_jump, because we'll
- never have to backtrack.
-
- This is not true in the case of alternatives: in
- `(a|ab)*' we do need to backtrack to the `ab' alternative
- (e.g., if the string was `ab'). But instead of trying to
- detect that here, the alternative has put on a dummy
- failure point which is what we will end up popping. */
-
- /* Skip over open/close-group commands.
- If what follows this loop is a ...+ construct,
- look at what begins its body, since we will have to
- match at least one of that. */
- while (1)
- {
- if (p2 + 2 < pend
- && ((re_opcode_t) *p2 == stop_memory
- || (re_opcode_t) *p2 == start_memory))
- p2 += 3;
- else if (p2 + 2 + 2 * OFFSET_ADDRESS_SIZE < pend
- && (re_opcode_t) *p2 == dummy_failure_jump)
- p2 += 2 + 2 * OFFSET_ADDRESS_SIZE;
- else
- break;
- }
-
- p1 = p + mcnt;
- /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
- to the `maybe_finalize_jump' of this case. Examine what
- follows. */
-
- /* If we're at the end of the pattern, we can change. */
- if (p2 == pend)
- {
- /* Consider what happens when matching ":\(.*\)"
- against ":/". I don't really understand this code
- yet. */
- p[-(1+OFFSET_ADDRESS_SIZE)] = (US_CHAR_TYPE)
- pop_failure_jump;
- DEBUG_PRINT1
- (" End of pattern: change to `pop_failure_jump'.\n");
- }
-
- else if ((re_opcode_t) *p2 == exactn
-#ifdef MBS_SUPPORT
- || (re_opcode_t) *p2 == exactn_bin
-#endif
- || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
- {
- register US_CHAR_TYPE c
- = *p2 == (US_CHAR_TYPE) endline ? '\n' : p2[2];
-
- if (((re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == exactn
-#ifdef MBS_SUPPORT
- || (re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == exactn_bin
-#endif
- ) && p1[3+OFFSET_ADDRESS_SIZE] != c)
- {
- p[-(1+OFFSET_ADDRESS_SIZE)] = (US_CHAR_TYPE)
- pop_failure_jump;
-#ifdef MBS_SUPPORT
- if (MB_CUR_MAX != 1)
- DEBUG_PRINT3 (" %C != %C => pop_failure_jump.\n",
- (wint_t) c,
- (wint_t) p1[3+OFFSET_ADDRESS_SIZE]);
- else
-#endif
- DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
- (char) c,
- (char) p1[3+OFFSET_ADDRESS_SIZE]);
- }
-
-#ifndef MBS_SUPPORT
- else if ((re_opcode_t) p1[3] == charset
- || (re_opcode_t) p1[3] == charset_not)
- {
- int not = (re_opcode_t) p1[3] == charset_not;
-
- if (c < (unsigned) (p1[4] * BYTEWIDTH)
- && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
- not = !not;
-
- /* `not' is equal to 1 if c would match, which means
- that we can't change to pop_failure_jump. */
- if (!not)
- {
- p[-3] = (unsigned char) pop_failure_jump;
- DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
- }
- }
-#endif /* not MBS_SUPPORT */
- }
-#ifndef MBS_SUPPORT
- else if ((re_opcode_t) *p2 == charset)
- {
- /* We win if the first character of the loop is not part
- of the charset. */
- if ((re_opcode_t) p1[3] == exactn
- && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5]
- && (p2[2 + p1[5] / BYTEWIDTH]
- & (1 << (p1[5] % BYTEWIDTH)))))
- {
- p[-3] = (unsigned char) pop_failure_jump;
- DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
- }
-
- else if ((re_opcode_t) p1[3] == charset_not)
- {
- int idx;
- /* We win if the charset_not inside the loop
- lists every character listed in the charset after. */
- for (idx = 0; idx < (int) p2[1]; idx++)
- if (! (p2[2 + idx] == 0
- || (idx < (int) p1[4]
- && ((p2[2 + idx] & ~ p1[5 + idx]) == 0))))
- break;
-
- if (idx == p2[1])
- {
- p[-3] = (unsigned char) pop_failure_jump;
- DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
- }
- }
- else if ((re_opcode_t) p1[3] == charset)
- {
- int idx;
- /* We win if the charset inside the loop
- has no overlap with the one after the loop. */
- for (idx = 0;
- idx < (int) p2[1] && idx < (int) p1[4];
- idx++)
- if ((p2[2 + idx] & p1[5 + idx]) != 0)
- break;
-
- if (idx == p2[1] || idx == p1[4])
- {
- p[-3] = (unsigned char) pop_failure_jump;
- DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
- }
- }
- }
-#endif /* not MBS_SUPPORT */
- }
- p -= OFFSET_ADDRESS_SIZE; /* Point at relative address again. */
- if ((re_opcode_t) p[-1] != pop_failure_jump)
- {
- p[-1] = (US_CHAR_TYPE) jump;
- DEBUG_PRINT1 (" Match => jump.\n");
- goto unconditional_jump;
- }
- /* Note fall through. */
-
-
- /* The end of a simple repeat has a pop_failure_jump back to
- its matching on_failure_jump, where the latter will push a
- failure point. The pop_failure_jump takes off failure
- points put on by this pop_failure_jump's matching
- on_failure_jump; we got through the pattern to here from the
- matching on_failure_jump, so didn't fail. */
- case pop_failure_jump:
- {
- /* We need to pass separate storage for the lowest and
- highest registers, even though we don't care about the
- actual values. Otherwise, we will restore only one
- register from the stack, since lowest will == highest in
- `pop_failure_point'. */
- active_reg_t dummy_low_reg, dummy_high_reg;
- US_CHAR_TYPE *pdummy = NULL;
- const CHAR_TYPE *sdummy = NULL;
-
- DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
- POP_FAILURE_POINT (sdummy, pdummy,
- dummy_low_reg, dummy_high_reg,
- reg_dummy, reg_dummy, reg_info_dummy);
- }
- /* Note fall through. */
-
- unconditional_jump:
-#ifdef _LIBC
- DEBUG_PRINT2 ("\n%p: ", p);
-#else
- DEBUG_PRINT2 ("\n0x%x: ", p);
-#endif
- /* Note fall through. */
-
- /* Unconditionally jump (without popping any failure points). */
- case jump:
- EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
- DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
- p += mcnt; /* Do the jump. */
-#ifdef _LIBC
- DEBUG_PRINT2 ("(to %p).\n", p);
-#else
- DEBUG_PRINT2 ("(to 0x%x).\n", p);
-#endif
- break;
-
-
- /* We need this opcode so we can detect where alternatives end
- in `group_match_null_string_p' et al. */
- case jump_past_alt:
- DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
- goto unconditional_jump;
-
-
- /* Normally, the on_failure_jump pushes a failure point, which
- then gets popped at pop_failure_jump. We will end up at
- pop_failure_jump, also, and with a pattern of, say, `a+', we
- are skipping over the on_failure_jump, so we have to push
- something meaningless for pop_failure_jump to pop. */
- case dummy_failure_jump:
- DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
- /* It doesn't matter what we push for the string here. What
- the code at `fail' tests is the value for the pattern. */
- PUSH_FAILURE_POINT (NULL, NULL, -2);
- goto unconditional_jump;
-
-
- /* At the end of an alternative, we need to push a dummy failure
- point in case we are followed by a `pop_failure_jump', because
- we don't want the failure point for the alternative to be
- popped. For example, matching `(a|ab)*' against `aab'
- requires that we match the `ab' alternative. */
- case push_dummy_failure:
- DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
- /* See comments just above at `dummy_failure_jump' about the
- two zeroes. */
- PUSH_FAILURE_POINT (NULL, NULL, -2);
- break;
-
- /* Have to succeed matching what follows at least n times.
- After that, handle like `on_failure_jump'. */
- case succeed_n:
- EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE);
- DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
-
- assert (mcnt >= 0);
- /* Originally, this is how many times we HAVE to succeed. */
- if (mcnt > 0)
- {
- mcnt--;
- p += OFFSET_ADDRESS_SIZE;
- STORE_NUMBER_AND_INCR (p, mcnt);
-#ifdef _LIBC
- DEBUG_PRINT3 (" Setting %p to %d.\n", p - OFFSET_ADDRESS_SIZE
- , mcnt);
-#else
- DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p - OFFSET_ADDRESS_SIZE
- , mcnt);
-#endif
- }
- else if (mcnt == 0)
- {
-#ifdef _LIBC
- DEBUG_PRINT2 (" Setting two bytes from %p to no_op.\n",
- p + OFFSET_ADDRESS_SIZE);
-#else
- DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n",
- p + OFFSET_ADDRESS_SIZE);
-#endif /* _LIBC */
-
-#ifdef MBS_SUPPORT
- p[1] = (US_CHAR_TYPE) no_op;
-#else
- p[2] = (US_CHAR_TYPE) no_op;
- p[3] = (US_CHAR_TYPE) no_op;
-#endif /* MBS_SUPPORT */
- goto on_failure;
- }
- break;
-
- case jump_n:
- EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE);
- DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
-
- /* Originally, this is how many times we CAN jump. */
- if (mcnt)
- {
- mcnt--;
- STORE_NUMBER (p + OFFSET_ADDRESS_SIZE, mcnt);
-
-#ifdef _LIBC
- DEBUG_PRINT3 (" Setting %p to %d.\n", p + OFFSET_ADDRESS_SIZE,
- mcnt);
-#else
- DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p + OFFSET_ADDRESS_SIZE,
- mcnt);
-#endif /* _LIBC */
- goto unconditional_jump;
- }
- /* If don't have to jump any more, skip over the rest of command. */
- else
- p += 2 * OFFSET_ADDRESS_SIZE;
- break;
-
- case set_number_at:
- {
- DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
-
- EXTRACT_NUMBER_AND_INCR (mcnt, p);
- p1 = p + mcnt;
- EXTRACT_NUMBER_AND_INCR (mcnt, p);
-#ifdef _LIBC
- DEBUG_PRINT3 (" Setting %p to %d.\n", p1, mcnt);
-#else
- DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt);
-#endif
- STORE_NUMBER (p1, mcnt);
- break;
- }
-
-#if 0
- /* The DEC Alpha C compiler 3.x generates incorrect code for the
- test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
- AT_WORD_BOUNDARY, so this code is disabled. Expanding the
- macro and introducing temporary variables works around the bug. */
-
- case wordbound:
- DEBUG_PRINT1 ("EXECUTING wordbound.\n");
- if (AT_WORD_BOUNDARY (d))
- break;
- goto fail;
-
- case notwordbound:
- DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
- if (AT_WORD_BOUNDARY (d))
- goto fail;
- break;
-#else
- case wordbound:
- {
- boolean prevchar, thischar;
-
- DEBUG_PRINT1 ("EXECUTING wordbound.\n");
- if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
- break;
-
- prevchar = WORDCHAR_P (d - 1);
- thischar = WORDCHAR_P (d);
- if (prevchar != thischar)
- break;
- goto fail;
- }
-
- case notwordbound:
- {
- boolean prevchar, thischar;
-
- DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
- if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
- goto fail;
-
- prevchar = WORDCHAR_P (d - 1);
- thischar = WORDCHAR_P (d);
- if (prevchar != thischar)
- goto fail;
- break;
- }
-#endif
-
- case wordbeg:
- DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
- if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
- break;
- goto fail;
-
- case wordend:
- DEBUG_PRINT1 ("EXECUTING wordend.\n");
- if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
- && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
- break;
- goto fail;
-
-#ifdef emacs
- case before_dot:
- DEBUG_PRINT1 ("EXECUTING before_dot.\n");
- if (PTR_CHAR_POS ((unsigned char *) d) >= point)
- goto fail;
- break;
-
- case at_dot:
- DEBUG_PRINT1 ("EXECUTING at_dot.\n");
- if (PTR_CHAR_POS ((unsigned char *) d) != point)
- goto fail;
- break;
-
- case after_dot:
- DEBUG_PRINT1 ("EXECUTING after_dot.\n");
- if (PTR_CHAR_POS ((unsigned char *) d) <= point)
- goto fail;
- break;
-
- case syntaxspec:
- DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
- mcnt = *p++;
- goto matchsyntax;
-
- case wordchar:
- DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
- mcnt = (int) Sword;
- matchsyntax:
- PREFETCH ();
- /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
- d++;
- if (SYNTAX (d[-1]) != (enum syntaxcode) mcnt)
- goto fail;
- SET_REGS_MATCHED ();
- break;
-
- case notsyntaxspec:
- DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
- mcnt = *p++;
- goto matchnotsyntax;
-
- case notwordchar:
- DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
- mcnt = (int) Sword;
- matchnotsyntax:
- PREFETCH ();
- /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
- d++;
- if (SYNTAX (d[-1]) == (enum syntaxcode) mcnt)
- goto fail;
- SET_REGS_MATCHED ();
- break;
-
-#else /* not emacs */
- case wordchar:
- DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
- PREFETCH ();
- if (!WORDCHAR_P (d))
- goto fail;
- SET_REGS_MATCHED ();
- d++;
- break;
-
- case notwordchar:
- DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
- PREFETCH ();
- if (WORDCHAR_P (d))
- goto fail;
- SET_REGS_MATCHED ();
- d++;
- break;
-#endif /* not emacs */
-
- default:
- abort ();
- }
- continue; /* Successfully executed one pattern command; keep going. */
-
-
- /* We goto here if a matching operation fails. */
- fail:
- if (!FAIL_STACK_EMPTY ())
- { /* A restart point is known. Restore to that state. */
- DEBUG_PRINT1 ("\nFAIL:\n");
- POP_FAILURE_POINT (d, p,
- lowest_active_reg, highest_active_reg,
- regstart, regend, reg_info);
-
- /* If this failure point is a dummy, try the next one. */
- if (!p)
- goto fail;
-
- /* If we failed to the end of the pattern, don't examine *p. */
- assert (p <= pend);
- if (p < pend)
- {
- boolean is_a_jump_n = false;
-
- /* If failed to a backwards jump that's part of a repetition
- loop, need to pop this failure point and use the next one. */
- switch ((re_opcode_t) *p)
- {
- case jump_n:
- is_a_jump_n = true;
- case maybe_pop_jump:
- case pop_failure_jump:
- case jump:
- p1 = p + 1;
- EXTRACT_NUMBER_AND_INCR (mcnt, p1);
- p1 += mcnt;
-
- if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
- || (!is_a_jump_n
- && (re_opcode_t) *p1 == on_failure_jump))
- goto fail;
- break;
- default:
- /* do nothing */ ;
- }
- }
-
- if (d >= string1 && d <= end1)
- dend = end_match_1;
- }
- else
- break; /* Matching at this starting point really fails. */
- } /* for (;;) */
-
- if (best_regs_set)
- goto restore_best_regs;
-
- FREE_VARIABLES ();
-
- return -1; /* Failure to match. */
-} /* re_match_2 */
-
-/* Subroutine definitions for re_match_2. */
-
-
-/* We are passed P pointing to a register number after a start_memory.
-
- Return true if the pattern up to the corresponding stop_memory can
- match the empty string, and false otherwise.
-
- If we find the matching stop_memory, sets P to point to one past its number.
- Otherwise, sets P to an undefined byte less than or equal to END.
-
- We don't handle duplicates properly (yet). */
-
-static boolean
-group_match_null_string_p (p, end, reg_info)
- US_CHAR_TYPE **p, *end;
- register_info_type *reg_info;
-{
- int mcnt;
- /* Point to after the args to the start_memory. */
- US_CHAR_TYPE *p1 = *p + 2;
-
- while (p1 < end)
- {
- /* Skip over opcodes that can match nothing, and return true or
- false, as appropriate, when we get to one that can't, or to the
- matching stop_memory. */
-
- switch ((re_opcode_t) *p1)
- {
- /* Could be either a loop or a series of alternatives. */
- case on_failure_jump:
- p1++;
- EXTRACT_NUMBER_AND_INCR (mcnt, p1);
-
- /* If the next operation is not a jump backwards in the
- pattern. */
-
- if (mcnt >= 0)
- {
- /* Go through the on_failure_jumps of the alternatives,
- seeing if any of the alternatives cannot match nothing.
- The last alternative starts with only a jump,
- whereas the rest start with on_failure_jump and end
- with a jump, e.g., here is the pattern for `a|b|c':
-
- /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
- /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
- /exactn/1/c
-
- So, we have to first go through the first (n-1)
- alternatives and then deal with the last one separately. */
-
-
- /* Deal with the first (n-1) alternatives, which start
- with an on_failure_jump (see above) that jumps to right
- past a jump_past_alt. */
-
- while ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] ==
- jump_past_alt)
- {
- /* `mcnt' holds how many bytes long the alternative
- is, including the ending `jump_past_alt' and
- its number. */
-
- if (!alt_match_null_string_p (p1, p1 + mcnt -
- (1 + OFFSET_ADDRESS_SIZE),
- reg_info))
- return false;
-
- /* Move to right after this alternative, including the
- jump_past_alt. */
- p1 += mcnt;
-
- /* Break if it's the beginning of an n-th alternative
- that doesn't begin with an on_failure_jump. */
- if ((re_opcode_t) *p1 != on_failure_jump)
- break;
-
- /* Still have to check that it's not an n-th
- alternative that starts with an on_failure_jump. */
- p1++;
- EXTRACT_NUMBER_AND_INCR (mcnt, p1);
- if ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] !=
- jump_past_alt)
- {
- /* Get to the beginning of the n-th alternative. */
- p1 -= 1 + OFFSET_ADDRESS_SIZE;
- break;
- }
- }
-
- /* Deal with the last alternative: go back and get number
- of the `jump_past_alt' just before it. `mcnt' contains
- the length of the alternative. */
- EXTRACT_NUMBER (mcnt, p1 - OFFSET_ADDRESS_SIZE);
-
- if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
- return false;
-
- p1 += mcnt; /* Get past the n-th alternative. */
- } /* if mcnt > 0 */
- break;
-
-
- case stop_memory:
- assert (p1[1] == **p);
- *p = p1 + 2;
- return true;
-
-
- default:
- if (!common_op_match_null_string_p (&p1, end, reg_info))
- return false;
- }
- } /* while p1 < end */
-
- return false;
-} /* group_match_null_string_p */
-
-
-/* Similar to group_match_null_string_p, but doesn't deal with alternatives:
- It expects P to be the first byte of a single alternative and END one
- byte past the last. The alternative can contain groups. */
-
-static boolean
-alt_match_null_string_p (p, end, reg_info)
- US_CHAR_TYPE *p, *end;
- register_info_type *reg_info;
-{
- int mcnt;
- US_CHAR_TYPE *p1 = p;
-
- while (p1 < end)
- {
- /* Skip over opcodes that can match nothing, and break when we get
- to one that can't. */
-
- switch ((re_opcode_t) *p1)
- {
- /* It's a loop. */
- case on_failure_jump:
- p1++;
- EXTRACT_NUMBER_AND_INCR (mcnt, p1);
- p1 += mcnt;
- break;
-
- default:
- if (!common_op_match_null_string_p (&p1, end, reg_info))
- return false;
- }
- } /* while p1 < end */
-
- return true;
-} /* alt_match_null_string_p */
-
-
-/* Deals with the ops common to group_match_null_string_p and
- alt_match_null_string_p.
-
- Sets P to one after the op and its arguments, if any. */
-
-static boolean
-common_op_match_null_string_p (p, end, reg_info)
- US_CHAR_TYPE **p, *end;
- register_info_type *reg_info;
-{
- int mcnt;
- boolean ret;
- int reg_no;
- US_CHAR_TYPE *p1 = *p;
-
- switch ((re_opcode_t) *p1++)
- {
- case no_op:
- case begline:
- case endline:
- case begbuf:
- case endbuf:
- case wordbeg:
- case wordend:
- case wordbound:
- case notwordbound:
-#ifdef emacs
- case before_dot:
- case at_dot:
- case after_dot:
-#endif
- break;
-
- case start_memory:
- reg_no = *p1;
- assert (reg_no > 0 && reg_no <= MAX_REGNUM);
- ret = group_match_null_string_p (&p1, end, reg_info);
-
- /* Have to set this here in case we're checking a group which
- contains a group and a back reference to it. */
-
- if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
- REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
-
- if (!ret)
- return false;
- break;
-
- /* If this is an optimized succeed_n for zero times, make the jump. */
- case jump:
- EXTRACT_NUMBER_AND_INCR (mcnt, p1);
- if (mcnt >= 0)
- p1 += mcnt;
- else
- return false;
- break;
-
- case succeed_n:
- /* Get to the number of times to succeed. */
- p1 += OFFSET_ADDRESS_SIZE;
- EXTRACT_NUMBER_AND_INCR (mcnt, p1);
-
- if (mcnt == 0)
- {
- p1 -= 2 * OFFSET_ADDRESS_SIZE;
- EXTRACT_NUMBER_AND_INCR (mcnt, p1);
- p1 += mcnt;
- }
- else
- return false;
- break;
-
- case duplicate:
- if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
- return false;
- break;
-
- case set_number_at:
- p1 += 2 * OFFSET_ADDRESS_SIZE;
-
- default:
- /* All other opcodes mean we cannot match the empty string. */
- return false;
- }
-
- *p = p1;
- return true;
-} /* common_op_match_null_string_p */
-
-
-/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
- bytes; nonzero otherwise. */
-
-static int
-bcmp_translate (s1, s2, len, translate)
- const CHAR_TYPE *s1, *s2;
- register int len;
- RE_TRANSLATE_TYPE translate;
-{
- register const US_CHAR_TYPE *p1 = (const US_CHAR_TYPE *) s1;
- register const US_CHAR_TYPE *p2 = (const US_CHAR_TYPE *) s2;
- while (len)
- {
-#ifdef MBS_SUPPORT
- if (((*p1<=0xff)?translate[*p1++]:*p1++)
- != ((*p2<=0xff)?translate[*p2++]:*p2++))
- return 1;
-#else
- if (translate[*p1++] != translate[*p2++]) return 1;
-#endif /* MBS_SUPPORT */
- len--;
- }
- return 0;
-}
-
-/* Entry points for GNU code. */
-
-/* re_compile_pattern is the GNU regular expression compiler: it
- compiles PATTERN (of length SIZE) and puts the result in BUFP.
- Returns 0 if the pattern was valid, otherwise an error string.
-
- Assumes the `allocated' (and perhaps `buffer') and `translate' fields
- are set in BUFP on entry.
-
- We call regex_compile to do the actual compilation. */
-
-const char *
-re_compile_pattern (pattern, length, bufp)
- const char *pattern;
- size_t length;
- struct re_pattern_buffer *bufp;
-{
- reg_errcode_t ret;
-
- /* GNU code is written to assume at least RE_NREGS registers will be set
- (and at least one extra will be -1). */
- bufp->regs_allocated = REGS_UNALLOCATED;
-
- /* And GNU code determines whether or not to get register information
- by passing null for the REGS argument to re_match, etc., not by
- setting no_sub. */
- bufp->no_sub = 0;
-
- /* Match anchors at newline. */
- bufp->newline_anchor = 1;
-
- ret = regex_compile (pattern, length, re_syntax_options, bufp);
-
- if (!ret)
- return NULL;
- return gettext (re_error_msgid + re_error_msgid_idx[(int) ret]);
-}
-#ifdef _LIBC
-weak_alias (__re_compile_pattern, re_compile_pattern)
-#endif
-
-/* Entry points compatible with 4.2 BSD regex library. We don't define
- them unless specifically requested. */
-
-#if defined _REGEX_RE_COMP || defined _LIBC
-
-/* BSD has one and only one pattern buffer. */
-static struct re_pattern_buffer re_comp_buf;
-
-char *
-#ifdef _LIBC
-/* Make these definitions weak in libc, so POSIX programs can redefine
- these names if they don't use our functions, and still use
- regcomp/regexec below without link errors. */
-weak_function
-#endif
-re_comp (s)
- const char *s;
-{
- reg_errcode_t ret;
-
- if (!s)
- {
- if (!re_comp_buf.buffer)
- return gettext ("No previous regular expression");
- return 0;
- }
-
- if (!re_comp_buf.buffer)
- {
- re_comp_buf.buffer = (unsigned char *) malloc (200);
- if (re_comp_buf.buffer == NULL)
- return (char *) gettext (re_error_msgid
- + re_error_msgid_idx[(int) REG_ESPACE]);
- re_comp_buf.allocated = 200;
-
- re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
- if (re_comp_buf.fastmap == NULL)
- return (char *) gettext (re_error_msgid
- + re_error_msgid_idx[(int) REG_ESPACE]);
- }
-
- /* Since `re_exec' always passes NULL for the `regs' argument, we
- don't need to initialize the pattern buffer fields which affect it. */
-
- /* Match anchors at newlines. */
- re_comp_buf.newline_anchor = 1;
-
- ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
-
- if (!ret)
- return NULL;
-
- /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
- return (char *) gettext (re_error_msgid + re_error_msgid_idx[(int) ret]);
-}
-
-
-int
-#ifdef _LIBC
-weak_function
-#endif
-re_exec (s)
- const char *s;
-{
- const int len = strlen (s);
- return
- 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
-}
-
-#endif /* _REGEX_RE_COMP */
-
-/* POSIX.2 functions. Don't define these for Emacs. */
-
-#ifndef emacs
-
-/* regcomp takes a regular expression as a string and compiles it.
-
- PREG is a regex_t *. We do not expect any fields to be initialized,
- since POSIX says we shouldn't. Thus, we set
-
- `buffer' to the compiled pattern;
- `used' to the length of the compiled pattern;
- `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
- REG_EXTENDED bit in CFLAGS is set; otherwise, to
- RE_SYNTAX_POSIX_BASIC;
- `newline_anchor' to REG_NEWLINE being set in CFLAGS;
- `fastmap' to an allocated space for the fastmap;
- `fastmap_accurate' to zero;
- `re_nsub' to the number of subexpressions in PATTERN.
-
- PATTERN is the address of the pattern string.
-
- CFLAGS is a series of bits which affect compilation.
-
- If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
- use POSIX basic syntax.
-
- If REG_NEWLINE is set, then . and [^...] don't match newline.
- Also, regexec will try a match beginning after every newline.
-
- If REG_ICASE is set, then we considers upper- and lowercase
- versions of letters to be equivalent when matching.
-
- If REG_NOSUB is set, then when PREG is passed to regexec, that
- routine will report only success or failure, and nothing about the
- registers.
-
- It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
- the return codes and their meanings.) */
-
-int
-regcomp (preg, pattern, cflags)
- regex_t *preg;
- const char *pattern;
- int cflags;
-{
- reg_errcode_t ret;
- reg_syntax_t syntax
- = (cflags & REG_EXTENDED) ?
- RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
-
- /* regex_compile will allocate the space for the compiled pattern. */
- preg->buffer = 0;
- preg->allocated = 0;
- preg->used = 0;
-
- /* Try to allocate space for the fastmap. */
- preg->fastmap = (char *) malloc (1 << BYTEWIDTH);
-
- if (cflags & REG_ICASE)
- {
- unsigned i;
-
- preg->translate
- = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
- * sizeof (*(RE_TRANSLATE_TYPE)0));
- if (preg->translate == NULL)
- return (int) REG_ESPACE;
-
- /* Map uppercase characters to corresponding lowercase ones. */
- for (i = 0; i < CHAR_SET_SIZE; i++)
- preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
- }
- else
- preg->translate = NULL;
-
- /* If REG_NEWLINE is set, newlines are treated differently. */
- if (cflags & REG_NEWLINE)
- { /* REG_NEWLINE implies neither . nor [^...] match newline. */
- syntax &= ~RE_DOT_NEWLINE;
- syntax |= RE_HAT_LISTS_NOT_NEWLINE;
- /* It also changes the matching behavior. */
- preg->newline_anchor = 1;
- }
- else
- preg->newline_anchor = 0;
-
- preg->no_sub = !!(cflags & REG_NOSUB);
-
- /* POSIX says a null character in the pattern terminates it, so we
- can use strlen here in compiling the pattern. */
- ret = regex_compile (pattern, strlen (pattern), syntax, preg);
-
- /* POSIX doesn't distinguish between an unmatched open-group and an
- unmatched close-group: both are REG_EPAREN. */
- if (ret == REG_ERPAREN) ret = REG_EPAREN;
-
- if (ret == REG_NOERROR && preg->fastmap)
- {
- /* Compute the fastmap now, since regexec cannot modify the pattern
- buffer. */
- if (re_compile_fastmap (preg) == -2)
- {
- /* Some error occurred while computing the fastmap, just forget
- about it. */
- free (preg->fastmap);
- preg->fastmap = NULL;
- }
- }
-
- return (int) ret;
-}
-#ifdef _LIBC
-weak_alias (__regcomp, regcomp)
-#endif
-
-
-/* regexec searches for a given pattern, specified by PREG, in the
- string STRING.
-
- If NMATCH is zero or REG_NOSUB was set in the cflags argument to
- `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
- least NMATCH elements, and we set them to the offsets of the
- corresponding matched substrings.
-
- EFLAGS specifies `execution flags' which affect matching: if
- REG_NOTBOL is set, then ^ does not match at the beginning of the
- string; if REG_NOTEOL is set, then $ does not match at the end.
-
- We return 0 if we find a match and REG_NOMATCH if not. */
-
-int
-regexec (preg, string, nmatch, pmatch, eflags)
- const regex_t *preg;
- const char *string;
- size_t nmatch;
- regmatch_t pmatch[];
- int eflags;
-{
- int ret;
- struct re_registers regs;
- regex_t private_preg;
- int len = strlen (string);
- boolean want_reg_info = !preg->no_sub && nmatch > 0;
-
- private_preg = *preg;
-
- private_preg.not_bol = !!(eflags & REG_NOTBOL);
- private_preg.not_eol = !!(eflags & REG_NOTEOL);
-
- /* The user has told us exactly how many registers to return
- information about, via `nmatch'. We have to pass that on to the
- matching routines. */
- private_preg.regs_allocated = REGS_FIXED;
-
- if (want_reg_info)
- {
- regs.num_regs = nmatch;
- regs.start = TALLOC (nmatch * 2, regoff_t);
- if (regs.start == NULL)
- return (int) REG_NOMATCH;
- regs.end = regs.start + nmatch;
- }
-
- /* Perform the searching operation. */
- ret = re_search (&private_preg, string, len,
- /* start: */ 0, /* range: */ len,
- want_reg_info ? &regs : (struct re_registers *) 0);
-
- /* Copy the register information to the POSIX structure. */
- if (want_reg_info)
- {
- if (ret >= 0)
- {
- unsigned r;
-
- for (r = 0; r < nmatch; r++)
- {
- pmatch[r].rm_so = regs.start[r];
- pmatch[r].rm_eo = regs.end[r];
- }
- }
-
- /* If we needed the temporary register info, free the space now. */
- free (regs.start);
- }
-
- /* We want zero return to mean success, unlike `re_search'. */
- return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
-}
-#ifdef _LIBC
-weak_alias (__regexec, regexec)
-#endif
-
-
-/* Returns a message corresponding to an error code, ERRCODE, returned
- from either regcomp or regexec. We don't use PREG here. */
-
-size_t
-regerror (errcode, preg, errbuf, errbuf_size)
- int errcode;
- const regex_t *preg;
- char *errbuf;
- size_t errbuf_size;
-{
- const char *msg;
- size_t msg_size;
-
- if (errcode < 0
- || errcode >= (int) (sizeof (re_error_msgid_idx)
- / sizeof (re_error_msgid_idx[0])))
- /* Only error codes returned by the rest of the code should be passed
- to this routine. If we are given anything else, or if other regex
- code generates an invalid error code, then the program has a bug.
- Dump core so we can fix it. */
- abort ();
-
- msg = gettext (re_error_msgid + re_error_msgid_idx[errcode]);
-
- msg_size = strlen (msg) + 1; /* Includes the null. */
-
- if (errbuf_size != 0)
- {
- if (msg_size > errbuf_size)
- {
-#if defined HAVE_MEMPCPY || defined _LIBC
- *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
-#else
- memcpy (errbuf, msg, errbuf_size - 1);
- errbuf[errbuf_size - 1] = 0;
-#endif
- }
- else
- memcpy (errbuf, msg, msg_size);
- }
-
- return msg_size;
-}
-#ifdef _LIBC
-weak_alias (__regerror, regerror)
-#endif
-
-
-/* Free dynamically allocated space used by PREG. */
-
-void
-regfree (preg)
- regex_t *preg;
-{
- if (preg->buffer != NULL)
- free (preg->buffer);
- preg->buffer = NULL;
-
- preg->allocated = 0;
- preg->used = 0;
-
- if (preg->fastmap != NULL)
- free (preg->fastmap);
- preg->fastmap = NULL;
- preg->fastmap_accurate = 0;
-
- if (preg->translate != NULL)
- free (preg->translate);
- preg->translate = NULL;
-}
-#ifdef _LIBC
-weak_alias (__regfree, regfree)
-#endif
-
-#endif /* not emacs */
diff --git a/gnu/lib/libregex/regex_internal.c b/gnu/lib/libregex/regex_internal.c
new file mode 100644
index 000000000000..b3d44c368dd4
--- /dev/null
+++ b/gnu/lib/libregex/regex_internal.c
@@ -0,0 +1,1674 @@
+/* Extended regular expression matching and search library.
+ Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+static void re_string_construct_common (const char *str, int len,
+ re_string_t *pstr,
+ RE_TRANSLATE_TYPE trans, int icase,
+ const re_dfa_t *dfa) internal_function;
+#ifdef RE_ENABLE_I18N
+static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx,
+ wint_t *last_wc) internal_function;
+#endif /* RE_ENABLE_I18N */
+static reg_errcode_t register_state (re_dfa_t *dfa, re_dfastate_t *newstate,
+ unsigned int hash) internal_function;
+static re_dfastate_t *create_ci_newstate (re_dfa_t *dfa,
+ const re_node_set *nodes,
+ unsigned int hash) internal_function;
+static re_dfastate_t *create_cd_newstate (re_dfa_t *dfa,
+ const re_node_set *nodes,
+ unsigned int context,
+ unsigned int hash) internal_function;
+static unsigned int inline calc_state_hash (const re_node_set *nodes,
+ unsigned int context) internal_function;
+
+/* Functions for string operation. */
+
+/* This function allocates the buffers. It is necessary to call
+ re_string_reconstruct before using the object. */
+
+static reg_errcode_t
+re_string_allocate (pstr, str, len, init_len, trans, icase, dfa)
+ re_string_t *pstr;
+ const char *str;
+ int len, init_len, icase;
+ RE_TRANSLATE_TYPE trans;
+ const re_dfa_t *dfa;
+{
+ reg_errcode_t ret;
+ int init_buf_len;
+
+ /* Ensure at least one character fits into the buffers. */
+ if (init_len < dfa->mb_cur_max)
+ init_len = dfa->mb_cur_max;
+ init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
+ re_string_construct_common (str, len, pstr, trans, icase, dfa);
+
+ ret = re_string_realloc_buffers (pstr, init_buf_len);
+ if (BE (ret != REG_NOERROR, 0))
+ return ret;
+
+ pstr->word_char = dfa->word_char;
+ pstr->word_ops_used = dfa->word_ops_used;
+ pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
+ pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
+ pstr->valid_raw_len = pstr->valid_len;
+ return REG_NOERROR;
+}
+
+/* This function allocates the buffers and initializes them. */
+
+static reg_errcode_t
+re_string_construct (pstr, str, len, trans, icase, dfa)
+ re_string_t *pstr;
+ const char *str;
+ int len, icase;
+ RE_TRANSLATE_TYPE trans;
+ const re_dfa_t *dfa;
+{
+ reg_errcode_t ret;
+ memset (pstr, '\0', sizeof (re_string_t));
+ re_string_construct_common (str, len, pstr, trans, icase, dfa);
+
+ if (len > 0)
+ {
+ ret = re_string_realloc_buffers (pstr, len + 1);
+ if (BE (ret != REG_NOERROR, 0))
+ return ret;
+ }
+ pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
+
+ if (icase)
+ {
+#ifdef RE_ENABLE_I18N
+ if (dfa->mb_cur_max > 1)
+ {
+ while (1)
+ {
+ ret = build_wcs_upper_buffer (pstr);
+ if (BE (ret != REG_NOERROR, 0))
+ return ret;
+ if (pstr->valid_raw_len >= len)
+ break;
+ if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
+ break;
+ ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
+ if (BE (ret != REG_NOERROR, 0))
+ return ret;
+ }
+ }
+ else
+#endif /* RE_ENABLE_I18N */
+ build_upper_buffer (pstr);
+ }
+ else
+ {
+#ifdef RE_ENABLE_I18N
+ if (dfa->mb_cur_max > 1)
+ build_wcs_buffer (pstr);
+ else
+#endif /* RE_ENABLE_I18N */
+ {
+ if (trans != NULL)
+ re_string_translate_buffer (pstr);
+ else
+ {
+ pstr->valid_len = pstr->bufs_len;
+ pstr->valid_raw_len = pstr->bufs_len;
+ }
+ }
+ }
+
+ return REG_NOERROR;
+}
+
+/* Helper functions for re_string_allocate, and re_string_construct. */
+
+static reg_errcode_t
+re_string_realloc_buffers (pstr, new_buf_len)
+ re_string_t *pstr;
+ int new_buf_len;
+{
+#ifdef RE_ENABLE_I18N
+ if (pstr->mb_cur_max > 1)
+ {
+ wint_t *new_array = re_realloc (pstr->wcs, wint_t, new_buf_len);
+ if (BE (new_array == NULL, 0))
+ return REG_ESPACE;
+ pstr->wcs = new_array;
+ if (pstr->offsets != NULL)
+ {
+ int *new_array = re_realloc (pstr->offsets, int, new_buf_len);
+ if (BE (new_array == NULL, 0))
+ return REG_ESPACE;
+ pstr->offsets = new_array;
+ }
+ }
+#endif /* RE_ENABLE_I18N */
+ if (pstr->mbs_allocated)
+ {
+ unsigned char *new_array = re_realloc (pstr->mbs, unsigned char,
+ new_buf_len);
+ if (BE (new_array == NULL, 0))
+ return REG_ESPACE;
+ pstr->mbs = new_array;
+ }
+ pstr->bufs_len = new_buf_len;
+ return REG_NOERROR;
+}
+
+
+static void
+re_string_construct_common (str, len, pstr, trans, icase, dfa)
+ const char *str;
+ int len;
+ re_string_t *pstr;
+ RE_TRANSLATE_TYPE trans;
+ int icase;
+ const re_dfa_t *dfa;
+{
+ pstr->raw_mbs = (const unsigned char *) str;
+ pstr->len = len;
+ pstr->raw_len = len;
+ pstr->trans = (unsigned RE_TRANSLATE_TYPE) trans;
+ pstr->icase = icase ? 1 : 0;
+ pstr->mbs_allocated = (trans != NULL || icase);
+ pstr->mb_cur_max = dfa->mb_cur_max;
+ pstr->is_utf8 = dfa->is_utf8;
+ pstr->map_notascii = dfa->map_notascii;
+ pstr->stop = pstr->len;
+ pstr->raw_stop = pstr->stop;
+}
+
+#ifdef RE_ENABLE_I18N
+
+/* Build wide character buffer PSTR->WCS.
+ If the byte sequence of the string is:
+ <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
+ then the wide character buffer will be:
+ <wc1> , WEOF , <wc2> , WEOF , <wc3>
+ We use WEOF for padding; it indicates that the position isn't
+ the first byte of a multibyte character.
+
+ Note that this function assumes PSTR->VALID_LEN elements are already
+ built and starts from PSTR->VALID_LEN. */
+
+static void
+build_wcs_buffer (pstr)
+ re_string_t *pstr;
+{
+#ifdef _LIBC
+ unsigned char buf[MB_CUR_MAX];
+ assert (MB_CUR_MAX >= pstr->mb_cur_max);
+#else
+ unsigned char buf[64];
+#endif
+ mbstate_t prev_st;
+ int byte_idx, end_idx, remain_len;
+ size_t mbclen;
+
+ /* Build the buffers from pstr->valid_len to either pstr->len or
+ pstr->bufs_len. */
+ end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
+ for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
+ {
+ wchar_t wc;
+ const char *p;
+
+ remain_len = end_idx - byte_idx;
+ prev_st = pstr->cur_state;
+ /* Apply the translation if needed. */
+ if (BE (pstr->trans != NULL, 0))
+ {
+ int i, ch;
+
+ for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
+ {
+ ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
+ buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
+ }
+ p = (const char *) buf;
+ }
+ else
+ p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
+ mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
+ if (BE (mbclen == (size_t) -2, 0))
+ {
+ /* The buffer doesn't have enough space; stop building. */
+ pstr->cur_state = prev_st;
+ break;
+ }
+ else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
+ {
+ /* We treat these cases as a singlebyte character. */
+ mbclen = 1;
+ wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
+ if (BE (pstr->trans != NULL, 0))
+ wc = pstr->trans[wc];
+ pstr->cur_state = prev_st;
+ }
+
+ /* Write wide character and padding. */
+ pstr->wcs[byte_idx++] = wc;
+ /* Write paddings. */
+ for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
+ pstr->wcs[byte_idx++] = WEOF;
+ }
+ pstr->valid_len = byte_idx;
+ pstr->valid_raw_len = byte_idx;
+}
+
+/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
+ but for REG_ICASE. */
+
+static int
+build_wcs_upper_buffer (pstr)
+ re_string_t *pstr;
+{
+ mbstate_t prev_st;
+ int src_idx, byte_idx, end_idx, remain_len;
+ size_t mbclen;
+#ifdef _LIBC
+ char buf[MB_CUR_MAX];
+ assert (MB_CUR_MAX >= pstr->mb_cur_max);
+#else
+ char buf[64];
+#endif
+
+ byte_idx = pstr->valid_len;
+ end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
+
+ /* The following optimization assumes that ASCII characters can be
+ mapped to wide characters with a simple cast. */
+ if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
+ {
+ while (byte_idx < end_idx)
+ {
+ wchar_t wc;
+
+ if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
+ && mbsinit (&pstr->cur_state))
+ {
+ /* In case of a singlebyte character. */
+ pstr->mbs[byte_idx]
+ = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
+ /* The next step uses the assumption that wchar_t is encoded
+ ASCII-safe: all ASCII values can be converted like this. */
+ pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
+ ++byte_idx;
+ continue;
+ }
+
+ remain_len = end_idx - byte_idx;
+ prev_st = pstr->cur_state;
+ mbclen = mbrtowc (&wc,
+ ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
+ + byte_idx), remain_len, &pstr->cur_state);
+ if (BE (mbclen + 2 > 2, 1))
+ {
+ wchar_t wcu = wc;
+ if (iswlower (wc))
+ {
+ size_t mbcdlen;
+
+ wcu = towupper (wc);
+ mbcdlen = wcrtomb (buf, wcu, &prev_st);
+ if (BE (mbclen == mbcdlen, 1))
+ memcpy (pstr->mbs + byte_idx, buf, mbclen);
+ else
+ {
+ src_idx = byte_idx;
+ goto offsets_needed;
+ }
+ }
+ else
+ memcpy (pstr->mbs + byte_idx,
+ pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
+ pstr->wcs[byte_idx++] = wcu;
+ /* Write paddings. */
+ for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
+ pstr->wcs[byte_idx++] = WEOF;
+ }
+ else if (mbclen == (size_t) -1 || mbclen == 0)
+ {
+ /* It is an invalid character or '\0'. Just use the byte. */
+ int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
+ pstr->mbs[byte_idx] = ch;
+ /* And also cast it to wide char. */
+ pstr->wcs[byte_idx++] = (wchar_t) ch;
+ if (BE (mbclen == (size_t) -1, 0))
+ pstr->cur_state = prev_st;
+ }
+ else
+ {
+ /* The buffer doesn't have enough space; stop building. */
+ pstr->cur_state = prev_st;
+ break;
+ }
+ }
+ pstr->valid_len = byte_idx;
+ pstr->valid_raw_len = byte_idx;
+ return REG_NOERROR;
+ }
+ else
+ for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
+ {
+ wchar_t wc;
+ const char *p;
+ offsets_needed:
+ remain_len = end_idx - byte_idx;
+ prev_st = pstr->cur_state;
+ if (BE (pstr->trans != NULL, 0))
+ {
+ int i, ch;
+
+ for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
+ {
+ ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
+ buf[i] = pstr->trans[ch];
+ }
+ p = (const char *) buf;
+ }
+ else
+ p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
+ mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
+ if (BE (mbclen + 2 > 2, 1))
+ {
+ wchar_t wcu = wc;
+ if (iswlower (wc))
+ {
+ size_t mbcdlen;
+
+ wcu = towupper (wc);
+ mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
+ if (BE (mbclen == mbcdlen, 1))
+ memcpy (pstr->mbs + byte_idx, buf, mbclen);
+ else if (mbcdlen != (size_t) -1)
+ {
+ size_t i;
+
+ if (byte_idx + mbcdlen > pstr->bufs_len)
+ {
+ pstr->cur_state = prev_st;
+ break;
+ }
+
+ if (pstr->offsets == NULL)
+ {
+ pstr->offsets = re_malloc (int, pstr->bufs_len);
+
+ if (pstr->offsets == NULL)
+ return REG_ESPACE;
+ }
+ if (!pstr->offsets_needed)
+ {
+ for (i = 0; i < (size_t) byte_idx; ++i)
+ pstr->offsets[i] = i;
+ pstr->offsets_needed = 1;
+ }
+
+ memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
+ pstr->wcs[byte_idx] = wcu;
+ pstr->offsets[byte_idx] = src_idx;
+ for (i = 1; i < mbcdlen; ++i)
+ {
+ pstr->offsets[byte_idx + i]
+ = src_idx + (i < mbclen ? i : mbclen - 1);
+ pstr->wcs[byte_idx + i] = WEOF;
+ }
+ pstr->len += mbcdlen - mbclen;
+ if (pstr->raw_stop > src_idx)
+ pstr->stop += mbcdlen - mbclen;
+ end_idx = (pstr->bufs_len > pstr->len)
+ ? pstr->len : pstr->bufs_len;
+ byte_idx += mbcdlen;
+ src_idx += mbclen;
+ continue;
+ }
+ else
+ memcpy (pstr->mbs + byte_idx, p, mbclen);
+ }
+ else
+ memcpy (pstr->mbs + byte_idx, p, mbclen);
+
+ if (BE (pstr->offsets_needed != 0, 0))
+ {
+ size_t i;
+ for (i = 0; i < mbclen; ++i)
+ pstr->offsets[byte_idx + i] = src_idx + i;
+ }
+ src_idx += mbclen;
+
+ pstr->wcs[byte_idx++] = wcu;
+ /* Write paddings. */
+ for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
+ pstr->wcs[byte_idx++] = WEOF;
+ }
+ else if (mbclen == (size_t) -1 || mbclen == 0)
+ {
+ /* It is an invalid character or '\0'. Just use the byte. */
+ int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
+
+ if (BE (pstr->trans != NULL, 0))
+ ch = pstr->trans [ch];
+ pstr->mbs[byte_idx] = ch;
+
+ if (BE (pstr->offsets_needed != 0, 0))
+ pstr->offsets[byte_idx] = src_idx;
+ ++src_idx;
+
+ /* And also cast it to wide char. */
+ pstr->wcs[byte_idx++] = (wchar_t) ch;
+ if (BE (mbclen == (size_t) -1, 0))
+ pstr->cur_state = prev_st;
+ }
+ else
+ {
+ /* The buffer doesn't have enough space; stop building. */
+ pstr->cur_state = prev_st;
+ break;
+ }
+ }
+ pstr->valid_len = byte_idx;
+ pstr->valid_raw_len = src_idx;
+ return REG_NOERROR;
+}
+
+/* Skip characters until the index becomes greater than or equal to
+ NEW_RAW_IDX. Return the index. */
+
+static int
+re_string_skip_chars (pstr, new_raw_idx, last_wc)
+ re_string_t *pstr;
+ int new_raw_idx;
+ wint_t *last_wc;
+{
+ mbstate_t prev_st;
+ int rawbuf_idx;
+ size_t mbclen;
+ wchar_t wc = 0;
+
+ /* Skip the characters which are not necessary to check. */
+ for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
+ rawbuf_idx < new_raw_idx;)
+ {
+ int remain_len;
+ remain_len = pstr->len - rawbuf_idx;
+ prev_st = pstr->cur_state;
+ mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
+ remain_len, &pstr->cur_state);
+ if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
+ {
+ /* We treat these cases as a singlebyte character. */
+ mbclen = 1;
+ pstr->cur_state = prev_st;
+ }
+ /* Then proceed to the next character. */
+ rawbuf_idx += mbclen;
+ }
+ *last_wc = (wint_t) wc;
+ return rawbuf_idx;
+}
+#endif /* RE_ENABLE_I18N */
+
+/* Build the buffer PSTR->MBS, applying the translation if needed.
+ This function is used in case of REG_ICASE. */
+
+static void
+build_upper_buffer (pstr)
+ re_string_t *pstr;
+{
+ int char_idx, end_idx;
+ end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
+
+ for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
+ {
+ int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
+ if (BE (pstr->trans != NULL, 0))
+ ch = pstr->trans[ch];
+ if (islower (ch))
+ pstr->mbs[char_idx] = toupper (ch);
+ else
+ pstr->mbs[char_idx] = ch;
+ }
+ pstr->valid_len = char_idx;
+ pstr->valid_raw_len = char_idx;
+}
+
+/* Copy the not yet processed raw bytes of PSTR into PSTR->MBS, mapping
+   each one through PSTR->TRANS.  The table is dereferenced without a
+   NULL check.  */
+
+static void
+re_string_translate_buffer (pstr)
+     re_string_t *pstr;
+{
+  int pos = pstr->valid_len;
+  /* Go no further than the input length and the buffer size.  */
+  int limit = (pstr->len < pstr->bufs_len) ? pstr->len : pstr->bufs_len;
+
+  for (; pos < limit; ++pos)
+    {
+      int raw_byte = pstr->raw_mbs[pstr->raw_mbs_idx + pos];
+      pstr->mbs[pos] = pstr->trans[raw_byte];
+    }
+
+  pstr->valid_raw_len = pos;
+  pstr->valid_len = pos;
+}
+
+/* Re-construct the buffers of PSTR so that they start at raw index IDX.
+   Concretely: convert to wide characters when pstr->mb_cur_max > 1,
+   convert to upper case in case of REG_ICASE, and apply the translation
+   table.  EFLAGS supplies REG_NOTBOL and is forwarded to
+   re_string_context_at; it is used to compute the context that precedes
+   the new buffer start (pstr->tip_context).
+
+   Returns REG_NOERROR, or the error reported by build_wcs_upper_buffer.
+   On return pstr->cur_idx is 0.  */
+
+static reg_errcode_t
+re_string_reconstruct (pstr, idx, eflags)
+     re_string_t *pstr;
+     int idx, eflags;
+{
+  /* Distance from the current buffer start to the requested start.  */
+  int offset = idx - pstr->raw_mbs_idx;
+  if (BE (offset < 0, 0))
+    {
+      /* IDX lies before the current start: reset the buffer and treat
+         IDX as an offset from the very beginning of the raw string.  */
+#ifdef RE_ENABLE_I18N
+      if (pstr->mb_cur_max > 1)
+        memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
+#endif /* RE_ENABLE_I18N */
+      pstr->len = pstr->raw_len;
+      pstr->stop = pstr->raw_stop;
+      pstr->valid_len = 0;
+      pstr->raw_mbs_idx = 0;
+      pstr->valid_raw_len = 0;
+      pstr->offsets_needed = 0;
+      pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
+                           : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
+      if (!pstr->mbs_allocated)
+        pstr->mbs = (unsigned char *) pstr->raw_mbs;
+      offset = idx;
+    }
+
+  if (BE (offset != 0, 1))
+    {
+      /* Do some of the already converted characters remain valid?  */
+      if (BE (offset < pstr->valid_raw_len, 1)
+#ifdef RE_ENABLE_I18N
+          /* Handling this would enlarge the code too much.
+             Accept a slowdown in that case.  */
+          && pstr->offsets_needed == 0
+#endif
+         )
+        {
+          /* Yes, move them to the front of the buffer.  */
+          pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
+#ifdef RE_ENABLE_I18N
+          if (pstr->mb_cur_max > 1)
+            memmove (pstr->wcs, pstr->wcs + offset,
+                     (pstr->valid_len - offset) * sizeof (wint_t));
+#endif /* RE_ENABLE_I18N */
+          if (BE (pstr->mbs_allocated, 0))
+            memmove (pstr->mbs, pstr->mbs + offset,
+                     pstr->valid_len - offset);
+          pstr->valid_len -= offset;
+          pstr->valid_raw_len -= offset;
+#if DEBUG
+          assert (pstr->valid_len > 0);
+#endif
+        }
+      else
+        {
+          /* No, skip all characters until IDX.  */
+#ifdef RE_ENABLE_I18N
+          if (BE (pstr->offsets_needed, 0))
+            {
+              pstr->len = pstr->raw_len - idx + offset;
+              pstr->stop = pstr->raw_stop - idx + offset;
+              pstr->offsets_needed = 0;
+            }
+#endif
+          pstr->valid_len = 0;
+          pstr->valid_raw_len = 0;
+#ifdef RE_ENABLE_I18N
+          if (pstr->mb_cur_max > 1)
+            {
+              int wcs_idx;
+              /* Character preceding the new start; WEOF means that the
+                 UTF-8 shortcut below did not determine it.  */
+              wint_t wc = WEOF;
+
+              if (pstr->is_utf8)
+                {
+                  const unsigned char *raw, *p, *q, *end;
+
+                  /* Special case UTF-8.  Multi-byte chars start with any
+                     byte other than 0x80 - 0xbf.  Search backwards from
+                     the byte before the new start, at most mb_cur_max
+                     bytes, for such a start byte.  */
+                  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
+                  end = raw + (offset - pstr->mb_cur_max);
+                  for (p = raw + offset - 1; p >= end; --p)
+                    if ((*p & 0xc0) != 0x80)
+                      {
+                        mbstate_t cur_state;
+                        wchar_t wc2;
+                        int mlen = raw + pstr->len - p;
+                        unsigned char buf[6];
+
+                        /* Q is what gets decoded: the raw bytes, or
+                           their translated copy in BUF when a
+                           translation table is set.  */
+                        q = p;
+                        if (BE (pstr->trans != NULL, 0))
+                          {
+                            int i = mlen < 6 ? mlen : 6;
+                            while (--i >= 0)
+                              buf[i] = pstr->trans[p[i]];
+                            q = buf;
+                          }
+                        /* XXX Don't use mbrtowc, we know which conversion
+                           to use (UTF-8 -> UCS4).  */
+                        memset (&cur_state, 0, sizeof (cur_state));
+                        /* Decode from Q, not P; otherwise the translated
+                           bytes prepared above would be ignored.  The
+                           result is the number of bytes by which this
+                           character extends past the new start.  */
+                        mlen = (mbrtowc (&wc2, (const char *) q, mlen,
+                                         &cur_state)
+                                - (raw + offset - p));
+                        if (mlen >= 0)
+                          {
+                            memset (&pstr->cur_state, '\0',
+                                    sizeof (mbstate_t));
+                            pstr->valid_len = mlen;
+                            wc = wc2;
+                          }
+                        break;
+                      }
+                }
+
+              /* General case: convert forward up to IDX.  */
+              if (wc == WEOF)
+                pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
+              if (BE (pstr->valid_len, 0))
+                {
+                  /* The new start falls inside a character: mark its
+                     trailing bytes with WEOF (and 255 in MBS).  */
+                  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
+                    pstr->wcs[wcs_idx] = WEOF;
+                  if (pstr->mbs_allocated)
+                    memset (pstr->mbs, 255, pstr->valid_len);
+                }
+              pstr->valid_raw_len = pstr->valid_len;
+              pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
+                                    && IS_WIDE_WORD_CHAR (wc))
+                                   ? CONTEXT_WORD
+                                   : ((IS_WIDE_NEWLINE (wc)
+                                       && pstr->newline_anchor)
+                                      ? CONTEXT_NEWLINE : 0));
+            }
+          else
+#endif /* RE_ENABLE_I18N */
+            {
+              /* Single-byte case: classify the byte before the new
+                 start, after translation.  */
+              int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
+              if (pstr->trans)
+                c = pstr->trans[c];
+              pstr->tip_context = (bitset_contain (pstr->word_char, c)
+                                   ? CONTEXT_WORD
+                                   : ((IS_NEWLINE (c) && pstr->newline_anchor)
+                                      ? CONTEXT_NEWLINE : 0));
+            }
+        }
+      /* When MBS is not a separate allocation, just advance it.  */
+      if (!BE (pstr->mbs_allocated, 0))
+        pstr->mbs += offset;
+    }
+  pstr->raw_mbs_idx = idx;
+  pstr->len -= offset;
+  pstr->stop -= offset;
+
+  /* Then build the buffers.  */
+#ifdef RE_ENABLE_I18N
+  if (pstr->mb_cur_max > 1)
+    {
+      if (pstr->icase)
+        {
+          int ret = build_wcs_upper_buffer (pstr);
+          if (BE (ret != REG_NOERROR, 0))
+            return ret;
+        }
+      else
+        build_wcs_buffer (pstr);
+    }
+  else
+#endif /* RE_ENABLE_I18N */
+  if (BE (pstr->mbs_allocated, 0))
+    {
+      if (pstr->icase)
+        build_upper_buffer (pstr);
+      else if (pstr->trans != NULL)
+        re_string_translate_buffer (pstr);
+    }
+  else
+    pstr->valid_len = pstr->len;
+
+  pstr->cur_idx = 0;
+  return REG_NOERROR;
+}
+
+/* Return the byte IDX positions past the current index of PSTR without
+   advancing.  When PSTR has its own MBS buffer the byte is taken from
+   the raw input where possible; otherwise it comes from
+   re_string_peek_byte.  */
+
+static unsigned char
+re_string_peek_byte_case (pstr, idx)
+     const re_string_t *pstr;
+     int idx;
+{
+  int raw_off, byte;
+
+  /* Handle the common (easiest) case first.  */
+  if (BE (!pstr->mbs_allocated, 1))
+    return re_string_peek_byte (pstr, idx);
+
+  raw_off = pstr->cur_idx + idx;
+
+#ifdef RE_ENABLE_I18N
+  /* A byte that is not a single-byte character is returned as stored.  */
+  if (pstr->mb_cur_max > 1
+      && ! re_string_is_single_byte_char (pstr, raw_off))
+    return re_string_peek_byte (pstr, idx);
+
+  /* Map the buffer index to the raw index when they differ.  */
+  if (pstr->offsets_needed)
+    raw_off = pstr->offsets[raw_off];
+#endif
+
+  byte = pstr->raw_mbs[pstr->raw_mbs_idx + raw_off];
+
+#ifdef RE_ENABLE_I18N
+  /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
+     this function returns CAPITAL LETTER I instead of the first byte of
+     DOTLESS SMALL LETTER I.  The latter would confuse the parser,
+     since peek_byte_case doesn't advance cur_idx in any way.  */
+  if (pstr->offsets_needed && !isascii (byte))
+    return re_string_peek_byte (pstr, idx);
+#endif
+
+  return byte;
+}
+
+/* Return the byte at the current index of PSTR and advance past it.
+   When PSTR has its own MBS buffer the byte is taken from the raw input
+   where possible.  */
+
+static unsigned char
+re_string_fetch_byte_case (pstr)
+     re_string_t *pstr;
+{
+  if (BE (!pstr->mbs_allocated, 1))
+    return re_string_fetch_byte (pstr);
+
+#ifdef RE_ENABLE_I18N
+  if (pstr->offsets_needed)
+    {
+      int raw_byte;
+
+      /* For tr_TR.UTF-8 [[:islower:]] there is
+         [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
+         in that case the whole multi-byte character and return
+         the original letter.  On the other side, with
+         [[: DOTLESS SMALL LETTER I return [[:I, as doing
+         anything else would complicate things too much.  */
+
+      if (!re_string_first_byte (pstr, pstr->cur_idx))
+        return re_string_fetch_byte (pstr);
+
+      raw_byte = pstr->raw_mbs[pstr->raw_mbs_idx
+                               + pstr->offsets[pstr->cur_idx]];
+      if (! isascii (raw_byte))
+        return re_string_fetch_byte (pstr);
+
+      /* ASCII in the raw input: consume the whole character in the
+         buffer and hand back the raw byte.  */
+      re_string_skip_bytes (pstr,
+                            re_string_char_size_at (pstr, pstr->cur_idx));
+      return raw_byte;
+    }
+#endif
+
+  {
+    int here = pstr->cur_idx++;
+    return pstr->raw_mbs[pstr->raw_mbs_idx + here];
+  }
+}
+
+/* Release the buffers held by PSTR: MBS only when it was allocated
+   separately, plus WCS and OFFSETS when RE_ENABLE_I18N is defined.
+   PSTR itself is not freed.  */
+
+static void
+re_string_destruct (pstr)
+     re_string_t *pstr;
+{
+  if (pstr->mbs_allocated)
+    re_free (pstr->mbs);
+#ifdef RE_ENABLE_I18N
+  re_free (pstr->offsets);
+  re_free (pstr->wcs);
+#endif /* RE_ENABLE_I18N */
+}
+
+/* Return the context flags for position IDX in INPUT.  A negative IDX
+   yields the stored tip context, IDX == INPUT->len the end-of-buffer
+   context (depending on REG_NOTEOL in EFLAGS); otherwise the character
+   at IDX is classified as word, newline or neither.  */
+
+static unsigned int
+re_string_context_at (input, idx, eflags)
+     const re_string_t *input;
+     int idx, eflags;
+{
+  /* We can't know the character in input->mbs[-1] here, so use the
+     value stored in input->tip_context.  */
+  if (BE (idx < 0, 0))
+    return input->tip_context;
+
+  if (BE (idx == input->len, 0))
+    {
+      if (eflags & REG_NOTEOL)
+        return CONTEXT_ENDBUF;
+      return CONTEXT_NEWLINE | CONTEXT_ENDBUF;
+    }
+
+#ifdef RE_ENABLE_I18N
+  if (input->mb_cur_max > 1)
+    {
+      wint_t wch;
+      int pos;
+
+      /* Walk back over WEOF slots to the slot holding the character.  */
+      for (pos = idx; input->wcs[pos] == WEOF; )
+        {
+#ifdef DEBUG
+          /* It must not happen.  */
+          assert (pos >= 0);
+#endif
+          if (--pos < 0)
+            return input->tip_context;
+        }
+
+      wch = input->wcs[pos];
+      if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wch))
+        return CONTEXT_WORD;
+      if (IS_WIDE_NEWLINE (wch) && input->newline_anchor)
+        return CONTEXT_NEWLINE;
+      return 0;
+    }
+#endif
+
+  {
+    int byte = re_string_byte_at (input, idx);
+    if (bitset_contain (input->word_char, byte))
+      return CONTEXT_WORD;
+    if (IS_NEWLINE (byte) && input->newline_anchor)
+      return CONTEXT_NEWLINE;
+    return 0;
+  }
+}
+
+/* Functions for set operations. */
+
+/* Make SET an empty set with room for SIZE elements.
+   Return REG_ESPACE if the element array cannot be allocated,
+   REG_NOERROR otherwise.  */
+
+static reg_errcode_t
+re_node_set_alloc (set, size)
+     re_node_set *set;
+     int size;
+{
+  set->nelem = 0;
+  set->alloc = size;
+  set->elems = re_malloc (int, size);
+  return BE (set->elems == NULL, 0) ? REG_ESPACE : REG_NOERROR;
+}
+
+/* Initialize SET as the one-element set { ELEM }.
+   On allocation failure SET is left empty (alloc and nelem are 0) and
+   REG_ESPACE is returned; otherwise REG_NOERROR.  */
+
+static reg_errcode_t
+re_node_set_init_1 (set, elem)
+     re_node_set *set;
+     int elem;
+{
+  int *slot = re_malloc (int, 1);
+
+  set->elems = slot;
+  if (BE (slot == NULL, 0))
+    {
+      set->nelem = 0;
+      set->alloc = 0;
+      return REG_ESPACE;
+    }
+
+  slot[0] = elem;
+  set->alloc = 1;
+  set->nelem = 1;
+  return REG_NOERROR;
+}
+
+/* Initialize SET with ELEM1 and ELEM2, stored in ascending order; equal
+   values are stored only once.  Room for two elements is allocated in
+   either case.  Return REG_ESPACE if the allocation fails, REG_NOERROR
+   otherwise.  */
+
+static reg_errcode_t
+re_node_set_init_2 (set, elem1, elem2)
+     re_node_set *set;
+     int elem1, elem2;
+{
+  int low, high;
+
+  set->alloc = 2;
+  set->elems = re_malloc (int, 2);
+  if (BE (set->elems == NULL, 0))
+    return REG_ESPACE;
+
+  if (elem1 == elem2)
+    {
+      set->elems[0] = elem1;
+      set->nelem = 1;
+      return REG_NOERROR;
+    }
+
+  low = (elem1 < elem2) ? elem1 : elem2;
+  high = (elem1 < elem2) ? elem2 : elem1;
+  set->elems[0] = low;
+  set->elems[1] = high;
+  set->nelem = 2;
+  return REG_NOERROR;
+}
+
+/* Initialize DEST as a copy of SRC.  An empty SRC is handled with
+   re_node_set_init_empty and needs no allocation.  On allocation
+   failure DEST gets alloc and nelem of 0 and REG_ESPACE is returned;
+   otherwise REG_NOERROR.  */
+
+static reg_errcode_t
+re_node_set_init_copy (dest, src)
+     re_node_set *dest;
+     const re_node_set *src;
+{
+  dest->nelem = src->nelem;
+  if (src->nelem <= 0)
+    {
+      re_node_set_init_empty (dest);
+      return REG_NOERROR;
+    }
+
+  /* Allocate exactly as many slots as there are elements to copy.  */
+  dest->alloc = dest->nelem;
+  dest->elems = re_malloc (int, dest->alloc);
+  if (BE (dest->elems == NULL, 0))
+    {
+      dest->alloc = dest->nelem = 0;
+      return REG_ESPACE;
+    }
+
+  memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
+  return REG_NOERROR;
+}
+
+/* Calculate the intersection of the sets SRC1 and SRC2, and merge it into
+ DEST. The return value is REG_ESPACE if DEST could not be enlarged and
+ REG_NOERROR otherwise.
+ Note: We assume dest->elems is NULL, when dest->alloc is 0.
+
+ All three element arrays are scanned from their last element downwards
+ and compared with < and >, so the code presumably relies on them being
+ sorted in ascending order (TODO confirm against the callers). Common
+ elements that DEST lacks are first stored at the top of DEST's array,
+ just below index dest->nelem + src1->nelem + src2->nelem, and are then
+ merged into their final positions. Nothing is done when SRC1 or SRC2
+ is empty, and DEST is left untouched when REG_ESPACE is returned. */
+
+static reg_errcode_t
+re_node_set_add_intersect (dest, src1, src2)
+ re_node_set *dest;
+ const re_node_set *src1, *src2;
+{
+ int i1, i2, is, id, delta, sbase;
+ if (src1->nelem == 0 || src2->nelem == 0)
+ return REG_NOERROR;
+
+ /* We need dest->nelem + 2 * elems_in_intersection; this is a
+ conservative estimate. The new size is based on dest->alloc, which
+ the test just showed to be smaller than what is required. */
+ if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
+ {
+ int new_alloc = src1->nelem + src2->nelem + dest->alloc;
+ int *new_elems = re_realloc (dest->elems, int, new_alloc);
+ if (BE (new_elems == NULL, 0))
+ return REG_ESPACE;
+ dest->elems = new_elems;
+ dest->alloc = new_alloc;
+ }
+
+ /* Find the items in the intersection of SRC1 and SRC2, and copy
+ into the top of DEST those that are not already in DEST itself.
+ SBASE is decremented before each store, so afterwards it is the
+ index of the lowest item stored there. */
+ sbase = dest->nelem + src1->nelem + src2->nelem;
+ i1 = src1->nelem - 1;
+ i2 = src2->nelem - 1;
+ id = dest->nelem - 1;
+ for (;;)
+ {
+ if (src1->elems[i1] == src2->elems[i2])
+ {
+ /* Try to find the item in DEST. Maybe we could binary search? */
+ while (id >= 0 && dest->elems[id] > src1->elems[i1])
+ --id;
+
+ if (id < 0 || dest->elems[id] != src1->elems[i1])
+ dest->elems[--sbase] = src1->elems[i1];
+
+ if (--i1 < 0 || --i2 < 0)
+ break;
+ }
+
+ /* Lower the highest of the two items. */
+ else if (src1->elems[i1] < src2->elems[i2])
+ {
+ if (--i2 < 0)
+ break;
+ }
+ else
+ {
+ if (--i1 < 0)
+ break;
+ }
+ }
+
+ id = dest->nelem - 1;
+ is = dest->nelem + src1->nelem + src2->nelem - 1;
+ delta = is - sbase + 1; /* Number of items stored at the top. */
+
+ /* Now copy. When DELTA becomes zero, the remaining
+ DEST elements are already in place; this is more or
+ less the same loop that is in re_node_set_merge. */
+ dest->nelem += delta;
+ if (delta > 0 && id >= 0)
+ for (;;)
+ {
+ if (dest->elems[is] > dest->elems[id])
+ {
+ /* Copy from the top. */
+ dest->elems[id + delta--] = dest->elems[is--];
+ if (delta == 0)
+ break;
+ }
+ else
+ {
+ /* Slide from the bottom. */
+ dest->elems[id + delta] = dest->elems[id];
+ if (--id < 0)
+ break;
+ }
+ }
+
+ /* Copy remaining SRC elements. DELTA is zero here unless the original
+ DEST elements were used up first (or DEST was empty). */
+ memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
+
+ return REG_NOERROR;
+}
+
+/* Initialize DEST with the union of SRC1 and SRC2.  A NULL or empty
+   source counts as the empty set; when at most one source has elements
+   DEST becomes a copy of it (or the empty set).  The return value is
+   REG_NOERROR on success and REG_ESPACE when memory runs out.  */
+
+static reg_errcode_t
+re_node_set_init_union (dest, src1, src2)
+     re_node_set *dest;
+     const re_node_set *src1, *src2;
+{
+  int has1 = (src1 != NULL && src1->nelem > 0);
+  int has2 = (src2 != NULL && src2->nelem > 0);
+  int a, b, out;
+
+  if (!has1 || !has2)
+    {
+      /* Nothing to merge: copy whichever source has elements.  */
+      if (has1)
+        return re_node_set_init_copy (dest, src1);
+      if (has2)
+        return re_node_set_init_copy (dest, src2);
+      re_node_set_init_empty (dest);
+      return REG_NOERROR;
+    }
+
+  /* Room for the case where the two sources share no element.  */
+  dest->alloc = src1->nelem + src2->nelem;
+  dest->elems = re_malloc (int, dest->alloc);
+  if (BE (dest->elems == NULL, 0))
+    return REG_ESPACE;
+
+  /* Merge, always taking the smaller head; equal heads are emitted
+     once.  */
+  a = b = out = 0;
+  while (a < src1->nelem && b < src2->nelem)
+    {
+      if (src1->elems[a] > src2->elems[b])
+        dest->elems[out++] = src2->elems[b++];
+      else
+        {
+          if (src1->elems[a] == src2->elems[b])
+            ++b;
+          dest->elems[out++] = src1->elems[a++];
+        }
+    }
+
+  /* Append whatever is left of the source that was not exhausted.  */
+  if (a < src1->nelem)
+    {
+      int rest = src1->nelem - a;
+      memcpy (dest->elems + out, src1->elems + a, rest * sizeof (int));
+      out += rest;
+    }
+  else if (b < src2->nelem)
+    {
+      int rest = src2->nelem - b;
+      memcpy (dest->elems + out, src2->elems + b, rest * sizeof (int));
+      out += rest;
+    }
+
+  dest->nelem = out;
+  return REG_NOERROR;
+}
+
+/* Calculate the union of the sets DEST and SRC, and store it in
+ DEST. The return value is REG_ESPACE if DEST could not be enlarged and
+ REG_NOERROR otherwise.
+
+ A NULL or empty SRC leaves DEST unchanged. Both element arrays are
+ scanned from their last element downwards and compared with < and ==,
+ so the code presumably relies on them being sorted in ascending order
+ (TODO confirm against the callers). Items of SRC missing from DEST are
+ first stored at the top of DEST's array, just below index
+ dest->nelem + 2 * src->nelem, and are then merged into place. */
+
+static reg_errcode_t
+re_node_set_merge (dest, src)
+ re_node_set *dest;
+ const re_node_set *src;
+{
+ int is, id, sbase, delta;
+ if (src == NULL || src->nelem == 0)
+ return REG_NOERROR;
+ if (dest->alloc < 2 * src->nelem + dest->nelem)
+ {
+ int new_alloc = 2 * (src->nelem + dest->alloc);
+ int *new_buffer = re_realloc (dest->elems, int, new_alloc);
+ if (BE (new_buffer == NULL, 0))
+ return REG_ESPACE;
+ dest->elems = new_buffer;
+ dest->alloc = new_alloc;
+ }
+
+ if (BE (dest->nelem == 0, 0))
+ {
+ dest->nelem = src->nelem;
+ memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
+ return REG_NOERROR;
+ }
+
+ /* Copy into the top of DEST the items of SRC that are not
+ found in DEST. Maybe we could binary search in DEST? */
+ for (sbase = dest->nelem + 2 * src->nelem,
+ is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
+ {
+ if (dest->elems[id] == src->elems[is])
+ is--, id--;
+ else if (dest->elems[id] < src->elems[is])
+ dest->elems[--sbase] = src->elems[is--];
+ else /* if (dest->elems[id] > src->elems[is]) */
+ --id;
+ }
+
+ if (is >= 0)
+ {
+ /* If DEST is exhausted, the remaining items of SRC must be unique. */
+ sbase -= is + 1;
+ memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
+ }
+
+ id = dest->nelem - 1;
+ is = dest->nelem + 2 * src->nelem - 1;
+ delta = is - sbase + 1; /* Number of items stored at the top. */
+ if (delta == 0)
+ return REG_NOERROR;
+
+ /* Now copy. When DELTA becomes zero, the remaining
+ DEST elements are already in place. */
+ dest->nelem += delta;
+ for (;;)
+ {
+ if (dest->elems[is] > dest->elems[id])
+ {
+ /* Copy from the top. */
+ dest->elems[id + delta--] = dest->elems[is--];
+ if (delta == 0)
+ break;
+ }
+ else
+ {
+ /* Slide from the bottom. */
+ dest->elems[id + delta] = dest->elems[id];
+ if (--id < 0)
+ {
+ /* Copy remaining SRC elements. */
+ memcpy (dest->elems, dest->elems + sbase,
+ delta * sizeof (int));
+ break;
+ }
+ }
+ }
+
+ return REG_NOERROR;
+}
+
+/* Insert the new element ELEM into the re_node_set SET, keeping the
+   elements in ascending order.  SET should not already contain ELEM.
+   Return -1 if an error occurred (memory exhausted), 1 otherwise.
+   When -1 is returned because the array could not be enlarged, SET is
+   left exactly as it was.  */
+
+static int
+re_node_set_insert (set, elem)
+     re_node_set *set;
+     int elem;
+{
+  int idx;
+  /* In case the set has no storage yet.  */
+  if (set->alloc == 0)
+    {
+      if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
+        return 1;
+      else
+        return -1;
+    }
+
+  if (BE (set->nelem, 0) == 0)
+    {
+      /* We already guaranteed above that set->alloc != 0.  */
+      set->elems[0] = elem;
+      ++set->nelem;
+      return 1;
+    }
+
+  /* Realloc if we need.  The new capacity is recorded only after the
+     reallocation succeeded, so that a failure cannot leave SET claiming
+     more room than it has.  */
+  if (set->alloc == set->nelem)
+    {
+      int new_alloc = set->alloc * 2;
+      int *new_array = re_realloc (set->elems, int, new_alloc);
+      if (BE (new_array == NULL, 0))
+        return -1;
+      set->elems = new_array;
+      set->alloc = new_alloc;
+    }
+
+  /* Move the elements which follow the new element.  Test the
+     first element separately to skip a check in the inner loop.  */
+  if (elem < set->elems[0])
+    {
+      for (idx = set->nelem; idx > 0; idx--)
+        set->elems[idx] = set->elems[idx - 1];
+    }
+  else
+    {
+      /* elems[0] <= ELEM, so this loop stops at index 1 at the latest.  */
+      for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
+        set->elems[idx] = set->elems[idx - 1];
+    }
+
+  /* Insert the new element.  */
+  set->elems[idx] = elem;
+  ++set->nelem;
+  return 1;
+}
+
+/* Append the new element ELEM to the re_node_set SET.
+   SET should not already have any element greater than or equal to ELEM.
+   Return -1 if an error occurred (memory exhausted), 1 otherwise.
+   When -1 is returned SET is left exactly as it was.  */
+
+static int
+re_node_set_insert_last (set, elem)
+     re_node_set *set;
+     int elem;
+{
+  /* Realloc if we need.  The new capacity is recorded only after the
+     reallocation succeeded, so that a failure cannot leave SET claiming
+     more room than it has.  */
+  if (set->alloc == set->nelem)
+    {
+      int new_alloc = (set->alloc + 1) * 2;
+      int *new_array = re_realloc (set->elems, int, new_alloc);
+      if (BE (new_array == NULL, 0))
+        return -1;
+      set->elems = new_array;
+      set->alloc = new_alloc;
+    }
+
+  /* Insert the new element.  */
+  set->elems[set->nelem++] = elem;
+  return 1;
+}
+
+/* Compare the two node sets SET1 and SET2 element by element.
+   Return 1 if both are non-NULL and hold the same elements at the same
+   positions, 0 otherwise.  */
+
+static int
+re_node_set_compare (set1, set2)
+     const re_node_set *set1, *set2;
+{
+  int pos;
+
+  if (set1 == NULL || set2 == NULL)
+    return 0;
+  if (set1->nelem != set2->nelem)
+    return 0;
+
+  /* Compare from the last element down to the first.  */
+  pos = set1->nelem;
+  while (pos-- > 0)
+    {
+      if (set1->elems[pos] != set2->elems[pos])
+        return 0;
+    }
+  return 1;
+}
+
+/* Look ELEM up in SET by binary search.
+   Return its index plus one if it is present, 0 if it is not.  */
+
+static int
+re_node_set_contains (set, elem)
+     const re_node_set *set;
+     int elem;
+{
+  unsigned int low, high;
+
+  if (set->nelem <= 0)
+    return 0;
+
+  /* Narrow [low, high] down to the first slot whose value is not
+     below ELEM.  */
+  low = 0;
+  high = set->nelem - 1;
+  while (low < high)
+    {
+      unsigned int mid = (low + high) / 2;
+      if (set->elems[mid] < elem)
+        low = mid + 1;
+      else
+        high = mid;
+    }
+
+  if (set->elems[low] != elem)
+    return 0;
+  return low + 1;
+}
+
+/* Remove the element at position IDX from SET, shifting the elements
+   after it down by one.  An IDX outside the set is ignored.  */
+
+static void
+re_node_set_remove_at (set, idx)
+     re_node_set *set;
+     int idx;
+{
+  int last;
+
+  if (idx < 0 || idx >= set->nelem)
+    return;
+
+  last = --set->nelem;
+  while (idx < last)
+    {
+      set->elems[idx] = set->elems[idx + 1];
+      ++idx;
+    }
+}
+
+
+/* Add the token TOKEN to dfa->nodes, and return the index of the new
+   node, or -1 if memory is exhausted.
+
+   The arrays nodes, nexts, org_indices, edests and eclosures are grown
+   together.  Each array is stored back into DFA as soon as its own
+   reallocation succeeds, so that after a later failure DFA never keeps
+   a pointer to a block that realloc may already have released;
+   dfa->nodes_alloc is raised only when all five have grown.  */
+
+static int
+re_dfa_add_node (dfa, token)
+     re_dfa_t *dfa;
+     re_token_t token;
+{
+  int type = token.type;
+  if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
+    {
+      int new_nodes_alloc = dfa->nodes_alloc * 2;
+      int *new_nexts, *new_indices;
+      re_node_set *new_edests, *new_eclosures;
+      re_token_t *new_array;
+
+      new_array = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
+      if (BE (new_array == NULL, 0))
+        return -1;
+      dfa->nodes = new_array;
+
+      new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
+      if (BE (new_nexts == NULL, 0))
+        return -1;
+      dfa->nexts = new_nexts;
+
+      new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
+      if (BE (new_indices == NULL, 0))
+        return -1;
+      dfa->org_indices = new_indices;
+
+      new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
+      if (BE (new_edests == NULL, 0))
+        return -1;
+      dfa->edests = new_edests;
+
+      new_eclosures = re_realloc (dfa->eclosures, re_node_set,
+                                  new_nodes_alloc);
+      if (BE (new_eclosures == NULL, 0))
+        return -1;
+      dfa->eclosures = new_eclosures;
+
+      dfa->nodes_alloc = new_nodes_alloc;
+    }
+  /* Store the token; its constraint always starts out as 0.  */
+  dfa->nodes[dfa->nodes_len] = token;
+  dfa->nodes[dfa->nodes_len].constraint = 0;
+#ifdef RE_ENABLE_I18N
+  dfa->nodes[dfa->nodes_len].accept_mb =
+    (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
+#endif
+  dfa->nexts[dfa->nodes_len] = -1;
+  re_node_set_init_empty (dfa->edests + dfa->nodes_len);
+  re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
+  return dfa->nodes_len++;
+}
+
+/* Compute the hash value of a state: the number of elements in NODES
+   plus CONTEXT plus the sum of all elements.  */
+
+static unsigned int inline
+calc_state_hash (nodes, context)
+     const re_node_set *nodes;
+     unsigned int context;
+{
+  unsigned int sum = nodes->nelem + context;
+  int pos = 0;
+
+  while (pos < nodes->nelem)
+    sum += nodes->elems[pos++];
+  return sum;
+}
+
+/* Look up the state whose node set equals NODES.
+   Return the state if DFA already has it; otherwise create a new one
+   and return that.  If creation fails, NULL is returned and *ERR is set
+   to REG_ESPACE.
+   Note: - NULL stands for the invalid state: an empty NODES yields NULL
+           with *ERR set to REG_NOERROR.
+         - A non-NULL value is never returned in case of an error, and
+           *ERR is not written when a state is returned; this is for
+           optimization.  */
+
+static re_dfastate_t*
+re_acquire_state (err, dfa, nodes)
+     reg_errcode_t *err;
+     re_dfa_t *dfa;
+     const re_node_set *nodes;
+{
+  struct re_state_table_entry *bucket;
+  re_dfastate_t *created;
+  unsigned int hash;
+  int n;
+
+  if (BE (nodes->nelem == 0, 0))
+    {
+      *err = REG_NOERROR;
+      return NULL;
+    }
+
+  hash = calc_state_hash (nodes, 0);
+  bucket = dfa->state_table + (hash & dfa->state_hash_mask);
+
+  /* Compare the cheap hash first, the full node set only on a match.  */
+  for (n = 0; n < bucket->num; n++)
+    {
+      re_dfastate_t *candidate = bucket->array[n];
+      if (candidate->hash == hash
+          && re_node_set_compare (&candidate->nodes, nodes))
+        return candidate;
+    }
+
+  /* No such state in the dfa yet: create it.  */
+  created = create_ci_newstate (dfa, nodes, hash);
+  if (BE (created == NULL, 0))
+    *err = REG_ESPACE;
+  return created;
+}
+
+/* Look up the state whose entrance node set equals NODES and whose
+   context equals CONTEXT.
+   Return the state if DFA already has it; otherwise create a new one
+   and return that.  If creation fails, NULL is returned and *ERR is set
+   to REG_ESPACE.
+   Note: - NULL stands for the invalid state: an empty NODES yields NULL
+           with *ERR set to REG_NOERROR.
+         - A non-NULL value is never returned in case of an error, and
+           *ERR is not written when a state is returned; this is for
+           optimization.  */
+
+static re_dfastate_t*
+re_acquire_state_context (err, dfa, nodes, context)
+     reg_errcode_t *err;
+     re_dfa_t *dfa;
+     const re_node_set *nodes;
+     unsigned int context;
+{
+  struct re_state_table_entry *bucket;
+  re_dfastate_t *created;
+  unsigned int hash;
+  int n;
+
+  if (nodes->nelem == 0)
+    {
+      *err = REG_NOERROR;
+      return NULL;
+    }
+
+  hash = calc_state_hash (nodes, context);
+  bucket = dfa->state_table + (hash & dfa->state_hash_mask);
+
+  for (n = 0; n < bucket->num; n++)
+    {
+      re_dfastate_t *candidate = bucket->array[n];
+      if (candidate->hash != hash)
+        continue;
+      if (candidate->context != context)
+        continue;
+      if (re_node_set_compare (candidate->entrance_nodes, nodes))
+        return candidate;
+    }
+
+  /* No such state in `dfa' yet: create it.  */
+  created = create_cd_newstate (dfa, nodes, context, hash);
+  if (BE (created == NULL, 0))
+    *err = REG_ESPACE;
+  return created;
+}
+
+/* Finish the initialization of the new state NEWSTATE: record HASH,
+   collect its non-epsilon nodes, and append the state to the bucket of
+   DFA's state table selected by HASH.  Return REG_ESPACE if memory runs
+   out, REG_NOERROR otherwise.  */
+
+static reg_errcode_t
+register_state (dfa, newstate, hash)
+     re_dfa_t *dfa;
+     re_dfastate_t *newstate;
+     unsigned int hash;
+{
+  struct re_state_table_entry *bucket;
+  int n;
+
+  newstate->hash = hash;
+
+  /* Room for every node, in case none of them is an epsilon node.  */
+  if (BE (re_node_set_alloc (&newstate->non_eps_nodes,
+                             newstate->nodes.nelem) != REG_NOERROR, 0))
+    return REG_ESPACE;
+
+  for (n = 0; n < newstate->nodes.nelem; n++)
+    {
+      int node_idx = newstate->nodes.elems[n];
+      if (IS_EPSILON_NODE (dfa->nodes[node_idx].type))
+        continue;
+      re_node_set_insert_last (&newstate->non_eps_nodes, node_idx);
+    }
+
+  /* Grow the bucket when it is full, then append the state.  */
+  bucket = dfa->state_table + (hash & dfa->state_hash_mask);
+  if (BE (bucket->num >= bucket->alloc, 0))
+    {
+      int grown = 2 * bucket->num + 2;
+      re_dfastate_t **grown_array = re_realloc (bucket->array,
+                                                re_dfastate_t *, grown);
+      if (BE (grown_array == NULL, 0))
+        return REG_ESPACE;
+      bucket->array = grown_array;
+      bucket->alloc = grown;
+    }
+  bucket->array[bucket->num++] = newstate;
+  return REG_NOERROR;
+}
+
+/* Create a new state that is independent of contexts, holding a copy of
+   NODES, and register it in DFA under HASH.
+   Return the new state if that succeeded, NULL otherwise.  */
+
+static re_dfastate_t *
+create_ci_newstate (dfa, nodes, hash)
+     re_dfa_t *dfa;
+     const re_node_set *nodes;
+     unsigned int hash;
+{
+  re_dfastate_t *state;
+  int n;
+
+  state = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
+  if (BE (state == NULL, 0))
+    return NULL;
+
+  if (BE (re_node_set_init_copy (&state->nodes, nodes) != REG_NOERROR, 0))
+    {
+      re_free (state);
+      return NULL;
+    }
+  state->entrance_nodes = &state->nodes;
+
+  /* Derive the state's flags from the nodes it contains.  */
+  for (n = 0; n < nodes->nelem; n++)
+    {
+      re_token_t *tok = dfa->nodes + nodes->elems[n];
+      re_token_type_t kind = tok->type;
+
+      /* Plain characters without constraint contribute nothing.  */
+      if (kind == CHARACTER && !tok->constraint)
+        continue;
+#ifdef RE_ENABLE_I18N
+      state->accept_mb |= tok->accept_mb;
+#endif /* RE_ENABLE_I18N */
+
+      /* If the state has the halt node, the state is a halt state.  */
+      if (kind == END_OF_RE)
+        state->halt = 1;
+      else if (kind == OP_BACK_REF)
+        state->has_backref = 1;
+      else if (kind == ANCHOR || tok->constraint)
+        state->has_constraint = 1;
+    }
+
+  if (BE (register_state (dfa, state, hash) != REG_NOERROR, 0))
+    {
+      free_state (state);
+      return NULL;
+    }
+  return state;
+}
+
+/* Create a new state that depends on the context CONTEXT, starting from
+   a copy of NODES, and register it in DFA under HASH.  Nodes whose
+   constraint is not satisfied by CONTEXT are removed from the state's
+   node set; in that case the unfiltered set is kept separately in
+   entrance_nodes.
+   Return the new state if that succeeded, NULL otherwise.  Nothing is
+   leaked, and free_state is never handed a NULL entrance set, on any of
+   the failure paths.  */
+
+static re_dfastate_t *
+create_cd_newstate (dfa, nodes, context, hash)
+     re_dfa_t *dfa;
+     const re_node_set *nodes;
+     unsigned int context, hash;
+{
+  /* NCTX_NODES counts the nodes removed so far, i.e. how far the indices
+     of newstate->nodes lag behind those of NODES.  */
+  int i, nctx_nodes = 0;
+  reg_errcode_t err;
+  re_dfastate_t *newstate;
+
+  newstate = (re_dfastate_t *) calloc (1, sizeof (re_dfastate_t));
+  if (BE (newstate == NULL, 0))
+    return NULL;
+  err = re_node_set_init_copy (&newstate->nodes, nodes);
+  if (BE (err != REG_NOERROR, 0))
+    {
+      re_free (newstate);
+      return NULL;
+    }
+
+  newstate->context = context;
+  newstate->entrance_nodes = &newstate->nodes;
+
+  for (i = 0 ; i < nodes->nelem ; i++)
+    {
+      unsigned int constraint = 0;
+      re_token_t *node = dfa->nodes + nodes->elems[i];
+      re_token_type_t type = node->type;
+      if (node->constraint)
+        constraint = node->constraint;
+
+      if (type == CHARACTER && !constraint)
+        continue;
+#ifdef RE_ENABLE_I18N
+      newstate->accept_mb |= node->accept_mb;
+#endif /* RE_ENABLE_I18N */
+
+      /* If the state has the halt node, the state is a halt state.  */
+      if (type == END_OF_RE)
+        newstate->halt = 1;
+      else if (type == OP_BACK_REF)
+        newstate->has_backref = 1;
+      else if (type == ANCHOR)
+        constraint = node->opr.ctx_type;
+
+      if (constraint)
+        {
+          if (newstate->entrance_nodes == &newstate->nodes)
+            {
+              /* First constrained node: keep the unfiltered set as the
+                 entrance set.  The pointer is stored in the state only
+                 once the allocation succeeded, so free_state below never
+                 sees a NULL entrance set.  */
+              re_node_set *entrance = re_malloc (re_node_set, 1);
+              if (BE (entrance == NULL, 0))
+                {
+                  free_state (newstate);
+                  return NULL;
+                }
+              newstate->entrance_nodes = entrance;
+              err = re_node_set_init_copy (entrance, nodes);
+              if (BE (err != REG_NOERROR, 0))
+                {
+                  /* free_state releases ENTRANCE as well.  */
+                  free_state (newstate);
+                  return NULL;
+                }
+              nctx_nodes = 0;
+              newstate->has_constraint = 1;
+            }
+
+          if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
+            {
+              re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
+              ++nctx_nodes;
+            }
+        }
+    }
+  err = register_state (dfa, newstate, hash);
+  if (BE (err != REG_NOERROR, 0))
+    {
+      free_state (newstate);
+      newstate = NULL;
+    }
+  return newstate;
+}
+
+/* Release STATE together with the node sets and transition tables it
+   owns.  The entrance node set is released separately only when it is
+   not simply the state's own node set.  */
+
+static void
+free_state (state)
+     re_dfastate_t *state;
+{
+  re_node_set *entrance = state->entrance_nodes;
+
+  re_node_set_free (&state->non_eps_nodes);
+  re_node_set_free (&state->inveclosure);
+  if (entrance != &state->nodes)
+    {
+      re_node_set_free (entrance);
+      re_free (entrance);
+    }
+  re_node_set_free (&state->nodes);
+  re_free (state->word_trtable);
+  re_free (state->trtable);
+  re_free (state);
+}
diff --git a/gnu/lib/libregex/regex_internal.h b/gnu/lib/libregex/regex_internal.h
new file mode 100644
index 000000000000..58fa749e9002
--- /dev/null
+++ b/gnu/lib/libregex/regex_internal.h
@@ -0,0 +1,798 @@
+/* Extended regular expression matching and search library.
+ Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#ifndef _REGEX_INTERNAL_H
+#define _REGEX_INTERNAL_H 1
+
+#include <assert.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#if defined HAVE_LANGINFO_H || defined HAVE_LANGINFO_CODESET || defined _LIBC
+# include <langinfo.h>
+#endif
+#if defined HAVE_LOCALE_H || defined _LIBC
+# include <locale.h>
+#endif
+#if defined HAVE_WCHAR_H || defined _LIBC
+# include <wchar.h>
+#endif /* HAVE_WCHAR_H || _LIBC */
+#if defined HAVE_WCTYPE_H || defined _LIBC
+# include <wctype.h>
+#endif /* HAVE_WCTYPE_H || _LIBC */
+
+/* In case that the system doesn't have isblank().  This fallback accepts
+ only space and horizontal tab, and evaluates CH twice. */
+#if !defined _LIBC && !defined HAVE_ISBLANK && !defined isblank
+# define isblank(ch) ((ch) == ' ' || (ch) == '\t')
+#endif
+
+#ifdef _LIBC
+# ifndef _RE_DEFINE_LOCALE_FUNCTIONS
+# define _RE_DEFINE_LOCALE_FUNCTIONS 1
+# include <locale/localeinfo.h>
+# include <locale/elem-hash.h>
+# include <locale/coll-lookup.h>
+# endif
+#endif
+
+/* This is for other GNU distributions with internationalized messages.
+ When <libintl.h> is usable, or inside glibc, it is included; inside glibc
+ gettext is additionally redefined to call __dcgettext with the libc
+ message domain and LC_MESSAGES.  In every other configuration
+ gettext (MSGID) expands to MSGID itself. */
+#if (HAVE_LIBINTL_H && ENABLE_NLS) || defined _LIBC
+# include <libintl.h>
+# ifdef _LIBC
+# undef gettext
+# define gettext(msgid) \
+ INTUSE(__dcgettext) (_libc_intl_domainname, msgid, LC_MESSAGES)
+# endif
+#else
+# define gettext(msgid) (msgid)
+#endif
+
+#ifndef gettext_noop
+/* This define is so xgettext can find the internationalizable
+ strings. */
+# define gettext_noop(String) String
+#endif
+
+/* Enable the multibyte / wide-character code paths when building inside
+ glibc, or when MB_CUR_MAX is defined and every listed locale and
+ wide-character facility is available. */
+#if (defined MB_CUR_MAX && HAVE_LOCALE_H && HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_WCRTOMB && HAVE_MBRTOWC && HAVE_WCSCOLL) || _LIBC
+# define RE_ENABLE_I18N
+#endif
+
+/* BE (EXPR, VAL) evaluates to EXPR.  With GCC 3 or later it also passes
+ VAL to __builtin_expect as the expected value of EXPR.  With any other
+ compiler the `inline' keyword is defined away as well. */
+#if __GNUC__ >= 3
+# define BE(expr, val) __builtin_expect (expr, val)
+#else
+# define BE(expr, val) (expr)
+# define inline
+#endif
+
+/* Number of bits in a byte. */
+#define BYTE_BITS 8
+/* Number of distinct single-byte character values; also the number of
+ bits in a bitset. */
+#define SBC_MAX 256
+
+/* NOTE(review): presumably the maximum length of a collating element;
+ its users are outside this header -- confirm. */
+#define COLL_ELEM_LEN_MAX 8
+
+/* The character which represents newline, in narrow and wide form. */
+#define NEWLINE_CHAR '\n'
+#define WIDE_NEWLINE_CHAR L'\n'
+
+/* Outside glibc, map the double-underscore internal names used by this
+ code onto the standard API, and make attribute_hidden expand to
+ nothing. */
+#ifndef _LIBC
+# define __wctype wctype
+# define __iswctype iswctype
+# define __btowc btowc
+# define __mempcpy mempcpy
+# define __wcrtomb wcrtomb
+# define __regfree regfree
+# define attribute_hidden
+#endif /* not _LIBC */
+
+/* __attribute (ARG) is __attribute__ (ARG) under GCC and nothing with
+ other compilers. */
+#ifdef __GNUC__
+# define __attribute(arg) __attribute__ (arg)
+#else
+# define __attribute(arg)
+#endif
+
+/* Defined in another file.  NOTE(review): presumably the concatenated
+ error message strings and the offset of each message within them --
+ confirm against the definition. */
+extern const char __re_error_msgid[] attribute_hidden;
+extern const size_t __re_error_msgid_idx[] attribute_hidden;
+
+/* Number of bits in an unsigned int. */
+#define UINT_BITS (sizeof (unsigned int) * BYTE_BITS)
+/* Number of unsigned ints in a bitset. */
+#define BITSET_UINTS ((SBC_MAX + UINT_BITS - 1) / UINT_BITS)
+/* A bitset has one bit per single-byte character value (SBC_MAX bits). */
+typedef unsigned int bitset[BITSET_UINTS];
+typedef unsigned int *re_bitset_ptr_t;
+typedef const unsigned int *re_const_bitset_ptr_t;
+
+/* Set, clear or test bit I of SET.  The mask is built from an unsigned 1
+   so that selecting the top bit of a word does not shift into the sign bit
+   of an int, and both arguments are parenthesised so that arbitrary
+   expressions may be passed.  I is evaluated twice.  bitset_contain yields
+   a non-zero (not necessarily 1) value when the bit is set.  */
+#define bitset_set(set,i) \
+  ((set)[(i) / UINT_BITS] |= 1u << ((i) % UINT_BITS))
+#define bitset_clear(set,i) \
+  ((set)[(i) / UINT_BITS] &= ~(1u << ((i) % UINT_BITS)))
+#define bitset_contain(set,i) \
+  ((set)[(i) / UINT_BITS] & (1u << ((i) % UINT_BITS)))
+/* Whole-set operations: clear all bits, set all bits, copy SRC to DEST.  */
+#define bitset_empty(set) memset (set, 0, sizeof (unsigned int) * BITSET_UINTS)
+#define bitset_set_all(set) \
+  memset (set, 255, sizeof (unsigned int) * BITSET_UINTS)
+#define bitset_copy(dest,src) \
+  memcpy (dest, src, sizeof (unsigned int) * BITSET_UINTS)
+/* Word-wise helpers, defined as inline functions later in this header.  */
+static inline void bitset_not (bitset set);
+static inline void bitset_merge (bitset dest, const bitset src);
+static inline void bitset_not_merge (bitset dest, const bitset src);
+static inline void bitset_mask (bitset dest, const bitset src);
+
+/* Bits of a context constraint.  The PREV_* bits are the ones examined by
+ NOT_SATISFY_PREV_CONSTRAINT and the NEXT_* bits the ones examined by
+ NOT_SATISFY_NEXT_CONSTRAINT; the two *_DELIM bits are examined by neither
+ macro.  Ten bits in total, matching the width of re_token_t's
+ `constraint' field. */
+#define PREV_WORD_CONSTRAINT 0x0001
+#define PREV_NOTWORD_CONSTRAINT 0x0002
+#define NEXT_WORD_CONSTRAINT 0x0004
+#define NEXT_NOTWORD_CONSTRAINT 0x0008
+#define PREV_NEWLINE_CONSTRAINT 0x0010
+#define NEXT_NEWLINE_CONSTRAINT 0x0020
+#define PREV_BEGBUF_CONSTRAINT 0x0040
+#define NEXT_ENDBUF_CONSTRAINT 0x0080
+#define WORD_DELIM_CONSTRAINT 0x0100
+#define NOT_WORD_DELIM_CONSTRAINT 0x0200
+
+/* Named combinations of the constraint bits above.  A value of this type
+ is stored in re_token_t.opr.ctx_type for ANCHOR tokens. */
+typedef enum
+{
+ INSIDE_WORD = PREV_WORD_CONSTRAINT | NEXT_WORD_CONSTRAINT,
+ WORD_FIRST = PREV_NOTWORD_CONSTRAINT | NEXT_WORD_CONSTRAINT,
+ WORD_LAST = PREV_WORD_CONSTRAINT | NEXT_NOTWORD_CONSTRAINT,
+ INSIDE_NOTWORD = PREV_NOTWORD_CONSTRAINT | NEXT_NOTWORD_CONSTRAINT,
+ LINE_FIRST = PREV_NEWLINE_CONSTRAINT,
+ LINE_LAST = NEXT_NEWLINE_CONSTRAINT,
+ BUF_FIRST = PREV_BEGBUF_CONSTRAINT,
+ BUF_LAST = NEXT_ENDBUF_CONSTRAINT,
+ WORD_DELIM = WORD_DELIM_CONSTRAINT,
+ NOT_WORD_DELIM = NOT_WORD_DELIM_CONSTRAINT
+} re_context_type;
+
+/* A set of node indices.  ELEMS holds NELEM valid entries; each entry is
+ used as an index into re_dfa_t's `nodes' array.  re_node_set_free
+ releases ELEMS only, never the structure itself. */
+typedef struct
+{
+ /* NOTE(review): presumably the allocated capacity of ELEMS -- confirm. */
+ int alloc;
+ /* Number of valid entries in ELEMS. */
+ int nelem;
+ int *elems;
+} re_node_set;
+
+/* Type tag shared by tokens, nodes and tree nodes.  Values with
+ EPSILON_BIT set are the ones IS_EPSILON_NODE recognises.  Under GCC,
+ re_token_t stores this tag in an 8-bit bit-field, so every value must
+ fit in eight bits. */
+typedef enum
+{
+ NON_TYPE = 0,
+
+ /* Node type, These are used by token, node, tree. */
+ CHARACTER = 1,
+ END_OF_RE = 2,
+ SIMPLE_BRACKET = 3,
+ OP_BACK_REF = 4,
+ OP_PERIOD = 5,
+#ifdef RE_ENABLE_I18N
+ COMPLEX_BRACKET = 6,
+ OP_UTF8_PERIOD = 7,
+#endif /* RE_ENABLE_I18N */
+
+ /* We define EPSILON_BIT as a macro so that OP_OPEN_SUBEXP is used
+ when the debugger shows values of this enum type. */
+#define EPSILON_BIT 8
+ OP_OPEN_SUBEXP = EPSILON_BIT | 0,
+ OP_CLOSE_SUBEXP = EPSILON_BIT | 1,
+ OP_ALT = EPSILON_BIT | 2,
+ OP_DUP_ASTERISK = EPSILON_BIT | 3,
+ ANCHOR = EPSILON_BIT | 4,
+
+ /* Tree type, these are used only by tree. */
+ CONCAT = 16,
+ SUBEXP = 17,
+
+ /* Token type, these are used only by token.  The values from
+ OP_DUP_QUESTION onwards continue the numbering after OP_DUP_PLUS. */
+ OP_DUP_PLUS = 18,
+ OP_DUP_QUESTION,
+ OP_OPEN_BRACKET,
+ OP_CLOSE_BRACKET,
+ OP_CHARSET_RANGE,
+ OP_OPEN_DUP_NUM,
+ OP_CLOSE_DUP_NUM,
+ OP_NON_MATCH_LIST,
+ OP_OPEN_COLL_ELEM,
+ OP_CLOSE_COLL_ELEM,
+ OP_OPEN_EQUIV_CLASS,
+ OP_CLOSE_EQUIV_CLASS,
+ OP_OPEN_CHAR_CLASS,
+ OP_CLOSE_CHAR_CLASS,
+ OP_WORD,
+ OP_NOTWORD,
+ OP_SPACE,
+ OP_NOTSPACE,
+ BACK_SLASH
+
+} re_token_type_t;
+
+#ifdef RE_ENABLE_I18N
+/* Contents of a bracket expression, referenced through
+ re_token_t.opr.mbcset for COMPLEX_BRACKET tokens.  Each array below has
+ a matching element count further down.  The collating-symbol and
+ equivalence-class arrays exist only inside glibc, and the range bounds
+ change element type between glibc and other builds. */
+typedef struct
+{
+ /* Multibyte characters. */
+ wchar_t *mbchars;
+
+ /* Collating symbols. */
+# ifdef _LIBC
+ int32_t *coll_syms;
+# endif
+
+ /* Equivalence classes. */
+# ifdef _LIBC
+ int32_t *equiv_classes;
+# endif
+
+ /* Range expressions. */
+# ifdef _LIBC
+ uint32_t *range_starts;
+ uint32_t *range_ends;
+# else /* not _LIBC */
+ wchar_t *range_starts;
+ wchar_t *range_ends;
+# endif /* not _LIBC */
+
+ /* Character classes. */
+ wctype_t *char_classes;
+
+ /* If this character set is the non-matching list. */
+ unsigned int non_match : 1;
+
+ /* # of multibyte characters. */
+ int nmbchars;
+
+ /* # of collating symbols. */
+ int ncoll_syms;
+
+ /* # of equivalence classes. */
+ int nequiv_classes;
+
+ /* # of range expressions. */
+ int nranges;
+
+ /* # of character classes. */
+ int nchar_classes;
+} re_charset_t;
+#endif /* RE_ENABLE_I18N */
+
+/* A token / node: a type tag, a type-dependent operand and a few flags.
+ Which member of OPR is meaningful depends on TYPE, as noted on each
+ member. */
+typedef struct
+{
+ union
+ {
+ unsigned char c; /* for CHARACTER */
+ re_bitset_ptr_t sbcset; /* for SIMPLE_BRACKET */
+#ifdef RE_ENABLE_I18N
+ re_charset_t *mbcset; /* for COMPLEX_BRACKET */
+#endif /* RE_ENABLE_I18N */
+ int idx; /* for OP_BACK_REF */
+ re_context_type ctx_type; /* for ANCHOR */
+ } opr;
+ /* With GCC 2 or later the tag is packed into 8 bits; otherwise it is a
+ full enum. */
+#if __GNUC__ >= 2
+ re_token_type_t type : 8;
+#else
+ re_token_type_t type;
+#endif
+ /* Ten bits, one for each *_CONSTRAINT flag. */
+ unsigned int constraint : 10; /* context constraint */
+ unsigned int duplicated : 1;
+ unsigned int opt_subexp : 1;
+#ifdef RE_ENABLE_I18N
+ unsigned int accept_mb : 1;
+ /* These 2 bits can be moved into the union if needed (e.g. if running out
+ of bits; move opr.c to opr.c.c and move the flags to opr.c.flags). */
+ unsigned int mb_partial : 1;
+#endif
+ unsigned int word_char : 1;
+} re_token_t;
+
+/* Non-zero when TYPE has EPSILON_BIT set, i.e. is one of OP_OPEN_SUBEXP,
+ OP_CLOSE_SUBEXP, OP_ALT, OP_DUP_ASTERISK or ANCHOR. */
+#define IS_EPSILON_NODE(type) ((type) & EPSILON_BIT)
+
+/* The input string as seen by the matcher: the caller's raw buffer plus a
+ (possibly translated or case-folded) working copy, its wide-character
+ image when RE_ENABLE_I18N is defined, and the cursor and length
+ bookkeeping that relates the two. */
+struct re_string_t
+{
+ /* Indicate the raw buffer which is the original string passed as an
+ argument of regexec(), re_search(), etc.. */
+ const unsigned char *raw_mbs;
+ /* Store the multibyte string. In case of "case insensitive mode" like
+ REG_ICASE, upper cases of the string are stored, otherwise MBS points
+ the same address that RAW_MBS points. */
+ unsigned char *mbs;
+#ifdef RE_ENABLE_I18N
+ /* Store the wide character string which is corresponding to MBS.
+ The accessor macros and inline functions in this header treat a WEOF
+ entry as "not the first byte of a character". */
+ wint_t *wcs;
+ int *offsets;
+ mbstate_t cur_state;
+#endif
+ /* Index in RAW_MBS. Each character mbs[i] corresponds to
+ raw_mbs[raw_mbs_idx + i]. */
+ int raw_mbs_idx;
+ /* The length of the valid characters in the buffers. */
+ int valid_len;
+ /* The corresponding number of bytes in raw_mbs array. */
+ int valid_raw_len;
+ /* The length of the buffers MBS and WCS. */
+ int bufs_len;
+ /* The index in MBS, which is updated by re_string_fetch_byte. */
+ int cur_idx;
+ /* length of RAW_MBS array. */
+ int raw_len;
+ /* This is RAW_LEN - RAW_MBS_IDX + VALID_LEN - VALID_RAW_LEN. */
+ int len;
+ /* End of the buffer may be shorter than its length in the cases such
+ as re_match_2, re_search_2. Then, we use STOP for end of the buffer
+ instead of LEN. */
+ int raw_stop;
+ /* This is RAW_STOP - RAW_MBS_IDX adjusted through OFFSETS. */
+ int stop;
+
+ /* The context of mbs[0]. We store the context independently, since
+ the context of mbs[0] may be different from raw_mbs[0], which is
+ the beginning of the input string. */
+ unsigned int tip_context;
+ /* The translation passed as a part of an argument of re_compile_pattern.
+ RE_TRANSLATE_TYPE is defined outside this header. */
+ unsigned RE_TRANSLATE_TYPE trans;
+ /* Copy of re_dfa_t's word_char. */
+ re_const_bitset_ptr_t word_char;
+ /* 1 if REG_ICASE. */
+ unsigned char icase;
+ /* NOTE(review): the flags below share their names with re_dfa_t fields
+ and are presumably copied from them -- confirm in the constructors. */
+ unsigned char is_utf8;
+ unsigned char map_notascii;
+ unsigned char mbs_allocated;
+ unsigned char offsets_needed;
+ unsigned char newline_anchor;
+ unsigned char word_ops_used;
+ int mb_cur_max;
+};
+typedef struct re_string_t re_string_t;
+
+
+/* Forward declaration; the structure itself is defined further down in
+ this header. */
+struct re_dfa_t;
+typedef struct re_dfa_t re_dfa_t;
+
+/* Outside glibc, internal_function selects the regparm (3) / stdcall
+ calling convention on i386 and expands to nothing on other targets.
+ Inside glibc this header does not define it. */
+#ifndef _LIBC
+# ifdef __i386__
+# define internal_function __attribute ((regparm (3), stdcall))
+# else
+# define internal_function
+# endif
+#endif
+
+/* Prototypes of the re_string_t helpers.  They are all static, so they are
+ only declared when the including file has not defined
+ RE_NO_INTERNAL_PROTOTYPES. */
+#ifndef RE_NO_INTERNAL_PROTOTYPES
+static reg_errcode_t re_string_allocate (re_string_t *pstr, const char *str,
+ int len, int init_len,
+ RE_TRANSLATE_TYPE trans, int icase,
+ const re_dfa_t *dfa)
+ internal_function;
+static reg_errcode_t re_string_construct (re_string_t *pstr, const char *str,
+ int len, RE_TRANSLATE_TYPE trans,
+ int icase, const re_dfa_t *dfa)
+ internal_function;
+static reg_errcode_t re_string_reconstruct (re_string_t *pstr, int idx,
+ int eflags) internal_function;
+static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
+ int new_buf_len)
+ internal_function;
+# ifdef RE_ENABLE_I18N
+static void build_wcs_buffer (re_string_t *pstr) internal_function;
+static int build_wcs_upper_buffer (re_string_t *pstr) internal_function;
+# endif /* RE_ENABLE_I18N */
+static void build_upper_buffer (re_string_t *pstr) internal_function;
+static void re_string_translate_buffer (re_string_t *pstr) internal_function;
+static void re_string_destruct (re_string_t *pstr) internal_function;
+# ifdef RE_ENABLE_I18N
+static int re_string_elem_size_at (const re_string_t *pstr, int idx)
+ internal_function __attribute ((pure));
+static inline int re_string_char_size_at (const re_string_t *pstr, int idx)
+ internal_function __attribute ((pure));
+static inline wint_t re_string_wchar_at (const re_string_t *pstr, int idx)
+ internal_function __attribute ((pure));
+# endif /* RE_ENABLE_I18N */
+static unsigned int re_string_context_at (const re_string_t *input, int idx,
+ int eflags)
+ internal_function __attribute ((pure));
+static unsigned char re_string_peek_byte_case (const re_string_t *pstr,
+ int idx)
+ internal_function __attribute ((pure));
+/* NOTE(review): this one takes a non-const PSTR yet is declared `pure'.
+ If its definition advances PSTR's cursor the way the
+ re_string_fetch_byte macro does, the attribute is incorrect -- check
+ the definition. */
+static unsigned char re_string_fetch_byte_case (re_string_t *pstr)
+ internal_function __attribute ((pure));
+#endif
+/* Accessor macros for a re_string_t.  Every macro argument is
+   parenthesised in the expansion so that arbitrary expressions can be
+   passed.  PSTR and IDX may be evaluated more than once.  */
+
+/* The byte OFFSET positions after the cursor; the cursor is not moved.  */
+#define re_string_peek_byte(pstr, offset) \
+  ((pstr)->mbs[(pstr)->cur_idx + (offset)])
+/* The byte at the cursor; the cursor is then advanced by one.  */
+#define re_string_fetch_byte(pstr) \
+  ((pstr)->mbs[(pstr)->cur_idx++])
+/* The next two macros read PSTR->wcs, which exists only when
+   RE_ENABLE_I18N is defined.  A WEOF entry in WCS marks a byte that does
+   not start a character.  */
+#define re_string_first_byte(pstr, idx) \
+  ((idx) == (pstr)->valid_len || (pstr)->wcs[idx] != WEOF)
+#define re_string_is_single_byte_char(pstr, idx) \
+  ((pstr)->wcs[idx] != WEOF && ((pstr)->valid_len == (idx) + 1 \
+				|| (pstr)->wcs[(idx) + 1] != WEOF))
+/* Non-zero once the cursor has reached or passed STOP.  */
+#define re_string_eoi(pstr) ((pstr)->stop <= (pstr)->cur_idx)
+#define re_string_cur_idx(pstr) ((pstr)->cur_idx)
+#define re_string_get_buffer(pstr) ((pstr)->mbs)
+#define re_string_length(pstr) ((pstr)->len)
+#define re_string_byte_at(pstr,idx) ((pstr)->mbs[idx])
+/* Move the cursor forward by IDX bytes, or set it to IDX outright.  */
+#define re_string_skip_bytes(pstr,idx) ((pstr)->cur_idx += (idx))
+#define re_string_set_index(pstr,idx) ((pstr)->cur_idx = (idx))
+
+/* Typed wrappers around malloc, realloc and free.  T is an element type
+ and N an element count; the product (n) * sizeof (t) is not checked for
+ overflow.  The result is cast to T *. */
+#define re_malloc(t,n) ((t *) malloc ((n) * sizeof (t)))
+#define re_realloc(p,t,n) ((t *) realloc (p, (n) * sizeof (t)))
+#define re_free(p) free (p)
+
+/* Binary tree node carrying a token, with links to its parent and its two
+ children.  NOTE(review): `first' and `next' are presumably
+ traversal-order links filled in by the compiler -- confirm in
+ regcomp.c. */
+struct bin_tree_t
+{
+ struct bin_tree_t *parent;
+ struct bin_tree_t *left;
+ struct bin_tree_t *right;
+ struct bin_tree_t *first;
+ struct bin_tree_t *next;
+
+ re_token_t token;
+
+ /* `node_idx' is the index in dfa->nodes, if `type' == 0.
+ Otherwise `type' indicate the type of this node. */
+ int node_idx;
+};
+typedef struct bin_tree_t bin_tree_t;
+
+/* Number of bin_tree_t objects that fit in 1024 bytes once room for one
+ pointer has been set aside, so that a bin_tree_storage_t is about
+ 1024 bytes. */
+#define BIN_TREE_STORAGE_SIZE \
+ ((1024 - sizeof (void *)) / sizeof (bin_tree_t))
+
+/* A chunk of BIN_TREE_STORAGE_SIZE tree nodes; chunks are chained through
+ NEXT. */
+struct bin_tree_storage_t
+{
+ struct bin_tree_storage_t *next;
+ bin_tree_t data[BIN_TREE_STORAGE_SIZE];
+};
+typedef struct bin_tree_storage_t bin_tree_storage_t;
+
+/* Context flags.  A context is a bitwise OR of these values; zero is the
+   "ordinary" context.  */
+#define CONTEXT_WORD 1
+#define CONTEXT_NEWLINE (CONTEXT_WORD << 1)
+#define CONTEXT_BEGBUF (CONTEXT_NEWLINE << 1)
+#define CONTEXT_ENDBUF (CONTEXT_BEGBUF << 1)
+
+/* Tests on a context value C.  Each IS_*_CONTEXT macro yields the masked
+   bit, i.e. a non-zero value that is not necessarily 1.  */
+#define IS_WORD_CONTEXT(c) ((c) & CONTEXT_WORD)
+#define IS_NEWLINE_CONTEXT(c) ((c) & CONTEXT_NEWLINE)
+#define IS_BEGBUF_CONTEXT(c) ((c) & CONTEXT_BEGBUF)
+#define IS_ENDBUF_CONTEXT(c) ((c) & CONTEXT_ENDBUF)
+#define IS_ORDINARY_CONTEXT(c) ((c) == 0)
+
+/* Character classification used for word and line contexts.  */
+#define IS_WORD_CHAR(ch) (isalnum (ch) || (ch) == '_')
+#define IS_NEWLINE(ch) ((ch) == NEWLINE_CHAR)
+#define IS_WIDE_WORD_CHAR(ch) (iswalnum (ch) || (ch) == L'_')
+#define IS_WIDE_NEWLINE(ch) ((ch) == WIDE_NEWLINE_CHAR)
+
+/* True when CONSTRAINT contains a PREV_* requirement that CONTEXT, the
+   context preceding the position, fails to meet.  CONSTRAINT is
+   parenthesised in every clause, as in NOT_SATISFY_NEXT_CONSTRAINT, so
+   that an expression argument is masked as a whole.  */
+#define NOT_SATISFY_PREV_CONSTRAINT(constraint,context) \
+  ((((constraint) & PREV_WORD_CONSTRAINT) && !IS_WORD_CONTEXT (context)) \
+   || (((constraint) & PREV_NOTWORD_CONSTRAINT) && IS_WORD_CONTEXT (context)) \
+   || (((constraint) & PREV_NEWLINE_CONSTRAINT) && !IS_NEWLINE_CONTEXT (context)) \
+   || (((constraint) & PREV_BEGBUF_CONSTRAINT) && !IS_BEGBUF_CONTEXT (context)))
+
+/* True when CONSTRAINT contains a NEXT_* requirement that CONTEXT, the
+   context following the position, fails to meet.  */
+#define NOT_SATISFY_NEXT_CONSTRAINT(constraint,context) \
+  ((((constraint) & NEXT_WORD_CONSTRAINT) && !IS_WORD_CONTEXT (context)) \
+   || (((constraint) & NEXT_NOTWORD_CONSTRAINT) && IS_WORD_CONTEXT (context)) \
+   || (((constraint) & NEXT_NEWLINE_CONSTRAINT) && !IS_NEWLINE_CONTEXT (context)) \
+   || (((constraint) & NEXT_ENDBUF_CONSTRAINT) && !IS_ENDBUF_CONTEXT (context)))
+
+/* One DFA state: a set of nodes plus cached properties of that set.
+ free_state releases every buffer referenced from here. */
+struct re_dfastate_t
+{
+ unsigned int hash;
+ re_node_set nodes;
+ re_node_set non_eps_nodes;
+ re_node_set inveclosure;
+ /* Either points at NODES above, or at a separately allocated copy of
+ the original node set; free_state frees it only in the latter
+ case. */
+ re_node_set *entrance_nodes;
+ struct re_dfastate_t **trtable, **word_trtable;
+ /* Four bits, one for each CONTEXT_* flag. */
+ unsigned int context : 4;
+ /* Set when the state contains an END_OF_RE node. */
+ unsigned int halt : 1;
+ /* If this state can accept `multi byte'.
+ Note that we refer to multibyte characters, and multi character
+ collating elements as `multi byte'. */
+ unsigned int accept_mb : 1;
+ /* If this state has backreference node(s). */
+ unsigned int has_backref : 1;
+ /* Set by create_cd_newstate when it gives the state its own
+ ENTRANCE_NODES copy because a node carries a context constraint. */
+ unsigned int has_constraint : 1;
+};
+typedef struct re_dfastate_t re_dfastate_t;
+
+/* One slot of re_dfa_t's `state_table': an array of state pointers.
+ NOTE(review): NUM and ALLOC are presumably the used and allocated
+ lengths of ARRAY -- confirm in register_state. */
+struct re_state_table_entry
+{
+ int num;
+ int alloc;
+ re_dfastate_t **array;
+};
+
+/* Array type used in re_sub_match_last_t and re_sub_match_top_t.
+ NOTE(review): NEXT_IDX and ALLOC presumably track the fill position and
+ the capacity of ARRAY -- confirm in regexec.c. */
+
+typedef struct
+{
+ int next_idx;
+ int alloc;
+ re_dfastate_t **array;
+} state_array_t;
+
+/* Store information about the node NODE whose type is OP_CLOSE_SUBEXP.
+ Objects of this type are referenced from the LASTS array of a
+ re_sub_match_top_t. */
+
+typedef struct
+{
+ int node;
+ int str_idx; /* The position NODE match at. */
+ state_array_t path;
+} re_sub_match_last_t;
+
+/* Store information about the node NODE whose type is OP_OPEN_SUBEXP.
+ And information about the node, whose type is OP_CLOSE_SUBEXP,
+ corresponding to NODE is stored in LASTS.  Note that PATH is a pointer
+ here, whereas re_sub_match_last_t embeds its state_array_t. */
+
+typedef struct
+{
+ int str_idx;
+ int node;
+ int next_last_offset;
+ state_array_t *path;
+ int alasts; /* Allocation size of LASTS. */
+ int nlasts; /* The number of LASTS. */
+ re_sub_match_last_t **lasts;
+} re_sub_match_top_t;
+
+/* Element of the back reference cache kept in re_match_context_t's
+ `bkref_ents' array.  NOTE(review): judging by the parameter names of
+ match_ctx_add_entry (node, str_idx, from, to), the first four fields are
+ presumably filled from its arguments -- confirm in regexec.c. */
+struct re_backref_cache_entry
+{
+ int node;
+ int str_idx;
+ int subexp_from;
+ int subexp_to;
+ char more;
+ char unused;
+ unsigned short int eps_reachable_subexps_map;
+};
+
+/* Working state of one matching operation: the input string, the DFA in
+ use, the state log, the back reference cache and the sub-match
+ bookkeeping. */
+typedef struct
+{
+ /* The string object corresponding to the input string. */
+ re_string_t input;
+ /* The DFA pointer is const-qualified inside glibc and under C99 or
+ later, and is a plain pointer otherwise. */
+#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
+ re_dfa_t *const dfa;
+#else
+ re_dfa_t *dfa;
+#endif
+ /* EFLAGS of the argument of regexec. */
+ int eflags;
+ /* Where the matching ends. */
+ int match_last;
+ int last_node;
+ /* The state log used by the matcher. */
+ re_dfastate_t **state_log;
+ int state_log_top;
+ /* Back reference cache.  NOTE(review): NBKREF_ENTS and ABKREF_ENTS are
+ presumably the used and allocated entry counts -- confirm. */
+ int nbkref_ents;
+ int abkref_ents;
+ struct re_backref_cache_entry *bkref_ents;
+ int max_mb_elem_len;
+ /* NOTE(review): presumably the used and allocated lengths of SUB_TOPS,
+ following the alasts / nlasts naming in re_sub_match_top_t -- confirm. */
+ int nsub_tops;
+ int asub_tops;
+ re_sub_match_top_t **sub_tops;
+} re_match_context_t;
+
+/* Argument bundle for the state-sifting code in regexec.c; sift_ctx_init
+ takes one parameter for each of the first four fields.
+ NOTE(review): the meaning of LIMITS is not visible in this header. */
+typedef struct
+{
+ re_dfastate_t **sifted_states;
+ re_dfastate_t **limited_states;
+ int last_node;
+ int last_str_idx;
+ re_node_set limits;
+} re_sift_context_t;
+
+/* One entry of the fail stack: a string index and a node together with a
+ register array and a node set.  NOTE(review): presumably a saved
+ position to resume from when a match attempt fails -- confirm in
+ regexec.c. */
+struct re_fail_stack_ent_t
+{
+ int idx;
+ int node;
+ regmatch_t *regs;
+ re_node_set eps_via_nodes;
+};
+
+/* The fail stack itself.  NOTE(review): NUM and ALLOC are presumably the
+ used and allocated number of entries in STACK -- confirm. */
+struct re_fail_stack_t
+{
+ int num;
+ int alloc;
+ struct re_fail_stack_ent_t *stack;
+};
+
+/* The compiled pattern: the node array with its per-node tables, the state
+ hash table, the initial states and assorted flags. */
+struct re_dfa_t
+{
+ /* Node array; the elements of a re_node_set are indices into it. */
+ re_token_t *nodes;
+ int nodes_alloc;
+ int nodes_len;
+ /* NOTE(review): the following tables are presumably indexed by node,
+ in parallel with NODES -- confirm in regcomp.c. */
+ int *nexts;
+ int *org_indices;
+ re_node_set *edests;
+ re_node_set *eclosures;
+ re_node_set *inveclosures;
+ struct re_state_table_entry *state_table;
+ /* NOTE(review): presumably the initial state for the ordinary, word,
+ newline and beginning-of-buffer contexts respectively -- confirm. */
+ re_dfastate_t *init_state;
+ re_dfastate_t *init_state_word;
+ re_dfastate_t *init_state_nl;
+ re_dfastate_t *init_state_begbuf;
+ bin_tree_t *str_tree;
+ bin_tree_storage_t *str_tree_storage;
+ re_bitset_ptr_t sb_char;
+ int str_tree_storage_idx;
+
+ /* number of subexpressions `re_nsub' is in regex_t.  (That is, the
+ count is not stored in this structure; the comment does not describe
+ the field below.) */
+ unsigned int state_hash_mask;
+ int states_alloc;
+ int init_node;
+ int nbackref; /* The number of backreference in this dfa. */
+
+ /* Bitmap expressing which backreference is used. */
+ unsigned int used_bkref_map;
+ unsigned int completed_bkref_map;
+
+ unsigned int has_plural_match : 1;
+ /* If this dfa has "multibyte node", which is a backreference or
+ a node which can accept multibyte character or multi character
+ collating element. */
+ unsigned int has_mb_node : 1;
+ unsigned int is_utf8 : 1;
+ unsigned int map_notascii : 1;
+ unsigned int word_ops_used : 1;
+ int mb_cur_max;
+ /* A pointer to this bitset is kept in re_string_t's `word_char'. */
+ bitset word_char;
+ reg_syntax_t syntax;
+ int *subexp_map;
+#ifdef DEBUG
+ char* re_str;
+#endif
+};
+
+/* Prototypes and helper macros for re_node_set and for state management.
+ As with the re_string_t helpers, they are declared only when
+ RE_NO_INTERNAL_PROTOTYPES is not defined. */
+#ifndef RE_NO_INTERNAL_PROTOTYPES
+static reg_errcode_t re_node_set_alloc (re_node_set *set, int size) internal_function;
+static reg_errcode_t re_node_set_init_1 (re_node_set *set, int elem) internal_function;
+static reg_errcode_t re_node_set_init_2 (re_node_set *set, int elem1,
+ int elem2) internal_function;
+/* Zero-fill SET: no elements and a null ELEMS pointer. */
+#define re_node_set_init_empty(set) memset (set, '\0', sizeof (re_node_set))
+static reg_errcode_t re_node_set_init_copy (re_node_set *dest,
+ const re_node_set *src) internal_function;
+static reg_errcode_t re_node_set_add_intersect (re_node_set *dest,
+ const re_node_set *src1,
+ const re_node_set *src2) internal_function;
+static reg_errcode_t re_node_set_init_union (re_node_set *dest,
+ const re_node_set *src1,
+ const re_node_set *src2) internal_function;
+static reg_errcode_t re_node_set_merge (re_node_set *dest,
+ const re_node_set *src) internal_function;
+static int re_node_set_insert (re_node_set *set, int elem) internal_function;
+static int re_node_set_insert_last (re_node_set *set,
+ int elem) internal_function;
+static int re_node_set_compare (const re_node_set *set1,
+ const re_node_set *set2)
+ internal_function __attribute ((pure));
+static int re_node_set_contains (const re_node_set *set, int elem)
+ internal_function __attribute ((pure));
+static void re_node_set_remove_at (re_node_set *set, int idx) internal_function;
+/* Remove ID from SET.  The result of re_node_set_contains minus one is
+ used as the index to remove.  NOTE(review): this implies that
+ re_node_set_contains returns a 1-based position, and that an absent ID
+ yields index -1 -- confirm how re_node_set_remove_at treats that. */
+#define re_node_set_remove(set,id) \
+ (re_node_set_remove_at (set, re_node_set_contains (set, id) - 1))
+/* Forget all elements of P without releasing its storage. */
+#define re_node_set_empty(p) ((p)->nelem = 0)
+/* Release the element array of SET; the re_node_set object itself is not
+ freed, and SET must not be a null pointer. */
+#define re_node_set_free(set) re_free ((set)->elems)
+static int re_dfa_add_node (re_dfa_t *dfa, re_token_t token) internal_function;
+static re_dfastate_t *re_acquire_state (reg_errcode_t *err, re_dfa_t *dfa,
+ const re_node_set *nodes) internal_function;
+static re_dfastate_t *re_acquire_state_context (reg_errcode_t *err,
+ re_dfa_t *dfa,
+ const re_node_set *nodes,
+ unsigned int context) internal_function;
+static void free_state (re_dfastate_t *state) internal_function;
+#endif
+
+
+/* Kind of a single element of a bracket expression. */
+typedef enum
+{
+ SB_CHAR,
+ MB_CHAR,
+ EQUIV_CLASS,
+ COLL_SYM,
+ CHAR_CLASS
+} bracket_elem_type;
+
+/* One bracket expression element: its kind and the matching operand.
+ NOTE(review): presumably CH goes with SB_CHAR, WCH with MB_CHAR and NAME
+ with the three named kinds -- confirm in the bracket parser. */
+typedef struct
+{
+ bracket_elem_type type;
+ union
+ {
+ unsigned char ch;
+ unsigned char *name;
+ wchar_t wch;
+ } opr;
+} bracket_elem_t;
+
+
+/* Inline functions for bitset operation. */
+
+/* Complement every bit of SET in place.  */
+static inline void
+bitset_not (bitset set)
+{
+  unsigned int *word = set;
+  unsigned int *const end = set + BITSET_UINTS;
+
+  while (word != end)
+    {
+      *word = ~*word;
+      ++word;
+    }
+}
+
+/* DEST becomes the union of DEST and SRC.  */
+static inline void
+bitset_merge (bitset dest, const bitset src)
+{
+  unsigned int word = 0;
+
+  while (word < BITSET_UINTS)
+    {
+      dest[word] = dest[word] | src[word];
+      ++word;
+    }
+}
+
+/* DEST becomes the union of DEST and the complement of SRC.  */
+static inline void
+bitset_not_merge (bitset dest, const bitset src)
+{
+  unsigned int word;
+
+  for (word = 0; word != BITSET_UINTS; word++)
+    {
+      const unsigned int inverted = ~src[word];
+      dest[word] |= inverted;
+    }
+}
+
+/* DEST becomes the intersection of DEST and SRC.  */
+static inline void
+bitset_mask (bitset dest, const bitset src)
+{
+  unsigned int *out = dest;
+  const unsigned int *in = src;
+  unsigned int remaining = BITSET_UINTS;
+
+  while (remaining-- > 0)
+    *out++ &= *in++;
+}
+
+#if defined RE_ENABLE_I18N && !defined RE_NO_INTERNAL_PROTOTYPES
+/* Inline functions for re_string. */
+/* Return the number of bytes occupied by the character that starts at
+   index IDX of PSTR's buffers.  */
+static inline int
+internal_function
+re_string_char_size_at (const re_string_t *pstr, int idx)
+{
+  int nbytes = 1;
+
+  /* With a single-byte character set every character is one byte.  */
+  if (pstr->mb_cur_max == 1)
+    return nbytes;
+
+  /* Count the WEOF entries that follow IDX in WCS, stopping at the next
+     non-WEOF entry or at the end of the valid part of the buffer.  */
+  while (idx + nbytes < pstr->valid_len && pstr->wcs[idx + nbytes] == WEOF)
+    ++nbytes;
+  return nbytes;
+}
+
+/* Return the wide character stored for index IDX of PSTR: the byte itself
+   when the character set is single-byte, the WCS entry otherwise.  */
+static inline wint_t
+internal_function
+re_string_wchar_at (const re_string_t *pstr, int idx)
+{
+  wint_t wc;
+
+  if (pstr->mb_cur_max != 1)
+    wc = (wint_t) pstr->wcs[idx];
+  else
+    wc = (wint_t) pstr->mbs[idx];
+  return wc;
+}
+
+/* Return a size in bytes for the element starting at index IDX of
+ PSTR->mbs.  Outside glibc, and inside glibc when the current LC_COLLATE
+ locale reports no rules (nrules == 0), the result is always 1.
+ Otherwise the result is the distance by which findidx advances a
+ pointer that starts at mbs + IDX. */
+static int
+internal_function
+re_string_elem_size_at (const re_string_t *pstr, int idx)
+{
+#ifdef _LIBC
+ const unsigned char *p, *extra;
+ const int32_t *table, *indirect;
+ int32_t tmp;
+ /* NOTE(review): the header is included inside the function body,
+ presumably so that the findidx it provides can refer to the local
+ TABLE, EXTRA and INDIRECT variables -- confirm in locale/weight.h. */
+# include <locale/weight.h>
+ uint_fast32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
+
+ if (nrules != 0)
+ {
+ table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
+ extra = (const unsigned char *)
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
+ indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
+ _NL_COLLATE_INDIRECTMB);
+ p = pstr->mbs + idx;
+ /* The value returned by findidx is stored but not used; only the
+ new position of P matters here. */
+ tmp = findidx (&p);
+ return p - pstr->mbs - idx;
+ }
+ else
+#endif /* _LIBC */
+ return 1;
+}
+#endif /* RE_ENABLE_I18N */
+
+#endif /* _REGEX_INTERNAL_H */
diff --git a/gnu/lib/libregex/regexec.c b/gnu/lib/libregex/regexec.c
new file mode 100644
index 000000000000..3c226e3c20cd
--- /dev/null
+++ b/gnu/lib/libregex/regexec.c
@@ -0,0 +1,4327 @@
+/* Extended regular expression matching and search library.
+ Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+/* Forward declarations of the file-local (static) helpers used by the
+   entry points below.  Declarations guarded by RE_ENABLE_I18N or _LIBC
+   exist only in those configurations; the one inside `#if 0' is
+   disabled.  */
+static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
+ int n) internal_function;
+static void match_ctx_clean (re_match_context_t *mctx) internal_function;
+static void match_ctx_free (re_match_context_t *cache) internal_function;
+static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
+ int str_idx, int from, int to)
+ internal_function;
+static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx)
+ internal_function;
+static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
+ int str_idx) internal_function;
+static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
+ int node, int str_idx)
+ internal_function;
+static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
+ re_dfastate_t **limited_sts, int last_node,
+ int last_str_idx)
+ internal_function;
+static reg_errcode_t re_search_internal (const regex_t *preg,
+ const char *string, int length,
+ int start, int range, int stop,
+ size_t nmatch, regmatch_t pmatch[],
+ int eflags) internal_function;
+static int re_search_2_stub (struct re_pattern_buffer *bufp,
+ const char *string1, int length1,
+ const char *string2, int length2,
+ int start, int range, struct re_registers *regs,
+ int stop, int ret_len) internal_function;
+static int re_search_stub (struct re_pattern_buffer *bufp,
+ const char *string, int length, int start,
+ int range, int stop, struct re_registers *regs,
+ int ret_len) internal_function;
+static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
+ int nregs, int regs_allocated) internal_function;
+static inline re_dfastate_t *acquire_init_state_context
+ (reg_errcode_t *err, const re_match_context_t *mctx, int idx)
+ __attribute ((always_inline)) internal_function;
+static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
+ internal_function;
+static int check_matching (re_match_context_t *mctx, int fl_longest_match,
+ int *p_match_first)
+ internal_function;
+static int check_halt_node_context (const re_dfa_t *dfa, int node,
+ unsigned int context) internal_function;
+static int check_halt_state_context (const re_match_context_t *mctx,
+ const re_dfastate_t *state, int idx)
+ internal_function;
+static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
+ regmatch_t *prev_idx_match, int cur_node,
+ int cur_idx, int nmatch) internal_function;
+static int proceed_next_node (const re_match_context_t *mctx,
+ int nregs, regmatch_t *regs,
+ int *pidx, int node, re_node_set *eps_via_nodes,
+ struct re_fail_stack_t *fs) internal_function;
+static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
+ int str_idx, int dest_node, int nregs,
+ regmatch_t *regs,
+ re_node_set *eps_via_nodes) internal_function;
+static int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
+ regmatch_t *regs, re_node_set *eps_via_nodes) internal_function;
+static reg_errcode_t set_regs (const regex_t *preg,
+ const re_match_context_t *mctx,
+ size_t nmatch, regmatch_t *pmatch,
+ int fl_backtrack) internal_function;
+static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
+
+#ifdef RE_ENABLE_I18N
+static int sift_states_iter_mb (const re_match_context_t *mctx,
+ re_sift_context_t *sctx,
+ int node_idx, int str_idx, int max_str_idx) internal_function;
+#endif /* RE_ENABLE_I18N */
+static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
+ re_sift_context_t *sctx) internal_function;
+static reg_errcode_t build_sifted_states (re_match_context_t *mctx,
+ re_sift_context_t *sctx, int str_idx,
+ re_node_set *cur_dest) internal_function;
+static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
+ re_sift_context_t *sctx,
+ int str_idx,
+ re_node_set *dest_nodes) internal_function;
+static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
+ re_node_set *dest_nodes,
+ const re_node_set *candidates) internal_function;
+static reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node,
+ re_node_set *dest_nodes,
+ const re_node_set *and_nodes) internal_function;
+static int check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
+ int dst_node, int dst_idx, int src_node,
+ int src_idx) internal_function;
+static int check_dst_limits_calc_pos_1 (re_match_context_t *mctx,
+ int boundaries, int subexp_idx,
+ int from_node, int bkref_idx) internal_function;
+static int check_dst_limits_calc_pos (re_match_context_t *mctx,
+ int limit, int subexp_idx,
+ int node, int str_idx,
+ int bkref_idx) internal_function;
+static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
+ re_node_set *dest_nodes,
+ const re_node_set *candidates,
+ re_node_set *limits,
+ struct re_backref_cache_entry *bkref_ents,
+ int str_idx) internal_function;
+static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
+ re_sift_context_t *sctx,
+ int str_idx, const re_node_set *candidates) internal_function;
+static reg_errcode_t clean_state_log_if_needed (re_match_context_t *mctx,
+ int next_state_log_idx) internal_function;
+static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
+ re_dfastate_t **src, int num) internal_function;
+static re_dfastate_t *find_recover_state (reg_errcode_t *err,
+ re_match_context_t *mctx) internal_function;
+static re_dfastate_t *transit_state (reg_errcode_t *err,
+ re_match_context_t *mctx,
+ re_dfastate_t *state) internal_function;
+static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
+ re_match_context_t *mctx,
+ re_dfastate_t *next_state) internal_function;
+static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
+ re_node_set *cur_nodes,
+ int str_idx) internal_function;
+#if 0
+static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
+ re_match_context_t *mctx,
+ re_dfastate_t *pstate) internal_function;
+#endif
+#ifdef RE_ENABLE_I18N
+static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
+ re_dfastate_t *pstate) internal_function;
+#endif /* RE_ENABLE_I18N */
+static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
+ const re_node_set *nodes) internal_function;
+static reg_errcode_t get_subexp (re_match_context_t *mctx,
+ int bkref_node, int bkref_str_idx) internal_function;
+static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
+ const re_sub_match_top_t *sub_top,
+ re_sub_match_last_t *sub_last,
+ int bkref_node, int bkref_str) internal_function;
+static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
+ int subexp_idx, int type) internal_function;
+static reg_errcode_t check_arrival (re_match_context_t *mctx,
+ state_array_t *path, int top_node,
+ int top_str, int last_node, int last_str,
+ int type) internal_function;
+static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
+ int str_idx,
+ re_node_set *cur_nodes,
+ re_node_set *next_nodes) internal_function;
+static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
+ re_node_set *cur_nodes,
+ int ex_subexp, int type) internal_function;
+static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
+ re_node_set *dst_nodes,
+ int target, int ex_subexp,
+ int type) internal_function;
+static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
+ re_node_set *cur_nodes, int cur_str,
+ int subexp_num, int type) internal_function;
+static int build_trtable (re_dfa_t *dfa,
+ re_dfastate_t *state) internal_function;
+#ifdef RE_ENABLE_I18N
+static int check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
+ const re_string_t *input, int idx) internal_function;
+# ifdef _LIBC
+static unsigned int find_collation_sequence_value (const unsigned char *mbs,
+ size_t name_len) internal_function;
+# endif /* _LIBC */
+#endif /* RE_ENABLE_I18N */
+static int group_nodes_into_DFAstates (re_dfa_t *dfa,
+ const re_dfastate_t *state,
+ re_node_set *states_node,
+ bitset *states_ch) internal_function;
+static int check_node_accept (const re_match_context_t *mctx,
+ const re_token_t *node, int idx) internal_function;
+static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
+
+/* Entry point for POSIX code. */
+
+/* regexec searches for a given pattern, specified by PREG, in the
+ string STRING.
+
+ If NMATCH is zero or REG_NOSUB was set in the cflags argument to
+ `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
+ least NMATCH elements, and we set them to the offsets of the
+ corresponding matched substrings.
+
+ EFLAGS specifies `execution flags' which affect matching: if
+ REG_NOTBOL is set, then ^ does not match at the beginning of the
+ string; if REG_NOTEOL is set, then $ does not match at the end.
+
+ We return 0 if we find a match and REG_NOMATCH if not. */
+
+int
+regexec (preg, string, nmatch, pmatch, eflags)
+     const regex_t *__restrict preg;
+     const char *__restrict string;
+     size_t nmatch;
+     regmatch_t pmatch[];
+     int eflags;
+{
+  int first, end;
+  size_t want_regs;
+  regmatch_t *out_regs;
+  reg_errcode_t status;
+
+  /* Any execution flag outside the supported set is rejected.  */
+  if ((eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND)) != 0)
+    return REG_BADPAT;
+
+  /* The searched region is the whole NUL-terminated string unless
+     REG_STARTEND asks for the bounds stored in pmatch[0].  */
+  if ((eflags & REG_STARTEND) == 0)
+    {
+      first = 0;
+      end = strlen (string);
+    }
+  else
+    {
+      first = pmatch[0].rm_so;
+      end = pmatch[0].rm_eo;
+    }
+
+  /* A pattern with no_sub set never reports registers.  */
+  if (preg->no_sub)
+    {
+      want_regs = 0;
+      out_regs = NULL;
+    }
+  else
+    {
+      want_regs = nmatch;
+      out_regs = pmatch;
+    }
+
+  status = re_search_internal (preg, string, end, first, end - first,
+                               end, want_regs, out_regs, eflags);
+
+  /* Every outcome other than REG_NOERROR collapses to 1.  */
+  return status != REG_NOERROR;
+}
+
+#ifdef _LIBC
+# include <shlib-compat.h>
+versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
+
+# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
+__typeof__ (__regexec) __compat_regexec;
+
+/* Compatibility wrapper around regexec.  It forwards all arguments
+   unchanged except EFLAGS, which is masked down to REG_NOTBOL and
+   REG_NOTEOL; any other bit (including REG_STARTEND) is dropped before
+   the call.  Returns whatever regexec returns.  */
+int
+attribute_compat_text_section
+__compat_regexec (const regex_t *__restrict preg,
+ const char *__restrict string, size_t nmatch,
+ regmatch_t pmatch[], int eflags)
+{
+ return regexec (preg, string, nmatch, pmatch,
+ eflags & (REG_NOTBOL | REG_NOTEOL));
+}
+compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
+# endif
+#endif
+
+/* Entry points for GNU code. */
+
+/* re_match, re_search, re_match_2, re_search_2
+
+ The former two functions operate on STRING with length LENGTH,
+ while the latter two operate on the concatenation of STRING1 and STRING2
+ with lengths LENGTH1 and LENGTH2, respectively.
+
+ re_match() matches the compiled pattern in BUFP against the string,
+ starting at index START.
+
+ re_search() first tries matching at index START, then it tries to match
+ starting from index START + 1, and so on. The last start position tried
+ is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
+ way as re_match().)
+
+ The parameter STOP of re_{match,search}_2 specifies that no match extending
+ beyond the first STOP characters of the concatenation of the strings is
+ considered.
+
+ If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
+ and all groups are stored in REGS. (For the "_2" variants, the offsets are
+ computed relative to the concatenation, not relative to the individual
+ strings.)
+
+ On success, re_match* functions return the length of the match, re_search*
+ return the position of the start of the match. Return value -1 means no
+ match was found and -2 indicates an internal error. */
+
+/* Match at START only: calls re_search_stub with a range of 0, LENGTH as
+   the stop position, and ret_len set to 1.  */
+int
+re_match (bufp, string, length, start, regs)
+ struct re_pattern_buffer *bufp;
+ const char *string;
+ int length, start;
+ struct re_registers *regs;
+{
+ return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
+}
+#ifdef _LIBC
+weak_alias (__re_match, re_match)
+#endif
+
+/* Search from START over RANGE: calls re_search_stub with LENGTH as the
+   stop position and ret_len set to 0.  */
+int
+re_search (bufp, string, length, start, range, regs)
+ struct re_pattern_buffer *bufp;
+ const char *string;
+ int length, start, range;
+ struct re_registers *regs;
+{
+ return re_search_stub (bufp, string, length, start, range, length, regs, 0);
+}
+#ifdef _LIBC
+weak_alias (__re_search, re_search)
+#endif
+
+/* Two-string form of re_match: calls re_search_2_stub with a range of 0
+   and ret_len set to 1, passing STOP through.  */
+int
+re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
+ struct re_pattern_buffer *bufp;
+ const char *string1, *string2;
+ int length1, length2, start, stop;
+ struct re_registers *regs;
+{
+ return re_search_2_stub (bufp, string1, length1, string2, length2,
+ start, 0, regs, stop, 1);
+}
+#ifdef _LIBC
+weak_alias (__re_match_2, re_match_2)
+#endif
+
+/* Two-string form of re_search: calls re_search_2_stub with RANGE and
+   ret_len set to 0, passing STOP through.  */
+int
+re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
+ struct re_pattern_buffer *bufp;
+ const char *string1, *string2;
+ int length1, length2, start, range, stop;
+ struct re_registers *regs;
+{
+ return re_search_2_stub (bufp, string1, length1, string2, length2,
+ start, range, regs, stop, 0);
+}
+#ifdef _LIBC
+weak_alias (__re_search_2, re_search_2)
+#endif
+
+/* Worker shared by re_match_2 and re_search_2.  Presents STRING1
+   (LENGTH1 bytes) followed by STRING2 (LENGTH2 bytes) to re_search_stub
+   as one string.  When both parts are non-empty they are copied into a
+   temporary buffer that is released before returning; when only one is
+   non-empty it is passed through directly.
+
+   Returns the value of re_search_stub, or -2 if a length or STOP is
+   negative, if LENGTH1 + LENGTH2 does not fit in an int, or if the
+   temporary buffer cannot be allocated.  */
+static int
+re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
+                  stop, ret_len)
+     struct re_pattern_buffer *bufp;
+     const char *string1, *string2;
+     int length1, length2, start, range, stop, ret_len;
+     struct re_registers *regs;
+{
+  const char *str;
+  char *joined = NULL;
+  unsigned int total;
+  int len, rval;
+
+  /* Validate before doing any arithmetic on the lengths.  */
+  if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
+    return -2;
+
+  /* Both lengths are non-negative, so adding them as unsigned values
+     cannot wrap on the usual representations (UINT_MAX == 2 * INT_MAX + 1).
+     A total above the largest int would overflow the signed sum, which
+     is undefined behavior, so treat it as an error instead.  */
+  total = (unsigned int) length1 + (unsigned int) length2;
+  if (BE (total > (unsigned int) -1 / 2, 0))
+    return -2;
+  len = (int) total;
+
+  /* Concatenate the strings only when both contribute bytes.  */
+  if (length1 > 0 && length2 > 0)
+    {
+      joined = re_malloc (char, len);
+      if (BE (joined == NULL, 0))
+        return -2;
+      memcpy (joined, string1, length1);
+      memcpy (joined + length1, string2, length2);
+      str = joined;
+    }
+  else if (length2 > 0)
+    str = string2;
+  else
+    str = string1;
+
+  rval = re_search_stub (bufp, str, len, start, range, stop, regs,
+                         ret_len);
+  /* JOINED is NULL unless a temporary copy was made.  */
+  re_free (joined);
+  return rval;
+}
+
+/* The parameters have the same meaning as those of re_search.
+ Additional parameters:
+ If RET_LEN is nonzero the length of the match is returned (re_match style);
+ otherwise the position of the match is returned. */
+
+/* Worker behind re_match and re_search (and, via re_search_2_stub, the
+   "_2" variants).
+
+   START must lie in [0, LENGTH], otherwise -1 is returned.  RANGE is
+   clamped so that START + RANGE stays inside [0, LENGTH].  BUFP's not_bol
+   and not_eol flags are turned into REG_NOTBOL / REG_NOTEOL, and the
+   fastmap is compiled on demand when a forward range will be scanned.
+
+   Returns, on a match, the match length if RET_LEN is nonzero and the
+   match start otherwise; -1 if no match was found; -2 on an internal
+   error (allocation failure, or any re_search_internal result other
+   than REG_NOERROR and REG_NOMATCH).  */
+static int
+re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
+     struct re_pattern_buffer *bufp;
+     const char *string;
+     int length, start, range, stop, ret_len;
+     struct re_registers *regs;
+{
+  reg_errcode_t result;
+  regmatch_t *pmatch;
+  int nregs, rval;
+  int eflags = 0;
+
+  /* Check for out-of-range.  */
+  if (BE (start < 0 || start > length, 0))
+    return -1;
+  /* Here 0 <= start <= length, so LENGTH - START and -START are both
+     representable.  Comparing RANGE against them clamps exactly like
+     testing START + RANGE, but without forming a sum that overflows
+     for large RANGE values.  */
+  if (BE (range > length - start, 0))
+    range = length - start;
+  else if (BE (range < -start, 0))
+    range = -start;
+
+  eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
+  eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
+
+  /* Compile fastmap if we haven't yet.  */
+  if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
+    re_compile_fastmap (bufp);
+
+  if (BE (bufp->no_sub, 0))
+    regs = NULL;
+
+  /* We need at least 1 register.  */
+  if (regs == NULL)
+    nregs = 1;
+  else if (BE (bufp->regs_allocated == REGS_FIXED &&
+               regs->num_regs < bufp->re_nsub + 1, 0))
+    {
+      /* Fixed-size caller storage smaller than the pattern needs:
+         report only as many registers as fit.  */
+      nregs = regs->num_regs;
+      if (BE (nregs < 1, 0))
+        {
+          /* Nothing can be copied to regs.  */
+          regs = NULL;
+          nregs = 1;
+        }
+    }
+  else
+    nregs = bufp->re_nsub + 1;
+  pmatch = re_malloc (regmatch_t, nregs);
+  if (BE (pmatch == NULL, 0))
+    return -2;
+
+  result = re_search_internal (bufp, string, length, start, range, stop,
+                               nregs, pmatch, eflags);
+
+  rval = 0;
+
+  /* I hope we needn't fill the regs with -1's when no match was found.  */
+  if (result != REG_NOERROR)
+    /* Only a genuine "no match" maps to -1; every other failure is an
+       internal error and is reported as -2.  */
+    rval = (result == REG_NOMATCH) ? -1 : -2;
+  else if (regs != NULL)
+    {
+      /* If caller wants register contents data back, copy them.  */
+      bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
+                                           bufp->regs_allocated);
+      if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
+        rval = -2;
+    }
+
+  if (BE (rval == 0, 1))
+    {
+      if (ret_len)
+        {
+          assert (pmatch[0].rm_so == start);
+          rval = pmatch[0].rm_eo - start;
+        }
+      else
+        rval = pmatch[0].rm_so;
+    }
+  re_free (pmatch);
+  return rval;
+}
+
+/* Copy the NREGS entries of PMATCH into REGS, allocating or growing the
+   start/end arrays of REGS according to REGS_ALLOCATED:
+
+   REGS_UNALLOCATED  both arrays are freshly allocated with NREGS + 1
+                     elements;
+   REGS_REALLOCATE   the arrays are grown to NREGS + 1 elements if they
+                     are currently smaller;
+   REGS_FIXED        the arrays are used as they are (the caller must
+                     guarantee regs->num_regs >= NREGS).
+
+   Slots from NREGS up to regs->num_regs are filled with -1.
+
+   Returns the new allocation state (REGS_REALLOCATE or REGS_FIXED), or
+   REGS_UNALLOCATED if memory could not be obtained.  On that failure
+   nothing allocated here is leaked, and regs->start / regs->end never
+   point at a block that realloc has already released.  */
+static unsigned
+re_copy_regs (regs, pmatch, nregs, regs_allocated)
+     struct re_registers *regs;
+     regmatch_t *pmatch;
+     int nregs, regs_allocated;
+{
+  int rval = REGS_REALLOCATE;
+  int i;
+  int need_regs = nregs + 1;
+  /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
+     uses.  */
+
+  /* Have the register data arrays been allocated?  */
+  if (regs_allocated == REGS_UNALLOCATED)
+    { /* No.  So allocate them with malloc.  */
+      regs->start = re_malloc (regoff_t, need_regs);
+      if (BE (regs->start == NULL, 0))
+        return REGS_UNALLOCATED;
+      regs->end = re_malloc (regoff_t, need_regs);
+      if (BE (regs->end == NULL, 0))
+        {
+          /* Do not leak the array that was obtained.  */
+          re_free (regs->start);
+          regs->start = NULL;
+          return REGS_UNALLOCATED;
+        }
+      regs->num_regs = need_regs;
+    }
+  else if (regs_allocated == REGS_REALLOCATE)
+    { /* Yes.  If we need more elements than were already
+         allocated, reallocate them.  If we need fewer, just
+         leave it alone.  */
+      if (BE (need_regs > regs->num_regs, 0))
+        {
+          regoff_t *new_start;
+          regoff_t *new_end;
+
+          new_start = re_realloc (regs->start, regoff_t, need_regs);
+          if (BE (new_start == NULL, 0))
+            return REGS_UNALLOCATED;
+          /* A successful realloc may have moved the block, so publish
+             the new pointer right away; the old one may be stale.  */
+          regs->start = new_start;
+
+          new_end = re_realloc (regs->end, regoff_t, need_regs);
+          if (BE (new_end == NULL, 0))
+            /* regs->end is still the valid, unchanged old block, and
+               num_regs still describes a size both arrays satisfy.  */
+            return REGS_UNALLOCATED;
+          regs->end = new_end;
+          regs->num_regs = need_regs;
+        }
+    }
+  else
+    {
+      assert (regs_allocated == REGS_FIXED);
+      /* This function may not be called with REGS_FIXED and nregs too big.  */
+      assert (regs->num_regs >= nregs);
+      rval = REGS_FIXED;
+    }
+
+  /* Copy the regs.  */
+  for (i = 0; i < nregs; ++i)
+    {
+      regs->start[i] = pmatch[i].rm_so;
+      regs->end[i] = pmatch[i].rm_eo;
+    }
+  /* Mark every remaining slot as unused.  */
+  for ( ; i < regs->num_regs; ++i)
+    regs->start[i] = regs->end[i] = -1;
+
+  return rval;
+}
+
+/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
+ ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
+ this memory for recording register information. STARTS and ENDS
+ must be allocated using the malloc library routine, and must each
+ be at least NUM_REGS * sizeof (regoff_t) bytes long.
+
+ If NUM_REGS == 0, then subsequent matches should allocate their own
+ register data.
+
+ Unless this function is called, the first search or match using
+ PATTERN_BUFFER will allocate its own register data, without
+ freeing the old data. */
+
+void
+re_set_registers (bufp, regs, num_regs, starts, ends)
+     struct re_pattern_buffer *bufp;
+     struct re_registers *regs;
+     unsigned num_regs;
+     regoff_t *starts, *ends;
+{
+  if (num_regs == 0)
+    {
+      /* No storage supplied: clear REGS and mark the pattern buffer so
+         the register arrays are treated as unallocated.  */
+      bufp->regs_allocated = REGS_UNALLOCATED;
+      regs->num_regs = 0;
+      regs->end = (regoff_t *) 0;
+      regs->start = regs->end;
+      return;
+    }
+
+  /* Adopt the caller's arrays and mark them as reallocatable.  */
+  bufp->regs_allocated = REGS_REALLOCATE;
+  regs->start = starts;
+  regs->end = ends;
+  regs->num_regs = num_regs;
+}
+#ifdef _LIBC
+weak_alias (__re_set_registers, re_set_registers)
+#endif
+
+/* Entry points compatible with 4.2 BSD regex library. We don't define
+ them unless specifically requested. */
+
+#if defined _REGEX_RE_COMP || defined _LIBC
+/* Run regexec on S with the pattern held in re_comp_buf (declared outside
+   this block), requesting no match registers and no execution flags.
+   Returns 1 when regexec returns 0, and 0 otherwise.  */
+int
+# ifdef _LIBC
+weak_function
+# endif
+re_exec (s)
+ const char *s;
+{
+ return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
+}
+#endif /* _REGEX_RE_COMP */
+
+/* Internal entry point. */
+
+/* Searches for a compiled pattern PREG in the string STRING, whose
+ length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
+ meanings as for regexec. START and RANGE have the same meanings
+ as for re_search.
+ Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
+ otherwise return the error code.
+ Note: We assume front end functions already check ranges.
+ (START + RANGE >= 0 && START + RANGE <= LENGTH) */
+
+static reg_errcode_t
+re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
+ eflags)
+ const regex_t *preg;
+ const char *string;
+ int length, start, range, stop, eflags;
+ size_t nmatch;
+ regmatch_t pmatch[];
+{
+ reg_errcode_t err;
+ re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
+ int left_lim, right_lim, incr;
+ int fl_longest_match, match_first, match_kind, match_last = -1;
+ int extra_nmatch;
+ int sb, ch;
+#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
+ re_match_context_t mctx = { .dfa = dfa };
+#else
+ re_match_context_t mctx;
+#endif
+ /* The fastmap is used only when it exists, is accurate, a range is
+    actually scanned, and the pattern cannot match the empty string.  */
+ char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
+ && range && !preg->can_be_null) ? preg->fastmap : NULL;
+ unsigned RE_TRANSLATE_TYPE t = (unsigned RE_TRANSLATE_TYPE) preg->translate;
+
+#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
+ memset (&mctx, '\0', sizeof (re_match_context_t));
+ mctx.dfa = dfa;
+#endif
+
+ /* PMATCH slots beyond the pattern's re_nsub + 1 registers are counted in
+    extra_nmatch and excluded from NMATCH; they are set to -1 once a match
+    has been found.  */
+ extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
+ nmatch -= extra_nmatch;
+
+ /* Check whether the DFA has not been compiled. */
+ if (BE (preg->used == 0 || dfa->init_state == NULL
+ || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
+ || dfa->init_state_begbuf == NULL, 0))
+ return REG_NOMATCH;
+
+#ifdef DEBUG
+ /* We assume front-end functions already check them. */
+ assert (start + range >= 0 && start + range <= length);
+#endif
+
+ /* If initial states with non-begbuf contexts have no elements,
+ the regex must be anchored. If preg->newline_anchor is set,
+ we'll never use init_state_nl, so do not check it. */
+ if (dfa->init_state->nodes.nelem == 0
+ && dfa->init_state_word->nodes.nelem == 0
+ && (dfa->init_state_nl->nodes.nelem == 0
+ || !preg->newline_anchor))
+ {
+ if (start != 0 && start + range != 0)
+ return REG_NOMATCH;
+ start = range = 0;
+ }
+
+ /* We must check the longest matching, if nmatch > 0. */
+ fl_longest_match = (nmatch != 0 || dfa->nbackref);
+
+ err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
+ preg->translate, preg->syntax & RE_ICASE, dfa);
+ if (BE (err != REG_NOERROR, 0))
+ goto free_return;
+ mctx.input.stop = stop;
+ mctx.input.raw_stop = stop;
+ mctx.input.newline_anchor = preg->newline_anchor;
+
+ err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
+ if (BE (err != REG_NOERROR, 0))
+ goto free_return;
+
+ /* We will log all the DFA states through which the dfa pass,
+ if nmatch > 1, or this dfa has "multibyte node", which is a
+ back-reference or a node which can accept multibyte character or
+ multi character collating element. */
+ if (nmatch > 1 || dfa->has_mb_node)
+ {
+ mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
+ if (BE (mctx.state_log == NULL, 0))
+ {
+ err = REG_ESPACE;
+ goto free_return;
+ }
+ }
+ else
+ mctx.state_log = NULL;
+
+ match_first = start;
+ mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
+ : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
+
+ /* Check incrementally whether or not the input string matches. */
+ incr = (range < 0) ? -1 : 1;
+ left_lim = (range < 0) ? start + range : start;
+ right_lim = (range < 0) ? start : start + range;
+ sb = dfa->mb_cur_max == 1;
+ /* match_kind selects the scanning strategy used in the switch below.
+    Without a fastmap it is 8.  With one it is a bit mask: 4 when the
+    locale is single-byte or neither RE_ICASE nor a translate table is in
+    effect, 2 when searching forward, 1 when a translate table exists.  */
+ match_kind =
+ (fastmap
+ ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
+ | (range >= 0 ? 2 : 0)
+ | (t != NULL ? 1 : 0))
+ : 8);
+
+ for (;; match_first += incr)
+ {
+ err = REG_NOMATCH;
+ if (match_first < left_lim || right_lim < match_first)
+ goto free_return;
+
+ /* Advance as rapidly as possible through the string, until we
+ find a plausible place to start matching. This may be done
+ with varying efficiency, so there are various possibilities:
+ only the most common of them are specialized, in order to
+ save on code size. We use a switch statement for speed. */
+ switch (match_kind)
+ {
+ case 8:
+ /* No fastmap. */
+ break;
+
+ case 7:
+ /* Fastmap with single-byte translation, match forward. */
+ while (BE (match_first < right_lim, 1)
+ && !fastmap[t[(unsigned char) string[match_first]]])
+ ++match_first;
+ goto forward_match_found_start_or_reached_end;
+
+ case 6:
+ /* Fastmap without translation, match forward. */
+ while (BE (match_first < right_lim, 1)
+ && !fastmap[(unsigned char) string[match_first]])
+ ++match_first;
+
+ forward_match_found_start_or_reached_end:
+ /* At the right limit the byte is re-checked; a position at or past
+    LENGTH is treated as byte 0.  */
+ if (BE (match_first == right_lim, 0))
+ {
+ ch = match_first >= length
+ ? 0 : (unsigned char) string[match_first];
+ if (!fastmap[t ? t[ch] : ch])
+ goto free_return;
+ }
+ break;
+
+ case 4:
+ case 5:
+ /* Fastmap without multi-byte translation, match backwards. */
+ while (match_first >= left_lim)
+ {
+ ch = match_first >= length
+ ? 0 : (unsigned char) string[match_first];
+ if (fastmap[t ? t[ch] : ch])
+ break;
+ --match_first;
+ }
+ if (match_first < left_lim)
+ goto free_return;
+ break;
+
+ default:
+ /* In this case, we can't determine easily the current byte,
+ since it might be a component byte of a multibyte
+ character. Then we use the constructed buffer instead. */
+ for (;;)
+ {
+ /* If MATCH_FIRST is out of the valid range, reconstruct the
+ buffers. */
+ unsigned int offset = match_first - mctx.input.raw_mbs_idx;
+ if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
+ {
+ err = re_string_reconstruct (&mctx.input, match_first,
+ eflags);
+ if (BE (err != REG_NOERROR, 0))
+ goto free_return;
+
+ offset = match_first - mctx.input.raw_mbs_idx;
+ }
+ /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
+ Note that MATCH_FIRST must not be smaller than 0. */
+ ch = (match_first >= length
+ ? 0 : re_string_byte_at (&mctx.input, offset));
+ if (fastmap[ch])
+ break;
+ match_first += incr;
+ if (match_first < left_lim || match_first > right_lim)
+ {
+ err = REG_NOMATCH;
+ goto free_return;
+ }
+ }
+ break;
+ }
+
+ /* Reconstruct the buffers so that the matcher can assume that
+ the matching starts from the beginning of the buffer. */
+ err = re_string_reconstruct (&mctx.input, match_first, eflags);
+ if (BE (err != REG_NOERROR, 0))
+ goto free_return;
+
+#ifdef RE_ENABLE_I18N
+ /* Don't consider this char as a possible match start if it is part,
+ yet isn't the head, of a multibyte character. */
+ if (!sb && !re_string_first_byte (&mctx.input, 0))
+ continue;
+#endif
+
+ /* This looks like a plausible start, so run the matcher. */
+ /* We assume that the matching starts from 0. */
+ mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
+ match_last = check_matching (&mctx, fl_longest_match,
+ range >= 0 ? &match_first : NULL);
+ if (match_last != -1)
+ {
+ /* -2 from check_matching is reported as REG_ESPACE.  */
+ if (BE (match_last == -2, 0))
+ {
+ err = REG_ESPACE;
+ goto free_return;
+ }
+ else
+ {
+ mctx.match_last = match_last;
+ if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
+ {
+ re_dfastate_t *pstate = mctx.state_log[match_last];
+ mctx.last_node = check_halt_state_context (&mctx, pstate,
+ match_last);
+ }
+ if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
+ || dfa->nbackref)
+ {
+ err = prune_impossible_nodes (&mctx);
+ if (err == REG_NOERROR)
+ break;
+ if (BE (err != REG_NOMATCH, 0))
+ goto free_return;
+ /* Pruning rejected this candidate; keep scanning.  */
+ match_last = -1;
+ }
+ else
+ break; /* We found a match. */
+ }
+ }
+
+ match_ctx_clean (&mctx);
+ }
+
+#ifdef DEBUG
+ assert (match_last != -1);
+ assert (err == REG_NOERROR);
+#endif
+
+ /* Set pmatch[] if we need. */
+ if (nmatch > 0)
+ {
+ int reg_idx;
+
+ /* Initialize registers. */
+ for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
+ pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
+
+ /* Set the points where matching start/end. */
+ pmatch[0].rm_so = 0;
+ pmatch[0].rm_eo = mctx.match_last;
+
+ if (!preg->no_sub && nmatch > 1)
+ {
+ err = set_regs (preg, &mctx, nmatch, pmatch,
+ dfa->has_plural_match && dfa->nbackref > 0);
+ if (BE (err != REG_NOERROR, 0))
+ goto free_return;
+ }
+
+ /* At last, add the offset to each register, since we slid
+ the buffers so that we could assume that the matching starts
+ from 0. */
+ for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
+ if (pmatch[reg_idx].rm_so != -1)
+ {
+#ifdef RE_ENABLE_I18N
+ if (BE (mctx.input.offsets_needed != 0, 0))
+ {
+ if (pmatch[reg_idx].rm_so == mctx.input.valid_len)
+ pmatch[reg_idx].rm_so += mctx.input.valid_raw_len - mctx.input.valid_len;
+ else
+ pmatch[reg_idx].rm_so = mctx.input.offsets[pmatch[reg_idx].rm_so];
+ if (pmatch[reg_idx].rm_eo == mctx.input.valid_len)
+ pmatch[reg_idx].rm_eo += mctx.input.valid_raw_len - mctx.input.valid_len;
+ else
+ pmatch[reg_idx].rm_eo = mctx.input.offsets[pmatch[reg_idx].rm_eo];
+ }
+#else
+ assert (mctx.input.offsets_needed == 0);
+#endif
+ pmatch[reg_idx].rm_so += match_first;
+ pmatch[reg_idx].rm_eo += match_first;
+ }
+ /* Registers the pattern cannot fill are reported as unset.  */
+ for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
+ {
+ pmatch[nmatch + reg_idx].rm_so = -1;
+ pmatch[nmatch + reg_idx].rm_eo = -1;
+ }
+
+ /* Where subexp_map redirects a subexpression to another one, copy that
+    other subexpression's offsets into its register.  */
+ if (dfa->subexp_map)
+ for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
+ if (dfa->subexp_map[reg_idx] != reg_idx)
+ {
+ pmatch[reg_idx + 1].rm_so
+ = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
+ pmatch[reg_idx + 1].rm_eo
+ = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
+ }
+ }
+
+ /* Common exit: release the state log, the match context (only when the
+    pattern has back-references) and the input string, then return ERR.  */
+ free_return:
+ re_free (mctx.state_log);
+ if (dfa->nbackref)
+ match_ctx_free (&mctx);
+ re_string_destruct (&mctx.input);
+ return err;
+}
+
+/* Replace mctx->state_log with a state array produced by sifting
+   backward from the halt node (mctx->last_node) at mctx->match_last.
+
+   When the pattern has back-references, a second array (lim_states) is
+   used as well.  If after a sift neither sifted_states[0] nor
+   lim_states[0] is set, the match end is moved back to the nearest
+   earlier index whose logged state is non-NULL and has `halt' set, the
+   halt node is recomputed, and the sift is retried.  Once a sift
+   succeeds, merge_state_array is called on the two arrays.
+
+   On success mctx->state_log, mctx->last_node and mctx->match_last are
+   updated and REG_NOERROR is returned.  Otherwise returns REG_ESPACE on
+   allocation failure, REG_NOMATCH when no earlier halting index is left,
+   or the error reported by sift_states_backward / merge_state_array.  */
+static reg_errcode_t
+prune_impossible_nodes (mctx)
+ re_match_context_t *mctx;
+{
+ re_dfa_t *const dfa = mctx->dfa;
+ int halt_node, match_last;
+ reg_errcode_t ret;
+ re_dfastate_t **sifted_states;
+ re_dfastate_t **lim_states = NULL;
+ re_sift_context_t sctx;
+#ifdef DEBUG
+ assert (mctx->state_log != NULL);
+#endif
+ match_last = mctx->match_last;
+ halt_node = mctx->last_node;
+ sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
+ if (BE (sifted_states == NULL, 0))
+ {
+ ret = REG_ESPACE;
+ goto free_return;
+ }
+ if (dfa->nbackref)
+ {
+ lim_states = re_malloc (re_dfastate_t *, match_last + 1);
+ if (BE (lim_states == NULL, 0))
+ {
+ ret = REG_ESPACE;
+ goto free_return;
+ }
+ while (1)
+ {
+ /* lim_states is cleared before every attempt.  */
+ memset (lim_states, '\0',
+ sizeof (re_dfastate_t *) * (match_last + 1));
+ sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
+ match_last);
+ ret = sift_states_backward (mctx, &sctx);
+ re_node_set_free (&sctx.limits);
+ if (BE (ret != REG_NOERROR, 0))
+ goto free_return;
+ if (sifted_states[0] != NULL || lim_states[0] != NULL)
+ break;
+ /* Nothing reached index 0: fall back to the previous index whose
+    logged state exists and is a halt state.  */
+ do
+ {
+ --match_last;
+ if (match_last < 0)
+ {
+ ret = REG_NOMATCH;
+ goto free_return;
+ }
+ } while (mctx->state_log[match_last] == NULL
+ || !mctx->state_log[match_last]->halt);
+ halt_node = check_halt_state_context (mctx,
+ mctx->state_log[match_last],
+ match_last);
+ }
+ ret = merge_state_array (dfa, sifted_states, lim_states,
+ match_last + 1);
+ re_free (lim_states);
+ /* Cleared so the shared cleanup below does not free it twice.  */
+ lim_states = NULL;
+ if (BE (ret != REG_NOERROR, 0))
+ goto free_return;
+ }
+ else
+ {
+ sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
+ ret = sift_states_backward (mctx, &sctx);
+ re_node_set_free (&sctx.limits);
+ if (BE (ret != REG_NOERROR, 0))
+ goto free_return;
+ }
+ /* Hand ownership of sifted_states to the match context.  */
+ re_free (mctx->state_log);
+ mctx->state_log = sifted_states;
+ sifted_states = NULL;
+ mctx->last_node = halt_node;
+ mctx->match_last = match_last;
+ ret = REG_NOERROR;
+ free_return:
+ re_free (sifted_states);
+ re_free (lim_states);
+ return ret;
+}
+
+/* Acquire an initial state and return it.
+ We must select appropriate initial state depending on the context,
+ since initial states may have constraints like "\<", "^", etc.. */
+
+static inline re_dfastate_t *
+acquire_init_state_context (err, mctx, idx)
+     reg_errcode_t *err;
+     const re_match_context_t *mctx;
+     int idx;
+{
+  re_dfa_t *const dfa = mctx->dfa;
+  unsigned int ctx;
+
+  /* An unconstrained initial state is valid in every context.  */
+  if (!dfa->init_state->has_constraint)
+    return dfa->init_state;
+
+  /* Classify the context just before IDX and pick the matching
+     precomputed initial state.  The tests run in a fixed order.  */
+  ctx = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
+
+  if (IS_WORD_CONTEXT (ctx))
+    return dfa->init_state_word;
+  if (IS_ORDINARY_CONTEXT (ctx))
+    return dfa->init_state;
+  if (IS_BEGBUF_CONTEXT (ctx) && IS_NEWLINE_CONTEXT (ctx))
+    return dfa->init_state_begbuf;
+  if (IS_NEWLINE_CONTEXT (ctx))
+    return dfa->init_state_nl;
+  if (IS_BEGBUF_CONTEXT (ctx))
+    /* Begin-of-buffer without newline has no cached state; build it
+       on demand from the entrance nodes of the initial state.  */
+    return re_acquire_state_context (err, dfa,
+                                     dfa->init_state->entrance_nodes,
+                                     ctx);
+
+  /* No context class matched; fall back to the generic initial state.  */
+  return dfa->init_state;
+}
+
+/* Check whether the regular expression matches the input string INPUT or
+   not, and return the index where the match ends, -1 if there is no
+   match, or -2 in case of an error.
+   FL_LONGEST_MATCH means we want the POSIX longest match.
+   If P_MATCH_FIRST is not NULL, and the match fails, it is advanced to the
+   next place where we may want to try matching.
+   Note that the matcher assumes that matching starts from the current
+   index of the buffer.
+
+   Every error path returns -2.  A reg_errcode_t must never be returned
+   directly: it is a positive value and the caller would take it for a
+   match end index.  */
+
+static int
+check_matching (mctx, fl_longest_match, p_match_first)
+     re_match_context_t *mctx;
+     int fl_longest_match;
+     int *p_match_first;
+{
+  re_dfa_t *const dfa = mctx->dfa;
+  reg_errcode_t err;
+  int match = 0;
+  int match_last = -1;
+  int cur_str_idx = re_string_cur_idx (&mctx->input);
+  re_dfastate_t *cur_state;
+  int at_init_state = p_match_first != NULL;
+  int next_start_idx = cur_str_idx;
+
+  err = REG_NOERROR;
+  cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
+  /* An initial state must not be NULL (invalid).  */
+  if (BE (cur_state == NULL, 0))
+    {
+      assert (err == REG_ESPACE);
+      return -2;
+    }
+
+  if (mctx->state_log != NULL)
+    {
+      mctx->state_log[cur_str_idx] = cur_state;
+
+      /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
+         later.  E.g. Processing back references.  */
+      if (BE (dfa->nbackref, 0))
+        {
+          at_init_state = 0;
+          err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
+          if (BE (err != REG_NOERROR, 0))
+            return -2;
+
+          if (cur_state->has_backref)
+            {
+              err = transit_state_bkref (mctx, &cur_state->nodes);
+              if (BE (err != REG_NOERROR, 0))
+                return -2;
+            }
+        }
+    }
+
+  /* If the RE accepts the empty string.  */
+  if (BE (cur_state->halt, 0))
+    {
+      if (!cur_state->has_constraint
+          || check_halt_state_context (mctx, cur_state, cur_str_idx))
+        {
+          if (!fl_longest_match)
+            return cur_str_idx;
+          else
+            {
+              match_last = cur_str_idx;
+              match = 1;
+            }
+        }
+    }
+
+  while (!re_string_eoi (&mctx->input))
+    {
+      re_dfastate_t *old_state = cur_state;
+      int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
+
+      /* Grow the input/state buffers before stepping past their end.  */
+      if (BE (next_char_idx >= mctx->input.bufs_len, 0)
+          || (BE (next_char_idx >= mctx->input.valid_len, 0)
+              && mctx->input.valid_len < mctx->input.len))
+        {
+          err = extend_buffers (mctx);
+          if (BE (err != REG_NOERROR, 0))
+            {
+              assert (err == REG_ESPACE);
+              return -2;
+            }
+        }
+
+      cur_state = transit_state (&err, mctx, cur_state);
+      if (mctx->state_log != NULL)
+        cur_state = merge_state_with_log (&err, mctx, cur_state);
+
+      if (cur_state == NULL)
+        {
+          /* Reached the invalid state or an error.  Try to recover a valid
+             state using the state log, if available and if we have not
+             already found a valid (even if not the longest) match.  */
+          if (BE (err != REG_NOERROR, 0))
+            return -2;
+
+          if (mctx->state_log == NULL
+              || (match && !fl_longest_match)
+              || (cur_state = find_recover_state (&err, mctx)) == NULL)
+            break;
+        }
+
+      /* While the DFA stays in the same state as before the transition,
+         the next start candidate moves along with the input.  */
+      if (BE (at_init_state, 0))
+        {
+          if (old_state == cur_state)
+            next_start_idx = next_char_idx;
+          else
+            at_init_state = 0;
+        }
+
+      if (cur_state->halt)
+        {
+          /* Reached a halt state.
+             Check the halt state can satisfy the current context.  */
+          if (!cur_state->has_constraint
+              || check_halt_state_context (mctx, cur_state,
+                                           re_string_cur_idx (&mctx->input)))
+            {
+              /* We found an appropriate halt state.  */
+              match_last = re_string_cur_idx (&mctx->input);
+              match = 1;
+
+              /* We found a match, do not modify match_first below.  */
+              p_match_first = NULL;
+              if (!fl_longest_match)
+                break;
+            }
+        }
+    }
+
+  if (p_match_first)
+    *p_match_first += next_start_idx;
+
+  return match_last;
+}
+
+/* Return 1 if NODE is an END_OF_RE node whose constraint (if any) is
+   satisfied by CONTEXT, and 0 otherwise.  */
+
+static int check_halt_node_context (dfa, node, context)
+     const re_dfa_t *dfa;
+     int node;
+     unsigned int context;
+{
+  unsigned int constraint;
+
+  /* Only the end-of-RE node can halt the automaton.  */
+  if (dfa->nodes[node].type != END_OF_RE)
+    return 0;
+
+  constraint = dfa->nodes[node].constraint;
+  if (constraint == 0)
+    return 1;
+
+  return NOT_SATISFY_NEXT_CONSTRAINT (constraint, context) ? 0 : 1;
+}
+
+/* Look for a halt node in STATE that is acceptable in the context found
+   at string index IDX.  Return that node, or 0 if STATE has none.  */
+
+static int
+check_halt_state_context (mctx, state, idx)
+     const re_match_context_t *mctx;
+     const re_dfastate_t *state;
+     int idx;
+{
+  const re_node_set *members = &state->nodes;
+  unsigned int ctx;
+  int pos = 0;
+#ifdef DEBUG
+  assert (state->halt);
+#endif
+  ctx = re_string_context_at (&mctx->input, idx, mctx->eflags);
+  while (pos < members->nelem)
+    {
+      int candidate = members->elems[pos];
+      if (check_halt_node_context (mctx->dfa, candidate, ctx))
+        return candidate;
+      ++pos;
+    }
+  return 0;
+}
+
+/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
+ corresponding to the DFA).
+ Return the destination node, and update EPS_VIA_NODES, return -1 in case
+ of errors.
+
+ More precisely, as the code below shows:
+ - *PIDX is the current string index; it is advanced when NODE consumes
+ input.
+ - REGS (NREGS entries) are the registers recorded so far; they are read
+ for OP_BACK_REF nodes and handed to push_fail_stack when an
+ alternative is saved.
+ - FS is the fail stack, or NULL when no backtracking is wanted.
+ - -1 is returned when no valid destination exists, and -2 when
+ re_node_set_insert or push_fail_stack reports a failure. */
+
+static int
+proceed_next_node (mctx, nregs, regs, pidx, node, eps_via_nodes, fs)
+ const re_match_context_t *mctx;
+ regmatch_t *regs;
+ int nregs, *pidx, node;
+ re_node_set *eps_via_nodes;
+ struct re_fail_stack_t *fs;
+{
+ re_dfa_t *const dfa = mctx->dfa;
+ int i, err, dest_node;
+ dest_node = -1;
+ if (IS_EPSILON_NODE (dfa->nodes[node].type))
+ {
+ re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
+ re_node_set *edests = &dfa->edests[node];
+ int dest_node; /* Shadows the function-level DEST_NODE in this branch. */
+ err = re_node_set_insert (eps_via_nodes, node);
+ if (BE (err < 0, 0))
+ return -2;
+ /* Pick up a valid destination, or return -1 if none is found. */
+ for (dest_node = -1, i = 0; i < edests->nelem; ++i)
+ {
+ int candidate = edests->elems[i];
+ if (!re_node_set_contains (cur_nodes, candidate)) /* Not in the logged state. */
+ continue;
+ if (dest_node == -1)
+ dest_node = candidate;
+
+ else
+ {
+ /* In order to avoid infinite loop like "(a*)*", return the second
+ epsilon-transition if the first was already considered. */
+ if (re_node_set_contains (eps_via_nodes, dest_node))
+ return candidate;
+
+ /* Otherwise, push the second epsilon-transition on the fail stack. */
+ else if (fs != NULL
+ && push_fail_stack (fs, *pidx, candidate, nregs, regs,
+ eps_via_nodes))
+ return -2;
+
+ /* We know we are going to exit. */
+ break;
+ }
+ }
+ return dest_node;
+ }
+ else
+ {
+ int naccepted = 0;
+ re_token_type_t type = dfa->nodes[node].type;
+
+#ifdef RE_ENABLE_I18N
+ if (dfa->nodes[node].accept_mb)
+ naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
+ else
+#endif /* RE_ENABLE_I18N */
+ if (type == OP_BACK_REF)
+ {
+ int subexp_idx = dfa->nodes[node].opr.idx + 1;
+ naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so; /* Length of the referenced group. */
+ if (fs != NULL)
+ {
+ if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1) /* Group not (fully) recorded. */
+ return -1;
+ else if (naccepted)
+ {
+ char *buf = (char *) re_string_get_buffer (&mctx->input);
+ if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
+ naccepted) != 0) /* Input at *PIDX differs from the group text. */
+ return -1;
+ }
+ }
+
+ if (naccepted == 0)
+ {
+ /* Empty back reference: handled like an epsilon transition. */
+ err = re_node_set_insert (eps_via_nodes, node);
+ if (BE (err < 0, 0))
+ return -2;
+ dest_node = dfa->edests[node].elems[0];
+ if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
+ dest_node))
+ return dest_node;
+ }
+ }
+
+ if (naccepted != 0
+ || check_node_accept (mctx, dfa->nodes + node, *pidx))
+ {
+ dest_node = dfa->nexts[node];
+ *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted; /* One unit for a plain accept, else NACCEPTED. */
+ if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
+ || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
+ dest_node))) /* With a fail stack, reject destinations the log does not hold. */
+ return -1;
+ re_node_set_empty (eps_via_nodes); /* Input was consumed: restart the epsilon trail. */
+ return dest_node;
+ }
+ }
+ return -1;
+}
+
+/* Push a backtracking point onto the fail stack FS: string index STR_IDX,
+   node DEST_NODE, a private copy of the NREGS registers in REGS, and a
+   copy of EPS_VIA_NODES.
+   Return REG_NOERROR on success, REG_ESPACE if the stack cannot be grown
+   or the register copy cannot be allocated, or the error reported by
+   re_node_set_init_copy.
+
+   FS->num is bumped only once the new entry owns a REGS buffer.  If it
+   were bumped first, an allocation failure would leave FS counting an
+   entry whose pointers are uninitialised, and free_fail_stack_return
+   would then free garbage.  */
+
+static reg_errcode_t
+push_fail_stack (fs, str_idx, dest_node, nregs, regs, eps_via_nodes)
+     struct re_fail_stack_t *fs;
+     int str_idx, dest_node, nregs;
+     regmatch_t *regs;
+     re_node_set *eps_via_nodes;
+{
+  reg_errcode_t err;
+  int num = fs->num;
+
+  /* Double the stack when it is full.  Keep the old pointer until the
+     reallocation is known to have succeeded.  */
+  if (num == fs->alloc)
+    {
+      struct re_fail_stack_ent_t *new_array;
+      new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
+                                       * fs->alloc * 2));
+      if (new_array == NULL)
+        return REG_ESPACE;
+      fs->alloc *= 2;
+      fs->stack = new_array;
+    }
+
+  fs->stack[num].idx = str_idx;
+  fs->stack[num].node = dest_node;
+  fs->stack[num].regs = re_malloc (regmatch_t, nregs);
+  if (fs->stack[num].regs == NULL)
+    return REG_ESPACE;
+
+  /* The entry now owns memory, so make it visible to the cleanup code.  */
+  fs->num = num + 1;
+
+  memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
+  err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
+  return err;
+}
+
+/* Pop the most recent backtracking point from FS.  The saved string index
+   goes to *PIDX, the saved registers are copied into REGS (NREGS
+   entries), and the saved node set replaces *EPS_VIA_NODES, whose old
+   storage is released first.  Return the saved node.  */
+
+static int
+pop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes)
+     struct re_fail_stack_t *fs;
+     int *pidx, nregs;
+     regmatch_t *regs;
+     re_node_set *eps_via_nodes;
+{
+  struct re_fail_stack_ent_t *top;
+
+  fs->num -= 1;
+  assert (fs->num >= 0);
+  top = fs->stack + fs->num;
+
+  *pidx = top->idx;
+  memcpy (regs, top->regs, sizeof (regmatch_t) * nregs);
+
+  /* Drop the current epsilon trail and the saved register copy, then
+     take over the saved epsilon trail by plain structure assignment.  */
+  re_node_set_free (eps_via_nodes);
+  re_free (top->regs);
+  *eps_via_nodes = top->eps_via_nodes;
+
+  return top->node;
+}
+
+/* Set the positions where the subexpressions are starts/ends to registers
+ PMATCH.
+ Note: We assume that pmatch[0] is already set, and
+ pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.
+
+ The NFA is walked from DFA->init_node, starting at string index
+ pmatch[0].rm_so, one node at a time via proceed_next_node; update_regs
+ records subexpression boundaries on the way. If FL_BACKTRACK is
+ nonzero a fail stack is allocated so that alternatives can be retried.
+
+ Returns REG_ESPACE when the fail stack cannot be allocated or
+ proceed_next_node returns -2, REG_NOMATCH when no destination exists
+ and there is no fail stack, and REG_NOERROR otherwise.
+
+ NOTE(review): pop_fail_stack asserts that the stack is not empty, and
+ neither call below checks fs->num first -- confirm an empty stack
+ cannot be reached here. */
+
+static reg_errcode_t
+set_regs (preg, mctx, nmatch, pmatch, fl_backtrack)
+ const regex_t *preg;
+ const re_match_context_t *mctx;
+ size_t nmatch;
+ regmatch_t *pmatch;
+ int fl_backtrack;
+{
+ re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
+ int idx, cur_node;
+ re_node_set eps_via_nodes;
+ struct re_fail_stack_t *fs;
+ struct re_fail_stack_t fs_body = { 0, 2, NULL }; /* Initial values; fs->alloc (2) sizes the first allocation below. */
+ regmatch_t *prev_idx_match;
+
+#ifdef DEBUG
+ assert (nmatch > 1);
+ assert (mctx->state_log != NULL);
+#endif
+ if (fl_backtrack)
+ {
+ fs = &fs_body;
+ fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
+ if (fs->stack == NULL)
+ return REG_ESPACE;
+ }
+ else
+ fs = NULL;
+
+ cur_node = dfa->init_node;
+ re_node_set_init_empty (&eps_via_nodes);
+
+ prev_idx_match = (regmatch_t *) alloca (sizeof (regmatch_t) * nmatch); /* Stack allocation sized by NMATCH. */
+ memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
+
+ for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
+ {
+ update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
+
+ if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
+ {
+ /* Reached the final node at the end of the match. */
+ int reg_idx;
+ if (fs)
+ {
+ for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
+ if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1) /* Opened but never closed. */
+ break;
+ if (reg_idx == nmatch)
+ {
+ re_node_set_free (&eps_via_nodes);
+ return free_fail_stack_return (fs);
+ }
+ cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
+ &eps_via_nodes); /* Some register is incomplete: retry a saved alternative. */
+ }
+ else
+ {
+ re_node_set_free (&eps_via_nodes);
+ return REG_NOERROR;
+ }
+ }
+
+ /* Proceed to next node. */
+ cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
+ &eps_via_nodes, fs);
+
+ if (BE (cur_node < 0, 0))
+ {
+ if (BE (cur_node == -2, 0)) /* -2 is proceed_next_node's failure code. */
+ {
+ re_node_set_free (&eps_via_nodes);
+ free_fail_stack_return (fs);
+ return REG_ESPACE;
+ }
+ if (fs)
+ cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
+ &eps_via_nodes);
+ else
+ {
+ re_node_set_free (&eps_via_nodes);
+ return REG_NOMATCH;
+ }
+ }
+ }
+ re_node_set_free (&eps_via_nodes);
+ return free_fail_stack_return (fs);
+}
+
+/* Release every entry still held by the fail stack FS and the stack array
+   itself.  FS may be NULL, in which case nothing is done.  Always returns
+   REG_NOERROR so that callers can write `return free_fail_stack_return
+   (fs);'.  */
+
+static reg_errcode_t
+free_fail_stack_return (fs)
+     struct re_fail_stack_t *fs;
+{
+  int remaining;
+
+  if (fs == NULL)
+    return REG_NOERROR;
+
+  remaining = 0;
+  while (remaining < fs->num)
+    {
+      struct re_fail_stack_ent_t *ent = fs->stack + remaining;
+      re_node_set_free (&ent->eps_via_nodes);
+      re_free (ent->regs);
+      ++remaining;
+    }
+  re_free (fs->stack);
+
+  return REG_NOERROR;
+}
+
+/* Record in PMATCH the effect of standing on CUR_NODE at string index
+   CUR_IDX.  Only OP_OPEN_SUBEXP and OP_CLOSE_SUBEXP nodes whose register
+   number is below NMATCH change anything.  PREV_IDX_MATCH is a snapshot
+   of the registers that is refreshed on non-empty group matches and
+   copied back when an optional group matched the empty string again.  */
+
+static void
+update_regs (dfa, pmatch, prev_idx_match, cur_node, cur_idx, nmatch)
+     re_dfa_t *dfa;
+     regmatch_t *pmatch, *prev_idx_match;
+     int cur_node, cur_idx, nmatch;
+{
+  const size_t regs_size = sizeof (regmatch_t) * nmatch;
+  int slot;
+
+  switch (dfa->nodes[cur_node].type)
+    {
+    case OP_OPEN_SUBEXP:
+      slot = dfa->nodes[cur_node].opr.idx + 1;
+      if (slot >= nmatch)
+        return;
+      /* We are at the first node of this sub expression.  */
+      pmatch[slot].rm_so = cur_idx;
+      pmatch[slot].rm_eo = -1;
+      return;
+
+    case OP_CLOSE_SUBEXP:
+      slot = dfa->nodes[cur_node].opr.idx + 1;
+      if (slot >= nmatch)
+        return;
+      /* We are at the last node of this sub expression.  */
+      if (pmatch[slot].rm_so < cur_idx)
+        {
+          /* Non-empty match, or we are not inside an optional
+             subexpression: accept it right away and snapshot it.  */
+          pmatch[slot].rm_eo = cur_idx;
+          memcpy (prev_idx_match, pmatch, regs_size);
+          return;
+        }
+      if (dfa->nodes[cur_node].opt_subexp
+          && prev_idx_match[slot].rm_so != -1)
+        {
+          /* An optional subexpression, like (a?)*, matched the empty
+             string and this is not its first match.  Restore the old
+             registers so that inner groups are undone too, as in
+             ((a?))*.  */
+          memcpy (pmatch, prev_idx_match, regs_size);
+          return;
+        }
+      /* The subexpression is complete but may be part of an optional
+         one, so PREV_IDX_MATCH is left alone.  */
+      pmatch[slot].rm_eo = cur_idx;
+      return;
+
+    default:
+      return;
+    }
+}
+
+/* This function checks the STATE_LOG from SCTX->last_str_idx down to 0
+ and sifts the nodes in each state according to the following rules.
+ The updated state_log is written to STATE_LOG.
+
+ Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
+ 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
+ If `a' isn't the LAST_NODE and `a' can't epsilon transit to
+ the LAST_NODE, we throw away the node `a'.
+ 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
+ string `s' and transit to `b':
+ i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
+ away the node `a'.
+ ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
+ thrown away, we throw away the node `a'.
+ 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
+ i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
+ node `a'.
+ ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
+ we throw away the node `a'. */
+
+#define STATE_NODE_CONTAINS(state,node) \
+ ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
+
+/* Walk the state log backwards from SCTX->last_str_idx to index 0 and
+   fill SCTX->sifted_states with the sifted state for each index.
+   Return REG_NOERROR on success or the error reported by a helper.  */
+
+static reg_errcode_t
+sift_states_backward (mctx, sctx)
+     re_match_context_t *mctx;
+     re_sift_context_t *sctx;
+{
+  reg_errcode_t err;
+  int empty_run = 0;
+  int pos = sctx->last_str_idx;
+  re_node_set cur_dest;
+
+#ifdef DEBUG
+  assert (mctx->state_log != NULL && mctx->state_log[pos] != NULL);
+#endif
+
+  /* Seed the sifted state at the last index with the last node; the
+     update step adds the nodes that can epsilon-transit to it.  */
+  err = re_node_set_init_1 (&cur_dest, sctx->last_node);
+  if (BE (err != REG_NOERROR, 0))
+    return err;
+  err = update_cur_sifted_state (mctx, sctx, pos, &cur_dest);
+
+  /* Then check each state in the state log, moving towards index 0.  */
+  while (err == REG_NOERROR && pos > 0)
+    {
+      /* Count consecutive indices that ended up without a sifted state.  */
+      if (sctx->sifted_states[pos] != NULL)
+        empty_run = 0;
+      else
+        ++empty_run;
+
+      if (empty_run > mctx->max_mb_elem_len)
+        {
+          /* The run is longer than max_mb_elem_len: clear everything
+             below POS and stop, still reporting success.  */
+          memset (sctx->sifted_states, '\0',
+                  sizeof (re_dfastate_t *) * pos);
+          break;
+        }
+
+      re_node_set_empty (&cur_dest);
+      --pos;
+
+      if (mctx->state_log[pos] != NULL)
+        {
+          err = build_sifted_states (mctx, sctx, pos, &cur_dest);
+          if (BE (err != REG_NOERROR, 0))
+            break;
+        }
+
+      /* Add the nodes of the logged state that can epsilon-transit to a
+         node in CUR_DEST, and store the resulting sifted state.  */
+      err = update_cur_sifted_state (mctx, sctx, pos, &cur_dest);
+    }
+
+  re_node_set_free (&cur_dest);
+  return err;
+}
+
+/* Collect into CUR_DEST the non-epsilon nodes of state_log[STR_IDX] that
+   accept the input at STR_IDX and whose successor survived sifting at the
+   index they lead to.  Return REG_ESPACE if a node cannot be inserted,
+   REG_NOERROR otherwise.  */
+
+static reg_errcode_t
+build_sifted_states (mctx, sctx, str_idx, cur_dest)
+     re_match_context_t *mctx;
+     re_sift_context_t *sctx;
+     int str_idx;
+     re_node_set *cur_dest;
+{
+  re_dfa_t *const dfa = mctx->dfa;
+  /* The logged state with its epsilon nodes already filtered out.  */
+  re_node_set *sources = &mctx->state_log[str_idx]->non_eps_nodes;
+  int pos;
+
+  for (pos = 0; pos < sources->nelem; pos++)
+    {
+      int prev_node = sources->elems[pos];
+      int naccepted = 0;
+
+#ifdef DEBUG
+      re_token_type_t type = dfa->nodes[prev_node].type;
+      assert (!IS_EPSILON_NODE (type));
+#endif
+#ifdef RE_ENABLE_I18N
+      /* If the node may accept `multi byte'.  */
+      if (dfa->nodes[prev_node].accept_mb)
+        naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
+                                         str_idx, sctx->last_str_idx);
+#endif /* RE_ENABLE_I18N */
+
+      /* Back references are not checked here; see
+         update_cur_sifted_state().  */
+      if (naccepted == 0)
+        {
+          if (!check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
+              || !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
+                                       dfa->nexts[prev_node]))
+            continue;
+          naccepted = 1;
+        }
+
+      /* Honour the limits of the sift context, if there are any.  */
+      if (sctx->limits.nelem != 0
+          && check_dst_limits (mctx, &sctx->limits,
+                               dfa->nexts[prev_node], str_idx + naccepted,
+                               prev_node, str_idx))
+        continue;
+
+      if (BE (re_node_set_insert (cur_dest, prev_node) == -1, 0))
+        return REG_ESPACE;
+    }
+
+  return REG_NOERROR;
+}
+
+/* Helper functions. */
+
+/* Make sure the state log can be indexed at NEXT_STATE_LOG_IDX: extend
+   the buffers when that index lies beyond them, and clear the log
+   entries between the previous top and the new index.  Return the error
+   from extend_buffers, or REG_NOERROR.  */
+
+static reg_errcode_t
+clean_state_log_if_needed (mctx, next_state_log_idx)
+     re_match_context_t *mctx;
+     int next_state_log_idx;
+{
+  const int prev_top = mctx->state_log_top;
+  const int beyond_buffer = next_state_log_idx >= mctx->input.bufs_len;
+  const int beyond_valid = (next_state_log_idx >= mctx->input.valid_len
+                            && mctx->input.valid_len < mctx->input.len);
+
+  if (beyond_buffer || beyond_valid)
+    {
+      reg_errcode_t err = extend_buffers (mctx);
+      if (BE (err != REG_NOERROR, 0))
+        return err;
+    }
+
+  if (prev_top >= next_state_log_idx)
+    return REG_NOERROR;
+
+  /* Entries (prev_top, next_state_log_idx] have never been written.  */
+  memset (mctx->state_log + prev_top + 1, '\0',
+          sizeof (re_dfastate_t *) * (next_state_log_idx - prev_top));
+  mctx->state_log_top = next_state_log_idx;
+  return REG_NOERROR;
+}
+
+/* Merge the first NUM entries of SRC into DST, element by element.  An
+   empty DST slot simply takes the SRC entry; where both are set, DST
+   becomes the state for the union of the two node sets.  Return
+   REG_NOERROR, or the first error met while building a union.  */
+
+static reg_errcode_t
+merge_state_array (dfa, dst, src, num)
+     re_dfa_t *dfa;
+     re_dfastate_t **dst;
+     re_dfastate_t **src;
+     int num;
+{
+  int slot;
+
+  for (slot = 0; slot < num; ++slot)
+    {
+      re_node_set both;
+      reg_errcode_t err;
+
+      if (dst[slot] == NULL)
+        {
+          dst[slot] = src[slot];
+          continue;
+        }
+      if (src[slot] == NULL)
+        continue;
+
+      err = re_node_set_init_union (&both, &dst[slot]->nodes,
+                                    &src[slot]->nodes);
+      if (BE (err != REG_NOERROR, 0))
+        return err;
+
+      dst[slot] = re_acquire_state (&err, dfa, &both);
+      /* The temporary union is released whether or not the lookup
+         succeeded.  */
+      re_node_set_free (&both);
+      if (BE (err != REG_NOERROR, 0))
+        return err;
+    }
+
+  return REG_NOERROR;
+}
+
+/* Finish the sifted state for STR_IDX.  DEST_NODES holds the nodes found
+   so far; the nodes of state_log[STR_IDX] that can epsilon-transit into
+   them are added, the sift limits are applied, and the result is stored
+   in SCTX->sifted_states[STR_IDX] (NULL when DEST_NODES is empty).
+   If the logged state contains back references they are sifted too.  */
+
+static reg_errcode_t
+update_cur_sifted_state (mctx, sctx, str_idx, dest_nodes)
+     re_match_context_t *mctx;
+     re_sift_context_t *sctx;
+     int str_idx;
+     re_node_set *dest_nodes;
+{
+  re_dfa_t *const dfa = mctx->dfa;
+  re_dfastate_t *const logged = mctx->state_log[str_idx];
+  const re_node_set *candidates = NULL;
+  reg_errcode_t err;
+
+  if (logged != NULL)
+    candidates = &logged->nodes;
+
+  if (dest_nodes->nelem != 0)
+    {
+      if (candidates != NULL)
+        {
+          /* First add the nodes which can epsilon-transit to a node in
+             DEST_NODES.  */
+          err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
+          if (BE (err != REG_NOERROR, 0))
+            return err;
+
+          /* Then apply the limitations of the current sift context.  */
+          if (sctx->limits.nelem)
+            {
+              err = check_subexp_limits (dfa, dest_nodes, candidates,
+                                         &sctx->limits, mctx->bkref_ents,
+                                         str_idx);
+              if (BE (err != REG_NOERROR, 0))
+                return err;
+            }
+        }
+
+      sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
+      if (BE (err != REG_NOERROR, 0))
+        return err;
+    }
+  else
+    sctx->sifted_states[str_idx] = NULL;
+
+  if (candidates == NULL || !logged->has_backref)
+    return REG_NOERROR;
+
+  return sift_states_bkref (mctx, sctx, str_idx, candidates);
+}
+
+/* Add to DEST_NODES every node of CANDIDATES that can epsilon-transit to
+   a node already in DEST_NODES.
+
+   The inverse epsilon closure of the whole set is cached in the DFA state
+   that corresponds to DEST_NODES (state->inveclosure); it is computed the
+   first time it is needed, which is recognised by its `alloc' field
+   still being zero.
+
+   Return REG_NOERROR on success, REG_ESPACE when the cache cannot be
+   built, or the error reported by re_acquire_state or
+   re_node_set_add_intersect.  Callers must pass a non-empty DEST_NODES;
+   the state returned by re_acquire_state is dereferenced unchecked.  */
+
+static reg_errcode_t
+add_epsilon_src_nodes (dfa, dest_nodes, candidates)
+     re_dfa_t *dfa;
+     re_node_set *dest_nodes;
+     const re_node_set *candidates;
+{
+  reg_errcode_t err = REG_NOERROR;
+  int i;
+
+  re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
+  if (BE (err != REG_NOERROR, 0))
+    return err;
+
+  if (!state->inveclosure.alloc)
+    {
+      err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
+      if (BE (err != REG_NOERROR, 0))
+        return REG_ESPACE;
+      for (i = 0; i < dest_nodes->nelem; i++)
+        {
+          /* A failed merge must not be ignored: continuing would
+             intersect against an incomplete inverse closure.  */
+          err = re_node_set_merge (&state->inveclosure,
+                                   dfa->inveclosures + dest_nodes->elems[i]);
+          if (BE (err != REG_NOERROR, 0))
+            return REG_ESPACE;
+        }
+    }
+  return re_node_set_add_intersect (dest_nodes, candidates,
+                                    &state->inveclosure);
+}
+
+/* Remove from DEST_NODES the nodes in the inverse epsilon closure of NODE,
+   except those that must stay because they (or an epsilon node they pass
+   through) also lead to a node of DEST_NODES outside that closure.
+   CANDIDATES restricts which nodes may be kept.  Return REG_NOERROR, or
+   the error from re_node_set_add_intersect.  */
+
+static reg_errcode_t
+sub_epsilon_src_nodes (dfa, node, dest_nodes, candidates)
+     re_dfa_t *dfa;
+     int node;
+     re_node_set *dest_nodes;
+     const re_node_set *candidates;
+{
+  re_node_set *closure = dfa->inveclosures + node;
+  re_node_set keep;
+  int pos;
+
+  re_node_set_init_empty (&keep);
+
+  /* Pass 1: find the nodes that have to be kept.  */
+  for (pos = 0; pos < closure->nelem; ++pos)
+    {
+      int member = closure->elems[pos];
+      int first_dest, second_dest, leaves_closure;
+      reg_errcode_t err;
+
+      if (member == node
+          || !IS_EPSILON_NODE (dfa->nodes[member].type))
+        continue;
+
+      first_dest = dfa->edests[member].elems[0];
+      second_dest = -1;
+      if (dfa->edests[member].nelem > 1)
+        second_dest = dfa->edests[member].elems[1];
+
+      /* Does MEMBER reach a surviving node outside the closure?  */
+      leaves_closure = (!re_node_set_contains (closure, first_dest)
+                        && re_node_set_contains (dest_nodes, first_dest));
+      if (!leaves_closure)
+        leaves_closure = (second_dest > 0
+                          && !re_node_set_contains (closure, second_dest)
+                          && re_node_set_contains (dest_nodes, second_dest));
+      if (!leaves_closure)
+        continue;
+
+      err = re_node_set_add_intersect (&keep, candidates,
+                                       dfa->inveclosures + member);
+      if (BE (err != REG_NOERROR, 0))
+        {
+          re_node_set_free (&keep);
+          return err;
+        }
+    }
+
+  /* Pass 2: drop every closure member that is not in the keep set.  */
+  for (pos = 0; pos < closure->nelem; ++pos)
+    {
+      int member = closure->elems[pos];
+      int where;
+
+      if (re_node_set_contains (&keep, member))
+        continue;
+      /* re_node_set_contains yields a 1-based position here, hence
+         the -1.  */
+      where = re_node_set_contains (dest_nodes, member) - 1;
+      re_node_set_remove_at (dest_nodes, where);
+    }
+
+  re_node_set_free (&keep);
+  return REG_NOERROR;
+}
+
+/* Check the transition from SRC_NODE at SRC_IDX to DST_NODE at DST_IDX
+   against every back-reference limitation listed in LIMITS.  Return 1 if
+   some limitation places the two ends differently relative to its
+   subexpression (the transition is forbidden), 0 otherwise.  */
+
+static int
+check_dst_limits (mctx, limits, dst_node, dst_idx, src_node, src_idx)
+     re_match_context_t *mctx;
+     re_node_set *limits;
+     int dst_node, dst_idx, src_node, src_idx;
+{
+  re_dfa_t *const dfa = mctx->dfa;
+  int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
+  int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
+  int n;
+
+  for (n = 0; n < limits->nelem; ++n)
+    {
+      int limit = limits->elems[n];
+      struct re_backref_cache_entry *ent = mctx->bkref_ents + limit;
+      int subexp_idx = dfa->nodes[ent->node].opr.idx;
+      int dst_pos, src_pos;
+
+      dst_pos = check_dst_limits_calc_pos (mctx, limit, subexp_idx,
+                                           dst_node, dst_idx, dst_bkref_idx);
+      src_pos = check_dst_limits_calc_pos (mctx, limit, subexp_idx,
+                                           src_node, src_idx, src_bkref_idx);
+
+      /* Equal positions cover the cases
+           <src> <dst> ( <subexp> )
+           ( <subexp> ) <src> <dst>
+           ( <subexp1> <src> <subexp2> <dst> <subexp3> )
+         where this limitation is unrelated.  Anything else violates
+         it.  */
+      if (src_pos != dst_pos)
+        return 1;
+    }
+
+  return 0;
+}
+
+/* Decide where FROM_NODE lies relative to subexpression SUBEXP_IDX when
+   the string index sits on a boundary of that subexpression.
+   BOUNDARIES has bit 0 set when the index equals the start of the
+   subexpression and bit 1 set when it equals the end.
+   Return -1 (before the subexpression), 0 (inside it) or 1 (after it).
+   BKREF_IDX is the first back-reference cache entry for the string index
+   or -1 if there is none.
+
+   The per-entry bitmap eps_reachable_subexps_map only has room for a
+   limited number of subexpressions.  It is consulted and updated only
+   when SUBEXP_IDX fits: shifting by the width of the map or more would be
+   undefined behaviour or test a bit that does not exist.  Larger
+   indices are treated as "possibly reachable" and always recurse.  */
+
+static int
+check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, from_node, bkref_idx)
+     re_match_context_t *mctx;
+     int boundaries, subexp_idx, from_node, bkref_idx;
+{
+  re_dfa_t *const dfa = mctx->dfa;
+  re_node_set *eclosures = dfa->eclosures + from_node;
+  int node_idx;
+
+  /* We are on the boundary: examine the nodes on the epsilon closure.  */
+  for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
+    {
+      int node = eclosures->elems[node_idx];
+      switch (dfa->nodes[node].type)
+        {
+        case OP_BACK_REF:
+          if (bkref_idx != -1)
+            {
+              struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
+              do
+                {
+                  int dst, cpos;
+                  /* Nonzero when SUBEXP_IDX has a bit in the map.  The
+                     map is assumed to be no wider than unsigned long.  */
+                  int in_map
+                    = (subexp_idx
+                       < 8 * sizeof (ent->eps_reachable_subexps_map));
+
+                  if (ent->node != node)
+                    continue;
+
+                  if (in_map
+                      && !(ent->eps_reachable_subexps_map
+                           & (1UL << subexp_idx)))
+                    continue;
+
+                  /* Recurse trying to reach the OP_OPEN_SUBEXP and
+                     OP_CLOSE_SUBEXP cases below.  But, if the
+                     destination node is the same node as the source
+                     node, don't recurse because it would cause an
+                     infinite loop: a regex that exhibits this behavior
+                     is ()\1*\1*  */
+                  dst = dfa->edests[node].elems[0];
+                  if (dst == from_node)
+                    {
+                      if (boundaries & 1)
+                        return -1;
+                      else /* if (boundaries & 2) */
+                        return 0;
+                    }
+
+                  cpos =
+                    check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
+                                                 dst, bkref_idx);
+                  if (cpos == -1 /* && (boundaries & 1) */)
+                    return -1;
+                  if (cpos == 0 && (boundaries & 2))
+                    return 0;
+
+                  /* Remember that this entry cannot reach the
+                     subexpression, so it is skipped next time.  */
+                  if (in_map)
+                    ent->eps_reachable_subexps_map &= ~(1UL << subexp_idx);
+                }
+              while (ent++->more);
+            }
+          break;
+
+        case OP_OPEN_SUBEXP:
+          if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
+            return -1;
+          break;
+
+        case OP_CLOSE_SUBEXP:
+          if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
+            return 0;
+          break;
+
+        default:
+          break;
+        }
+    }
+
+  return (boundaries & 2) ? 1 : 0;
+}
+
+/* Locate string index STR_IDX (with FROM_NODE) relative to the
+   subexpression described by back-reference cache entry LIMIT.
+   Return -1 before the subexpression, 1 after it, 0 strictly inside it;
+   on either boundary the answer comes from
+   check_dst_limits_calc_pos_1.  */
+
+static int
+check_dst_limits_calc_pos (mctx, limit, subexp_idx, from_node, str_idx, bkref_idx)
+     re_match_context_t *mctx;
+     int limit, subexp_idx, from_node, str_idx, bkref_idx;
+{
+  struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
+  int boundaries = 0;
+
+  /* Outside the range of the subexpression.  */
+  if (str_idx < lim->subexp_from)
+    return -1;
+  if (str_idx > lim->subexp_to)
+    return 1;
+
+  /* Bit 0: on the opening boundary.  Bit 1: on the closing boundary.  */
+  if (str_idx == lim->subexp_from)
+    boundaries |= 1;
+  if (str_idx == lim->subexp_to)
+    boundaries |= 2;
+
+  /* Strictly within the subexpression.  */
+  if (!boundaries)
+    return 0;
+
+  /* On a boundary the epsilon closure decides.  */
+  return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
+                                      from_node, bkref_idx);
+}
+
+/* Check the limitations of sub expressions LIMITS, and remove the nodes
+ which are against limitations from DEST_NODES.
+
+ LIMITS holds indices into BKREF_ENTS. STR_IDX is the string index whose
+ sifted state DEST_NODES is being built, and CANDIDATES is forwarded to
+ sub_epsilon_src_nodes, which performs the actual removal.
+ Returns REG_NOERROR, or the first error from sub_epsilon_src_nodes. */
+
+static reg_errcode_t
+check_subexp_limits (dfa, dest_nodes, candidates, limits, bkref_ents, str_idx)
+ re_dfa_t *dfa;
+ re_node_set *dest_nodes;
+ const re_node_set *candidates;
+ re_node_set *limits;
+ struct re_backref_cache_entry *bkref_ents;
+ int str_idx;
+{
+ reg_errcode_t err;
+ int node_idx, lim_idx;
+
+ for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
+ {
+ int subexp_idx;
+ struct re_backref_cache_entry *ent;
+ ent = bkref_ents + limits->elems[lim_idx];
+
+ if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
+ continue; /* This is unrelated limitation. */
+
+ subexp_idx = dfa->nodes[ent->node].opr.idx;
+ if (ent->subexp_to == str_idx)
+ {
+ /* STR_IDX is where the referenced subexpression ends: find the
+ open/close nodes of that subexpression in DEST_NODES. */
+ int ops_node = -1;
+ int cls_node = -1;
+ for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
+ {
+ int node = dest_nodes->elems[node_idx];
+ re_token_type_t type = dfa->nodes[node].type;
+ if (type == OP_OPEN_SUBEXP
+ && subexp_idx == dfa->nodes[node].opr.idx)
+ ops_node = node;
+ else if (type == OP_CLOSE_SUBEXP
+ && subexp_idx == dfa->nodes[node].opr.idx)
+ cls_node = node;
+ }
+
+ /* Check the limitation of the open subexpression. */
+ /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
+ if (ops_node >= 0)
+ {
+ err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
+ candidates);
+ if (BE (err != REG_NOERROR, 0))
+ return err;
+ }
+
+ /* Check the limitation of the close subexpression. */
+ if (cls_node >= 0)
+ for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
+ {
+ int node = dest_nodes->elems[node_idx];
+ if (!re_node_set_contains (dfa->inveclosures + node,
+ cls_node)
+ && !re_node_set_contains (dfa->eclosures + node,
+ cls_node))
+ {
+ /* It is against this limitation.
+ Remove it from the current sifted state. */
+ err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
+ candidates);
+ if (BE (err != REG_NOERROR, 0))
+ return err;
+ --node_idx; /* Revisit this slot; presumably the removal shifted a later node into it. */
+ }
+ }
+ }
+ else /* (ent->subexp_to != str_idx) */
+ {
+ for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
+ {
+ int node = dest_nodes->elems[node_idx];
+ re_token_type_t type = dfa->nodes[node].type;
+ if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
+ {
+ if (subexp_idx != dfa->nodes[node].opr.idx)
+ continue;
+ /* It is against this limitation.
+ Remove it from the current sifted state. */
+ err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
+ candidates);
+ if (BE (err != REG_NOERROR, 0))
+ return err;
+ }
+ }
+ }
+ }
+ return REG_NOERROR;
+}
+/* Sift the states reachable through the back references found in
+ CANDIDATES at string index STR_IDX.
+
+ For every OP_BACK_REF node in CANDIDATES, the back-reference cache
+ entries starting at the one search_cur_bkref_entry returns for STR_IDX
+ are walked. An entry for that node is used only if its destination
+ lies within the sifted range, survived sifting, and passes
+ check_dst_limits. For such an entry sift_states_backward is re-run
+ from (node, STR_IDX) on a copy of SCTX whose limits additionally hold
+ the entry; the result is merged into SCTX->limited_states when that is
+ not NULL.
+
+ The copy shares the sifted_states pointer with SCTX, which is why the
+ slot at STR_IDX is saved before and restored after the recursive call.
+ Returns REG_NOERROR or the first error met. */
+static reg_errcode_t
+sift_states_bkref (mctx, sctx, str_idx, candidates)
+ re_match_context_t *mctx;
+ re_sift_context_t *sctx;
+ int str_idx;
+ const re_node_set *candidates;
+{
+ re_dfa_t *const dfa = mctx->dfa;
+ reg_errcode_t err;
+ int node_idx, node;
+ re_sift_context_t local_sctx;
+ int first_idx = search_cur_bkref_entry (mctx, str_idx);
+
+ if (first_idx == -1)
+ return REG_NOERROR;
+
+ local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
+
+ for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
+ {
+ int enabled_idx;
+ re_token_type_t type;
+ struct re_backref_cache_entry *entry;
+ node = candidates->elems[node_idx];
+ type = dfa->nodes[node].type;
+ /* Avoid infinite loop for the REs like "()\1+". */
+ if (node == sctx->last_node && str_idx == sctx->last_str_idx)
+ continue;
+ if (type != OP_BACK_REF)
+ continue;
+
+ entry = mctx->bkref_ents + first_idx;
+ enabled_idx = first_idx;
+ do
+ {
+ int subexp_len, to_idx, dst_node;
+ re_dfastate_t *cur_state;
+
+ if (entry->node != node)
+ continue;
+ subexp_len = entry->subexp_to - entry->subexp_from;
+ to_idx = str_idx + subexp_len;
+ dst_node = (subexp_len ? dfa->nexts[node]
+ : dfa->edests[node].elems[0]); /* An empty reference follows the epsilon edge instead. */
+
+ if (to_idx > sctx->last_str_idx
+ || sctx->sifted_states[to_idx] == NULL
+ || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
+ || check_dst_limits (mctx, &sctx->limits, node,
+ str_idx, dst_node, to_idx))
+ continue;
+
+ if (local_sctx.sifted_states == NULL)
+ {
+ /* First use: copy SCTX, then give the copy its own limits set. */
+ local_sctx = *sctx;
+ err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
+ if (BE (err != REG_NOERROR, 0))
+ goto free_return;
+ }
+ local_sctx.last_node = node;
+ local_sctx.last_str_idx = str_idx;
+ err = re_node_set_insert (&local_sctx.limits, enabled_idx);
+ if (BE (err < 0, 0))
+ {
+ err = REG_ESPACE;
+ goto free_return;
+ }
+ cur_state = local_sctx.sifted_states[str_idx]; /* Saved; restored after the recursive sift. */
+ err = sift_states_backward (mctx, &local_sctx);
+ if (BE (err != REG_NOERROR, 0))
+ goto free_return;
+ if (sctx->limited_states != NULL)
+ {
+ err = merge_state_array (dfa, sctx->limited_states,
+ local_sctx.sifted_states,
+ str_idx + 1);
+ if (BE (err != REG_NOERROR, 0))
+ goto free_return;
+ }
+ local_sctx.sifted_states[str_idx] = cur_state;
+ re_node_set_remove (&local_sctx.limits, enabled_idx);
+
+ /* mctx->bkref_ents may have changed, reload the pointer. */
+ entry = mctx->bkref_ents + enabled_idx;
+ }
+ while (enabled_idx++, entry++->more);
+ }
+ err = REG_NOERROR;
+ free_return:
+ if (local_sctx.sifted_states != NULL)
+ {
+ re_node_set_free (&local_sctx.limits); /* Only the copy's limits are owned here. */
+ }
+
+ return err;
+}
+
+
+#ifdef RE_ENABLE_I18N
+/* Return the number of bytes that node NODE_IDX accepts at STR_IDX as a
+   multibyte element, as reported by check_node_accept_bytes.  The count
+   is forced to 0 when the element ends at or before MAX_STR_IDX but the
+   node's successor is missing from the sifted state at that end.  */
+
+static int
+sift_states_iter_mb (mctx, sctx, node_idx, str_idx, max_str_idx)
+     const re_match_context_t *mctx;
+     re_sift_context_t *sctx;
+     int node_idx, str_idx, max_str_idx;
+{
+  re_dfa_t *const dfa = mctx->dfa;
+  int naccepted
+    = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
+
+  /* Nothing accepted: hand the result back unchanged.  */
+  if (naccepted <= 0)
+    return naccepted;
+
+  /* The element ends beyond the sifted range: no destination to check.  */
+  if (str_idx + naccepted > max_str_idx)
+    return naccepted;
+
+  /* The destination survived sifting, so the node really accepts
+     `naccepted' bytes of input.  */
+  if (STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
+                           dfa->nexts[node_idx]))
+    return naccepted;
+
+  /* The destination was already thrown away.  */
+  return 0;
+}
+#endif /* RE_ENABLE_I18N */
+
+
+/* Functions for state transition. */
+
+/* Return the next state to which the current state STATE will transit by
+ accepting the current input byte, and update STATE_LOG if necessary.
+ If STATE can accept a multibyte char/collating element/back reference
+ update the destination of STATE_LOG. */
+
+static re_dfastate_t *
+transit_state (err, mctx, state)
+     reg_errcode_t *err;
+     re_match_context_t *mctx;
+     re_dfastate_t *state;
+{
+  unsigned char byte;
+
+#ifdef RE_ENABLE_I18N
+  /* Let the multibyte-capable nodes of STATE update the state log first.  */
+  if (BE (state->accept_mb, 0))
+    {
+      *err = transit_state_mb (mctx, state);
+      if (BE (*err != REG_NOERROR, 0))
+        return NULL;
+    }
+#endif /* RE_ENABLE_I18N */
+
+  /* Then decide the next state with the single byte.  */
+#if 0
+  if (0)
+    /* don't use transition table */
+    return transit_state_sb (err, mctx, state);
+#endif
+
+  /* Consume one input byte; it indexes the transition table.  */
+  byte = re_string_fetch_byte (&mctx->input);
+
+  /* Build the transition table on demand; repeat until one of the two
+     tables exists or building reports failure.  */
+  while (state->trtable == NULL && state->word_trtable == NULL)
+    {
+      if (!build_trtable (mctx->dfa, state))
+        {
+          *err = REG_ESPACE;
+          return NULL;
+        }
+    }
+
+  /* The plain table takes precedence when present.  */
+  if (BE (state->trtable != NULL, 1))
+    return state->trtable[byte];
+
+  /* Otherwise use the word table: its second half (offset SBC_MAX) is
+     selected when the context of the byte just consumed is a word
+     context.  */
+  {
+    unsigned int context
+      = re_string_context_at (&mctx->input,
+                              re_string_cur_idx (&mctx->input) - 1,
+                              mctx->eflags);
+    if (IS_WORD_CONTEXT (context))
+      return state->word_trtable[byte + SBC_MAX];
+    return state->word_trtable[byte];
+  }
+}
+
+/* Update the state_log if we need */
+/* Record NEXT_STATE in MCTX->state_log at the current input index and return
+ the state that ends up logged there. If a state is already logged at that
+ index, the logged state becomes the one acquired for the union of both
+ states' entrance nodes. When the DFA has back references, the
+ OP_OPEN_SUBEXP nodes of the result are registered and its back references
+ are expanded. Returns NULL with *ERR set when a helper reports an error;
+ NULL may also be returned without an error when no state results. */
+re_dfastate_t *
+merge_state_with_log (err, mctx, next_state)
+ reg_errcode_t *err;
+ re_match_context_t *mctx;
+ re_dfastate_t *next_state;
+{
+ re_dfa_t *const dfa = mctx->dfa;
+ int cur_idx = re_string_cur_idx (&mctx->input);
+
+ if (cur_idx > mctx->state_log_top)
+ {
+ /* First time this index is reached: log the state and raise the top. */
+ mctx->state_log[cur_idx] = next_state;
+ mctx->state_log_top = cur_idx;
+ }
+ else if (mctx->state_log[cur_idx] == 0)
+ {
+ mctx->state_log[cur_idx] = next_state;
+ }
+ else
+ {
+ re_dfastate_t *pstate;
+ unsigned int context;
+ re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
+ /* If (state_log[cur_idx] != 0), it implies that cur_idx is
+ the destination of a multibyte char/collating element/
+ back reference. Then the next state is the union set of
+ these destinations and the results of the transition table. */
+ pstate = mctx->state_log[cur_idx];
+ log_nodes = pstate->entrance_nodes;
+ if (next_state != NULL)
+ {
+ table_nodes = next_state->entrance_nodes;
+ *err = re_node_set_init_union (&next_nodes, table_nodes,
+ log_nodes);
+ if (BE (*err != REG_NOERROR, 0))
+ return NULL;
+ }
+ else
+ /* Shallow copy: NEXT_NODES shares LOG_NODES' storage here, which is
+ why it is freed below only when TABLE_NODES is set. */
+ next_nodes = *log_nodes;
+ /* Note: We already add the nodes of the initial state,
+ then we don't need to add them here. */
+
+ context = re_string_context_at (&mctx->input,
+ re_string_cur_idx (&mctx->input) - 1,
+ mctx->eflags);
+ next_state = mctx->state_log[cur_idx]
+ = re_acquire_state_context (err, dfa, &next_nodes, context);
+ /* We don't need to check errors here, since the return value of
+ this function is next_state and ERR is already set. */
+
+ /* Release the union only when it was allocated above. */
+ if (table_nodes != NULL)
+ re_node_set_free (&next_nodes);
+ }
+
+ if (BE (dfa->nbackref, 0) && next_state != NULL)
+ {
+ /* Check OP_OPEN_SUBEXP in the current state in case that we use them
+ later. We must check them here, since the back references in the
+ next state might use them. */
+ *err = check_subexp_matching_top (mctx, &next_state->nodes,
+ cur_idx);
+ if (BE (*err != REG_NOERROR, 0))
+ return NULL;
+
+ /* If the next state has back references. */
+ if (next_state->has_backref)
+ {
+ *err = transit_state_bkref (mctx, &next_state->nodes);
+ if (BE (*err != REG_NOERROR, 0))
+ return NULL;
+ /* transit_state_bkref() may have replaced the logged state at this
+ index, so reload it. */
+ next_state = mctx->state_log[cur_idx];
+ }
+ }
+
+ return next_state;
+}
+
+/* Skip bytes in the input that correspond to part of a
+ multi-byte match, then look in the log for a state
+ from which to restart matching. */
+/* Advance the input one byte at a time until an index with a logged state
+   is found, then merge that log entry via merge_state_with_log() and
+   return the resulting state.  If the merge yields no state and reports
+   no error, keep scanning forward.
+
+   ERR receives the error code of the last merge attempt.
+   Returns the recovered state, or NULL when the scan passes
+   MCTX->state_log_top or when an error was reported through *ERR.  */
+re_dfastate_t *
+find_recover_state (err, mctx)
+     reg_errcode_t *err;
+     re_match_context_t *mctx;
+{
+  re_dfastate_t *cur_state = NULL;
+  do
+    {
+      int max = mctx->state_log_top;
+      int cur_str_idx = re_string_cur_idx (&mctx->input);
+
+      /* Skip forward to the next index that has a logged state.  */
+      do
+        {
+          if (++cur_str_idx > max)
+            return NULL;
+          re_string_skip_bytes (&mctx->input, 1);
+        }
+      while (mctx->state_log[cur_str_idx] == NULL);
+
+      cur_state = merge_state_with_log (err, mctx, NULL);
+    }
+  /* Test the error code itself, not the ERR pointer: comparing the
+     (non-null) pointer against REG_NOERROR was always false, so the
+     search gave up after the first candidate.  */
+  while (*err == REG_NOERROR && cur_state == NULL);
+  return cur_state;
+}
+
+/* Helper functions for transit_state. */
+
+/* From the node set CUR_NODES, pick up the nodes whose types are
+ OP_OPEN_SUBEXP and which have corresponding back references in the regular
+ expression, and register them so that they can be used later for
+ evaluating the corresponding back references. */
+
+/* Register, via match_ctx_add_subtop(), every OP_OPEN_SUBEXP node in
+   CUR_NODES whose subexpression index is marked in DFA->used_bkref_map,
+   recording STR_IDX as the position where the subexpression starts.
+   Returns REG_NOERROR, or the error reported by match_ctx_add_subtop().  */
+static reg_errcode_t
+check_subexp_matching_top (mctx, cur_nodes, str_idx)
+     re_match_context_t *mctx;
+     re_node_set *cur_nodes;
+     int str_idx;
+{
+  re_dfa_t *const dfa = mctx->dfa;
+  int node_idx;
+  reg_errcode_t err;
+
+  /* TODO: This isn't efficient.
+     Because there might be more than one nodes whose types are
+     OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
+     nodes.
+     E.g. RE: (a){2}  */
+  for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
+    {
+      int node = cur_nodes->elems[node_idx];
+      /* Shift an unsigned 1: the range check admits the top bit of the
+         map, and shifting a signed 1 into the sign bit is undefined.
+         NOTE(review): assumes used_bkref_map is no wider than
+         unsigned int -- confirm against its declaration.  */
+      if (dfa->nodes[node].type == OP_OPEN_SUBEXP
+          && dfa->nodes[node].opr.idx < (8 * sizeof (dfa->used_bkref_map))
+          && dfa->used_bkref_map & (1u << dfa->nodes[node].opr.idx))
+        {
+          err = match_ctx_add_subtop (mctx, node, str_idx);
+          if (BE (err != REG_NOERROR, 0))
+            return err;
+        }
+    }
+  return REG_NOERROR;
+}
+
+#if 0
+/* NOTE: this whole function is compiled out by the surrounding `#if 0'. */
+/* Return the next state to which the current state STATE will transit by
+ accepting the current input byte. */
+
+static re_dfastate_t *
+transit_state_sb (err, mctx, state)
+ reg_errcode_t *err;
+ re_match_context_t *mctx;
+ re_dfastate_t *state;
+{
+ re_dfa_t *const dfa = mctx->dfa;
+ re_node_set next_nodes;
+ re_dfastate_t *next_state;
+ int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
+ unsigned int context;
+
+ *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
+ if (BE (*err != REG_NOERROR, 0))
+ return NULL;
+ /* Collect the epsilon closures of the successors of every node that
+ accepts the current input position. */
+ for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
+ {
+ int cur_node = state->nodes.elems[node_cnt];
+ if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
+ {
+ *err = re_node_set_merge (&next_nodes,
+ dfa->eclosures + dfa->nexts[cur_node]);
+ if (BE (*err != REG_NOERROR, 0))
+ {
+ re_node_set_free (&next_nodes);
+ return NULL;
+ }
+ }
+ }
+ context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
+ next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
+ /* We don't need to check errors here, since the return value of
+ this function is next_state and ERR is already set. */
+
+ re_node_set_free (&next_nodes);
+ /* Advance the input past the byte just handled. */
+ re_string_skip_bytes (&mctx->input, 1);
+ return next_state;
+}
+#endif
+
+#ifdef RE_ENABLE_I18N
+/* For every node of PSTATE that is flagged accept_mb, satisfies its
+ constraint at the current input index, and for which
+ check_node_accept_bytes() reports a non-zero byte count, merge the epsilon
+ closure of the node's successor into MCTX->state_log at the index just
+ past the accepted bytes. Also raises MCTX->max_mb_elem_len to the largest
+ byte count seen. Returns REG_NOERROR, or the error reported by a helper. */
+static reg_errcode_t
+transit_state_mb (mctx, pstate)
+ re_match_context_t *mctx;
+ re_dfastate_t *pstate;
+{
+ re_dfa_t *const dfa = mctx->dfa;
+ reg_errcode_t err;
+ int i;
+
+ for (i = 0; i < pstate->nodes.nelem; ++i)
+ {
+ re_node_set dest_nodes, *new_nodes;
+ int cur_node_idx = pstate->nodes.elems[i];
+ int naccepted, dest_idx;
+ unsigned int context;
+ re_dfastate_t *dest_state;
+
+ if (!dfa->nodes[cur_node_idx].accept_mb)
+ continue;
+
+ if (dfa->nodes[cur_node_idx].constraint)
+ {
+ context = re_string_context_at (&mctx->input,
+ re_string_cur_idx (&mctx->input),
+ mctx->eflags);
+ if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
+ context))
+ continue;
+ }
+
+ /* How many bytes can the node accept? */
+ naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
+ re_string_cur_idx (&mctx->input));
+ if (naccepted == 0)
+ continue;
+
+ /* The node can accept `naccepted' bytes. */
+ dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
+ mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
+ : mctx->max_mb_elem_len);
+ err = clean_state_log_if_needed (mctx, dest_idx);
+ if (BE (err != REG_NOERROR, 0))
+ return err;
+#ifdef DEBUG
+ assert (dfa->nexts[cur_node_idx] != -1);
+#endif
+ new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
+
+ dest_state = mctx->state_log[dest_idx];
+ if (dest_state == NULL)
+ /* Shallow copy: DEST_NODES shares NEW_NODES' storage, so it is not
+ freed below in this case. */
+ dest_nodes = *new_nodes;
+ else
+ {
+ err = re_node_set_init_union (&dest_nodes,
+ dest_state->entrance_nodes, new_nodes);
+ if (BE (err != REG_NOERROR, 0))
+ return err;
+ }
+ context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags);
+ mctx->state_log[dest_idx]
+ = re_acquire_state_context (&err, dfa, &dest_nodes, context);
+ /* Only the union allocated above is owned by this function. */
+ if (dest_state != NULL)
+ re_node_set_free (&dest_nodes);
+ if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
+ return err;
+ }
+ return REG_NOERROR;
+}
+#endif /* RE_ENABLE_I18N */
+
+/* For each OP_BACK_REF node in NODES whose constraint is satisfied at the
+ current input index, call get_subexp() to register candidate matches and
+ then, for every cache entry added for this node and index, merge the
+ node's destination nodes into MCTX->state_log at the index where the
+ candidate ends. A zero-length candidate targets the current index
+ itself, so the function recurses on the newly added nodes.
+ Returns REG_NOERROR, or the first error reported by a helper. */
+static reg_errcode_t
+transit_state_bkref (mctx, nodes)
+ re_match_context_t *mctx;
+ const re_node_set *nodes;
+{
+ re_dfa_t *const dfa = mctx->dfa;
+ reg_errcode_t err;
+ int i;
+ int cur_str_idx = re_string_cur_idx (&mctx->input);
+
+ for (i = 0; i < nodes->nelem; ++i)
+ {
+ int dest_str_idx, prev_nelem, bkc_idx;
+ int node_idx = nodes->elems[i];
+ unsigned int context;
+ const re_token_t *node = dfa->nodes + node_idx;
+ re_node_set *new_dest_nodes;
+
+ /* Check whether `node' is a backreference or not. */
+ if (node->type != OP_BACK_REF)
+ continue;
+
+ if (node->constraint)
+ {
+ context = re_string_context_at (&mctx->input, cur_str_idx,
+ mctx->eflags);
+ if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
+ continue;
+ }
+
+ /* `node' is a backreference.
+ Check the substring which the subexpression matched. */
+ /* Remember where the cache ended so that only the entries appended by
+ get_subexp() are visited below. */
+ bkc_idx = mctx->nbkref_ents;
+ err = get_subexp (mctx, node_idx, cur_str_idx);
+ if (BE (err != REG_NOERROR, 0))
+ goto free_return;
+
+ /* And add the epsilon closures (which is `new_dest_nodes') of
+ the backreference to appropriate state_log. */
+#ifdef DEBUG
+ assert (dfa->nexts[node_idx] != -1);
+#endif
+ for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
+ {
+ int subexp_len;
+ re_dfastate_t *dest_state;
+ struct re_backref_cache_entry *bkref_ent;
+ bkref_ent = mctx->bkref_ents + bkc_idx;
+ if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
+ continue;
+ subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
+ /* An empty match follows the node's first epsilon destination; a
+ non-empty one follows its `nexts' successor. */
+ new_dest_nodes = (subexp_len == 0
+ ? dfa->eclosures + dfa->edests[node_idx].elems[0]
+ : dfa->eclosures + dfa->nexts[node_idx]);
+ dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
+ - bkref_ent->subexp_from);
+ context = re_string_context_at (&mctx->input, dest_str_idx - 1,
+ mctx->eflags);
+ dest_state = mctx->state_log[dest_str_idx];
+ /* Node count at the current index before merging, used below to
+ detect whether an empty match added anything. */
+ prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
+ : mctx->state_log[cur_str_idx]->nodes.nelem);
+ /* Add `new_dest_node' to state_log. */
+ if (dest_state == NULL)
+ {
+ mctx->state_log[dest_str_idx]
+ = re_acquire_state_context (&err, dfa, new_dest_nodes,
+ context);
+ if (BE (mctx->state_log[dest_str_idx] == NULL
+ && err != REG_NOERROR, 0))
+ goto free_return;
+ }
+ else
+ {
+ re_node_set dest_nodes;
+ err = re_node_set_init_union (&dest_nodes,
+ dest_state->entrance_nodes,
+ new_dest_nodes);
+ if (BE (err != REG_NOERROR, 0))
+ {
+ re_node_set_free (&dest_nodes);
+ goto free_return;
+ }
+ mctx->state_log[dest_str_idx]
+ = re_acquire_state_context (&err, dfa, &dest_nodes, context);
+ re_node_set_free (&dest_nodes);
+ if (BE (mctx->state_log[dest_str_idx] == NULL
+ && err != REG_NOERROR, 0))
+ goto free_return;
+ }
+ /* We need to check recursively if the backreference can epsilon
+ transit. */
+ if (subexp_len == 0
+ && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
+ {
+ err = check_subexp_matching_top (mctx, new_dest_nodes,
+ cur_str_idx);
+ if (BE (err != REG_NOERROR, 0))
+ goto free_return;
+ err = transit_state_bkref (mctx, new_dest_nodes);
+ if (BE (err != REG_NOERROR, 0))
+ goto free_return;
+ }
+ }
+ }
+ err = REG_NOERROR;
+ /* Single exit point; nothing is released here. */
+ free_return:
+ return err;
+}
+
+/* Enumerate all the candidates which the backreference BKREF_NODE can match
+ at BKREF_STR_IDX, and register them by match_ctx_add_entry().
+ Note that we might collect inappropriate candidates here.
+ However, the cost of checking them strictly here is too high, then we
+ delay these checking for prune_impossible_nodes(). */
+
+/* Register, through get_subexp_sub(), every candidate substring that the
+   back reference BKREF_NODE may match at BKREF_STR_IDX.
+
+   For each recorded start of the referenced subexpression
+   (MCTX->sub_tops), first re-use the ends already recorded for it, then
+   search the state log for further positions holding the matching
+   OP_CLOSE_SUBEXP node.  A candidate is dropped as soon as the input at
+   the back reference stops matching the subexpression's text.
+
+   Returns REG_NOERROR on success (including when nothing could be
+   registered), or the error code reported by a helper, e.g. REG_ESPACE.  */
+static reg_errcode_t
+get_subexp (mctx, bkref_node, bkref_str_idx)
+     re_match_context_t *mctx;
+     int bkref_node, bkref_str_idx;
+{
+  re_dfa_t *const dfa = mctx->dfa;
+  int subexp_num, sub_top_idx;
+  const char *buf = (const char *) re_string_get_buffer (&mctx->input);
+  /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
+  int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
+  if (cache_idx != -1)
+    {
+      const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx;
+      do
+        if (entry->node == bkref_node)
+          return REG_NOERROR; /* We already checked it.  */
+      while (entry++->more);
+    }
+
+  subexp_num = dfa->nodes[bkref_node].opr.idx;
+
+  /* For each sub expression  */
+  for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
+    {
+      reg_errcode_t err;
+      re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
+      re_sub_match_last_t *sub_last;
+      int sub_last_idx, sl_str, bkref_str_off;
+
+      if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
+        continue; /* It isn't related.  */
+
+      sl_str = sub_top->str_idx;
+      bkref_str_off = bkref_str_idx;
+      /* At first, check the last node of sub expressions we already
+         evaluated.  */
+      for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
+        {
+          int sl_str_diff;
+          sub_last = sub_top->lasts[sub_last_idx];
+          sl_str_diff = sub_last->str_idx - sl_str;
+          /* The matched string by the sub expression match with the substring
+             at the back reference?  */
+          if (sl_str_diff > 0)
+            {
+              if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
+                {
+                  /* Not enough chars for a successful match.  */
+                  if (bkref_str_off + sl_str_diff > mctx->input.len)
+                    break;
+
+                  err = clean_state_log_if_needed (mctx,
+                                                   bkref_str_off
+                                                   + sl_str_diff);
+                  if (BE (err != REG_NOERROR, 0))
+                    return err;
+                  buf = (const char *) re_string_get_buffer (&mctx->input);
+                }
+              if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
+                break; /* We don't need to search this sub expression any more.  */
+            }
+          bkref_str_off += sl_str_diff;
+          sl_str += sl_str_diff;
+          err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
+                                bkref_str_idx);
+
+          /* Reload buf, since the preceding call might have reallocated
+             the buffer.  */
+          buf = (const char *) re_string_get_buffer (&mctx->input);
+
+          if (err == REG_NOMATCH)
+            continue;
+          if (BE (err != REG_NOERROR, 0))
+            return err;
+        }
+
+      /* The loop above stopped early: the text no longer matches.  */
+      if (sub_last_idx < sub_top->nlasts)
+        continue;
+      if (sub_last_idx > 0)
+        ++sl_str;
+      /* Then, search for the other last nodes of the sub expression.  */
+      for (; sl_str <= bkref_str_idx; ++sl_str)
+        {
+          int cls_node, sl_str_off;
+          const re_node_set *nodes;
+          sl_str_off = sl_str - sub_top->str_idx;
+          /* The matched string by the sub expression match with the substring
+             at the back reference?  */
+          if (sl_str_off > 0)
+            {
+              if (BE (bkref_str_off >= mctx->input.valid_len, 0))
+                {
+                  /* If we are at the end of the input, we cannot match.  */
+                  if (bkref_str_off >= mctx->input.len)
+                    break;
+
+                  err = extend_buffers (mctx);
+                  if (BE (err != REG_NOERROR, 0))
+                    return err;
+
+                  buf = (const char *) re_string_get_buffer (&mctx->input);
+                }
+              if (buf [bkref_str_off++] != buf[sl_str - 1])
+                break; /* We don't need to search this sub expression
+                          any more.  */
+            }
+          if (mctx->state_log[sl_str] == NULL)
+            continue;
+          /* Does this state have a ')' of the sub expression?  */
+          nodes = &mctx->state_log[sl_str]->nodes;
+          cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
+          if (cls_node == -1)
+            continue; /* No.  */
+          if (sub_top->path == NULL)
+            {
+              sub_top->path = calloc (sizeof (state_array_t),
+                                      sl_str - sub_top->str_idx + 1);
+              if (sub_top->path == NULL)
+                return REG_ESPACE;
+            }
+          /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
+             in the current context?  */
+          err = check_arrival (mctx, sub_top->path, sub_top->node,
+                               sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
+          if (err == REG_NOMATCH)
+            continue;
+          if (BE (err != REG_NOERROR, 0))
+            return err;
+          sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
+          if (BE (sub_last == NULL, 0))
+            return REG_ESPACE;
+          err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
+                                bkref_str_idx);
+
+          /* As in the first loop: the call may have reallocated the input
+             buffer, so BUF must be reloaded before the next comparison.  */
+          buf = (const char *) re_string_get_buffer (&mctx->input);
+
+          if (err == REG_NOMATCH)
+            continue;
+          /* Propagate real failures (e.g. REG_ESPACE) instead of silently
+             dropping them.  */
+          if (BE (err != REG_NOERROR, 0))
+            return err;
+        }
+    }
+  return REG_NOERROR;
+}
+
+/* Helper functions for get_subexp(). */
+
+/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
+ If it can arrive, register the sub expression expressed with SUB_TOP
+ and SUB_LAST. */
+
+/* Check that the subexpression end SUB_LAST can reach the back reference
+   BKREF_NODE at BKREF_STR.  If so, add a back-reference cache entry for the
+   span SUB_TOP->str_idx .. SUB_LAST->str_idx and make sure the state log
+   covers the index where the back reference would end.
+   Returns the first non-REG_NOERROR code from a helper (REG_NOMATCH when
+   check_arrival() says the node is unreachable).  */
+static reg_errcode_t
+get_subexp_sub (mctx, sub_top, sub_last, bkref_node, bkref_str)
+     re_match_context_t *mctx;
+     const re_sub_match_top_t *sub_top;
+     re_sub_match_last_t *sub_last;
+     int bkref_node, bkref_str;
+{
+  reg_errcode_t status;
+
+  /* Reachability check first.  */
+  status = check_arrival (mctx, &sub_last->path, sub_last->node,
+                          sub_last->str_idx, bkref_node, bkref_str,
+                          OP_OPEN_SUBEXP);
+
+  /* Record the candidate only when the reference is reachable.  */
+  if (status == REG_NOERROR)
+    status = match_ctx_add_entry (mctx, bkref_node, bkref_str,
+                                  sub_top->str_idx, sub_last->str_idx);
+  if (status != REG_NOERROR)
+    return status;
+
+  /* The match would end at BKREF_STR plus the length of the span.  */
+  return clean_state_log_if_needed (mctx,
+                                    bkref_str + sub_last->str_idx
+                                    - sub_top->str_idx);
+}
+
+/* Find the first node in NODES whose type is TYPE (OP_OPEN_SUBEXP for '(',
+ OP_CLOSE_SUBEXP for ')') and whose index is SUBEXP_IDX.
+ Return that node, or -1 if NODES contains no such node.
+ TODO: This function isn't efficient...
+ Because there might be more than one nodes whose types are
+ OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
+ nodes.
+ E.g. RE: (a){2} */
+
+/* Scan NODES in order and return the first member whose token type equals
+   TYPE and whose opr.idx equals SUBEXP_IDX; return -1 when none matches.  */
+static int
+find_subexp_node (dfa, nodes, subexp_idx, type)
+     const re_dfa_t *dfa;
+     const re_node_set *nodes;
+     int subexp_idx, type;
+{
+  int pos = 0;
+
+  while (pos < nodes->nelem)
+    {
+      int candidate = nodes->elems[pos++];
+      const re_token_t *tok = &dfa->nodes[candidate];
+
+      if (tok->type != type)
+        continue;
+      if (tok->opr.idx == subexp_idx)
+        return candidate;
+    }
+
+  return -1;
+}
+
+/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
+ LAST_NODE at LAST_STR. We record the path onto PATH since it will be
+ heavily reused.
+ Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
+
+/* Walk forward from TOP_NODE at TOP_STR (or resume from PATH->next_idx when
+ PATH was used before) up to LAST_STR, storing the states reached in
+ PATH->array. While walking, MCTX->state_log and MCTX->input.cur_idx are
+ temporarily redirected to PATH and restored before the normal return.
+ Returns REG_NOERROR if the state at LAST_STR contains LAST_NODE,
+ REG_NOMATCH if it does not, or the error reported by a helper.
+ NOTE(review): the early error returns below leave MCTX->state_log and
+ MCTX->input.cur_idx at their temporary values -- confirm that callers
+ abandon the match on error. */
+static reg_errcode_t
+check_arrival (mctx, path, top_node, top_str, last_node, last_str,
+ type)
+ re_match_context_t *mctx;
+ state_array_t *path;
+ int top_node, top_str, last_node, last_str, type;
+{
+ re_dfa_t *const dfa = mctx->dfa;
+ reg_errcode_t err;
+ int subexp_num, backup_cur_idx, str_idx, null_cnt;
+ re_dfastate_t *cur_state = NULL;
+ re_node_set *cur_nodes, next_nodes;
+ re_dfastate_t **backup_state_log;
+ unsigned int context;
+
+ subexp_num = dfa->nodes[top_node].opr.idx;
+ /* Extend the buffer if we need. */
+ if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
+ {
+ re_dfastate_t **new_array;
+ int old_alloc = path->alloc;
+ path->alloc += last_str + mctx->max_mb_elem_len + 1;
+ new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
+ if (new_array == NULL)
+ {
+ /* Keep PATH consistent with the array that is still allocated. */
+ path->alloc = old_alloc;
+ return REG_ESPACE;
+ }
+ path->array = new_array;
+ /* Clear only the newly added tail. */
+ memset (new_array + old_alloc, '\0',
+ sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
+ }
+
+ /* Start at TOP_STR on first use, otherwise where the last walk stopped. */
+ str_idx = path->next_idx == 0 ? top_str : path->next_idx;
+
+ /* Temporary modify MCTX. */
+ backup_state_log = mctx->state_log;
+ backup_cur_idx = mctx->input.cur_idx;
+ mctx->state_log = path->array;
+ mctx->input.cur_idx = str_idx;
+
+ /* Setup initial node set. */
+ context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
+ if (str_idx == top_str)
+ {
+ err = re_node_set_init_1 (&next_nodes, top_node);
+ if (BE (err != REG_NOERROR, 0))
+ return err;
+ err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
+ if (BE (err != REG_NOERROR, 0))
+ {
+ re_node_set_free (&next_nodes);
+ return err;
+ }
+ }
+ else
+ {
+ cur_state = mctx->state_log[str_idx];
+ if (cur_state && cur_state->has_backref)
+ {
+ err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
+ if (BE ( err != REG_NOERROR, 0))
+ return err;
+ }
+ else
+ re_node_set_init_empty (&next_nodes);
+ }
+ /* (Re)build the state at the starting index when it is the very first
+ one or when it contains back references to expand. */
+ if (str_idx == top_str || (cur_state && cur_state->has_backref))
+ {
+ if (next_nodes.nelem)
+ {
+ err = expand_bkref_cache (mctx, &next_nodes, str_idx,
+ subexp_num, type);
+ if (BE ( err != REG_NOERROR, 0))
+ {
+ re_node_set_free (&next_nodes);
+ return err;
+ }
+ }
+ cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
+ if (BE (cur_state == NULL && err != REG_NOERROR, 0))
+ {
+ re_node_set_free (&next_nodes);
+ return err;
+ }
+ mctx->state_log[str_idx] = cur_state;
+ }
+
+ /* Advance one index per iteration. NULL_CNT counts consecutive indices
+ with no state; the walk gives up once it exceeds max_mb_elem_len. */
+ for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
+ {
+ re_node_set_empty (&next_nodes);
+ /* Start from whatever is already logged at the next index. */
+ if (mctx->state_log[str_idx + 1])
+ {
+ err = re_node_set_merge (&next_nodes,
+ &mctx->state_log[str_idx + 1]->nodes);
+ if (BE (err != REG_NOERROR, 0))
+ {
+ re_node_set_free (&next_nodes);
+ return err;
+ }
+ }
+ if (cur_state)
+ {
+ err = check_arrival_add_next_nodes (mctx, str_idx,
+ &cur_state->non_eps_nodes, &next_nodes);
+ if (BE (err != REG_NOERROR, 0))
+ {
+ re_node_set_free (&next_nodes);
+ return err;
+ }
+ }
+ ++str_idx;
+ if (next_nodes.nelem)
+ {
+ err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
+ if (BE (err != REG_NOERROR, 0))
+ {
+ re_node_set_free (&next_nodes);
+ return err;
+ }
+ err = expand_bkref_cache (mctx, &next_nodes, str_idx,
+ subexp_num, type);
+ if (BE ( err != REG_NOERROR, 0))
+ {
+ re_node_set_free (&next_nodes);
+ return err;
+ }
+ }
+ context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
+ cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
+ if (BE (cur_state == NULL && err != REG_NOERROR, 0))
+ {
+ re_node_set_free (&next_nodes);
+ return err;
+ }
+ mctx->state_log[str_idx] = cur_state;
+ null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
+ }
+ re_node_set_free (&next_nodes);
+ /* Read the result before MCTX->state_log is switched back. */
+ cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
+ : &mctx->state_log[last_str]->nodes);
+ /* Remember how far the walk got so a later call can resume there. */
+ path->next_idx = str_idx;
+
+ /* Fix MCTX. */
+ mctx->state_log = backup_state_log;
+ mctx->input.cur_idx = backup_cur_idx;
+
+ /* Then check the current node set has the node LAST_NODE. */
+ if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
+ return REG_NOERROR;
+
+ return REG_NOMATCH;
+}
+
+/* Helper functions for check_arrival. */
+
+/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
+ to NEXT_NODES.
+ TODO: This function is similar to the functions transit_state*(),
+ however this function has many additional works.
+ Can't we unify them? */
+
+/* For each node in CUR_NODES that accepts the input at STR_IDX, insert its
+ `nexts' successor into NEXT_NODES. With RE_ENABLE_I18N, a node for which
+ check_node_accept_bytes() reports more than one byte additionally has its
+ successor merged into MCTX->state_log at STR_IDX plus that byte count.
+ Returns REG_NOERROR, REG_ESPACE when an insertion fails, or the error
+ reported by a helper. */
+static reg_errcode_t
+check_arrival_add_next_nodes (mctx, str_idx, cur_nodes, next_nodes)
+ re_match_context_t *mctx;
+ int str_idx;
+ re_node_set *cur_nodes, *next_nodes;
+{
+ re_dfa_t *const dfa = mctx->dfa;
+ int result;
+ int cur_idx;
+ reg_errcode_t err;
+ /* Scratch set, emptied and refilled for each multi-byte destination and
+ freed on every exit path. */
+ re_node_set union_set;
+ re_node_set_init_empty (&union_set);
+ for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
+ {
+ int naccepted = 0;
+ int cur_node = cur_nodes->elems[cur_idx];
+#ifdef DEBUG
+ re_token_type_t type = dfa->nodes[cur_node].type;
+ assert (!IS_EPSILON_NODE (type));
+#endif
+#ifdef RE_ENABLE_I18N
+ /* If the node may accept `multi byte'. */
+ if (dfa->nodes[cur_node].accept_mb)
+ {
+ naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
+ str_idx);
+ if (naccepted > 1)
+ {
+ re_dfastate_t *dest_state;
+ int next_node = dfa->nexts[cur_node];
+ int next_idx = str_idx + naccepted;
+ dest_state = mctx->state_log[next_idx];
+ re_node_set_empty (&union_set);
+ /* Keep the nodes already logged at the destination index. */
+ if (dest_state)
+ {
+ err = re_node_set_merge (&union_set, &dest_state->nodes);
+ if (BE (err != REG_NOERROR, 0))
+ {
+ re_node_set_free (&union_set);
+ return err;
+ }
+ }
+ result = re_node_set_insert (&union_set, next_node);
+ if (BE (result < 0, 0))
+ {
+ re_node_set_free (&union_set);
+ return REG_ESPACE;
+ }
+ mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
+ &union_set);
+ if (BE (mctx->state_log[next_idx] == NULL
+ && err != REG_NOERROR, 0))
+ {
+ re_node_set_free (&union_set);
+ return err;
+ }
+ }
+ }
+#endif /* RE_ENABLE_I18N */
+ /* A non-zero NACCEPTED skips the check_node_accept() call. */
+ if (naccepted
+ || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
+ {
+ result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
+ if (BE (result < 0, 0))
+ {
+ re_node_set_free (&union_set);
+ return REG_ESPACE;
+ }
+ }
+ }
+ re_node_set_free (&union_set);
+ return REG_NOERROR;
+}
+
+/* For all the nodes in CUR_NODES, add the epsilon closures of them to
+ CUR_NODES, however exclude the nodes which are:
+ - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
+ - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
+*/
+
+/* Replace CUR_NODES with the union of the epsilon closures of its members.
+   A member whose closure contains a node of type TYPE with index EX_SUBEXP
+   is expanded step by step through check_arrival_expand_ecl_sub() instead
+   of being merged wholesale.  On failure CUR_NODES is left untouched and
+   the helper's error code is returned.  */
+static reg_errcode_t
+check_arrival_expand_ecl (dfa, cur_nodes, ex_subexp, type)
+     re_dfa_t *dfa;
+     re_node_set *cur_nodes;
+     int ex_subexp, type;
+{
+  reg_errcode_t status;
+  int pos;
+  re_node_set expanded;
+#ifdef DEBUG
+  assert (cur_nodes->nelem);
+#endif
+  status = re_node_set_alloc (&expanded, cur_nodes->nelem);
+  if (BE (status != REG_NOERROR, 0))
+    return status;
+
+  for (pos = 0; pos < cur_nodes->nelem; ++pos)
+    {
+      int member = cur_nodes->elems[pos];
+      re_node_set *closure = dfa->eclosures + member;
+
+      if (find_subexp_node (dfa, closure, ex_subexp, type) == -1)
+        /* Nothing to filter out: take the whole closure.  */
+        status = re_node_set_merge (&expanded, closure);
+      else
+        /* The closure crosses the subexpression boundary: rebuild it
+           incrementally.  */
+        status = check_arrival_expand_ecl_sub (dfa, &expanded, member,
+                                               ex_subexp, type);
+
+      if (BE (status != REG_NOERROR, 0))
+        {
+          re_node_set_free (&expanded);
+          return status;
+        }
+    }
+
+  /* Hand the new set over to the caller's variable.  */
+  re_node_set_free (cur_nodes);
+  *cur_nodes = expanded;
+  return REG_NOERROR;
+}
+
+/* Helper function for check_arrival_expand_ecl.
+ Check incrementally the epsilon closure of TARGET, and if it isn't
+ problematic append it to DST_NODES. */
+
+/* Follow the epsilon edges starting at TARGET and insert each visited node
+   into DST_NODES.  The walk stops at a node already in DST_NODES, at a node
+   with no epsilon destination, or at a node of type TYPE with index
+   EX_SUBEXP; that last node is itself inserted only when TYPE is
+   OP_CLOSE_SUBEXP.  A second epsilon destination is explored recursively.
+   Returns REG_NOERROR, or REG_ESPACE when an insertion fails.  */
+static reg_errcode_t
+check_arrival_expand_ecl_sub (dfa, dst_nodes, target, ex_subexp, type)
+     re_dfa_t *dfa;
+     int target, ex_subexp, type;
+     re_node_set *dst_nodes;
+{
+  int node = target;
+
+  while (!re_node_set_contains (dst_nodes, node))
+    {
+      int rc;
+
+      if (dfa->nodes[node].type == type
+          && dfa->nodes[node].opr.idx == ex_subexp)
+        {
+          /* Boundary node: stop here.  */
+          if (type == OP_CLOSE_SUBEXP)
+            {
+              rc = re_node_set_insert (dst_nodes, node);
+              if (BE (rc == -1, 0))
+                return REG_ESPACE;
+            }
+          break;
+        }
+
+      rc = re_node_set_insert (dst_nodes, node);
+      if (BE (rc == -1, 0))
+        return REG_ESPACE;
+
+      if (dfa->edests[node].nelem == 0)
+        break;
+
+      /* Branching node: recurse into the second destination, then keep
+         following the first one in this loop.  */
+      if (dfa->edests[node].nelem == 2)
+        {
+          rc = check_arrival_expand_ecl_sub (dfa, dst_nodes,
+                                             dfa->edests[node].elems[1],
+                                             ex_subexp, type);
+          if (BE (rc != REG_NOERROR, 0))
+            return rc;
+        }
+      node = dfa->edests[node].elems[0];
+    }
+  return REG_NOERROR;
+}
+
+
+/* For all the back references in the current state, calculate the
+ destination of the back references by the appropriate entry
+ in MCTX->BKREF_ENTS. */
+
+/* Apply the back-reference cache entries recorded for string index CUR_STR
+ to CUR_NODES. For an entry whose node is in CUR_NODES: a zero-length
+ match merges the expanded closure of the node's first epsilon destination
+ into CUR_NODES and rescans the entries; a non-empty match adds the node's
+ `nexts' successor to MCTX->state_log at the index where the match ends.
+ Returns REG_NOERROR, or the error reported by a helper. */
+static reg_errcode_t
+expand_bkref_cache (mctx, cur_nodes, cur_str, subexp_num,
+ type)
+ re_match_context_t *mctx;
+ int cur_str, subexp_num, type;
+ re_node_set *cur_nodes;
+{
+ re_dfa_t *const dfa = mctx->dfa;
+ reg_errcode_t err;
+ int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
+ struct re_backref_cache_entry *ent;
+
+ /* No cache entry exists for this string index. */
+ if (cache_idx_start == -1)
+ return REG_NOERROR;
+
+ restart:
+ ent = mctx->bkref_ents + cache_idx_start;
+ /* Note: each `continue' below jumps to the loop condition, which advances
+ ENT to the next entry while the current entry's `more' flag is set. */
+ do
+ {
+ int to_idx, next_node;
+
+ /* Is this entry ENT is appropriate? */
+ if (!re_node_set_contains (cur_nodes, ent->node))
+ continue; /* No. */
+
+ to_idx = cur_str + ent->subexp_to - ent->subexp_from;
+ /* Calculate the destination of the back reference, and append it
+ to MCTX->STATE_LOG. */
+ if (to_idx == cur_str)
+ {
+ /* The backreference did epsilon transit, we must re-check all the
+ node in the current state. */
+ re_node_set new_dests;
+ reg_errcode_t err2, err3;
+ next_node = dfa->edests[ent->node].elems[0];
+ if (re_node_set_contains (cur_nodes, next_node))
+ continue;
+ /* All three steps run unconditionally; their results are checked
+ together after NEW_DESTS has been released. */
+ err = re_node_set_init_1 (&new_dests, next_node);
+ err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
+ err3 = re_node_set_merge (cur_nodes, &new_dests);
+ re_node_set_free (&new_dests);
+ if (BE (err != REG_NOERROR || err2 != REG_NOERROR
+ || err3 != REG_NOERROR, 0))
+ {
+ /* Report the earliest failure. */
+ err = (err != REG_NOERROR ? err
+ : (err2 != REG_NOERROR ? err2 : err3));
+ return err;
+ }
+ /* TODO: It is still inefficient... */
+ /* CUR_NODES grew, so earlier entries may now apply: rescan. */
+ goto restart;
+ }
+ else
+ {
+ re_node_set union_set;
+ next_node = dfa->nexts[ent->node];
+ if (mctx->state_log[to_idx])
+ {
+ int ret;
+ if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
+ next_node))
+ continue;
+ err = re_node_set_init_copy (&union_set,
+ &mctx->state_log[to_idx]->nodes);
+ ret = re_node_set_insert (&union_set, next_node);
+ if (BE (err != REG_NOERROR || ret < 0, 0))
+ {
+ re_node_set_free (&union_set);
+ err = err != REG_NOERROR ? err : REG_ESPACE;
+ return err;
+ }
+ }
+ else
+ {
+ err = re_node_set_init_1 (&union_set, next_node);
+ if (BE (err != REG_NOERROR, 0))
+ return err;
+ }
+ mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
+ re_node_set_free (&union_set);
+ if (BE (mctx->state_log[to_idx] == NULL
+ && err != REG_NOERROR, 0))
+ return err;
+ }
+ }
+ while (ent++->more);
+ return REG_NOERROR;
+}
+
+/* Build transition table for the state.
+ Return 1 if succeeded, otherwise return 0. */
+
+static int
+build_trtable (dfa, state)
+ re_dfa_t *dfa;
+ re_dfastate_t *state;
+{
+ reg_errcode_t err;
+ int i, j, ch, need_word_trtable = 0;
+ unsigned int elem, mask;
+ int dests_node_malloced = 0, dest_states_malloced = 0;
+ int ndests; /* Number of the destination states from `state'. */
+ re_dfastate_t **trtable;
+ re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
+ re_node_set follows, *dests_node;
+ bitset *dests_ch;
+ bitset acceptable;
+
+ /* We build DFA states which corresponds to the destination nodes
+ from `state'. `dests_node[i]' represents the nodes which i-th
+ destination state contains, and `dests_ch[i]' represents the
+ characters which i-th destination state accepts. */
+#ifdef _LIBC
+ if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
+ dests_node = (re_node_set *)
+ alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
+ else
+#endif
+ {
+ dests_node = (re_node_set *)
+ malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
+ if (BE (dests_node == NULL, 0))
+ return 0;
+ dests_node_malloced = 1;
+ }
+ dests_ch = (bitset *) (dests_node + SBC_MAX);
+
+ /* Initialize transiton table. */
+ state->word_trtable = state->trtable = NULL;
+
+ /* At first, group all nodes belonging to `state' into several
+ destinations. */
+ ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
+ if (BE (ndests <= 0, 0))
+ {
+ if (dests_node_malloced)
+ free (dests_node);
+ /* Return 0 in case of an error, 1 otherwise. */
+ if (ndests == 0)
+ {
+ state->trtable = (re_dfastate_t **)
+ calloc (sizeof (re_dfastate_t *), SBC_MAX);
+ return 1;
+ }
+ return 0;
+ }
+
+ err = re_node_set_alloc (&follows, ndests + 1);
+ if (BE (err != REG_NOERROR, 0))
+ goto out_free;
+
+#ifdef _LIBC
+ if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
+ + ndests * 3 * sizeof (re_dfastate_t *)))
+ dest_states = (re_dfastate_t **)
+ alloca (ndests * 3 * sizeof (re_dfastate_t *));
+ else
+#endif
+ {
+ dest_states = (re_dfastate_t **)
+ malloc (ndests * 3 * sizeof (re_dfastate_t *));
+ if (BE (dest_states == NULL, 0))
+ {
+out_free:
+ if (dest_states_malloced)
+ free (dest_states);
+ re_node_set_free (&follows);
+ for (i = 0; i < ndests; ++i)
+ re_node_set_free (dests_node + i);
+ if (dests_node_malloced)
+ free (dests_node);
+ return 0;
+ }
+ dest_states_malloced = 1;
+ }
+ dest_states_word = dest_states + ndests;
+ dest_states_nl = dest_states_word + ndests;
+ bitset_empty (acceptable);
+
+ /* Then build the states for all destinations. */
+ for (i = 0; i < ndests; ++i)
+ {
+ int next_node;
+ re_node_set_empty (&follows);
+ /* Merge the follows of this destination states. */
+ for (j = 0; j < dests_node[i].nelem; ++j)
+ {
+ next_node = dfa->nexts[dests_node[i].elems[j]];
+ if (next_node != -1)
+ {
+ err = re_node_set_merge (&follows, dfa->eclosures + next_node);
+ if (BE (err != REG_NOERROR, 0))
+ goto out_free;
+ }
+ }
+ dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
+ if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
+ goto out_free;
+ /* If the new state has context constraint,
+ build appropriate states for these contexts. */
+ if (dest_states[i]->has_constraint)
+ {
+ dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
+ CONTEXT_WORD);
+ if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
+ goto out_free;
+
+ if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
+ need_word_trtable = 1;
+
+ dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
+ CONTEXT_NEWLINE);
+ if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
+ goto out_free;
+ }
+ else
+ {
+ dest_states_word[i] = dest_states[i];
+ dest_states_nl[i] = dest_states[i];
+ }
+ bitset_merge (acceptable, dests_ch[i]);
+ }
+
+ if (!BE (need_word_trtable, 0))
+ {
+ /* We don't care about whether the following character is a word
+ character, or we are in a single-byte character set so we can
+ discern by looking at the character code: allocate a
+ 256-entry transition table. */
+ trtable = state->trtable =
+ (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
+ if (BE (trtable == NULL, 0))
+ goto out_free;
+
+ /* For all characters ch...: */
+ for (i = 0; i < BITSET_UINTS; ++i)
+ for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
+ elem;
+ mask <<= 1, elem >>= 1, ++ch)
+ if (BE (elem & 1, 0))
+ {
+ /* There must be exactly one destination which accepts
+ character ch. See group_nodes_into_DFAstates. */
+ for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
+ ;
+
+ /* j-th destination accepts the word character ch. */
+ if (dfa->word_char[i] & mask)
+ trtable[ch] = dest_states_word[j];
+ else
+ trtable[ch] = dest_states[j];
+ }
+ }
+ else
+ {
+ /* We care about whether the following character is a word
+ character, and we are in a multi-byte character set: discern
+ by looking at the character code: build two 256-entry
+ transition tables, one starting at trtable[0] and one
+ starting at trtable[SBC_MAX]. */
+ trtable = state->word_trtable =
+ (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
+ if (BE (trtable == NULL, 0))
+ goto out_free;
+
+ /* For all characters ch...: */
+ for (i = 0; i < BITSET_UINTS; ++i)
+ for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
+ elem;
+ mask <<= 1, elem >>= 1, ++ch)
+ if (BE (elem & 1, 0))
+ {
+ /* There must be exactly one destination which accepts
+ character ch. See group_nodes_into_DFAstates. */
+ for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
+ ;
+
+ /* j-th destination accepts the word character ch. */
+ trtable[ch] = dest_states[j];
+ trtable[ch + SBC_MAX] = dest_states_word[j];
+ }
+ }
+
+ /* new line */
+ if (bitset_contain (acceptable, NEWLINE_CHAR))
+ {
+ /* The current state accepts newline character. */
+ for (j = 0; j < ndests; ++j)
+ if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
+ {
+ /* k-th destination accepts newline character. */
+ trtable[NEWLINE_CHAR] = dest_states_nl[j];
+ if (need_word_trtable)
+ trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
+ /* There must be only one destination which accepts
+ newline. See group_nodes_into_DFAstates. */
+ break;
+ }
+ }
+
+ if (dest_states_malloced)
+ free (dest_states);
+
+ re_node_set_free (&follows);
+ for (i = 0; i < ndests; ++i)
+ re_node_set_free (dests_node + i);
+
+ if (dests_node_malloced)
+ free (dests_node);
+
+ return 1;
+}
+
+/* Group all nodes belonging to STATE into several destinations.
+ For each destination i, the nodes belonging to it are stored in
+ DESTS_NODE[i] and the single-byte characters it accepts are stored in
+ DESTS_CH[i]; an existing destination is split whenever a node accepts
+ only part of its characters. Nodes of any type other than CHARACTER,
+ SIMPLE_BRACKET, OP_PERIOD and (with RE_ENABLE_I18N) OP_UTF8_PERIOD
+ are ignored.
+ Return the number of destinations, or -1 if a node set operation
+ fails, in which case the first NDESTS node sets of DESTS_NODE have
+ already been released with re_node_set_free. */
+
+static int
+group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch)
+ re_dfa_t *dfa;
+ const re_dfastate_t *state;
+ re_node_set *dests_node;
+ bitset *dests_ch;
+{
+ reg_errcode_t err;
+ int result;
+ int i, j, k;
+ int ndests; /* Number of the destinations from `state'. */
+ bitset accepts; /* Characters a node can accept. */
+ const re_node_set *cur_nodes = &state->nodes;
+ bitset_empty (accepts);
+ ndests = 0;
+
+ /* For all the nodes belonging to `state', */
+ for (i = 0; i < cur_nodes->nelem; ++i)
+ {
+ re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
+ re_token_type_t type = node->type;
+ unsigned int constraint = node->constraint;
+
+ /* Enumerate all single byte character this node can accept. */
+ if (type == CHARACTER)
+ bitset_set (accepts, node->opr.c);
+ else if (type == SIMPLE_BRACKET)
+ {
+ bitset_merge (accepts, node->opr.sbcset);
+ }
+ else if (type == OP_PERIOD)
+ {
+#ifdef RE_ENABLE_I18N
+ if (dfa->mb_cur_max > 1)
+ bitset_merge (accepts, dfa->sb_char);
+ else
+#endif
+ bitset_set_all (accepts);
+ if (!(dfa->syntax & RE_DOT_NEWLINE))
+ bitset_clear (accepts, '\n');
+ if (dfa->syntax & RE_DOT_NOT_NULL)
+ bitset_clear (accepts, '\0');
+ }
+#ifdef RE_ENABLE_I18N
+ else if (type == OP_UTF8_PERIOD)
+ {
+ /* Only the first half of the bitset storage is filled with ones
+ (presumably the byte values 0x00..0x7f -- confirm against the
+ bitset layout). */
+ memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
+ if (!(dfa->syntax & RE_DOT_NEWLINE))
+ bitset_clear (accepts, '\n');
+ if (dfa->syntax & RE_DOT_NOT_NULL)
+ bitset_clear (accepts, '\0');
+ }
+#endif
+ else
+ continue;
+
+ /* Check `accepts' and sift out the characters which do not
+ match the context. Every path that skips the node leaves
+ `accepts' empty for the next iteration. */
+ if (constraint)
+ {
+ if (constraint & NEXT_NEWLINE_CONSTRAINT)
+ {
+ /* Only the newline character may remain. */
+ int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
+ bitset_empty (accepts);
+ if (accepts_newline)
+ bitset_set (accepts, NEWLINE_CHAR);
+ else
+ continue;
+ }
+ if (constraint & NEXT_ENDBUF_CONSTRAINT)
+ {
+ /* A node with this constraint contributes no character. */
+ bitset_empty (accepts);
+ continue;
+ }
+
+ if (constraint & NEXT_WORD_CONSTRAINT)
+ {
+ unsigned int any_set = 0;
+ if (type == CHARACTER && !node->word_char)
+ {
+ bitset_empty (accepts);
+ continue;
+ }
+#ifdef RE_ENABLE_I18N
+ if (dfa->mb_cur_max > 1)
+ for (j = 0; j < BITSET_UINTS; ++j)
+ any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
+ else
+#endif
+ for (j = 0; j < BITSET_UINTS; ++j)
+ any_set |= (accepts[j] &= dfa->word_char[j]);
+ if (!any_set)
+ continue;
+ }
+ if (constraint & NEXT_NOTWORD_CONSTRAINT)
+ {
+ unsigned int any_set = 0;
+ if (type == CHARACTER && node->word_char)
+ {
+ bitset_empty (accepts);
+ continue;
+ }
+#ifdef RE_ENABLE_I18N
+ if (dfa->mb_cur_max > 1)
+ for (j = 0; j < BITSET_UINTS; ++j)
+ any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
+ else
+#endif
+ for (j = 0; j < BITSET_UINTS; ++j)
+ any_set |= (accepts[j] &= ~dfa->word_char[j]);
+ if (!any_set)
+ continue;
+ }
+ }
+
+ /* Then divide `accepts' into DFA states, or create a new
+ state. Above, we make sure that accepts is not empty. */
+ for (j = 0; j < ndests; ++j)
+ {
+ bitset intersec; /* Intersection sets, see below. */
+ bitset remains;
+ /* Flags, see below. */
+ int has_intersec, not_subset, not_consumed;
+
+ /* Optimization, skip if this state doesn't accept the character. */
+ if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
+ continue;
+
+ /* Enumerate the intersection set of this state and `accepts'. */
+ has_intersec = 0;
+ for (k = 0; k < BITSET_UINTS; ++k)
+ has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
+ /* And skip if the intersection set is empty. */
+ if (!has_intersec)
+ continue;
+
+ /* Then check if this state is a subset of `accepts'.
+ `remains' collects the characters of destination j that the
+ node does not accept; `accepts' is reduced to the characters
+ not yet covered by any destination. */
+ not_subset = not_consumed = 0;
+ for (k = 0; k < BITSET_UINTS; ++k)
+ {
+ not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
+ not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
+ }
+
+ /* If this state isn't a subset of `accepts', create a
+ new group state, which has the `remains'. The new group
+ starts with a copy of the nodes of destination j, while
+ destination j itself keeps only the intersection. */
+ if (not_subset)
+ {
+ bitset_copy (dests_ch[ndests], remains);
+ bitset_copy (dests_ch[j], intersec);
+ err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
+ if (BE (err != REG_NOERROR, 0))
+ goto error_return;
+ ++ndests;
+ }
+
+ /* Put the position in the current group. */
+ result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
+ if (BE (result < 0, 0))
+ goto error_return;
+
+ /* If all characters are consumed, go to next node. */
+ if (!not_consumed)
+ break;
+ }
+ /* Some characters remain, create a new group. The loop above ran
+ to completion, so `accepts' still holds uncovered characters. */
+ if (j == ndests)
+ {
+ bitset_copy (dests_ch[ndests], accepts);
+ err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
+ if (BE (err != REG_NOERROR, 0))
+ goto error_return;
+ ++ndests;
+ bitset_empty (accepts);
+ }
+ }
+ return ndests;
+ error_return:
+ /* Release every node set initialized so far. */
+ for (j = 0; j < ndests; ++j)
+ re_node_set_free (dests_node + j);
+ return -1;
+}
+
+#ifdef RE_ENABLE_I18N
+/* Check how many bytes the node `dfa->nodes[node_idx]' accepts at
+ index STR_IDX of INPUT, and return that number (0 if the node
+ accepts nothing there).
+
+ This function handles the nodes which can accept one multi-byte
+ character or one collating element, like '.' and '[a-z]' (node types
+ OP_UTF8_PERIOD, OP_PERIOD and COMPLEX_BRACKET), as opposed to the
+ other nodes, which can only accept one byte; for any other node
+ type the result is 0. */
+
+static int
+check_node_accept_bytes (dfa, node_idx, input, str_idx)
+ re_dfa_t *dfa;
+ int node_idx, str_idx;
+ const re_string_t *input;
+{
+ const re_token_t *node = dfa->nodes + node_idx;
+ int char_len, elem_len;
+ int i;
+
+ /* '.' in a UTF-8 locale: validate the byte sequence by hand. */
+ if (BE (node->type == OP_UTF8_PERIOD, 0))
+ {
+ unsigned char c = re_string_byte_at (input, str_idx), d;
+ /* Bytes below 0xc2 do not start a multi-byte sequence accepted
+ here. */
+ if (BE (c < 0xc2, 1))
+ return 0;
+
+ /* At least two bytes of input are required. */
+ if (str_idx + 2 > input->len)
+ return 0;
+
+ d = re_string_byte_at (input, str_idx + 1);
+ /* Two-byte sequence: the second byte must be in 0x80..0xbf. */
+ if (c < 0xe0)
+ return (d < 0x80 || d > 0xbf) ? 0 : 2;
+ /* Longer sequences: pick the length from the lead byte and reject
+ a second byte below the minimum for the smallest lead byte of
+ each length (an overlong encoding). */
+ else if (c < 0xf0)
+ {
+ char_len = 3;
+ if (c == 0xe0 && d < 0xa0)
+ return 0;
+ }
+ else if (c < 0xf8)
+ {
+ char_len = 4;
+ if (c == 0xf0 && d < 0x90)
+ return 0;
+ }
+ else if (c < 0xfc)
+ {
+ char_len = 5;
+ if (c == 0xf8 && d < 0x88)
+ return 0;
+ }
+ else if (c < 0xfe)
+ {
+ char_len = 6;
+ if (c == 0xfc && d < 0x84)
+ return 0;
+ }
+ else
+ return 0;
+
+ /* The whole sequence must fit into the input. */
+ if (str_idx + char_len > input->len)
+ return 0;
+
+ /* Every trailing byte must be in 0x80..0xbf. */
+ for (i = 1; i < char_len; ++i)
+ {
+ d = re_string_byte_at (input, str_idx + i);
+ if (d < 0x80 || d > 0xbf)
+ return 0;
+ }
+ return char_len;
+ }
+
+ char_len = re_string_char_size_at (input, str_idx);
+ if (node->type == OP_PERIOD)
+ {
+ /* Only characters longer than one byte are accepted here. */
+ if (char_len <= 1)
+ return 0;
+ /* FIXME: I don't think this if is needed, as both '\n'
+ and '\0' are char_len == 1. */
+ /* '.' accepts any one character except the following two cases. */
+ if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
+ re_string_byte_at (input, str_idx) == '\n') ||
+ ((dfa->syntax & RE_DOT_NOT_NULL) &&
+ re_string_byte_at (input, str_idx) == '\0'))
+ return 0;
+ return char_len;
+ }
+
+ /* Nothing to do unless the character or the collating element at
+ STR_IDX is longer than one byte. */
+ elem_len = re_string_elem_size_at (input, str_idx);
+ if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
+ return 0;
+
+ if (node->type == COMPLEX_BRACKET)
+ {
+ const re_charset_t *cset = node->opr.mbcset;
+# ifdef _LIBC
+ const unsigned char *pin
+ = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
+ int j;
+ uint32_t nrules;
+# endif /* _LIBC */
+ int match_len = 0;
+ /* The wide character is fetched only when one of the checks below
+ needs it. */
+ wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
+ ? re_string_wchar_at (input, str_idx) : 0);
+
+ /* match with multibyte character? */
+ for (i = 0; i < cset->nmbchars; ++i)
+ if (wc == cset->mbchars[i])
+ {
+ match_len = char_len;
+ goto check_node_accept_bytes_match;
+ }
+ /* match with character_class? */
+ for (i = 0; i < cset->nchar_classes; ++i)
+ {
+ wctype_t wt = cset->char_classes[i];
+ if (__iswctype (wc, wt))
+ {
+ match_len = char_len;
+ goto check_node_accept_bytes_match;
+ }
+ }
+
+# ifdef _LIBC
+ nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
+ if (nrules != 0)
+ {
+ unsigned int in_collseq = 0;
+ const int32_t *table, *indirect;
+ const unsigned char *weights, *extra;
+ const char *collseqwc;
+ int32_t idx;
+ /* This #include defines a local function! */
+# include <locale/weight.h>
+
+ /* match with collating_symbol? */
+ if (cset->ncoll_syms)
+ extra = (const unsigned char *)
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
+ for (i = 0; i < cset->ncoll_syms; ++i)
+ {
+ /* The first byte of the entry is its length; the bytes of the
+ collating element follow. */
+ const unsigned char *coll_sym = extra + cset->coll_syms[i];
+ /* Compare the length of input collating element and
+ the length of current collating element. */
+ if (*coll_sym != elem_len)
+ continue;
+ /* Compare each bytes. */
+ for (j = 0; j < *coll_sym; j++)
+ if (pin[j] != coll_sym[1 + j])
+ break;
+ if (j == *coll_sym)
+ {
+ /* Match if every bytes is equal. */
+ match_len = j;
+ goto check_node_accept_bytes_match;
+ }
+ }
+
+ /* Compute the collation sequence value of the input: from the
+ wide character when the element is no longer than the
+ character, otherwise from the collating element bytes. */
+ if (cset->nranges)
+ {
+ if (elem_len <= char_len)
+ {
+ collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
+ in_collseq = __collseq_table_lookup (collseqwc, wc);
+ }
+ else
+ in_collseq = find_collation_sequence_value (pin, elem_len);
+ }
+ /* match with range expression? */
+ for (i = 0; i < cset->nranges; ++i)
+ if (cset->range_starts[i] <= in_collseq
+ && in_collseq <= cset->range_ends[i])
+ {
+ match_len = elem_len;
+ goto check_node_accept_bytes_match;
+ }
+
+ /* match with equivalence_class? */
+ if (cset->nequiv_classes)
+ {
+ const unsigned char *cp = pin;
+ table = (const int32_t *)
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
+ weights = (const unsigned char *)
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
+ extra = (const unsigned char *)
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
+ indirect = (const int32_t *)
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
+ idx = findidx (&cp);
+ if (idx > 0)
+ for (i = 0; i < cset->nequiv_classes; ++i)
+ {
+ int32_t equiv_class_idx = cset->equiv_classes[i];
+ size_t weight_len = weights[idx];
+ /* Equivalent when both weight entries have the same
+ length and the bytes that follow are equal. */
+ if (weight_len == weights[equiv_class_idx])
+ {
+ int cnt = 0;
+ while (cnt <= weight_len
+ && (weights[equiv_class_idx + 1 + cnt]
+ == weights[idx + 1 + cnt]))
+ ++cnt;
+ if (cnt > weight_len)
+ {
+ match_len = elem_len;
+ goto check_node_accept_bytes_match;
+ }
+ }
+ }
+ }
+ }
+ else
+# endif /* _LIBC */
+ {
+ /* match with range expression? */
+ /* cmp_buf holds three one-character wide strings at offsets 0, 2
+ and 4: range start, the input character, and range end. */
+#if __GNUC__ >= 2
+ wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
+#else
+ wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
+ cmp_buf[2] = wc;
+#endif
+ for (i = 0; i < cset->nranges; ++i)
+ {
+ cmp_buf[0] = cset->range_starts[i];
+ cmp_buf[4] = cset->range_ends[i];
+ if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
+ && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
+ {
+ match_len = char_len;
+ goto check_node_accept_bytes_match;
+ }
+ }
+ }
+ check_node_accept_bytes_match:
+ /* For a set flagged `non_match' the result is inverted: a match
+ yields 0, and no match yields the longer of the element and
+ character lengths. */
+ if (!cset->non_match)
+ return match_len;
+ else
+ {
+ if (match_len > 0)
+ return 0;
+ else
+ return (elem_len > char_len) ? elem_len : char_len;
+ }
+ }
+ /* Any other node type accepts nothing here. */
+ return 0;
+}
+
+# ifdef _LIBC
+/* Look up the collation sequence value of the byte sequence MBS of
+   length MBS_LEN.  When the locale has no collation rules, only a
+   single byte can be looked up (through the COLLSEQMB table).
+   Otherwise the SYMB_EXTRAMB table is walked entry by entry until a
+   collating element with the same bytes is found.  Return UINT_MAX when
+   nothing matches.  */
+static unsigned int
+find_collation_sequence_value (mbs, mbs_len)
+     const unsigned char *mbs;
+     size_t mbs_len;
+{
+  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
+  const unsigned char *extra;
+  int32_t extrasize;
+  int32_t idx;
+
+  if (nrules == 0)
+    {
+      const unsigned char *collseq;
+
+      if (mbs_len != 1)
+        return UINT_MAX;
+      /* No valid character.  Match it as a single byte character.  */
+      collseq = (const unsigned char *)
+        _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
+      return collseq[mbs[0]];
+    }
+
+  extra = (const unsigned char *)
+    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
+  extrasize = (const unsigned char *)
+    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
+
+  idx = 0;
+  while (idx < extrasize)
+    {
+      int pos, matched = 0;
+      int32_t elem_mbs_len;
+
+      /* Step over the name of the collating element.  */
+      idx = idx + extra[idx] + 1;
+      /* Next comes the length of its byte sequence.  */
+      elem_mbs_len = extra[idx];
+      ++idx;
+      if (mbs_len == elem_mbs_len)
+        {
+          pos = 0;
+          while (pos < elem_mbs_len && extra[idx + pos] == mbs[pos])
+            ++pos;
+          /* Every byte compared equal: this is the entry.  */
+          if (pos == elem_mbs_len)
+            matched = 1;
+        }
+      /* Step over the byte sequence of the collating element.  */
+      idx += elem_mbs_len;
+      /* Round up for the alignment.  */
+      idx = (idx + 3) & ~3;
+      /* Step over the collation sequence value.  */
+      idx += sizeof (uint32_t);
+      /* Step over the wide char sequence of the collating element.  */
+      idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
+      /* The sequence value of a matching entry is stored right here.  */
+      if (matched)
+        return *(uint32_t *) (extra + idx);
+      /* Step over the collation sequence value.  */
+      idx += sizeof (uint32_t);
+    }
+  return UINT_MAX;
+}
+# endif /* _LIBC */
+#endif /* RE_ENABLE_I18N */
+
+/* Decide whether NODE accepts the single byte found at index IDX of the
+   input held in MCTX.  Return 1 when the byte is accepted and, if the
+   node carries a constraint, the context at IDX satisfies it; return 0
+   otherwise.  Only CHARACTER, SIMPLE_BRACKET, OP_PERIOD and (with
+   RE_ENABLE_I18N) OP_UTF8_PERIOD nodes can accept a byte.  */
+
+static int
+check_node_accept (mctx, node, idx)
+     const re_match_context_t *mctx;
+     const re_token_t *node;
+     int idx;
+{
+  const unsigned char byte = re_string_byte_at (&mctx->input, idx);
+  unsigned int context;
+
+  switch (node->type)
+    {
+    case CHARACTER:
+      if (byte == node->opr.c)
+        break;
+      return 0;
+
+    case SIMPLE_BRACKET:
+      if (bitset_contain (node->opr.sbcset, byte))
+        break;
+      return 0;
+
+#ifdef RE_ENABLE_I18N
+    case OP_UTF8_PERIOD:
+      /* Bytes from 0x80 upwards are not accepted as a single byte.  */
+      if (byte >= 0x80)
+        return 0;
+      /* FALLTHROUGH */
+#endif
+    case OP_PERIOD:
+      /* '.' rejects newline and NUL when the syntax bits say so.  */
+      if (byte == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
+        return 0;
+      if (byte == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL))
+        return 0;
+      break;
+
+    default:
+      return 0;
+    }
+
+  /* Without a constraint the byte is accepted unconditionally.  */
+  if (!node->constraint)
+    return 1;
+
+  /* Otherwise the context at IDX has to satisfy the constraint.  */
+  context = re_string_context_at (&mctx->input, idx, mctx->eflags);
+  if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
+    return 0;
+  return 1;
+}
+
+/* Grow the buffers of MCTX once they have run out: request string
+   buffers of twice the current length, resize the state log (when there
+   is one) to the resulting buffer length plus one, and rebuild the
+   buffer contents.  Return REG_NOERROR on success, REG_ESPACE if the
+   state log cannot be reallocated, or the error reported by
+   re_string_realloc_buffers or build_wcs_upper_buffer.  */
+
+static reg_errcode_t
+extend_buffers (mctx)
+     re_match_context_t *mctx;
+{
+  re_string_t *input = &mctx->input;
+  reg_errcode_t status;
+
+  /* Double the lengths of the buffers.  */
+  status = re_string_realloc_buffers (input, input->bufs_len * 2);
+  if (BE (status != REG_NOERROR, 0))
+    return status;
+
+  if (mctx->state_log != NULL)
+    {
+      /* Resize state_log to match the new buffer length.  */
+      /* XXX We have no indication of the size of this buffer.  If this
+         allocation fail we have no indication that the state_log array
+         does not have the right size.  */
+      re_dfastate_t **grown = re_realloc (mctx->state_log, re_dfastate_t *,
+                                          input->bufs_len + 1);
+      if (BE (grown == NULL, 0))
+        return REG_ESPACE;
+      mctx->state_log = grown;
+    }
+
+  /* Then reconstruct the buffers.  The case-sensitive variant is handled
+     first and returns early.  */
+  if (!input->icase)
+    {
+#ifdef RE_ENABLE_I18N
+      if (input->mb_cur_max > 1)
+        build_wcs_buffer (input);
+      else
+#endif /* RE_ENABLE_I18N */
+        {
+          if (input->trans != NULL)
+            re_string_translate_buffer (input);
+        }
+      return REG_NOERROR;
+    }
+
+#ifdef RE_ENABLE_I18N
+  if (input->mb_cur_max > 1)
+    {
+      status = build_wcs_upper_buffer (input);
+      if (BE (status != REG_NOERROR, 0))
+        return status;
+      return REG_NOERROR;
+    }
+#endif /* RE_ENABLE_I18N */
+  build_upper_buffer (input);
+  return REG_NOERROR;
+}
+
+
+/* Functions for matching context. */
+
+/* Set up MCTX for matching with flags EFLAGS.  When N is positive, room
+   for N backreference cache entries and N sub-match tops is allocated;
+   REG_ESPACE is returned if either allocation fails.  Otherwise the
+   capacities are recorded and REG_NOERROR is returned.  */
+
+static reg_errcode_t
+match_ctx_init (mctx, eflags, n)
+     re_match_context_t *mctx;
+     int eflags, n;
+{
+  mctx->match_last = -1;
+  mctx->eflags = eflags;
+
+  if (n > 0)
+    {
+      mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
+      mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
+      if (BE (mctx->sub_tops == NULL || mctx->bkref_ents == NULL, 0))
+        return REG_ESPACE;
+    }
+  /* No else branch: BKREF_ENTS, NBKREF_ENTS and NSUB_TOPS are already
+     zero-ed by the caller.  */
+
+  mctx->max_mb_elem_len = 1;
+  mctx->asub_tops = n;
+  mctx->abkref_ents = n;
+  return REG_NOERROR;
+}
+
+/* Drop everything in MCTX that depends on the current input: every
+   registered sub-match top together with its lasts and paths is freed,
+   and the sub-top and backreference entry counts are reset to zero.
+   This function must be invoked when the matcher changes the start
+   index of the input, or changes the input string.  */
+
+static void
+match_ctx_clean (mctx)
+     re_match_context_t *mctx;
+{
+  int top_no;
+
+  for (top_no = 0; top_no < mctx->nsub_tops; ++top_no)
+    {
+      re_sub_match_top_t *subtop = mctx->sub_tops[top_no];
+      int last_no = 0;
+
+      /* Free each recorded last and the path array it carries.  */
+      while (last_no < subtop->nlasts)
+        {
+          re_sub_match_last_t *sublast = subtop->lasts[last_no];
+          re_free (sublast->path.array);
+          re_free (sublast);
+          ++last_no;
+        }
+      re_free (subtop->lasts);
+
+      if (subtop->path)
+        {
+          re_free (subtop->path->array);
+          re_free (subtop->path);
+        }
+      free (subtop);
+    }
+
+  mctx->nbkref_ents = 0;
+  mctx->nsub_tops = 0;
+}
+
+/* Release every allocation owned by MCTX: the per-input entries (via
+   match_ctx_clean), then the backreference cache and the array of
+   sub-match tops.  */
+
+static void
+match_ctx_free (mctx)
+     re_match_context_t *mctx;
+{
+  /* The entries reachable through MCTX->SUB_TOPS must go first, while
+     the array that points at them is still alive.  */
+  match_ctx_clean (mctx);
+  re_free (mctx->bkref_ents);
+  re_free (mctx->sub_tops);
+}
+
+/* Add a new backreference entry to MCTX, recording NODE, STR_IDX and the
+   sub-expression bounds FROM and TO.
+   Note that we assume that the caller never calls this function with a
+   duplicate entry, and calls it with a STR_IDX which isn't smaller than
+   that of any existing entry.
+
+   Return REG_NOERROR on success, or REG_ESPACE if the cache is full and
+   cannot be grown.  On failure the existing cache is left untouched and
+   still owned by MCTX, so that match_ctx_free releases it exactly
+   once.  */
+
+static reg_errcode_t
+match_ctx_add_entry (mctx, node, str_idx, from, to)
+     re_match_context_t *mctx;
+     int node, str_idx, from, to;
+{
+  if (mctx->nbkref_ents >= mctx->abkref_ents)
+    {
+      /* The cache is full: double its capacity.  */
+      struct re_backref_cache_entry* new_entry;
+      new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
+                              mctx->abkref_ents * 2);
+      if (BE (new_entry == NULL, 0))
+        {
+          /* A failed reallocation leaves the old block valid.  It must
+             not be freed here: MCTX->BKREF_ENTS would be left dangling
+             and match_ctx_free would free it a second time.  This also
+             matches match_ctx_add_subtop, which keeps its old array on
+             failure.  */
+          return REG_ESPACE;
+        }
+      mctx->bkref_ents = new_entry;
+      /* Clear the newly added second half of the array.  */
+      memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
+              sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
+      mctx->abkref_ents *= 2;
+    }
+  /* Flag the previous entry when it shares the same string index, so
+     that a reader knows more entries for this index follow.  */
+  if (mctx->nbkref_ents > 0
+      && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
+    mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
+
+  mctx->bkref_ents[mctx->nbkref_ents].node = node;
+  mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
+  mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
+  mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
+
+  /* This is a cache that saves negative results of check_dst_limits_calc_pos.
+     If bit N is clear, means that this entry won't epsilon-transition to
+     an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
+     it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
+     such node.
+
+     A backreference does not epsilon-transition unless it is empty, so set
+     to all zeros if FROM != TO.  */
+  mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
+    = (from == to ? ~0 : 0);
+
+  mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
+  /* Keep track of the longest span recorded so far.  */
+  if (mctx->max_mb_elem_len < to - from)
+    mctx->max_mb_elem_len = to - from;
+  return REG_NOERROR;
+}
+
+/* Binary-search MCTX->BKREF_ENTS, which is sorted by STR_IDX, for the
+   first entry whose string index equals STR_IDX.  Return the index of
+   that entry, or -1 when there is none.  */
+
+static int
+search_cur_bkref_entry (mctx, str_idx)
+     re_match_context_t *mctx;
+     int str_idx;
+{
+  const int count = mctx->nbkref_ents;
+  int lo = 0;
+  int hi = count;
+
+  /* Narrow [lo, hi) down to the first entry not smaller than STR_IDX.  */
+  while (lo < hi)
+    {
+      int mid = (lo + hi) / 2;
+      if (mctx->bkref_ents[mid].str_idx >= str_idx)
+        hi = mid;
+      else
+        lo = mid + 1;
+    }
+
+  if (lo >= count || mctx->bkref_ents[lo].str_idx != str_idx)
+    return -1;
+  return lo;
+}
+
+/* Record in MCTX that NODE, an OP_OPEN_SUBEXP node, matched at STR_IDX.
+   The array of sub-match tops is doubled when it is full.  Return
+   REG_NOERROR on success, REG_ESPACE when memory runs out.  */
+
+static reg_errcode_t
+match_ctx_add_subtop (mctx, node, str_idx)
+     re_match_context_t *mctx;
+     int node, str_idx;
+{
+  re_sub_match_top_t *top;
+#ifdef DEBUG
+  assert (mctx->sub_tops != NULL);
+  assert (mctx->asub_tops > 0);
+#endif
+  if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
+    {
+      int doubled = mctx->asub_tops * 2;
+      re_sub_match_top_t **grown = re_realloc (mctx->sub_tops,
+                                               re_sub_match_top_t *,
+                                               doubled);
+      if (BE (grown == NULL, 0))
+        return REG_ESPACE;
+      mctx->asub_tops = doubled;
+      mctx->sub_tops = grown;
+    }
+
+  /* The slot is written even when the allocation fails; the count is
+     only advanced on success.  */
+  top = calloc (1, sizeof (re_sub_match_top_t));
+  mctx->sub_tops[mctx->nsub_tops] = top;
+  if (BE (top == NULL, 0))
+    return REG_ESPACE;
+
+  top->node = node;
+  top->str_idx = str_idx;
+  ++mctx->nsub_tops;
+  return REG_NOERROR;
+}
+
+/* Record under SUBTOP (the entry of the corresponding OP_OPEN_SUBEXP)
+   that NODE, an OP_CLOSE_SUBEXP node, matched at STR_IDX.  The array of
+   lasts grows to twice its size plus one when it is full.  Return the
+   new entry, or NULL when memory runs out.  */
+
+static re_sub_match_last_t *
+match_ctx_add_sublast (subtop, node, str_idx)
+     re_sub_match_top_t *subtop;
+     int node, str_idx;
+{
+  re_sub_match_last_t *entry;
+
+  if (BE (subtop->nlasts == subtop->alasts, 0))
+    {
+      int capacity = 2 * subtop->alasts + 1;
+      re_sub_match_last_t **grown = re_realloc (subtop->lasts,
+                                                re_sub_match_last_t *,
+                                                capacity);
+      if (BE (grown == NULL, 0))
+        return NULL;
+      subtop->alasts = capacity;
+      subtop->lasts = grown;
+    }
+
+  entry = calloc (1, sizeof (re_sub_match_last_t));
+  if (BE (entry == NULL, 0))
+    return NULL;
+
+  /* Fill in the zero-initialized entry and append it.  */
+  entry->str_idx = str_idx;
+  entry->node = node;
+  subtop->lasts[subtop->nlasts] = entry;
+  ++subtop->nlasts;
+  return entry;
+}
+
+/* Fill in SCTX: store the SIFTED_STS and LIMITED_STS state arrays along
+   with LAST_NODE and LAST_STR_IDX, and start with an empty LIMITS node
+   set.  */
+static void
+sift_ctx_init (sctx, sifted_sts, limited_sts, last_node, last_str_idx)
+     re_sift_context_t *sctx;
+     re_dfastate_t **sifted_sts, **limited_sts;
+     int last_node, last_str_idx;
+{
+  re_node_set_init_empty (&sctx->limits);
+  sctx->last_str_idx = last_str_idx;
+  sctx->last_node = last_node;
+  sctx->limited_states = limited_sts;
+  sctx->sifted_states = sifted_sts;
+}