1590 lines
42 KiB
C++
1590 lines
42 KiB
C++
/* ScummVM - Graphic Adventure Engine
|
|
*
|
|
* ScummVM is the legal property of its developers, whose names
|
|
* are too numerous to list here. Please refer to the COPYRIGHT
|
|
* file distributed with this source distribution.
|
|
*
|
|
* This program is free software: you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation, either version 3 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
*
|
|
*/
|
|
|
|
/*
|
|
Regular Expression Parser and Recognizer for TADS
|
|
Function
|
|
Parses and recognizes regular expressions
|
|
Notes
|
|
Regular expression syntax:
|
|
|
|
abc|def either abc or def
|
|
(abc) abc
|
|
abc+ abc, abcc, abccc, ...
|
|
abc* ab, abc, abcc, ...
|
|
abc? ab or abc
|
|
. any single character
|
|
abc$ abc at the end of the line
|
|
^abc abc at the beginning of the line
|
|
%^abc literally ^abc
|
|
[abcx-z] matches a, b, c, x, y, or z
|
|
[^abcx-z] matches any character except a, b, c, x, y, or z
|
|
[^]-q] matches any character except ], -, or q
|
|
|
|
Note that using ']' or '-' in a character range expression requires
|
|
special ordering. If ']' is to be used, it must be the first character
|
|
after the '^', if present, within the brackets. If '-' is to be used,
|
|
it must be the first character after the '^' and/or ']', if present.
|
|
|
|
'%' is used to escape the special characters: | . ( ) * ? + ^ $ % [
|
|
(We use '%' rather than a backslash because it's less trouble to
|
|
enter in a TADS string -- a backslash needs to be quoted with another
|
|
backslash, which is error-prone and hard to read. '%' doesn't need
|
|
any special quoting in a TADS string, which makes it a lot more
|
|
readable.)
|
|
|
|
In addition, '%' is used to introduce additional special sequences:
|
|
|
|
%1 text matching first parenthesized expression
|
|
%9 text matching ninth parenthesized experssion
|
|
%< matches at the beginning of a word only
|
|
%> matches at the end of a word only
|
|
%w matches any word character
|
|
%W matches any non-word character
|
|
%b matches at any word boundary (beginning or end of word)
|
|
%B matches except at a word boundary
|
|
|
|
For the word matching sequences, a word is any sequence of letters and
|
|
numbers.
|
|
*/
|
|
|
|
#include "glk/tads/tads2/regex.h"
|
|
#include "glk/tads/tads2/memory_cache_heap.h"
|
|
#include "common/util.h"
|
|
|
|
namespace Glk {
|
|
namespace TADS {
|
|
namespace TADS2 {
|
|
|
|
/**
|
|
* A "machine" (i.e., a finite state automaton) is a set of state
|
|
* transition tuples. A tuple has three elements: the state ID, the ID
|
|
* of the state that we transition to, and the condition for the
|
|
* transition. The condition is simply the character that we must match
|
|
* to make the transition, or a special distinguished symbol "epsilon,"
|
|
* which refers to a transition with no input character consumed.
|
|
*
|
|
* The primitive elements of our machines guarantee that we never have
|
|
* more than two transitions out of a particular state, so we can
|
|
* denormalize the representation of a state by storing the two possible
|
|
* tuples for that state in a single combined tuple. This has the
|
|
* performance advantage that we can use the state ID as an index into
|
|
* an array of state tuples.
|
|
*
|
|
* A particular machine always has a single initial and single final
|
|
* (successful) state, so we can define a machine by its initial and
|
|
* final state ID's.
|
|
*/
|
|
enum {
|
|
// the special symbol value for "epsilon"
|
|
RE_EPSILON = '\001',
|
|
|
|
// the special symbol value for a wildcard character
|
|
RE_WILDCARD = '\002',
|
|
|
|
// special symbol values for beginning and end of text
|
|
RE_TEXT_BEGIN = '\003',
|
|
RE_TEXT_END = '\004',
|
|
|
|
// special symbol values for start and end of a word
|
|
RE_WORD_BEGIN = '\005',
|
|
RE_WORD_END = '\006',
|
|
|
|
// special symbols for word-char and non-word-char
|
|
RE_WORD_CHAR = '\007',
|
|
RE_NON_WORD_CHAR = '\010',
|
|
|
|
// special symbols for word-boundary and non-word-boundary
|
|
RE_WORD_BOUNDARY = '\011',
|
|
RE_NON_WORD_BOUNDARY = '\012',
|
|
|
|
// special symbol for a character range/exclusion range
|
|
RE_RANGE = '\013',
|
|
RE_RANGE_EXCL = '\014',
|
|
|
|
// a range of special symbol values for group matchers
|
|
RE_GROUP_MATCH_0 = '\015',
|
|
RE_GROUP_MATCH_9 = (RE_GROUP_MATCH_0 + 9)
|
|
};
|
|
|
|
/* ------------------------------------------------------------------------ */
|
|
/*
|
|
* A machine description. Machines are fully described by their initial
|
|
* and final state ID's.
|
|
*/
|
|
struct re_machine {
|
|
/* the machine's initial state */
|
|
re_state_id init;
|
|
|
|
/* the machine's final state */
|
|
re_state_id final_state;
|
|
};
|
|
|
|
/* ------------------------------------------------------------------------ */
|
|
/*
|
|
* Initialize the context. The memory for the context structure itself
|
|
* is allocated and maintained by the caller.
|
|
*/
|
|
void re_init(re_context *ctx, errcxdef *errctx)
|
|
{
|
|
/* save the error context */
|
|
ctx->errctx = errctx;
|
|
|
|
/* no tuple array yet */
|
|
ctx->tuple_arr = nullptr;
|
|
ctx->tuples_alloc = 0;
|
|
|
|
/* clear states */
|
|
ctx->next_state = RE_STATE_FIRST_VALID;
|
|
|
|
/* clear groups */
|
|
ctx->cur_group = 0;
|
|
|
|
/* no string buffer yet */
|
|
ctx->strbuf = nullptr;
|
|
}
|
|
|
|
/* ------------------------------------------------------------------------ */
|
|
/*
|
|
* Reset compiler - clears states and tuples
|
|
*/
|
|
static void re_reset(re_context *ctx)
|
|
{
|
|
int i;
|
|
|
|
/* delete any range tables we've allocated */
|
|
for (i = 0 ; i < ctx->next_state ; ++i)
|
|
{
|
|
if (ctx->tuple_arr[i].char_range != nullptr)
|
|
{
|
|
mchfre(ctx->tuple_arr[i].char_range);
|
|
ctx->tuple_arr[i].char_range = nullptr;
|
|
}
|
|
}
|
|
|
|
/* clear states */
|
|
ctx->next_state = RE_STATE_FIRST_VALID;
|
|
|
|
/* clear groups */
|
|
ctx->cur_group = 0;
|
|
}
|
|
|
|
/* ------------------------------------------------------------------------ */
|
|
/*
|
|
* Delete the context - frees structures associated with the context.
|
|
* Does NOT free the memory used by the context structure itself.
|
|
*/
|
|
void re_delete(re_context *ctx)
|
|
{
|
|
/* reset state */
|
|
re_reset(ctx);
|
|
|
|
/* if we've allocated an array, delete it */
|
|
if (ctx->tuple_arr != nullptr)
|
|
{
|
|
mchfre(ctx->tuple_arr);
|
|
ctx->tuple_arr = nullptr;
|
|
}
|
|
|
|
/* if we allocated a string buffer, delete it */
|
|
if (ctx->strbuf != nullptr)
|
|
{
|
|
mchfre(ctx->strbuf);
|
|
ctx->strbuf = nullptr;
|
|
}
|
|
}
|
|
|
|
/* ------------------------------------------------------------------------ */
|
|
/*
|
|
* Allocate a new state ID
|
|
*/
|
|
static re_state_id re_alloc_state(re_context *ctx)
|
|
{
|
|
/*
|
|
* If we don't have enough room for another state, expand the array
|
|
*/
|
|
if (ctx->next_state >= ctx->tuples_alloc)
|
|
{
|
|
uint new_alloc;
|
|
|
|
/* bump the size by a bit */
|
|
new_alloc = ctx->tuples_alloc + 100;
|
|
|
|
/* allocate or expand the array */
|
|
if (ctx->tuples_alloc == 0)
|
|
{
|
|
/* allocate the initial memory block */
|
|
ctx->tuple_arr =
|
|
(re_tuple *)mchalo(ctx->errctx,
|
|
(new_alloc * sizeof(re_tuple)),
|
|
"regex");
|
|
}
|
|
else
|
|
{
|
|
re_tuple *ptr;
|
|
|
|
/* allocate a new memory block */
|
|
ptr = (re_tuple *)mchalo(ctx->errctx,
|
|
(new_alloc * sizeof(re_tuple)),
|
|
"regex");
|
|
|
|
/* copy the old memory to the new memory */
|
|
memcpy(ptr, ctx->tuple_arr, ctx->tuples_alloc * sizeof(re_tuple));
|
|
|
|
/* free the old block */
|
|
mchfre(ctx->tuple_arr);
|
|
|
|
/* use the new block */
|
|
ctx->tuple_arr = ptr;
|
|
}
|
|
|
|
/* remember the new allocation size */
|
|
ctx->tuples_alloc = new_alloc;
|
|
}
|
|
|
|
/* initialize the next state */
|
|
ctx->tuple_arr[ctx->next_state].next_state_1 = RE_STATE_INVALID;
|
|
ctx->tuple_arr[ctx->next_state].next_state_2 = RE_STATE_INVALID;
|
|
ctx->tuple_arr[ctx->next_state].ch = RE_EPSILON;
|
|
ctx->tuple_arr[ctx->next_state].flags = 0;
|
|
ctx->tuple_arr[ctx->next_state].char_range = nullptr;
|
|
|
|
/* return the new state's ID */
|
|
return ctx->next_state++;
|
|
}
|
|
|
|
|
|
/* ------------------------------------------------------------------------ */
|
|
/*
|
|
* Set a transition from a state to a given destination state.
|
|
*/
|
|
static void re_set_trans(re_context *ctx,
|
|
re_state_id id, re_state_id dest_id, char ch)
|
|
{
|
|
re_tuple *tuple;
|
|
|
|
/*
|
|
* get the tuple containing the transitions for this state ID - the
|
|
* state ID is the index of the state's transition tuple in the
|
|
* array
|
|
*/
|
|
tuple = &ctx->tuple_arr[id];
|
|
|
|
/*
|
|
* If the first state pointer hasn't been set yet, set it to the new
|
|
* destination. Otherwise, set the second state pointer.
|
|
*
|
|
* Only set the character on setting the first state. When setting
|
|
* the second state, we must assume that the character for the state
|
|
* has already been set, since any given state can have only one
|
|
* character setting.
|
|
*/
|
|
if (tuple->next_state_1 == RE_STATE_INVALID)
|
|
{
|
|
/*
|
|
* set the character ID, unless the state has been marked with a
|
|
* special flag which indicates that the character value has
|
|
* another meaning (in particular, a group marker)
|
|
*/
|
|
if (!(tuple->flags & (RE_STATE_GROUP_BEGIN | RE_STATE_GROUP_END)))
|
|
tuple->ch = ch;
|
|
|
|
/* set the first transition */
|
|
tuple->next_state_1 = dest_id;
|
|
}
|
|
else
|
|
{
|
|
/* set only the second transition state - don't set the character */
|
|
tuple->next_state_2 = dest_id;
|
|
}
|
|
}
|
|
|
|
/* ------------------------------------------------------------------------ */
|
|
/*
|
|
* Initialize a new machine, giving it an initial and final state
|
|
*/
|
|
static void re_init_machine(re_context *ctx, re_machine *machine)
|
|
{
|
|
machine->init = re_alloc_state(ctx);
|
|
machine->final_state = re_alloc_state(ctx);
|
|
}
|
|
|
|
/*
|
|
* Build a character recognizer
|
|
*/
|
|
static void re_build_char(re_context *ctx, re_machine *machine, char ch)
|
|
{
|
|
/* initialize our new machine */
|
|
re_init_machine(ctx, machine);
|
|
|
|
/* allocate a transition tuple for the new state */
|
|
re_set_trans(ctx, machine->init, machine->final_state, ch);
|
|
}
|
|
|
|
/*
|
|
* Build a character range recognizer. 'range' is a 256-bit (32-byte)
|
|
* bit vector.
|
|
*/
|
|
static void re_build_char_range(re_context *ctx, re_machine *machine,
|
|
unsigned char *range, int exclusion)
|
|
{
|
|
unsigned char *range_copy;
|
|
|
|
/* initialize our new machine */
|
|
re_init_machine(ctx, machine);
|
|
|
|
/* allocate a transition table for the new state */
|
|
re_set_trans(ctx, machine->init, machine->final_state,
|
|
(char)(exclusion ? RE_RANGE_EXCL : RE_RANGE));
|
|
|
|
/* allocate a copy of the range bit vector */
|
|
range_copy = (unsigned char *)mchalo(ctx->errctx, 32, "regex range");
|
|
|
|
/* copy the caller's range */
|
|
memcpy(range_copy, range, 32);
|
|
|
|
/* store it in the tuple */
|
|
ctx->tuple_arr[machine->init].char_range = range_copy;
|
|
}
|
|
|
|
|
|
/*
|
|
* Build a group recognizer. This is almost the same as a character
|
|
* recognizer, but matches a previous group rather than a literal
|
|
* character.
|
|
*/
|
|
static void re_build_group_matcher(re_context *ctx,
|
|
re_machine *machine, int group_num)
|
|
{
|
|
/* initialize our new machine */
|
|
re_init_machine(ctx, machine);
|
|
|
|
/*
|
|
* Allocate a transition tuple for the new state, using the group ID
|
|
* as the character code. Store the special code for a group
|
|
* recognizer rather than the normal literal character code.
|
|
*/
|
|
re_set_trans(ctx, machine->init, machine->final_state,
|
|
(char)(group_num + RE_GROUP_MATCH_0));
|
|
}
|
|
|
|
|
|
/*
|
|
* Build a concatenation recognizer
|
|
*/
|
|
static void re_build_concat(re_context *ctx, re_machine *new_machine,
|
|
re_machine *lhs, re_machine *rhs)
|
|
{
|
|
/* initialize the new machine */
|
|
re_init_machine(ctx, new_machine);
|
|
|
|
/*
|
|
* set up an epsilon transition from the new machine's initial state
|
|
* to the first submachine's initial state
|
|
*/
|
|
re_set_trans(ctx, new_machine->init, lhs->init, RE_EPSILON);
|
|
|
|
/*
|
|
* Set up an epsilon transition from the first submachine's final
|
|
* state to the second submachine's initial state
|
|
*/
|
|
re_set_trans(ctx, lhs->final_state, rhs->init, RE_EPSILON);
|
|
|
|
/*
|
|
* Set up an epsilon transition from the second submachine's final
|
|
* state to our new machine's final state
|
|
*/
|
|
re_set_trans(ctx, rhs->final_state, new_machine->final_state, RE_EPSILON);
|
|
}
|
|
|
|
/*
|
|
* Build a group machine. sub_machine contains the machine that
|
|
* expresses the group's contents; we'll fill in new_machine with a
|
|
* newly-created machine that encloses and marks the group.
|
|
*/
|
|
static void re_build_group(re_context *ctx, re_machine *new_machine,
|
|
re_machine *sub_machine, int group_id)
|
|
{
|
|
/* initialize the container machine */
|
|
re_init_machine(ctx, new_machine);
|
|
|
|
/*
|
|
* set up an epsilon transition from the new machine's initial state
|
|
* into the initial state of the group, and another transition from
|
|
* the group's final state into the container's final state
|
|
*/
|
|
re_set_trans(ctx, new_machine->init, sub_machine->init, RE_EPSILON);
|
|
re_set_trans(ctx, sub_machine->final_state, new_machine->final_state, RE_EPSILON);
|
|
|
|
/*
|
|
* Mark the initial and final states of the group machine as being
|
|
* group markers.
|
|
*/
|
|
ctx->tuple_arr[new_machine->init].flags |= RE_STATE_GROUP_BEGIN;
|
|
ctx->tuple_arr[new_machine->final_state].flags |= RE_STATE_GROUP_END;
|
|
|
|
/* store the group ID in the 'ch' member of the start and end states */
|
|
ctx->tuple_arr[new_machine->init].ch = group_id;
|
|
ctx->tuple_arr[new_machine->final_state].ch = group_id;
|
|
}
|
|
|
|
/*
|
|
* Build an alternation recognizer
|
|
*/
|
|
static void re_build_alter(re_context *ctx, re_machine *new_machine,
|
|
re_machine *lhs, re_machine *rhs)
|
|
{
|
|
/* initialize the new machine */
|
|
re_init_machine(ctx, new_machine);
|
|
|
|
/*
|
|
* Set up an epsilon transition from our new machine's initial state
|
|
* to the initial state of each submachine
|
|
*/
|
|
re_set_trans(ctx, new_machine->init, lhs->init, RE_EPSILON);
|
|
re_set_trans(ctx, new_machine->init, rhs->init, RE_EPSILON);
|
|
|
|
/*
|
|
* Set up an epsilon transition from the final state of each
|
|
* submachine to our final state
|
|
*/
|
|
re_set_trans(ctx, lhs->final_state, new_machine->final_state, RE_EPSILON);
|
|
re_set_trans(ctx, rhs->final_state, new_machine->final_state, RE_EPSILON);
|
|
}
|
|
|
|
/*
|
|
* Build a closure recognizer
|
|
*/
|
|
static void re_build_closure(re_context *ctx,
|
|
re_machine *new_machine, re_machine *sub,
|
|
char specifier)
|
|
{
|
|
/* initialize the new machine */
|
|
re_init_machine(ctx, new_machine);
|
|
|
|
/*
|
|
* set up an epsilon transition from our initial state to the
|
|
* submachine's initial state, and from the submachine's final state
|
|
* to our final state
|
|
*/
|
|
re_set_trans(ctx, new_machine->init, sub->init, RE_EPSILON);
|
|
re_set_trans(ctx, sub->final_state, new_machine->final_state, RE_EPSILON);
|
|
|
|
/*
|
|
* If this is an unbounded closure ('*' or '+', but not '?'), set up
|
|
* the loop transition that takes us from the new machine's final
|
|
* state back to its initial state. We don't do this on the
|
|
* zero-or-one closure, because we can only match the expression
|
|
* once.
|
|
*/
|
|
if (specifier != '?')
|
|
re_set_trans(ctx, sub->final_state, sub->init, RE_EPSILON);
|
|
|
|
/*
|
|
* If this is a zero-or-one closure or a zero-or-more closure, set
|
|
* up an epsilon transition from our initial state to our final
|
|
* state, since we can skip the entire subexpression. We don't do
|
|
* this on the one-or-more closure, because we can't skip the
|
|
* subexpression in this case.
|
|
*/
|
|
if (specifier != '+')
|
|
re_set_trans(ctx, new_machine->init, new_machine->final_state, RE_EPSILON);
|
|
}
|
|
|
|
/*
|
|
* Build a null machine
|
|
*/
|
|
static void re_build_null_machine(re_context *ctx, re_machine *machine)
|
|
{
|
|
machine->init = machine->final_state = RE_STATE_INVALID;
|
|
}
|
|
|
|
/* ------------------------------------------------------------------------ */
|
|
/*
|
|
* Determine if a machine is null
|
|
*/
|
|
static int re_is_machine_null(re_context *ctx, re_machine *machine)
|
|
{
|
|
return (machine->init == RE_STATE_INVALID);
|
|
}
|
|
|
|
|
|
/* ------------------------------------------------------------------------ */
|
|
/*
|
|
* Concatenate the second machine onto the first machine, replacing the
|
|
* first machine with the resulting machine. If the first machine is a
|
|
* null machine (created with re_build_null_machine), we'll simply copy
|
|
* the second machine into the first.
|
|
*/
|
|
static void re_concat_onto(re_context *ctx,
|
|
re_machine *dest, re_machine *rhs)
|
|
{
|
|
/* check for a null destination machine */
|
|
if (re_is_machine_null(ctx, dest))
|
|
{
|
|
/*
|
|
* the first machine is null - simply copy the second machine
|
|
* onto the first unchanged
|
|
*/
|
|
*dest = *rhs;
|
|
}
|
|
else
|
|
{
|
|
re_machine new_machine;
|
|
|
|
/* build the concatenated machine */
|
|
re_build_concat(ctx, &new_machine, dest, rhs);
|
|
|
|
/* copy the concatenated machine onto the first machine */
|
|
*dest = new_machine;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Alternate the second machine onto the first machine, replacing the
|
|
* first machine with the resulting machine. If the first machine is a
|
|
* null machine, this simply replaces the first machine with the second
|
|
* machine. If the second machine is null, this simply leaves the first
|
|
* machine unchanged.
|
|
*/
|
|
static void re_alternate_onto(re_context *ctx,
|
|
re_machine *dest, re_machine *rhs)
|
|
{
|
|
/* check to see if the first machine is null */
|
|
if (re_is_machine_null(ctx, dest))
|
|
{
|
|
/*
|
|
* the first machine is null - simply copy the second machine
|
|
* onto the first
|
|
*/
|
|
*dest = *rhs;
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
* if the second machine is null, don't do anything; otherwise,
|
|
* build the alternation
|
|
*/
|
|
if (!re_is_machine_null(ctx, rhs))
|
|
{
|
|
re_machine new_machine;
|
|
|
|
/* build the alternation */
|
|
re_build_alter(ctx, &new_machine, dest, rhs);
|
|
|
|
/* replace the first machine with the alternation */
|
|
*dest = new_machine;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* ------------------------------------------------------------------------ */
|
|
/*
|
|
* Set a bit in a bit vector.
|
|
*/
|
|
#define re_set_bit(set, bit) \
|
|
(((unsigned char *)(set))[(bit) >> 3] |= (1 << ((bit) & 7)))
|
|
|
|
/*
|
|
* Test a bit in a bit vector
|
|
*/
|
|
#define re_is_bit_set(set, bit) \
|
|
((((unsigned char *)(set))[(bit) >> 3] & (1 << ((bit) & 7))) != 0)
|
|
|
|
/* ------------------------------------------------------------------------ */
|
|
/*
|
|
* Compile an expression
|
|
*/
|
|
static re_status_t re_compile(re_context *ctx,
|
|
const char *expr, size_t exprlen,
|
|
re_machine *result_machine)
|
|
{
|
|
re_machine cur_machine;
|
|
re_machine alter_machine;
|
|
re_machine new_machine;
|
|
size_t group_stack_level;
|
|
struct
|
|
{
|
|
re_machine old_cur;
|
|
re_machine old_alter;
|
|
int group_id;
|
|
} group_stack[50];
|
|
|
|
/* reset everything */
|
|
re_reset(ctx);
|
|
|
|
/* start out with no current machine and no alternate machine */
|
|
re_build_null_machine(ctx, &cur_machine);
|
|
re_build_null_machine(ctx, &alter_machine);
|
|
|
|
/* nothing on the stack yet */
|
|
group_stack_level = 0;
|
|
|
|
/* loop until we run out of expression to parse */
|
|
for ( ; exprlen != 0 ; ++expr, --exprlen)
|
|
{
|
|
switch(*expr)
|
|
{
|
|
case '^':
|
|
/*
|
|
* beginning of line - if we're not at the beginning of the
|
|
* current expression (i.e., we already have some
|
|
* concatentations accumulated), treat it as an ordinary
|
|
* character
|
|
*/
|
|
if (!re_is_machine_null(ctx, &cur_machine))
|
|
goto normal_char;
|
|
|
|
/* build a new start-of-text recognizer */
|
|
re_build_char(ctx, &new_machine, RE_TEXT_BEGIN);
|
|
|
|
/*
|
|
* concatenate it onto the string - note that this can't
|
|
* have any postfix operators
|
|
*/
|
|
re_concat_onto(ctx, &cur_machine, &new_machine);
|
|
break;
|
|
|
|
case '$':
|
|
/*
|
|
* End of line specifier - if there's anything left after
|
|
* the '$' other than a close parens or alternation
|
|
* specifier, great it as a normal character
|
|
*/
|
|
if (exprlen > 1
|
|
&& (*(expr+1) != ')' && *(expr+1) != '|'))
|
|
goto normal_char;
|
|
|
|
/* build a new end-of-text recognizer */
|
|
re_build_char(ctx, &new_machine, RE_TEXT_END);
|
|
|
|
/*
|
|
* concatenate it onto the string - note that this can't
|
|
* have any postfix operators
|
|
*/
|
|
re_concat_onto(ctx, &cur_machine, &new_machine);
|
|
break;
|
|
|
|
case '(':
|
|
/*
|
|
* Add a nesting level. Push the current machine and
|
|
* alternate machines onto the group stack, and clear
|
|
* everything out for the new group.
|
|
*/
|
|
if (group_stack_level
|
|
> sizeof(group_stack)/sizeof(group_stack[0]))
|
|
{
|
|
/* we cannot proceed - return an error */
|
|
return RE_STATUS_GROUP_NESTING_TOO_DEEP;
|
|
}
|
|
|
|
/* save the current state on the stack */
|
|
group_stack[group_stack_level].old_cur = cur_machine;
|
|
group_stack[group_stack_level].old_alter = alter_machine;
|
|
|
|
/*
|
|
* Assign the group a group ID - groups are numbered in
|
|
* order of their opening (left) parentheses, so we want to
|
|
* assign a group number now. We won't actually need to
|
|
* know the group number until we get to the matching close
|
|
* paren, but we need to assign it now, so store it in the
|
|
* group stack.
|
|
*/
|
|
group_stack[group_stack_level].group_id = ctx->cur_group;
|
|
|
|
/* consume the group number */
|
|
ctx->cur_group++;
|
|
|
|
/* push the level */
|
|
++group_stack_level;
|
|
|
|
/* start the new group with empty machines */
|
|
re_build_null_machine(ctx, &cur_machine);
|
|
re_build_null_machine(ctx, &alter_machine);
|
|
break;
|
|
|
|
case ')':
|
|
/* if there's nothing on the stack, ignore this */
|
|
if (group_stack_level == 0)
|
|
break;
|
|
|
|
/* take a level off the stack */
|
|
--group_stack_level;
|
|
|
|
/*
|
|
* Remove a nesting level. If we have a pending alternate
|
|
* expression, build the alternation expression. This will
|
|
* leave the entire group expression in alter_machine,
|
|
* regardless of whether an alternation was in progress or
|
|
* not.
|
|
*/
|
|
re_alternate_onto(ctx, &alter_machine, &cur_machine);
|
|
|
|
/*
|
|
* Create a group machine that encloses the group and marks
|
|
* it with a group number. We assigned the group number
|
|
* when we parsed the open paren, so read that group number
|
|
* from the stack.
|
|
*
|
|
* Note that this will leave 'new_machine' with the entire
|
|
* group machine.
|
|
*/
|
|
re_build_group(ctx, &new_machine, &alter_machine,
|
|
group_stack[group_stack_level].group_id);
|
|
|
|
/*
|
|
* Pop the stack - restore the alternation and current
|
|
* machines that were in progress before the group started.
|
|
*/
|
|
cur_machine = group_stack[group_stack_level].old_cur;
|
|
alter_machine = group_stack[group_stack_level].old_alter;
|
|
|
|
/*
|
|
* Check the group expression (in new_machine) for postfix
|
|
* expressions
|
|
*/
|
|
goto apply_postfix;
|
|
|
|
case '|':
|
|
/*
|
|
* Start a new alternation. This ends the current
|
|
* alternation; if we have a previous pending alternate,
|
|
* build an alternation machine out of the previous
|
|
* alternate and the current machine and move that to the
|
|
* alternate; otherwise, simply move the current machine to
|
|
* the pending alternate.
|
|
*/
|
|
re_alternate_onto(ctx, &alter_machine, &cur_machine);
|
|
|
|
/*
|
|
* the alternation starts out with a blank slate, so null
|
|
* out the current machine
|
|
*/
|
|
re_build_null_machine(ctx, &cur_machine);
|
|
break;
|
|
|
|
case '%':
|
|
/*
|
|
* quoted character - skip the quote mark and see what we
|
|
* have
|
|
*/
|
|
++expr;
|
|
--exprlen;
|
|
|
|
/* check to see if we're at the end of the expression */
|
|
if (exprlen == 0)
|
|
{
|
|
/*
|
|
* end of the string - ignore it, but undo the extra
|
|
* increment of the expression index so that we exit the
|
|
* enclosing loop properly
|
|
*/
|
|
--expr;
|
|
++exprlen;
|
|
break;
|
|
}
|
|
|
|
/* see what we have */
|
|
switch(*expr)
|
|
{
|
|
case '1':
|
|
case '2':
|
|
case '3':
|
|
case '4':
|
|
case '5':
|
|
case '6':
|
|
case '7':
|
|
case '8':
|
|
case '9':
|
|
/* group match - build a new literal group recognizer */
|
|
re_build_group_matcher(ctx, &new_machine, (int)(*expr - '1'));
|
|
|
|
/* apply any postfix expression to the group recognizer */
|
|
goto apply_postfix;
|
|
|
|
case '<':
|
|
/* build a beginning-of-word recognizer */
|
|
re_build_char(ctx, &new_machine, RE_WORD_BEGIN);
|
|
|
|
/* it can't be postfixed - just concatenate it */
|
|
re_concat_onto(ctx, &cur_machine, &new_machine);
|
|
break;
|
|
|
|
case '>':
|
|
/* build an end-of-word recognizer */
|
|
re_build_char(ctx, &new_machine, RE_WORD_END);
|
|
|
|
/* it can't be postfixed - just concatenate it */
|
|
re_concat_onto(ctx, &cur_machine, &new_machine);
|
|
break;
|
|
|
|
case 'w':
|
|
/* word character */
|
|
re_build_char(ctx, &new_machine, RE_WORD_CHAR);
|
|
goto apply_postfix;
|
|
|
|
case 'W':
|
|
/* non-word character */
|
|
re_build_char(ctx, &new_machine, RE_NON_WORD_CHAR);
|
|
goto apply_postfix;
|
|
|
|
case 'b':
|
|
/* word boundary */
|
|
re_build_char(ctx, &new_machine, RE_WORD_BOUNDARY);
|
|
|
|
/* it can't be postfixed */
|
|
re_concat_onto(ctx, &cur_machine, &new_machine);
|
|
break;
|
|
|
|
case 'B':
|
|
/* not a word boundary */
|
|
re_build_char(ctx, &new_machine, RE_NON_WORD_BOUNDARY);
|
|
|
|
/* it can't be postfixed */
|
|
re_concat_onto(ctx, &cur_machine, &new_machine);
|
|
break;
|
|
|
|
default:
|
|
/* build a new literal character recognizer */
|
|
re_build_char(ctx, &new_machine, *expr);
|
|
|
|
/* apply any postfix expression to the character */
|
|
goto apply_postfix;
|
|
}
|
|
break;
|
|
|
|
case '.':
|
|
/*
|
|
* wildcard character - build a single character recognizer
|
|
* for the special wildcard symbol, then go check it for a
|
|
* postfix operator
|
|
*/
|
|
re_build_char(ctx, &new_machine, RE_WILDCARD);
|
|
goto apply_postfix;
|
|
break;
|
|
|
|
case '[':
|
|
/* range expression */
|
|
{
|
|
int is_exclusive = FALSE;
|
|
unsigned char set[32];
|
|
|
|
/* clear out the set of characters in the range */
|
|
memset(set, 0, sizeof(set));
|
|
|
|
/* first, skip the open bracket */
|
|
++expr;
|
|
--exprlen;
|
|
|
|
/* check to see if starts with the exclusion character */
|
|
if (exprlen != 0 && *expr == '^')
|
|
{
|
|
/* skip the exclusion specifier */
|
|
++expr;
|
|
--exprlen;
|
|
|
|
/* note it */
|
|
is_exclusive = TRUE;
|
|
}
|
|
|
|
/*
|
|
* if the first character is a ']', include it in the
|
|
* range
|
|
*/
|
|
if (exprlen != 0 && *expr == ']')
|
|
{
|
|
re_set_bit(set, (int)']');
|
|
++expr;
|
|
--exprlen;
|
|
}
|
|
|
|
/*
|
|
* if the next character is a '-', include it in the
|
|
* range
|
|
*/
|
|
if (exprlen != 0 && *expr == '-')
|
|
{
|
|
re_set_bit(set, (int)'-');
|
|
++expr;
|
|
--exprlen;
|
|
}
|
|
|
|
/* scan the character set */
|
|
while (exprlen != 0 && *expr != ']')
|
|
{
|
|
int ch;
|
|
|
|
/* note this character */
|
|
ch = (int)(unsigned char)*expr;
|
|
|
|
/* set it */
|
|
re_set_bit(set, ch);
|
|
|
|
/* skip this character of the expression */
|
|
++expr;
|
|
--exprlen;
|
|
|
|
/* check for a range */
|
|
if (exprlen != 0 && *expr == '-')
|
|
{
|
|
int ch2;
|
|
|
|
/* skip the '-' */
|
|
++expr;
|
|
--exprlen;
|
|
if (exprlen != 0)
|
|
{
|
|
/* get the other end of the range */
|
|
ch2 = (int)(unsigned char)*expr;
|
|
|
|
/* skip the second character */
|
|
++expr;
|
|
--exprlen;
|
|
|
|
/* if the range is reversed, swap it */
|
|
if (ch > ch2)
|
|
{
|
|
int tmp = ch;
|
|
ch = ch2;
|
|
ch2 = tmp;
|
|
}
|
|
|
|
/* fill in the range */
|
|
for ( ; ch <= ch2 ; ++ch)
|
|
re_set_bit(set, ch);
|
|
}
|
|
}
|
|
}
|
|
|
|
/* create a character range machine */
|
|
re_build_char_range(ctx, &new_machine, set, is_exclusive);
|
|
|
|
/* apply any postfix operator */
|
|
goto apply_postfix;
|
|
}
|
|
break;
|
|
|
|
default:
|
|
normal_char:
|
|
/*
|
|
* it's an ordinary character - build a single character
|
|
* recognizer machine, and then concatenate it onto any
|
|
* existing machine
|
|
*/
|
|
re_build_char(ctx, &new_machine, *expr);
|
|
|
|
apply_postfix:
|
|
/*
|
|
* Check for a postfix operator, and apply it to the machine
|
|
* in 'new_machine' if present. In any case, concatenate
|
|
* the 'new_machine' (modified by a postix operator or not)
|
|
* to the current machien.
|
|
*/
|
|
if (exprlen > 1)
|
|
{
|
|
switch(*(expr+1))
|
|
{
|
|
case '*':
|
|
case '+':
|
|
case '?':
|
|
/*
|
|
* We have a postfix closure operator. Build a new
|
|
* closure machine out of 'new_machine'.
|
|
*/
|
|
{
|
|
re_machine closure_machine;
|
|
|
|
/* move onto the closure operator */
|
|
++expr;
|
|
--exprlen;
|
|
|
|
/* build the closure machine */
|
|
re_build_closure(ctx, &closure_machine,
|
|
&new_machine, *expr);
|
|
|
|
/* replace the original machine with the closure */
|
|
new_machine = closure_machine;
|
|
|
|
/*
|
|
* skip any redundant closure symbols, keeping
|
|
* only the first one we saw
|
|
*/
|
|
while (exprlen > 1 && (*(expr+1) == '?'
|
|
|| *(expr+1) == '+'
|
|
|| *(expr+1) == '*'))
|
|
{
|
|
++expr;
|
|
--exprlen;
|
|
}
|
|
}
|
|
break;
|
|
|
|
default:
|
|
/* no postfix operator */
|
|
break;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Concatenate the new machine onto the current machine
|
|
* under construction.
|
|
*/
|
|
re_concat_onto(ctx, &cur_machine, &new_machine);
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* complete any pending alternation */
|
|
re_alternate_onto(ctx, &alter_machine, &cur_machine);
|
|
|
|
/* store the resulting machine in the caller's machine descriptor */
|
|
*result_machine = alter_machine;
|
|
|
|
/* no errors encountered */
|
|
return RE_STATUS_SUCCESS;
|
|
}
|
|
|
|
|
|
/* ------------------------------------------------------------------------ */
|
|
/*
|
|
* Pattern recognizer
|
|
*/
|
|
|
|
/*
|
|
* Note a group position if appropriate
|
|
*/
|
|
static void re_note_group(re_context *ctx, re_group_register *regs,
|
|
re_state_id id, const char *p)
|
|
{
|
|
int group_index;
|
|
|
|
/*
|
|
* Check to see if this is a valid state and it's a group marker -
|
|
* if not, there's nothing to do
|
|
*/
|
|
if (id == RE_STATE_INVALID
|
|
|| !(ctx->tuple_arr[id].flags
|
|
& (RE_STATE_GROUP_BEGIN | RE_STATE_GROUP_END))
|
|
|| (group_index = (int)ctx->tuple_arr[id].ch) >= RE_GROUP_REG_CNT)
|
|
return;
|
|
|
|
/*
|
|
* It's a valid group marker - note the appropriate register value
|
|
*/
|
|
if ((ctx->tuple_arr[id].flags & RE_STATE_GROUP_BEGIN) != 0)
|
|
regs[group_index].start_ofs = p;
|
|
else
|
|
regs[group_index].end_ofs = p;
|
|
}
|
|
|
|
/*
|
|
* Determine if a character is part of a word. We consider letters and
|
|
* numbers to be word characters.
|
|
*/
|
|
static int re_is_word_char(char c) {
|
|
return Common::isAlnum((unsigned char)c);
|
|
}
|
|
|
|
/*
|
|
* Match a string to a compiled expression. Returns the length of the
|
|
* match if successful, or -1 if no match was found.
|
|
*/
|
|
static int re_match(re_context *ctx, const char *entire_str,
|
|
const char *str, size_t origlen,
|
|
const re_machine *machine, re_group_register *regs)
|
|
{
|
|
re_state_id cur_state;
|
|
const char *p;
|
|
size_t curlen;
|
|
|
|
/* start at the machine's initial state */
|
|
cur_state = machine->init;
|
|
|
|
/* start at the beginning of the string */
|
|
p = str;
|
|
curlen = origlen;
|
|
|
|
/* note any group involved in the initial state */
|
|
re_note_group(ctx, regs, cur_state, p);
|
|
|
|
/*
|
|
* if we're starting in the final state, immediately return success
|
|
* with a zero-length match
|
|
*/
|
|
if (cur_state == machine->final_state)
|
|
{
|
|
/* return success with a zero-length match */
|
|
return 0;
|
|
}
|
|
|
|
/* run the machine */
|
|
for (;;)
|
|
{
|
|
re_tuple *tuple;
|
|
|
|
/* get the tuple for this state */
|
|
tuple = &ctx->tuple_arr[cur_state];
|
|
|
|
/* if this is a group state, adjust the group registers */
|
|
re_note_group(ctx, regs, cur_state, p);
|
|
|
|
/* see what kind of state we're in */
|
|
if (!(tuple->flags & (RE_STATE_GROUP_BEGIN | RE_STATE_GROUP_END))
|
|
&& tuple->ch != RE_EPSILON)
|
|
{
|
|
/*
|
|
* This is a character or group recognizer state. If we
|
|
* match the character or group, continue on to the next
|
|
* state; otherwise, return failure.
|
|
*/
|
|
switch(tuple->ch)
|
|
{
|
|
case RE_GROUP_MATCH_0:
|
|
case RE_GROUP_MATCH_0 + 1:
|
|
case RE_GROUP_MATCH_0 + 2:
|
|
case RE_GROUP_MATCH_0 + 3:
|
|
case RE_GROUP_MATCH_0 + 4:
|
|
case RE_GROUP_MATCH_0 + 5:
|
|
case RE_GROUP_MATCH_0 + 6:
|
|
case RE_GROUP_MATCH_0 + 7:
|
|
case RE_GROUP_MATCH_0 + 8:
|
|
case RE_GROUP_MATCH_0 + 9:
|
|
{
|
|
int group_num;
|
|
re_group_register *group_reg;
|
|
size_t reg_len;
|
|
|
|
/* it's a group - get the group number */
|
|
group_num = tuple->ch - RE_GROUP_MATCH_0;
|
|
group_reg = ®s[group_num];
|
|
|
|
/*
|
|
* if this register isn't defined, there's nothing
|
|
* to match, so fail
|
|
*/
|
|
if (group_reg->start_ofs == nullptr || group_reg->end_ofs == nullptr)
|
|
return -1;
|
|
|
|
/* calculate the length of the register value */
|
|
reg_len = group_reg->end_ofs - group_reg->start_ofs;
|
|
|
|
/* if we don't have enough left to match, it fails */
|
|
if (curlen < reg_len)
|
|
return -1;
|
|
|
|
/* if the string doesn't match exactly, we fail */
|
|
if (memcmp(p, group_reg->start_ofs, reg_len) != 0)
|
|
return -1;
|
|
|
|
/*
|
|
* It matches exactly - skip the entire length of
|
|
* the register in the source string
|
|
*/
|
|
p += reg_len;
|
|
curlen -= reg_len;
|
|
}
|
|
break;
|
|
|
|
case RE_TEXT_BEGIN:
|
|
/*
|
|
* Match only the exact beginning of the string - if
|
|
* we're anywhere else, this isn't a match. If this
|
|
* succeeds, we don't skip any characters.
|
|
*/
|
|
if (p != entire_str)
|
|
return -1;
|
|
break;
|
|
|
|
case RE_TEXT_END:
|
|
/*
|
|
* Match only the exact end of the string - if we're
|
|
* anywhere else, this isn't a match. Don't skip any
|
|
* characters on success.
|
|
*/
|
|
if (curlen != 0)
|
|
return -1;
|
|
break;
|
|
|
|
case RE_WORD_BEGIN:
|
|
/*
|
|
* if the previous character is a word character, we're
|
|
* not at the beginning of a word
|
|
*/
|
|
if (p != entire_str && re_is_word_char(*(p-1)))
|
|
return -1;
|
|
|
|
/*
|
|
* if we're at the end of the string, or the current
|
|
* character isn't the start of a word, we're not at the
|
|
* beginning of a word
|
|
*/
|
|
if (curlen == 0 || !re_is_word_char(*p))
|
|
return -1;
|
|
break;
|
|
|
|
case RE_WORD_END:
|
|
/*
|
|
* if the current character is a word character, we're not
|
|
* at the end of a word
|
|
*/
|
|
if (curlen != 0 && re_is_word_char(*p))
|
|
return -1;
|
|
|
|
/*
|
|
* if we're at the beginning of the string, or the
|
|
* previous character is not a word character, we're not
|
|
* at the end of a word
|
|
*/
|
|
if (p == entire_str || !re_is_word_char(*(p-1)))
|
|
return -1;
|
|
break;
|
|
|
|
case RE_WORD_CHAR:
|
|
/* if it's not a word character, it's a failure */
|
|
if (curlen == 0 || !re_is_word_char(*p))
|
|
return -1;
|
|
|
|
/* skip this character of input */
|
|
++p;
|
|
--curlen;
|
|
break;
|
|
|
|
case RE_NON_WORD_CHAR:
|
|
/* if it's a word character, it's a failure */
|
|
if (curlen == 0 || re_is_word_char(*p))
|
|
return -1;
|
|
|
|
/* skip the input */
|
|
++p;
|
|
--curlen;
|
|
break;
|
|
|
|
case RE_WORD_BOUNDARY:
|
|
case RE_NON_WORD_BOUNDARY:
|
|
{
|
|
int prev_is_word;
|
|
int next_is_word;
|
|
int boundary;
|
|
|
|
/*
|
|
* Determine if the previous character is a word
|
|
* character -- if we're at the beginning of the
|
|
* string, it's obviously not, otherwise check its
|
|
* classification
|
|
*/
|
|
prev_is_word = (p != entire_str
|
|
&& re_is_word_char(*(p-1)));
|
|
|
|
/* make the same check for the current character */
|
|
next_is_word = (curlen != 0
|
|
&& re_is_word_char(*p));
|
|
|
|
/*
|
|
* Determine if this is a boundary - it is if the
|
|
* two states are different
|
|
*/
|
|
boundary = ((prev_is_word != 0) ^ (next_is_word != 0));
|
|
|
|
/*
|
|
* make sure it matches what was desired, and return
|
|
* failure if not
|
|
*/
|
|
if ((tuple->ch == RE_WORD_BOUNDARY && !boundary)
|
|
|| (tuple->ch == RE_NON_WORD_BOUNDARY && boundary))
|
|
return -1;
|
|
}
|
|
break;
|
|
|
|
case RE_WILDCARD:
|
|
/* make sure we have a character to match */
|
|
if (curlen == 0)
|
|
return -1;
|
|
|
|
/* skip this character */
|
|
++p;
|
|
--curlen;
|
|
break;
|
|
|
|
case RE_RANGE:
|
|
case RE_RANGE_EXCL:
|
|
{
|
|
int match;
|
|
|
|
/* make sure we have a character to match */
|
|
if (curlen == 0)
|
|
return -1;
|
|
|
|
/* see if we match */
|
|
match = re_is_bit_set(tuple->char_range,
|
|
(int)(unsigned char)*p);
|
|
|
|
/* make sure we got what we wanted */
|
|
if ((tuple->ch == RE_RANGE && !match)
|
|
|| (tuple->ch == RE_RANGE_EXCL && match))
|
|
return -1;
|
|
|
|
/* skip this character of the input */
|
|
++p;
|
|
--curlen;
|
|
}
|
|
break;
|
|
|
|
default:
|
|
/* make sure we have an exact match */
|
|
if (curlen == 0 || tuple->ch != *p)
|
|
return -1;
|
|
|
|
/* skip this character of the input */
|
|
++p;
|
|
--curlen;
|
|
break;
|
|
}
|
|
|
|
/*
|
|
* if we got this far, we were successful - move on to the
|
|
* next state
|
|
*/
|
|
cur_state = tuple->next_state_1;
|
|
}
|
|
else if (tuple->next_state_2 == RE_STATE_INVALID)
|
|
{
|
|
/*
|
|
* We have only one transition, so this state is entirely
|
|
* deterministic. Simply move on to the next state.
|
|
*/
|
|
cur_state = tuple->next_state_1;
|
|
}
|
|
else
|
|
{
|
|
re_machine sub_machine;
|
|
re_group_register regs1[RE_GROUP_REG_CNT];
|
|
re_group_register regs2[RE_GROUP_REG_CNT];
|
|
int ret1;
|
|
int ret2;
|
|
|
|
/*
|
|
* This state has two possible transitions, and we don't
|
|
* know which one to take. So, try both, see which one
|
|
* works better, and return the result. Try the first
|
|
* transition first. Note that each separate attempt must
|
|
* use a separate copy of the registers.
|
|
*/
|
|
memcpy(regs1, regs, sizeof(regs1));
|
|
sub_machine.init = tuple->next_state_1;
|
|
sub_machine.final_state = machine->final_state;
|
|
ret1 = re_match(ctx, entire_str, p, curlen, &sub_machine, regs1);
|
|
|
|
/*
|
|
* Now try the second transition
|
|
*/
|
|
memcpy(regs2, regs, sizeof(regs2));
|
|
sub_machine.init = tuple->next_state_2;
|
|
sub_machine.final_state = machine->final_state;
|
|
ret2 = re_match(ctx, entire_str, p, curlen, &sub_machine, regs2);
|
|
|
|
/*
|
|
* If they both failed, the whole thing failed. Otherwise,
|
|
* return the longer of the two, plus the length we
|
|
* ourselves matched previously. Note that we return the
|
|
* register set from the winning match.
|
|
*/
|
|
if (ret1 < 0 && ret2 < 0)
|
|
{
|
|
/* they both failed */
|
|
return -1;
|
|
}
|
|
else if (ret1 > ret2)
|
|
{
|
|
/* use the first register set and result length */
|
|
memcpy(regs, regs1, sizeof(regs1));
|
|
return ret1 + (p - str);
|
|
}
|
|
else
|
|
{
|
|
/* use the second register set and result length */
|
|
memcpy(regs, regs2, sizeof(regs2));
|
|
return ret2 + (p - str);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* If we're in the final state, return success
|
|
*/
|
|
if (cur_state == machine->final_state)
|
|
{
|
|
/* finish off any group involved in the final state */
|
|
re_note_group(ctx, regs, cur_state, p);
|
|
|
|
/* return the length we matched */
|
|
return p - str;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* ------------------------------------------------------------------------ */
|
|
/*
|
|
* Search for a regular expression within a string. Returns -1 if the
|
|
* string cannot be found, otherwise returns the offset from the start
|
|
* of the string to be searched of the start of the first match for the
|
|
* pattern.
|
|
*/
|
|
static int re_search(re_context *ctx, const char *str, size_t len,
|
|
const re_machine *machine, re_group_register *regs,
|
|
int *result_len)
|
|
{
|
|
int ofs;
|
|
|
|
/*
|
|
* Starting at the first character in the string, search for the
|
|
* pattern at each subsequent character until we either find the
|
|
* pattern or run out of string to test.
|
|
*/
|
|
for (ofs = 0 ; ofs < (int)len ; ++ofs)
|
|
{
|
|
int matchlen;
|
|
|
|
/* check for a match */
|
|
matchlen = re_match(ctx, str, str + ofs, len - ofs,
|
|
machine, regs);
|
|
if (matchlen >= 0)
|
|
{
|
|
/* we found a match here - return the length and offset */
|
|
*result_len = matchlen;
|
|
return ofs;
|
|
}
|
|
}
|
|
|
|
/* we didn't find a match */
|
|
return -1;
|
|
}
|
|
|
|
/* ------------------------------------------------------------------------ */
|
|
/*
|
|
* Make a copy of a search string in our private buffer.
|
|
*/
|
|
static void re_save_search_str(re_context *ctx, const char *str, size_t len)
|
|
{
|
|
/* if the string is empty, this is easy */
|
|
if (len == 0)
|
|
{
|
|
/* nothing to store - just save the length and return */
|
|
ctx->curlen = 0;
|
|
return;
|
|
}
|
|
|
|
/* if the current buffer isn't big enough, allocate a new one */
|
|
if (ctx->strbuf == nullptr || ctx->strbufsiz < len)
|
|
{
|
|
/*
|
|
* free any previous buffer - its contents are no longer
|
|
* important, since we're about to overwrite it with a new
|
|
* string
|
|
*/
|
|
if (ctx->strbuf != nullptr)
|
|
mchfre(ctx->strbuf);
|
|
|
|
/*
|
|
* allocate a new buffer; round up to the next 256-byte
|
|
* increment to make sure we're not constantly reallocating to
|
|
* random sizes
|
|
*/
|
|
ctx->strbufsiz = ((len + 255) & ~255);
|
|
|
|
/* allocate it */
|
|
ctx->strbuf = (char *)mchalo(ctx->errctx, ctx->strbufsiz,
|
|
"regex str");
|
|
}
|
|
|
|
/* copy the string */
|
|
memcpy(ctx->strbuf, str, len);
|
|
|
|
/* save the length */
|
|
ctx->curlen = len;
|
|
}
|
|
|
|
/* ------------------------------------------------------------------------ */
|
|
/*
|
|
* Compile an expression and search for a match within the given string.
|
|
* Returns the offset of the match, or -1 if no match was found.
|
|
*/
|
|
int re_compile_and_search(re_context *ctx,
|
|
const char *pattern, size_t patlen,
|
|
const char *searchstr, size_t searchlen,
|
|
int *result_len)
|
|
{
|
|
re_machine machine;
|
|
|
|
/* compile the expression - return failure if we get an error */
|
|
if (re_compile(ctx, pattern, patlen, &machine) != RE_STATUS_SUCCESS)
|
|
return -1;
|
|
|
|
/* save the search string in our internal buffer */
|
|
re_save_search_str(ctx, searchstr, searchlen);
|
|
|
|
/* clear the group registers */
|
|
for (uint i = 0; i < ARRAYSIZE(ctx->regs); i++) {
|
|
ctx->regs[i].clear();
|
|
}
|
|
|
|
/*
|
|
* search for the pattern in our copy of the string - use the copy
|
|
* so that the group registers stay valid even if the caller
|
|
* deallocates the original string after we return
|
|
*/
|
|
return re_search(ctx, ctx->strbuf, ctx->curlen, &machine,
|
|
ctx->regs, result_len);
|
|
}
|
|
|
|
/* ------------------------------------------------------------------------ */
|
|
/*
|
|
* Compile an expression and check for a match. Returns the length of
|
|
* the match if we found a match, -1 if we found no match. This is not
|
|
* a search function; we merely match the leading substring of the given
|
|
* string to the given pattern.
|
|
*/
|
|
int re_compile_and_match(re_context *ctx,
|
|
const char *pattern, size_t patlen,
|
|
const char *searchstr, size_t searchlen)
|
|
{
|
|
re_machine machine;
|
|
|
|
/* compile the expression - return failure if we get an error */
|
|
if (re_compile(ctx, pattern, patlen, &machine) != RE_STATUS_SUCCESS)
|
|
return FALSE;
|
|
|
|
/* save the search string in our internal buffer */
|
|
re_save_search_str(ctx, searchstr, searchlen);
|
|
|
|
/* clear the group registers */
|
|
for (uint i = 0; i < ARRAYSIZE(ctx->regs); i++) {
|
|
ctx->regs[i].clear();
|
|
}
|
|
|
|
/* match the string */
|
|
return re_match(ctx, ctx->strbuf, ctx->strbuf, ctx->curlen,
|
|
&machine, ctx->regs);
|
|
}
|
|
|
|
|
|
} // End of namespace TADS2
|
|
} // End of namespace TADS
|
|
} // End of namespace Glk
|