scummvm-cursorfix/engines/glk/tads/tads2/regex.cpp

/* ScummVM - Graphic Adventure Engine
 *
 * ScummVM is the legal property of its developers, whose names
 * are too numerous to list here. Please refer to the COPYRIGHT
 * file distributed with this source distribution.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */

/*
Regular Expression Parser and Recognizer for TADS
Function
  Parses and recognizes regular expressions
Notes
  Regular expression syntax:

	 abc|def    either abc or def
	 (abc)      abc
	 abc+       abc, abcc, abccc, ...
	 abc*     ab, abc, abcc, ...
	 abc?       ab or abc
	 .          any single character
	 abc$       abc at the end of the line
	 ^abc       abc at the beginning of the line
	 %^abc      literally ^abc
	 [abcx-z]   matches a, b, c, x, y, or z
	 [^abcx-z]  matches any character except a, b, c, x, y, or z
	 [^]-q]     matches any character except ], -, or q

  Note that using ']' or '-' in a character range expression requires
  special ordering.  If ']' is to be used, it must be the first character
  after the '^', if present, within the brackets.  If '-' is to be used,
  it must be the first character after the '^' and/or ']', if present.

  '%' is used to escape the special characters: | . ( ) * ? + ^ $ % [
  (We use '%' rather than a backslash because it's less trouble to
  enter in a TADS string -- a backslash needs to be quoted with another
  backslash, which is error-prone and hard to read.  '%' doesn't need
  any special quoting in a TADS string, which makes it a lot more
  readable.)

  In addition, '%' is used to introduce additional special sequences:

	 %1         text matching first parenthesized expression
	 %9         text matching ninth parenthesized experssion
	 %<         matches at the beginning of a word only
	 %>         matches at the end of a word only
	 %w         matches any word character
	 %W         matches any non-word character
	 %b         matches at any word boundary (beginning or end of word)
	 %B         matches except at a word boundary

  For the word matching sequences, a word is any sequence of letters and
  numbers.
*/

#include "glk/tads/tads2/regex.h"
#include "glk/tads/tads2/memory_cache_heap.h"
#include "common/util.h"

namespace Glk {
namespace TADS {
namespace TADS2 {

/**
 * A "machine" (i.e., a finite state automaton) is a set of state
 * transition tuples.  A tuple has three elements: the state ID, the ID
 * of the state that we transition to, and the condition for the
 * transition.  The condition is simply the character that we must match
 * to make the transition, or a special distinguished symbol "epsilon,"
 * which refers to a transition with no input character consumed.
 *
 * The primitive elements of our machines guarantee that we never have
 * more than two transitions out of a particular state, so we can
 * denormalize the representation of a state by storing the two possible
 * tuples for that state in a single combined tuple.  This has the
 * performance advantage that we can use the state ID as an index into
 * an array of state tuples.
 *
 * A particular machine always has a single initial and single final
 * (successful) state, so we can define a machine by its initial and
 * final state ID's.
 */
enum {
	// the special symbol value for "epsilon"
	RE_EPSILON            = '\001',

	// the special symbol value for a wildcard character
	RE_WILDCARD           = '\002',

	// special symbol values for beginning and end of text
	RE_TEXT_BEGIN         = '\003',
	RE_TEXT_END           = '\004',

	// special symbol values for start and end of a word
	RE_WORD_BEGIN         = '\005',
	RE_WORD_END           = '\006',

	// special symbols for word-char and non-word-char
	RE_WORD_CHAR          = '\007',
	RE_NON_WORD_CHAR      = '\010',

	// special symbols for word-boundary and non-word-boundary
	RE_WORD_BOUNDARY      = '\011',
	RE_NON_WORD_BOUNDARY  = '\012',

	// special symbol for a character range/exclusion range
	RE_RANGE              = '\013',
	RE_RANGE_EXCL         = '\014',

	// a range of special symbol values for group matchers
	RE_GROUP_MATCH_0      = '\015',
	RE_GROUP_MATCH_9      = (RE_GROUP_MATCH_0 + 9)
};

/* ------------------------------------------------------------------------ */
/*
 *   A machine description.  Machines are fully described by their initial
 *   and final state ID's.
 */
struct re_machine {
	/* the machine's initial state */
	re_state_id init;

	/* the machine's final state */
	re_state_id final_state;
};

/* ------------------------------------------------------------------------ */
/*
 *   Initialize the context.  The memory for the context structure itself
 *   is allocated and maintained by the caller.
 */
void re_init(re_context *ctx, errcxdef *errctx)
{
	/* save the error context */
	ctx->errctx = errctx;

	/* no tuple array yet */
	ctx->tuple_arr = nullptr;
	ctx->tuples_alloc = 0;

	/* clear states */
	ctx->next_state = RE_STATE_FIRST_VALID;

	/* clear groups */
	ctx->cur_group = 0;

	/* no string buffer yet */
	ctx->strbuf = nullptr;
}

/* ------------------------------------------------------------------------ */
/*
 *   Reset compiler - clears states and tuples
 */
static void re_reset(re_context *ctx)
{
	int i;

	/* delete any range tables we've allocated */
	for (i = 0 ; i < ctx->next_state ; ++i)
	{
		if (ctx->tuple_arr[i].char_range != nullptr)
		{
			mchfre(ctx->tuple_arr[i].char_range);
			ctx->tuple_arr[i].char_range = nullptr;
		}
	}

	/* clear states */
	ctx->next_state = RE_STATE_FIRST_VALID;

	/* clear groups */
	ctx->cur_group = 0;
}

/* ------------------------------------------------------------------------ */
/*
 *   Delete the context - frees structures associated with the context.
 *   Does NOT free the memory used by the context structure itself.
 */
void re_delete(re_context *ctx)
{
	/* reset state */
	re_reset(ctx);

	/* if we've allocated an array, delete it */
	if (ctx->tuple_arr != nullptr)
	{
		mchfre(ctx->tuple_arr);
		ctx->tuple_arr = nullptr;
	}

	/* if we allocated a string buffer, delete it */
	if (ctx->strbuf != nullptr)
	{
		mchfre(ctx->strbuf);
		ctx->strbuf = nullptr;
	}
}

/* ------------------------------------------------------------------------ */
/*
 *   Allocate a new state ID
 */
static re_state_id re_alloc_state(re_context *ctx)
{
	/*
	 *   If we don't have enough room for another state, expand the array
	 */
	if (ctx->next_state >= ctx->tuples_alloc)
	{
		uint new_alloc;

		/* bump the size by a bit */
		new_alloc = ctx->tuples_alloc + 100;

		/* allocate or expand the array */
		if (ctx->tuples_alloc == 0)
		{
			/* allocate the initial memory block */
			ctx->tuple_arr =
				(re_tuple *)mchalo(ctx->errctx,
								   (new_alloc * sizeof(re_tuple)),
								   "regex");
		}
		else
		{
			re_tuple *ptr;

			/* allocate a new memory block */
			ptr = (re_tuple *)mchalo(ctx->errctx,
									 (new_alloc * sizeof(re_tuple)),
									 "regex");

			/* copy the old memory to the new memory */
			memcpy(ptr, ctx->tuple_arr, ctx->tuples_alloc * sizeof(re_tuple));

			/* free the old block */
			mchfre(ctx->tuple_arr);

			/* use the new block */
			ctx->tuple_arr = ptr;
		}

		/* remember the new allocation size */
		ctx->tuples_alloc = new_alloc;
	}

	/* initialize the next state */
	ctx->tuple_arr[ctx->next_state].next_state_1 = RE_STATE_INVALID;
	ctx->tuple_arr[ctx->next_state].next_state_2 = RE_STATE_INVALID;
	ctx->tuple_arr[ctx->next_state].ch = RE_EPSILON;
	ctx->tuple_arr[ctx->next_state].flags = 0;
	ctx->tuple_arr[ctx->next_state].char_range = nullptr;

	/* return the new state's ID */
	return ctx->next_state++;
}


/* ------------------------------------------------------------------------ */
/*
 *   Set a transition from a state to a given destination state.
 */
static void re_set_trans(re_context *ctx,
						 re_state_id id, re_state_id dest_id, char ch)
{
	re_tuple *tuple;

	/*
	 *   get the tuple containing the transitions for this state ID - the
	 *   state ID is the index of the state's transition tuple in the
	 *   array
	 */
	tuple = &ctx->tuple_arr[id];

	/*
	 *   If the first state pointer hasn't been set yet, set it to the new
	 *   destination.  Otherwise, set the second state pointer.
	 *
	 *   Only set the character on setting the first state.  When setting
	 *   the second state, we must assume that the character for the state
	 *   has already been set, since any given state can have only one
	 *   character setting.
	 */
	if (tuple->next_state_1 == RE_STATE_INVALID)
	{
		/*
		 *   set the character ID, unless the state has been marked with a
		 *   special flag which indicates that the character value has
		 *   another meaning (in particular, a group marker)
		 */
		if (!(tuple->flags & (RE_STATE_GROUP_BEGIN | RE_STATE_GROUP_END)))
			tuple->ch = ch;

		/* set the first transition */
		tuple->next_state_1 = dest_id;
	}
	else
	{
		/* set only the second transition state - don't set the character */
		tuple->next_state_2 = dest_id;
	}
}

/* ------------------------------------------------------------------------ */
/*
 *   Initialize a new machine, giving it an initial and final state
 */
static void re_init_machine(re_context *ctx, re_machine *machine)
{
	machine->init = re_alloc_state(ctx);
	machine->final_state = re_alloc_state(ctx);
}

/*
 *   Build a character recognizer
 */
static void re_build_char(re_context *ctx, re_machine *machine, char ch)
{
	/* initialize our new machine */
	re_init_machine(ctx, machine);

	/* allocate a transition tuple for the new state */
	re_set_trans(ctx, machine->init, machine->final_state, ch);
}

/*
 *   Build a character range recognizer.  'range' is a 256-bit (32-byte)
 *   bit vector.
 */
static void re_build_char_range(re_context *ctx, re_machine *machine,
								unsigned char *range, int exclusion)
{
	unsigned char *range_copy;

	/* initialize our new machine */
	re_init_machine(ctx, machine);

	/* allocate a transition table for the new state */
	re_set_trans(ctx, machine->init, machine->final_state,
				 (char)(exclusion ? RE_RANGE_EXCL : RE_RANGE));

	/* allocate a copy of the range bit vector */
	range_copy = (unsigned char *)mchalo(ctx->errctx, 32, "regex range");

	/* copy the caller's range */
	memcpy(range_copy, range, 32);

	/* store it in the tuple */
	ctx->tuple_arr[machine->init].char_range = range_copy;
}


/*
 *   Build a group recognizer.  This is almost the same as a character
 *   recognizer, but matches a previous group rather than a literal
 *   character.
 */
static void re_build_group_matcher(re_context *ctx,
								   re_machine *machine, int group_num)
{
	/* initialize our new machine */
	re_init_machine(ctx, machine);

	/*
	 *   Allocate a transition tuple for the new state, using the group ID
	 *   as the character code.  Store the special code for a group
	 *   recognizer rather than the normal literal character code.
	 */
	re_set_trans(ctx, machine->init, machine->final_state,
				 (char)(group_num + RE_GROUP_MATCH_0));
}


/*
 *   Build a concatenation recognizer
 */
static void re_build_concat(re_context *ctx, re_machine *new_machine,
							re_machine *lhs, re_machine *rhs)
{
	/* initialize the new machine */
	re_init_machine(ctx, new_machine);

	/*
	 *   set up an epsilon transition from the new machine's initial state
	 *   to the first submachine's initial state
	 */
	re_set_trans(ctx, new_machine->init, lhs->init, RE_EPSILON);

	/*
	 *   Set up an epsilon transition from the first submachine's final
	 *   state to the second submachine's initial state
	 */
	re_set_trans(ctx, lhs->final_state, rhs->init, RE_EPSILON);

	/*
	 *   Set up an epsilon transition from the second submachine's final
	 *   state to our new machine's final state
	 */
	re_set_trans(ctx, rhs->final_state, new_machine->final_state, RE_EPSILON);
}

/*
 *   Build a group machine.  sub_machine contains the machine that
 *   expresses the group's contents; we'll fill in new_machine with a
 *   newly-created machine that encloses and marks the group.
 */
static void re_build_group(re_context *ctx, re_machine *new_machine,
						   re_machine *sub_machine, int group_id)
{
	/* initialize the container machine */
	re_init_machine(ctx, new_machine);

	/*
	 *   set up an epsilon transition from the new machine's initial state
	 *   into the initial state of the group, and another transition from
	 *   the group's final state into the container's final state
	 */
	re_set_trans(ctx, new_machine->init, sub_machine->init, RE_EPSILON);
	re_set_trans(ctx, sub_machine->final_state, new_machine->final_state, RE_EPSILON);

	/*
	 *   Mark the initial and final states of the group machine as being
	 *   group markers.
	 */
	ctx->tuple_arr[new_machine->init].flags |= RE_STATE_GROUP_BEGIN;
	ctx->tuple_arr[new_machine->final_state].flags |= RE_STATE_GROUP_END;

	/* store the group ID in the 'ch' member of the start and end states */
	ctx->tuple_arr[new_machine->init].ch = group_id;
	ctx->tuple_arr[new_machine->final_state].ch = group_id;
}

/*
 *   Build an alternation recognizer
 */
static void re_build_alter(re_context *ctx, re_machine *new_machine,
						   re_machine *lhs, re_machine *rhs)
{
	/* initialize the new machine */
	re_init_machine(ctx, new_machine);

	/*
	 *   Set up an epsilon transition from our new machine's initial state
	 *   to the initial state of each submachine
	 */
	re_set_trans(ctx, new_machine->init, lhs->init, RE_EPSILON);
	re_set_trans(ctx, new_machine->init, rhs->init, RE_EPSILON);

	/*
	 *   Set up an epsilon transition from the final state of each
	 *   submachine to our final state
	 */
	re_set_trans(ctx, lhs->final_state, new_machine->final_state, RE_EPSILON);
	re_set_trans(ctx, rhs->final_state, new_machine->final_state, RE_EPSILON);
}

/*
 *   Build a closure recognizer
 */
static void re_build_closure(re_context *ctx,
							 re_machine *new_machine, re_machine *sub,
							 char specifier)
{
	/* initialize the new machine */
	re_init_machine(ctx, new_machine);

	/*
	 *   set up an epsilon transition from our initial state to the
	 *   submachine's initial state, and from the submachine's final state
	 *   to our final state
	 */
	re_set_trans(ctx, new_machine->init, sub->init, RE_EPSILON);
	re_set_trans(ctx, sub->final_state, new_machine->final_state, RE_EPSILON);

	/*
	 *   If this is an unbounded closure ('*' or '+', but not '?'), set up
	 *   the loop transition that takes us from the new machine's final
	 *   state back to its initial state.  We don't do this on the
	 *   zero-or-one closure, because we can only match the expression
	 *   once.
	 */
	if (specifier != '?')
		re_set_trans(ctx, sub->final_state, sub->init, RE_EPSILON);

	/*
	 *   If this is a zero-or-one closure or a zero-or-more closure, set
	 *   up an epsilon transition from our initial state to our final
	 *   state, since we can skip the entire subexpression.  We don't do
	 *   this on the one-or-more closure, because we can't skip the
	 *   subexpression in this case.
	 */
	if (specifier != '+')
		re_set_trans(ctx, new_machine->init, new_machine->final_state, RE_EPSILON);
}

/*
 *   Build a null machine
 */
static void re_build_null_machine(re_context *ctx, re_machine *machine)
{
	machine->init = machine->final_state = RE_STATE_INVALID;
}

/* ------------------------------------------------------------------------ */
/*
 *   Determine if a machine is null
 */
static int re_is_machine_null(re_context *ctx, re_machine *machine)
{
	return (machine->init == RE_STATE_INVALID);
}


/* ------------------------------------------------------------------------ */
/*
 *   Concatenate the second machine onto the first machine, replacing the
 *   first machine with the resulting machine.  If the first machine is a
 *   null machine (created with re_build_null_machine), we'll simply copy
 *   the second machine into the first.
 */
static void re_concat_onto(re_context *ctx,
						   re_machine *dest, re_machine *rhs)
{
	/* check for a null destination machine */
	if (re_is_machine_null(ctx, dest))
	{
		/*
		 *   the first machine is null - simply copy the second machine
		 *   onto the first unchanged
		 */
		*dest = *rhs;
	}
	else
	{
		re_machine new_machine;

		/* build the concatenated machine */
		re_build_concat(ctx, &new_machine, dest, rhs);

		/* copy the concatenated machine onto the first machine */
		*dest = new_machine;
	}
}

/*
 *   Alternate the second machine onto the first machine, replacing the
 *   first machine with the resulting machine.  If the first machine is a
 *   null machine, this simply replaces the first machine with the second
 *   machine.  If the second machine is null, this simply leaves the first
 *   machine unchanged.
 */
static void re_alternate_onto(re_context *ctx,
							  re_machine *dest, re_machine *rhs)
{
	/* check to see if the first machine is null */
	if (re_is_machine_null(ctx, dest))
	{
		/*
		 *   the first machine is null - simply copy the second machine
		 *   onto the first
		 */
		*dest = *rhs;
	}
	else
	{
		/*
		 *   if the second machine is null, don't do anything; otherwise,
		 *   build the alternation
		 */
		if (!re_is_machine_null(ctx, rhs))
		{
			re_machine new_machine;

			/* build the alternation */
			re_build_alter(ctx, &new_machine, dest, rhs);

			/* replace the first machine with the alternation */
			*dest = new_machine;
		}
	}
}

/* ------------------------------------------------------------------------ */
/*
 *   Set a bit in a bit vector.
 */
#define re_set_bit(set, bit) \
	(((unsigned char *)(set))[(bit) >> 3] |= (1 << ((bit) & 7)))

/*
 *   Test a bit in a bit vector
 */
#define re_is_bit_set(set, bit) \
	((((unsigned char *)(set))[(bit) >> 3] & (1 << ((bit) & 7))) != 0)

/* ------------------------------------------------------------------------ */
/*
 *   Compile an expression
 */
static re_status_t re_compile(re_context *ctx,
							  const char *expr, size_t exprlen,
							  re_machine *result_machine)
{
	re_machine cur_machine;
	re_machine alter_machine;
	re_machine new_machine;
	size_t group_stack_level;
	struct
	{
		re_machine old_cur;
		re_machine old_alter;
		int group_id;
	} group_stack[50];

	/* reset everything */
	re_reset(ctx);

	/* start out with no current machine and no alternate machine */
	re_build_null_machine(ctx, &cur_machine);
	re_build_null_machine(ctx, &alter_machine);

	/* nothing on the stack yet */
	group_stack_level = 0;

	/* loop until we run out of expression to parse */
	for ( ; exprlen != 0 ; ++expr, --exprlen)
	{
		switch(*expr)
		{
		case '^':
			/*
			 *   beginning of line - if we're not at the beginning of the
			 *   current expression (i.e., we already have some
			 *   concatentations accumulated), treat it as an ordinary
			 *   character
			 */
			if (!re_is_machine_null(ctx, &cur_machine))
				goto normal_char;

			/* build a new start-of-text recognizer */
			re_build_char(ctx, &new_machine, RE_TEXT_BEGIN);

			/*
			 *   concatenate it onto the string - note that this can't
			 *   have any postfix operators
			 */
			re_concat_onto(ctx, &cur_machine, &new_machine);
			break;

		case '$':
			/*
			 *   End of line specifier - if there's anything left after
			 *   the '$' other than a close parens or alternation
			 *   specifier, great it as a normal character
			 */
			if (exprlen > 1
				&& (*(expr+1) != ')' && *(expr+1) != '|'))
				goto normal_char;

			/* build a new end-of-text recognizer */
			re_build_char(ctx, &new_machine, RE_TEXT_END);

			/*
			 *   concatenate it onto the string - note that this can't
			 *   have any postfix operators
			 */
			re_concat_onto(ctx, &cur_machine, &new_machine);
			break;

		case '(':
			/*
			 *   Add a nesting level.  Push the current machine and
			 *   alternate machines onto the group stack, and clear
			 *   everything out for the new group.
			 */
			if (group_stack_level
				> sizeof(group_stack)/sizeof(group_stack[0]))
			{
				/* we cannot proceed - return an error */
				return RE_STATUS_GROUP_NESTING_TOO_DEEP;
			}

			/* save the current state on the stack */
			group_stack[group_stack_level].old_cur = cur_machine;
			group_stack[group_stack_level].old_alter = alter_machine;

			/*
			 *   Assign the group a group ID - groups are numbered in
			 *   order of their opening (left) parentheses, so we want to
			 *   assign a group number now.  We won't actually need to
			 *   know the group number until we get to the matching close
			 *   paren, but we need to assign it now, so store it in the
			 *   group stack.
			 */
			group_stack[group_stack_level].group_id = ctx->cur_group;

			/* consume the group number */
			ctx->cur_group++;

			/* push the level */
			++group_stack_level;

			/* start the new group with empty machines */
			re_build_null_machine(ctx, &cur_machine);
			re_build_null_machine(ctx, &alter_machine);
			break;

		case ')':
			/* if there's nothing on the stack, ignore this */
			if (group_stack_level == 0)
				break;

			/* take a level off the stack */
			--group_stack_level;

			/*
			 *   Remove a nesting level.  If we have a pending alternate
			 *   expression, build the alternation expression.  This will
			 *   leave the entire group expression in alter_machine,
			 *   regardless of whether an alternation was in progress or
			 *   not.
			 */
			re_alternate_onto(ctx, &alter_machine, &cur_machine);

			/*
			 *   Create a group machine that encloses the group and marks
			 *   it with a group number.  We assigned the group number
			 *   when we parsed the open paren, so read that group number
			 *   from the stack.
			 *
			 *   Note that this will leave 'new_machine' with the entire
			 *   group machine.
			 */
			re_build_group(ctx, &new_machine, &alter_machine,
						   group_stack[group_stack_level].group_id);

			/*
			 *   Pop the stack - restore the alternation and current
			 *   machines that were in progress before the group started.
			 */
			cur_machine = group_stack[group_stack_level].old_cur;
			alter_machine = group_stack[group_stack_level].old_alter;

			/*
			 *   Check the group expression (in new_machine) for postfix
			 *   expressions
			 */
			goto apply_postfix;

		case '|':
			/*
			 *   Start a new alternation.  This ends the current
			 *   alternation; if we have a previous pending alternate,
			 *   build an alternation machine out of the previous
			 *   alternate and the current machine and move that to the
			 *   alternate; otherwise, simply move the current machine to
			 *   the pending alternate.
			 */
			re_alternate_onto(ctx, &alter_machine, &cur_machine);

			/*
			 *   the alternation starts out with a blank slate, so null
			 *   out the current machine
			 */
			re_build_null_machine(ctx, &cur_machine);
			break;

		case '%':
			/*
			 *   quoted character - skip the quote mark and see what we
			 *   have
			 */
			++expr;
			--exprlen;

			/* check to see if we're at the end of the expression */
			if (exprlen == 0)
			{
				/*
				 *   end of the string - ignore it, but undo the extra
				 *   increment of the expression index so that we exit the
				 *   enclosing loop properly
				 */
				--expr;
				++exprlen;
				break;
			}

			/* see what we have */
			switch(*expr)
			{
			case '1':
			case '2':
			case '3':
			case '4':
			case '5':
			case '6':
			case '7':
			case '8':
			case '9':
				/* group match - build a new literal group recognizer */
				re_build_group_matcher(ctx, &new_machine, (int)(*expr - '1'));

				/* apply any postfix expression to the group recognizer */
				goto apply_postfix;

			case '<':
				/* build a beginning-of-word recognizer */
				re_build_char(ctx, &new_machine, RE_WORD_BEGIN);

				/* it can't be postfixed - just concatenate it */
				re_concat_onto(ctx, &cur_machine, &new_machine);
				break;

			case '>':
				/* build an end-of-word recognizer */
				re_build_char(ctx, &new_machine, RE_WORD_END);

				/* it can't be postfixed - just concatenate it */
				re_concat_onto(ctx, &cur_machine, &new_machine);
				break;

			case 'w':
				/* word character */
				re_build_char(ctx, &new_machine, RE_WORD_CHAR);
				goto apply_postfix;

			case 'W':
				/* non-word character */
				re_build_char(ctx, &new_machine, RE_NON_WORD_CHAR);
				goto apply_postfix;

			case 'b':
				/* word boundary */
				re_build_char(ctx, &new_machine, RE_WORD_BOUNDARY);

				/* it can't be postfixed */
				re_concat_onto(ctx, &cur_machine, &new_machine);
				break;

			case 'B':
				/* not a word boundary */
				re_build_char(ctx, &new_machine, RE_NON_WORD_BOUNDARY);

				/* it can't be postfixed */
				re_concat_onto(ctx, &cur_machine, &new_machine);
				break;

			default:
				/* build a new literal character recognizer */
				re_build_char(ctx, &new_machine, *expr);

				/* apply any postfix expression to the character */
				goto apply_postfix;
			}
			break;

		case '.':
			/*
			 *   wildcard character - build a single character recognizer
			 *   for the special wildcard symbol, then go check it for a
			 *   postfix operator
			 */
			re_build_char(ctx, &new_machine, RE_WILDCARD);
			goto apply_postfix;
			break;

		case '[':
			/* range expression */
			{
				int is_exclusive = FALSE;
				unsigned char set[32];

				/* clear out the set of characters in the range */
				memset(set, 0, sizeof(set));

				/* first, skip the open bracket */
				++expr;
				--exprlen;

				/* check to see if starts with the exclusion character */
				if (exprlen != 0 && *expr == '^')
				{
					/* skip the exclusion specifier */
					++expr;
					--exprlen;

					/* note it */
					is_exclusive = TRUE;
				}

				/*
				 *   if the first character is a ']', include it in the
				 *   range
				 */
				if (exprlen != 0 && *expr == ']')
				{
					re_set_bit(set, (int)']');
					++expr;
					--exprlen;
				}

				/*
				 *   if the next character is a '-', include it in the
				 *   range
				 */
				if (exprlen != 0 && *expr == '-')
				{
					re_set_bit(set, (int)'-');
					++expr;
					--exprlen;
				}

				/* scan the character set */
				while (exprlen != 0 && *expr != ']')
				{
					int ch;

					/* note this character */
					ch = (int)(unsigned char)*expr;

					/* set it */
					re_set_bit(set, ch);

					/* skip this character of the expression */
					++expr;
					--exprlen;

					/* check for a range */
					if (exprlen != 0 && *expr == '-')
					{
						int ch2;

						/* skip the '-' */
						++expr;
						--exprlen;
						if (exprlen != 0)
						{
							/* get the other end of the range */
							ch2 = (int)(unsigned char)*expr;

							/* skip the second character */
							++expr;
							--exprlen;

							/* if the range is reversed, swap it */
							if (ch > ch2)
							{
								int tmp = ch;
								ch = ch2;
								ch2 = tmp;
							}

							/* fill in the range */
							for ( ; ch <= ch2 ; ++ch)
								re_set_bit(set, ch);
						}
					}
				}

				/* create a character range machine */
				re_build_char_range(ctx, &new_machine, set, is_exclusive);

				/* apply any postfix operator */
				goto apply_postfix;
			}
			break;

		default:
		normal_char:
			/*
			 *   it's an ordinary character - build a single character
			 *   recognizer machine, and then concatenate it onto any
			 *   existing machine
			 */
			re_build_char(ctx, &new_machine, *expr);

		apply_postfix:
			/*
			 *   Check for a postfix operator, and apply it to the machine
			 *   in 'new_machine' if present.  In any case, concatenate
			 *   the 'new_machine' (modified by a postix operator or not)
			 *   to the current machien.
			 */
			if (exprlen > 1)
			{
				switch(*(expr+1))
				{
				case '*':
				case '+':
				case '?':
					/*
					 *   We have a postfix closure operator.  Build a new
					 *   closure machine out of 'new_machine'.
					 */
					{
						re_machine closure_machine;

						/* move onto the closure operator */
						++expr;
						--exprlen;

						/* build the closure machine */
						re_build_closure(ctx, &closure_machine,
										 &new_machine, *expr);

						/* replace the original machine with the closure */
						new_machine = closure_machine;

						/*
						 *   skip any redundant closure symbols, keeping
						 *   only the first one we saw
						 */
						while (exprlen > 1 && (*(expr+1) == '?'
											   || *(expr+1) == '+'
											   || *(expr+1) == '*'))
						{
							++expr;
							--exprlen;
						}
					}
					break;

				default:
					/* no postfix operator */
					break;
				}
			}

			/*
			 *   Concatenate the new machine onto the current machine
			 *   under construction.
			 */
			re_concat_onto(ctx, &cur_machine, &new_machine);
			break;
		}
	}

	/* complete any pending alternation */
	re_alternate_onto(ctx, &alter_machine, &cur_machine);

	/* store the resulting machine in the caller's machine descriptor */
	*result_machine = alter_machine;

	/* no errors encountered */
	return RE_STATUS_SUCCESS;
}


/* ------------------------------------------------------------------------ */
/*
 *   Pattern recognizer
 */

/*
 *   Note a group position if appropriate
 */
static void re_note_group(re_context *ctx, re_group_register *regs,
						  re_state_id id, const char *p)
{
	int group_index;

	/*
	 *   Check to see if this is a valid state and it's a group marker -
	 *   if not, there's nothing to do
	 */
	if (id == RE_STATE_INVALID
		|| !(ctx->tuple_arr[id].flags
			 & (RE_STATE_GROUP_BEGIN | RE_STATE_GROUP_END))
		|| (group_index = (int)ctx->tuple_arr[id].ch) >= RE_GROUP_REG_CNT)
		return;

	/*
	 *   It's a valid group marker - note the appropriate register value
	 */
	if ((ctx->tuple_arr[id].flags & RE_STATE_GROUP_BEGIN) != 0)
		regs[group_index].start_ofs = p;
	else
		regs[group_index].end_ofs = p;
}

/*
 *   Determine if a character is part of a word.  We consider letters and
 *   numbers to be word characters.
 */
static int re_is_word_char(char c) {
	return Common::isAlnum((unsigned char)c);
}

/*
 *   Match a string to a compiled expression.  Returns the length of the
 *   match if successful, or -1 if no match was found.
 */
static int re_match(re_context *ctx, const char *entire_str,
					const char *str, size_t origlen,
					const re_machine *machine, re_group_register *regs)
{
	re_state_id cur_state;
	const char *p;
	size_t curlen;

	/* start at the machine's initial state */
	cur_state = machine->init;

	/* start at the beginning of the string */
	p = str;
	curlen = origlen;

	/* note any group involved in the initial state */
	re_note_group(ctx, regs, cur_state, p);

	/*
	 *   if we're starting in the final state, immediately return success
	 *   with a zero-length match
	 */
	if (cur_state == machine->final_state)
	{
		/* return success with a zero-length match */
		return 0;
	}

	/* run the machine */
	for (;;)
	{
		re_tuple *tuple;

		/* get the tuple for this state */
		tuple = &ctx->tuple_arr[cur_state];

		/* if this is a group state, adjust the group registers */
		re_note_group(ctx, regs, cur_state, p);

		/* see what kind of state we're in */
		if (!(tuple->flags & (RE_STATE_GROUP_BEGIN | RE_STATE_GROUP_END))
			&& tuple->ch != RE_EPSILON)
		{
			/*
			 *   This is a character or group recognizer state.  If we
			 *   match the character or group, continue on to the next
			 *   state; otherwise, return failure.
			 */
			switch(tuple->ch)
			{
			case RE_GROUP_MATCH_0:
			case RE_GROUP_MATCH_0 + 1:
			case RE_GROUP_MATCH_0 + 2:
			case RE_GROUP_MATCH_0 + 3:
			case RE_GROUP_MATCH_0 + 4:
			case RE_GROUP_MATCH_0 + 5:
			case RE_GROUP_MATCH_0 + 6:
			case RE_GROUP_MATCH_0 + 7:
			case RE_GROUP_MATCH_0 + 8:
			case RE_GROUP_MATCH_0 + 9:
				{
					int group_num;
					re_group_register *group_reg;
					size_t reg_len;

					/* it's a group - get the group number */
					group_num = tuple->ch - RE_GROUP_MATCH_0;
					group_reg = &regs[group_num];

					/*
					 *   if this register isn't defined, there's nothing
					 *   to match, so fail
					 */
					if (group_reg->start_ofs == nullptr || group_reg->end_ofs == nullptr)
						return -1;

					/* calculate the length of the register value */
					reg_len = group_reg->end_ofs - group_reg->start_ofs;

					/* if we don't have enough left to match, it fails */
					if (curlen < reg_len)
						return -1;

					/* if the string doesn't match exactly, we fail */
					if (memcmp(p, group_reg->start_ofs, reg_len) != 0)
						return -1;

					/*
					 *   It matches exactly - skip the entire length of
					 *   the register in the source string
					 */
					p += reg_len;
					curlen -= reg_len;
				}
				break;

			case RE_TEXT_BEGIN:
				/*
				 *   Match only the exact beginning of the string - if
				 *   we're anywhere else, this isn't a match.  If this
				 *   succeeds, we don't skip any characters.
				 */
				if (p != entire_str)
					return -1;
				break;

			case RE_TEXT_END:
				/*
				 *   Match only the exact end of the string - if we're
				 *   anywhere else, this isn't a match.  Don't skip any
				 *   characters on success.
				 */
				if (curlen != 0)
					return -1;
				break;

			case RE_WORD_BEGIN:
				/*
				 *   if the previous character is a word character, we're
				 *   not at the beginning of a word
				 */
				if (p != entire_str && re_is_word_char(*(p-1)))
					return -1;

				/*
				 *   if we're at the end of the string, or the current
				 *   character isn't the start of a word, we're not at the
				 *   beginning of a word
				 */
				if (curlen == 0 || !re_is_word_char(*p))
					return -1;
				break;

			case RE_WORD_END:
				/*
				 *   if the current character is a word character, we're not
				 *   at the end of a word
				 */
				if (curlen != 0 && re_is_word_char(*p))
					return -1;

				/*
				 *   if we're at the beginning of the string, or the
				 *   previous character is not a word character, we're not
				 *   at the end of a word
				 */
				if (p == entire_str || !re_is_word_char(*(p-1)))
					return -1;
				break;

			case RE_WORD_CHAR:
				/* if it's not a word character, it's a failure */
				if (curlen == 0 || !re_is_word_char(*p))
					return -1;

				/* skip this character of input */
				++p;
				--curlen;
				break;

			case RE_NON_WORD_CHAR:
				/* if it's a word character, it's a failure */
				if (curlen == 0 || re_is_word_char(*p))
					return -1;

				/* skip the input */
				++p;
				--curlen;
				break;

			case RE_WORD_BOUNDARY:
			case RE_NON_WORD_BOUNDARY:
				{
					int prev_is_word;
					int next_is_word;
					int boundary;

					/*
					 *   Determine if the previous character is a word
					 *   character -- if we're at the beginning of the
					 *   string, it's obviously not, otherwise check its
					 *   classification
					 */
					prev_is_word = (p != entire_str
									&& re_is_word_char(*(p-1)));

					/* make the same check for the current character */
					next_is_word = (curlen != 0
									&& re_is_word_char(*p));

					/*
					 *   Determine if this is a boundary - it is if the
					 *   two states are different
					 */
					boundary = ((prev_is_word != 0) ^ (next_is_word != 0));

					/*
					 *   make sure it matches what was desired, and return
					 *   failure if not
					 */
					if ((tuple->ch == RE_WORD_BOUNDARY && !boundary)
						|| (tuple->ch == RE_NON_WORD_BOUNDARY && boundary))
						return -1;
				}
				break;

			case RE_WILDCARD:
				/* make sure we have a character to match */
				if (curlen == 0)
					return -1;

				/* skip this character */
				++p;
				--curlen;
				break;

			case RE_RANGE:
			case RE_RANGE_EXCL:
				{
					int match;

					/* make sure we have a character to match */
					if (curlen == 0)
						return -1;

					/* see if we match */
					match = re_is_bit_set(tuple->char_range,
										  (int)(unsigned char)*p);

					/* make sure we got what we wanted */
					if ((tuple->ch == RE_RANGE && !match)
						|| (tuple->ch == RE_RANGE_EXCL && match))
						return -1;

					/* skip this character of the input */
					++p;
					--curlen;
				}
				break;

			default:
				/* make sure we have an exact match */
				if (curlen == 0 || tuple->ch != *p)
					return -1;

				/* skip this character of the input */
				++p;
				--curlen;
				break;
			}

			/*
			 *   if we got this far, we were successful - move on to the
			 *   next state
			 */
			cur_state = tuple->next_state_1;
		}
		else if (tuple->next_state_2 == RE_STATE_INVALID)
		{
			/*
			 *   We have only one transition, so this state is entirely
			 *   deterministic.  Simply move on to the next state.
			 */
			cur_state = tuple->next_state_1;
		}
		else
		{
			re_machine sub_machine;
			re_group_register regs1[RE_GROUP_REG_CNT];
			re_group_register regs2[RE_GROUP_REG_CNT];
			int ret1;
			int ret2;

			/*
			 *   This state has two possible transitions, and we don't
			 *   know which one to take.  So, try both, see which one
			 *   works better, and return the result.  Try the first
			 *   transition first.  Note that each separate attempt must
			 *   use a separate copy of the registers.
			 */
			memcpy(regs1, regs, sizeof(regs1));
			sub_machine.init = tuple->next_state_1;
			sub_machine.final_state = machine->final_state;
			ret1 = re_match(ctx, entire_str, p, curlen, &sub_machine, regs1);

			/*
			 *   Now try the second transition
			 */
			memcpy(regs2, regs, sizeof(regs2));
			sub_machine.init = tuple->next_state_2;
			sub_machine.final_state = machine->final_state;
			ret2 = re_match(ctx, entire_str, p, curlen, &sub_machine, regs2);

			/*
			 *   If they both failed, the whole thing failed.  Otherwise,
			 *   return the longer of the two, plus the length we
			 *   ourselves matched previously.  Note that we return the
			 *   register set from the winning match.
			 */
			if (ret1 < 0 && ret2 < 0)
			{
				/* they both failed */
				return -1;
			}
			else if (ret1 > ret2)
			{
				/* use the first register set and result length */
				memcpy(regs, regs1, sizeof(regs1));
				return ret1 + (p - str);
			}
			else
			{
				/* use the second register set and result length */
				memcpy(regs, regs2, sizeof(regs2));
				return ret2 + (p - str);
			}
		}

		/*
		 *   If we're in the final state, return success
		 */
		if (cur_state == machine->final_state)
		{
			/* finish off any group involved in the final state */
			re_note_group(ctx, regs, cur_state, p);

			/* return the length we matched */
			return p - str;
		}
	}
}

/* ------------------------------------------------------------------------ */
/*
 *   Search for a regular expression within a string.  Returns -1 if the
 *   string cannot be found, otherwise returns the offset from the start
 *   of the string to be searched of the start of the first match for the
 *   pattern.
 */
static int re_search(re_context *ctx, const char *str, size_t len,
					 const re_machine *machine, re_group_register *regs,
					 int *result_len)
{
	int ofs;

	/*
	 *   Starting at the first character in the string, search for the
	 *   pattern at each subsequent character until we either find the
	 *   pattern or run out of string to test.
	 */
	for (ofs = 0 ; ofs < (int)len ; ++ofs)
	{
		int matchlen;

		/* check for a match */
		matchlen = re_match(ctx, str, str + ofs, len - ofs,
							machine, regs);
		if (matchlen >= 0)
		{
			/* we found a match here - return the length and offset */
			*result_len = matchlen;
			return ofs;
		}
	}

	/* we didn't find a match */
	return -1;
}

/* ------------------------------------------------------------------------ */
/*
 *   Make a copy of a search string in our private buffer.
 */
static void re_save_search_str(re_context *ctx, const char *str, size_t len)
{
	/* if the string is empty, this is easy */
	if (len == 0)
	{
		/* nothing to store - just save the length and return */
		ctx->curlen = 0;
		return;
	}

	/* if the current buffer isn't big enough, allocate a new one */
	if (ctx->strbuf == nullptr || ctx->strbufsiz < len)
	{
		/*
		 *   free any previous buffer - its contents are no longer
		 *   important, since we're about to overwrite it with a new
		 *   string
		 */
		if (ctx->strbuf != nullptr)
			mchfre(ctx->strbuf);

		/*
		 *   allocate a new buffer; round up to the next 256-byte
		 *   increment to make sure we're not constantly reallocating to
		 *   random sizes
		 */
		ctx->strbufsiz = ((len + 255) & ~255);

		/* allocate it */
		ctx->strbuf = (char *)mchalo(ctx->errctx, ctx->strbufsiz,
									 "regex str");
	}

	/* copy the string */
	memcpy(ctx->strbuf, str, len);

	/* save the length */
	ctx->curlen = len;
}

/* ------------------------------------------------------------------------ */
/*
 *   Compile an expression and search for a match within the given string.
 *   Returns the offset of the match, or -1 if no match was found.
 */
int re_compile_and_search(re_context *ctx,
						  const char *pattern, size_t patlen,
						  const char *searchstr, size_t searchlen,
						  int *result_len)
{
	re_machine machine;

	/* compile the expression - return failure if we get an error */
	if (re_compile(ctx, pattern, patlen, &machine) != RE_STATUS_SUCCESS)
		return -1;

	/* save the search string in our internal buffer */
	re_save_search_str(ctx, searchstr, searchlen);

	/* clear the group registers */
	for (uint i = 0; i < ARRAYSIZE(ctx->regs); i++) {
		ctx->regs[i].clear();
	}

	/*
	 *   search for the pattern in our copy of the string - use the copy
	 *   so that the group registers stay valid even if the caller
	 *   deallocates the original string after we return
	 */
	return re_search(ctx, ctx->strbuf, ctx->curlen, &machine,
					 ctx->regs, result_len);
}

/* ------------------------------------------------------------------------ */
/*
 *   Compile an expression and check for a match.  Returns the length of
 *   the match if we found a match, -1 if we found no match.  This is not
 *   a search function; we merely match the leading substring of the given
 *   string to the given pattern.
 */
int re_compile_and_match(re_context *ctx,
						 const char *pattern, size_t patlen,
						 const char *searchstr, size_t searchlen)
{
	re_machine machine;

	/* compile the expression - return failure if we get an error */
	if (re_compile(ctx, pattern, patlen, &machine) != RE_STATUS_SUCCESS)
		return FALSE;

	/* save the search string in our internal buffer */
	re_save_search_str(ctx, searchstr, searchlen);

	/* clear the group registers */
	for (uint i = 0; i < ARRAYSIZE(ctx->regs); i++) {
		ctx->regs[i].clear();
	}

	/* match the string */
	return re_match(ctx, ctx->strbuf, ctx->strbuf, ctx->curlen,
					&machine, ctx->regs);
}


} // End of namespace TADS2
} // End of namespace TADS
} // End of namespace Glk