/*
 * Copyright (C) 2020-2021 Jo-Philipp Wich <jo@mein.io>
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <stdio.h>

#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <regex.h>
#include <math.h>
#include <errno.h>
#include <endian.h>

#include "ucode/vm.h"
#include "ucode/lib.h"
#include "ucode/lexer.h"

struct keyword {
	unsigned type;
	const char *pat;
	unsigned plen;
};

#define dec(o) \
	((o) - '0')

#define hex(x) \
	(((x) >= 'a') ? (10 + (x) - 'a') : \
		(((x) >= 'A') ? (10 + (x) - 'A') : dec(x)))

#ifndef NO_COMPILE

static const struct keyword reserved_words[] = {
	{ TK_ENDFUNC,	"endfunction", 11 },
	{ TK_CONTINUE,	"continue", 8 },
	{ TK_ENDWHILE,	"endwhile", 8 },
	{ TK_FUNC,		"function", 8 },
	{ TK_DEFAULT,	"default", 7 },
	{ TK_DELETE,	"delete", 6 },
	{ TK_RETURN,	"return", 6 },
	{ TK_ENDFOR,	"endfor", 6 },
	{ TK_SWITCH,	"switch", 6 },
	{ TK_IMPORT,	"import", 6 },
	{ TK_EXPORT,	"export", 6 },
	{ TK_ENDIF,		"endif", 5 },
	{ TK_WHILE,		"while", 5 },
	{ TK_BREAK,		"break", 5 },
	{ TK_CATCH,		"catch", 5 },
	{ TK_CONST,		"const", 5 },
	{ TK_FALSE,		"false", 5 },
	{ TK_TRUE,		"true",  4 },
	{ TK_ELIF,		"elif",  4 },
	{ TK_ELSE,		"else",  4 },
	{ TK_THIS,		"this",  4 },
	{ TK_NULL,		"null",  4 },
	{ TK_CASE,		"case",  4 },
	{ TK_FROM,		"from",  4 },
	{ TK_TRY,		"try",   3 },
	{ TK_FOR,		"for",   3 },
	{ TK_LOCAL,		"let",   3 },
	{ TK_IF,		"if",    2 },
	{ TK_IN,		"in",    2 },
	{ TK_AS,		"as",    2 },
};


static int
fill_buf(uc_lexer_t *lex) {
	lex->rbuf = xrealloc(lex->rbuf, 128);
	lex->rlen = fread(lex->rbuf, 1, 128, lex->source->fp);
	lex->rpos = 0;

	if (!lex->rlen)
		return EOF;

	lex->rpos++;

	return (int)lex->rbuf[0];
}

static int
update_line(uc_lexer_t *lex, int ch) {
	if (ch == '\n')
		uc_source_line_next(lex->source);
	else if (ch != EOF)
		uc_source_line_update(lex->source, 1);

	lex->source->off++;

	return ch;
}

static int
lookahead_char(uc_lexer_t *lex) {
	int c;

	if (lex->rpos < lex->rlen)
		return (int)lex->rbuf[lex->rpos];

	c = fill_buf(lex);
	lex->rpos = 0;

	return c;
}

static bool
check_char(uc_lexer_t *lex, int ch) {
	if (lookahead_char(lex) != ch)
		return false;

	lex->rpos++;

	update_line(lex, ch);

	return true;
}

static int
next_char(uc_lexer_t *lex) {
	int ch = (lex->rpos < lex->rlen) ? (int)lex->rbuf[lex->rpos++] : fill_buf(lex);

	return update_line(lex, ch);
}

static uc_token_t *
emit_op(uc_lexer_t *lex, ssize_t pos, int type, uc_value_t *uv)
{
	lex->curr.type = type;
	lex->curr.uv = uv;

	if (pos < 0)
		lex->curr.pos = lex->source->off + pos;
	else
		lex->curr.pos = (size_t)pos;

	return &lex->curr;
}

static uc_token_t *
emit_buffer(uc_lexer_t *lex, ssize_t pos, int type, const char *strip_trailing_chars) {
	uc_token_t *rv = NULL;

	if (lex->buffer.count) {
		if (strip_trailing_chars)
			while (lex->buffer.count > 0 && strchr(strip_trailing_chars, *uc_vector_last(&lex->buffer)))
				lex->buffer.count--;

		rv = emit_op(lex, pos, type, ucv_string_new_length(uc_vector_first(&lex->buffer), lex->buffer.count));

		uc_vector_clear(&lex->buffer);
	}
	else if (type != TK_TEXT) {
		rv = emit_op(lex, pos, type, ucv_string_new_length("", 0));
	}

	return rv;
}


static uc_token_t *
parse_comment(uc_lexer_t *lex, int kind)
{
	int ch;

	while (true) {
		ch = next_char(lex);

		if (kind == '/' && (ch == '\n' || ch == EOF))
			break;

		if (kind == '*' && ch == '*' && check_char(lex, '/'))
			break;

		if (ch == EOF) {
			lex->state = UC_LEX_EOF;

			return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated comment"));
		}
	}

	return NULL;
}

static void
append_utf8(uc_lexer_t *lex, int code) {
	char ustr[8], *up;
	int rem;

	up = ustr;
	rem = sizeof(ustr);

	if (utf8enc(&up, &rem, code))
		for (up = ustr; rem < (int)sizeof(ustr); rem++)
			uc_vector_push(&lex->buffer, *up++);
}

static uc_token_t *
parse_escape(uc_lexer_t *lex, const char *regex_macros)
{
	int code, ch, i;
	const char *p;

	/* unicode escape sequence */
	if (check_char(lex, 'u')) {
		for (i = 0, code = 0; i < 4; i++) {
			ch = next_char(lex);

			if (!isxdigit(ch))
				return emit_op(lex, -1, TK_ERROR, ucv_string_new("Invalid escape sequence"));

			code = code * 16 + hex(ch);
		}

		/* is a leading surrogate value */
		if ((code & 0xFC00) == 0xD800) {
			/* found a subsequent leading surrogate, ignore and emit replacement char for previous one */
			if (lex->lead_surrogate)
				append_utf8(lex, 0xFFFD);

			/* store surrogate value and advance to next escape sequence */
			lex->lead_surrogate = code;
		}

		/* is a trailing surrogate value */
		else if ((code & 0xFC00) == 0xDC00) {
			/* found a trailing surrogate following a leading one, combine and encode */
			if (lex->lead_surrogate) {
				code = 0x10000 + ((lex->lead_surrogate & 0x3FF) << 10) + (code & 0x3FF);
				lex->lead_surrogate = 0;
			}

			/* trailing surrogate not following a leading one, ignore and use replacement char */
			else {
				code = 0xFFFD;
			}

			append_utf8(lex, code);
		}

		/* is a normal codepoint */
		else {
			append_utf8(lex, code);
		}
	}

	/* hex escape sequence */
	else if (check_char(lex, 'x')) {
		for (i = 0, code = 0; i < 2; i++) {
			ch = next_char(lex);

			if (!isxdigit(ch))
				return emit_op(lex, -1, TK_ERROR, ucv_string_new("Invalid escape sequence"));

			code = code * 16 + hex(ch);
		}

		append_utf8(lex, code);
	}

	/* octal or letter */
	else {
		/* try to parse octal sequence... */
		for (i = 0, code = 0, ch = lookahead_char(lex);
		     i < 3 && ch >= '0' && ch <= '7';
		     i++, next_char(lex), ch = lookahead_char(lex)) {
			code = code * 8 + dec(ch);
		}

		if (i) {
			if (code > 255)
				return emit_op(lex, -3, TK_ERROR, ucv_string_new("Invalid escape sequence"));

			append_utf8(lex, code);
		}

		/* ... no octal sequence, handle potential regex macros */
		else if (strchr(regex_macros, ch)) {
			ch = next_char(lex);

			switch (ch) {
			case 'd': p = "[[:digit:]]";   break;
			case 'D': p = "[^[:digit:]]";  break;
			case 'w': p = "[[:alnum:]_]";  break;
			case 'W': p = "[^[:alnum:]_]"; break;
			case 's': p = "[[:space:]]";   break;
			case 'S': p = "[^[:space:]]";  break;
			default:  p = NULL;
			}

			if (p) {
				while (*p)
					uc_vector_push(&lex->buffer, *p++);
			}
			else {
				uc_vector_push(&lex->buffer, '\\');
				uc_vector_push(&lex->buffer, ch);
			}
		}

		/* ... handle other escape */
		else {
			ch = next_char(lex);

			switch (ch) {
			case 'a': uc_vector_push(&lex->buffer, '\a'); break;
			case 'b': uc_vector_push(&lex->buffer, '\b'); break;
			case 'e': uc_vector_push(&lex->buffer, '\033'); break;
			case 'f': uc_vector_push(&lex->buffer, '\f'); break;
			case 'n': uc_vector_push(&lex->buffer, '\n'); break;
			case 'r': uc_vector_push(&lex->buffer, '\r'); break;
			case 't': uc_vector_push(&lex->buffer, '\t'); break;
			case 'v': uc_vector_push(&lex->buffer, '\v'); break;

			case EOF:
				return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unterminated string"));

			default:
				uc_vector_push(&lex->buffer, ch);
			}
		}
	}

	return NULL;
}

static uc_token_t *
parse_string(uc_lexer_t *lex, int kind)
{
	uc_token_t *err;
	unsigned type;
	int code, ch;
	size_t off;

	if (kind == '`')
		type = TK_TEMPLATE;
	else if (kind == '/')
		type = TK_REGEXP;
	else
		type = TK_STRING;

	off = lex->source->off - 1;

	for (ch = next_char(lex); ch != EOF; ch = next_char(lex)) {
		switch (ch) {
		/* placeholder */
		case '$':
			if (type == TK_TEMPLATE && check_char(lex, '{')) {
				lex->state = UC_LEX_PLACEHOLDER_START;

				return emit_buffer(lex, off, type, NULL);
			}

			uc_vector_push(&lex->buffer, '$');
			break;

		/* regexp bracket expression */
		case '[':
			uc_vector_push(&lex->buffer, '[');

			if (type == TK_REGEXP) {
				/* skip leading negation (^) */
				if (check_char(lex, '^'))
					uc_vector_push(&lex->buffer, '^');

				/* skip leading `]` - it is literal and not closing the bracket expr */
				if (check_char(lex, ']'))
					uc_vector_push(&lex->buffer, ']');

				/* read until closing `]` */
				for (ch = next_char(lex); ch != EOF; ch = next_char(lex)) {
					if (ch == '\\') {
						err = parse_escape(lex, "^");

						if (err)
							return err;

						continue;
					}

					uc_vector_push(&lex->buffer, ch);

					if (ch == ']')
						break;

					/* skip nested char classes / equivalence classes / collating chars */
					if (ch == '[') {
						code = lookahead_char(lex);

						if (code == ':' || code == '.' || code == '=') {
							uc_vector_push(&lex->buffer, code);
							next_char(lex);

							for (ch = next_char(lex); ch != EOF; ch = next_char(lex)) {
								if (ch == '\\') {
									err = parse_escape(lex, "");

									if (err)
										return err;

									continue;
								}

								uc_vector_push(&lex->buffer, ch);

								if (ch == code && check_char(lex, ']')) {
									uc_vector_push(&lex->buffer, ']');
									break;
								}
							}
						}
					}
				}
			}

			break;

		/* escape sequence */
		case '\\':
			err = parse_escape(lex,
				(type == TK_REGEXP) ? "^bBdDsSwW<>.[$()|*+?{\\" : "");

			if (err)
				return err;

			break;

		/* other character */
		default:
			/* terminating delimitter */
			if (ch == kind)
				return emit_buffer(lex, off, type, NULL);

			uc_vector_push(&lex->buffer, ch);
		}
	}

	// FIXME
	lex->state = UC_LEX_EOF;

	return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated string"));
}


/*
 * Parses a regexp literal from the given buffer.
 *
 * Returns a negative value on error, otherwise the amount of consumed
 * characters from the given buffer.
 *
 * Error values:
 *  -UC_ERROR_UNTERMINATED_STRING	Unterminated regexp
 *  -UC_ERROR_INVALID_ESCAPE		Invalid escape sequence
 *  -UC_ERROR_OVERLONG_STRING		Regexp literal too long
 *  -UC_ERROR_INVALID_REGEXP        Could not compile regexp
 */

enum {
	UC_LEX_PARSE_REGEX_INIT,
	UC_LEX_PARSE_REGEX_PATTERN,
	UC_LEX_PARSE_REGEX_FLAGS
};

static uc_token_t *
parse_regexp(uc_lexer_t *lex)
{
	bool is_reg_global = false, is_reg_icase = false, is_reg_newline = false;
	uc_token_t *rv;
	size_t len;
	char *s;

	rv = parse_string(lex, '/');

	if (rv->type == TK_REGEXP) {
		while (true) {
			if (check_char(lex, 'g'))
				is_reg_global = true;
			else if (check_char(lex, 'i'))
				is_reg_icase = true;
			else if (check_char(lex, 's'))
				is_reg_newline = true;
			else
				break;
		}

		len = xasprintf(&s, "%c%*s",
			(is_reg_global << 0) | (is_reg_icase << 1) | (is_reg_newline << 2),
			ucv_string_length(rv->uv),
			ucv_string_get(rv->uv));

		ucv_free(rv->uv, false);
		rv->uv = ucv_string_new_length(s, len);
		free(s);
	}

	return rv;
}


/*
 * Parses a label from the given buffer.
 *
 * Returns a negative value on error, otherwise the amount of consumed
 * characters from the given buffer.
 *
 * Error values:
 *  -UC_ERROR_OVERLONG_STRING	Label too long
 */

static uc_token_t *
parse_label(uc_lexer_t *lex, int ch)
{
	const struct keyword *word;
	size_t i, len;

	while (true) {
		uc_vector_push(&lex->buffer, ch);
		ch = lookahead_char(lex);

		if (!isalnum(ch) && ch != '_')
			break;

		next_char(lex);
	}

	len = lex->buffer.count;

	if (!lex->no_keyword) {
		for (i = 0, word = &reserved_words[0]; i < ARRAY_SIZE(reserved_words); i++, word = &reserved_words[i]) {
			if (lex->buffer.count == word->plen && !strncmp(uc_vector_first(&lex->buffer), word->pat, word->plen)) {
				uc_vector_clear(&lex->buffer);

				return emit_op(lex, -len, word->type, NULL);
			}
		}
	}

	return emit_buffer(lex, -len, TK_LABEL, NULL);
}


/*
 * Parses a number literal from the given buffer.
 *
 * Returns a negative value on error, otherwise the amount of consumed
 * characters from the given buffer.
 *
 * Error values:
 *  -UC_ERROR_INVALID_ESCAPE	Invalid number character
 */

static inline bool
is_numeric_char(uc_lexer_t *lex, char c)
{
	char prev = lex->buffer.count ? *uc_vector_last(&lex->buffer) : 0;

	switch (c|32) {
	case '.':
	case '0':
	case '1':
	case '2':
	case '3':
	case '4':
	case '5':
	case '6':
	case '7':
	case '8':
	case '9':
		return true;

	case 'a':
	case 'b':
	case 'c':
	case 'd':
	case 'e':
	case 'f':
	case 'o':
	case 'x':
		/* require previous char, a number literal cannot start with these */
		return prev != 0;

	case '+':
	case '-':
		/* sign is only allowed after an exponent char */
		return (prev|32) == 'e';
	}

	return false;
}

static uc_token_t *
parse_number(uc_lexer_t *lex, int ch)
{
	uc_value_t *nv = NULL;
	size_t len;
	char *e;

	while (true) {
		uc_vector_push(&lex->buffer, ch);
		ch = lookahead_char(lex);

		if (!is_numeric_char(lex, ch))
			break;

		next_char(lex);
	}

	len = lex->buffer.count;

	uc_vector_push(&lex->buffer, '\0');

	nv = uc_number_parse_octal(uc_vector_first(&lex->buffer), &e);

	uc_vector_clear(&lex->buffer);

	switch (ucv_type(nv)) {
	case UC_DOUBLE:
		return emit_op(lex, -len, TK_DOUBLE, nv);

	case UC_INTEGER:
		return emit_op(lex, -len, TK_NUMBER, nv);

	default:
		return emit_op(lex, -len, TK_ERROR, ucv_string_new("Invalid number literal"));
	}
}

static uc_token_t *
lex_find_token(uc_lexer_t *lex)
{
	bool tpl = !(lex->config && lex->config->raw_mode);
	int ch = next_char(lex);

	while (isspace(ch))
		ch = next_char(lex);

	switch (ch) {
	case '~':
		return emit_op(lex, -1, TK_COMPL, NULL);

	case '}':
		if (tpl && check_char(lex, '}'))
			return emit_op(lex, -2, TK_REXP, NULL);

		return emit_op(lex, -1, TK_RBRACE, NULL);

	case '|':
		if (check_char(lex, '|')) {
			if (check_char(lex, '='))
				return emit_op(lex, -3, TK_ASOR, NULL);

			return emit_op(lex, -2, TK_OR, NULL);
		}

		if (check_char(lex, '='))
			return emit_op(lex, -2, TK_ASBOR, NULL);

		return emit_op(lex, -1, TK_BOR, NULL);

	case '{':
		if (tpl && check_char(lex, '{'))
			return emit_op(lex, -2, TK_LEXP, NULL);

		if (tpl && check_char(lex, '%'))
			return emit_op(lex, -2, TK_LSTM, NULL);

		return emit_op(lex, -1, TK_LBRACE, NULL);

	case '^':
		if (check_char(lex, '='))
			return emit_op(lex, -2, TK_ASBXOR, NULL);

		return emit_op(lex, -1, TK_BXOR, NULL);

	case '[':
		return emit_op(lex, -1, TK_LBRACK, NULL);

	case ']':
		return emit_op(lex, -1, TK_RBRACK, NULL);

	case '?':
		if (check_char(lex, '?')) {
			if (check_char(lex, '='))
				return emit_op(lex, -3, TK_ASNULLISH, NULL);

			return emit_op(lex, -2, TK_NULLISH, NULL);
		}

		if (check_char(lex, '.')) {
			if (check_char(lex, '['))
				return emit_op(lex, -3, TK_QLBRACK, NULL);

			if (check_char(lex, '('))
				return emit_op(lex, -3, TK_QLPAREN, NULL);

			return emit_op(lex, -2, TK_QDOT, NULL);
		}

		return emit_op(lex, lex->source->off, TK_QMARK, NULL);

	case '>':
		if (check_char(lex, '>')) {
			if (check_char(lex, '='))
				return emit_op(lex, -3, TK_ASRIGHT, NULL);

			return emit_op(lex, -2, TK_RSHIFT, NULL);
		}

		if (check_char(lex, '='))
			return emit_op(lex, -2, TK_GE, NULL);

		return emit_op(lex, -1, TK_GT, NULL);

	case '=':
		if (check_char(lex, '=')) {
			if (check_char(lex, '='))
				return emit_op(lex, -3, TK_EQS, NULL);

			return emit_op(lex, -2, TK_EQ, NULL);
		}

		if (check_char(lex, '>'))
			return emit_op(lex, -2, TK_ARROW, NULL);

		return emit_op(lex, -1, TK_ASSIGN, NULL);

	case '<':
		if (check_char(lex, '<')) {
			if (check_char(lex, '='))
				return emit_op(lex, -3, TK_ASLEFT, NULL);

			return emit_op(lex, -2, TK_LSHIFT, NULL);
		}

		if (check_char(lex, '='))
			return emit_op(lex, -2, TK_LE, NULL);

		return emit_op(lex, -1, TK_LT, NULL);

	case ';':
		return emit_op(lex, -1, TK_SCOL, NULL);

	case ':':
		return emit_op(lex, -1, TK_COLON, NULL);

	case '/':
		ch = lookahead_char(lex);
		lex->lastoff = lex->source->off - 1;

		if (ch == '/' || ch == '*')
			return parse_comment(lex, ch);

		if (lex->no_regexp) {
			if (check_char(lex, '='))
				return emit_op(lex, -2, TK_ASDIV, NULL);

			return emit_op(lex, -1, TK_DIV, NULL);
		}

		return parse_regexp(lex);

	case '.':
		if (check_char(lex, '.')) {
			if (check_char(lex, '.'))
				return emit_op(lex, -3, TK_ELLIP, NULL);

			/* The sequence ".." cannot be a valid */
			return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unexpected character"));
		}

		return emit_op(lex, -1, TK_DOT, NULL);

	case '-':
		if (tpl && check_char(lex, '}')) {
			if (check_char(lex, '}')) {
				lex->modifier = MINUS;

				return emit_op(lex, -3, TK_REXP, NULL);
			}

			/* The sequence "-}" cannot be a valid */
			return emit_op(lex, -1, TK_ERROR, ucv_string_new("Unexpected character"));
		}

		if (tpl && check_char(lex, '%')) {
			if (check_char(lex, '}')) {
				lex->modifier = MINUS;

				return emit_op(lex, -3, TK_RSTM, NULL);
			}

			/* The sequence "-%" cannot be a valid */
			return emit_op(lex, -1, TK_ERROR, ucv_string_new("Unexpected character"));
		}

		if (check_char(lex, '='))
			return emit_op(lex, -2, TK_ASSUB, NULL);

		if (check_char(lex, '-'))
			return emit_op(lex, -2, TK_DEC, NULL);

		return emit_op(lex, -1, TK_SUB, NULL);

	case ',':
		return emit_op(lex, -1, TK_COMMA, NULL);

	case '+':
		if (check_char(lex, '='))
			return emit_op(lex, -2, TK_ASADD, NULL);

		if (check_char(lex, '+'))
			return emit_op(lex, -2, TK_INC, NULL);

		return emit_op(lex, -1, TK_ADD, NULL);

	case '*':
		if (check_char(lex, '*')) {
			if (check_char(lex, '='))
				return emit_op(lex, -3, TK_ASEXP, NULL);

			return emit_op(lex, -2, TK_EXP, NULL);
		}

		if (check_char(lex, '='))
			return emit_op(lex, -2, TK_ASMUL, NULL);

		return emit_op(lex, -1, TK_MUL, NULL);

	case '(':
		return emit_op(lex, -1, TK_LPAREN, NULL);

	case ')':
		return emit_op(lex, -1, TK_RPAREN, NULL);

	case '\'':
	case '"':
	case '`':
		lex->lastoff = lex->source->off - 1;

		return parse_string(lex, ch);

	case '&':
		if (check_char(lex, '&')) {
			if (check_char(lex, '='))
				return emit_op(lex, -3, TK_ASAND, NULL);

			return emit_op(lex, -2, TK_AND, NULL);
		}

		if (check_char(lex, '='))
			return emit_op(lex, -2, TK_ASBAND, NULL);

		return emit_op(lex, -1, TK_BAND, NULL);

	case '%':
		if (tpl && check_char(lex, '}'))
			return emit_op(lex, -2, TK_RSTM, NULL);

		if (check_char(lex, '='))
			return emit_op(lex, -2, TK_ASMOD, NULL);

		return emit_op(lex, -1, TK_MOD, NULL);

	case '!':
		if (check_char(lex, '=')) {
			if (check_char(lex, '='))
				return emit_op(lex, -3, TK_NES, NULL);

			return emit_op(lex, -2, TK_NE, NULL);
		}

		return emit_op(lex, -1, TK_NOT, NULL);

	case EOF:
		return emit_op(lex, -1, TK_EOF, NULL);

	default:
		if (isalpha(ch) || ch == '_')
			return parse_label(lex, ch);

		if (isdigit(ch))
			return parse_number(lex, ch);

		return emit_op(lex, -1, TK_ERROR, ucv_string_new("Unexpected character"));
	}
}

static uc_token_t *
lex_step(uc_lexer_t *lex)
{
	const char *strip = NULL;
	uc_token_t *tok;
	size_t *nest;
	int ch;

	while (lex->state != UC_LEX_EOF) {
		switch (lex->state) {
		case UC_LEX_IDENTIFY_BLOCK:
			ch = next_char(lex);

			/* previous block had strip trailing whitespace flag, skip leading whitespace */
			if (lex->modifier == MINUS) {
				while (isspace(ch))
					ch = next_char(lex);

				lex->modifier = UNSPEC;
			}

			/* previous block was a statement block and trim_blocks is enabled, skip leading newline */
			else if (lex->modifier == NEWLINE) {
				if (ch == '\n')
					ch = next_char(lex);

				lex->modifier = UNSPEC;
			}

			/* scan forward through buffer to identify block start token */
			while (ch != EOF) {
				if (ch == '{') {
					ch = next_char(lex);

					switch (ch) {
					/* found start of comment block */
					case '#':
						lex->state = UC_LEX_BLOCK_COMMENT;
						lex->block = COMMENT;

						if (check_char(lex, '-'))
							strip = " \n\t\v\f\r";

						break;

					/* found start of expression block */
					case '{':
						lex->state = UC_LEX_BLOCK_EXPRESSION_EMIT_TAG;

						if (check_char(lex, '-'))
							strip = " \n\t\v\f\r";

						break;

					/* found start of statement block */
					case '%':
						lex->state = UC_LEX_IDENTIFY_TOKEN;
						lex->block = STATEMENTS;

						if (check_char(lex, '-'))
							strip = " \n\t\v\f\r";
						else if (check_char(lex, '+'))
							strip = NULL;
						else if (lex->config && lex->config->lstrip_blocks)
							strip = " \t\v\f\r";

						break;

					default:
						/* not a start tag, remember char and move on */
						uc_vector_push(&lex->buffer, '{');
						continue;
					}

					break;
				}

				uc_vector_push(&lex->buffer, ch);
				ch = next_char(lex);
			}

			if (ch == EOF)
				lex->state = UC_LEX_EOF;

			/* push out leading text */
			tok = emit_buffer(lex, lex->lastoff, TK_TEXT, strip);
			lex->lastoff = lex->source->off - 2;

			if (!tok)
				continue;

			return tok;


		case UC_LEX_BLOCK_COMMENT:
			ch = next_char(lex);

			/* scan forward through buffer to identify end token */
			while (ch != EOF) {
				if (ch == '-' && check_char(lex, '#') && check_char(lex, '}')) {
					lex->modifier = MINUS;
					break;
				}

				if (ch == '#' && check_char(lex, '}'))
					break;

				ch = next_char(lex);
			}

			if (ch == EOF) {
				lex->state = UC_LEX_EOF;

				return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated template block"));
			}

			lex->lastoff = lex->source->off;
			lex->state = UC_LEX_IDENTIFY_BLOCK;

			continue;


		case UC_LEX_BLOCK_EXPRESSION_EMIT_TAG:
			lex->state = UC_LEX_IDENTIFY_TOKEN;
			lex->block = EXPRESSION;

			return emit_op(lex, lex->source->off, TK_LEXP, NULL);


		case UC_LEX_IDENTIFY_TOKEN:
			do { tok = lex_find_token(lex); } while (tok == NULL);

			/* disallow nesting blocks */
			if (tok->type == TK_LSTM || tok->type == TK_LEXP)
				return emit_op(lex, -2, TK_ERROR, ucv_string_new("Template blocks may not be nested"));

			/* found end of statement block */
			if (lex->block == STATEMENTS && tok->type == TK_RSTM) {
				/* strip newline after statement block? */
				if (lex->modifier == UNSPEC && lex->config && lex->config->trim_blocks)
					lex->modifier = NEWLINE;

				lex->lastoff = lex->source->off;
				lex->state = UC_LEX_IDENTIFY_BLOCK;
				lex->block = NONE;

				tok = emit_op(lex, -2, TK_SCOL, NULL);
			}

			/* found end of expression block */
			else if (lex->block == EXPRESSION && tok->type == TK_REXP) {
				lex->lastoff = lex->source->off;
				lex->state = UC_LEX_IDENTIFY_BLOCK;
				lex->block = NONE;
			}

			/* track opening braces */
			else if (tok->type == TK_LBRACE && lex->templates.count > 0) {
				nest = uc_vector_last(&lex->templates);
				(*nest)++;
			}

			/* check end of placeholder expression */
			else if (tok->type == TK_RBRACE && lex->templates.count > 0) {
				nest = uc_vector_last(&lex->templates);

				if (*nest == 0) {
					lex->templates.count--;
					lex->state = UC_LEX_PLACEHOLDER_END;
				}
				else {
					(*nest)--;
				}
			}

			/* premature EOF? */
			else if (tok->type == TK_EOF && lex->block != STATEMENTS) {
				lex->state = UC_LEX_EOF;

				return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unterminated template block"));
			}

			return tok;


		case UC_LEX_PLACEHOLDER_START:
			lex->state = UC_LEX_IDENTIFY_TOKEN;

			uc_vector_push(&lex->templates, 0);

			return emit_op(lex, -2, TK_PLACEH, NULL);


		case UC_LEX_PLACEHOLDER_END:
			lex->state = UC_LEX_IDENTIFY_TOKEN;

			return parse_string(lex, '`');


		case UC_LEX_EOF:
			break;
		}
	}

	return emit_op(lex, lex->source->off, TK_EOF, NULL);
}

void
uc_lexer_init(uc_lexer_t *lex, uc_parse_config_t *config, uc_source_t *source)
{
	lex->state = UC_LEX_IDENTIFY_BLOCK;

	lex->config = config;
	lex->source = uc_source_get(source);

	lex->block = NONE;
	lex->modifier = UNSPEC;

	lex->rlen = 0;
	lex->rpos = 0;
	lex->rbuf = NULL;

	lex->buffer.count = 0;
	lex->buffer.entries = NULL;

	lex->lead_surrogate = 0;

	lex->lastoff = 0;

	lex->templates.count = 0;
	lex->templates.entries = NULL;

	if (config && config->raw_mode) {
		lex->state = UC_LEX_IDENTIFY_TOKEN;
		lex->block = STATEMENTS;
	}
}

void
uc_lexer_free(uc_lexer_t *lex)
{
	uc_vector_clear(&lex->buffer);
	uc_vector_clear(&lex->templates);

	uc_source_put(lex->source);

	free(lex->rbuf);
}

uc_token_t *
uc_lexer_next_token(uc_lexer_t *lex)
{
	uc_token_t *rv = NULL;

	rv = lex_step(lex);

	lex->no_keyword = false;
	lex->no_regexp = false;

	return rv;
}

const char *
uc_tokenname(unsigned type)
{
	static char buf[sizeof("'endfunction'")];
	const char *tokennames[] = {
		[TK_LEXP] = "'{{'",
		[TK_REXP] = "'}}'",
		[TK_LSTM] = "'{%'",
		[TK_RSTM] = "'%}'",
		[TK_COMMA] = "','",
		[TK_ASSIGN] = "'='",
		[TK_ASADD] = "'+='",
		[TK_ASSUB] = "'-='",
		[TK_ASMUL] = "'*='",
		[TK_ASDIV] = "'/='",
		[TK_ASMOD] = "'%='",
		[TK_ASLEFT] = "'<<='",
		[TK_ASRIGHT] = "'>>='",
		[TK_ASBAND] = "'&='",
		[TK_ASBXOR] = "'^='",
		[TK_ASBOR] = "'|='",
		[TK_QMARK] = "'?'",
		[TK_COLON] = "':'",
		[TK_OR] = "'||'",
		[TK_AND] = "'&&'",
		[TK_BOR] = "'|'",
		[TK_BXOR] = "'^'",
		[TK_BAND] = "'&'",
		[TK_EQS] = "'==='",
		[TK_NES] = "'!=='",
		[TK_EQ] = "'=='",
		[TK_NE] = "'!='",
		[TK_LT] = "'<'",
		[TK_LE] = "'<='",
		[TK_GT] = "'>'",
		[TK_GE] = "'>='",
		[TK_LSHIFT] = "'<<'",
		[TK_RSHIFT] = "'>>'",
		[TK_ADD] = "'+'",
		[TK_SUB] = "'-'",
		[TK_MUL] = "'*'",
		[TK_DIV] = "'/'",
		[TK_MOD] = "'%'",
		[TK_EXP] = "'**'",
		[TK_NOT] = "'!'",
		[TK_COMPL] = "'~'",
		[TK_INC] = "'++'",
		[TK_DEC] = "'--'",
		[TK_DOT] = "'.'",
		[TK_LBRACK] = "'['",
		[TK_RBRACK] = "']'",
		[TK_LPAREN] = "'('",
		[TK_RPAREN] = "')'",
		[TK_LBRACE] = "'{'",
		[TK_RBRACE] = "'}'",
		[TK_SCOL] = "';'",
		[TK_ELLIP] = "'...'",
		[TK_ARROW] = "'=>'",
		[TK_QLBRACK] = "'?.['",
		[TK_QLPAREN] = "'?.('",
		[TK_QDOT] = "'?.'",
		[TK_ASEXP] = "'**='",
		[TK_ASAND] = "'&&='",
		[TK_ASOR] = "'||='",
		[TK_ASNULLISH] = "'\?\?='",
		[TK_NULLISH] = "'\?\?'",
		[TK_PLACEH] = "'${'",

		[TK_TEXT] = "Text",
		[TK_LABEL] = "Label",
		[TK_NUMBER] = "Number",
		[TK_DOUBLE] = "Double",
		[TK_STRING] = "String",
		[TK_REGEXP] = "Regexp",
		[TK_TEMPLATE] = "Template",
		[TK_ERROR] = "Error",
		[TK_EOF] = "End of file",
	};

	size_t i;

	for (i = 0; i < ARRAY_SIZE(reserved_words); i++) {
		if (reserved_words[i].type != type)
			continue;

		snprintf(buf, sizeof(buf), "'%s'", reserved_words[i].pat);

		return buf;
	}

	return tokennames[type] ? tokennames[type] : "?";
}

bool
uc_lexer_is_keyword(uc_value_t *label)
{
	size_t i;

	if (ucv_type(label) != UC_STRING)
		return false;

	for (i = 0; i < ARRAY_SIZE(reserved_words); i++)
		if (!strcmp(reserved_words[i].pat, ucv_string_get(label)))
			return true;

	return false;
}

#endif /* NO_COMPILE */

/*
 * Stores the given codepoint as a utf8 multibyte sequence into the given
 * output buffer and substracts the required amount of bytes from  the given
 * length pointer.
 *
 * Returns false if the multibyte sequence would not fit into the buffer,
 * otherwise true.
 */

bool
utf8enc(char **out, int *rem, int code)
{
	if (code >= 0 && code <= 0x7F) {
		if (*rem < 1)
			return false;

		*(*out)++ = code; (*rem)--;

		return true;
	}
	else if (code > 0 && code <= 0x7FF) {
		if (*rem < 2)
			return false;

		*(*out)++ = ((code >>  6) & 0x1F) | 0xC0; (*rem)--;
		*(*out)++ = ( code        & 0x3F) | 0x80; (*rem)--;

		return true;
	}
	else if (code > 0 && code <= 0xFFFF) {
		if (*rem < 3)
			return false;

		*(*out)++ = ((code >> 12) & 0x0F) | 0xE0; (*rem)--;
		*(*out)++ = ((code >>  6) & 0x3F) | 0x80; (*rem)--;
		*(*out)++ = ( code        & 0x3F) | 0x80; (*rem)--;

		return true;
	}
	else if (code > 0 && code <= 0x10FFFF) {
		if (*rem < 4)
			return false;

		*(*out)++ = ((code >> 18) & 0x07) | 0xF0; (*rem)--;
		*(*out)++ = ((code >> 12) & 0x3F) | 0x80; (*rem)--;
		*(*out)++ = ((code >>  6) & 0x3F) | 0x80; (*rem)--;
		*(*out)++ = ( code        & 0x3F) | 0x80; (*rem)--;

		return true;
	}

	return true;
}