lexer: rewrite token scanner

- Use nested switches instead of lookup tables to detect tokens
- Simplify input buffer logic
- Reduce amount of intermediate states

Signed-off-by: Jo-Philipp Wich <jo@mein.io>
author: Jo-Philipp Wich <jo@mein.io> 2022-07-14 14:33:12 +0200
committer: Jo-Philipp Wich <jo@mein.io> 2022-07-28 13:18:30 +0200
commit: 03c8e4b465c8cffd2596d2741b29ad2ba4ec1765 (patch)
tree: 6a43c9f54be5e3de4fcbc73b5ebaa518e642d3ad /lexer.c
parent: 1219d7efa170bf38fb1bf6a10fa0d1f96e62f091 (diff)
1 files changed, 700 insertions, 764 deletions
diff --git a/lexer.c b/lexer.c
index 5be8ece..574c051 100644
--- a/lexer.c
+++ b/lexer.c
@@ -29,24 +29,12 @@
 #include "ucode/lib.h"
 #include "ucode/lexer.h"
 
-#define UC_LEX_CONTINUE_PARSING (void *)1
-
 struct keyword {
 	unsigned type;
 	const char *pat;
 	unsigned plen;
 };
 
-struct token {
-	unsigned type;
-	union {
-		uint32_t patn;
-		char pat[4];
-	} u;
-	unsigned plen;
-	uc_token_t *(*parse)(uc_lexer_t *);
-};
-
 #define dec(o) \
 	((o) - '0')
 
@@ -56,94 +44,6 @@ struct token {
 
 #ifndef NO_COMPILE
 
-static uc_token_t *parse_comment(uc_lexer_t *);
-static uc_token_t *parse_string(uc_lexer_t *);
-static uc_token_t *parse_regexp(uc_lexer_t *);
-static uc_token_t *parse_number(uc_lexer_t *);
-static uc_token_t *parse_label(uc_lexer_t *);
-
-static const struct token tokens[] = {
-	{ TK_ASLEFT,	{ .pat = "<<=" },   3, NULL },
-	{ TK_ASRIGHT,	{ .pat = ">>=" },   3, NULL },
-	{ TK_LEXP,		{ .pat = "{{-" },   3, NULL },
-	{ TK_REXP,		{ .pat = "-}}" },   3, NULL },
-	{ TK_LSTM,		{ .pat = "{%+" },   3, NULL },
-	{ TK_LSTM,		{ .pat = "{%-" },   3, NULL },
-	{ TK_RSTM,		{ .pat = "-%}" },   3, NULL },
-	{ TK_EQS,		{ .pat = "===" },   3, NULL },
-	{ TK_NES,		{ .pat = "!==" },   3, NULL },
-	{ TK_ELLIP,		{ .pat = "..." },   3, NULL },
-	{ TK_QLBRACK,	{ .pat = "?.[" },   3, NULL },
-	{ TK_QLPAREN,	{ .pat = "?.(" },   3, NULL },
-	{ TK_ASEXP,		{ .pat = "**=" },   3, NULL },
-	{ TK_ASAND,		{ .pat = "&&=" },   3, NULL },
-	{ TK_ASOR,		{ .pat = "||=" },   3, NULL },
-	{ TK_ASNULLISH,	{ .pat = "\?\?=" }, 3, NULL },
-	{ TK_AND,		{ .pat = "&&" },    2, NULL },
-	{ TK_ASADD,		{ .pat = "+=" },    2, NULL },
-	{ TK_ASBAND,	{ .pat = "&=" },    2, NULL },
-	{ TK_ASBOR,		{ .pat = "|=" },    2, NULL },
-	{ TK_ASBXOR,	{ .pat = "^=" },    2, NULL },
-	//{ TK_ASDIV,	{ .pat = "/=" },    2, NULL },
-	{ TK_ASMOD,		{ .pat = "%=" },    2, NULL },
-	{ TK_ASMUL,		{ .pat = "*=" },    2, NULL },
-	{ TK_ASSUB,		{ .pat = "-=" },    2, NULL },
-	{ TK_EXP,		{ .pat = "**" },    2, NULL },
-	{ TK_DEC,		{ .pat = "--" },    2, NULL },
-	{ TK_INC,		{ .pat = "++" },    2, NULL },
-	{ TK_EQ,		{ .pat = "==" },    2, NULL },
-	{ TK_NE,		{ .pat = "!=" },    2, NULL },
-	{ TK_LE,		{ .pat = "<=" },    2, NULL },
-	{ TK_GE,		{ .pat = ">=" },    2, NULL },
-	{ TK_LSHIFT,	{ .pat = "<<" },    2, NULL },
-	{ TK_RSHIFT,	{ .pat = ">>" },    2, NULL },
-	{ 0,			{ .pat = "//" },    2, parse_comment },
-	{ 0,			{ .pat = "/*" },    2, parse_comment },
-	{ TK_OR,		{ .pat = "||" },    2, NULL },
-	{ TK_LEXP,		{ .pat = "{{" },    2, NULL },
-	{ TK_REXP,		{ .pat = "}}" },    2, NULL },
-	{ TK_LSTM,		{ .pat = "{%" },    2, NULL },
-	{ TK_RSTM,		{ .pat = "%}" },    2, NULL },
-	{ TK_ARROW,		{ .pat = "=>" },    2, NULL },
-	{ TK_NULLISH,	{ .pat = "??" },    2, NULL },
-	{ TK_QDOT,		{ .pat = "?." },    2, NULL },
-	{ TK_PLACEH,	{ .pat = "${" },    2, NULL },
-	{ TK_ADD,		{ .pat = "+" },     1, NULL },
-	{ TK_ASSIGN,	{ .pat = "=" },     1, NULL },
-	{ TK_BAND,		{ .pat = "&" },     1, NULL },
-	{ TK_BOR,		{ .pat = "|" },     1, NULL },
-	{ TK_LBRACK,	{ .pat = "[" },     1, NULL },
-	{ TK_RBRACK,	{ .pat = "]" },     1, NULL },
-	{ TK_BXOR,		{ .pat = "^" },     1, NULL },
-	{ TK_LBRACE,	{ .pat = "{" },     1, NULL },
-	{ TK_RBRACE,	{ .pat = "}" },     1, NULL },
-	{ TK_COLON,		{ .pat = ":" },     1, NULL },
-	{ TK_COMMA,		{ .pat = "," },     1, NULL },
-	{ TK_COMPL,		{ .pat = "~" },     1, NULL },
-	//{ TK_DIV,		{ .pat = "/" },     1, NULL },
-	{ TK_GT,		{ .pat = ">" },     1, NULL },
-	{ TK_NOT,		{ .pat = "!" },     1, NULL },
-	{ TK_LT,		{ .pat = "<" },     1, NULL },
-	{ TK_MOD,		{ .pat = "%" },     1, NULL },
-	{ TK_MUL,		{ .pat = "*" },     1, NULL },
-	{ TK_LPAREN,	{ .pat = "(" },     1, NULL },
-	{ TK_RPAREN,	{ .pat = ")" },     1, NULL },
-	{ TK_QMARK,		{ .pat = "?" },     1, NULL },
-	{ TK_SCOL,		{ .pat = ";" },     1, NULL },
-	{ TK_SUB,		{ .pat = "-" },     1, NULL },
-	{ TK_DOT,		{ .pat = "." },     1, NULL },
-	{ TK_STRING,	{ .pat = "'" },     1, parse_string },
-	{ TK_STRING,	{ .pat = "\"" },    1, parse_string },
-	{ TK_REGEXP,	{ .pat = "/" },     1, parse_regexp },
-	{ TK_LABEL,		{ .pat = "_" },     1, parse_label },
-	{ TK_LABEL,		{ .pat = "az" },    0, parse_label },
-	{ TK_LABEL,		{ .pat = "AZ" },    0, parse_label },
-	{ TK_NUMBER,	{ .pat = "09" },    0, parse_number },
-
-	/* NB: this must be last for simple retrieval */
-	{ TK_TEMPLATE,	{ .pat = "`" },     1, parse_string }
-};
-
 static const struct keyword reserved_words[] = {
 	{ TK_ENDFUNC,	"endfunction", 11 },
 	{ TK_CONTINUE,	"continue", 8 },
@@ -174,119 +74,118 @@ static const struct keyword reserved_words[] = {
 };
 
 
-/* length of the longest token in our lookup table */
-#define UC_LEX_MAX_TOKEN_LEN 3
+static int
+fill_buf(uc_lexer_t *lex) {
+	lex->rbuf = xrealloc(lex->rbuf, 128);
+	lex->rlen = fread(lex->rbuf, 1, 128, lex->source->fp);
+	lex->rpos = 0;
 
-static uc_token_t *
-emit_op(uc_lexer_t *lex, uint32_t pos, int type, uc_value_t *uv)
-{
-	lex->curr.type = type;
-	lex->curr.uv = uv;
-	lex->curr.pos = pos;
+	if (!lex->rlen)
+		return EOF;
 
-	return &lex->curr;
-}
+	lex->rpos++;
 
-static void lookbehind_append(uc_lexer_t *lex, const char *data, size_t len)
-{
-	if (len) {
-		lex->lookbehind = xrealloc(lex->lookbehind, lex->lookbehindlen + len);
-		memcpy(lex->lookbehind + lex->lookbehindlen, data, len);
-		lex->lookbehindlen += len;
-	}
+	return (int)lex->rbuf[0];
 }
 
-static void lookbehind_reset(uc_lexer_t *lex) {
-	free(lex->lookbehind);
-	lex->lookbehind = NULL;
-	lex->lookbehindlen = 0;
-}
+static int
+update_line(uc_lexer_t *lex, int ch) {
+	if (ch == '\n' || ch == EOF)
+		uc_source_line_next(lex->source);
+	else
+		uc_source_line_update(lex->source, 1);
 
-static uc_token_t *
-lookbehind_to_text(uc_lexer_t *lex, uint32_t pos, int type, const char *strip_trailing_chars) {
-	uc_token_t *rv = NULL;
+	lex->source->off++;
 
-	if (lex->lookbehind) {
-		if (strip_trailing_chars) {
-			while (lex->lookbehindlen > 0 && strchr(strip_trailing_chars, lex->lookbehind[lex->lookbehindlen-1]))
-				lex->lookbehindlen--;
-		}
+	return ch;
+}
 
-		rv = emit_op(lex, pos, type, ucv_string_new_length(lex->lookbehind, lex->lookbehindlen));
+static int
+lookahead_char(uc_lexer_t *lex) {
+	int c;
 
-		lookbehind_reset(lex);
-	}
+	if (lex->rpos < lex->rlen)
+		return (int)lex->rbuf[lex->rpos];
 
-	return rv;
-}
+	c = fill_buf(lex);
+	lex->rpos = 0;
 
-static inline size_t
-buf_remaining(uc_lexer_t *lex) {
-	return (lex->bufend - lex->bufstart);
+	return c;
 }
 
-static inline bool
-_buf_startswith(uc_lexer_t *lex, const char *str, size_t len) {
-	return (buf_remaining(lex) >= len && !strncmp(lex->bufstart, str, len));
-}
+static bool
+check_char(uc_lexer_t *lex, int ch) {
+	if (lookahead_char(lex) != ch)
+		return false;
 
-#define buf_startswith(s, str) _buf_startswith(s, str, sizeof(str) - 1)
+	lex->rpos++;
 
+	update_line(lex, ch);
 
-static void
-buf_consume(uc_lexer_t *lex, size_t len) {
-	size_t i, linelen;
+	return true;
+}
 
-	for (i = 0, linelen = 0; i < len; i++) {
-		if (lex->bufstart[i] == '\n') {
-			uc_source_line_update(lex->source, linelen);
-			uc_source_line_next(lex->source);
+static int
+next_char(uc_lexer_t *lex) {
+	int ch = (lex->rpos < lex->rlen) ? (int)lex->rbuf[lex->rpos++] : fill_buf(lex);
 
-			linelen = 0;
-		}
-		else {
-			linelen++;
-		}
-	}
+	return update_line(lex, ch);
+}
+
+static uc_token_t *
+emit_op(uc_lexer_t *lex, ssize_t pos, int type, uc_value_t *uv)
+{
+	lex->curr.type = type;
+	lex->curr.uv = uv;
 
-	if (linelen)
-		uc_source_line_update(lex->source, linelen);
+	if (pos < 0)
+		lex->curr.pos = lex->source->off + pos;
+	else
+		lex->curr.pos = (size_t)pos;
 
-	lex->bufstart += len;
-	lex->source->off += len;
+	return &lex->curr;
 }
 
 static uc_token_t *
-parse_comment(uc_lexer_t *lex)
-{
-	const struct token *tok = lex->tok;
-	const char *ptr, *end;
-	size_t elen;
+emit_buffer(uc_lexer_t *lex, ssize_t pos, int type, const char *strip_trailing_chars) {
+	uc_token_t *rv = NULL;
+
+	if (lex->buffer.count) {
+		if (strip_trailing_chars)
+			while (lex->buffer.count > 0 && strchr(strip_trailing_chars, *uc_vector_last(&lex->buffer)))
+				lex->buffer.count--;
+
+		rv = emit_op(lex, pos, type, ucv_string_new_length(uc_vector_first(&lex->buffer), lex->buffer.count));
 
-	if (!strcmp(tok->u.pat, "//")) {
-		end = "\n";
-		elen = 1;
+		uc_vector_clear(&lex->buffer);
 	}
-	else {
-		end = "*/";
-		elen = 2;
+	else if (type != TK_TEXT) {
+		rv = emit_op(lex, pos, type, ucv_string_new_length("", 0));
 	}
 
-	for (ptr = lex->bufstart; ptr < lex->bufend - elen; ptr++) {
-		if (!strncmp(ptr, end, elen)) {
-			buf_consume(lex, (ptr - lex->bufstart) + elen);
+	return rv;
+}
 
-			return UC_LEX_CONTINUE_PARSING;
-		}
-	}
 
-	buf_consume(lex, ptr - lex->bufstart);
+static uc_token_t *
+parse_comment(uc_lexer_t *lex, int kind)
+{
+	int ch;
+
+	while (true) {
+		ch = next_char(lex);
 
-	if (lex->eof) {
-		lex->state = UC_LEX_EOF;
+		if (kind == '/' && (ch == '\n' || ch == EOF))
+			break;
+
+		if (kind == '*' && ch == '*' && check_char(lex, '/'))
+			break;
+
+		if (ch == EOF) {
+			lex->state = UC_LEX_EOF;
 
-		if (elen == 2)
 			return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated comment"));
+		}
 	}
 
 	return NULL;
@@ -301,238 +200,157 @@ append_utf8(uc_lexer_t *lex, int code) {
 	rem = sizeof(ustr);
 
 	if (utf8enc(&up, &rem, code))
-		lookbehind_append(lex, ustr, up - ustr);
+		for (up = ustr; rem < (int)sizeof(ustr); rem++)
+			uc_vector_push(&lex->buffer, *up++);
 }
 
 static uc_token_t *
-parse_string(uc_lexer_t *lex)
+parse_string(uc_lexer_t *lex, int kind)
 {
-	const struct token *tok = lex->tok;
-	char q = tok->u.pat[0];
-	char *ptr, *c;
-	uc_token_t *rv;
-	int code;
+	int code, ch, i;
+	unsigned type;
+	size_t off;
 
-	if (!buf_remaining(lex))
-		return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated string"));
+	if (kind == '`')
+		type = TK_TEMPLATE;
+	else if (kind == '/')
+		type = TK_REGEXP;
+	else
+		type = TK_STRING;
 
-	for (ptr = lex->bufstart; ptr < lex->bufend; ptr++) {
-		/* continuation of placeholder start */
-		if (lex->is_placeholder) {
-			if (*ptr == '{') {
-				buf_consume(lex, 1);
-				rv = lookbehind_to_text(lex, lex->lastoff, tok->type, NULL);
+	off = lex->source->off - 1;
 
-				if (!rv)
-					rv = emit_op(lex, lex->lastoff, tok->type, ucv_string_new_length("", 0));
+	for (ch = next_char(lex); ch != EOF; ch = next_char(lex)) {
+		switch (ch) {
+		/* placeholder */
+		case '$':
+			if (type == TK_TEMPLATE && check_char(lex, '{')) {
+				lex->state = UC_LEX_PLACEHOLDER_START;
 
-				return rv;
+				return emit_buffer(lex, off, type, NULL);
 			}
 
-			lex->is_placeholder = false;
-			lookbehind_append(lex, "$", 1);
-		}
+			uc_vector_push(&lex->buffer, '$');
+			break;
 
-		/* continuation of escape sequence */
-		if (lex->is_escape) {
-			if (lex->esclen == 0) {
-				/* non-unicode escape following a lead surrogate, emit replacement... */
-				if (lex->lead_surrogate && *ptr != 'u') {
-					append_utf8(lex, 0xFFFD);
-					lex->lead_surrogate = 0;
-				}
+		/* escape sequence */
+		case '\\':
+			/* unicode escape sequence */
+			if (type != TK_REGEXP && check_char(lex, 'u')) {
+				for (i = 0, code = 0; i < 4; i++) {
+					ch = next_char(lex);
 
-				switch ((q == '/') ? 0 : *ptr) {
-				case 'u':
-				case 'x':
-					lex->esc[lex->esclen++] = *ptr;
-					break;
+					if (!isxdigit(ch))
+						return emit_op(lex, -1, TK_ERROR, ucv_string_new("Invalid escape sequence"));
 
-				case '0':
-				case '1':
-				case '2':
-				case '3':
-				case '4':
-				case '5':
-				case '6':
-				case '7':
-					lex->esc[lex->esclen++] = 'o';
-					lex->esc[lex->esclen++] = *ptr;
-					break;
+					code = code * 16 + hex(ch);
+				}
 
-				default:
-					lex->is_escape = false;
-					c = strchr("a\ab\be\033f\fn\nr\rt\tv\v", *ptr);
+				/* is a leading surrogate value */
+				if ((code & 0xFC00) == 0xD800) {
+					/* found a subsequent leading surrogate, ignore and emit replacement char for previous one */
+					if (lex->lead_surrogate)
+						append_utf8(lex, 0xFFFD);
 
-					if (c && *c >= 'a') {
-						lookbehind_append(lex, c + 1, 1);
+					/* store surrogate value and advance to next escape sequence */
+					lex->lead_surrogate = code;
+				}
+
+				/* is a trailing surrogate value */
+				else if ((code & 0xFC00) == 0xDC00) {
+					/* found a trailing surrogate following a leading one, combine and encode */
+					if (lex->lead_surrogate) {
+						code = 0x10000 + ((lex->lead_surrogate & 0x3FF) << 10) + (code & 0x3FF);
+						lex->lead_surrogate = 0;
 					}
-					else {
-						/* regex mode => retain backslash */
-						if (q == '/')
-							lookbehind_append(lex, "\\", 1);
 
-						lookbehind_append(lex, ptr, 1);
+					/* trailing surrogate not following a leading one, ignore and use replacement char */
+					else {
+						code = 0xFFFD;
 					}
 
-					buf_consume(lex, (ptr + 1) - lex->bufstart);
+					append_utf8(lex, code);
+				}
 
-					break;
+				/* is a normal codepoint */
+				else {
+					append_utf8(lex, code);
 				}
 			}
-			else {
-				switch (lex->esc[0]) {
-				case 'u':
-					if (lex->esclen < 5) {
-						if (!isxdigit(*ptr))
-							return emit_op(lex, lex->source->off + lex->esclen + 1, TK_ERROR, ucv_string_new("Invalid escape sequence"));
 
-						lex->esc[lex->esclen++] = *ptr;
-					}
-
-					if (lex->esclen == 5) {
-						code = hex(lex->esc[1]) * 16 * 16 * 16 +
-						       hex(lex->esc[2]) * 16 * 16 +
-						       hex(lex->esc[3]) * 16 +
-						       hex(lex->esc[4]);
-
-						/* is a leading surrogate value */
-						if ((code & 0xFC00) == 0xD800) {
-							/* found a subsequent leading surrogate, ignore and emit replacement char for previous one */
-							if (lex->lead_surrogate)
-								append_utf8(lex, 0xFFFD);
-
-							/* store surrogate value and advance to next escape sequence */
-							lex->lead_surrogate = code;
-						}
-
-						/* is a trailing surrogate value */
-						else if ((code & 0xFC00) == 0xDC00) {
-							/* found a trailing surrogate following a leading one, combine and encode */
-							if (lex->lead_surrogate) {
-								code = 0x10000 + ((lex->lead_surrogate & 0x3FF) << 10) + (code & 0x3FF);
-								lex->lead_surrogate = 0;
-							}
-
-							/* trailing surrogate not following a leading one, ignore and use replacement char */
-							else {
-								code = 0xFFFD;
-							}
-
-							append_utf8(lex, code);
-						}
-
-						/* is a normal codepoint */
-						else {
-							append_utf8(lex, code);
-						}
-
-						lex->esclen = 0;
-						lex->is_escape = false;
-						buf_consume(lex, (ptr + 1) - lex->bufstart);
-					}
+			/* hex escape sequence */
+			else if (type != TK_REGEXP && check_char(lex, 'x')) {
+				for (i = 0, code = 0; i < 2; i++) {
+					ch = next_char(lex);
 
-					break;
+					if (!isxdigit(ch))
+						return emit_op(lex, -1, TK_ERROR, ucv_string_new("Invalid escape sequence"));
 
-				case 'x':
-					if (lex->esclen < 3) {
-						if (!isxdigit(*ptr))
-							return emit_op(lex, lex->source->off + lex->esclen + 1, TK_ERROR, ucv_string_new("Invalid escape sequence"));
+					code = code * 16 + hex(ch);
+				}
 
-						lex->esc[lex->esclen++] = *ptr;
-					}
+				append_utf8(lex, code);
+			}
 
-					if (lex->esclen == 3) {
-						append_utf8(lex, hex(lex->esc[1]) * 16 + hex(lex->esc[2]));
+			/* octal or letter */
+			else {
+				/* try to parse octal sequence... */
+				for (i = 0, code = 0, ch = lookahead_char(lex);
+				     kind != '/' && i < 3 && ch >= '0' && ch <= '7';
+				     i++, next_char(lex), ch = lookahead_char(lex)) {
+					code = code * 8 + dec(ch);
+				}
 
-						lex->esclen = 0;
-						lex->is_escape = false;
-						buf_consume(lex, (ptr + 1) - lex->bufstart);
-					}
+				if (i) {
+					if (code > 255)
+						return emit_op(lex, -3, TK_ERROR, ucv_string_new("Invalid escape sequence"));
 
-					break;
+					append_utf8(lex, code);
+				}
 
-				case 'o':
-					if (lex->esclen < 4) {
-						/* found a non-octal char */
-						if (*ptr < '0' || *ptr > '7') {
-							/* pad sequence to three chars */
-							switch (lex->esclen) {
-							case 3:
-								lex->esc[3] = lex->esc[2];
-								lex->esc[2] = lex->esc[1];
-								lex->esc[1] = '0';
-								break;
-
-							case 2:
-								lex->esc[3] = lex->esc[1];
-								lex->esc[2] = '0';
-								lex->esc[1] = '0';
-								break;
-							}
-
-							lex->esclen = 4;
-							buf_consume(lex, ptr-- - lex->bufstart);
-						}
-
-						/* append */
-						else {
-							lex->esc[lex->esclen++] = *ptr;
-							buf_consume(lex, (ptr + 1) - lex->bufstart);
-						}
-					}
+				/* ... no octal sequence, handle other escape */
+				else {
+					ch = next_char(lex);
 
-					if (lex->esclen == 4) {
-						code = dec(lex->esc[1]) * 8 * 8 +
-						       dec(lex->esc[2]) * 8 +
-						       dec(lex->esc[3]);
+					switch (ch) {
+					case 'a': uc_vector_push(&lex->buffer, '\a'); break;
+					case 'b': uc_vector_push(&lex->buffer, '\b'); break;
+					case 'e': uc_vector_push(&lex->buffer, '\033'); break;
+					case 'f': uc_vector_push(&lex->buffer, '\f'); break;
+					case 'n': uc_vector_push(&lex->buffer, '\n'); break;
+					case 'r': uc_vector_push(&lex->buffer, '\r'); break;
+					case 't': uc_vector_push(&lex->buffer, '\t'); break;
+					case 'v': uc_vector_push(&lex->buffer, '\v'); break;
 
-						if (code > 255)
-							return emit_op(lex, lex->source->off + lex->esclen + 1, TK_ERROR, ucv_string_new("Invalid escape sequence"));
+					case EOF:
+						return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unterminated string"));
 
-						append_utf8(lex, code);
+					default:
+						/* regex mode => retain backslash */
+						if (type == TK_REGEXP)
+							uc_vector_push(&lex->buffer, '\\');
 
-						lex->esclen = 0;
-						lex->is_escape = false;
+						uc_vector_push(&lex->buffer, ch);
 					}
-
-					break;
 				}
 			}
-		}
-
-		/* terminating char */
-		else if (*ptr == q) {
-			lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
-			buf_consume(lex, (ptr + 1) - lex->bufstart);
-
-			rv = lookbehind_to_text(lex, lex->lastoff, tok->type, NULL);
 
-			if (!rv)
-				rv = emit_op(lex, lex->lastoff, tok->type, ucv_string_new_length("", 0));
-
-			return rv;
-		}
+			break;
 
-		/* escape sequence start */
-		else if (*ptr == '\\') {
-			lex->is_escape = true;
-			lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
-			buf_consume(lex, (ptr - lex->bufstart) + 1);
-		}
+		/* other character */
+		default:
+			/* terminating delimiter */
+			if (ch == kind)
+				return emit_buffer(lex, off, type, NULL);
 
-		/* potential placeholder start */
-		else if (q == '`' && *ptr == '$') {
-			lex->is_placeholder = true;
-			lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
-			buf_consume(lex, (ptr - lex->bufstart) + 1);
+			uc_vector_push(&lex->buffer, ch);
 		}
 	}
 
-	lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
-	buf_consume(lex, ptr - lex->bufstart);
+	// FIXME
+	lex->state = UC_LEX_EOF;
 
-	return NULL;
+	return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated string"));
 }
 
 
@@ -563,76 +381,31 @@ parse_regexp(uc_lexer_t *lex)
 	size_t len;
 	char *s;
 
-	switch (lex->esc[0]) {
-	case UC_LEX_PARSE_REGEX_INIT:
-		if (lex->no_regexp) {
-			if (buf_startswith(lex, "=")) {
-				buf_consume(lex, 1);
-
-				return emit_op(lex, lex->source->off, TK_ASDIV, NULL);
-			}
-
-			return emit_op(lex, lex->source->off, TK_DIV, NULL);
-		}
-
-		lex->esc[0] = UC_LEX_PARSE_REGEX_PATTERN;
-		break;
-
-	case UC_LEX_PARSE_REGEX_PATTERN:
-		rv = parse_string(lex);
-
-		if (rv && rv->type == TK_ERROR)
-			return rv;
+	rv = parse_string(lex, '/');
 
-		if (rv != NULL && rv != UC_LEX_CONTINUE_PARSING) {
-			lex->lookbehind = (char *)rv;
-			lex->esc[0] = UC_LEX_PARSE_REGEX_FLAGS;
-		}
-
-		break;
-
-	case UC_LEX_PARSE_REGEX_FLAGS:
-		rv = (uc_token_t *)lex->lookbehind;
-
-		while (lex->bufstart < lex->bufend || lex->eof) {
-			switch (lex->eof ? EOF : lex->bufstart[0]) {
-			case 'g':
-				buf_consume(lex, 1);
+	if (rv->type == TK_REGEXP) {
+		while (true) {
+			if (check_char(lex, 'g'))
 				is_reg_global = true;
-				break;
-
-			case 'i':
-				buf_consume(lex, 1);
+			else if (check_char(lex, 'i'))
 				is_reg_icase = true;
-				break;
-
-			case 's':
-				buf_consume(lex, 1);
+			else if (check_char(lex, 's'))
 				is_reg_newline = true;
+			else
 				break;
-
-			default:
-				lex->lookbehind = NULL;
-
-				len = xasprintf(&s, "%c%*s",
-					(is_reg_global << 0) | (is_reg_icase << 1) | (is_reg_newline << 2),
-					ucv_string_length(rv->uv),
-					ucv_string_get(rv->uv));
-
-				ucv_free(rv->uv, false);
-				rv->uv = ucv_string_new_length(s, len);
-				free(s);
-
-				rv->type = TK_REGEXP;
-
-				return rv;
-			}
 		}
 
-		break;
+		len = xasprintf(&s, "%c%*s",
+			(is_reg_global << 0) | (is_reg_icase << 1) | (is_reg_newline << 2),
+			ucv_string_length(rv->uv),
+			ucv_string_get(rv->uv));
+
+		ucv_free(rv->uv, false);
+		rv->uv = ucv_string_new_length(s, len);
+		free(s);
 	}
 
-	return NULL;
+	return rv;
 }
 
 
@@ -647,37 +420,34 @@ parse_regexp(uc_lexer_t *lex)
  */
 
 static uc_token_t *
-parse_label(uc_lexer_t *lex)
+parse_label(uc_lexer_t *lex, int ch)
 {
-	const struct token *tok = lex->tok;
 	const struct keyword *word;
-	char *ptr;
-	size_t i;
-
-	if (!lex->lookbehind && tok->plen)
-		lookbehind_append(lex, tok->u.pat, tok->plen);
+	size_t i, len;
 
-	if (!buf_remaining(lex) || (lex->bufstart[0] != '_' && !isalnum(lex->bufstart[0]))) {
-		if (lex->no_keyword == false) {
-			for (i = 0, word = &reserved_words[0]; i < ARRAY_SIZE(reserved_words); i++, word = &reserved_words[i]) {
-				if (lex->lookbehind && lex->lookbehindlen == word->plen && !strncmp(lex->lookbehind, word->pat, word->plen)) {
-					lookbehind_reset(lex);
+	while (true) {
+		uc_vector_push(&lex->buffer, ch);
+		ch = lookahead_char(lex);
 
-					return emit_op(lex, lex->source->off - word->plen, word->type, NULL);
-				}
-			}
-		}
+		if (!isalnum(ch) && ch != '_')
+			break;
 
-		return lookbehind_to_text(lex, lex->source->off - lex->lookbehindlen, TK_LABEL, NULL);
+		next_char(lex);
 	}
 
-	for (ptr = lex->bufstart; ptr < lex->bufend && (*ptr == '_' || isalnum(*ptr)); ptr++)
-		;
+	len = lex->buffer.count;
 
-	lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
-	buf_consume(lex, ptr - lex->bufstart);
+	if (!lex->no_keyword) {
+		for (i = 0, word = &reserved_words[0]; i < ARRAY_SIZE(reserved_words); i++, word = &reserved_words[i]) {
+			if (lex->buffer.count == word->plen && !strncmp(uc_vector_first(&lex->buffer), word->pat, word->plen)) {
+				uc_vector_clear(&lex->buffer);
 
-	return NULL;
+				return emit_op(lex, -len, word->type, NULL);
+			}
+		}
+	}
+
+	return emit_buffer(lex, -len, TK_LABEL, NULL);
 }
 
 
@@ -694,7 +464,7 @@ parse_label(uc_lexer_t *lex)
 static inline bool
 is_numeric_char(uc_lexer_t *lex, char c)
 {
-	char prev = lex->lookbehindlen ? lex->lookbehind[lex->lookbehindlen-1] : 0;
+	char prev = lex->buffer.count ? *uc_vector_last(&lex->buffer) : 0;
 
 	switch (c|32) {
 	case '.':
@@ -731,380 +501,507 @@ is_numeric_char(uc_lexer_t *lex, char c)
 }
 
 static uc_token_t *
-parse_number(uc_lexer_t *lex)
+parse_number(uc_lexer_t *lex, int ch)
 {
-	uc_token_t *rv = NULL;
 	uc_value_t *nv = NULL;
-	const char *ptr;
+	size_t len;
 	char *e;
 
-	if (!buf_remaining(lex) || !is_numeric_char(lex, lex->bufstart[0])) {
-		lookbehind_append(lex, "\0", 1);
-
-		nv = uc_number_parse_octal(lex->lookbehind, &e);
+	while (true) {
+		uc_vector_push(&lex->buffer, ch);
+		ch = lookahead_char(lex);
 
-		switch (ucv_type(nv)) {
-		case UC_DOUBLE:
-			rv = emit_op(lex, lex->source->off - (e - lex->lookbehind), TK_DOUBLE, nv);
+		if (!is_numeric_char(lex, ch))
 			break;
 
-		case UC_INTEGER:
-			rv = emit_op(lex, lex->source->off - (e - lex->lookbehind), TK_NUMBER, nv);
-			break;
+		next_char(lex);
+	}
 
-		default:
-			rv = emit_op(lex, lex->source->off - (lex->lookbehindlen - (e - lex->lookbehind) - 1), TK_ERROR, ucv_string_new("Invalid number literal"));
-		}
+	len = lex->buffer.count;
 
-		lookbehind_reset(lex);
+	uc_vector_push(&lex->buffer, '\0');
 
-		return rv;
-	}
+	nv = uc_number_parse_octal(uc_vector_first(&lex->buffer), &e);
 
-	for (ptr = lex->bufstart; ptr < lex->bufend && is_numeric_char(lex, *ptr); ptr++)
-		;
+	uc_vector_clear(&lex->buffer);
 
-	lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
-	buf_consume(lex, ptr - lex->bufstart);
+	switch (ucv_type(nv)) {
+	case UC_DOUBLE:
+		return emit_op(lex, -len, TK_DOUBLE, nv);
 
-	return NULL;
+	case UC_INTEGER:
+		return emit_op(lex, -len, TK_NUMBER, nv);
+
+	default:
+		return emit_op(lex, -len, TK_ERROR, ucv_string_new("Invalid number literal"));
+	}
 }
 
 static uc_token_t *
-lex_step(uc_lexer_t *lex, FILE *fp)
+lex_find_token(uc_lexer_t *lex)
 {
-	uint32_t masks[] = { 0, le32toh(0x000000ff), le32toh(0x0000ffff), le32toh(0x00ffffff), le32toh(0xffffffff) };
-	union { uint32_t n; char str[4]; } search;
-	const struct token *tok;
-	size_t rlen, rem, *nest;
-	char *ptr, c;
-	uc_token_t *rv;
-	size_t i;
+	bool tpl = !(lex->config && lex->config->raw_mode);
+	int ch = next_char(lex);
+
+	while (isspace(ch))
+		ch = next_char(lex);
+
+	switch (ch) {
+	case '~':
+		return emit_op(lex, -1, TK_COMPL, NULL);
 
-	/* only less than UC_LEX_MAX_TOKEN_LEN unread buffer chars remaining,
-	 * move the remaining bytes to the beginning and read more data */
-	if (buf_remaining(lex) < UC_LEX_MAX_TOKEN_LEN) {
-		if (!lex->buf) {
-			lex->buflen = 128;
-			lex->buf = xalloc(lex->buflen);
+	case '}':
+		if (tpl && check_char(lex, '}'))
+			return emit_op(lex, -2, TK_REXP, NULL);
+
+		return emit_op(lex, -1, TK_RBRACE, NULL);
+
+	case '|':
+		if (check_char(lex, '|')) {
+			if (check_char(lex, '='))
+				return emit_op(lex, -3, TK_ASOR, NULL);
+
+			return emit_op(lex, -2, TK_OR, NULL);
 		}
 
-		rem = lex->bufend - lex->bufstart;
+		if (check_char(lex, '='))
+			return emit_op(lex, -2, TK_ASBOR, NULL);
 
-		if (rem)
-			memcpy(lex->buf, lex->bufstart, rem);
+		return emit_op(lex, -1, TK_BOR, NULL);
 
-		rlen = fread(lex->buf + rem, 1, lex->buflen - rem, fp);
+	case '{':
+		if (tpl && check_char(lex, '{'))
+			return emit_op(lex, -2, TK_LEXP, NULL);
 
-		lex->bufstart = lex->buf;
-		lex->bufend   = lex->buf + rlen + rem;
+		if (tpl && check_char(lex, '%'))
+			return emit_op(lex, -2, TK_LSTM, NULL);
 
-		if (rlen == 0 && (ferror(fp) || feof(fp)))
-			lex->eof = 1;
-	}
+		return emit_op(lex, -1, TK_LBRACE, NULL);
 
-	switch (lex->state) {
-	case UC_LEX_IDENTIFY_BLOCK:
-		/* previous block had strip trailing whitespace flag, skip leading whitespace */
-		if (lex->modifier == MINUS) {
-			while (buf_remaining(lex) && isspace(lex->bufstart[0]))
-				buf_consume(lex, 1);
+	case '^':
+		if (check_char(lex, '='))
+			return emit_op(lex, -2, TK_ASBXOR, NULL);
 
-			lex->modifier = UNSPEC;
+		return emit_op(lex, -1, TK_BXOR, NULL);
+
+	case '[':
+		return emit_op(lex, -1, TK_LBRACK, NULL);
+
+	case ']':
+		return emit_op(lex, -1, TK_RBRACK, NULL);
+
+	case '?':
+		if (check_char(lex, '?')) {
+			if (check_char(lex, '='))
+				return emit_op(lex, -3, TK_ASNULLISH, NULL);
+
+			return emit_op(lex, -2, TK_NULLISH, NULL);
 		}
 
-		/* previous block was a statement block and trim_blocks is enabld, skip leading newline */
-		else if (lex->modifier == NEWLINE) {
-			if (buf_startswith(lex, "\n"))
-				buf_consume(lex, 1);
+		if (check_char(lex, '.')) {
+			if (check_char(lex, '['))
+				return emit_op(lex, -3, TK_QLBRACK, NULL);
+
+			if (check_char(lex, '('))
+				return emit_op(lex, -3, TK_QLPAREN, NULL);
 
-			lex->modifier = UNSPEC;
+			return emit_op(lex, -2, TK_QDOT, NULL);
 		}
 
-		/* scan forward through buffer to identify start token */
-		for (ptr = lex->bufstart; ptr < lex->bufend - strlen("{#"); ptr++) {
-			/* found start of comment block */
-			if (!strncmp(ptr, "{#", 2)) {
-				lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
-				buf_consume(lex, (ptr + 2) - lex->bufstart);
-				lex->lastoff = lex->source->off - 2;
-				lex->state = UC_LEX_BLOCK_COMMENT_START;
+		return emit_op(lex, lex->source->off, TK_QMARK, NULL);
 
-				return NULL;
-			}
+	case '>':
+		if (check_char(lex, '>')) {
+			if (check_char(lex, '='))
+				return emit_op(lex, -3, TK_ASRIGHT, NULL);
 
-			/* found start of expression block */
-			else if (!strncmp(ptr, "{{", 2)) {
-				lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
-				buf_consume(lex, (ptr + 2) - lex->bufstart);
-				lex->lastoff = lex->source->off - 2;
-				lex->state = UC_LEX_BLOCK_EXPRESSION_START;
+			return emit_op(lex, -2, TK_RSHIFT, NULL);
+		}
 
-				return NULL;
-			}
+		if (check_char(lex, '='))
+			return emit_op(lex, -2, TK_GE, NULL);
 
-			/* found start of statement block */
-			else if (!strncmp(ptr, "{%", 2)) {
-				lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
-				buf_consume(lex, (ptr + 2) - lex->bufstart);
-				lex->lastoff = lex->source->off - 2;
-				lex->state = UC_LEX_BLOCK_STATEMENT_START;
+		return emit_op(lex, -1, TK_GT, NULL);
 
-				return NULL;
-			}
+	case '=':
+		if (check_char(lex, '=')) {
+			if (check_char(lex, '='))
+				return emit_op(lex, -3, TK_EQS, NULL);
+
+			return emit_op(lex, -2, TK_EQ, NULL);
 		}
 
-		/* we're at eof */
-		if (lex->eof) {
-			lookbehind_append(lex, ptr, lex->bufend - ptr);
-			lex->state = UC_LEX_EOF;
+		if (check_char(lex, '>'))
+			return emit_op(lex, -2, TK_ARROW, NULL);
+
+		return emit_op(lex, -1, TK_ASSIGN, NULL);
+
+	case '<':
+		if (check_char(lex, '<')) {
+			if (check_char(lex, '='))
+				return emit_op(lex, -3, TK_ASLEFT, NULL);
 
-			return lookbehind_to_text(lex, lex->lastoff, TK_TEXT, NULL);
+			return emit_op(lex, -2, TK_LSHIFT, NULL);
 		}
 
-		lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
-		buf_consume(lex, ptr - lex->bufstart);
-		break;
+		if (check_char(lex, '='))
+			return emit_op(lex, -2, TK_LE, NULL);
+
+		return emit_op(lex, -1, TK_LT, NULL);
+
+	case ';':
+		return emit_op(lex, -1, TK_SCOL, NULL);
 
+	case ':':
+		return emit_op(lex, -1, TK_COLON, NULL);
 
-	case UC_LEX_BLOCK_COMMENT_START:
-	case UC_LEX_BLOCK_EXPRESSION_START:
-	case UC_LEX_BLOCK_STATEMENT_START:
-		rv = NULL;
-		lex->modifier = UNSPEC;
+	case '/':
+		ch = lookahead_char(lex);
+		lex->lastoff = lex->source->off - 1;
 
-		/* strip whitespace before block */
-		if (buf_startswith(lex, "-")) {
-			rv = lookbehind_to_text(lex, lex->source->off, TK_TEXT, " \n\t\v\f\r");
-			buf_consume(lex, 1);
+		if (ch == '/' || ch == '*')
+			return parse_comment(lex, ch);
+
+		if (lex->no_regexp) {
+			if (check_char(lex, '='))
+				return emit_op(lex, -2, TK_ASDIV, NULL);
+
+			return emit_op(lex, -1, TK_DIV, NULL);
 		}
 
-		/* disable lstrip flag (only valid for statement blocks) */
-		else if (lex->state == UC_LEX_BLOCK_STATEMENT_START) {
-			/* disable lstrip flag */
-			if (buf_startswith(lex, "+")) {
-				rv = lookbehind_to_text(lex, lex->source->off, TK_TEXT, NULL);
-				buf_consume(lex, 1);
-			}
+		return parse_regexp(lex);
 
-			/* put out text leading up to the opening tag and potentially
-			 * strip trailing white space from it depending on the global
-			 * block lstrip setting */
-			else {
-				rv = lookbehind_to_text(lex, lex->source->off, TK_TEXT,
-					(lex->config && lex->config->lstrip_blocks) ? " \t\v\f\r" : NULL);
+	case '.':
+		if (check_char(lex, '.')) {
+			if (check_char(lex, '.'))
+				return emit_op(lex, -3, TK_ELLIP, NULL);
+
+			/* The sequence ".." cannot be a valid token */
+			return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unexpected character"));
+		}
+
+		return emit_op(lex, -1, TK_DOT, NULL);
+
+	case '-':
+		if (tpl && check_char(lex, '}')) {
+			if (check_char(lex, '}')) {
+				lex->modifier = MINUS;
+
+				return emit_op(lex, -3, TK_REXP, NULL);
 			}
+
+			/* The sequence "-}" cannot be a valid token */
+			return emit_op(lex, -1, TK_ERROR, ucv_string_new("Unexpected character"));
 		}
-		else {
-			rv = lookbehind_to_text(lex, lex->source->off, TK_TEXT, NULL);
+
+		if (tpl && check_char(lex, '%')) {
+			if (check_char(lex, '}')) {
+				lex->modifier = MINUS;
+
+				return emit_op(lex, -3, TK_RSTM, NULL);
+			}
+
+			/* The sequence "-%" cannot be a valid token */
+			return emit_op(lex, -1, TK_ERROR, ucv_string_new("Unexpected character"));
 		}
 
-		switch (lex->state) {
-		case UC_LEX_BLOCK_COMMENT_START:
-			lex->state = UC_LEX_BLOCK_COMMENT;
-			lex->block = COMMENT;
-			break;
+		if (check_char(lex, '='))
+			return emit_op(lex, -2, TK_ASSUB, NULL);
 
-		case UC_LEX_BLOCK_STATEMENT_START:
-			lex->state = UC_LEX_IDENTIFY_TOKEN;
-			lex->block = STATEMENTS;
-			break;
+		if (check_char(lex, '-'))
+			return emit_op(lex, -2, TK_DEC, NULL);
 
-		case UC_LEX_BLOCK_EXPRESSION_START:
-			lex->state = UC_LEX_BLOCK_EXPRESSION_EMIT_TAG;
-			break;
+		return emit_op(lex, -1, TK_SUB, NULL);
 
-		default:
-			break;
+	case ',':
+		return emit_op(lex, -1, TK_COMMA, NULL);
+
+	case '+':
+		if (check_char(lex, '='))
+			return emit_op(lex, -2, TK_ASADD, NULL);
+
+		if (check_char(lex, '+'))
+			return emit_op(lex, -2, TK_INC, NULL);
+
+		return emit_op(lex, -1, TK_ADD, NULL);
+
+	case '*':
+		if (check_char(lex, '*')) {
+			if (check_char(lex, '='))
+				return emit_op(lex, -3, TK_ASEXP, NULL);
+
+			return emit_op(lex, -2, TK_EXP, NULL);
 		}
 
-		return rv;
+		if (check_char(lex, '='))
+			return emit_op(lex, -2, TK_ASMUL, NULL);
 
+		return emit_op(lex, -1, TK_MUL, NULL);
 
-	case UC_LEX_BLOCK_COMMENT:
-		/* scan forward through buffer to identify end token */
-		while (lex->bufstart < lex->bufend - 2) {
-			if (buf_startswith(lex, "-#}")) {
-				lex->state = UC_LEX_IDENTIFY_BLOCK;
-				lex->modifier = MINUS;
-				buf_consume(lex, 3);
-				lex->lastoff = lex->source->off;
-				break;
-			}
-			else if (buf_startswith(lex, "#}")) {
-				lex->state = UC_LEX_IDENTIFY_BLOCK;
-				buf_consume(lex, 2);
-				lex->lastoff = lex->source->off;
-				break;
-			}
+	case '(':
+		return emit_op(lex, -1, TK_LPAREN, NULL);
+
+	case ')':
+		return emit_op(lex, -1, TK_RPAREN, NULL);
 
-			buf_consume(lex, 1);
+	case '\'':
+	case '"':
+	case '`':
+		lex->lastoff = lex->source->off - 1;
+
+		return parse_string(lex, ch);
+
+	case '&':
+		if (check_char(lex, '&')) {
+			if (check_char(lex, '='))
+				return emit_op(lex, -3, TK_ASAND, NULL);
+
+			return emit_op(lex, -2, TK_AND, NULL);
 		}
 
-		/* we're at eof */
-		if (lex->eof) {
-			lex->state = UC_LEX_EOF;
+		if (check_char(lex, '='))
+			return emit_op(lex, -2, TK_ASBAND, NULL);
+
+		return emit_op(lex, -1, TK_BAND, NULL);
 
-			buf_consume(lex, lex->bufend - lex->bufstart);
+	case '%':
+		if (tpl && check_char(lex, '}'))
+			return emit_op(lex, -2, TK_RSTM, NULL);
 
-			return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated template block"));
+		if (check_char(lex, '='))
+			return emit_op(lex, -2, TK_ASMOD, NULL);
+
+		return emit_op(lex, -1, TK_MOD, NULL);
+
+	case '!':
+		if (check_char(lex, '=')) {
+			if (check_char(lex, '='))
+				return emit_op(lex, -3, TK_NES, NULL);
+
+			return emit_op(lex, -2, TK_NE, NULL);
 		}
 
-		break;
+		return emit_op(lex, -1, TK_NOT, NULL);
 
+	case EOF:
+		return emit_op(lex, -1, TK_EOF, NULL);
 
-	case UC_LEX_BLOCK_EXPRESSION_EMIT_TAG:
-		lex->state = UC_LEX_IDENTIFY_TOKEN;
-		lex->block = EXPRESSION;
+	default:
+		if (isalpha(ch) || ch == '_')
+			return parse_label(lex, ch);
 
-		return emit_op(lex, lex->source->off, TK_LEXP, NULL);
+		if (isdigit(ch))
+			return parse_number(lex, ch);
 
+		return emit_op(lex, -1, TK_ERROR, ucv_string_new("Unexpected character"));
+	}
+}
 
-	case UC_LEX_IDENTIFY_TOKEN:
-		/* skip leading whitespace */
-		for (i = 0; i < buf_remaining(lex) && isspace(lex->bufstart[i]); i++)
-			;
+static uc_token_t *
+lex_step(uc_lexer_t *lex)
+{
+	const char *strip = NULL;
+	uc_token_t *tok;
+	size_t *nest;
+	int ch;
 
-		buf_consume(lex, i);
+	while (lex->state != UC_LEX_EOF) {
+		switch (lex->state) {
+		case UC_LEX_IDENTIFY_BLOCK:
+			ch = next_char(lex);
 
-		if (i > 0 && buf_remaining(lex) < UC_LEX_MAX_TOKEN_LEN)
-			return NULL;
+			/* previous block had strip trailing whitespace flag, skip leading whitespace */
+			if (lex->modifier == MINUS) {
+				while (isspace(ch))
+					ch = next_char(lex);
 
-		for (i = 0; i < sizeof(search.str); i++)
-			search.str[i] = (i < buf_remaining(lex)) ? lex->bufstart[i] : 0;
+				lex->modifier = UNSPEC;
+			}
 
-		for (i = 0, tok = tokens; i < ARRAY_SIZE(tokens); tok = &tokens[++i]) {
-			/* remaining buffer data is shorter than token, skip */
-			if (tok->plen > buf_remaining(lex))
-				continue;
+			/* previous block was a statement block and trim_blocks is enabled, skip leading newline */
+			else if (lex->modifier == NEWLINE) {
+				if (ch == '\n')
+					ch = next_char(lex);
 
-			c = buf_remaining(lex) ? lex->bufstart[0] : 0;
+				lex->modifier = UNSPEC;
+			}
 
-			if (tok->plen ? ((search.n & masks[tok->plen]) == tok->u.patn)
-			              : (c >= tok->u.pat[0] && c <= tok->u.pat[1])) {
-				lex->lastoff = lex->source->off;
+			/* scan forward through buffer to identify block start token */
+			while (ch != EOF) {
+				if (ch == '{') {
+					ch = next_char(lex);
 
-				/* token has a parse method, switch state */
-				if (tok->parse) {
-					lex->tok = tok;
-					lex->state = UC_LEX_PARSE_TOKEN;
+					switch (ch) {
+					/* found start of comment block */
+					case '#':
+						lex->state = UC_LEX_BLOCK_COMMENT;
+						lex->block = COMMENT;
 
-					buf_consume(lex, tok->plen);
+						if (check_char(lex, '-'))
+							strip = " \n\t\v\f\r";
 
-					return NULL;
-				}
+						break;
 
-				/* in raw code mode, ignore template tag tokens */
-				if (lex->config && lex->config->raw_mode &&
-				    (tok->type == TK_LSTM || tok->type == TK_RSTM ||
-				     tok->type == TK_LEXP || tok->type == TK_REXP)) {
-					continue;
-				}
+					/* found start of expression block */
+					case '{':
+						lex->state = UC_LEX_BLOCK_EXPRESSION_EMIT_TAG;
 
-				/* disallow nesting blocks */
-				if (tok->type == TK_LSTM || tok->type == TK_LEXP) {
-					buf_consume(lex, tok->plen);
+						if (check_char(lex, '-'))
+							strip = " \n\t\v\f\r";
 
-					return emit_op(lex, lex->source->off - tok->plen, TK_ERROR, ucv_string_new("Template blocks may not be nested"));
-				}
+						break;
 
-				/* found end of block */
-				else if ((lex->block == STATEMENTS && tok->type == TK_RSTM) ||
-				         (lex->block == EXPRESSION && tok->type == TK_REXP)) {
-					/* strip whitespace after block */
-					if (tok->u.pat[0] == '-')
-						lex->modifier = MINUS;
+					/* found start of statement block */
+					case '%':
+						lex->state = UC_LEX_IDENTIFY_TOKEN;
+						lex->block = STATEMENTS;
 
-					/* strip newline after statement block */
-					else if (lex->block == STATEMENTS &&
-					         lex->config && lex->config->trim_blocks)
-						lex->modifier = NEWLINE;
+						if (check_char(lex, '-'))
+							strip = " \n\t\v\f\r";
+						else if (check_char(lex, '+'))
+							strip = NULL;
+						else if (lex->config && lex->config->lstrip_blocks)
+							strip = " \t\v\f\r";
 
-					lex->state = UC_LEX_IDENTIFY_BLOCK;
-					lex->block = NONE;
-				}
+						break;
+
+					default:
+						/* not a start tag, remember char and move on */
+						uc_vector_push(&lex->buffer, '{');
+						continue;
+					}
 
-				/* track opening braces */
-				else if (tok->type == TK_LBRACE && lex->templates.count > 0) {
-					nest = uc_vector_last(&lex->templates);
-					(*nest)++;
+					break;
 				}
 
-				/* check end of placeholder expression */
-				else if (tok->type == TK_RBRACE && lex->templates.count > 0) {
-					nest = uc_vector_last(&lex->templates);
+				uc_vector_push(&lex->buffer, ch);
+				ch = next_char(lex);
+			}
 
-					if (*nest == 0) {
-						lex->templates.count--;
-						lex->state = UC_LEX_PARSE_TOKEN;
-						lex->tok = &tokens[ARRAY_SIZE(tokens) - 1]; /* NB: TK_TEMPLATE token spec */
-					}
-					else {
-						(*nest)--;
-					}
+			if (ch == EOF)
+				lex->state = UC_LEX_EOF;
+
+			/* push out leading text */
+			tok = emit_buffer(lex, lex->lastoff, TK_TEXT, strip);
+			lex->lastoff = lex->source->off - 2;
+
+			if (!tok)
+				continue;
+
+			return tok;
+
+
+		case UC_LEX_BLOCK_COMMENT:
+			ch = next_char(lex);
+
+			/* scan forward through buffer to identify end token */
+			while (ch != EOF) {
+				if (ch == '-' && check_char(lex, '#') && check_char(lex, '}')) {
+					lex->modifier = MINUS;
+					break;
 				}
 
-				/* do not report statement tags to the parser */
-				if (tok->type != 0 && tok->type != TK_LSTM)
-					rv = emit_op(lex, lex->source->off,
-						(tok->type == TK_RSTM) ? TK_SCOL : tok->type, NULL);
-				else
-					rv = NULL;
+				if (ch == '#' && check_char(lex, '}'))
+					break;
+
+				ch = next_char(lex);
+			}
 
-				buf_consume(lex, tok->plen);
+			if (ch == EOF) {
+				lex->state = UC_LEX_EOF;
 
-				return rv;
+				return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated template block"));
 			}
-		}
 
-		/* no possible return beyond this point can advance,
-		   mark lex state as eof */
-		lex->state = UC_LEX_EOF;
+			lex->lastoff = lex->source->off;
+			lex->state = UC_LEX_IDENTIFY_BLOCK;
 
-		/* no token matched and we do have remaining data, junk */
-		if (buf_remaining(lex))
-			return emit_op(lex, lex->source->off, TK_ERROR, ucv_string_new("Unexpected character"));
+			continue;
 
-		/* we're at eof, allow unclosed statement blocks */
-		if (lex->block == STATEMENTS)
-			return NULL;
 
-		/* premature EOF */
-		return emit_op(lex, lex->source->off, TK_ERROR, ucv_string_new("Unterminated template block"));
+		case UC_LEX_BLOCK_EXPRESSION_EMIT_TAG:
+			lex->state = UC_LEX_IDENTIFY_TOKEN;
+			lex->block = EXPRESSION;
 
+			return emit_op(lex, lex->source->off, TK_LEXP, NULL);
 
-	case UC_LEX_PARSE_TOKEN:
-		tok = lex->tok;
-		rv = tok->parse(lex);
 
-		if (rv) {
-			memset(lex->esc, 0, sizeof(lex->esc));
-			lex->state = lex->is_placeholder ? UC_LEX_PLACEHOLDER : UC_LEX_IDENTIFY_TOKEN;
-			lex->is_placeholder = false;
-			lex->tok = NULL;
+		case UC_LEX_IDENTIFY_TOKEN:
+			do { tok = lex_find_token(lex); } while (tok == NULL);
 
-			if (rv == UC_LEX_CONTINUE_PARSING)
-				rv = NULL;
+			/* disallow nesting blocks */
+			if (tok->type == TK_LSTM || tok->type == TK_LEXP)
+				return emit_op(lex, -2, TK_ERROR, ucv_string_new("Template blocks may not be nested"));
 
-			return rv;
-		}
+			/* found end of statement block */
+			if (lex->block == STATEMENTS && tok->type == TK_RSTM) {
+				/* strip newline after statement block? */
+				if (lex->modifier == UNSPEC && lex->config && lex->config->trim_blocks)
+					lex->modifier = NEWLINE;
 
-		break;
+				lex->lastoff = lex->source->off;
+				lex->state = UC_LEX_IDENTIFY_BLOCK;
+				lex->block = NONE;
 
+				tok = emit_op(lex, -2, TK_SCOL, NULL);
+			}
 
-	case UC_LEX_PLACEHOLDER:
-		lex->state = UC_LEX_IDENTIFY_TOKEN;
+			/* found end of expression block */
+			else if (lex->block == EXPRESSION && tok->type == TK_REXP) {
+				lex->lastoff = lex->source->off;
+				lex->state = UC_LEX_IDENTIFY_BLOCK;
+				lex->block = NONE;
+			}
+
+			/* track opening braces */
+			else if (tok->type == TK_LBRACE && lex->templates.count > 0) {
+				nest = uc_vector_last(&lex->templates);
+				(*nest)++;
+			}
+
+			/* check end of placeholder expression */
+			else if (tok->type == TK_RBRACE && lex->templates.count > 0) {
+				nest = uc_vector_last(&lex->templates);
+
+				if (*nest == 0) {
+					lex->templates.count--;
+					lex->state = UC_LEX_PLACEHOLDER_END;
+				}
+				else {
+					(*nest)--;
+				}
+			}
+
+			/* premature EOF? */
+			else if (tok->type == TK_EOF && lex->block != STATEMENTS) {
+				lex->state = UC_LEX_EOF;
+
+				return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unterminated template block"));
+			}
 
-		uc_vector_push(&lex->templates, 0);
+			return tok;
 
-		return emit_op(lex, lex->source->off, TK_PLACEH, NULL);
 
+		case UC_LEX_PLACEHOLDER_START:
+			lex->state = UC_LEX_IDENTIFY_TOKEN;
+
+			uc_vector_push(&lex->templates, 0);
+
+			return emit_op(lex, -2, TK_PLACEH, NULL);
 
-	case UC_LEX_EOF:
-		break;
+
+		case UC_LEX_PLACEHOLDER_END:
+			lex->state = UC_LEX_IDENTIFY_TOKEN;
+
+			return parse_string(lex, '`');
+
+
+		case UC_LEX_EOF:
+			break;
+		}
 	}
 
-	return NULL;
+	return emit_op(lex, lex->source->off, TK_EOF, NULL);
 }
 
 void
@@ -1115,24 +1012,15 @@ uc_lexer_init(uc_lexer_t *lex, uc_parse_config_t *config, uc_source_t *source)
 	lex->config = config;
 	lex->source = uc_source_get(source);
 
-	lex->eof = 0;
-	lex->is_escape = 0;
-
 	lex->block = NONE;
 	lex->modifier = UNSPEC;
 
-	lex->buflen = 0;
-	lex->buf = NULL;
-	lex->bufstart = NULL;
-	lex->bufend = NULL;
-
-	lex->lookbehindlen = 0;
-	lex->lookbehind = NULL;
+	lex->rlen = 0;
+	lex->rpos = 0;
+	lex->rbuf = NULL;
 
-	lex->tok = NULL;
-
-	lex->esclen = 0;
-	memset(lex->esc, 0, sizeof(lex->esc));
+	lex->buffer.count = 0;
+	lex->buffer.entries = NULL;
 
 	lex->lead_surrogate = 0;
 
@@ -1150,11 +1038,12 @@ uc_lexer_init(uc_lexer_t *lex, uc_parse_config_t *config, uc_source_t *source)
 void
 uc_lexer_free(uc_lexer_t *lex)
 {
+	uc_vector_clear(&lex->buffer);
 	uc_vector_clear(&lex->templates);
+
 	uc_source_put(lex->source);
 
-	free(lex->lookbehind);
-	free(lex->buf);
+	free(lex->rbuf);
 }
 
 uc_token_t *
@@ -1162,47 +1051,94 @@ uc_lexer_next_token(uc_lexer_t *lex)
 {
 	uc_token_t *rv = NULL;
 
-	while (lex->state != UC_LEX_EOF) {
-		rv = lex_step(lex, lex->source->fp);
-
-		if (rv != NULL)
-			break;
-	}
-
-	if (rv) {
-		lex->no_keyword = false;
-		lex->no_regexp = false;
+	rv = lex_step(lex);
 
-		return rv;
-	}
+	lex->no_keyword = false;
+	lex->no_regexp = false;
 
-	return emit_op(lex, lex->source->off, TK_EOF, NULL);
+	return rv;
 }
 
 const char *
 uc_tokenname(unsigned type)
 {
 	static char buf[sizeof("'endfunction'")];
-	size_t i;
-
-	switch (type) {
-	case 0:           return "End of file";
-	case TK_TEMPLATE: return "Template";
-	case TK_STRING:   return "String";
-	case TK_LABEL:    return "Label";
-	case TK_NUMBER:   return "Number";
-	case TK_DOUBLE:   return "Double";
-	case TK_REGEXP:   return "Regexp";
-	}
-
-	for (i = 0; i < ARRAY_SIZE(tokens); i++) {
-		if (tokens[i].type != type)
-			continue;
+	const char *tokennames[] = {
+		[TK_LEXP] = "'{{'",
+		[TK_REXP] = "'}}'",
+		[TK_LSTM] = "'{%'",
+		[TK_RSTM] = "'%}'",
+		[TK_COMMA] = "','",
+		[TK_ASSIGN] = "'='",
+		[TK_ASADD] = "'+='",
+		[TK_ASSUB] = "'-='",
+		[TK_ASMUL] = "'*='",
+		[TK_ASDIV] = "'/='",
+		[TK_ASMOD] = "'%='",
+		[TK_ASLEFT] = "'<<='",
+		[TK_ASRIGHT] = "'>>='",
+		[TK_ASBAND] = "'&='",
+		[TK_ASBXOR] = "'^='",
+		[TK_ASBOR] = "'|='",
+		[TK_QMARK] = "'?'",
+		[TK_COLON] = "':'",
+		[TK_OR] = "'||'",
+		[TK_AND] = "'&&'",
+		[TK_BOR] = "'|'",
+		[TK_BXOR] = "'^'",
+		[TK_BAND] = "'&'",
+		[TK_EQS] = "'==='",
+		[TK_NES] = "'!=='",
+		[TK_EQ] = "'=='",
+		[TK_NE] = "'!='",
+		[TK_LT] = "'<'",
+		[TK_LE] = "'<='",
+		[TK_GT] = "'>'",
+		[TK_GE] = "'>='",
+		[TK_LSHIFT] = "'<<'",
+		[TK_RSHIFT] = "'>>'",
+		[TK_ADD] = "'+'",
+		[TK_SUB] = "'-'",
+		[TK_MUL] = "'*'",
+		[TK_DIV] = "'/'",
+		[TK_MOD] = "'%'",
+		[TK_EXP] = "'**'",
+		[TK_NOT] = "'!'",
+		[TK_COMPL] = "'~'",
+		[TK_INC] = "'++'",
+		[TK_DEC] = "'--'",
+		[TK_DOT] = "'.'",
+		[TK_LBRACK] = "'['",
+		[TK_RBRACK] = "']'",
+		[TK_LPAREN] = "'('",
+		[TK_RPAREN] = "')'",
+		[TK_LBRACE] = "'{'",
+		[TK_RBRACE] = "'}'",
+		[TK_SCOL] = "';'",
+		[TK_ELLIP] = "'...'",
+		[TK_ARROW] = "'=>'",
+		[TK_QLBRACK] = "'?.['",
+		[TK_QLPAREN] = "'?.('",
+		[TK_QDOT] = "'?.'",
+		[TK_ASEXP] = "'**='",
+		[TK_ASAND] = "'&&='",
+		[TK_ASOR] = "'||='",
+		[TK_ASNULLISH] = "'\?\?='",
+		[TK_NULLISH] = "'\?\?'",
+		[TK_PLACEH] = "'${'",
+
+		[TK_TEXT] = "Text",
+		[TK_LABEL] = "Label",
+		[TK_NUMBER] = "Number",
+		[TK_DOUBLE] = "Double",
+		[TK_STRING] = "String",
+		[TK_REGEXP] = "Regexp",
+		[TK_TEMPLATE] = "Template",
+		[TK_ERROR] = "Error",
+		[TK_EOF] = "End of file",
+	};
 
-		snprintf(buf, sizeof(buf), "'%s'", tokens[i].u.pat);
-
-		return buf;
-	}
+	size_t i;
 
 	for (i = 0; i < ARRAY_SIZE(reserved_words); i++) {
 		if (reserved_words[i].type != type)
@@ -1213,7 +1149,7 @@ uc_tokenname(unsigned type)
 		return buf;
 	}
 
-	return "?";
+	return tokennames[type] ? tokennames[type] : "?";
 }
 
 bool
author	Jo-Philipp Wich <jo@mein.io>	2022-07-14 14:33:12 +0200
committer	Jo-Philipp Wich <jo@mein.io>	2022-07-28 13:18:30 +0200
commit	03c8e4b465c8cffd2596d2741b29ad2ba4ec1765 (patch)
tree	6a43c9f54be5e3de4fcbc73b5ebaa518e642d3ad /lexer.c
parent	1219d7efa170bf38fb1bf6a10fa0d1f96e62f091 (diff)