treewide: rewrite ucode interpreter

Replace the former AST walking interpreter implementation with a single pass bytecode compiler and a corresponding virtual machine. The rewrite lays the groundwork for a couple of improvements with will be subsequently implemented: - Ability to precompile ucode sources into binary byte code - Strippable debug information - Reduced runtime memory usage Signed-off-by: Jo-Philipp Wich <jo@mein.io>
author: Jo-Philipp Wich <jo@mein.io> 2020-12-23 20:54:05 +0100
committer: Jo-Philipp Wich <jo@mein.io> 2021-02-17 14:10:51 +0100
commit: 3756806674da909ec6dc10ad25862b592792604e (patch)
tree: f2af7e47f8444caaff0a5a33599f381889db24e3 /lexer.c
parent: 77580a893283f2bde7ab46496bd3a3d7b2fc6784 (diff)
1 files changed, 638 insertions, 514 deletions
diff --git a/lexer.c b/lexer.c
index 46692ec..a1ae73d 100644
--- a/lexer.c
+++ b/lexer.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2020 Jo-Philipp Wich <jo@mein.io>
+ * Copyright (C) 2020-2021 Jo-Philipp Wich <jo@mein.io>
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -25,11 +25,11 @@
 #include <errno.h>
 #include <endian.h>
 
-#include "ast.h"
+#include "vm.h"
 #include "lib.h"
 #include "lexer.h"
-#include "parser.h"
 
+#define UC_LEX_CONTINUE_PARSING (void *)1
 
 struct keyword {
 	int type;
@@ -48,7 +48,7 @@ struct token {
 		char pat[4];
 	};
 	int plen;
-	uint32_t (*parse)(struct uc_state *s);
+	uc_token *(*parse)(uc_lexer *);
 };
 
 #define dec(o) \
@@ -58,109 +58,110 @@ struct token {
 	(((x) >= 'a') ? (10 + (x) - 'a') : \
 		(((x) >= 'A') ? (10 + (x) - 'A') : dec(x)))
 
-static uint32_t parse_comment(struct uc_state *);
-static uint32_t parse_string(struct uc_state *);
-static uint32_t parse_regexp(struct uc_state *);
-static uint32_t parse_number(struct uc_state *);
-static uint32_t parse_label(struct uc_state *);
+static uc_token *parse_comment(uc_lexer *);
+static uc_token *parse_string(uc_lexer *);
+static uc_token *parse_regexp(uc_lexer *);
+static uc_token *parse_number(uc_lexer *);
+static uc_token *parse_label(uc_lexer *);
 
 static const struct token tokens[] = {
-	{ T_ASLEFT,		{ .pat = "<<=" },   3 },
-	{ T_ASRIGHT,	{ .pat = ">>=" },   3 },
-	{ T_LEXP,		{ .pat = "{{-" },   3 },
-	{ T_REXP,		{ .pat = "-}}" },   3 },
-	{ T_LSTM,		{ .pat = "{%+" },   3 },
-	{ T_LSTM,		{ .pat = "{%-" },   3 },
-	{ T_RSTM,		{ .pat = "-%}" },   3 },
-	{ T_EQS,		{ .pat = "===" },   3 },
-	{ T_NES,		{ .pat = "!==" },   3 },
-	{ T_ELLIP,		{ .pat = "..." },   3 },
-	{ T_AND,		{ .pat = "&&" },    2 },
-	{ T_ASADD,		{ .pat = "+=" },    2 },
-	{ T_ASBAND,		{ .pat = "&=" },    2 },
-	{ T_ASBOR,		{ .pat = "|=" },    2 },
-	{ T_ASBXOR,		{ .pat = "^=" },    2 },
-	//{ T_ASDIV,		{ .pat = "/=" },    2 },
-	{ T_ASMOD,		{ .pat = "%=" },    2 },
-	{ T_ASMUL,		{ .pat = "*=" },    2 },
-	{ T_ASSUB,		{ .pat = "-=" },    2 },
-	{ T_DEC,		{ .pat = "--" },    2 },
-	{ T_INC,		{ .pat = "++" },    2 },
-	{ T_EQ,			{ .pat = "==" },    2 },
-	{ T_NE,			{ .pat = "!=" },    2 },
-	{ T_LE,			{ .pat = "<=" },    2 },
-	{ T_GE,			{ .pat = ">=" },    2 },
-	{ T_LSHIFT,		{ .pat = "<<" },    2 },
-	{ T_RSHIFT,		{ .pat = ">>" },    2 },
+	{ TK_ASLEFT,	{ .pat = "<<=" },   3 },
+	{ TK_ASRIGHT,	{ .pat = ">>=" },   3 },
+	{ TK_LEXP,		{ .pat = "{{-" },   3 },
+	{ TK_REXP,		{ .pat = "-}}" },   3 },
+	{ TK_LSTM,		{ .pat = "{%+" },   3 },
+	{ TK_LSTM,		{ .pat = "{%-" },   3 },
+	{ TK_RSTM,		{ .pat = "-%}" },   3 },
+	{ TK_EQS,		{ .pat = "===" },   3 },
+	{ TK_NES,		{ .pat = "!==" },   3 },
+	{ TK_ELLIP,		{ .pat = "..." },   3 },
+	{ TK_AND,		{ .pat = "&&" },    2 },
+	{ TK_ASADD,		{ .pat = "+=" },    2 },
+	{ TK_ASBAND,	{ .pat = "&=" },    2 },
+	{ TK_ASBOR,		{ .pat = "|=" },    2 },
+	{ TK_ASBXOR,	{ .pat = "^=" },    2 },
+	//{ TK_ASDIV,	{ .pat = "/=" },    2 },
+	{ TK_ASMOD,		{ .pat = "%=" },    2 },
+	{ TK_ASMUL,		{ .pat = "*=" },    2 },
+	{ TK_ASSUB,		{ .pat = "-=" },    2 },
+	{ TK_DEC,		{ .pat = "--" },    2 },
+	{ TK_INC,		{ .pat = "++" },    2 },
+	{ TK_EQ,		{ .pat = "==" },    2 },
+	{ TK_NE,		{ .pat = "!=" },    2 },
+	{ TK_LE,		{ .pat = "<=" },    2 },
+	{ TK_GE,		{ .pat = ">=" },    2 },
+	{ TK_LSHIFT,	{ .pat = "<<" },    2 },
+	{ TK_RSHIFT,	{ .pat = ">>" },    2 },
 	{ 0,			{ .pat = "//" },    2, parse_comment },
 	{ 0,			{ .pat = "/*" },    2, parse_comment },
-	{ T_OR,			{ .pat = "||" },    2 },
-	{ T_LEXP,		{ .pat = "{{" },    2 },
-	{ T_REXP,		{ .pat = "}}" },    2 },
-	{ T_LSTM,		{ .pat = "{%" },    2 },
-	{ T_RSTM,		{ .pat = "%}" },    2 },
-	{ T_ARROW,		{ .pat = "=>" },    2 },
-	{ T_ADD,		{ .pat = "+" },     1 },
-	{ T_ASSIGN,		{ .pat = "=" },     1 },
-	{ T_BAND,		{ .pat = "&" },     1 },
-	{ T_BOR,		{ .pat = "|" },     1 },
-	{ T_LBRACK,		{ .pat = "[" },     1 },
-	{ T_RBRACK,		{ .pat = "]" },     1 },
-	{ T_BXOR,		{ .pat = "^" },     1 },
-	{ T_LBRACE,		{ .pat = "{" },     1 },
-	{ T_RBRACE,		{ .pat = "}" },     1 },
-	{ T_COLON,		{ .pat = ":" },     1 },
-	{ T_COMMA,		{ .pat = "," },     1 },
-	{ T_COMPL,		{ .pat = "~" },     1 },
-	//{ T_DIV,		{ .pat = "/" },     1 },
-	{ T_GT,			{ .pat = ">" },     1 },
-	{ T_NOT,		{ .pat = "!" },     1 },
-	{ T_LT,			{ .pat = "<" },     1 },
-	{ T_MOD,		{ .pat = "%" },     1 },
-	{ T_MUL,		{ .pat = "*" },     1 },
-	{ T_LPAREN,		{ .pat = "(" },     1 },
-	{ T_RPAREN,		{ .pat = ")" },     1 },
-	{ T_QMARK,		{ .pat = "?" },     1 },
-	{ T_SCOL,		{ .pat = ";" },     1 },
-	{ T_SUB,		{ .pat = "-" },     1 },
-	{ T_DOT,		{ .pat = "." },     1 },
-	{ T_STRING,		{ .pat = "'" },     1, parse_string },
-	{ T_STRING,		{ .pat = "\"" },    1, parse_string },
-	{ T_REGEXP,		{ .pat = "/" },     1, parse_regexp },
-	{ T_LABEL,		{ .pat = "_" },     1, parse_label  },
-	{ T_LABEL,		{ .pat = "az" },    0, parse_label  },
-	{ T_LABEL,		{ .pat = "AZ" },    0, parse_label  },
-	{ T_NUMBER,		{ .pat = "09" },    0, parse_number },
+	{ TK_OR,		{ .pat = "||" },    2 },
+	{ TK_LEXP,		{ .pat = "{{" },    2 },
+	{ TK_REXP,		{ .pat = "}}" },    2 },
+	{ TK_LSTM,		{ .pat = "{%" },    2 },
+	{ TK_RSTM,		{ .pat = "%}" },    2 },
+	{ TK_ARROW,		{ .pat = "=>" },    2 },
+	{ TK_ADD,		{ .pat = "+" },     1 },
+	{ TK_ASSIGN,	{ .pat = "=" },     1 },
+	{ TK_BAND,		{ .pat = "&" },     1 },
+	{ TK_BOR,		{ .pat = "|" },     1 },
+	{ TK_LBRACK,	{ .pat = "[" },     1 },
+	{ TK_RBRACK,	{ .pat = "]" },     1 },
+	{ TK_BXOR,		{ .pat = "^" },     1 },
+	{ TK_LBRACE,	{ .pat = "{" },     1 },
+	{ TK_RBRACE,	{ .pat = "}" },     1 },
+	{ TK_COLON,		{ .pat = ":" },     1 },
+	{ TK_COMMA,		{ .pat = "," },     1 },
+	{ TK_COMPL,		{ .pat = "~" },     1 },
+	//{ TK_DIV,		{ .pat = "/" },     1 },
+	{ TK_GT,		{ .pat = ">" },     1 },
+	{ TK_NOT,		{ .pat = "!" },     1 },
+	{ TK_LT,		{ .pat = "<" },     1 },
+	{ TK_MOD,		{ .pat = "%" },     1 },
+	{ TK_MUL,		{ .pat = "*" },     1 },
+	{ TK_LPAREN,	{ .pat = "(" },     1 },
+	{ TK_RPAREN,	{ .pat = ")" },     1 },
+	{ TK_QMARK,		{ .pat = "?" },     1 },
+	{ TK_SCOL,		{ .pat = ";" },     1 },
+	//{ TK_SUB,		{ .pat = "-" },     1 },
+	{ TK_DOT,		{ .pat = "." },     1 },
+	{ TK_STRING,	{ .pat = "'" },     1, parse_string },
+	{ TK_STRING,	{ .pat = "\"" },    1, parse_string },
+	{ TK_REGEXP,	{ .pat = "/" },     1, parse_regexp },
+	{ TK_LABEL,		{ .pat = "_" },     1, parse_label  },
+	{ TK_LABEL,		{ .pat = "az" },    0, parse_label  },
+	{ TK_LABEL,		{ .pat = "AZ" },    0, parse_label  },
+	{ TK_NUMBER,	{ .pat = "-" },     1, parse_number },
+	{ TK_NUMBER,	{ .pat = "09" },    0, parse_number },
 };
 
 static const struct keyword reserved_words[] = {
-	{ T_ENDFUNC,	"endfunction", 11 },
-	{ T_DOUBLE,		"Infinity", 8, { .d = INFINITY } },
-	{ T_CONTINUE,	"continue", 8 },
-	{ T_ENDWHILE,	"endwhile", 8 },
-	{ T_FUNC,		"function", 8 },
-	{ T_DEFAULT,	"default", 7 },
-	{ T_RETURN,		"return", 6 },
-	{ T_ENDFOR,		"endfor", 6 },
-	{ T_SWITCH,		"switch", 6 },
-	{ T_LOCAL,		"local", 5 },
-	{ T_ENDIF,		"endif", 5 },
-	{ T_WHILE,		"while", 5 },
-	{ T_BREAK,		"break", 5 },
-	{ T_CATCH,		"catch", 5 },
-	{ T_BOOL,		"false", 5, { .b = false } },
-	{ T_BOOL,		"true",  4, { .b = true } },
-	{ T_ELIF,		"elif",  4 },
-	{ T_ELSE,		"else",  4 },
-	{ T_THIS,		"this",  4 },
-	{ T_NULL,		"null",  4 },
-	{ T_CASE,		"case",  4 },
-	{ T_DOUBLE,		"NaN",   3, { .d = NAN } },
-	{ T_TRY,		"try",   3 },
-	{ T_FOR,		"for",   3 },
-	{ T_LOCAL,		"let",   3 },
-	{ T_IF,			"if",    2 },
-	{ T_IN,			"in",    2 },
+	{ TK_ENDFUNC,	"endfunction", 11 },
+	{ TK_DOUBLE,	"Infinity", 8, { .d = INFINITY } },
+	{ TK_CONTINUE,	"continue", 8 },
+	{ TK_ENDWHILE,	"endwhile", 8 },
+	{ TK_FUNC,		"function", 8 },
+	{ TK_DEFAULT,	"default", 7 },
+	{ TK_RETURN,	"return", 6 },
+	{ TK_ENDFOR,	"endfor", 6 },
+	{ TK_SWITCH,	"switch", 6 },
+	{ TK_LOCAL,		"local", 5 },
+	{ TK_ENDIF,		"endif", 5 },
+	{ TK_WHILE,		"while", 5 },
+	{ TK_BREAK,		"break", 5 },
+	{ TK_CATCH,		"catch", 5 },
+	{ TK_BOOL,		"false", 5, { .b = false } },
+	{ TK_BOOL,		"true",  4, { .b = true } },
+	{ TK_ELIF,		"elif",  4 },
+	{ TK_ELSE,		"else",  4 },
+	{ TK_THIS,		"this",  4 },
+	{ TK_NULL,		"null",  4 },
+	{ TK_CASE,		"case",  4 },
+	{ TK_DOUBLE,	"NaN",   3, { .d = NAN } },
+	{ TK_TRY,		"try",   3 },
+	{ TK_FOR,		"for",   3 },
+	{ TK_LOCAL,		"let",   3 },
+	{ TK_IF,		"if",    2 },
+	{ TK_IN,		"in",    2 },
 };
 
 
@@ -221,125 +222,216 @@ utf8enc(char **out, int *rem, int code)
 /* length of the longest token in our lookup table */
 #define UT_LEX_MAX_TOKEN_LEN 3
 
-static uint32_t emit_op(struct uc_state *state, uint32_t pos, int type, struct json_object *val)
+static uc_token *
+emit_op(uc_lexer *lex, uint32_t pos, int type, struct json_object *val)
 {
-	uint32_t off = uc_new_op(state, type, val, UINT32_MAX);
-
-	OP(off)->off = pos;
+	lex->curr.type = type;
+	lex->curr.val = val;
+	lex->curr.pos = pos;
 
 	/* Follow JSLint logic and treat a slash after any of the
 	 * `(,=:[!&|?{};` characters as the beginning of a regex
 	 * literal... */
 	switch (type) {
-	case T_LPAREN:
-	case T_COMMA:
-
-	case T_ASADD:
-	case T_ASBAND:
-	case T_ASBOR:
-	case T_ASBXOR:
-	case T_ASDIV:
-	case T_ASLEFT:
-	case T_ASMOD:
-	case T_ASMUL:
-	case T_ASRIGHT:
-	case T_ASSIGN:
-	case T_ASSUB:
-	case T_EQ:
-	case T_EQS:
-	case T_GE:
-	case T_LE:
-	case T_NE:
-	case T_NES:
-
-	case T_COLON:
-	case T_LBRACK:
-	case T_NOT:
-
-	case T_AND:
-	case T_BAND:
-
-	case T_OR:
-	case T_BOR:
-
-	case T_QMARK:
-
-	case T_LBRACE:
-	case T_RBRACE:
-
-	case T_LSTM:
-	case T_LEXP:
-
-	case T_SCOL:
-		state->lex.expect_div = false;
+	case TK_LPAREN:
+	case TK_COMMA:
+
+	case TK_ASADD:
+	case TK_ASBAND:
+	case TK_ASBOR:
+	case TK_ASBXOR:
+	case TK_ASDIV:
+	case TK_ASLEFT:
+	case TK_ASMOD:
+	case TK_ASMUL:
+	case TK_ASRIGHT:
+	case TK_ASSIGN:
+	case TK_ASSUB:
+	case TK_EQ:
+	case TK_EQS:
+	case TK_GE:
+	case TK_LE:
+	case TK_NE:
+	case TK_NES:
+
+	case TK_COLON:
+	case TK_LBRACK:
+	case TK_NOT:
+
+	case TK_AND:
+	case TK_BAND:
+
+	case TK_OR:
+	case TK_BOR:
+
+	case TK_QMARK:
+
+	case TK_LBRACE:
+	case TK_RBRACE:
+
+	case TK_LSTM:
+	case TK_LEXP:
+
+	case TK_SCOL:
+		lex->expect_div = false;
 		break;
 
 	default:
-		state->lex.expect_div = true;
+		lex->expect_div = true;
 	}
 
-	return off;
+	return &lex->curr;
 }
 
-static void lookbehind_append(struct uc_state *s, const char *data, size_t len)
+static void lookbehind_append(uc_lexer *lex, const char *data, size_t len)
 {
 	if (len) {
-		s->lex.lookbehind = xrealloc(s->lex.lookbehind, s->lex.lookbehindlen + len);
-		memcpy(s->lex.lookbehind + s->lex.lookbehindlen, data, len);
-		s->lex.lookbehindlen += len;
+		lex->lookbehind = xrealloc(lex->lookbehind, lex->lookbehindlen + len);
+		memcpy(lex->lookbehind + lex->lookbehindlen, data, len);
+		lex->lookbehindlen += len;
 	}
 }
 
-static void lookbehind_reset(struct uc_state *s) {
-	free(s->lex.lookbehind);
-	s->lex.lookbehind = NULL;
-	s->lex.lookbehindlen = 0;
+static void lookbehind_reset(uc_lexer *lex) {
+	free(lex->lookbehind);
+	lex->lookbehind = NULL;
+	lex->lookbehindlen = 0;
 }
 
-static uint32_t lookbehind_to_text(struct uc_state *s, uint32_t pos, int type, const char *strip_trailing_chars) {
-	uint32_t rv = 0;
+static uc_token *
+lookbehind_to_text(uc_lexer *lex, uint32_t pos, int type, const char *strip_trailing_chars) {
+	uc_token *rv = NULL;
 
-	if (s->lex.lookbehind) {
+	if (lex->lookbehind) {
 		if (strip_trailing_chars) {
-			while (s->lex.lookbehindlen > 0 && strchr(strip_trailing_chars, s->lex.lookbehind[s->lex.lookbehindlen-1]))
-				s->lex.lookbehindlen--;
+			while (lex->lookbehindlen > 0 && strchr(strip_trailing_chars, lex->lookbehind[lex->lookbehindlen-1]))
+				lex->lookbehindlen--;
 		}
 
-		rv = emit_op(s, pos, type, xjs_new_string_len(s->lex.lookbehind, s->lex.lookbehindlen));
+		rv = emit_op(lex, pos, type, xjs_new_string_len(lex->lookbehind, lex->lookbehindlen));
 
-		lookbehind_reset(s);
+		lookbehind_reset(lex);
 	}
 
 	return rv;
 }
 
-static inline size_t buf_remaining(struct uc_state *s) {
-	return (s->lex.bufend - s->lex.bufstart);
+static inline size_t
+buf_remaining(uc_lexer *lex) {
+	return (lex->bufend - lex->bufstart);
 }
 
-static inline bool _buf_startswith(struct uc_state *s, const char *str, size_t len) {
-	return (buf_remaining(s) >= len && !strncmp(s->lex.bufstart, str, len));
+static inline bool
+_buf_startswith(uc_lexer *lex, const char *str, size_t len) {
+	return (buf_remaining(lex) >= len && !strncmp(lex->bufstart, str, len));
 }
 
 #define buf_startswith(s, str) _buf_startswith(s, str, sizeof(str) - 1)
 
-static void buf_consume(struct uc_state *s, ssize_t len) {
-	s->lex.bufstart += len;
-	s->source->off += len;
+#if 0
+static void add_lineinfo(struct uc_state *s, size_t off)
+{
+	uc_lineinfo *lines = &s->source->lineinfo;
+	size_t linelen;
+
+	linelen = off - s->lex.lastlineoff;
+
+	/* lineinfo is encoded in bytes: the most significant bit specifies whether
+	 * to advance the line count by one or not, while the remaining 7 bits encode
+	 * the amounts of bytes on the current line.
+	 *
+	 * If a line has more than 127 characters, the first byte will be set to
+	 * 0xff (1 1111111) and subsequent bytes will encode the remaining characters
+	 * in bits 1..7 while setting bit 8 to 0. A line with 400 characters will thus
+	 * be encoded as 0xff 0x7f 0x7f 0x13 (1:1111111 + 0:1111111 + 0:1111111 + 0:1111111).
+	 *
+	 * The newline character itself is not counted, so an empty line is encoded as
+	 * 0x80 (1:0000000).
+	 */
+	uc_vector_grow(lines);
+	lines->entries[lines->count++] = 0x80 + (linelen & 0x7f);
+	linelen -= (linelen & 0x7f);
+
+	while (linelen > 0) {
+		uc_vector_grow(lines);
+		lines->entries[lines->count++] = (linelen & 0x7f);
+		linelen -= (linelen & 0x7f);
+	}
+
+	s->lex.lastlineoff = off + 1;
+	s->lex.line++;
 }
+#endif
 
-static uint32_t
-parse_comment(struct uc_state *s)
+static void
+next_lineinfo(uc_lexer *lex)
 {
-	const struct token *tok = s->lex.tok;
-	const char *ptr, *end;
-	size_t elen;
+	uc_lineinfo *lines = &lex->source->lineinfo;
 
-	if (!buf_remaining(s)) {
-		uc_new_exception(s, s->lex.lastoff, "Syntax error: Unterminated comment");
+	uc_vector_grow(lines);
+	lines->entries[lines->count++] = 0x80;
+}
+
+static void
+update_lineinfo(uc_lexer *lex, size_t off)
+{
+	uc_lineinfo *lines = &lex->source->lineinfo;
+	uint8_t *entry;
 
-		return 0;
+	entry = uc_vector_last(lines);
+
+	if ((entry[0] & 0x7f) + off <= 0x7f) {
+		entry[0] += off;
 	}
+	else {
+		off -= (0x7f - (entry[0] & 0x7f));
+		entry[0] |= 0x7f;
+
+		while (off > 0) {
+			uc_vector_grow(lines);
+			entry = uc_vector_last(lines);
+			entry[1] = (off & 0x7f);
+			off -= (off & 0x7f);
+			lines->count++;
+		}
+	}
+}
+
+static void
+buf_consume(uc_lexer *lex, size_t len) {
+	size_t i, linelen;
+
+	if (!lex->source->lineinfo.count)
+		next_lineinfo(lex);
+
+	for (i = 0, linelen = 0; i < len; i++) {
+		if (lex->bufstart[i] == '\n') {
+			update_lineinfo(lex, linelen);
+			next_lineinfo(lex);
+
+			linelen = 0;
+		}
+		else {
+			linelen++;
+		}
+	}
+
+	if (linelen)
+		update_lineinfo(lex, linelen);
+
+	lex->bufstart += len;
+	lex->source->off += len;
+}
+
+static uc_token *
+parse_comment(uc_lexer *lex)
+{
+	const struct token *tok = lex->tok;
+	const char *ptr, *end;
+	size_t elen;
+
+	if (!buf_remaining(lex))
+		return emit_op(lex, lex->lastoff, TK_ERROR, xjs_new_string("Unterminated comment"));
 
 	if (!strcmp(tok->pat, "//")) {
 		end = "\n";
@@ -350,20 +442,21 @@ parse_comment(struct uc_state *s)
 		elen = 2;
 	}
 
-	for (ptr = s->lex.bufstart; ptr < s->lex.bufend - elen; ptr++) {
+	for (ptr = lex->bufstart; ptr < lex->bufend - elen; ptr++) {
 		if (!strncmp(ptr, end, elen)) {
-			buf_consume(s, (ptr - s->lex.bufstart) + elen);
+			buf_consume(lex, (ptr - lex->bufstart) + elen);
 
-			return UINT32_MAX;
+			return UC_LEX_CONTINUE_PARSING;
 		}
 	}
 
-	buf_consume(s, ptr - s->lex.bufstart);
+	buf_consume(lex, ptr - lex->bufstart);
 
-	return 0;
+	return NULL;
 }
 
-static void append_utf8(struct uc_state *s, int code) {
+static void
+append_utf8(uc_lexer *lex, int code) {
 	char ustr[8], *up;
 	int rem;
 
@@ -371,38 +464,35 @@ static void append_utf8(struct uc_state *s, int code) {
 	rem = sizeof(ustr);
 
 	if (utf8enc(&up, &rem, code))
-		lookbehind_append(s, ustr, up - ustr);
+		lookbehind_append(lex, ustr, up - ustr);
 }
 
-static uint32_t
-parse_string(struct uc_state *s)
+static uc_token *
+parse_string(uc_lexer *lex)
 {
-	const struct token *tok = s->lex.tok;
+	const struct token *tok = lex->tok;
 	char q = tok->pat[0];
 	char *ptr, *c;
-	uint32_t rv;
+	uc_token *rv;
 	int code;
 
-	if (!buf_remaining(s)) {
-		uc_new_exception(s, s->lex.lastoff, "Syntax error: Unterminated string");
-
-		return 0;
-	}
+	if (!buf_remaining(lex))
+		return emit_op(lex, lex->lastoff, TK_ERROR, xjs_new_string("Unterminated string"));
 
-	for (ptr = s->lex.bufstart; ptr < s->lex.bufend; ptr++) {
+	for (ptr = lex->bufstart; ptr < lex->bufend; ptr++) {
 		/* continuation of escape sequence */
-		if (s->lex.is_escape) {
-			if (s->lex.esclen == 0) {
+		if (lex->is_escape) {
+			if (lex->esclen == 0) {
 				/* non-unicode escape following a lead surrogate, emit replacement... */
-				if (s->lex.lead_surrogate && *ptr != 'u') {
-					append_utf8(s, 0xFFFD);
-					s->lex.lead_surrogate = 0;
+				if (lex->lead_surrogate && *ptr != 'u') {
+					append_utf8(lex, 0xFFFD);
+					lex->lead_surrogate = 0;
 				}
 
 				switch ((q == '/') ? 0 : *ptr) {
 				case 'u':
 				case 'x':
-					s->lex.esc[s->lex.esclen++] = *ptr;
+					lex->esc[lex->esclen++] = *ptr;
 					break;
 
 				case '0':
@@ -413,65 +503,62 @@ parse_string(struct uc_state *s)
 				case '5':
 				case '6':
 				case '7':
-					s->lex.esc[s->lex.esclen++] = 'o';
-					s->lex.esc[s->lex.esclen++] = *ptr;
+					lex->esc[lex->esclen++] = 'o';
+					lex->esc[lex->esclen++] = *ptr;
 					break;
 
 				default:
-					s->lex.is_escape = false;
+					lex->is_escape = false;
 					c = strchr("a\ab\be\ef\fn\nr\rt\tv\v", *ptr);
 
 					if (c && *c >= 'a') {
-						lookbehind_append(s, c + 1, 1);
+						lookbehind_append(lex, c + 1, 1);
 					}
 					else {
 						/* regex mode => retain backslash */
 						if (q == '/')
-							lookbehind_append(s, "\\", 1);
+							lookbehind_append(lex, "\\", 1);
 
-						lookbehind_append(s, ptr, 1);
+						lookbehind_append(lex, ptr, 1);
 					}
 
-					buf_consume(s, (ptr + 1) - s->lex.bufstart);
+					buf_consume(lex, (ptr + 1) - lex->bufstart);
 
 					break;
 				}
 			}
 			else {
-				switch (s->lex.esc[0]) {
+				switch (lex->esc[0]) {
 				case 'u':
-					if (s->lex.esclen < 5) {
-						if (!isxdigit(*ptr)) {
-							uc_new_exception(s, s->source->off + s->lex.esclen + 1, "Syntax error: Invalid escape sequence");
+					if (lex->esclen < 5) {
+						if (!isxdigit(*ptr))
+							return emit_op(lex, lex->source->off + lex->esclen + 1, TK_ERROR, xjs_new_string("Invalid escape sequence"));
 
-							return 0;
-						}
-
-						s->lex.esc[s->lex.esclen++] = *ptr;
+						lex->esc[lex->esclen++] = *ptr;
 					}
 
-					if (s->lex.esclen == 5) {
-						code = hex(s->lex.esc[1]) * 16 * 16 * 16 +
-						       hex(s->lex.esc[2]) * 16 * 16 +
-						       hex(s->lex.esc[3]) * 16 +
-						       hex(s->lex.esc[4]);
+					if (lex->esclen == 5) {
+						code = hex(lex->esc[1]) * 16 * 16 * 16 +
+						       hex(lex->esc[2]) * 16 * 16 +
+						       hex(lex->esc[3]) * 16 +
+						       hex(lex->esc[4]);
 
 						/* is a leading surrogate value */
 						if ((code & 0xFC00) == 0xD800) {
 							/* found a subsequent leading surrogate, ignore and emit replacement char for previous one */
-							if (s->lex.lead_surrogate)
-								append_utf8(s, 0xFFFD);
+							if (lex->lead_surrogate)
+								append_utf8(lex, 0xFFFD);
 
 							/* store surrogate value and advance to next escape sequence */
-							s->lex.lead_surrogate = code;
+							lex->lead_surrogate = code;
 						}
 
 						/* is a trailing surrogate value */
 						else if ((code & 0xFC00) == 0xDC00) {
 							/* found a trailing surrogate following a leading one, combine and encode */
-							if (s->lex.lead_surrogate) {
-								code = 0x10000 + ((s->lex.lead_surrogate & 0x3FF) << 10) + (code & 0x3FF);
-								s->lex.lead_surrogate = 0;
+							if (lex->lead_surrogate) {
+								code = 0x10000 + ((lex->lead_surrogate & 0x3FF) << 10) + (code & 0x3FF);
+								lex->lead_surrogate = 0;
 							}
 
 							/* trailing surrogate not following a leading one, ignore and use replacement char */
@@ -479,87 +566,81 @@ parse_string(struct uc_state *s)
 								code = 0xFFFD;
 							}
 
-							append_utf8(s, code);
+							append_utf8(lex, code);
 						}
 
 						/* is a normal codepoint */
 						else {
-							append_utf8(s, code);
+							append_utf8(lex, code);
 						}
 
-						s->lex.esclen = 0;
-						s->lex.is_escape = false;
-						buf_consume(s, (ptr + 1) - s->lex.bufstart);
+						lex->esclen = 0;
+						lex->is_escape = false;
+						buf_consume(lex, (ptr + 1) - lex->bufstart);
 					}
 
 					break;
 
 				case 'x':
-					if (s->lex.esclen < 3) {
-						if (!isxdigit(*ptr)) {
-							uc_new_exception(s, s->source->off + s->lex.esclen + 1, "Syntax error: Invalid escape sequence");
-
-							return 0;
-						}
+					if (lex->esclen < 3) {
+						if (!isxdigit(*ptr))
+							return emit_op(lex, lex->source->off + lex->esclen + 1, TK_ERROR, xjs_new_string("Invalid escape sequence"));
 
-						s->lex.esc[s->lex.esclen++] = *ptr;
+						lex->esc[lex->esclen++] = *ptr;
 					}
 
-					if (s->lex.esclen == 3) {
-						append_utf8(s, hex(s->lex.esc[1]) * 16 + hex(s->lex.esc[2]));
+					if (lex->esclen == 3) {
+						append_utf8(lex, hex(lex->esc[1]) * 16 + hex(lex->esc[2]));
 
-						s->lex.esclen = 0;
-						s->lex.is_escape = false;
-						buf_consume(s, (ptr + 1) - s->lex.bufstart);
+						lex->esclen = 0;
+						lex->is_escape = false;
+						buf_consume(lex, (ptr + 1) - lex->bufstart);
 					}
 
 					break;
 
 				case 'o':
-					if (s->lex.esclen < 4) {
+					if (lex->esclen < 4) {
 						/* found a non-octal char */
 						if (*ptr < '0' || *ptr > '7') {
 							/* pad sequence to three chars */
-							switch (s->lex.esclen) {
+							switch (lex->esclen) {
 							case 3:
-								s->lex.esc[3] = s->lex.esc[2];
-								s->lex.esc[2] = s->lex.esc[1];
-								s->lex.esc[1] = '0';
+								lex->esc[3] = lex->esc[2];
+								lex->esc[2] = lex->esc[1];
+								lex->esc[1] = '0';
 								break;
 
 							case 2:
-								s->lex.esc[3] = s->lex.esc[1];
-								s->lex.esc[2] = '0';
-								s->lex.esc[1] = '0';
+								lex->esc[3] = lex->esc[1];
+								lex->esc[2] = '0';
+								lex->esc[1] = '0';
 								break;
 							}
 
-							s->lex.esclen = 4;
-							buf_consume(s, ptr-- - s->lex.bufstart);
+							lex->esclen = 4;
+							buf_consume(lex, ptr-- - lex->bufstart);
 						}
 
 						/* append */
 						else {
-							s->lex.esc[s->lex.esclen++] = *ptr;
-							buf_consume(s, (ptr + 1) - s->lex.bufstart);
+							lex->esc[lex->esclen++] = *ptr;
+							buf_consume(lex, (ptr + 1) - lex->bufstart);
 						}
 					}
 
-					if (s->lex.esclen == 4) {
-						code = dec(s->lex.esc[1]) * 8 * 8 +
-						       dec(s->lex.esc[2]) * 8 +
-						       dec(s->lex.esc[3]);
+					if (lex->esclen == 4) {
+						code = dec(lex->esc[1]) * 8 * 8 +
+						       dec(lex->esc[2]) * 8 +
+						       dec(lex->esc[3]);
 
-						if (code > 255) {
-							uc_new_exception(s, s->source->off + s->lex.esclen + 1, "Syntax error: Invalid escape sequence");
+						if (code > 255)
+							return emit_op(lex, lex->source->off + lex->esclen + 1, TK_ERROR, xjs_new_string("Invalid escape sequence"));
 
-							return 0;
-						}
-
-						append_utf8(s, code);
+						append_utf8(lex, code);
 
-						s->lex.esclen = 0;
-						s->lex.is_escape = false;
+						lex->esclen = 0;
+						lex->is_escape = false;
 					}
 
 					break;
@@ -569,29 +650,29 @@ parse_string(struct uc_state *s)
 
 		/* terminating char */
 		else if (*ptr == q) {
-			lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart);
-			buf_consume(s, (ptr + 1) - s->lex.bufstart);
+			lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
+			buf_consume(lex, (ptr + 1) - lex->bufstart);
 
-			rv = lookbehind_to_text(s, s->lex.lastoff, T_STRING, NULL);
+			rv = lookbehind_to_text(lex, lex->lastoff, TK_STRING, NULL);
 
 			if (!rv)
-				rv = emit_op(s, s->lex.lastoff, T_STRING, xjs_new_string_len("", 0));
+				rv = emit_op(lex, lex->lastoff, TK_STRING, xjs_new_string_len("", 0));
 
 			return rv;
 		}
 
 		/* escape sequence start */
 		else if (*ptr == '\\') {
-			s->lex.is_escape = true;
-			lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart);
-			buf_consume(s, ptr - s->lex.bufstart);
+			lex->is_escape = true;
+			lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
+			buf_consume(lex, ptr - lex->bufstart);
 		}
 	}
 
-	lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart);
-	buf_consume(s, ptr - s->lex.bufstart);
+	lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
+	buf_consume(lex, ptr - lex->bufstart);
 
-	return 0;
+	return NULL;
 }
 
 
@@ -614,90 +695,85 @@ enum {
 	UT_LEX_PARSE_REGEX_FLAGS
 };
 
-static uint32_t
-parse_regexp(struct uc_state *state)
+static uc_token *
+parse_regexp(uc_lexer *lex)
 {
-	struct json_object *pattern;
-	struct uc_op *op;
-	uint32_t rv;
-	char *err;
+	bool is_reg_global = false, is_reg_icase = false, is_reg_newline = false;
+	uc_token *rv;
+	size_t len;
+	char *s;
 
-	switch (state->lex.esc[0]) {
+	switch (lex->esc[0]) {
 	case UT_LEX_PARSE_REGEX_INIT:
-		if (state->lex.expect_div) {
-			state->lex.expect_div = false;
+		if (lex->expect_div) {
+			lex->expect_div = false;
 
-			if (buf_startswith(state, "=")) {
-				buf_consume(state, 1);
+			if (buf_startswith(lex, "=")) {
+				buf_consume(lex, 1);
 
-				return emit_op(state, state->source->off, T_ASDIV, NULL);
+				return emit_op(lex, lex->source->off, TK_ASDIV, NULL);
 			}
 
-			return emit_op(state, state->source->off, T_DIV, NULL);
+			return emit_op(lex, lex->source->off, TK_DIV, NULL);
 		}
 
-		state->lex.esc[0] = UT_LEX_PARSE_REGEX_PATTERN;
+		lex->esc[0] = UT_LEX_PARSE_REGEX_PATTERN;
 		break;
 
 	case UT_LEX_PARSE_REGEX_PATTERN:
-		rv = parse_string(state);
+		rv = parse_string(lex);
 
-		if (rv != 0 && rv != UINT32_MAX) {
-			state->lex.lookbehind = (char *)OP(rv);
-			state->lex.esc[0] = UT_LEX_PARSE_REGEX_FLAGS;
+		if (rv && rv->type == TK_ERROR)
+			return rv;
+
+		if (rv != NULL && rv != UC_LEX_CONTINUE_PARSING) {
+			lex->lookbehind = (char *)rv;
+			lex->esc[0] = UT_LEX_PARSE_REGEX_FLAGS;
 		}
 
 		break;
 
 	case UT_LEX_PARSE_REGEX_FLAGS:
-		op = (struct uc_op *)state->lex.lookbehind;
+		rv = (uc_token *)lex->lookbehind;
 
-		while (state->lex.bufstart < state->lex.bufend) {
-			switch (state->lex.bufstart[0]) {
+		while (lex->bufstart < lex->bufend) {
+			switch (lex->bufstart[0]) {
 			case 'g':
-				buf_consume(state, 1);
-				op->is_reg_global = true;
+				buf_consume(lex, 1);
+				is_reg_global = true;
 				break;
 
 			case 'i':
-				buf_consume(state, 1);
-				op->is_reg_icase = true;
+				buf_consume(lex, 1);
+				is_reg_icase = true;
 				break;
 
 			case 's':
-				buf_consume(state, 1);
-				op->is_reg_newline = true;
+				buf_consume(lex, 1);
+				is_reg_newline = true;
 				break;
 
 			default:
-				state->lex.lookbehind = NULL;
-
-				pattern = uc_new_regexp(json_object_get_string(op->val),
-				                        op->is_reg_icase,
-				                        op->is_reg_newline,
-				                        op->is_reg_global,
-				                        &err);
+				lex->lookbehind = NULL;
 
-				json_object_put(op->val);
+				len = xasprintf(&s, "%c%*s",
+					(is_reg_global << 0) | (is_reg_icase << 1) | (is_reg_newline << 2),
+					json_object_get_string_len(rv->val),
+					json_object_get_string(rv->val));
 
-				op->type = T_REGEXP;
-				op->val = pattern;
+				json_object_set_string_len(rv->val, s, len);
+				free(s);
 
-				if (!pattern) {
-					uc_new_exception(state, op->off, "Syntax error: %s", err);
-					free(err);
+				rv->type = TK_REGEXP;
 
-					return 0;
-				}
-
-				return op - state->pool;
+				return rv;
 			}
 		}
 
 		break;
 	}
 
-	return 0;
+	return NULL;
 }
 
 
@@ -711,50 +787,50 @@ parse_regexp(struct uc_state *state)
  *  -UT_ERROR_OVERLONG_STRING	Label too long
  */
 
-static uint32_t
-parse_label(struct uc_state *s)
+static uc_token *
+parse_label(uc_lexer *lex)
 {
-	const struct token *tok = s->lex.tok;
+	const struct token *tok = lex->tok;
 	const struct keyword *word;
-	uint32_t rv;
+	uc_token *rv;
 	char *ptr;
 	size_t i;
 
-	if (!s->lex.lookbehind && tok->plen)
-		lookbehind_append(s, tok->pat, tok->plen);
+	if (!lex->lookbehind && tok->plen)
+		lookbehind_append(lex, tok->pat, tok->plen);
 
-	if (!buf_remaining(s) || (s->lex.bufstart[0] != '_' && !isalnum(s->lex.bufstart[0]))) {
+	if (!buf_remaining(lex) || (lex->bufstart[0] != '_' && !isalnum(lex->bufstart[0]))) {
 		for (i = 0, word = &reserved_words[0]; i < ARRAY_SIZE(reserved_words); i++, word = &reserved_words[i]) {
-			if (s->lex.lookbehindlen == word->plen && !strncmp(s->lex.lookbehind, word->pat, word->plen)) {
-				lookbehind_reset(s);
+			if (lex->lookbehindlen == word->plen && !strncmp(lex->lookbehind, word->pat, word->plen)) {
+				lookbehind_reset(lex);
 
 				switch (word->type) {
-				case T_DOUBLE:
-					rv = emit_op(s, s->source->off - word->plen, word->type, uc_new_double(word->d));
+				case TK_DOUBLE:
+					rv = emit_op(lex, lex->source->off - word->plen, word->type, uc_double_new(word->d));
 					break;
 
-				case T_BOOL:
-					rv = emit_op(s, s->source->off - word->plen, word->type, xjs_new_boolean(word->b));
+				case TK_BOOL:
+					rv = emit_op(lex, lex->source->off - word->plen, word->type, xjs_new_boolean(word->b));
 					break;
 
 				default:
-					rv = emit_op(s, s->source->off - word->plen, word->type, NULL);
+					rv = emit_op(lex, lex->source->off - word->plen, word->type, NULL);
 				}
 
 				return rv;
 			}
 		}
 
-		return lookbehind_to_text(s, s->source->off - s->lex.lookbehindlen, T_LABEL, NULL);
+		return lookbehind_to_text(lex, lex->source->off - lex->lookbehindlen, TK_LABEL, NULL);
 	}
 
-	for (ptr = s->lex.bufstart; ptr < s->lex.bufend && (*ptr == '_' || isalnum(*ptr)); ptr++)
+	for (ptr = lex->bufstart; ptr < lex->bufend && (*ptr == '_' || isalnum(*ptr)); ptr++)
 		;
 
-	lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart);
-	buf_consume(s, ptr - s->lex.bufstart);
+	lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
+	buf_consume(lex, ptr - lex->bufstart);
 
-	return 0;
+	return NULL;
 }
 
 
@@ -769,198 +845,206 @@ parse_label(struct uc_state *s)
  */
 
 static inline bool
-is_numeric_char(struct uc_state *s, char c)
+is_numeric_char(uc_lexer *lex, char c)
 {
-	char prev = s->lex.lookbehindlen ? s->lex.lookbehind[s->lex.lookbehindlen-1] : 0;
+	char prev = lex->lookbehindlen ? lex->lookbehind[lex->lookbehindlen-1] : 0;
 
 	if ((prev == 'e' || prev == 'E') && (c == '-' || c == '+'))
 		return true;
 
-	return (isxdigit(c) || c == 'x' || c == 'X' || c == '.');
+	return prev ? (isxdigit(c) || c == 'x' || c == 'X' || c == '.') : (isdigit(c) || c == '.');
 }
 
-static uint32_t
-parse_number(struct uc_state *state)
+static uc_token *
+parse_number(uc_lexer *lex)
 {
-	uint32_t rv = 0;
+	const struct token *tok = lex->tok;
+	uc_token *rv = NULL;
 	long long int n;
 	char *ptr, *e;
 	double d;
 
-	if (!buf_remaining(state) || !is_numeric_char(state, state->lex.bufstart[0])) {
-		lookbehind_append(state, "\0", 1);
+	if (!buf_remaining(lex) || !is_numeric_char(lex, lex->bufstart[0])) {
+		if (lex->lookbehindlen == 0 && !is_numeric_char(lex, lex->bufstart[0]))
+			return emit_op(lex, lex->source->off, TK_SUB, NULL);
 
-		n = strtoll(state->lex.lookbehind, &e, 0);
+		lookbehind_append(lex, "\0", 1);
+
+		n = strtoll(lex->lookbehind, &e, 0);
 
 		if (*e == '.' || *e == 'e' || *e == 'E') {
-			d = strtod(state->lex.lookbehind, &e);
+			d = strtod(lex->lookbehind, &e);
+
+			if (tok->pat[0] == '-')
+				d = -d;
 
-			if (e > state->lex.lookbehind && *e == 0)
-				rv = emit_op(state, state->source->off - (e - state->lex.lookbehind), T_DOUBLE, uc_new_double(d));
+			if (e > lex->lookbehind && *e == 0)
+				rv = emit_op(lex, lex->source->off - (e - lex->lookbehind), TK_DOUBLE, uc_double_new(d));
 			else
-				uc_new_exception(state, state->source->off - (state->lex.lookbehindlen - (e - state->lex.lookbehind) - 1),
-				                 "Syntax error: Invalid number literal");
+				rv = emit_op(lex, lex->source->off - (lex->lookbehindlen - (e - lex->lookbehind) - 1), TK_ERROR, xjs_new_string("Invalid number literal"));
 		}
 		else if (*e == 0) {
-			rv = emit_op(state, state->source->off - (e - state->lex.lookbehind), T_NUMBER, xjs_new_int64(n));
-			OP(rv)->is_overflow = (errno == ERANGE);
+			if (tok->pat[0] == '-')
+				n = (errno == ERANGE) ? INT64_MIN : -n;
+
+			rv = emit_op(lex, lex->source->off - (e - lex->lookbehind), TK_NUMBER, xjs_new_int64(n));
+			//OP(rv)->is_overflow = (errno == ERANGE);
 		}
 		else {
-			uc_new_exception(state, state->source->off - (state->lex.lookbehindlen - (e - state->lex.lookbehind) - 1),
-			                 "Syntax error: Invalid number literal");
+			rv = emit_op(lex, lex->source->off - (lex->lookbehindlen - (e - lex->lookbehind) - 1), TK_ERROR, xjs_new_string("Invalid number literal"));
 		}
 
-		lookbehind_reset(state);
+		lookbehind_reset(lex);
 
 		return rv;
 	}
 
-	for (ptr = state->lex.bufstart; ptr < state->lex.bufend && is_numeric_char(state, *ptr); ptr++)
+	for (ptr = lex->bufstart; ptr < lex->bufend && is_numeric_char(lex, *ptr); ptr++)
 		;
 
-	lookbehind_append(state, state->lex.bufstart, ptr - state->lex.bufstart);
-	buf_consume(state, ptr - state->lex.bufstart);
+	lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
+	buf_consume(lex, ptr - lex->bufstart);
 
-	return 0;
+	return NULL;
 }
 
-static uint32_t
-lex_step(struct uc_state *s, FILE *fp)
+static uc_token *
+lex_step(uc_lexer *lex, FILE *fp)
 {
 	uint32_t masks[] = { 0, le32toh(0x000000ff), le32toh(0x0000ffff), le32toh(0x00ffffff), le32toh(0xffffffff) };
 	union { uint32_t n; char str[4]; } search;
 	const struct token *tok;
 	size_t rlen, rem;
 	char *ptr, c;
-	uint32_t rv;
+	uc_token *rv;
 	size_t i;
 
 	/* only less than UT_LEX_MAX_TOKEN_LEN unreach buffer chars remaining,
 	 * move the remaining bytes to the beginning and read more data */
-	if (buf_remaining(s) < UT_LEX_MAX_TOKEN_LEN) {
-		if (!s->lex.buf) {
-			s->lex.buflen = 128;
-			s->lex.buf = xalloc(s->lex.buflen);
+	if (buf_remaining(lex) < UT_LEX_MAX_TOKEN_LEN) {
+		if (!lex->buf) {
+			lex->buflen = 128;
+			lex->buf = xalloc(lex->buflen);
 		}
 
-		rem = s->lex.bufend - s->lex.bufstart;
+		rem = lex->bufend - lex->bufstart;
 
-		memcpy(s->lex.buf, s->lex.bufstart, rem);
+		memcpy(lex->buf, lex->bufstart, rem);
 
-		rlen = fread(s->lex.buf + rem, 1, s->lex.buflen - rem, fp);
+		rlen = fread(lex->buf + rem, 1, lex->buflen - rem, fp);
 
-		s->lex.bufstart = s->lex.buf;
-		s->lex.bufend   = s->lex.buf + rlen + rem;
+		lex->bufstart = lex->buf;
+		lex->bufend   = lex->buf + rlen + rem;
 
 		if (rlen == 0 && (ferror(fp) || feof(fp)))
-			s->lex.eof = 1;
+			lex->eof = 1;
 	}
 
-	switch (s->lex.state) {
+	switch (lex->state) {
 	case UT_LEX_IDENTIFY_BLOCK:
 		/* previous block had strip trailing whitespace flag, skip leading whitespace */
-		if (s->lex.skip_leading_whitespace) {
-			while (buf_remaining(s) && isspace(s->lex.bufstart[0]))
-				buf_consume(s, 1);
+		if (lex->skip_leading_whitespace) {
+			while (buf_remaining(lex) && isspace(lex->bufstart[0]))
+				buf_consume(lex, 1);
 
-			s->lex.skip_leading_whitespace = false;
+			lex->skip_leading_whitespace = false;
 		}
 
 		/* previous block was a statement block and trim_blocks is enabld, skip leading newline */
-		else if (s->lex.skip_leading_newline) {
-			if (buf_startswith(s, "\n"))
-				buf_consume(s, 1);
+		else if (lex->skip_leading_newline) {
+			if (buf_startswith(lex, "\n"))
+				buf_consume(lex, 1);
 
-			s->lex.skip_leading_newline = false;
+			lex->skip_leading_newline = false;
 		}
 
 		/* scan forward through buffer to identify start token */
-		for (ptr = s->lex.bufstart; ptr < s->lex.bufend - strlen("{#"); ptr++) {
+		for (ptr = lex->bufstart; ptr < lex->bufend - strlen("{#"); ptr++) {
 			/* found start of comment block */
 			if (!strncmp(ptr, "{#", 2)) {
-				lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart);
-				buf_consume(s, (ptr + 2) - s->lex.bufstart);
-				s->lex.lastoff = s->source->off - 2;
-				s->lex.state = UT_LEX_BLOCK_COMMENT_START;
+				lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
+				buf_consume(lex, (ptr + 2) - lex->bufstart);
+				lex->lastoff = lex->source->off - 2;
+				lex->state = UT_LEX_BLOCK_COMMENT_START;
 
-				return 0;
+				return NULL;
 			}
 
 			/* found start of expression block */
 			else if (!strncmp(ptr, "{{", 2)) {
-				lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart);
-				buf_consume(s, (ptr + 2) - s->lex.bufstart);
-				s->lex.lastoff = s->source->off - 2;
-				s->lex.state = UT_LEX_BLOCK_EXPRESSION_START;
+				lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
+				buf_consume(lex, (ptr + 2) - lex->bufstart);
+				lex->lastoff = lex->source->off - 2;
+				lex->state = UT_LEX_BLOCK_EXPRESSION_START;
 
-				return 0;
+				return NULL;
 			}
 
 			/* found start of statement block */
 			else if (!strncmp(ptr, "{%", 2)) {
-				lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart);
-				buf_consume(s, (ptr + 2) - s->lex.bufstart);
-				s->lex.lastoff = s->source->off - 2;
-				s->lex.state = UT_LEX_BLOCK_STATEMENT_START;
+				lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
+				buf_consume(lex, (ptr + 2) - lex->bufstart);
+				lex->lastoff = lex->source->off - 2;
+				lex->state = UT_LEX_BLOCK_STATEMENT_START;
 
-				return 0;
+				return NULL;
 			}
 		}
 
 		/* we're at eof */
-		if (s->lex.eof) {
-			lookbehind_append(s, ptr, s->lex.bufend - ptr);
-			s->lex.state = UT_LEX_EOF;
+		if (lex->eof) {
+			lookbehind_append(lex, ptr, lex->bufend - ptr);
+			lex->state = UT_LEX_EOF;
 
-			return lookbehind_to_text(s, s->lex.lastoff, T_TEXT, NULL);
+			return lookbehind_to_text(lex, lex->lastoff, TK_TEXT, NULL);
 		}
 
-		lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart);
-		buf_consume(s, ptr - s->lex.bufstart);
+		lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
+		buf_consume(lex, ptr - lex->bufstart);
 		break;
 
 
 	case UT_LEX_BLOCK_COMMENT_START:
 	case UT_LEX_BLOCK_EXPRESSION_START:
 	case UT_LEX_BLOCK_STATEMENT_START:
-		rv = 0;
-		s->lex.skip_leading_whitespace = 0;
+		rv = NULL;
+		lex->skip_leading_whitespace = 0;
 
 		/* strip whitespace before block */
-		if (buf_startswith(s, "-")) {
-			rv = lookbehind_to_text(s, s->source->off, T_TEXT, " \n\t\v\f\r");
-			buf_consume(s, 1);
+		if (buf_startswith(lex, "-")) {
+			rv = lookbehind_to_text(lex, lex->source->off, TK_TEXT, " \n\t\v\f\r");
+			buf_consume(lex, 1);
 		}
 
 		/* disable lstrip flag (only valid for statement blocks) */
-		else if (s->lex.state == UT_LEX_BLOCK_STATEMENT_START) {
+		else if (lex->state == UT_LEX_BLOCK_STATEMENT_START) {
 			/* disable lstrip flag */
-			if (buf_startswith(s, "+")) {
-				rv = lookbehind_to_text(s, s->source->off, T_TEXT, NULL);
-				buf_consume(s, 1);
+			if (buf_startswith(lex, "+")) {
+				rv = lookbehind_to_text(lex, lex->source->off, TK_TEXT, NULL);
+				buf_consume(lex, 1);
 			}
 
 			/* global block lstrip */
-			else if (s->lstrip_blocks) {
-				rv = lookbehind_to_text(s, s->source->off, T_TEXT, " \t\v\f\r");
+			else if (lex->config && lex->config->lstrip_blocks) {
+				rv = lookbehind_to_text(lex, lex->source->off, TK_TEXT, " \t\v\f\r");
 			}
 		}
 		else {
-			rv = lookbehind_to_text(s, s->source->off, T_TEXT, NULL);
+			rv = lookbehind_to_text(lex, lex->source->off, TK_TEXT, NULL);
 		}
 
-		switch (s->lex.state) {
+		switch (lex->state) {
 		case UT_LEX_BLOCK_COMMENT_START:
-			s->lex.state = UT_LEX_BLOCK_COMMENT;
+			lex->state = UT_LEX_BLOCK_COMMENT;
 			break;
 
 		case UT_LEX_BLOCK_STATEMENT_START:
-			s->lex.within_statement_block = 1;
-			s->lex.state = UT_LEX_IDENTIFY_TOKEN;
+			lex->within_statement_block = 1;
+			lex->state = UT_LEX_IDENTIFY_TOKEN;
 			break;
 
 		case UT_LEX_BLOCK_EXPRESSION_START:
-			s->lex.state = UT_LEX_BLOCK_EXPRESSION_EMIT_TAG;
+			lex->state = UT_LEX_BLOCK_EXPRESSION_EMIT_TAG;
 			break;
 
 		default:
@@ -972,152 +1056,151 @@ lex_step(struct uc_state *s, FILE *fp)
 
 	case UT_LEX_BLOCK_COMMENT:
 		/* scan forward through buffer to identify end token */
-		while (s->lex.bufstart < s->lex.bufend - 2) {
-			if (buf_startswith(s, "-#}")) {
-				s->lex.state = UT_LEX_IDENTIFY_BLOCK;
-				s->lex.skip_leading_whitespace = 1;
-				buf_consume(s, 3);
-				s->lex.lastoff = s->source->off;
+		while (lex->bufstart < lex->bufend - 2) {
+			if (buf_startswith(lex, "-#}")) {
+				lex->state = UT_LEX_IDENTIFY_BLOCK;
+				lex->skip_leading_whitespace = 1;
+				buf_consume(lex, 3);
+				lex->lastoff = lex->source->off;
 				break;
 			}
-			else if (buf_startswith(s, "#}")) {
-				s->lex.state = UT_LEX_IDENTIFY_BLOCK;
-				s->lex.skip_leading_whitespace = 0;
-				buf_consume(s, 2);
-				s->lex.lastoff = s->source->off;
+			else if (buf_startswith(lex, "#}")) {
+				lex->state = UT_LEX_IDENTIFY_BLOCK;
+				lex->skip_leading_whitespace = 0;
+				buf_consume(lex, 2);
+				lex->lastoff = lex->source->off;
 				break;
 			}
 
-			buf_consume(s, 1);
+			buf_consume(lex, 1);
 		}
 
 		/* we're at eof */
-		if (s->lex.eof)
-			uc_new_exception(s, s->lex.lastoff, "Syntax error: Unterminated template block");
+		if (lex->eof) {
+			lex->state = UT_LEX_EOF;
+
+			buf_consume(lex, lex->bufend - lex->bufstart);
+
+			return emit_op(lex, lex->lastoff, TK_ERROR, xjs_new_string("Unterminated template block"));
+		}
 
 		break;
 
 
 	case UT_LEX_BLOCK_EXPRESSION_EMIT_TAG:
-		s->lex.within_expression_block = 1;
-		s->lex.state = UT_LEX_IDENTIFY_TOKEN;
+		lex->within_expression_block = 1;
+		lex->state = UT_LEX_IDENTIFY_TOKEN;
 
-		return emit_op(s, s->source->off, T_LEXP, NULL);
+		return emit_op(lex, lex->source->off, TK_LEXP, NULL);
 
 
 	case UT_LEX_IDENTIFY_TOKEN:
 		/* skip leading whitespace */
-		for (i = 0; i < buf_remaining(s) && isspace(s->lex.bufstart[i]); i++)
+		for (i = 0; i < buf_remaining(lex) && isspace(lex->bufstart[i]); i++)
 			;
 
-		buf_consume(s, i);
+		buf_consume(lex, i);
 
-		if (i > 0 && buf_remaining(s) < UT_LEX_MAX_TOKEN_LEN)
-			return 0;
+		if (i > 0 && buf_remaining(lex) < UT_LEX_MAX_TOKEN_LEN)
+			return NULL;
 
 		for (i = 0; i < sizeof(search.str); i++)
-			search.str[i] = (i < buf_remaining(s)) ? s->lex.bufstart[i] : 0;
+			search.str[i] = (i < buf_remaining(lex)) ? lex->bufstart[i] : 0;
 
 		for (i = 0, tok = tokens; i < ARRAY_SIZE(tokens); tok = &tokens[++i]) {
 			/* remaining buffer data is shorter than token, skip */
-			if (tok->plen > buf_remaining(s))
+			if (tok->plen > buf_remaining(lex))
 				continue;
 
-			c = s->lex.bufstart[0];
+			c = buf_remaining(lex) ? lex->bufstart[0] : 0;
 
 			if (tok->plen ? ((search.n & masks[tok->plen]) == tok->patn)
 			              : (c >= tok->pat[0] && c <= tok->pat[1])) {
-				buf_consume(s, tok->plen);
-
-				s->lex.lastoff = s->source->off - tok->plen;
+				lex->lastoff = lex->source->off;
 
 				/* token has a parse method, switch state */
 				if (tok->parse) {
-					s->lex.tok = tok;
-					s->lex.state = UT_LEX_PARSE_TOKEN;
+					lex->tok = tok;
+					lex->state = UT_LEX_PARSE_TOKEN;
 
-					return 0;
+					buf_consume(lex, tok->plen);
+
+					return NULL;
 				}
 
 				/* disallow nesting blocks */
-				if ((s->lex.within_expression_block &&
-				     (tok->type == T_LSTM || tok->type == T_RSTM || tok->type == T_LEXP)) ||
-				    (s->lex.within_statement_block &&
-				     (tok->type == T_LEXP || tok->type == T_REXP || tok->type == T_LSTM))) {
-					uc_new_exception(s, s->source->off - tok->plen, "Syntax error: Template blocks may not be nested");
+				if ((lex->within_expression_block &&
+				     (tok->type == TK_LSTM || tok->type == TK_RSTM || tok->type == TK_LEXP)) ||
+				    (lex->within_statement_block &&
+				     (tok->type == TK_LEXP || tok->type == TK_REXP || tok->type == TK_LSTM))) {
+					buf_consume(lex, tok->plen);
 
-					return 0;
+					return emit_op(lex, lex->source->off - tok->plen, TK_ERROR, xjs_new_string("Template blocks may not be nested"));
 				}
 
 				/* found end of block */
-				else if ((s->lex.within_statement_block && tok->type == T_RSTM) ||
-				         (s->lex.within_expression_block && tok->type == T_REXP)) {
+				else if ((lex->within_statement_block && tok->type == TK_RSTM) ||
+				         (lex->within_expression_block && tok->type == TK_REXP)) {
 					/* emit additional empty statement (semicolon) at end of template block */
-					if (!s->lex.semicolon_emitted) {
-						s->lex.semicolon_emitted = true;
-
-						/* rewind */
-						buf_consume(s, -tok->plen);
+					if (!lex->semicolon_emitted) {
+						lex->semicolon_emitted = true;
 
-						return emit_op(s, s->source->off, T_SCOL, NULL);
+						return emit_op(lex, lex->source->off, TK_SCOL, NULL);
 					}
 
 					/* strip whitespace after block */
 					if (tok->pat[0] == '-')
-						s->lex.skip_leading_whitespace = true;
+						lex->skip_leading_whitespace = true;
 
 					/* strip newline after statement block */
-					else if (s->lex.within_statement_block && s->trim_blocks)
-						s->lex.skip_leading_newline = true;
-
-					s->lex.semicolon_emitted = false;
-					s->lex.within_statement_block = false;
-					s->lex.within_expression_block = false;
-					s->lex.state = UT_LEX_IDENTIFY_BLOCK;
-					s->lex.lastoff = s->source->off;
+					else if (lex->within_statement_block &&
+					         lex->config && lex->config->trim_blocks)
+						lex->skip_leading_newline = true;
+
+					lex->semicolon_emitted = false;
+					lex->within_statement_block = false;
+					lex->within_expression_block = false;
+					lex->state = UT_LEX_IDENTIFY_BLOCK;
 				}
 
 				/* do not report statement tags to the parser */
-				if (tok->type != 0 && tok->type != T_LSTM && tok->type != T_RSTM)
-					rv = emit_op(s, s->source->off - tok->plen, tok->type, NULL);
+				if (tok->type != 0 && tok->type != TK_LSTM && tok->type != TK_RSTM)
+					rv = emit_op(lex, lex->source->off, tok->type, NULL);
 				else
-					rv = 0;
+					rv = NULL;
+
+				buf_consume(lex, tok->plen);
 
 				return rv;
 			}
 		}
 
 		/* no token matched and we do have remaining data, junk */
-		if (buf_remaining(s)) {
-			uc_new_exception(s, s->source->off, "Syntax error: Unexpected character");
-
-			return 0;
-		}
+		if (buf_remaining(lex))
+			return emit_op(lex, lex->source->off, TK_ERROR, xjs_new_string("Unexpected character"));
 
 		/* we're at eof, allow unclosed statement blocks */
-		if (s->lex.within_statement_block) {
-			s->lex.state = UT_LEX_EOF;
+		if (lex->within_statement_block) {
+			lex->state = UT_LEX_EOF;
 
-			return 0;
+			return NULL;
 		}
 
 		/* premature EOF */
-		uc_new_exception(s, s->source->off, "Syntax error: Unterminated template block");
-
-		break;
+		return emit_op(lex, lex->source->off, TK_ERROR, xjs_new_string("Unterminated template block"));
 
 
 	case UT_LEX_PARSE_TOKEN:
-		tok = s->lex.tok;
-		rv = tok->parse(s);
+		tok = lex->tok;
+		rv = tok->parse(lex);
 
 		if (rv) {
-			memset(s->lex.esc, 0, sizeof(s->lex.esc));
-			s->lex.state = UT_LEX_IDENTIFY_TOKEN;
-			s->lex.tok = NULL;
+			memset(lex->esc, 0, sizeof(lex->esc));
+			lex->state = UT_LEX_IDENTIFY_TOKEN;
+			lex->tok = NULL;
 
-			if (rv == UINT32_MAX)
-				rv = 0;
+			if (rv == UC_LEX_CONTINUE_PARSING)
+				rv = NULL;
 
 			return rv;
 		}
@@ -1129,25 +1212,66 @@ lex_step(struct uc_state *s, FILE *fp)
 		break;
 	}
 
-	return 0;
+	return NULL;
 }
 
-uint32_t
-uc_get_token(struct uc_state *s, FILE *fp)
+void
+uc_lexer_init(uc_lexer *lex, uc_parse_config *config, uc_source *source)
 {
-	uint32_t rv;
+	lex->state = UT_LEX_IDENTIFY_BLOCK;
 
-	while (s->lex.state != UT_LEX_EOF) {
-		rv = lex_step(s, fp);
+	lex->config = config;
+	lex->source = uc_source_get(source);
 
-		if (rv == 0 && s->exception)
-			break;
+	lex->eof = 0;
+	lex->skip_leading_whitespace = 0;
+	lex->skip_leading_newline = 0;
+	lex->within_statement_block = 0;
+	lex->within_statement_block = 0;
+	lex->semicolon_emitted = 0;
+	lex->expect_div = 0;
+	lex->is_escape = 0;
+
+	lex->buflen = 0;
+	lex->buf = NULL;
+	lex->bufstart = NULL;
+	lex->bufend = NULL;
+
+	lex->lookbehindlen = 0;
+	lex->lookbehind = NULL;
+
+	lex->tok = NULL;
+
+	lex->esclen = 0;
+	memset(lex->esc, 0, sizeof(lex->esc));
+
+	lex->lead_surrogate = 0;
+
+	lex->lastoff = 0;
+}
+
+void
+uc_lexer_free(uc_lexer *lex)
+{
+	uc_source_put(lex->source);
+
+	free(lex->lookbehind);
+	free(lex->buf);
+}
+
+uc_token *
+uc_lexer_next_token(uc_lexer *lex)
+{
+	uc_token *rv;
+
+	while (lex->state != UT_LEX_EOF) {
+		rv = lex_step(lex, lex->source->fp);
 
-		if (rv > 0)
+		if (rv != NULL)
 			return rv;
 	}
 
-	return 0;
+	return emit_op(lex, lex->source->off, TK_EOF, NULL);
 }
 
 const char *
@@ -1158,11 +1282,11 @@ uc_get_tokenname(int type)
 
 	switch (type) {
 	case 0:        return "End of file";
-	case T_STRING: return "String";
-	case T_LABEL:  return "Label";
-	case T_NUMBER: return "Number";
-	case T_DOUBLE: return "Double";
-	case T_REGEXP: return "Regexp";
+	case TK_STRING: return "String";
+	case TK_LABEL:  return "Label";
+	case TK_NUMBER: return "Number";
+	case TK_DOUBLE: return "Double";
+	case TK_REGEXP: return "Regexp";
 	}
 
 	for (i = 0; i < ARRAY_SIZE(tokens); i++) {
author	Jo-Philipp Wich <jo@mein.io>	2020-12-23 20:54:05 +0100
committer	Jo-Philipp Wich <jo@mein.io>	2021-02-17 14:10:51 +0100
commit	3756806674da909ec6dc10ad25862b592792604e (patch)
tree	f2af7e47f8444caaff0a5a33599f381889db24e3 /lexer.c
parent	77580a893283f2bde7ab46496bd3a3d7b2fc6784 (diff)