diff options
author | Jo-Philipp Wich <jo@mein.io> | 2020-12-23 20:54:05 +0100 |
---|---|---|
committer | Jo-Philipp Wich <jo@mein.io> | 2021-02-17 14:10:51 +0100 |
commit | 3756806674da909ec6dc10ad25862b592792604e (patch) | |
tree | f2af7e47f8444caaff0a5a33599f381889db24e3 /lexer.c | |
parent | 77580a893283f2bde7ab46496bd3a3d7b2fc6784 (diff) |
treewide: rewrite ucode interpreter
Replace the former AST walking interpreter implementation with a single pass
bytecode compiler and a corresponding virtual machine.
The rewrite lays the groundwork for a couple of improvements which will be
subsequently implemented:
- Ability to precompile ucode sources into binary byte code
- Strippable debug information
- Reduced runtime memory usage
Signed-off-by: Jo-Philipp Wich <jo@mein.io>
Diffstat (limited to 'lexer.c')
-rw-r--r-- | lexer.c | 1152 |
1 files changed, 638 insertions, 514 deletions
@@ -1,5 +1,5 @@ /* - * Copyright (C) 2020 Jo-Philipp Wich <jo@mein.io> + * Copyright (C) 2020-2021 Jo-Philipp Wich <jo@mein.io> * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -25,11 +25,11 @@ #include <errno.h> #include <endian.h> -#include "ast.h" +#include "vm.h" #include "lib.h" #include "lexer.h" -#include "parser.h" +#define UC_LEX_CONTINUE_PARSING (void *)1 struct keyword { int type; @@ -48,7 +48,7 @@ struct token { char pat[4]; }; int plen; - uint32_t (*parse)(struct uc_state *s); + uc_token *(*parse)(uc_lexer *); }; #define dec(o) \ @@ -58,109 +58,110 @@ struct token { (((x) >= 'a') ? (10 + (x) - 'a') : \ (((x) >= 'A') ? (10 + (x) - 'A') : dec(x))) -static uint32_t parse_comment(struct uc_state *); -static uint32_t parse_string(struct uc_state *); -static uint32_t parse_regexp(struct uc_state *); -static uint32_t parse_number(struct uc_state *); -static uint32_t parse_label(struct uc_state *); +static uc_token *parse_comment(uc_lexer *); +static uc_token *parse_string(uc_lexer *); +static uc_token *parse_regexp(uc_lexer *); +static uc_token *parse_number(uc_lexer *); +static uc_token *parse_label(uc_lexer *); static const struct token tokens[] = { - { T_ASLEFT, { .pat = "<<=" }, 3 }, - { T_ASRIGHT, { .pat = ">>=" }, 3 }, - { T_LEXP, { .pat = "{{-" }, 3 }, - { T_REXP, { .pat = "-}}" }, 3 }, - { T_LSTM, { .pat = "{%+" }, 3 }, - { T_LSTM, { .pat = "{%-" }, 3 }, - { T_RSTM, { .pat = "-%}" }, 3 }, - { T_EQS, { .pat = "===" }, 3 }, - { T_NES, { .pat = "!==" }, 3 }, - { T_ELLIP, { .pat = "..." 
}, 3 }, - { T_AND, { .pat = "&&" }, 2 }, - { T_ASADD, { .pat = "+=" }, 2 }, - { T_ASBAND, { .pat = "&=" }, 2 }, - { T_ASBOR, { .pat = "|=" }, 2 }, - { T_ASBXOR, { .pat = "^=" }, 2 }, - //{ T_ASDIV, { .pat = "/=" }, 2 }, - { T_ASMOD, { .pat = "%=" }, 2 }, - { T_ASMUL, { .pat = "*=" }, 2 }, - { T_ASSUB, { .pat = "-=" }, 2 }, - { T_DEC, { .pat = "--" }, 2 }, - { T_INC, { .pat = "++" }, 2 }, - { T_EQ, { .pat = "==" }, 2 }, - { T_NE, { .pat = "!=" }, 2 }, - { T_LE, { .pat = "<=" }, 2 }, - { T_GE, { .pat = ">=" }, 2 }, - { T_LSHIFT, { .pat = "<<" }, 2 }, - { T_RSHIFT, { .pat = ">>" }, 2 }, + { TK_ASLEFT, { .pat = "<<=" }, 3 }, + { TK_ASRIGHT, { .pat = ">>=" }, 3 }, + { TK_LEXP, { .pat = "{{-" }, 3 }, + { TK_REXP, { .pat = "-}}" }, 3 }, + { TK_LSTM, { .pat = "{%+" }, 3 }, + { TK_LSTM, { .pat = "{%-" }, 3 }, + { TK_RSTM, { .pat = "-%}" }, 3 }, + { TK_EQS, { .pat = "===" }, 3 }, + { TK_NES, { .pat = "!==" }, 3 }, + { TK_ELLIP, { .pat = "..." }, 3 }, + { TK_AND, { .pat = "&&" }, 2 }, + { TK_ASADD, { .pat = "+=" }, 2 }, + { TK_ASBAND, { .pat = "&=" }, 2 }, + { TK_ASBOR, { .pat = "|=" }, 2 }, + { TK_ASBXOR, { .pat = "^=" }, 2 }, + //{ TK_ASDIV, { .pat = "/=" }, 2 }, + { TK_ASMOD, { .pat = "%=" }, 2 }, + { TK_ASMUL, { .pat = "*=" }, 2 }, + { TK_ASSUB, { .pat = "-=" }, 2 }, + { TK_DEC, { .pat = "--" }, 2 }, + { TK_INC, { .pat = "++" }, 2 }, + { TK_EQ, { .pat = "==" }, 2 }, + { TK_NE, { .pat = "!=" }, 2 }, + { TK_LE, { .pat = "<=" }, 2 }, + { TK_GE, { .pat = ">=" }, 2 }, + { TK_LSHIFT, { .pat = "<<" }, 2 }, + { TK_RSHIFT, { .pat = ">>" }, 2 }, { 0, { .pat = "//" }, 2, parse_comment }, { 0, { .pat = "/*" }, 2, parse_comment }, - { T_OR, { .pat = "||" }, 2 }, - { T_LEXP, { .pat = "{{" }, 2 }, - { T_REXP, { .pat = "}}" }, 2 }, - { T_LSTM, { .pat = "{%" }, 2 }, - { T_RSTM, { .pat = "%}" }, 2 }, - { T_ARROW, { .pat = "=>" }, 2 }, - { T_ADD, { .pat = "+" }, 1 }, - { T_ASSIGN, { .pat = "=" }, 1 }, - { T_BAND, { .pat = "&" }, 1 }, - { T_BOR, { .pat = "|" }, 1 }, - { T_LBRACK, { .pat = 
"[" }, 1 }, - { T_RBRACK, { .pat = "]" }, 1 }, - { T_BXOR, { .pat = "^" }, 1 }, - { T_LBRACE, { .pat = "{" }, 1 }, - { T_RBRACE, { .pat = "}" }, 1 }, - { T_COLON, { .pat = ":" }, 1 }, - { T_COMMA, { .pat = "," }, 1 }, - { T_COMPL, { .pat = "~" }, 1 }, - //{ T_DIV, { .pat = "/" }, 1 }, - { T_GT, { .pat = ">" }, 1 }, - { T_NOT, { .pat = "!" }, 1 }, - { T_LT, { .pat = "<" }, 1 }, - { T_MOD, { .pat = "%" }, 1 }, - { T_MUL, { .pat = "*" }, 1 }, - { T_LPAREN, { .pat = "(" }, 1 }, - { T_RPAREN, { .pat = ")" }, 1 }, - { T_QMARK, { .pat = "?" }, 1 }, - { T_SCOL, { .pat = ";" }, 1 }, - { T_SUB, { .pat = "-" }, 1 }, - { T_DOT, { .pat = "." }, 1 }, - { T_STRING, { .pat = "'" }, 1, parse_string }, - { T_STRING, { .pat = "\"" }, 1, parse_string }, - { T_REGEXP, { .pat = "/" }, 1, parse_regexp }, - { T_LABEL, { .pat = "_" }, 1, parse_label }, - { T_LABEL, { .pat = "az" }, 0, parse_label }, - { T_LABEL, { .pat = "AZ" }, 0, parse_label }, - { T_NUMBER, { .pat = "09" }, 0, parse_number }, + { TK_OR, { .pat = "||" }, 2 }, + { TK_LEXP, { .pat = "{{" }, 2 }, + { TK_REXP, { .pat = "}}" }, 2 }, + { TK_LSTM, { .pat = "{%" }, 2 }, + { TK_RSTM, { .pat = "%}" }, 2 }, + { TK_ARROW, { .pat = "=>" }, 2 }, + { TK_ADD, { .pat = "+" }, 1 }, + { TK_ASSIGN, { .pat = "=" }, 1 }, + { TK_BAND, { .pat = "&" }, 1 }, + { TK_BOR, { .pat = "|" }, 1 }, + { TK_LBRACK, { .pat = "[" }, 1 }, + { TK_RBRACK, { .pat = "]" }, 1 }, + { TK_BXOR, { .pat = "^" }, 1 }, + { TK_LBRACE, { .pat = "{" }, 1 }, + { TK_RBRACE, { .pat = "}" }, 1 }, + { TK_COLON, { .pat = ":" }, 1 }, + { TK_COMMA, { .pat = "," }, 1 }, + { TK_COMPL, { .pat = "~" }, 1 }, + //{ TK_DIV, { .pat = "/" }, 1 }, + { TK_GT, { .pat = ">" }, 1 }, + { TK_NOT, { .pat = "!" }, 1 }, + { TK_LT, { .pat = "<" }, 1 }, + { TK_MOD, { .pat = "%" }, 1 }, + { TK_MUL, { .pat = "*" }, 1 }, + { TK_LPAREN, { .pat = "(" }, 1 }, + { TK_RPAREN, { .pat = ")" }, 1 }, + { TK_QMARK, { .pat = "?" 
}, 1 }, + { TK_SCOL, { .pat = ";" }, 1 }, + //{ TK_SUB, { .pat = "-" }, 1 }, + { TK_DOT, { .pat = "." }, 1 }, + { TK_STRING, { .pat = "'" }, 1, parse_string }, + { TK_STRING, { .pat = "\"" }, 1, parse_string }, + { TK_REGEXP, { .pat = "/" }, 1, parse_regexp }, + { TK_LABEL, { .pat = "_" }, 1, parse_label }, + { TK_LABEL, { .pat = "az" }, 0, parse_label }, + { TK_LABEL, { .pat = "AZ" }, 0, parse_label }, + { TK_NUMBER, { .pat = "-" }, 1, parse_number }, + { TK_NUMBER, { .pat = "09" }, 0, parse_number }, }; static const struct keyword reserved_words[] = { - { T_ENDFUNC, "endfunction", 11 }, - { T_DOUBLE, "Infinity", 8, { .d = INFINITY } }, - { T_CONTINUE, "continue", 8 }, - { T_ENDWHILE, "endwhile", 8 }, - { T_FUNC, "function", 8 }, - { T_DEFAULT, "default", 7 }, - { T_RETURN, "return", 6 }, - { T_ENDFOR, "endfor", 6 }, - { T_SWITCH, "switch", 6 }, - { T_LOCAL, "local", 5 }, - { T_ENDIF, "endif", 5 }, - { T_WHILE, "while", 5 }, - { T_BREAK, "break", 5 }, - { T_CATCH, "catch", 5 }, - { T_BOOL, "false", 5, { .b = false } }, - { T_BOOL, "true", 4, { .b = true } }, - { T_ELIF, "elif", 4 }, - { T_ELSE, "else", 4 }, - { T_THIS, "this", 4 }, - { T_NULL, "null", 4 }, - { T_CASE, "case", 4 }, - { T_DOUBLE, "NaN", 3, { .d = NAN } }, - { T_TRY, "try", 3 }, - { T_FOR, "for", 3 }, - { T_LOCAL, "let", 3 }, - { T_IF, "if", 2 }, - { T_IN, "in", 2 }, + { TK_ENDFUNC, "endfunction", 11 }, + { TK_DOUBLE, "Infinity", 8, { .d = INFINITY } }, + { TK_CONTINUE, "continue", 8 }, + { TK_ENDWHILE, "endwhile", 8 }, + { TK_FUNC, "function", 8 }, + { TK_DEFAULT, "default", 7 }, + { TK_RETURN, "return", 6 }, + { TK_ENDFOR, "endfor", 6 }, + { TK_SWITCH, "switch", 6 }, + { TK_LOCAL, "local", 5 }, + { TK_ENDIF, "endif", 5 }, + { TK_WHILE, "while", 5 }, + { TK_BREAK, "break", 5 }, + { TK_CATCH, "catch", 5 }, + { TK_BOOL, "false", 5, { .b = false } }, + { TK_BOOL, "true", 4, { .b = true } }, + { TK_ELIF, "elif", 4 }, + { TK_ELSE, "else", 4 }, + { TK_THIS, "this", 4 }, + { TK_NULL, "null", 4 }, + { 
TK_CASE, "case", 4 }, + { TK_DOUBLE, "NaN", 3, { .d = NAN } }, + { TK_TRY, "try", 3 }, + { TK_FOR, "for", 3 }, + { TK_LOCAL, "let", 3 }, + { TK_IF, "if", 2 }, + { TK_IN, "in", 2 }, }; @@ -221,125 +222,216 @@ utf8enc(char **out, int *rem, int code) /* length of the longest token in our lookup table */ #define UT_LEX_MAX_TOKEN_LEN 3 -static uint32_t emit_op(struct uc_state *state, uint32_t pos, int type, struct json_object *val) +static uc_token * +emit_op(uc_lexer *lex, uint32_t pos, int type, struct json_object *val) { - uint32_t off = uc_new_op(state, type, val, UINT32_MAX); - - OP(off)->off = pos; + lex->curr.type = type; + lex->curr.val = val; + lex->curr.pos = pos; /* Follow JSLint logic and treat a slash after any of the * `(,=:[!&|?{};` characters as the beginning of a regex * literal... */ switch (type) { - case T_LPAREN: - case T_COMMA: - - case T_ASADD: - case T_ASBAND: - case T_ASBOR: - case T_ASBXOR: - case T_ASDIV: - case T_ASLEFT: - case T_ASMOD: - case T_ASMUL: - case T_ASRIGHT: - case T_ASSIGN: - case T_ASSUB: - case T_EQ: - case T_EQS: - case T_GE: - case T_LE: - case T_NE: - case T_NES: - - case T_COLON: - case T_LBRACK: - case T_NOT: - - case T_AND: - case T_BAND: - - case T_OR: - case T_BOR: - - case T_QMARK: - - case T_LBRACE: - case T_RBRACE: - - case T_LSTM: - case T_LEXP: - - case T_SCOL: - state->lex.expect_div = false; + case TK_LPAREN: + case TK_COMMA: + + case TK_ASADD: + case TK_ASBAND: + case TK_ASBOR: + case TK_ASBXOR: + case TK_ASDIV: + case TK_ASLEFT: + case TK_ASMOD: + case TK_ASMUL: + case TK_ASRIGHT: + case TK_ASSIGN: + case TK_ASSUB: + case TK_EQ: + case TK_EQS: + case TK_GE: + case TK_LE: + case TK_NE: + case TK_NES: + + case TK_COLON: + case TK_LBRACK: + case TK_NOT: + + case TK_AND: + case TK_BAND: + + case TK_OR: + case TK_BOR: + + case TK_QMARK: + + case TK_LBRACE: + case TK_RBRACE: + + case TK_LSTM: + case TK_LEXP: + + case TK_SCOL: + lex->expect_div = false; break; default: - state->lex.expect_div = true; + lex->expect_div 
= true; } - return off; + return &lex->curr; } -static void lookbehind_append(struct uc_state *s, const char *data, size_t len) +static void lookbehind_append(uc_lexer *lex, const char *data, size_t len) { if (len) { - s->lex.lookbehind = xrealloc(s->lex.lookbehind, s->lex.lookbehindlen + len); - memcpy(s->lex.lookbehind + s->lex.lookbehindlen, data, len); - s->lex.lookbehindlen += len; + lex->lookbehind = xrealloc(lex->lookbehind, lex->lookbehindlen + len); + memcpy(lex->lookbehind + lex->lookbehindlen, data, len); + lex->lookbehindlen += len; } } -static void lookbehind_reset(struct uc_state *s) { - free(s->lex.lookbehind); - s->lex.lookbehind = NULL; - s->lex.lookbehindlen = 0; +static void lookbehind_reset(uc_lexer *lex) { + free(lex->lookbehind); + lex->lookbehind = NULL; + lex->lookbehindlen = 0; } -static uint32_t lookbehind_to_text(struct uc_state *s, uint32_t pos, int type, const char *strip_trailing_chars) { - uint32_t rv = 0; +static uc_token * +lookbehind_to_text(uc_lexer *lex, uint32_t pos, int type, const char *strip_trailing_chars) { + uc_token *rv = NULL; - if (s->lex.lookbehind) { + if (lex->lookbehind) { if (strip_trailing_chars) { - while (s->lex.lookbehindlen > 0 && strchr(strip_trailing_chars, s->lex.lookbehind[s->lex.lookbehindlen-1])) - s->lex.lookbehindlen--; + while (lex->lookbehindlen > 0 && strchr(strip_trailing_chars, lex->lookbehind[lex->lookbehindlen-1])) + lex->lookbehindlen--; } - rv = emit_op(s, pos, type, xjs_new_string_len(s->lex.lookbehind, s->lex.lookbehindlen)); + rv = emit_op(lex, pos, type, xjs_new_string_len(lex->lookbehind, lex->lookbehindlen)); - lookbehind_reset(s); + lookbehind_reset(lex); } return rv; } -static inline size_t buf_remaining(struct uc_state *s) { - return (s->lex.bufend - s->lex.bufstart); +static inline size_t +buf_remaining(uc_lexer *lex) { + return (lex->bufend - lex->bufstart); } -static inline bool _buf_startswith(struct uc_state *s, const char *str, size_t len) { - return (buf_remaining(s) >= len && 
!strncmp(s->lex.bufstart, str, len)); +static inline bool +_buf_startswith(uc_lexer *lex, const char *str, size_t len) { + return (buf_remaining(lex) >= len && !strncmp(lex->bufstart, str, len)); } #define buf_startswith(s, str) _buf_startswith(s, str, sizeof(str) - 1) -static void buf_consume(struct uc_state *s, ssize_t len) { - s->lex.bufstart += len; - s->source->off += len; +#if 0 +static void add_lineinfo(struct uc_state *s, size_t off) +{ + uc_lineinfo *lines = &s->source->lineinfo; + size_t linelen; + + linelen = off - s->lex.lastlineoff; + + /* lineinfo is encoded in bytes: the most significant bit specifies whether + * to advance the line count by one or not, while the remaining 7 bits encode + * the amounts of bytes on the current line. + * + * If a line has more than 127 characters, the first byte will be set to + * 0xff (1 1111111) and subsequent bytes will encode the remaining characters + * in bits 1..7 while setting bit 8 to 0. A line with 400 characters will thus + * be encoded as 0xff 0x7f 0x7f 0x13 (1:1111111 + 0:1111111 + 0:1111111 + 0:1111111). + * + * The newline character itself is not counted, so an empty line is encoded as + * 0x80 (1:0000000). 
+ */ + uc_vector_grow(lines); + lines->entries[lines->count++] = 0x80 + (linelen & 0x7f); + linelen -= (linelen & 0x7f); + + while (linelen > 0) { + uc_vector_grow(lines); + lines->entries[lines->count++] = (linelen & 0x7f); + linelen -= (linelen & 0x7f); + } + + s->lex.lastlineoff = off + 1; + s->lex.line++; } +#endif -static uint32_t -parse_comment(struct uc_state *s) +static void +next_lineinfo(uc_lexer *lex) { - const struct token *tok = s->lex.tok; - const char *ptr, *end; - size_t elen; + uc_lineinfo *lines = &lex->source->lineinfo; - if (!buf_remaining(s)) { - uc_new_exception(s, s->lex.lastoff, "Syntax error: Unterminated comment"); + uc_vector_grow(lines); + lines->entries[lines->count++] = 0x80; +} + +static void +update_lineinfo(uc_lexer *lex, size_t off) +{ + uc_lineinfo *lines = &lex->source->lineinfo; + uint8_t *entry; - return 0; + entry = uc_vector_last(lines); + + if ((entry[0] & 0x7f) + off <= 0x7f) { + entry[0] += off; } + else { + off -= (0x7f - (entry[0] & 0x7f)); + entry[0] |= 0x7f; + + while (off > 0) { + uc_vector_grow(lines); + entry = uc_vector_last(lines); + entry[1] = (off & 0x7f); + off -= (off & 0x7f); + lines->count++; + } + } +} + +static void +buf_consume(uc_lexer *lex, size_t len) { + size_t i, linelen; + + if (!lex->source->lineinfo.count) + next_lineinfo(lex); + + for (i = 0, linelen = 0; i < len; i++) { + if (lex->bufstart[i] == '\n') { + update_lineinfo(lex, linelen); + next_lineinfo(lex); + + linelen = 0; + } + else { + linelen++; + } + } + + if (linelen) + update_lineinfo(lex, linelen); + + lex->bufstart += len; + lex->source->off += len; +} + +static uc_token * +parse_comment(uc_lexer *lex) +{ + const struct token *tok = lex->tok; + const char *ptr, *end; + size_t elen; + + if (!buf_remaining(lex)) + return emit_op(lex, lex->lastoff, TK_ERROR, xjs_new_string("Unterminated comment")); if (!strcmp(tok->pat, "//")) { end = "\n"; @@ -350,20 +442,21 @@ parse_comment(struct uc_state *s) elen = 2; } - for (ptr = s->lex.bufstart; 
ptr < s->lex.bufend - elen; ptr++) { + for (ptr = lex->bufstart; ptr < lex->bufend - elen; ptr++) { if (!strncmp(ptr, end, elen)) { - buf_consume(s, (ptr - s->lex.bufstart) + elen); + buf_consume(lex, (ptr - lex->bufstart) + elen); - return UINT32_MAX; + return UC_LEX_CONTINUE_PARSING; } } - buf_consume(s, ptr - s->lex.bufstart); + buf_consume(lex, ptr - lex->bufstart); - return 0; + return NULL; } -static void append_utf8(struct uc_state *s, int code) { +static void +append_utf8(uc_lexer *lex, int code) { char ustr[8], *up; int rem; @@ -371,38 +464,35 @@ static void append_utf8(struct uc_state *s, int code) { rem = sizeof(ustr); if (utf8enc(&up, &rem, code)) - lookbehind_append(s, ustr, up - ustr); + lookbehind_append(lex, ustr, up - ustr); } -static uint32_t -parse_string(struct uc_state *s) +static uc_token * +parse_string(uc_lexer *lex) { - const struct token *tok = s->lex.tok; + const struct token *tok = lex->tok; char q = tok->pat[0]; char *ptr, *c; - uint32_t rv; + uc_token *rv; int code; - if (!buf_remaining(s)) { - uc_new_exception(s, s->lex.lastoff, "Syntax error: Unterminated string"); - - return 0; - } + if (!buf_remaining(lex)) + return emit_op(lex, lex->lastoff, TK_ERROR, xjs_new_string("Unterminated string")); - for (ptr = s->lex.bufstart; ptr < s->lex.bufend; ptr++) { + for (ptr = lex->bufstart; ptr < lex->bufend; ptr++) { /* continuation of escape sequence */ - if (s->lex.is_escape) { - if (s->lex.esclen == 0) { + if (lex->is_escape) { + if (lex->esclen == 0) { /* non-unicode escape following a lead surrogate, emit replacement... */ - if (s->lex.lead_surrogate && *ptr != 'u') { - append_utf8(s, 0xFFFD); - s->lex.lead_surrogate = 0; + if (lex->lead_surrogate && *ptr != 'u') { + append_utf8(lex, 0xFFFD); + lex->lead_surrogate = 0; } switch ((q == '/') ? 
0 : *ptr) { case 'u': case 'x': - s->lex.esc[s->lex.esclen++] = *ptr; + lex->esc[lex->esclen++] = *ptr; break; case '0': @@ -413,65 +503,62 @@ parse_string(struct uc_state *s) case '5': case '6': case '7': - s->lex.esc[s->lex.esclen++] = 'o'; - s->lex.esc[s->lex.esclen++] = *ptr; + lex->esc[lex->esclen++] = 'o'; + lex->esc[lex->esclen++] = *ptr; break; default: - s->lex.is_escape = false; + lex->is_escape = false; c = strchr("a\ab\be\ef\fn\nr\rt\tv\v", *ptr); if (c && *c >= 'a') { - lookbehind_append(s, c + 1, 1); + lookbehind_append(lex, c + 1, 1); } else { /* regex mode => retain backslash */ if (q == '/') - lookbehind_append(s, "\\", 1); + lookbehind_append(lex, "\\", 1); - lookbehind_append(s, ptr, 1); + lookbehind_append(lex, ptr, 1); } - buf_consume(s, (ptr + 1) - s->lex.bufstart); + buf_consume(lex, (ptr + 1) - lex->bufstart); break; } } else { - switch (s->lex.esc[0]) { + switch (lex->esc[0]) { case 'u': - if (s->lex.esclen < 5) { - if (!isxdigit(*ptr)) { - uc_new_exception(s, s->source->off + s->lex.esclen + 1, "Syntax error: Invalid escape sequence"); + if (lex->esclen < 5) { + if (!isxdigit(*ptr)) + return emit_op(lex, lex->source->off + lex->esclen + 1, TK_ERROR, xjs_new_string("Invalid escape sequence")); - return 0; - } - - s->lex.esc[s->lex.esclen++] = *ptr; + lex->esc[lex->esclen++] = *ptr; } - if (s->lex.esclen == 5) { - code = hex(s->lex.esc[1]) * 16 * 16 * 16 + - hex(s->lex.esc[2]) * 16 * 16 + - hex(s->lex.esc[3]) * 16 + - hex(s->lex.esc[4]); + if (lex->esclen == 5) { + code = hex(lex->esc[1]) * 16 * 16 * 16 + + hex(lex->esc[2]) * 16 * 16 + + hex(lex->esc[3]) * 16 + + hex(lex->esc[4]); /* is a leading surrogate value */ if ((code & 0xFC00) == 0xD800) { /* found a subsequent leading surrogate, ignore and emit replacement char for previous one */ - if (s->lex.lead_surrogate) - append_utf8(s, 0xFFFD); + if (lex->lead_surrogate) + append_utf8(lex, 0xFFFD); /* store surrogate value and advance to next escape sequence */ - s->lex.lead_surrogate = code; 
+ lex->lead_surrogate = code; } /* is a trailing surrogate value */ else if ((code & 0xFC00) == 0xDC00) { /* found a trailing surrogate following a leading one, combine and encode */ - if (s->lex.lead_surrogate) { - code = 0x10000 + ((s->lex.lead_surrogate & 0x3FF) << 10) + (code & 0x3FF); - s->lex.lead_surrogate = 0; + if (lex->lead_surrogate) { + code = 0x10000 + ((lex->lead_surrogate & 0x3FF) << 10) + (code & 0x3FF); + lex->lead_surrogate = 0; } /* trailing surrogate not following a leading one, ignore and use replacement char */ @@ -479,87 +566,81 @@ parse_string(struct uc_state *s) code = 0xFFFD; } - append_utf8(s, code); + append_utf8(lex, code); } /* is a normal codepoint */ else { - append_utf8(s, code); + append_utf8(lex, code); } - s->lex.esclen = 0; - s->lex.is_escape = false; - buf_consume(s, (ptr + 1) - s->lex.bufstart); + lex->esclen = 0; + lex->is_escape = false; + buf_consume(lex, (ptr + 1) - lex->bufstart); } break; case 'x': - if (s->lex.esclen < 3) { - if (!isxdigit(*ptr)) { - uc_new_exception(s, s->source->off + s->lex.esclen + 1, "Syntax error: Invalid escape sequence"); - - return 0; - } + if (lex->esclen < 3) { + if (!isxdigit(*ptr)) + return emit_op(lex, lex->source->off + lex->esclen + 1, TK_ERROR, xjs_new_string("Invalid escape sequence")); - s->lex.esc[s->lex.esclen++] = *ptr; + lex->esc[lex->esclen++] = *ptr; } - if (s->lex.esclen == 3) { - append_utf8(s, hex(s->lex.esc[1]) * 16 + hex(s->lex.esc[2])); + if (lex->esclen == 3) { + append_utf8(lex, hex(lex->esc[1]) * 16 + hex(lex->esc[2])); - s->lex.esclen = 0; - s->lex.is_escape = false; - buf_consume(s, (ptr + 1) - s->lex.bufstart); + lex->esclen = 0; + lex->is_escape = false; + buf_consume(lex, (ptr + 1) - lex->bufstart); } break; case 'o': - if (s->lex.esclen < 4) { + if (lex->esclen < 4) { /* found a non-octal char */ if (*ptr < '0' || *ptr > '7') { /* pad sequence to three chars */ - switch (s->lex.esclen) { + switch (lex->esclen) { case 3: - s->lex.esc[3] = s->lex.esc[2]; - 
s->lex.esc[2] = s->lex.esc[1]; - s->lex.esc[1] = '0'; + lex->esc[3] = lex->esc[2]; + lex->esc[2] = lex->esc[1]; + lex->esc[1] = '0'; break; case 2: - s->lex.esc[3] = s->lex.esc[1]; - s->lex.esc[2] = '0'; - s->lex.esc[1] = '0'; + lex->esc[3] = lex->esc[1]; + lex->esc[2] = '0'; + lex->esc[1] = '0'; break; } - s->lex.esclen = 4; - buf_consume(s, ptr-- - s->lex.bufstart); + lex->esclen = 4; + buf_consume(lex, ptr-- - lex->bufstart); } /* append */ else { - s->lex.esc[s->lex.esclen++] = *ptr; - buf_consume(s, (ptr + 1) - s->lex.bufstart); + lex->esc[lex->esclen++] = *ptr; + buf_consume(lex, (ptr + 1) - lex->bufstart); } } - if (s->lex.esclen == 4) { - code = dec(s->lex.esc[1]) * 8 * 8 + - dec(s->lex.esc[2]) * 8 + - dec(s->lex.esc[3]); + if (lex->esclen == 4) { + code = dec(lex->esc[1]) * 8 * 8 + + dec(lex->esc[2]) * 8 + + dec(lex->esc[3]); - if (code > 255) { - uc_new_exception(s, s->source->off + s->lex.esclen + 1, "Syntax error: Invalid escape sequence"); + if (code > 255) + return emit_op(lex, lex->source->off + lex->esclen + 1, TK_ERROR, xjs_new_string("Invalid escape sequence")); - return 0; - } - - append_utf8(s, code); + append_utf8(lex, code); - s->lex.esclen = 0; - s->lex.is_escape = false; + lex->esclen = 0; + lex->is_escape = false; } break; @@ -569,29 +650,29 @@ parse_string(struct uc_state *s) /* terminating char */ else if (*ptr == q) { - lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart); - buf_consume(s, (ptr + 1) - s->lex.bufstart); + lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart); + buf_consume(lex, (ptr + 1) - lex->bufstart); - rv = lookbehind_to_text(s, s->lex.lastoff, T_STRING, NULL); + rv = lookbehind_to_text(lex, lex->lastoff, TK_STRING, NULL); if (!rv) - rv = emit_op(s, s->lex.lastoff, T_STRING, xjs_new_string_len("", 0)); + rv = emit_op(lex, lex->lastoff, TK_STRING, xjs_new_string_len("", 0)); return rv; } /* escape sequence start */ else if (*ptr == '\\') { - s->lex.is_escape = true; - lookbehind_append(s, s->lex.bufstart, 
ptr - s->lex.bufstart); - buf_consume(s, ptr - s->lex.bufstart); + lex->is_escape = true; + lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart); + buf_consume(lex, ptr - lex->bufstart); } } - lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart); - buf_consume(s, ptr - s->lex.bufstart); + lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart); + buf_consume(lex, ptr - lex->bufstart); - return 0; + return NULL; } @@ -614,90 +695,85 @@ enum { UT_LEX_PARSE_REGEX_FLAGS }; -static uint32_t -parse_regexp(struct uc_state *state) +static uc_token * +parse_regexp(uc_lexer *lex) { - struct json_object *pattern; - struct uc_op *op; - uint32_t rv; - char *err; + bool is_reg_global = false, is_reg_icase = false, is_reg_newline = false; + uc_token *rv; + size_t len; + char *s; - switch (state->lex.esc[0]) { + switch (lex->esc[0]) { case UT_LEX_PARSE_REGEX_INIT: - if (state->lex.expect_div) { - state->lex.expect_div = false; + if (lex->expect_div) { + lex->expect_div = false; - if (buf_startswith(state, "=")) { - buf_consume(state, 1); + if (buf_startswith(lex, "=")) { + buf_consume(lex, 1); - return emit_op(state, state->source->off, T_ASDIV, NULL); + return emit_op(lex, lex->source->off, TK_ASDIV, NULL); } - return emit_op(state, state->source->off, T_DIV, NULL); + return emit_op(lex, lex->source->off, TK_DIV, NULL); } - state->lex.esc[0] = UT_LEX_PARSE_REGEX_PATTERN; + lex->esc[0] = UT_LEX_PARSE_REGEX_PATTERN; break; case UT_LEX_PARSE_REGEX_PATTERN: - rv = parse_string(state); + rv = parse_string(lex); - if (rv != 0 && rv != UINT32_MAX) { - state->lex.lookbehind = (char *)OP(rv); - state->lex.esc[0] = UT_LEX_PARSE_REGEX_FLAGS; + if (rv && rv->type == TK_ERROR) + return rv; + + if (rv != NULL && rv != UC_LEX_CONTINUE_PARSING) { + lex->lookbehind = (char *)rv; + lex->esc[0] = UT_LEX_PARSE_REGEX_FLAGS; } break; case UT_LEX_PARSE_REGEX_FLAGS: - op = (struct uc_op *)state->lex.lookbehind; + rv = (uc_token *)lex->lookbehind; - while (state->lex.bufstart < 
state->lex.bufend) { - switch (state->lex.bufstart[0]) { + while (lex->bufstart < lex->bufend) { + switch (lex->bufstart[0]) { case 'g': - buf_consume(state, 1); - op->is_reg_global = true; + buf_consume(lex, 1); + is_reg_global = true; break; case 'i': - buf_consume(state, 1); - op->is_reg_icase = true; + buf_consume(lex, 1); + is_reg_icase = true; break; case 's': - buf_consume(state, 1); - op->is_reg_newline = true; + buf_consume(lex, 1); + is_reg_newline = true; break; default: - state->lex.lookbehind = NULL; - - pattern = uc_new_regexp(json_object_get_string(op->val), - op->is_reg_icase, - op->is_reg_newline, - op->is_reg_global, - &err); + lex->lookbehind = NULL; - json_object_put(op->val); + len = xasprintf(&s, "%c%*s", + (is_reg_global << 0) | (is_reg_icase << 1) | (is_reg_newline << 2), + json_object_get_string_len(rv->val), + json_object_get_string(rv->val)); - op->type = T_REGEXP; - op->val = pattern; + json_object_set_string_len(rv->val, s, len); + free(s); - if (!pattern) { - uc_new_exception(state, op->off, "Syntax error: %s", err); - free(err); + rv->type = TK_REGEXP; - return 0; - } - - return op - state->pool; + return rv; } } break; } - return 0; + return NULL; } @@ -711,50 +787,50 @@ parse_regexp(struct uc_state *state) * -UT_ERROR_OVERLONG_STRING Label too long */ -static uint32_t -parse_label(struct uc_state *s) +static uc_token * +parse_label(uc_lexer *lex) { - const struct token *tok = s->lex.tok; + const struct token *tok = lex->tok; const struct keyword *word; - uint32_t rv; + uc_token *rv; char *ptr; size_t i; - if (!s->lex.lookbehind && tok->plen) - lookbehind_append(s, tok->pat, tok->plen); + if (!lex->lookbehind && tok->plen) + lookbehind_append(lex, tok->pat, tok->plen); - if (!buf_remaining(s) || (s->lex.bufstart[0] != '_' && !isalnum(s->lex.bufstart[0]))) { + if (!buf_remaining(lex) || (lex->bufstart[0] != '_' && !isalnum(lex->bufstart[0]))) { for (i = 0, word = &reserved_words[0]; i < ARRAY_SIZE(reserved_words); i++, word = 
&reserved_words[i]) { - if (s->lex.lookbehindlen == word->plen && !strncmp(s->lex.lookbehind, word->pat, word->plen)) { - lookbehind_reset(s); + if (lex->lookbehindlen == word->plen && !strncmp(lex->lookbehind, word->pat, word->plen)) { + lookbehind_reset(lex); switch (word->type) { - case T_DOUBLE: - rv = emit_op(s, s->source->off - word->plen, word->type, uc_new_double(word->d)); + case TK_DOUBLE: + rv = emit_op(lex, lex->source->off - word->plen, word->type, uc_double_new(word->d)); break; - case T_BOOL: - rv = emit_op(s, s->source->off - word->plen, word->type, xjs_new_boolean(word->b)); + case TK_BOOL: + rv = emit_op(lex, lex->source->off - word->plen, word->type, xjs_new_boolean(word->b)); break; default: - rv = emit_op(s, s->source->off - word->plen, word->type, NULL); + rv = emit_op(lex, lex->source->off - word->plen, word->type, NULL); } return rv; } } - return lookbehind_to_text(s, s->source->off - s->lex.lookbehindlen, T_LABEL, NULL); + return lookbehind_to_text(lex, lex->source->off - lex->lookbehindlen, TK_LABEL, NULL); } - for (ptr = s->lex.bufstart; ptr < s->lex.bufend && (*ptr == '_' || isalnum(*ptr)); ptr++) + for (ptr = lex->bufstart; ptr < lex->bufend && (*ptr == '_' || isalnum(*ptr)); ptr++) ; - lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart); - buf_consume(s, ptr - s->lex.bufstart); + lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart); + buf_consume(lex, ptr - lex->bufstart); - return 0; + return NULL; } @@ -769,198 +845,206 @@ parse_label(struct uc_state *s) */ static inline bool -is_numeric_char(struct uc_state *s, char c) +is_numeric_char(uc_lexer *lex, char c) { - char prev = s->lex.lookbehindlen ? s->lex.lookbehind[s->lex.lookbehindlen-1] : 0; + char prev = lex->lookbehindlen ? lex->lookbehind[lex->lookbehindlen-1] : 0; if ((prev == 'e' || prev == 'E') && (c == '-' || c == '+')) return true; - return (isxdigit(c) || c == 'x' || c == 'X' || c == '.'); + return prev ? 
(isxdigit(c) || c == 'x' || c == 'X' || c == '.') : (isdigit(c) || c == '.'); } -static uint32_t -parse_number(struct uc_state *state) +static uc_token * +parse_number(uc_lexer *lex) { - uint32_t rv = 0; + const struct token *tok = lex->tok; + uc_token *rv = NULL; long long int n; char *ptr, *e; double d; - if (!buf_remaining(state) || !is_numeric_char(state, state->lex.bufstart[0])) { - lookbehind_append(state, "\0", 1); + if (!buf_remaining(lex) || !is_numeric_char(lex, lex->bufstart[0])) { + if (lex->lookbehindlen == 0 && !is_numeric_char(lex, lex->bufstart[0])) + return emit_op(lex, lex->source->off, TK_SUB, NULL); - n = strtoll(state->lex.lookbehind, &e, 0); + lookbehind_append(lex, "\0", 1); + + n = strtoll(lex->lookbehind, &e, 0); if (*e == '.' || *e == 'e' || *e == 'E') { - d = strtod(state->lex.lookbehind, &e); + d = strtod(lex->lookbehind, &e); + + if (tok->pat[0] == '-') + d = -d; - if (e > state->lex.lookbehind && *e == 0) - rv = emit_op(state, state->source->off - (e - state->lex.lookbehind), T_DOUBLE, uc_new_double(d)); + if (e > lex->lookbehind && *e == 0) + rv = emit_op(lex, lex->source->off - (e - lex->lookbehind), TK_DOUBLE, uc_double_new(d)); else - uc_new_exception(state, state->source->off - (state->lex.lookbehindlen - (e - state->lex.lookbehind) - 1), - "Syntax error: Invalid number literal"); + rv = emit_op(lex, lex->source->off - (lex->lookbehindlen - (e - lex->lookbehind) - 1), TK_ERROR, xjs_new_string("Invalid number literal")); } else if (*e == 0) { - rv = emit_op(state, state->source->off - (e - state->lex.lookbehind), T_NUMBER, xjs_new_int64(n)); - OP(rv)->is_overflow = (errno == ERANGE); + if (tok->pat[0] == '-') + n = (errno == ERANGE) ? 
INT64_MIN : -n; + + rv = emit_op(lex, lex->source->off - (e - lex->lookbehind), TK_NUMBER, xjs_new_int64(n)); + //OP(rv)->is_overflow = (errno == ERANGE); } else { - uc_new_exception(state, state->source->off - (state->lex.lookbehindlen - (e - state->lex.lookbehind) - 1), - "Syntax error: Invalid number literal"); + rv = emit_op(lex, lex->source->off - (lex->lookbehindlen - (e - lex->lookbehind) - 1), TK_ERROR, xjs_new_string("Invalid number literal")); } - lookbehind_reset(state); + lookbehind_reset(lex); return rv; } - for (ptr = state->lex.bufstart; ptr < state->lex.bufend && is_numeric_char(state, *ptr); ptr++) + for (ptr = lex->bufstart; ptr < lex->bufend && is_numeric_char(lex, *ptr); ptr++) ; - lookbehind_append(state, state->lex.bufstart, ptr - state->lex.bufstart); - buf_consume(state, ptr - state->lex.bufstart); + lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart); + buf_consume(lex, ptr - lex->bufstart); - return 0; + return NULL; } -static uint32_t -lex_step(struct uc_state *s, FILE *fp) +static uc_token * +lex_step(uc_lexer *lex, FILE *fp) { uint32_t masks[] = { 0, le32toh(0x000000ff), le32toh(0x0000ffff), le32toh(0x00ffffff), le32toh(0xffffffff) }; union { uint32_t n; char str[4]; } search; const struct token *tok; size_t rlen, rem; char *ptr, c; - uint32_t rv; + uc_token *rv; size_t i; /* only less than UT_LEX_MAX_TOKEN_LEN unreach buffer chars remaining, * move the remaining bytes to the beginning and read more data */ - if (buf_remaining(s) < UT_LEX_MAX_TOKEN_LEN) { - if (!s->lex.buf) { - s->lex.buflen = 128; - s->lex.buf = xalloc(s->lex.buflen); + if (buf_remaining(lex) < UT_LEX_MAX_TOKEN_LEN) { + if (!lex->buf) { + lex->buflen = 128; + lex->buf = xalloc(lex->buflen); } - rem = s->lex.bufend - s->lex.bufstart; + rem = lex->bufend - lex->bufstart; - memcpy(s->lex.buf, s->lex.bufstart, rem); + memcpy(lex->buf, lex->bufstart, rem); - rlen = fread(s->lex.buf + rem, 1, s->lex.buflen - rem, fp); + rlen = fread(lex->buf + rem, 1, lex->buflen - 
rem, fp); - s->lex.bufstart = s->lex.buf; - s->lex.bufend = s->lex.buf + rlen + rem; + lex->bufstart = lex->buf; + lex->bufend = lex->buf + rlen + rem; if (rlen == 0 && (ferror(fp) || feof(fp))) - s->lex.eof = 1; + lex->eof = 1; } - switch (s->lex.state) { + switch (lex->state) { case UT_LEX_IDENTIFY_BLOCK: /* previous block had strip trailing whitespace flag, skip leading whitespace */ - if (s->lex.skip_leading_whitespace) { - while (buf_remaining(s) && isspace(s->lex.bufstart[0])) - buf_consume(s, 1); + if (lex->skip_leading_whitespace) { + while (buf_remaining(lex) && isspace(lex->bufstart[0])) + buf_consume(lex, 1); - s->lex.skip_leading_whitespace = false; + lex->skip_leading_whitespace = false; } /* previous block was a statement block and trim_blocks is enabld, skip leading newline */ - else if (s->lex.skip_leading_newline) { - if (buf_startswith(s, "\n")) - buf_consume(s, 1); + else if (lex->skip_leading_newline) { + if (buf_startswith(lex, "\n")) + buf_consume(lex, 1); - s->lex.skip_leading_newline = false; + lex->skip_leading_newline = false; } /* scan forward through buffer to identify start token */ - for (ptr = s->lex.bufstart; ptr < s->lex.bufend - strlen("{#"); ptr++) { + for (ptr = lex->bufstart; ptr < lex->bufend - strlen("{#"); ptr++) { /* found start of comment block */ if (!strncmp(ptr, "{#", 2)) { - lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart); - buf_consume(s, (ptr + 2) - s->lex.bufstart); - s->lex.lastoff = s->source->off - 2; - s->lex.state = UT_LEX_BLOCK_COMMENT_START; + lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart); + buf_consume(lex, (ptr + 2) - lex->bufstart); + lex->lastoff = lex->source->off - 2; + lex->state = UT_LEX_BLOCK_COMMENT_START; - return 0; + return NULL; } /* found start of expression block */ else if (!strncmp(ptr, "{{", 2)) { - lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart); - buf_consume(s, (ptr + 2) - s->lex.bufstart); - s->lex.lastoff = s->source->off - 2; - s->lex.state = 
UT_LEX_BLOCK_EXPRESSION_START; + lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart); + buf_consume(lex, (ptr + 2) - lex->bufstart); + lex->lastoff = lex->source->off - 2; + lex->state = UT_LEX_BLOCK_EXPRESSION_START; - return 0; + return NULL; } /* found start of statement block */ else if (!strncmp(ptr, "{%", 2)) { - lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart); - buf_consume(s, (ptr + 2) - s->lex.bufstart); - s->lex.lastoff = s->source->off - 2; - s->lex.state = UT_LEX_BLOCK_STATEMENT_START; + lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart); + buf_consume(lex, (ptr + 2) - lex->bufstart); + lex->lastoff = lex->source->off - 2; + lex->state = UT_LEX_BLOCK_STATEMENT_START; - return 0; + return NULL; } } /* we're at eof */ - if (s->lex.eof) { - lookbehind_append(s, ptr, s->lex.bufend - ptr); - s->lex.state = UT_LEX_EOF; + if (lex->eof) { + lookbehind_append(lex, ptr, lex->bufend - ptr); + lex->state = UT_LEX_EOF; - return lookbehind_to_text(s, s->lex.lastoff, T_TEXT, NULL); + return lookbehind_to_text(lex, lex->lastoff, TK_TEXT, NULL); } - lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart); - buf_consume(s, ptr - s->lex.bufstart); + lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart); + buf_consume(lex, ptr - lex->bufstart); break; case UT_LEX_BLOCK_COMMENT_START: case UT_LEX_BLOCK_EXPRESSION_START: case UT_LEX_BLOCK_STATEMENT_START: - rv = 0; - s->lex.skip_leading_whitespace = 0; + rv = NULL; + lex->skip_leading_whitespace = 0; /* strip whitespace before block */ - if (buf_startswith(s, "-")) { - rv = lookbehind_to_text(s, s->source->off, T_TEXT, " \n\t\v\f\r"); - buf_consume(s, 1); + if (buf_startswith(lex, "-")) { + rv = lookbehind_to_text(lex, lex->source->off, TK_TEXT, " \n\t\v\f\r"); + buf_consume(lex, 1); } /* disable lstrip flag (only valid for statement blocks) */ - else if (s->lex.state == UT_LEX_BLOCK_STATEMENT_START) { + else if (lex->state == UT_LEX_BLOCK_STATEMENT_START) { /* disable lstrip flag */ - 
if (buf_startswith(s, "+")) { - rv = lookbehind_to_text(s, s->source->off, T_TEXT, NULL); - buf_consume(s, 1); + if (buf_startswith(lex, "+")) { + rv = lookbehind_to_text(lex, lex->source->off, TK_TEXT, NULL); + buf_consume(lex, 1); } /* global block lstrip */ - else if (s->lstrip_blocks) { - rv = lookbehind_to_text(s, s->source->off, T_TEXT, " \t\v\f\r"); + else if (lex->config && lex->config->lstrip_blocks) { + rv = lookbehind_to_text(lex, lex->source->off, TK_TEXT, " \t\v\f\r"); } } else { - rv = lookbehind_to_text(s, s->source->off, T_TEXT, NULL); + rv = lookbehind_to_text(lex, lex->source->off, TK_TEXT, NULL); } - switch (s->lex.state) { + switch (lex->state) { case UT_LEX_BLOCK_COMMENT_START: - s->lex.state = UT_LEX_BLOCK_COMMENT; + lex->state = UT_LEX_BLOCK_COMMENT; break; case UT_LEX_BLOCK_STATEMENT_START: - s->lex.within_statement_block = 1; - s->lex.state = UT_LEX_IDENTIFY_TOKEN; + lex->within_statement_block = 1; + lex->state = UT_LEX_IDENTIFY_TOKEN; break; case UT_LEX_BLOCK_EXPRESSION_START: - s->lex.state = UT_LEX_BLOCK_EXPRESSION_EMIT_TAG; + lex->state = UT_LEX_BLOCK_EXPRESSION_EMIT_TAG; break; default: @@ -972,152 +1056,151 @@ lex_step(struct uc_state *s, FILE *fp) case UT_LEX_BLOCK_COMMENT: /* scan forward through buffer to identify end token */ - while (s->lex.bufstart < s->lex.bufend - 2) { - if (buf_startswith(s, "-#}")) { - s->lex.state = UT_LEX_IDENTIFY_BLOCK; - s->lex.skip_leading_whitespace = 1; - buf_consume(s, 3); - s->lex.lastoff = s->source->off; + while (lex->bufstart < lex->bufend - 2) { + if (buf_startswith(lex, "-#}")) { + lex->state = UT_LEX_IDENTIFY_BLOCK; + lex->skip_leading_whitespace = 1; + buf_consume(lex, 3); + lex->lastoff = lex->source->off; break; } - else if (buf_startswith(s, "#}")) { - s->lex.state = UT_LEX_IDENTIFY_BLOCK; - s->lex.skip_leading_whitespace = 0; - buf_consume(s, 2); - s->lex.lastoff = s->source->off; + else if (buf_startswith(lex, "#}")) { + lex->state = UT_LEX_IDENTIFY_BLOCK; + lex->skip_leading_whitespace 
= 0; + buf_consume(lex, 2); + lex->lastoff = lex->source->off; break; } - buf_consume(s, 1); + buf_consume(lex, 1); } /* we're at eof */ - if (s->lex.eof) - uc_new_exception(s, s->lex.lastoff, "Syntax error: Unterminated template block"); + if (lex->eof) { + lex->state = UT_LEX_EOF; + + buf_consume(lex, lex->bufend - lex->bufstart); + + return emit_op(lex, lex->lastoff, TK_ERROR, xjs_new_string("Unterminated template block")); + } break; case UT_LEX_BLOCK_EXPRESSION_EMIT_TAG: - s->lex.within_expression_block = 1; - s->lex.state = UT_LEX_IDENTIFY_TOKEN; + lex->within_expression_block = 1; + lex->state = UT_LEX_IDENTIFY_TOKEN; - return emit_op(s, s->source->off, T_LEXP, NULL); + return emit_op(lex, lex->source->off, TK_LEXP, NULL); case UT_LEX_IDENTIFY_TOKEN: /* skip leading whitespace */ - for (i = 0; i < buf_remaining(s) && isspace(s->lex.bufstart[i]); i++) + for (i = 0; i < buf_remaining(lex) && isspace(lex->bufstart[i]); i++) ; - buf_consume(s, i); + buf_consume(lex, i); - if (i > 0 && buf_remaining(s) < UT_LEX_MAX_TOKEN_LEN) - return 0; + if (i > 0 && buf_remaining(lex) < UT_LEX_MAX_TOKEN_LEN) + return NULL; for (i = 0; i < sizeof(search.str); i++) - search.str[i] = (i < buf_remaining(s)) ? s->lex.bufstart[i] : 0; + search.str[i] = (i < buf_remaining(lex)) ? lex->bufstart[i] : 0; for (i = 0, tok = tokens; i < ARRAY_SIZE(tokens); tok = &tokens[++i]) { /* remaining buffer data is shorter than token, skip */ - if (tok->plen > buf_remaining(s)) + if (tok->plen > buf_remaining(lex)) continue; - c = s->lex.bufstart[0]; + c = buf_remaining(lex) ? lex->bufstart[0] : 0; if (tok->plen ? 
((search.n & masks[tok->plen]) == tok->patn) : (c >= tok->pat[0] && c <= tok->pat[1])) { - buf_consume(s, tok->plen); - - s->lex.lastoff = s->source->off - tok->plen; + lex->lastoff = lex->source->off; /* token has a parse method, switch state */ if (tok->parse) { - s->lex.tok = tok; - s->lex.state = UT_LEX_PARSE_TOKEN; + lex->tok = tok; + lex->state = UT_LEX_PARSE_TOKEN; - return 0; + buf_consume(lex, tok->plen); + + return NULL; } /* disallow nesting blocks */ - if ((s->lex.within_expression_block && - (tok->type == T_LSTM || tok->type == T_RSTM || tok->type == T_LEXP)) || - (s->lex.within_statement_block && - (tok->type == T_LEXP || tok->type == T_REXP || tok->type == T_LSTM))) { - uc_new_exception(s, s->source->off - tok->plen, "Syntax error: Template blocks may not be nested"); + if ((lex->within_expression_block && + (tok->type == TK_LSTM || tok->type == TK_RSTM || tok->type == TK_LEXP)) || + (lex->within_statement_block && + (tok->type == TK_LEXP || tok->type == TK_REXP || tok->type == TK_LSTM))) { + buf_consume(lex, tok->plen); - return 0; + return emit_op(lex, lex->source->off - tok->plen, TK_ERROR, xjs_new_string("Template blocks may not be nested")); } /* found end of block */ - else if ((s->lex.within_statement_block && tok->type == T_RSTM) || - (s->lex.within_expression_block && tok->type == T_REXP)) { + else if ((lex->within_statement_block && tok->type == TK_RSTM) || + (lex->within_expression_block && tok->type == TK_REXP)) { /* emit additional empty statement (semicolon) at end of template block */ - if (!s->lex.semicolon_emitted) { - s->lex.semicolon_emitted = true; - - /* rewind */ - buf_consume(s, -tok->plen); + if (!lex->semicolon_emitted) { + lex->semicolon_emitted = true; - return emit_op(s, s->source->off, T_SCOL, NULL); + return emit_op(lex, lex->source->off, TK_SCOL, NULL); } /* strip whitespace after block */ if (tok->pat[0] == '-') - s->lex.skip_leading_whitespace = true; + lex->skip_leading_whitespace = true; /* strip newline after 
statement block */ - else if (s->lex.within_statement_block && s->trim_blocks) - s->lex.skip_leading_newline = true; - - s->lex.semicolon_emitted = false; - s->lex.within_statement_block = false; - s->lex.within_expression_block = false; - s->lex.state = UT_LEX_IDENTIFY_BLOCK; - s->lex.lastoff = s->source->off; + else if (lex->within_statement_block && + lex->config && lex->config->trim_blocks) + lex->skip_leading_newline = true; + + lex->semicolon_emitted = false; + lex->within_statement_block = false; + lex->within_expression_block = false; + lex->state = UT_LEX_IDENTIFY_BLOCK; } /* do not report statement tags to the parser */ - if (tok->type != 0 && tok->type != T_LSTM && tok->type != T_RSTM) - rv = emit_op(s, s->source->off - tok->plen, tok->type, NULL); + if (tok->type != 0 && tok->type != TK_LSTM && tok->type != TK_RSTM) + rv = emit_op(lex, lex->source->off, tok->type, NULL); else - rv = 0; + rv = NULL; + + buf_consume(lex, tok->plen); return rv; } } /* no token matched and we do have remaining data, junk */ - if (buf_remaining(s)) { - uc_new_exception(s, s->source->off, "Syntax error: Unexpected character"); - - return 0; - } + if (buf_remaining(lex)) + return emit_op(lex, lex->source->off, TK_ERROR, xjs_new_string("Unexpected character")); /* we're at eof, allow unclosed statement blocks */ - if (s->lex.within_statement_block) { - s->lex.state = UT_LEX_EOF; + if (lex->within_statement_block) { + lex->state = UT_LEX_EOF; - return 0; + return NULL; } /* premature EOF */ - uc_new_exception(s, s->source->off, "Syntax error: Unterminated template block"); - - break; + return emit_op(lex, lex->source->off, TK_ERROR, xjs_new_string("Unterminated template block")); case UT_LEX_PARSE_TOKEN: - tok = s->lex.tok; - rv = tok->parse(s); + tok = lex->tok; + rv = tok->parse(lex); if (rv) { - memset(s->lex.esc, 0, sizeof(s->lex.esc)); - s->lex.state = UT_LEX_IDENTIFY_TOKEN; - s->lex.tok = NULL; + memset(lex->esc, 0, sizeof(lex->esc)); + lex->state = UT_LEX_IDENTIFY_TOKEN; 
+ lex->tok = NULL; - if (rv == UINT32_MAX) - rv = 0; + if (rv == UC_LEX_CONTINUE_PARSING) + rv = NULL; return rv; } @@ -1129,25 +1212,66 @@ lex_step(struct uc_state *s, FILE *fp) break; } - return 0; + return NULL; } -uint32_t -uc_get_token(struct uc_state *s, FILE *fp) +void +uc_lexer_init(uc_lexer *lex, uc_parse_config *config, uc_source *source) { - uint32_t rv; + lex->state = UT_LEX_IDENTIFY_BLOCK; - while (s->lex.state != UT_LEX_EOF) { - rv = lex_step(s, fp); + lex->config = config; + lex->source = uc_source_get(source); - if (rv == 0 && s->exception) - break; + lex->eof = 0; + lex->skip_leading_whitespace = 0; + lex->skip_leading_newline = 0; + lex->within_statement_block = 0; + lex->within_statement_block = 0; + lex->semicolon_emitted = 0; + lex->expect_div = 0; + lex->is_escape = 0; + + lex->buflen = 0; + lex->buf = NULL; + lex->bufstart = NULL; + lex->bufend = NULL; + + lex->lookbehindlen = 0; + lex->lookbehind = NULL; + + lex->tok = NULL; + + lex->esclen = 0; + memset(lex->esc, 0, sizeof(lex->esc)); + + lex->lead_surrogate = 0; + + lex->lastoff = 0; +} + +void +uc_lexer_free(uc_lexer *lex) +{ + uc_source_put(lex->source); + + free(lex->lookbehind); + free(lex->buf); +} + +uc_token * +uc_lexer_next_token(uc_lexer *lex) +{ + uc_token *rv; + + while (lex->state != UT_LEX_EOF) { + rv = lex_step(lex, lex->source->fp); - if (rv > 0) + if (rv != NULL) return rv; } - return 0; + return emit_op(lex, lex->source->off, TK_EOF, NULL); } const char * @@ -1158,11 +1282,11 @@ uc_get_tokenname(int type) switch (type) { case 0: return "End of file"; - case T_STRING: return "String"; - case T_LABEL: return "Label"; - case T_NUMBER: return "Number"; - case T_DOUBLE: return "Double"; - case T_REGEXP: return "Regexp"; + case TK_STRING: return "String"; + case TK_LABEL: return "Label"; + case TK_NUMBER: return "Number"; + case TK_DOUBLE: return "Double"; + case TK_REGEXP: return "Regexp"; } for (i = 0; i < ARRAY_SIZE(tokens); i++) { |