diff options
-rw-r--r-- | ast.c | 29 | ||||
-rw-r--r-- | ast.h | 41 | ||||
-rw-r--r-- | eval.c | 2 | ||||
-rw-r--r-- | lexer.c | 1380 | ||||
-rw-r--r-- | lexer.h | 7 | ||||
-rw-r--r-- | lib.c | 12 | ||||
-rw-r--r-- | main.c | 4 | ||||
-rw-r--r-- | parser.y | 4 | ||||
-rw-r--r-- | tests/02_runtime/04_switch_case | 8 |
9 files changed, 845 insertions, 642 deletions
@@ -69,7 +69,6 @@ ut_new_op(struct ut_state *s, int type, struct json_object *val, ...) newop->is_first = !s->poolsize; newop->is_op = true; - newop->off = s->off; newop->type = type; newop->val = val; @@ -478,15 +477,16 @@ ut_parent_scope(struct ut_scope *scope) static void ut_reset(struct ut_state *s) { - s->semicolon_emitted = false; - s->start_tag_seen = false; - s->blocktype = UT_BLOCK_NONE; - s->off = 0; - if (s->error.code == UT_ERROR_EXCEPTION) json_object_put(s->error.info.exception); + else if (s->error.code == UT_ERROR_INVALID_REGEXP) + free(s->error.info.regexp_error); memset(&s->error, 0, sizeof(s->error)); + + free(s->lex.lookbehind); + free(s->lex.buf); + memset(&s->lex, 0, sizeof(s->lex)); } void @@ -532,11 +532,9 @@ ut_free(struct ut_state *s) enum ut_error_type ut_parse(struct ut_state *s, const char *expr) { - int len = strlen(expr); - const char *ptr = expr; + FILE *fp = fmemopen((char *)expr, strlen(expr), "r"); struct ut_op *op; void *pParser; - int mlen = 0; uint32_t off; if (!s) @@ -549,23 +547,18 @@ ut_parse(struct ut_state *s, const char *expr) if (!pParser) return UT_ERROR_OUT_OF_MEMORY; - while (len > 0) { - off = ut_get_token(s, ptr, &mlen); + while (s->lex.state != UT_LEX_EOF) { + off = ut_get_token(s, fp); op = ut_get_op(s, off); - if (mlen < 0) { - s->error.code = -mlen; + if (s->error.code) goto out; - } if (op) Parse(pParser, op->type, off, s); if (s->error.code) goto out; - - len -= mlen; - ptr += mlen; } Parse(pParser, 0, 0, s); @@ -573,6 +566,8 @@ ut_parse(struct ut_state *s, const char *expr) out: ParseFree(pParser, free); + fclose(fp); + return s->error.code; } @@ -23,6 +23,7 @@ #include <stdio.h> #include <stdbool.h> #include <stdarg.h> +#include <string.h> #ifdef JSONC #include <json.h> @@ -47,11 +48,16 @@ enum ut_error_type { UT_ERROR_EXCEPTION }; -enum ut_block_type { - UT_BLOCK_NONE, - UT_BLOCK_STATEMENT, - UT_BLOCK_EXPRESSION, - UT_BLOCK_COMMENT +enum ut_lex_state { + UT_LEX_IDENTIFY_BLOCK, + UT_LEX_BLOCK_COMMENT_START, + UT_LEX_BLOCK_EXPRESSION_START, + UT_LEX_BLOCK_EXPRESSION_EMIT_TAG, + UT_LEX_BLOCK_STATEMENT_START, + UT_LEX_BLOCK_COMMENT, + UT_LEX_IDENTIFY_TOKEN, + UT_LEX_PARSE_TOKEN, + UT_LEX_EOF }; struct ut_op { @@ -98,16 +104,31 @@ struct ut_state { struct ut_op *pool; uint32_t poolsize; uint32_t main; - uint8_t semicolon_emitted:1; - uint8_t start_tag_seen:1; uint8_t srand_called:1; uint8_t trim_blocks:1; uint8_t lstrip_blocks:1; uint8_t strict_declarations:1; uint8_t skip_shebang:1; - uint8_t expect_div:1; - size_t off; - enum ut_block_type blocktype; + struct { + enum ut_lex_state state; + uint8_t eof:1; + uint8_t skip_leading_whitespace:1; + uint8_t skip_leading_newline:1; + uint8_t within_expression_block:1; + uint8_t within_statement_block:1; + uint8_t semicolon_emitted:1; + uint8_t expect_div:1; + uint8_t is_escape:1; + size_t buflen; + char *buf, *bufstart, *bufend; + size_t lookbehindlen; + char *lookbehind; + const void *tok; + char esc[5]; + uint8_t esclen; + int lead_surrogate; + size_t off, lastoff; + } lex; struct { enum ut_error_type code; union { @@ -1019,7 +1019,7 @@ ut_invoke(struct ut_state *state, uint32_t off, struct json_object *this, json_object_put(rv); rv = ut_exception(state, ut_get_off(state, tag), "Syntax error: %s statement must be inside loop", - tokennames[tag->type]); + ut_get_tokenname(tag->type)); break; case T_RETURN: @@ -25,6 +25,7 @@ #include <errno.h> #include "ast.h" +#include "lib.h" #include "lexer.h" #include "parser.h" @@ -33,7 +34,11 @@ struct token { int type; const char *pat; int plen; - int (*parse)(const char *buf, struct ut_op *op, struct ut_state *s); + union { + uint32_t (*parse)(struct ut_state *s); + double d; + bool b; + }; }; #define dec(o) \ @@ -43,12 +48,11 @@ struct token { (((x) >= 'a') ? (10 + (x) - 'a') : \ (((x) >= 'A') ? (10 + (x) - 'A') : dec(x))) -static int parse_comment(const char *, struct ut_op *, struct ut_state *); -static int parse_string(const char *, struct ut_op *, struct ut_state *); -static int parse_regexp(const char *, struct ut_op *, struct ut_state *); -static int parse_number(const char *, struct ut_op *, struct ut_state *); -static int parse_label(const char *, struct ut_op *, struct ut_state *); -static int parse_bool(const char *, struct ut_op *, struct ut_state *); +static uint32_t parse_comment(struct ut_state *); +static uint32_t parse_string(struct ut_state *); +static uint32_t parse_regexp(struct ut_state *); +static uint32_t parse_number(struct ut_state *); +static uint32_t parse_label(struct ut_state *); static const struct token tokens[] = { { 0, " ", 1 }, @@ -81,8 +85,8 @@ static const struct token tokens[] = { { T_GE, ">=", 2 }, { T_LSHIFT, "<<", 2 }, { T_RSHIFT, ">>", 2 }, - { 0, "//", 2, parse_comment }, - { 0, "/*", 2, parse_comment }, + { 0, "//", 2, { .parse = parse_comment } }, + { 0, "/*", 2, { .parse = parse_comment } }, { T_OR, "||", 2 }, { T_LEXP, "{{", 2 }, { T_REXP, "}}", 2 }, @@ -112,18 +116,18 @@ static const struct token tokens[] = { { T_SCOL, ";", 1 }, { T_SUB, "-", 1 }, { T_DOT, ".", 1 }, - { T_STRING, "'", 1, parse_string }, - { T_STRING, "\"", 1, parse_string }, - { T_REGEXP, "/", 1, parse_regexp }, - { T_LABEL, "_", 1, parse_label }, - { T_LABEL, "az", 0, parse_label }, - { T_LABEL, "AZ", 0, parse_label }, - { T_NUMBER, "09", 0, parse_number }, + { T_STRING, "'", 1, { .parse = parse_string } }, + { T_STRING, "\"", 1, { .parse = parse_string } }, + { T_REGEXP, "/", 1, { .parse = parse_regexp } }, + { T_LABEL, "_", 1, { .parse = parse_label } }, + { T_LABEL, "az", 0, { .parse = parse_label } }, + { T_LABEL, "AZ", 0, { .parse = parse_label } }, + { T_NUMBER, "09", 0, { .parse = parse_number } }, }; static const struct token reserved_words[] = { { T_ENDFUNC, "endfunction", 11 }, - { T_NUMBER, "Infinity", 8, parse_number }, + { T_DOUBLE, "Infinity", 8, { .d = INFINITY } }, { T_CONTINUE, "continue", 8 }, { T_ENDWHILE, "endwhile", 8 }, { T_FUNC, "function", 8 }, @@ -136,101 +140,19 @@ static const struct token reserved_words[] = { { T_WHILE, "while", 5 }, { T_BREAK, "break", 5 }, { T_CATCH, "catch", 5 }, - { T_BOOL, "false", 5, parse_bool }, - { T_BOOL, "true", 4, parse_bool }, + { T_BOOL, "false", 5, { .b = false } }, + { T_BOOL, "true", 4, { .b = true } }, { T_ELSE, "else", 4 }, { T_THIS, "this", 4 }, { T_NULL, "null", 4 }, { T_CASE, "case", 4 }, - { T_NUMBER, "NaN", 3, parse_number }, + { T_DOUBLE, "NaN", 3, { .d = NAN } }, { T_TRY, "try", 3 }, { T_FOR, "for", 3 }, { T_IF, "if", 2 }, { T_IN, "in", 2 }, }; -const char *tokennames[__T_MAX] = { - [0] = "End of file", - [T_FUNC] = "'function'", - [T_LOCAL] = "'local'", - [T_WHILE] = "'while", - [T_ELSE] = "'else'", - [T_FOR] = "'for'", - [T_IF] = "'if'", - [T_IN] = "'in'", - [T_ASLEFT] = "'x<<=y'", - [T_ASRIGHT] = "'x>>=y'", - [T_AND] = "'x&&y'", - [T_ASADD] = "'x+=y'", - [T_ASBAND] = "'x&=y'", - [T_ASBOR] = "'x|=y'", - [T_ASBXOR] = "'x^=y'", - [T_ASDIV] = "'x/=y'", - [T_ASMOD] = "'x%=y'", - [T_ASMUL] = "'x*=y'", - [T_ASSUB] = "'x-=y'", - [T_DEC] = "'x--'", - [T_INC] = "'x++'", - [T_EQ] = "'x==y'", - [T_NE] = "'x!=y'", - [T_EQS] = "'x===y'", - [T_NES] = "'x!==y'", - [T_LE] = "'x<=y'", - [T_GE] = "'x>=y'", - [T_LSHIFT] = "'x<<y'", - [T_RSHIFT] = "'x>>y'", - [T_LEXP] = "'{{'", - [T_REXP] = "'}}'", - [T_OR] = "'x||y'", - [T_ADD] = "'x+y'", - [T_ASSIGN] = "'x=y'", - [T_BAND] = "'x&y'", - [T_BOR] = "'x|y'", - [T_LBRACK] = "'['", - [T_RBRACK] = "']'", - [T_BXOR] = "'x^y'", - [T_LBRACE] = "'{'", - [T_RBRACE] = "'}'", - [T_COLON] = "':'", - [T_COMMA] = "','", - [T_COMPL] = "'~x'", - [T_DIV] = "'x/y'", - [T_GT] = "'x>y'", - [T_NOT] = "'!x'", - [T_LT] = "'x<y'", - [T_MOD] = "'x%y'", - [T_MUL] = "'x*y'", - [T_LPAREN] = "'('", - [T_RPAREN] = "')'", - [T_QMARK] = "'?'", - [T_SCOL] = "';'", - [T_SUB] = "'x-y'", - [T_DOT] = "'.'", - [T_STRING] = "String", - [T_LABEL] = "Label", - [T_NUMBER] = "Number", - [T_DOUBLE] = "Double", - [T_BOOL] = "Bool", - [T_REGEXP] = "Regexp", - [T_TEXT] = "Text", - [T_ENDIF] = "'endif'", - [T_ENDFOR] = "'endfor'", - [T_ENDWHILE] = "'endwhile'", - [T_ENDFUNC] = "'endfuncton'", - [T_RETURN] = "'return'", - [T_BREAK] = "'break'", - [T_CONTINUE] = "'continue'", - [T_NULL] = "'null'", - [T_THIS] = "'this'", - [T_TRY] = "'try'", - [T_CATCH] = "'catch'", - [T_SWITCH] = "'switch'", - [T_CASE] = "'case'", - [T_DEFAULT] = "'default'", - //[T_LSTM] = "'{%'", - //[T_RSTM] = "'%}'" -}; - /* * Stores the given codepoint as a utf8 multibyte sequence into the given @@ -286,6 +208,117 @@ utf8enc(char **out, int *rem, int code) return true; } +/* length of the longest token in our lookup table */ +#define UT_LEX_MAX_TOKEN_LEN 3 + +static uint32_t emit_op(struct ut_state *s, uint32_t pos, int type, struct json_object *val) +{ + uint32_t off = ut_new_op(s, type, val, UINT32_MAX); + struct ut_op *op = ut_get_op(s, off); + + op->off = pos; + + /* Follow JSLint logic and treat a slash after any of the + * `(,=:[!&|?{};` characters as the beginning of a regex + * literal... */ + switch (type) { + case T_LPAREN: + case T_COMMA: + + case T_ASADD: + case T_ASBAND: + case T_ASBOR: + case T_ASBXOR: + case T_ASDIV: + case T_ASLEFT: + case T_ASMOD: + case T_ASMUL: + case T_ASRIGHT: + case T_ASSIGN: + case T_ASSUB: + case T_EQ: + case T_EQS: + case T_GE: + case T_LE: + case T_NE: + case T_NES: + + case T_COLON: + case T_LBRACK: + case T_NOT: + + case T_AND: + case T_BAND: + + case T_OR: + case T_BOR: + + case T_QMARK: + + case T_LBRACE: + case T_RBRACE: + + case T_LSTM: + case T_LEXP: + + case T_SCOL: + s->lex.expect_div = false; + break; + + default: + s->lex.expect_div = true; + } + + return off; +} + +static void lookbehind_append(struct ut_state *s, const char *data, size_t len) +{ + if (len) { + s->lex.lookbehind = xrealloc(s->lex.lookbehind, s->lex.lookbehindlen + len); + memcpy(s->lex.lookbehind + s->lex.lookbehindlen, data, len); + s->lex.lookbehindlen += len; + } +} + +static void lookbehind_reset(struct ut_state *s) { + free(s->lex.lookbehind); + s->lex.lookbehind = NULL; + s->lex.lookbehindlen = 0; +} + +static uint32_t lookbehind_to_text(struct ut_state *s, uint32_t pos, int type, const char *strip_trailing_chars) { + uint32_t rv = 0; + + if (s->lex.lookbehind) { + if (strip_trailing_chars) { + while (s->lex.lookbehindlen > 0 && strchr(strip_trailing_chars, s->lex.lookbehind[s->lex.lookbehindlen-1])) + s->lex.lookbehindlen--; + } + + rv = emit_op(s, pos, type, xjs_new_string_len(s->lex.lookbehind, s->lex.lookbehindlen)); + + lookbehind_reset(s); + } + + return rv; +} + +static inline size_t buf_remaining(struct ut_state *s) { + return (s->lex.bufend - s->lex.bufstart); +} + +static inline bool _buf_startswith(struct ut_state *s, const char *str, size_t len) { + return (buf_remaining(s) >= len && !strncmp(s->lex.bufstart, str, len)); +} + +#define buf_startswith(s, str) _buf_startswith(s, str, sizeof(str) - 1) + +static void buf_consume(struct ut_state *s, ssize_t len) { + s->lex.bufstart += len; + s->lex.off += len; +} + /* * Parses a comment from the given buffer. * @@ -296,28 +329,50 @@ utf8enc(char **out, int *rem, int code) * -UT_ERROR_UNTERMINATED_COMMENT Unterminated string */ -static int -parse_comment(const char *buf, struct ut_op *op, struct ut_state *s) +static uint32_t +parse_comment(struct ut_state *s) { - const char *p = buf; + const struct token *tok = s->lex.tok; + const char *ptr, *end; + size_t elen; - /* single line comment */ - if (p[0] == '/' && p[1] == '/') { - while (*p != 0 && *p != '\n') - p++; + if (!buf_remaining(s)) { + s->error.code = UT_ERROR_UNTERMINATED_COMMENT; - return (p - buf); + return 0; } - /* multi line comment */ - while (*p) { - if (p[0] == '*' && p[1] == '/') - break; + if (!strcmp(tok->pat, "//")) { + end = "\n"; + elen = 1; + } + else { + end = "*/"; + elen = 2; + } + + for (ptr = s->lex.bufstart; ptr < s->lex.bufend - elen; ptr++) { + if (!strncmp(ptr, end, elen)) { + buf_consume(s, (ptr - s->lex.bufstart) + elen); - p++; + return UINT32_MAX; + } } - return *p ? (p - buf) + 2 : -UT_ERROR_UNTERMINATED_COMMENT; + buf_consume(s, ptr - s->lex.bufstart); + + return 0; +} + +static void append_utf8(struct ut_state *s, int code) { + char ustr[8], *up; + int rem; + + up = ustr; + rem = sizeof(ustr); + + if (utf8enc(&up, &rem, code)) + lookbehind_append(s, ustr, up - ustr); } /* @@ -332,269 +387,236 @@ parse_comment(const char *buf, struct ut_op *op, struct ut_state *s) * -UT_ERROR_OVERLONG_STRING String literal too long */ -static int -parse_string(const char *buf, struct ut_op *op, struct ut_state *s) +static uint32_t +parse_string(struct ut_state *s) { - char q = *(buf++); - char str[128] = { 0 }; - char *out = str; - const char *in = buf; - bool esc = false; - int rem = sizeof(str) - 1; - int lead_surrogate = 0; + const struct token *tok = s->lex.tok; + char q = tok->pat[0]; + char *ptr, *c; + uint32_t rv; int code; - while (*in) { - /* continuation of escape sequence */ - if (esc) { - /* \uFFFF */ - if (in[0] == 'u') { - if (isxdigit(in[1]) && isxdigit(in[2]) && - isxdigit(in[3]) && isxdigit(in[4])) { - code = hex(in[1]) * 16 * 16 * 16 + - hex(in[2]) * 16 * 16 + - hex(in[3]) * 16 + - hex(in[4]); - - /* is a leading surrogate value */ - if ((code & 0xFC00) == 0xD800) { - /* found a subsequent leading surrogate, ignore and emit replacement char for previous one */ - if (lead_surrogate) { - if (!utf8enc(&out, &rem, 0xFFFD)) { - s->off += (in - buf); - - return -UT_ERROR_OVERLONG_STRING; - } - } + if (!buf_remaining(s)) { + s->error.code = UT_ERROR_UNTERMINATED_STRING; + s->lex.off = s->lex.lastoff; - /* store surrogate value and advance to next escape sequence */ - lead_surrogate = code; - goto next; - } + return 0; + } - /* is a trailing surrogate value */ - else if ((code & 0xFC00) == 0xDC00) { - /* found a trailing surrogate following a leading one, combine and encode */ - if (lead_surrogate) { - code = 0x10000 + ((lead_surrogate & 0x3FF) << 10) + (code & 0x3FF); - lead_surrogate = 0; - } + for (ptr = s->lex.bufstart; ptr < s->lex.bufend; ptr++) { + /* continuation of escape sequence */ + if (s->lex.is_escape) { + if (s->lex.esclen == 0) { + /* non-unicode escape following a lead surrogate, emit replacement... */ + if (s->lex.lead_surrogate && *ptr != 'u') { + append_utf8(s, 0xFFFD); + s->lex.lead_surrogate = 0; + } - /* trailing surrogate not following a leading one, ignore and use replacement char */ - else { - code = 0xFFFD; - } + switch (*ptr) { + case 'u': + case 'x': + s->lex.esc[s->lex.esclen++] = *ptr; + break; + + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + /* regex mode => backref, retain literally */ + if (q == '/') { + s->lex.is_escape = false; + lookbehind_append(s, "\\", 1); + lookbehind_append(s, ptr, 1); + buf_consume(s, (ptr + 1) - s->lex.bufstart); } - if (!utf8enc(&out, &rem, code)) { - s->off += (in - buf); + /* string mode => likely octal */ + else if (*ptr < '8') { + s->lex.esc[s->lex.esclen++] = 'o'; + s->lex.esc[s->lex.esclen++] = *ptr; + } - return -UT_ERROR_OVERLONG_STRING; + /* non-octal char, add verbatim */ + else { + s->lex.is_escape = false; + lookbehind_append(s, ptr, 1); + buf_consume(s, (ptr + 1) - s->lex.bufstart); } -next: - in += 5; - } - else { - s->off += (in - buf); + break; - return -UT_ERROR_INVALID_ESCAPE; + default: + s->lex.is_escape = false; + c = strchr("a\ab\be\ef\fn\nr\rt\tv\v", *ptr); + lookbehind_append(s, c ? c + 1 : ptr, 1); + buf_consume(s, (ptr + 1) - s->lex.bufstart); + break; } } - - /* other escape sequences */ else { - /* found any non-utf8 escape sequence following a leading unicode surrogate, - emit replacement character and skip surrogate. */ - if (lead_surrogate) { - if (!utf8enc(&out, &rem, 0xFFFD)) { - s->off += (in - buf); - - return -UT_ERROR_OVERLONG_STRING; - } - - lead_surrogate = 0; - } - - /* \xFF */ - if (in[0] == 'x') { - if (isxdigit(in[1]) && isxdigit(in[2])) { - if (!utf8enc(&out, &rem, hex(in[1]) * 16 + hex(in[2]))) { - s->off += (in - buf); - - return -UT_ERROR_OVERLONG_STRING; + switch (s->lex.esc[0]) { + case 'u': + if (s->lex.esclen < 5) { + if (!isxdigit(*ptr)) { + s->lex.off += s->lex.esclen + 1; + s->error.code = UT_ERROR_INVALID_ESCAPE; + + return 0; } - in += 3; + s->lex.esc[s->lex.esclen++] = *ptr; } - else { - s->off += (in - buf); - - return -UT_ERROR_INVALID_ESCAPE; - } - } - /* \1 .. \9 (regex backreference) */ - else if (q == '/' && in[0] >= '0' && in[0] <= '9') { - /* in regexp mode, retain backslash */ - if (rem-- < 1) { - s->off += (in - buf); - - return -UT_ERROR_OVERLONG_STRING; - } - - *out++ = '\\'; - *out = *in; - } + if (s->lex.esclen == 5) { + code = hex(s->lex.esc[1]) * 16 * 16 * 16 + + hex(s->lex.esc[2]) * 16 * 16 + + hex(s->lex.esc[3]) * 16 + + hex(s->lex.esc[4]); - /* \377, \77 or \7 */ - else if (in[0] >= '0' && in[0] <= '7') { - if (lead_surrogate) { - if (!utf8enc(&out, &rem, 0xFFFD)) { - s->off += (in - buf); + /* is a leading surrogate value */ + if ((code & 0xFC00) == 0xD800) { + /* found a subsequent leading surrogate, ignore and emit replacement char for previous one */ + if (s->lex.lead_surrogate) + append_utf8(s, 0xFFFD); - return -UT_ERROR_OVERLONG_STRING; + /* store surrogate value and advance to next escape sequence */ + s->lex.lead_surrogate = code; } - lead_surrogate = 0; - } - - /* \377 */ - if (in[1] >= '0' && in[1] <= '7' && - in[2] >= '0' && in[2] <= '7') { - code = dec(in[0]) * 8 * 8 + - dec(in[1]) * 8 + - dec(in[2]); + /* is a trailing surrogate value */ + else if ((code & 0xFC00) == 0xDC00) { + /* found a trailing surrogate following a leading one, combine and encode */ + if (s->lex.lead_surrogate) { + code = 0x10000 + ((s->lex.lead_surrogate & 0x3FF) << 10) + (code & 0x3FF); + s->lex.lead_surrogate = 0; + } - if (code > 255) { - s->off += (in - buf); + /* trailing surrogate not following a leading one, ignore and use replacement char */ + else { + code = 0xFFFD; + } - return -UT_ERROR_INVALID_ESCAPE; + append_utf8(s, code); } - if (!utf8enc(&out, &rem, code)) { - s->off += (in - buf); - - return -UT_ERROR_OVERLONG_STRING; + /* is a normal codepoint */ + else { + append_utf8(s, code); } - in += 3; + s->lex.esclen = 0; + s->lex.is_escape = false; + buf_consume(s, (ptr + 1) - s->lex.bufstart); } - /* \77 */ - else if (in[1] >= '0' && in[1] <= '7') { - if (!utf8enc(&out, &rem, dec(in[0]) * 8 + dec(in[1]))) { - s->off += (in - buf); + break; - return -UT_ERROR_OVERLONG_STRING; + case 'x': + if (s->lex.esclen < 3) { + if (!isxdigit(*ptr)) { + s->lex.off += s->lex.esclen + 1; + s->error.code = UT_ERROR_INVALID_ESCAPE; + return 0; } - in += 2; + s->lex.esc[s->lex.esclen++] = *ptr; } - /* \7 */ - else { - if (!utf8enc(&out, &rem, dec(in[0]))) { - s->off += (in - buf); - - return -UT_ERROR_OVERLONG_STRING; - } + if (s->lex.esclen == 3) { + append_utf8(s, hex(s->lex.esc[1]) * 16 + hex(s->lex.esc[2])); - in += 1; + s->lex.esclen = 0; + s->lex.is_escape = false; + buf_consume(s, (ptr + 1) - s->lex.bufstart); } - } - /* single character escape */ - else { - if (lead_surrogate) { - if (!utf8enc(&out, &rem, 0xFFFD)) { - s->off += (in - buf); + break; + + case 'o': + if (s->lex.esclen < 4) { + /* found a non-octal char */ + if (*ptr < '0' || *ptr > '7') { + /* pad sequence to three chars */ + switch (s->lex.esclen) { + case 3: + s->lex.esc[3] = s->lex.esc[2]; + s->lex.esc[2] = s->lex.esc[1]; + s->lex.esc[1] = '0'; + break; + + case 2: + s->lex.esc[3] = s->lex.esc[1]; + s->lex.esc[2] = '0'; + s->lex.esc[1] = '0'; + break; + } - return -UT_ERROR_OVERLONG_STRING; + s->lex.esclen = 4; + buf_consume(s, ptr - s->lex.bufstart); } - lead_surrogate = 0; + /* append */ + else { + s->lex.esc[s->lex.esclen++] = *ptr; + buf_consume(s, (ptr + 1) - s->lex.bufstart); + } } - if (rem-- < 1) { - s->off += (in - buf); + if (s->lex.esclen == 4) { + code = dec(s->lex.esc[1]) * 8 * 8 + + dec(s->lex.esc[2]) * 8 + + dec(s->lex.esc[3]); - return -UT_ERROR_OVERLONG_STRING; - } - - switch (in[0]) { - case 'a': *out = '\a'; break; - case 'b': *out = '\b'; break; - case 'e': *out = '\e'; break; - case 'f': *out = '\f'; break; - case 'n': *out = '\n'; break; - case 'r': *out = '\r'; break; - case 't': *out = '\t'; break; - case 'v': *out = '\v'; break; - default: - /* in regexp mode, retain backslash */ - if (q == '/') { - if (rem-- < 1) { - s->off += (in - buf); - - return -UT_ERROR_OVERLONG_STRING; - } + if (code > 255) { + s->lex.off += s->lex.esclen + 1; + s->error.code = UT_ERROR_INVALID_ESCAPE; - *out++ = '\\'; + return 0; } - *out = *in; - break; + append_utf8(s, code); + + s->lex.esclen = 0; + s->lex.is_escape = false; } - in++; - out++; + break; } } - - esc = false; - continue; } - /* begin of escape sequence */ - if (*in == '\\') { - in++; - esc = true; - continue; - } - - - /* there's a non-escape following a previous leading unicode surrogate, - * ignore surrogate and emit replacement char */ - if (lead_surrogate) { - if (!utf8enc(&out, &rem, 0xFFFD)) { - s->off += (in - buf); - - return -UT_ERROR_OVERLONG_STRING; - } - - lead_surrogate = 0; - } + /* terminating char */ + else if (*ptr == q) { + lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart); + buf_consume(s, (ptr + 1) - s->lex.bufstart); + rv = lookbehind_to_text(s, s->lex.lastoff, T_STRING, NULL); - /* terminating quote */ - if (*in == q) { - op->val = xjs_new_string_len(str, sizeof(str) - 1 - rem); + if (!rv) + rv = emit_op(s, s->lex.lastoff, T_STRING, xjs_new_string_len("", 0)); - return (in - buf) + 2; + return rv; } - /* ordinary char */ - if (rem-- < 1) { - s->off += (in - buf); - - return -UT_ERROR_OVERLONG_STRING; + /* escape sequence start */ + else if (*ptr == '\\') { + s->lex.is_escape = true; + lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart); + buf_consume(s, ptr - s->lex.bufstart); } - - *out++ = *in++; } - return -UT_ERROR_UNTERMINATED_STRING; + lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart); + buf_consume(s, ptr - s->lex.bufstart); + + return 0; } @@ -611,65 +633,97 @@ next: * -UT_ERROR_INVALID_REGEXP Could not compile regexp */ -static int -parse_regexp(const char *buf, struct ut_op *op, struct ut_state *s) +enum { + UT_LEX_PARSE_REGEX_INIT, + UT_LEX_PARSE_REGEX_PATTERN, + UT_LEX_PARSE_REGEX_FLAGS +}; + +static uint32_t +parse_regexp(struct ut_state *s) { - struct json_object *rv; - const char *p; + struct json_object *pattern; + struct ut_op *op; + uint32_t rv; char *err; - int len; - if (s->expect_div == 1) { - if (!strncmp(buf, "/=", 2)) { - op->type = T_ASDIV; - return 2; + switch (s->lex.esc[0]) { + case UT_LEX_PARSE_REGEX_INIT: + if (s->lex.expect_div) { + s->lex.expect_div = false; + + if (buf_startswith(s, "=")) { + buf_consume(s, 1); + + return emit_op(s, s->lex.off, T_ASDIV, NULL); + } + + return emit_op(s, s->lex.off, T_DIV, NULL); } - else { - op->type = T_DIV; - return 1; + + s->lex.esc[0] = UT_LEX_PARSE_REGEX_PATTERN; + break; + + case UT_LEX_PARSE_REGEX_PATTERN: + rv = parse_string(s); + + if (rv != 0 && rv != UINT32_MAX) { + s->lex.lookbehind = (char *)ut_get_op(s, rv); + s->lex.esc[0] = UT_LEX_PARSE_REGEX_FLAGS; } - } - len = parse_string(buf, op, s); + break; - if (len < 2) { - json_object_put(op->val); + case UT_LEX_PARSE_REGEX_FLAGS: + op = (struct ut_op *)s->lex.lookbehind; - return (len < 0) ? len : -UT_ERROR_UNTERMINATED_STRING; - } + while (s->lex.bufstart < s->lex.bufend) { + switch (s->lex.bufstart[0]) { + case 'g': + buf_consume(s, 1); + op->is_reg_global = true; + break; - for (p = buf + len; strchr("gis", *p); p++) { - switch (*p) { - case 'g': - op->is_reg_global = 1; - len++; - break; + case 'i': + buf_consume(s, 1); + op->is_reg_icase = true; + break; - case 'i': - op->is_reg_icase = 1; - len++; - break; + case 's': + buf_consume(s, 1); + op->is_reg_newline = true; + break; - case 's': - op->is_reg_newline = 1; - len++; - break; - } - } + default: + s->lex.lookbehind = NULL; + + pattern = ut_new_regexp(json_object_get_string(op->val), + op->is_reg_icase, + op->is_reg_newline, + op->is_reg_global, + &err); - p = json_object_get_string(op->val); - rv = ut_new_regexp(p, op->is_reg_icase, op->is_reg_newline, op->is_reg_global, &err); + json_object_put(op->val); - json_object_put(op->val); - op->val = rv; + op->type = T_REGEXP; + op->val = pattern; - if (!rv) { - s->error.info.regexp_error = err; + if (!pattern) { + s->error.info.regexp_error = err; + s->error.code = UT_ERROR_INVALID_REGEXP; + s->lex.off = s->lex.lastoff; + + return 0; + } + + return ut_get_off(s, op); + } + } - return -UT_ERROR_INVALID_REGEXP; + break; } - return len; + return 0; } @@ -683,41 +737,50 @@ parse_regexp(const char *buf, struct ut_op *op, struct ut_state *s) * -UT_ERROR_OVERLONG_STRING Label too long */ -static int -parse_label(const char *buf, struct ut_op *op, struct ut_state *s) +static uint32_t +parse_label(struct ut_state *s) { + const struct token *tok = s->lex.tok; const struct token *word; - char str[128] = { 0 }; - char *out = str; - const char *in = buf; - int rem = sizeof(str) - 1; - int i; - - while (*in == '_' || isalnum(*in)) { - if (rem-- < 1) { - s->off += (in - buf); - return -UT_ERROR_OVERLONG_STRING; - } + uint32_t rv; + char *ptr; + size_t i; - *out++ = *in++; - } + if (!s->lex.lookbehind && tok->plen) + lookbehind_append(s, tok->pat, tok->plen); - for (i = 0, word = &reserved_words[0]; - i < sizeof(reserved_words) / sizeof(reserved_words[0]); - i++, word = &reserved_words[i]) { - if (!strcmp(str, word->pat)) { - op->type = word->type; + if (!buf_remaining(s) || (s->lex.bufstart[0] != '_' && !isalnum(s->lex.bufstart[0]))) { + for (i = 0, word = &reserved_words[0]; i < ARRAY_SIZE(reserved_words); i++, word = &reserved_words[i]) { + if (s->lex.lookbehindlen == word->plen && !strncmp(s->lex.lookbehind, word->pat, word->plen)) { + lookbehind_reset(s); - if (word->parse) - word->parse(str, op, s); + switch (word->type) { + case T_DOUBLE: + rv = emit_op(s, s->lex.off - word->plen, word->type, ut_new_double(word->d)); + break; - return (in - buf); + case T_BOOL: + rv = emit_op(s, s->lex.off - word->plen, word->type, xjs_new_boolean(word->b)); + break; + + default: + rv = emit_op(s, s->lex.off - word->plen, word->type, NULL); + } + + return rv; + } } + + return lookbehind_to_text(s, s->lex.off - s->lex.lookbehindlen, T_LABEL, NULL); } - op->val = xjs_new_string(str); + for (ptr = s->lex.bufstart; ptr < s->lex.bufend && (*ptr == '_' || isalnum(*ptr)); ptr++) + ; + + lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart); + buf_consume(s, ptr - s->lex.bufstart); - return (in - buf); + return 0; } @@ -731,287 +794,410 @@ parse_label(const char *buf, struct ut_op *op, struct ut_state *s) * -UT_ERROR_INVALID_ESCAPE Invalid number character */ -static int -parse_number(const char *buf, struct ut_op *op, struct ut_state *s) +static inline bool +is_numeric_char(struct ut_state *s, char c) +{ + char prev = s->lex.lookbehindlen ? s->lex.lookbehind[s->lex.lookbehindlen-1] : 0; + + if ((prev == 'e' || prev == 'E') && (c == '-' || c == '+')) + return true; + + return (isxdigit(c) || c == 'x' || c == 'X' || c == '.'); +} + +static uint32_t +parse_number(struct ut_state *s) { + uint32_t rv = 0; long long int n; + char *ptr, *e; double d; - char *e; - if (!strncmp(buf, "Infinity", 8)) { - op->type = T_DOUBLE; - op->val = ut_new_double(INFINITY); + if (!buf_remaining(s) || !is_numeric_char(s, s->lex.bufstart[0])) { + lookbehind_append(s, "\0", 1); - return 8; - } - else if (!strncmp(buf, "NaN", 3)) { - op->type = T_DOUBLE; - op->val = ut_new_double(NAN); + n = strtoll(s->lex.lookbehind, &e, 0); + + if (*e == '.' || *e == 'e' || *e == 'E') { + d = strtod(s->lex.lookbehind, &e); + + if (e > s->lex.lookbehind && *e == 0) { + rv = emit_op(s, s->lex.off - (e - s->lex.lookbehind), T_DOUBLE, ut_new_double(d)); + } + else { + s->error.code = UT_ERROR_INVALID_ESCAPE; + s->lex.off -= s->lex.lookbehindlen - (e - s->lex.lookbehind) - 1; + } + } + else if (*e == 0) { + rv = emit_op(s, s->lex.off - (e - s->lex.lookbehind), T_NUMBER, xjs_new_int64(n)); + ut_get_op(s, rv)->is_overflow = (errno == ERANGE); + } + else { + s->error.code = UT_ERROR_INVALID_ESCAPE; + s->lex.off -= s->lex.lookbehindlen - (e - s->lex.lookbehind) - 1; + } + + lookbehind_reset(s); - return 3; + return rv; } - n = strtoll(buf, &e, 0); + for (ptr = s->lex.bufstart; ptr < s->lex.bufend && is_numeric_char(s, *ptr); ptr++) + ; - if (e > buf) { - if (*e == '.') { - d = strtod(buf, &e); + lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart); + buf_consume(s, ptr - s->lex.bufstart); - if (e > buf) { - op->type = T_DOUBLE; - op->val = ut_new_double(d); + return 0; +} - return (e - buf); - } +static uint32_t +lex_step(struct ut_state *s, FILE *fp) +{ + const struct token *tok; + size_t rlen, rem; + char *ptr, c; + uint32_t rv; + size_t i; + + /* only less than UT_LEX_MAX_TOKEN_LEN unreach buffer chars remaining, + * move the remaining bytes to the beginning and read more data */ + if (buf_remaining(s) < UT_LEX_MAX_TOKEN_LEN) { + if (!s->lex.buf) { + s->lex.buflen = 128; + s->lex.buf = xalloc(s->lex.buflen); } + rem = s->lex.bufend - s->lex.bufstart; + + memcpy(s->lex.buf, s->lex.bufstart, rem); - op->type = T_NUMBER; - op->val = xjs_new_int64(n); - op->is_overflow = (errno == ERANGE); + rlen = fread(s->lex.buf + rem, 1, s->lex.buflen - rem, fp); - return (e - buf); + s->lex.bufstart = s->lex.buf; + s->lex.bufend = s->lex.buf + rlen + rem; + + if (rlen == 0 && (ferror(fp) || feof(fp))) + s->lex.eof = 1; } - return -UT_ERROR_INVALID_ESCAPE; -} + switch (s->lex.state) { + case UT_LEX_IDENTIFY_BLOCK: + /* previous block had strip trailing whitespace flag, skip leading whitespace */ + if (s->lex.skip_leading_whitespace) { + while (buf_remaining(s) && isspace(s->lex.bufstart[0])) + buf_consume(s, 1); + s->lex.skip_leading_whitespace = false; + } -/* - * Parses a bool literal from the given buffer. - * - * Returns the amount of consumed characters from the given buffer. - */ + /* previous block was a statement block and trim_blocks is enabld, skip leading newline */ + else if (s->lex.skip_leading_newline) { + if (buf_startswith(s, "\n")) + buf_consume(s, 1); -static int -parse_bool(const char *buf, struct ut_op *op, struct ut_state *s) -{ - if (!strncmp(buf, "false", 5)) { - op->val = xjs_new_boolean(false); + s->lex.skip_leading_newline = false; + } - return 5; - } - else if (!strncmp(buf, "true", 4)) { - op->val = xjs_new_boolean(true); + /* scan forward through buffer to identify start token */ + for (ptr = s->lex.bufstart; ptr < s->lex.bufend - strlen("{#"); ptr++) { + /* found start of comment block */ + if (!strncmp(ptr, "{#", 2)) { + lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart); + buf_consume(s, (ptr + 2) - s->lex.bufstart); + s->lex.lastoff = s->lex.off - 2; + s->lex.state = UT_LEX_BLOCK_COMMENT_START; - return 4; - } + return 0; + } - return 0; -} + /* found start of expression block */ + else if (!strncmp(ptr, "{{", 2)) { + lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart); + buf_consume(s, (ptr + 2) - s->lex.bufstart); + s->lex.lastoff = s->lex.off - 2; + s->lex.state = UT_LEX_BLOCK_EXPRESSION_START; + return 0; + } -static int -match_token(const char *ptr, struct ut_op *op, struct ut_state *s) -{ - int i; - const struct token *tok; + /* found start of statement block */ + else if (!strncmp(ptr, "{%", 2)) { + lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart); + buf_consume(s, (ptr + 2) - s->lex.bufstart); + s->lex.lastoff = s->lex.off - 2; + s->lex.state = UT_LEX_BLOCK_STATEMENT_START; - for (i = 0, tok = &tokens[0]; - i < sizeof(tokens) / sizeof(tokens[0]); - i++, tok = &tokens[i]) { - if ((tok->plen > 0 && !strncmp(ptr, tok->pat, tok->plen)) || - (tok->plen == 0 && *ptr >= tok->pat[0] && *ptr <= tok->pat[1])) { - op->type = tok->type; + return 0; + } + } - if (tok->parse) - return tok->parse(ptr, op, s); + /* we're at eof */ + if (s->lex.eof) { + lookbehind_append(s, ptr, s->lex.bufend - ptr); + s->lex.state = UT_LEX_EOF; - return tok->plen; + return lookbehind_to_text(s, s->lex.lastoff, T_TEXT, NULL); } - } - return -UT_ERROR_UNEXPECTED_CHAR; -} + lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart); + buf_consume(s, ptr - s->lex.bufstart); + break; -uint32_t -ut_get_token(struct ut_state *s, const char *input, int *mlen) -{ - struct ut_op op = { 0 }; - const char *o, *p; - uint32_t rv; - for (o = p = input; *p; p++) { - if (s->blocktype == UT_BLOCK_NONE) { - if (!strncmp(p, "{#", 2)) - s->blocktype = UT_BLOCK_COMMENT; - else if (!strncmp(p, "{{", 2)) - s->blocktype = UT_BLOCK_EXPRESSION; - else if (!strncmp(p, "{%", 2)) - s->blocktype = UT_BLOCK_STATEMENT; - - if (s->blocktype) { - *mlen = p - input; - s->start_tag_seen = 0; - s->off += *mlen; - - /* strip whitespace before block */ - if (p[2] == '-') { - while (p > o && isspace(p[-1])) - p--; - } + case UT_LEX_BLOCK_COMMENT_START: + case UT_LEX_BLOCK_EXPRESSION_START: + case UT_LEX_BLOCK_STATEMENT_START: + rv = 0; + s->lex.skip_leading_whitespace = 0; - /* lstrip */ - else if (s->lstrip_blocks && s->blocktype == UT_BLOCK_STATEMENT && p[2] != '+') { - while (p > o && p[-1] != '\n' && isspace(p[-1])) - p--; - } + /* strip whitespace before block */ + if (buf_startswith(s, "-")) { + rv = lookbehind_to_text(s, s->lex.off, T_TEXT, " \n\t\v\f\r"); + buf_consume(s, 1); + } - if (p == o) - return 0; + /* disable lstrip flag (only valid for statement blocks) */ + else if (s->lex.state == UT_LEX_BLOCK_STATEMENT_START) { + /* disable lstrip flag */ + if (buf_startswith(s, "+")) { + rv = lookbehind_to_text(s, s->lex.off, T_TEXT, NULL); + buf_consume(s, 1); + } - return ut_new_op(s, T_TEXT, xjs_new_string_len(o, p - o), UINT32_MAX); + /* global block lstrip */ + else if (s->lstrip_blocks) { + rv = lookbehind_to_text(s, s->lex.off, T_TEXT, " \t\v\f\r"); } } - else if (s->blocktype == UT_BLOCK_COMMENT) { - if (!strncmp(p, "#}", 2) || !strncmp(p, "-#}", 3)) { - *mlen = (p - input) + 2; + else { + rv = lookbehind_to_text(s, s->lex.off, T_TEXT, NULL); + } - /* strip whitespace after block */ - if (*p == '-') { - (*mlen)++; + switch (s->lex.state) { + case UT_LEX_BLOCK_COMMENT_START: + s->lex.state = UT_LEX_BLOCK_COMMENT; + break; - while (isspace(p[3])) { - (*mlen)++; - p++; - } - } + case UT_LEX_BLOCK_STATEMENT_START: + s->lex.within_statement_block = 1; + s->lex.state = UT_LEX_IDENTIFY_TOKEN; + break; - s->blocktype = UT_BLOCK_NONE; - s->off += *mlen; + case UT_LEX_BLOCK_EXPRESSION_START: + s->lex.state = UT_LEX_BLOCK_EXPRESSION_EMIT_TAG; + break; - return 0; - } + default: + break; } - else if (s->blocktype == UT_BLOCK_STATEMENT || s->blocktype == UT_BLOCK_EXPRESSION) { - *mlen = match_token(p, &op, s); - if (*mlen < 0) { - s->error.code = -*mlen; + return rv; - return 0; + + case UT_LEX_BLOCK_COMMENT: + /* scan forward through buffer to identify end token */ + while (s->lex.bufstart < s->lex.bufend - 2) { + if (buf_startswith(s, "-#}")) { + s->lex.state = UT_LEX_IDENTIFY_BLOCK; + s->lex.skip_leading_whitespace = 1; + buf_consume(s, 3); + s->lex.lastoff = s->lex.off; + break; + } + else if (buf_startswith(s, "#}")) { + s->lex.state = UT_LEX_IDENTIFY_BLOCK; + s->lex.skip_leading_whitespace = 0; + buf_consume(s, 2); + s->lex.lastoff = s->lex.off; + break; } - /* disallow nesting blocks */ - else if ((s->start_tag_seen && s->blocktype == UT_BLOCK_STATEMENT && - (op.type == T_LEXP || op.type == T_REXP || op.type == T_LSTM)) || - (s->start_tag_seen && s->blocktype == UT_BLOCK_EXPRESSION && - (op.type == T_LSTM || op.type == T_RSTM || op.type == T_LEXP))) { - s->error.code = UT_ERROR_NESTED_BLOCKS; + buf_consume(s, 1); + } - return 0; - } + /* we're at eof */ + if (s->lex.eof) { + s->lex.off = s->lex.lastoff; + s->error.code = UT_ERROR_UNTERMINATED_BLOCK; + } + + break; + + + case UT_LEX_BLOCK_EXPRESSION_EMIT_TAG: + s->lex.within_expression_block = 1; + s->lex.state = UT_LEX_IDENTIFY_TOKEN; - /* emit additional empty statement (semicolon) at end of template block */ - else if ((s->blocktype == UT_BLOCK_STATEMENT && op.type == T_RSTM) || - (s->blocktype == UT_BLOCK_EXPRESSION && op.type == T_REXP)) { - if (!s->semicolon_emitted) { - s->semicolon_emitted = true; - op.type = T_SCOL; - *mlen = 0; + return emit_op(s, s->lex.off, T_LEXP, NULL); + + + case UT_LEX_IDENTIFY_TOKEN: + for (i = 0, tok = tokens; i < ARRAY_SIZE(tokens); tok = &tokens[++i]) { + /* remaining buffer data is shorter than token, skip */ + if (tok->plen > buf_remaining(s)) + continue; + + c = s->lex.bufstart[0]; + + if (tok->plen ? !strncmp(s->lex.bufstart, tok->pat, tok->plen) + : (c >= tok->pat[0] && c <= tok->pat[1])) { + buf_consume(s, tok->plen); + + /* token has a parse method, switch state */ + if (tok->parse) { + s->lex.tok = tok; + s->lex.state = UT_LEX_PARSE_TOKEN; + s->lex.lastoff = s->lex.off - tok->plen; + + return 0; } - else { - /* strip whitespace after block */ - if (*p == '-') { - while (isspace(p[3])) { - (*mlen)++; - p++; - } - } - else if (s->blocktype == UT_BLOCK_STATEMENT && - s->trim_blocks && p[2] == '\n') { - (*mlen)++; + + /* disallow nesting blocks */ + if ((s->lex.within_expression_block && + (tok->type == T_LSTM || tok->type == T_RSTM || tok->type == T_LEXP)) || + (s->lex.within_statement_block && + (tok->type == T_LEXP || tok->type == T_REXP || tok->type == T_LSTM))) { + s->error.code = UT_ERROR_NESTED_BLOCKS; + s->lex.off -= tok->plen; + + return 0; + } + + /* found end of block */ + else if ((s->lex.within_statement_block && tok->type == T_RSTM) || + (s->lex.within_expression_block && tok->type == T_REXP)) { + /* emit additional empty statement (semicolon) at end of template block */ + if (!s->lex.semicolon_emitted) { + s->lex.semicolon_emitted = true; + + /* rewind */ + buf_consume(s, -tok->plen); + + return emit_op(s, s->lex.off, T_SCOL, NULL); } - s->semicolon_emitted = false; - s->blocktype = UT_BLOCK_NONE; + /* strip whitespace after block */ + if (tok->pat[0] == '-') + s->lex.skip_leading_whitespace = true; + + /* strip newline after statement block */ + else if (s->lex.within_statement_block && s->trim_blocks) + s->lex.skip_leading_newline = true; + + s->lex.semicolon_emitted = false; + s->lex.within_statement_block = false; + s->lex.within_expression_block = false; + s->lex.state = UT_LEX_IDENTIFY_BLOCK; + s->lex.lastoff = s->lex.off; } + + /* do not report statement tags to the parser */ + if (tok->type != 0 && tok->type != T_LSTM && tok->type != T_RSTM) + rv = emit_op(s, s->lex.off - tok->plen, tok->type, NULL); + else + rv = 0; + + return rv; } + } - s->start_tag_seen = 1; - s->off += *mlen; + /* no token matched and we do have remaining data, junk */ + if (buf_remaining(s)) { + s->error.code = UT_ERROR_UNEXPECTED_CHAR; - /* do not report '{%' and '%}' tags to parser */ - if (op.type == T_LSTM || op.type == T_RSTM || op.type == 0) - return 0; + return 0; + } - rv = ut_new_op(s, op.type, op.val, UINT32_MAX); + /* we're at eof, allow unclosed statement blocks */ + if (s->lex.within_statement_block) { + s->lex.state = UT_LEX_EOF; - if (rv) { - s->pool[rv - 1].is_overflow = op.is_overflow; - s->pool[rv - 1].is_reg_icase = op.is_reg_icase; - s->pool[rv - 1].is_reg_global = op.is_reg_global; - s->pool[rv - 1].is_reg_newline = op.is_reg_newline; - } + return 0; + } - /* Follow JSLint logic and treat a slash after any of the - * `(,=:[!&|?{};` characters as the beginning of a regex - * literal... */ - switch (op.type) { - case T_LPAREN: - case T_COMMA: - - case T_ASADD: - case T_ASBAND: - case T_ASBOR: - case T_ASBXOR: - case T_ASDIV: - case T_ASLEFT: - case T_ASMOD: - case T_ASMUL: - case T_ASRIGHT: - case T_ASSIGN: - case T_ASSUB: - case T_EQ: - case T_EQS: - case T_GE: - case T_LE: - case T_NE: - case T_NES: - - case T_COLON: - case T_LBRACK: - case T_NOT: - - case T_AND: - case T_BAND: - - case T_OR: - case T_BOR: - - case T_QMARK: - - case T_LBRACE: - case T_RBRACE: - - case T_LSTM: - case T_LEXP: - - case T_SCOL: - s->expect_div = 0; - break; + /* premature EOF */ + s->error.code = UT_ERROR_UNTERMINATED_BLOCK; - default: - s->expect_div = 1; - } + break; + + + case UT_LEX_PARSE_TOKEN: + tok = s->lex.tok; + rv = tok->parse(s); + + if (rv) { + memset(s->lex.esc, 0, sizeof(s->lex.esc)); + s->lex.state = UT_LEX_IDENTIFY_TOKEN; + s->lex.tok = NULL; + + if (rv == UINT32_MAX) + rv = 0; return rv; } - } - /* allow unclosed '{%' blocks */ - if (s->blocktype == UT_BLOCK_EXPRESSION || s->blocktype == UT_BLOCK_COMMENT) { - s->error.code = UT_ERROR_UNTERMINATED_BLOCK; + break; - return 0; + + case UT_LEX_EOF: + break; } - if (p > input) { - *mlen = p - input; - s->off += *mlen; + return 0; +} - return ut_new_op(s, T_TEXT, xjs_new_string_len(o, p - o), UINT32_MAX); +uint32_t +ut_get_token(struct ut_state *s, FILE *fp) +{ + uint32_t rv; + + while (s->lex.state != UT_LEX_EOF) { + rv = lex_step(s, fp); + + if (rv == 0 && s->error.code) + break; + + if (rv > 0) + return rv; } return 0; } + +const char * +ut_get_tokenname(int type) +{ + static char buf[sizeof("'endfunction'")]; + size_t i; + + switch (type) { + case 0: return "End of file"; + case T_STRING: return "String"; + case T_LABEL: return "Label"; + case T_NUMBER: return "Number"; + case T_DOUBLE: return "Double"; + case T_REGEXP: return "Regexp"; + } + + for (i = 0; i < ARRAY_SIZE(tokens); i++) { + if (tokens[i].type != type) + continue; + + snprintf(buf, sizeof(buf), "'%s'", tokens[i].pat); + + return buf; + } + + for (i = 0; i < ARRAY_SIZE(reserved_words); i++) { + if (reserved_words[i].type != type) + continue; + + snprintf(buf, sizeof(buf), "'%s'", reserved_words[i].pat); + + return buf; + } + + return "?"; +} @@ -24,12 +24,13 @@ #define T_CFUNC (__T_MAX + 1) #define T_RESSOURCE (__T_MAX + 2) -extern const char *tokennames[__T_MAX]; - bool utf8enc(char **out, int *rem, int code); uint32_t -ut_get_token(struct ut_state *s, const char *input, int *mlen); +ut_get_token(struct ut_state *s, FILE *fp); + +const char * +ut_get_tokenname(int type); #endif /* __LEXER_H_ */ @@ -171,7 +171,7 @@ char * ut_format_error(struct ut_state *state, const char *expr) { char *msg = NULL, *filename = state->filename; - size_t off = state ? state->off : 0; + size_t off = state ? state->lex.off : 0; struct ut_op *tag; bool first = true; size_t msglen = 0; @@ -217,20 +217,20 @@ ut_format_error(struct ut_state *state, const char *expr) sprintf_append(&msg, &msglen, "Syntax error: Unexpected token\n"); for (i = 0, max_i = 0; i < sizeof(state->error.info.tokens) * 8; i++) - if (ut_is_error_token(state, i) && tokennames[i]) + if (ut_is_error_token(state, i)) max_i = i; for (i = 0; i < sizeof(state->error.info.tokens) * 8; i++) { - if (ut_is_error_token(state, i) && tokennames[i]) { + if (ut_is_error_token(state, i)) { if (first) { - sprintf_append(&msg, &msglen, "Expecting %s", tokennames[i]); + sprintf_append(&msg, &msglen, "Expecting %s", ut_get_tokenname(i)); first = false; } else if (i < max_i) { - sprintf_append(&msg, &msglen, ", %s", tokennames[i]); + sprintf_append(&msg, &msglen, ", %s", ut_get_tokenname(i)); } else { - sprintf_append(&msg, &msglen, " or %s", tokennames[i]); + sprintf_append(&msg, &msglen, " or %s", ut_get_tokenname(i)); } } } @@ -74,7 +74,7 @@ static void dump_node(struct ut_op *op) { case T_STRING: case T_LABEL: case T_TEXT: - printf("n%p [label=\"%s<", op, tokennames[op->type]); + printf("n%p [label=\"%s<", op, ut_get_tokenname(op->type)); for (p = json_object_get_string(op->val); *p; p++) switch (*p) { @@ -98,7 +98,7 @@ static void dump_node(struct ut_op *op) { break; default: - printf("n%p [label=\"%s", op, tokennames[op->type]); + printf("n%p [label=\"%s", op, ut_get_tokenname(op->type)); if (op->is_postfix) printf(", postfix"); @@ -86,9 +86,9 @@ ut_no_empty_obj(struct ut_state *s, uint32_t off) s->error.code = UT_ERROR_UNEXPECTED_TOKEN; if (op) - s->off = op->off; + s->lex.off = op->off; - for (i = 0; i < sizeof(tokennames) / sizeof(tokennames[0]); i++) + for (i = 0; i < __T_MAX; i++) if (yy_find_shift_action(yypParser, (YYCODETYPE)i) < YYNSTATE + YYNRULE) ut_set_error_token(s, i); } diff --git a/tests/02_runtime/04_switch_case b/tests/02_runtime/04_switch_case index bc8b80e..4a9d3e9 100644 --- a/tests/02_runtime/04_switch_case +++ b/tests/02_runtime/04_switch_case @@ -89,10 +89,10 @@ default -- Expect stderr -- Syntax error: more than one switch default case -In line 6, byte 9: +In line 6, byte 2: ` default:` - Near here -----^ + ^-- Near here -- End -- @@ -242,10 +242,10 @@ one -- Expect stderr -- Died -In line 6, byte 7: +In line 6, byte 6: ` die();` - Near here ------^ + Near here -----^ -- End -- |