diff options
author | Jo-Philipp Wich <jo@mein.io> | 2022-07-14 14:33:12 +0200 |
---|---|---|
committer | Jo-Philipp Wich <jo@mein.io> | 2022-07-28 13:18:30 +0200 |
commit | 03c8e4b465c8cffd2596d2741b29ad2ba4ec1765 (patch) | |
tree | 6a43c9f54be5e3de4fcbc73b5ebaa518e642d3ad /lexer.c | |
parent | 1219d7efa170bf38fb1bf6a10fa0d1f96e62f091 (diff) |
lexer: rewrite token scanner
- Use nested switches instead of lookup tables to detect tokens
- Simplify input buffer logic
- Reduce the number of intermediate states
Signed-off-by: Jo-Philipp Wich <jo@mein.io>
Diffstat (limited to 'lexer.c')
-rw-r--r-- | lexer.c | 1464 |
1 files changed, 700 insertions, 764 deletions
@@ -29,24 +29,12 @@ #include "ucode/lib.h" #include "ucode/lexer.h" -#define UC_LEX_CONTINUE_PARSING (void *)1 - struct keyword { unsigned type; const char *pat; unsigned plen; }; -struct token { - unsigned type; - union { - uint32_t patn; - char pat[4]; - } u; - unsigned plen; - uc_token_t *(*parse)(uc_lexer_t *); -}; - #define dec(o) \ ((o) - '0') @@ -56,94 +44,6 @@ struct token { #ifndef NO_COMPILE -static uc_token_t *parse_comment(uc_lexer_t *); -static uc_token_t *parse_string(uc_lexer_t *); -static uc_token_t *parse_regexp(uc_lexer_t *); -static uc_token_t *parse_number(uc_lexer_t *); -static uc_token_t *parse_label(uc_lexer_t *); - -static const struct token tokens[] = { - { TK_ASLEFT, { .pat = "<<=" }, 3, NULL }, - { TK_ASRIGHT, { .pat = ">>=" }, 3, NULL }, - { TK_LEXP, { .pat = "{{-" }, 3, NULL }, - { TK_REXP, { .pat = "-}}" }, 3, NULL }, - { TK_LSTM, { .pat = "{%+" }, 3, NULL }, - { TK_LSTM, { .pat = "{%-" }, 3, NULL }, - { TK_RSTM, { .pat = "-%}" }, 3, NULL }, - { TK_EQS, { .pat = "===" }, 3, NULL }, - { TK_NES, { .pat = "!==" }, 3, NULL }, - { TK_ELLIP, { .pat = "..." 
}, 3, NULL }, - { TK_QLBRACK, { .pat = "?.[" }, 3, NULL }, - { TK_QLPAREN, { .pat = "?.(" }, 3, NULL }, - { TK_ASEXP, { .pat = "**=" }, 3, NULL }, - { TK_ASAND, { .pat = "&&=" }, 3, NULL }, - { TK_ASOR, { .pat = "||=" }, 3, NULL }, - { TK_ASNULLISH, { .pat = "\?\?=" }, 3, NULL }, - { TK_AND, { .pat = "&&" }, 2, NULL }, - { TK_ASADD, { .pat = "+=" }, 2, NULL }, - { TK_ASBAND, { .pat = "&=" }, 2, NULL }, - { TK_ASBOR, { .pat = "|=" }, 2, NULL }, - { TK_ASBXOR, { .pat = "^=" }, 2, NULL }, - //{ TK_ASDIV, { .pat = "/=" }, 2, NULL }, - { TK_ASMOD, { .pat = "%=" }, 2, NULL }, - { TK_ASMUL, { .pat = "*=" }, 2, NULL }, - { TK_ASSUB, { .pat = "-=" }, 2, NULL }, - { TK_EXP, { .pat = "**" }, 2, NULL }, - { TK_DEC, { .pat = "--" }, 2, NULL }, - { TK_INC, { .pat = "++" }, 2, NULL }, - { TK_EQ, { .pat = "==" }, 2, NULL }, - { TK_NE, { .pat = "!=" }, 2, NULL }, - { TK_LE, { .pat = "<=" }, 2, NULL }, - { TK_GE, { .pat = ">=" }, 2, NULL }, - { TK_LSHIFT, { .pat = "<<" }, 2, NULL }, - { TK_RSHIFT, { .pat = ">>" }, 2, NULL }, - { 0, { .pat = "//" }, 2, parse_comment }, - { 0, { .pat = "/*" }, 2, parse_comment }, - { TK_OR, { .pat = "||" }, 2, NULL }, - { TK_LEXP, { .pat = "{{" }, 2, NULL }, - { TK_REXP, { .pat = "}}" }, 2, NULL }, - { TK_LSTM, { .pat = "{%" }, 2, NULL }, - { TK_RSTM, { .pat = "%}" }, 2, NULL }, - { TK_ARROW, { .pat = "=>" }, 2, NULL }, - { TK_NULLISH, { .pat = "??" }, 2, NULL }, - { TK_QDOT, { .pat = "?." 
}, 2, NULL }, - { TK_PLACEH, { .pat = "${" }, 2, NULL }, - { TK_ADD, { .pat = "+" }, 1, NULL }, - { TK_ASSIGN, { .pat = "=" }, 1, NULL }, - { TK_BAND, { .pat = "&" }, 1, NULL }, - { TK_BOR, { .pat = "|" }, 1, NULL }, - { TK_LBRACK, { .pat = "[" }, 1, NULL }, - { TK_RBRACK, { .pat = "]" }, 1, NULL }, - { TK_BXOR, { .pat = "^" }, 1, NULL }, - { TK_LBRACE, { .pat = "{" }, 1, NULL }, - { TK_RBRACE, { .pat = "}" }, 1, NULL }, - { TK_COLON, { .pat = ":" }, 1, NULL }, - { TK_COMMA, { .pat = "," }, 1, NULL }, - { TK_COMPL, { .pat = "~" }, 1, NULL }, - //{ TK_DIV, { .pat = "/" }, 1, NULL }, - { TK_GT, { .pat = ">" }, 1, NULL }, - { TK_NOT, { .pat = "!" }, 1, NULL }, - { TK_LT, { .pat = "<" }, 1, NULL }, - { TK_MOD, { .pat = "%" }, 1, NULL }, - { TK_MUL, { .pat = "*" }, 1, NULL }, - { TK_LPAREN, { .pat = "(" }, 1, NULL }, - { TK_RPAREN, { .pat = ")" }, 1, NULL }, - { TK_QMARK, { .pat = "?" }, 1, NULL }, - { TK_SCOL, { .pat = ";" }, 1, NULL }, - { TK_SUB, { .pat = "-" }, 1, NULL }, - { TK_DOT, { .pat = "." 
}, 1, NULL }, - { TK_STRING, { .pat = "'" }, 1, parse_string }, - { TK_STRING, { .pat = "\"" }, 1, parse_string }, - { TK_REGEXP, { .pat = "/" }, 1, parse_regexp }, - { TK_LABEL, { .pat = "_" }, 1, parse_label }, - { TK_LABEL, { .pat = "az" }, 0, parse_label }, - { TK_LABEL, { .pat = "AZ" }, 0, parse_label }, - { TK_NUMBER, { .pat = "09" }, 0, parse_number }, - - /* NB: this must be last for simple retrieval */ - { TK_TEMPLATE, { .pat = "`" }, 1, parse_string } -}; - static const struct keyword reserved_words[] = { { TK_ENDFUNC, "endfunction", 11 }, { TK_CONTINUE, "continue", 8 }, @@ -174,119 +74,118 @@ static const struct keyword reserved_words[] = { }; -/* length of the longest token in our lookup table */ -#define UC_LEX_MAX_TOKEN_LEN 3 +static int +fill_buf(uc_lexer_t *lex) { + lex->rbuf = xrealloc(lex->rbuf, 128); + lex->rlen = fread(lex->rbuf, 1, 128, lex->source->fp); + lex->rpos = 0; -static uc_token_t * -emit_op(uc_lexer_t *lex, uint32_t pos, int type, uc_value_t *uv) -{ - lex->curr.type = type; - lex->curr.uv = uv; - lex->curr.pos = pos; + if (!lex->rlen) + return EOF; - return &lex->curr; -} + lex->rpos++; -static void lookbehind_append(uc_lexer_t *lex, const char *data, size_t len) -{ - if (len) { - lex->lookbehind = xrealloc(lex->lookbehind, lex->lookbehindlen + len); - memcpy(lex->lookbehind + lex->lookbehindlen, data, len); - lex->lookbehindlen += len; - } + return (int)lex->rbuf[0]; } -static void lookbehind_reset(uc_lexer_t *lex) { - free(lex->lookbehind); - lex->lookbehind = NULL; - lex->lookbehindlen = 0; -} +static int +update_line(uc_lexer_t *lex, int ch) { + if (ch == '\n' || ch == EOF) + uc_source_line_next(lex->source); + else + uc_source_line_update(lex->source, 1); -static uc_token_t * -lookbehind_to_text(uc_lexer_t *lex, uint32_t pos, int type, const char *strip_trailing_chars) { - uc_token_t *rv = NULL; + lex->source->off++; - if (lex->lookbehind) { - if (strip_trailing_chars) { - while (lex->lookbehindlen > 0 && 
strchr(strip_trailing_chars, lex->lookbehind[lex->lookbehindlen-1])) - lex->lookbehindlen--; - } + return ch; +} - rv = emit_op(lex, pos, type, ucv_string_new_length(lex->lookbehind, lex->lookbehindlen)); +static int +lookahead_char(uc_lexer_t *lex) { + int c; - lookbehind_reset(lex); - } + if (lex->rpos < lex->rlen) + return (int)lex->rbuf[lex->rpos]; - return rv; -} + c = fill_buf(lex); + lex->rpos = 0; -static inline size_t -buf_remaining(uc_lexer_t *lex) { - return (lex->bufend - lex->bufstart); + return c; } -static inline bool -_buf_startswith(uc_lexer_t *lex, const char *str, size_t len) { - return (buf_remaining(lex) >= len && !strncmp(lex->bufstart, str, len)); -} +static bool +check_char(uc_lexer_t *lex, int ch) { + if (lookahead_char(lex) != ch) + return false; -#define buf_startswith(s, str) _buf_startswith(s, str, sizeof(str) - 1) + lex->rpos++; + update_line(lex, ch); -static void -buf_consume(uc_lexer_t *lex, size_t len) { - size_t i, linelen; + return true; +} - for (i = 0, linelen = 0; i < len; i++) { - if (lex->bufstart[i] == '\n') { - uc_source_line_update(lex->source, linelen); - uc_source_line_next(lex->source); +static int +next_char(uc_lexer_t *lex) { + int ch = (lex->rpos < lex->rlen) ? 
(int)lex->rbuf[lex->rpos++] : fill_buf(lex); - linelen = 0; - } - else { - linelen++; - } - } + return update_line(lex, ch); +} + +static uc_token_t * +emit_op(uc_lexer_t *lex, ssize_t pos, int type, uc_value_t *uv) +{ + lex->curr.type = type; + lex->curr.uv = uv; - if (linelen) - uc_source_line_update(lex->source, linelen); + if (pos < 0) + lex->curr.pos = lex->source->off + pos; + else + lex->curr.pos = (size_t)pos; - lex->bufstart += len; - lex->source->off += len; + return &lex->curr; } static uc_token_t * -parse_comment(uc_lexer_t *lex) -{ - const struct token *tok = lex->tok; - const char *ptr, *end; - size_t elen; +emit_buffer(uc_lexer_t *lex, ssize_t pos, int type, const char *strip_trailing_chars) { + uc_token_t *rv = NULL; + + if (lex->buffer.count) { + if (strip_trailing_chars) + while (lex->buffer.count > 0 && strchr(strip_trailing_chars, *uc_vector_last(&lex->buffer))) + lex->buffer.count--; + + rv = emit_op(lex, pos, type, ucv_string_new_length(uc_vector_first(&lex->buffer), lex->buffer.count)); - if (!strcmp(tok->u.pat, "//")) { - end = "\n"; - elen = 1; + uc_vector_clear(&lex->buffer); } - else { - end = "*/"; - elen = 2; + else if (type != TK_TEXT) { + rv = emit_op(lex, pos, type, ucv_string_new_length("", 0)); } - for (ptr = lex->bufstart; ptr < lex->bufend - elen; ptr++) { - if (!strncmp(ptr, end, elen)) { - buf_consume(lex, (ptr - lex->bufstart) + elen); + return rv; +} - return UC_LEX_CONTINUE_PARSING; - } - } - buf_consume(lex, ptr - lex->bufstart); +static uc_token_t * +parse_comment(uc_lexer_t *lex, int kind) +{ + int ch; + + while (true) { + ch = next_char(lex); - if (lex->eof) { - lex->state = UC_LEX_EOF; + if (kind == '/' && (ch == '\n' || ch == EOF)) + break; + + if (kind == '*' && ch == '*' && check_char(lex, '/')) + break; + + if (ch == EOF) { + lex->state = UC_LEX_EOF; - if (elen == 2) return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated comment")); + } } return NULL; @@ -301,238 +200,157 @@ append_utf8(uc_lexer_t 
*lex, int code) { rem = sizeof(ustr); if (utf8enc(&up, &rem, code)) - lookbehind_append(lex, ustr, up - ustr); + for (up = ustr; rem < (int)sizeof(ustr); rem++) + uc_vector_push(&lex->buffer, *up++); } static uc_token_t * -parse_string(uc_lexer_t *lex) +parse_string(uc_lexer_t *lex, int kind) { - const struct token *tok = lex->tok; - char q = tok->u.pat[0]; - char *ptr, *c; - uc_token_t *rv; - int code; + int code, ch, i; + unsigned type; + size_t off; - if (!buf_remaining(lex)) - return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated string")); + if (kind == '`') + type = TK_TEMPLATE; + else if (kind == '/') + type = TK_REGEXP; + else + type = TK_STRING; - for (ptr = lex->bufstart; ptr < lex->bufend; ptr++) { - /* continuation of placeholder start */ - if (lex->is_placeholder) { - if (*ptr == '{') { - buf_consume(lex, 1); - rv = lookbehind_to_text(lex, lex->lastoff, tok->type, NULL); + off = lex->source->off - 1; - if (!rv) - rv = emit_op(lex, lex->lastoff, tok->type, ucv_string_new_length("", 0)); + for (ch = next_char(lex); ch != EOF; ch = next_char(lex)) { + switch (ch) { + /* placeholder */ + case '$': + if (type == TK_TEMPLATE && check_char(lex, '{')) { + lex->state = UC_LEX_PLACEHOLDER_START; - return rv; + return emit_buffer(lex, off, type, NULL); } - lex->is_placeholder = false; - lookbehind_append(lex, "$", 1); - } + uc_vector_push(&lex->buffer, '$'); + break; - /* continuation of escape sequence */ - if (lex->is_escape) { - if (lex->esclen == 0) { - /* non-unicode escape following a lead surrogate, emit replacement... */ - if (lex->lead_surrogate && *ptr != 'u') { - append_utf8(lex, 0xFFFD); - lex->lead_surrogate = 0; - } + /* escape sequence */ + case '\\': + /* unicode escape sequence */ + if (type != TK_REGEXP && check_char(lex, 'u')) { + for (i = 0, code = 0; i < 4; i++) { + ch = next_char(lex); - switch ((q == '/') ? 
0 : *ptr) { - case 'u': - case 'x': - lex->esc[lex->esclen++] = *ptr; - break; + if (!isxdigit(ch)) + return emit_op(lex, -1, TK_ERROR, ucv_string_new("Invalid escape sequence")); - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - lex->esc[lex->esclen++] = 'o'; - lex->esc[lex->esclen++] = *ptr; - break; + code = code * 16 + hex(ch); + } - default: - lex->is_escape = false; - c = strchr("a\ab\be\033f\fn\nr\rt\tv\v", *ptr); + /* is a leading surrogate value */ + if ((code & 0xFC00) == 0xD800) { + /* found a subsequent leading surrogate, ignore and emit replacement char for previous one */ + if (lex->lead_surrogate) + append_utf8(lex, 0xFFFD); - if (c && *c >= 'a') { - lookbehind_append(lex, c + 1, 1); + /* store surrogate value and advance to next escape sequence */ + lex->lead_surrogate = code; + } + + /* is a trailing surrogate value */ + else if ((code & 0xFC00) == 0xDC00) { + /* found a trailing surrogate following a leading one, combine and encode */ + if (lex->lead_surrogate) { + code = 0x10000 + ((lex->lead_surrogate & 0x3FF) << 10) + (code & 0x3FF); + lex->lead_surrogate = 0; } - else { - /* regex mode => retain backslash */ - if (q == '/') - lookbehind_append(lex, "\\", 1); - lookbehind_append(lex, ptr, 1); + /* trailing surrogate not following a leading one, ignore and use replacement char */ + else { + code = 0xFFFD; } - buf_consume(lex, (ptr + 1) - lex->bufstart); + append_utf8(lex, code); + } - break; + /* is a normal codepoint */ + else { + append_utf8(lex, code); } } - else { - switch (lex->esc[0]) { - case 'u': - if (lex->esclen < 5) { - if (!isxdigit(*ptr)) - return emit_op(lex, lex->source->off + lex->esclen + 1, TK_ERROR, ucv_string_new("Invalid escape sequence")); - lex->esc[lex->esclen++] = *ptr; - } - - if (lex->esclen == 5) { - code = hex(lex->esc[1]) * 16 * 16 * 16 + - hex(lex->esc[2]) * 16 * 16 + - hex(lex->esc[3]) * 16 + - hex(lex->esc[4]); - - /* is a leading surrogate value */ - if ((code & 
0xFC00) == 0xD800) { - /* found a subsequent leading surrogate, ignore and emit replacement char for previous one */ - if (lex->lead_surrogate) - append_utf8(lex, 0xFFFD); - - /* store surrogate value and advance to next escape sequence */ - lex->lead_surrogate = code; - } - - /* is a trailing surrogate value */ - else if ((code & 0xFC00) == 0xDC00) { - /* found a trailing surrogate following a leading one, combine and encode */ - if (lex->lead_surrogate) { - code = 0x10000 + ((lex->lead_surrogate & 0x3FF) << 10) + (code & 0x3FF); - lex->lead_surrogate = 0; - } - - /* trailing surrogate not following a leading one, ignore and use replacement char */ - else { - code = 0xFFFD; - } - - append_utf8(lex, code); - } - - /* is a normal codepoint */ - else { - append_utf8(lex, code); - } - - lex->esclen = 0; - lex->is_escape = false; - buf_consume(lex, (ptr + 1) - lex->bufstart); - } + /* hex escape sequence */ + else if (type != TK_REGEXP && check_char(lex, 'x')) { + for (i = 0, code = 0; i < 2; i++) { + ch = next_char(lex); - break; + if (!isxdigit(ch)) + return emit_op(lex, -1, TK_ERROR, ucv_string_new("Invalid escape sequence")); - case 'x': - if (lex->esclen < 3) { - if (!isxdigit(*ptr)) - return emit_op(lex, lex->source->off + lex->esclen + 1, TK_ERROR, ucv_string_new("Invalid escape sequence")); + code = code * 16 + hex(ch); + } - lex->esc[lex->esclen++] = *ptr; - } + append_utf8(lex, code); + } - if (lex->esclen == 3) { - append_utf8(lex, hex(lex->esc[1]) * 16 + hex(lex->esc[2])); + /* octal or letter */ + else { + /* try to parse octal sequence... 
*/ + for (i = 0, code = 0, ch = lookahead_char(lex); + kind != '/' && i < 3 && ch >= '0' && ch <= '7'; + i++, next_char(lex), ch = lookahead_char(lex)) { + code = code * 8 + dec(ch); + } - lex->esclen = 0; - lex->is_escape = false; - buf_consume(lex, (ptr + 1) - lex->bufstart); - } + if (i) { + if (code > 255) + return emit_op(lex, -3, TK_ERROR, ucv_string_new("Invalid escape sequence")); - break; + append_utf8(lex, code); + } - case 'o': - if (lex->esclen < 4) { - /* found a non-octal char */ - if (*ptr < '0' || *ptr > '7') { - /* pad sequence to three chars */ - switch (lex->esclen) { - case 3: - lex->esc[3] = lex->esc[2]; - lex->esc[2] = lex->esc[1]; - lex->esc[1] = '0'; - break; - - case 2: - lex->esc[3] = lex->esc[1]; - lex->esc[2] = '0'; - lex->esc[1] = '0'; - break; - } - - lex->esclen = 4; - buf_consume(lex, ptr-- - lex->bufstart); - } - - /* append */ - else { - lex->esc[lex->esclen++] = *ptr; - buf_consume(lex, (ptr + 1) - lex->bufstart); - } - } + /* ... no octal sequence, handle other escape */ + else { + ch = next_char(lex); - if (lex->esclen == 4) { - code = dec(lex->esc[1]) * 8 * 8 + - dec(lex->esc[2]) * 8 + - dec(lex->esc[3]); + switch (ch) { + case 'a': uc_vector_push(&lex->buffer, '\a'); break; + case 'b': uc_vector_push(&lex->buffer, '\b'); break; + case 'e': uc_vector_push(&lex->buffer, '\033'); break; + case 'f': uc_vector_push(&lex->buffer, '\f'); break; + case 'n': uc_vector_push(&lex->buffer, '\n'); break; + case 'r': uc_vector_push(&lex->buffer, '\r'); break; + case 't': uc_vector_push(&lex->buffer, '\t'); break; + case 'v': uc_vector_push(&lex->buffer, '\v'); break; - if (code > 255) - return emit_op(lex, lex->source->off + lex->esclen + 1, TK_ERROR, ucv_string_new("Invalid escape sequence")); + case EOF: + return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unterminated string")); - append_utf8(lex, code); + default: + /* regex mode => retain backslash */ + if (type == TK_REGEXP) + uc_vector_push(&lex->buffer, '\\'); - lex->esclen = 0; - 
lex->is_escape = false; + uc_vector_push(&lex->buffer, ch); } - - break; } } - } - - /* terminating char */ - else if (*ptr == q) { - lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart); - buf_consume(lex, (ptr + 1) - lex->bufstart); - - rv = lookbehind_to_text(lex, lex->lastoff, tok->type, NULL); - if (!rv) - rv = emit_op(lex, lex->lastoff, tok->type, ucv_string_new_length("", 0)); - - return rv; - } + break; - /* escape sequence start */ - else if (*ptr == '\\') { - lex->is_escape = true; - lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart); - buf_consume(lex, (ptr - lex->bufstart) + 1); - } + /* other character */ + default: + /* terminating delimitter */ + if (ch == kind) + return emit_buffer(lex, off, type, NULL); - /* potential placeholder start */ - else if (q == '`' && *ptr == '$') { - lex->is_placeholder = true; - lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart); - buf_consume(lex, (ptr - lex->bufstart) + 1); + uc_vector_push(&lex->buffer, ch); } } - lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart); - buf_consume(lex, ptr - lex->bufstart); + // FIXME + lex->state = UC_LEX_EOF; - return NULL; + return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated string")); } @@ -563,76 +381,31 @@ parse_regexp(uc_lexer_t *lex) size_t len; char *s; - switch (lex->esc[0]) { - case UC_LEX_PARSE_REGEX_INIT: - if (lex->no_regexp) { - if (buf_startswith(lex, "=")) { - buf_consume(lex, 1); - - return emit_op(lex, lex->source->off, TK_ASDIV, NULL); - } - - return emit_op(lex, lex->source->off, TK_DIV, NULL); - } - - lex->esc[0] = UC_LEX_PARSE_REGEX_PATTERN; - break; - - case UC_LEX_PARSE_REGEX_PATTERN: - rv = parse_string(lex); - - if (rv && rv->type == TK_ERROR) - return rv; + rv = parse_string(lex, '/'); - if (rv != NULL && rv != UC_LEX_CONTINUE_PARSING) { - lex->lookbehind = (char *)rv; - lex->esc[0] = UC_LEX_PARSE_REGEX_FLAGS; - } - - break; - - case UC_LEX_PARSE_REGEX_FLAGS: - rv = (uc_token_t *)lex->lookbehind; - - while 
(lex->bufstart < lex->bufend || lex->eof) { - switch (lex->eof ? EOF : lex->bufstart[0]) { - case 'g': - buf_consume(lex, 1); + if (rv->type == TK_REGEXP) { + while (true) { + if (check_char(lex, 'g')) is_reg_global = true; - break; - - case 'i': - buf_consume(lex, 1); + else if (check_char(lex, 'i')) is_reg_icase = true; - break; - - case 's': - buf_consume(lex, 1); + else if (check_char(lex, 's')) is_reg_newline = true; + else break; - - default: - lex->lookbehind = NULL; - - len = xasprintf(&s, "%c%*s", - (is_reg_global << 0) | (is_reg_icase << 1) | (is_reg_newline << 2), - ucv_string_length(rv->uv), - ucv_string_get(rv->uv)); - - ucv_free(rv->uv, false); - rv->uv = ucv_string_new_length(s, len); - free(s); - - rv->type = TK_REGEXP; - - return rv; - } } - break; + len = xasprintf(&s, "%c%*s", + (is_reg_global << 0) | (is_reg_icase << 1) | (is_reg_newline << 2), + ucv_string_length(rv->uv), + ucv_string_get(rv->uv)); + + ucv_free(rv->uv, false); + rv->uv = ucv_string_new_length(s, len); + free(s); } - return NULL; + return rv; } @@ -647,37 +420,34 @@ parse_regexp(uc_lexer_t *lex) */ static uc_token_t * -parse_label(uc_lexer_t *lex) +parse_label(uc_lexer_t *lex, int ch) { - const struct token *tok = lex->tok; const struct keyword *word; - char *ptr; - size_t i; - - if (!lex->lookbehind && tok->plen) - lookbehind_append(lex, tok->u.pat, tok->plen); + size_t i, len; - if (!buf_remaining(lex) || (lex->bufstart[0] != '_' && !isalnum(lex->bufstart[0]))) { - if (lex->no_keyword == false) { - for (i = 0, word = &reserved_words[0]; i < ARRAY_SIZE(reserved_words); i++, word = &reserved_words[i]) { - if (lex->lookbehind && lex->lookbehindlen == word->plen && !strncmp(lex->lookbehind, word->pat, word->plen)) { - lookbehind_reset(lex); + while (true) { + uc_vector_push(&lex->buffer, ch); + ch = lookahead_char(lex); - return emit_op(lex, lex->source->off - word->plen, word->type, NULL); - } - } - } + if (!isalnum(ch) && ch != '_') + break; - return lookbehind_to_text(lex, 
lex->source->off - lex->lookbehindlen, TK_LABEL, NULL); + next_char(lex); } - for (ptr = lex->bufstart; ptr < lex->bufend && (*ptr == '_' || isalnum(*ptr)); ptr++) - ; + len = lex->buffer.count; - lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart); - buf_consume(lex, ptr - lex->bufstart); + if (!lex->no_keyword) { + for (i = 0, word = &reserved_words[0]; i < ARRAY_SIZE(reserved_words); i++, word = &reserved_words[i]) { + if (lex->buffer.count == word->plen && !strncmp(uc_vector_first(&lex->buffer), word->pat, word->plen)) { + uc_vector_clear(&lex->buffer); - return NULL; + return emit_op(lex, -len, word->type, NULL); + } + } + } + + return emit_buffer(lex, -len, TK_LABEL, NULL); } @@ -694,7 +464,7 @@ parse_label(uc_lexer_t *lex) static inline bool is_numeric_char(uc_lexer_t *lex, char c) { - char prev = lex->lookbehindlen ? lex->lookbehind[lex->lookbehindlen-1] : 0; + char prev = lex->buffer.count ? *uc_vector_last(&lex->buffer) : 0; switch (c|32) { case '.': @@ -731,380 +501,507 @@ is_numeric_char(uc_lexer_t *lex, char c) } static uc_token_t * -parse_number(uc_lexer_t *lex) +parse_number(uc_lexer_t *lex, int ch) { - uc_token_t *rv = NULL; uc_value_t *nv = NULL; - const char *ptr; + size_t len; char *e; - if (!buf_remaining(lex) || !is_numeric_char(lex, lex->bufstart[0])) { - lookbehind_append(lex, "\0", 1); - - nv = uc_number_parse_octal(lex->lookbehind, &e); + while (true) { + uc_vector_push(&lex->buffer, ch); + ch = lookahead_char(lex); - switch (ucv_type(nv)) { - case UC_DOUBLE: - rv = emit_op(lex, lex->source->off - (e - lex->lookbehind), TK_DOUBLE, nv); + if (!is_numeric_char(lex, ch)) break; - case UC_INTEGER: - rv = emit_op(lex, lex->source->off - (e - lex->lookbehind), TK_NUMBER, nv); - break; + next_char(lex); + } - default: - rv = emit_op(lex, lex->source->off - (lex->lookbehindlen - (e - lex->lookbehind) - 1), TK_ERROR, ucv_string_new("Invalid number literal")); - } + len = lex->buffer.count; - lookbehind_reset(lex); + 
uc_vector_push(&lex->buffer, '\0'); - return rv; - } + nv = uc_number_parse_octal(uc_vector_first(&lex->buffer), &e); - for (ptr = lex->bufstart; ptr < lex->bufend && is_numeric_char(lex, *ptr); ptr++) - ; + uc_vector_clear(&lex->buffer); - lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart); - buf_consume(lex, ptr - lex->bufstart); + switch (ucv_type(nv)) { + case UC_DOUBLE: + return emit_op(lex, -len, TK_DOUBLE, nv); - return NULL; + case UC_INTEGER: + return emit_op(lex, -len, TK_NUMBER, nv); + + default: + return emit_op(lex, -len, TK_ERROR, ucv_string_new("Invalid number literal")); + } } static uc_token_t * -lex_step(uc_lexer_t *lex, FILE *fp) +lex_find_token(uc_lexer_t *lex) { - uint32_t masks[] = { 0, le32toh(0x000000ff), le32toh(0x0000ffff), le32toh(0x00ffffff), le32toh(0xffffffff) }; - union { uint32_t n; char str[4]; } search; - const struct token *tok; - size_t rlen, rem, *nest; - char *ptr, c; - uc_token_t *rv; - size_t i; + bool tpl = !(lex->config && lex->config->raw_mode); + int ch = next_char(lex); + + while (isspace(ch)) + ch = next_char(lex); + + switch (ch) { + case '~': + return emit_op(lex, -1, TK_COMPL, NULL); - /* only less than UC_LEX_MAX_TOKEN_LEN unread buffer chars remaining, - * move the remaining bytes to the beginning and read more data */ - if (buf_remaining(lex) < UC_LEX_MAX_TOKEN_LEN) { - if (!lex->buf) { - lex->buflen = 128; - lex->buf = xalloc(lex->buflen); + case '}': + if (tpl && check_char(lex, '}')) + return emit_op(lex, -2, TK_REXP, NULL); + + return emit_op(lex, -1, TK_RBRACE, NULL); + + case '|': + if (check_char(lex, '|')) { + if (check_char(lex, '=')) + return emit_op(lex, -3, TK_ASOR, NULL); + + return emit_op(lex, -2, TK_OR, NULL); } - rem = lex->bufend - lex->bufstart; + if (check_char(lex, '=')) + return emit_op(lex, -2, TK_ASBOR, NULL); - if (rem) - memcpy(lex->buf, lex->bufstart, rem); + return emit_op(lex, -1, TK_BOR, NULL); - rlen = fread(lex->buf + rem, 1, lex->buflen - rem, fp); + case '{': + if (tpl && 
check_char(lex, '{')) + return emit_op(lex, -2, TK_LEXP, NULL); - lex->bufstart = lex->buf; - lex->bufend = lex->buf + rlen + rem; + if (tpl && check_char(lex, '%')) + return emit_op(lex, -2, TK_LSTM, NULL); - if (rlen == 0 && (ferror(fp) || feof(fp))) - lex->eof = 1; - } + return emit_op(lex, -1, TK_LBRACE, NULL); - switch (lex->state) { - case UC_LEX_IDENTIFY_BLOCK: - /* previous block had strip trailing whitespace flag, skip leading whitespace */ - if (lex->modifier == MINUS) { - while (buf_remaining(lex) && isspace(lex->bufstart[0])) - buf_consume(lex, 1); + case '^': + if (check_char(lex, '=')) + return emit_op(lex, -2, TK_ASBXOR, NULL); - lex->modifier = UNSPEC; + return emit_op(lex, -1, TK_BXOR, NULL); + + case '[': + return emit_op(lex, -1, TK_LBRACK, NULL); + + case ']': + return emit_op(lex, -1, TK_RBRACK, NULL); + + case '?': + if (check_char(lex, '?')) { + if (check_char(lex, '=')) + return emit_op(lex, -3, TK_ASNULLISH, NULL); + + return emit_op(lex, -2, TK_NULLISH, NULL); } - /* previous block was a statement block and trim_blocks is enabld, skip leading newline */ - else if (lex->modifier == NEWLINE) { - if (buf_startswith(lex, "\n")) - buf_consume(lex, 1); + if (check_char(lex, '.')) { + if (check_char(lex, '[')) + return emit_op(lex, -3, TK_QLBRACK, NULL); + + if (check_char(lex, '(')) + return emit_op(lex, -3, TK_QLPAREN, NULL); - lex->modifier = UNSPEC; + return emit_op(lex, -2, TK_QDOT, NULL); } - /* scan forward through buffer to identify start token */ - for (ptr = lex->bufstart; ptr < lex->bufend - strlen("{#"); ptr++) { - /* found start of comment block */ - if (!strncmp(ptr, "{#", 2)) { - lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart); - buf_consume(lex, (ptr + 2) - lex->bufstart); - lex->lastoff = lex->source->off - 2; - lex->state = UC_LEX_BLOCK_COMMENT_START; + return emit_op(lex, lex->source->off, TK_QMARK, NULL); - return NULL; - } + case '>': + if (check_char(lex, '>')) { + if (check_char(lex, '=')) + return emit_op(lex, 
-3, TK_ASRIGHT, NULL); - /* found start of expression block */ - else if (!strncmp(ptr, "{{", 2)) { - lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart); - buf_consume(lex, (ptr + 2) - lex->bufstart); - lex->lastoff = lex->source->off - 2; - lex->state = UC_LEX_BLOCK_EXPRESSION_START; + return emit_op(lex, -2, TK_RSHIFT, NULL); + } - return NULL; - } + if (check_char(lex, '=')) + return emit_op(lex, -2, TK_GE, NULL); - /* found start of statement block */ - else if (!strncmp(ptr, "{%", 2)) { - lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart); - buf_consume(lex, (ptr + 2) - lex->bufstart); - lex->lastoff = lex->source->off - 2; - lex->state = UC_LEX_BLOCK_STATEMENT_START; + return emit_op(lex, -1, TK_GT, NULL); - return NULL; - } + case '=': + if (check_char(lex, '=')) { + if (check_char(lex, '=')) + return emit_op(lex, -3, TK_EQS, NULL); + + return emit_op(lex, -2, TK_EQ, NULL); } - /* we're at eof */ - if (lex->eof) { - lookbehind_append(lex, ptr, lex->bufend - ptr); - lex->state = UC_LEX_EOF; + if (check_char(lex, '>')) + return emit_op(lex, -2, TK_ARROW, NULL); + + return emit_op(lex, -1, TK_ASSIGN, NULL); + + case '<': + if (check_char(lex, '<')) { + if (check_char(lex, '=')) + return emit_op(lex, -3, TK_ASLEFT, NULL); - return lookbehind_to_text(lex, lex->lastoff, TK_TEXT, NULL); + return emit_op(lex, -2, TK_LSHIFT, NULL); } - lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart); - buf_consume(lex, ptr - lex->bufstart); - break; + if (check_char(lex, '=')) + return emit_op(lex, -2, TK_LE, NULL); + + return emit_op(lex, -1, TK_LT, NULL); + + case ';': + return emit_op(lex, -1, TK_SCOL, NULL); + case ':': + return emit_op(lex, -1, TK_COLON, NULL); - case UC_LEX_BLOCK_COMMENT_START: - case UC_LEX_BLOCK_EXPRESSION_START: - case UC_LEX_BLOCK_STATEMENT_START: - rv = NULL; - lex->modifier = UNSPEC; + case '/': + ch = lookahead_char(lex); + lex->lastoff = lex->source->off - 1; - /* strip whitespace before block */ - if (buf_startswith(lex, 
"-")) { - rv = lookbehind_to_text(lex, lex->source->off, TK_TEXT, " \n\t\v\f\r"); - buf_consume(lex, 1); + if (ch == '/' || ch == '*') + return parse_comment(lex, ch); + + if (lex->no_regexp) { + if (check_char(lex, '=')) + return emit_op(lex, -2, TK_ASDIV, NULL); + + return emit_op(lex, -1, TK_DIV, NULL); } - /* disable lstrip flag (only valid for statement blocks) */ - else if (lex->state == UC_LEX_BLOCK_STATEMENT_START) { - /* disable lstrip flag */ - if (buf_startswith(lex, "+")) { - rv = lookbehind_to_text(lex, lex->source->off, TK_TEXT, NULL); - buf_consume(lex, 1); - } + return parse_regexp(lex); - /* put out text leading up to the opening tag and potentially - * strip trailing white space from it depending on the global - * block lstrip setting */ - else { - rv = lookbehind_to_text(lex, lex->source->off, TK_TEXT, - (lex->config && lex->config->lstrip_blocks) ? " \t\v\f\r" : NULL); + case '.': + if (check_char(lex, '.')) { + if (check_char(lex, '.')) + return emit_op(lex, -3, TK_ELLIP, NULL); + + /* The sequence ".." 
cannot be a valid */ + return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unexpected character")); + } + + return emit_op(lex, -1, TK_DOT, NULL); + + case '-': + if (tpl && check_char(lex, '}')) { + if (check_char(lex, '}')) { + lex->modifier = MINUS; + + return emit_op(lex, -3, TK_REXP, NULL); } + + /* The sequence "-}" cannot be a valid */ + return emit_op(lex, -1, TK_ERROR, ucv_string_new("Unexpected character")); } - else { - rv = lookbehind_to_text(lex, lex->source->off, TK_TEXT, NULL); + + if (tpl && check_char(lex, '%')) { + if (check_char(lex, '}')) { + lex->modifier = MINUS; + + return emit_op(lex, -3, TK_RSTM, NULL); + } + + /* The sequence "-%" cannot be a valid */ + return emit_op(lex, -1, TK_ERROR, ucv_string_new("Unexpected character")); } - switch (lex->state) { - case UC_LEX_BLOCK_COMMENT_START: - lex->state = UC_LEX_BLOCK_COMMENT; - lex->block = COMMENT; - break; + if (check_char(lex, '=')) + return emit_op(lex, -2, TK_ASSUB, NULL); - case UC_LEX_BLOCK_STATEMENT_START: - lex->state = UC_LEX_IDENTIFY_TOKEN; - lex->block = STATEMENTS; - break; + if (check_char(lex, '-')) + return emit_op(lex, -2, TK_DEC, NULL); - case UC_LEX_BLOCK_EXPRESSION_START: - lex->state = UC_LEX_BLOCK_EXPRESSION_EMIT_TAG; - break; + return emit_op(lex, -1, TK_SUB, NULL); - default: - break; + case ',': + return emit_op(lex, -1, TK_COMMA, NULL); + + case '+': + if (check_char(lex, '=')) + return emit_op(lex, -2, TK_ASADD, NULL); + + if (check_char(lex, '+')) + return emit_op(lex, -2, TK_INC, NULL); + + return emit_op(lex, -1, TK_ADD, NULL); + + case '*': + if (check_char(lex, '*')) { + if (check_char(lex, '=')) + return emit_op(lex, -3, TK_ASEXP, NULL); + + return emit_op(lex, -2, TK_EXP, NULL); } - return rv; + if (check_char(lex, '=')) + return emit_op(lex, -2, TK_ASMUL, NULL); + return emit_op(lex, -1, TK_MUL, NULL); - case UC_LEX_BLOCK_COMMENT: - /* scan forward through buffer to identify end token */ - while (lex->bufstart < lex->bufend - 2) { - if (buf_startswith(lex, 
"-#}")) { - lex->state = UC_LEX_IDENTIFY_BLOCK; - lex->modifier = MINUS; - buf_consume(lex, 3); - lex->lastoff = lex->source->off; - break; - } - else if (buf_startswith(lex, "#}")) { - lex->state = UC_LEX_IDENTIFY_BLOCK; - buf_consume(lex, 2); - lex->lastoff = lex->source->off; - break; - } + case '(': + return emit_op(lex, -1, TK_LPAREN, NULL); + + case ')': + return emit_op(lex, -1, TK_RPAREN, NULL); - buf_consume(lex, 1); + case '\'': + case '"': + case '`': + lex->lastoff = lex->source->off - 1; + + return parse_string(lex, ch); + + case '&': + if (check_char(lex, '&')) { + if (check_char(lex, '=')) + return emit_op(lex, -3, TK_ASAND, NULL); + + return emit_op(lex, -2, TK_AND, NULL); } - /* we're at eof */ - if (lex->eof) { - lex->state = UC_LEX_EOF; + if (check_char(lex, '=')) + return emit_op(lex, -2, TK_ASBAND, NULL); + + return emit_op(lex, -1, TK_BAND, NULL); - buf_consume(lex, lex->bufend - lex->bufstart); + case '%': + if (tpl && check_char(lex, '}')) + return emit_op(lex, -2, TK_RSTM, NULL); - return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated template block")); + if (check_char(lex, '=')) + return emit_op(lex, -2, TK_ASMOD, NULL); + + return emit_op(lex, -1, TK_MOD, NULL); + + case '!': + if (check_char(lex, '=')) { + if (check_char(lex, '=')) + return emit_op(lex, -3, TK_NES, NULL); + + return emit_op(lex, -2, TK_NE, NULL); } - break; + return emit_op(lex, -1, TK_NOT, NULL); + case EOF: + return emit_op(lex, -1, TK_EOF, NULL); - case UC_LEX_BLOCK_EXPRESSION_EMIT_TAG: - lex->state = UC_LEX_IDENTIFY_TOKEN; - lex->block = EXPRESSION; + default: + if (isalpha(ch) || ch == '_') + return parse_label(lex, ch); - return emit_op(lex, lex->source->off, TK_LEXP, NULL); + if (isdigit(ch)) + return parse_number(lex, ch); + return emit_op(lex, -1, TK_ERROR, ucv_string_new("Unexpected character")); + } +} - case UC_LEX_IDENTIFY_TOKEN: - /* skip leading whitespace */ - for (i = 0; i < buf_remaining(lex) && isspace(lex->bufstart[i]); i++) - ; 
+static uc_token_t * +lex_step(uc_lexer_t *lex) +{ + const char *strip = NULL; + uc_token_t *tok; + size_t *nest; + int ch; - buf_consume(lex, i); + while (lex->state != UC_LEX_EOF) { + switch (lex->state) { + case UC_LEX_IDENTIFY_BLOCK: + ch = next_char(lex); - if (i > 0 && buf_remaining(lex) < UC_LEX_MAX_TOKEN_LEN) - return NULL; + /* previous block had strip trailing whitespace flag, skip leading whitespace */ + if (lex->modifier == MINUS) { + while (isspace(ch)) + ch = next_char(lex); - for (i = 0; i < sizeof(search.str); i++) - search.str[i] = (i < buf_remaining(lex)) ? lex->bufstart[i] : 0; + lex->modifier = UNSPEC; + } - for (i = 0, tok = tokens; i < ARRAY_SIZE(tokens); tok = &tokens[++i]) { - /* remaining buffer data is shorter than token, skip */ - if (tok->plen > buf_remaining(lex)) - continue; + /* previous block was a statement block and trim_blocks is enabled, skip leading newline */ + else if (lex->modifier == NEWLINE) { + if (ch == '\n') + ch = next_char(lex); - c = buf_remaining(lex) ? lex->bufstart[0] : 0; + lex->modifier = UNSPEC; + } - if (tok->plen ? 
((search.n & masks[tok->plen]) == tok->u.patn) - : (c >= tok->u.pat[0] && c <= tok->u.pat[1])) { - lex->lastoff = lex->source->off; + /* scan forward through buffer to identify block start token */ + while (ch != EOF) { + if (ch == '{') { + ch = next_char(lex); - /* token has a parse method, switch state */ - if (tok->parse) { - lex->tok = tok; - lex->state = UC_LEX_PARSE_TOKEN; + switch (ch) { + /* found start of comment block */ + case '#': + lex->state = UC_LEX_BLOCK_COMMENT; + lex->block = COMMENT; - buf_consume(lex, tok->plen); + if (check_char(lex, '-')) + strip = " \n\t\v\f\r"; - return NULL; - } + break; - /* in raw code mode, ignore template tag tokens */ - if (lex->config && lex->config->raw_mode && - (tok->type == TK_LSTM || tok->type == TK_RSTM || - tok->type == TK_LEXP || tok->type == TK_REXP)) { - continue; - } + /* found start of expression block */ + case '{': + lex->state = UC_LEX_BLOCK_EXPRESSION_EMIT_TAG; - /* disallow nesting blocks */ - if (tok->type == TK_LSTM || tok->type == TK_LEXP) { - buf_consume(lex, tok->plen); + if (check_char(lex, '-')) + strip = " \n\t\v\f\r"; - return emit_op(lex, lex->source->off - tok->plen, TK_ERROR, ucv_string_new("Template blocks may not be nested")); - } + break; - /* found end of block */ - else if ((lex->block == STATEMENTS && tok->type == TK_RSTM) || - (lex->block == EXPRESSION && tok->type == TK_REXP)) { - /* strip whitespace after block */ - if (tok->u.pat[0] == '-') - lex->modifier = MINUS; + /* found start of statement block */ + case '%': + lex->state = UC_LEX_IDENTIFY_TOKEN; + lex->block = STATEMENTS; - /* strip newline after statement block */ - else if (lex->block == STATEMENTS && - lex->config && lex->config->trim_blocks) - lex->modifier = NEWLINE; + if (check_char(lex, '-')) + strip = " \n\t\v\f\r"; + else if (check_char(lex, '+')) + strip = NULL; + else if (lex->config && lex->config->lstrip_blocks) + strip = " \t\v\f\r"; - lex->state = UC_LEX_IDENTIFY_BLOCK; - lex->block = NONE; - } + break; + + 
default: + /* not a start tag, remember char and move on */ + uc_vector_push(&lex->buffer, '{'); + continue; + } - /* track opening braces */ - else if (tok->type == TK_LBRACE && lex->templates.count > 0) { - nest = uc_vector_last(&lex->templates); - (*nest)++; + break; } - /* check end of placeholder expression */ - else if (tok->type == TK_RBRACE && lex->templates.count > 0) { - nest = uc_vector_last(&lex->templates); + uc_vector_push(&lex->buffer, ch); + ch = next_char(lex); + } - if (*nest == 0) { - lex->templates.count--; - lex->state = UC_LEX_PARSE_TOKEN; - lex->tok = &tokens[ARRAY_SIZE(tokens) - 1]; /* NB: TK_TEMPLATE token spec */ - } - else { - (*nest)--; - } + if (ch == EOF) + lex->state = UC_LEX_EOF; + + /* push out leading text */ + tok = emit_buffer(lex, lex->lastoff, TK_TEXT, strip); + lex->lastoff = lex->source->off - 2; + + if (!tok) + continue; + + return tok; + + + case UC_LEX_BLOCK_COMMENT: + ch = next_char(lex); + + /* scan forward through buffer to identify end token */ + while (ch != EOF) { + if (ch == '-' && check_char(lex, '#') && check_char(lex, '}')) { + lex->modifier = MINUS; + break; } - /* do not report statement tags to the parser */ - if (tok->type != 0 && tok->type != TK_LSTM) - rv = emit_op(lex, lex->source->off, - (tok->type == TK_RSTM) ? 
TK_SCOL : tok->type, NULL); - else - rv = NULL; + if (ch == '#' && check_char(lex, '}')) + break; + + ch = next_char(lex); + } - buf_consume(lex, tok->plen); + if (ch == EOF) { + lex->state = UC_LEX_EOF; - return rv; + return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated template block")); } - } - /* no possible return beyond this point can advance, - mark lex state as eof */ - lex->state = UC_LEX_EOF; + lex->lastoff = lex->source->off; + lex->state = UC_LEX_IDENTIFY_BLOCK; - /* no token matched and we do have remaining data, junk */ - if (buf_remaining(lex)) - return emit_op(lex, lex->source->off, TK_ERROR, ucv_string_new("Unexpected character")); + continue; - /* we're at eof, allow unclosed statement blocks */ - if (lex->block == STATEMENTS) - return NULL; - /* premature EOF */ - return emit_op(lex, lex->source->off, TK_ERROR, ucv_string_new("Unterminated template block")); + case UC_LEX_BLOCK_EXPRESSION_EMIT_TAG: + lex->state = UC_LEX_IDENTIFY_TOKEN; + lex->block = EXPRESSION; + return emit_op(lex, lex->source->off, TK_LEXP, NULL); - case UC_LEX_PARSE_TOKEN: - tok = lex->tok; - rv = tok->parse(lex); - if (rv) { - memset(lex->esc, 0, sizeof(lex->esc)); - lex->state = lex->is_placeholder ? UC_LEX_PLACEHOLDER : UC_LEX_IDENTIFY_TOKEN; - lex->is_placeholder = false; - lex->tok = NULL; + case UC_LEX_IDENTIFY_TOKEN: + do { tok = lex_find_token(lex); } while (tok == NULL); - if (rv == UC_LEX_CONTINUE_PARSING) - rv = NULL; + /* disallow nesting blocks */ + if (tok->type == TK_LSTM || tok->type == TK_LEXP) + return emit_op(lex, -2, TK_ERROR, ucv_string_new("Template blocks may not be nested")); - return rv; - } + /* found end of statement block */ + if (lex->block == STATEMENTS && tok->type == TK_RSTM) { + /* strip newline after statement block? 
*/ + if (lex->modifier == UNSPEC && lex->config && lex->config->trim_blocks) + lex->modifier = NEWLINE; - break; + lex->lastoff = lex->source->off; + lex->state = UC_LEX_IDENTIFY_BLOCK; + lex->block = NONE; + tok = emit_op(lex, -2, TK_SCOL, NULL); + } - case UC_LEX_PLACEHOLDER: - lex->state = UC_LEX_IDENTIFY_TOKEN; + /* found end of expression block */ + else if (lex->block == EXPRESSION && tok->type == TK_REXP) { + lex->lastoff = lex->source->off; + lex->state = UC_LEX_IDENTIFY_BLOCK; + lex->block = NONE; + } + + /* track opening braces */ + else if (tok->type == TK_LBRACE && lex->templates.count > 0) { + nest = uc_vector_last(&lex->templates); + (*nest)++; + } + + /* check end of placeholder expression */ + else if (tok->type == TK_RBRACE && lex->templates.count > 0) { + nest = uc_vector_last(&lex->templates); + + if (*nest == 0) { + lex->templates.count--; + lex->state = UC_LEX_PLACEHOLDER_END; + } + else { + (*nest)--; + } + } + + /* premature EOF? */ + else if (tok->type == TK_EOF && lex->block != STATEMENTS) { + lex->state = UC_LEX_EOF; + + return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unterminated template block")); + } - uc_vector_push(&lex->templates, 0); + return tok; - return emit_op(lex, lex->source->off, TK_PLACEH, NULL); + case UC_LEX_PLACEHOLDER_START: + lex->state = UC_LEX_IDENTIFY_TOKEN; + + uc_vector_push(&lex->templates, 0); + + return emit_op(lex, -2, TK_PLACEH, NULL); - case UC_LEX_EOF: - break; + + case UC_LEX_PLACEHOLDER_END: + lex->state = UC_LEX_IDENTIFY_TOKEN; + + return parse_string(lex, '`'); + + + case UC_LEX_EOF: + break; + } } - return NULL; + return emit_op(lex, lex->source->off, TK_EOF, NULL); } void @@ -1115,24 +1012,15 @@ uc_lexer_init(uc_lexer_t *lex, uc_parse_config_t *config, uc_source_t *source) lex->config = config; lex->source = uc_source_get(source); - lex->eof = 0; - lex->is_escape = 0; - lex->block = NONE; lex->modifier = UNSPEC; - lex->buflen = 0; - lex->buf = NULL; - lex->bufstart = NULL; - lex->bufend = NULL; - - 
lex->lookbehindlen = 0; - lex->lookbehind = NULL; + lex->rlen = 0; + lex->rpos = 0; + lex->rbuf = NULL; - lex->tok = NULL; - - lex->esclen = 0; - memset(lex->esc, 0, sizeof(lex->esc)); + lex->buffer.count = 0; + lex->buffer.entries = NULL; lex->lead_surrogate = 0; @@ -1150,11 +1038,12 @@ uc_lexer_init(uc_lexer_t *lex, uc_parse_config_t *config, uc_source_t *source) void uc_lexer_free(uc_lexer_t *lex) { + uc_vector_clear(&lex->buffer); uc_vector_clear(&lex->templates); + uc_source_put(lex->source); - free(lex->lookbehind); - free(lex->buf); + free(lex->rbuf); } uc_token_t * @@ -1162,47 +1051,94 @@ uc_lexer_next_token(uc_lexer_t *lex) { uc_token_t *rv = NULL; - while (lex->state != UC_LEX_EOF) { - rv = lex_step(lex, lex->source->fp); - - if (rv != NULL) - break; - } - - if (rv) { - lex->no_keyword = false; - lex->no_regexp = false; + rv = lex_step(lex); - return rv; - } + lex->no_keyword = false; + lex->no_regexp = false; - return emit_op(lex, lex->source->off, TK_EOF, NULL); + return rv; } const char * uc_tokenname(unsigned type) { static char buf[sizeof("'endfunction'")]; - size_t i; - - switch (type) { - case 0: return "End of file"; - case TK_TEMPLATE: return "Template"; - case TK_STRING: return "String"; - case TK_LABEL: return "Label"; - case TK_NUMBER: return "Number"; - case TK_DOUBLE: return "Double"; - case TK_REGEXP: return "Regexp"; - } - - for (i = 0; i < ARRAY_SIZE(tokens); i++) { - if (tokens[i].type != type) - continue; + const char *tokennames[] = { + [TK_LEXP] = "'{{'", + [TK_REXP] = "'}}'", + [TK_LSTM] = "'{%'", + [TK_RSTM] = "'%}'", + [TK_COMMA] = "','", + [TK_ASSIGN] = "'='", + [TK_ASADD] = "'+='", + [TK_ASSUB] = "'-='", + [TK_ASMUL] = "'*='", + [TK_ASDIV] = "'/='", + [TK_ASMOD] = "'%='", + [TK_ASLEFT] = "'<<='", + [TK_ASRIGHT] = "'>>='", + [TK_ASBAND] = "'&='", + [TK_ASBXOR] = "'^='", + [TK_ASBOR] = "'|='", + [TK_QMARK] = "'?'", + [TK_COLON] = "':'", + [TK_OR] = "'||'", + [TK_AND] = "'&&'", + [TK_BOR] = "'|'", + [TK_BXOR] = "'^'", + [TK_BAND] = 
"'&'", + [TK_EQS] = "'==='", + [TK_NES] = "'!=='", + [TK_EQ] = "'=='", + [TK_NE] = "'!='", + [TK_LT] = "'<'", + [TK_LE] = "'<='", + [TK_GT] = "'>'", + [TK_GE] = "'>='", + [TK_LSHIFT] = "'<<'", + [TK_RSHIFT] = "'>>'", + [TK_ADD] = "'+'", + [TK_SUB] = "'-'", + [TK_MUL] = "'*'", + [TK_DIV] = "'/'", + [TK_MOD] = "'%'", + [TK_EXP] = "'**'", + [TK_NOT] = "'!'", + [TK_COMPL] = "'~'", + [TK_INC] = "'++'", + [TK_DEC] = "'--'", + [TK_DOT] = "'.'", + [TK_LBRACK] = "'['", + [TK_RBRACK] = "']'", + [TK_LPAREN] = "'('", + [TK_RPAREN] = "')'", + [TK_LBRACE] = "'{'", + [TK_RBRACE] = "'}'", + [TK_SCOL] = "';'", + [TK_ELLIP] = "'...'", + [TK_ARROW] = "'=>'", + [TK_QLBRACK] = "'?.['", + [TK_QLPAREN] = "'?.('", + [TK_QDOT] = "'?.'", + [TK_ASEXP] = "'**='", + [TK_ASAND] = "'&&='", + [TK_ASOR] = "'||='", + [TK_ASNULLISH] = "'\?\?='", + [TK_NULLISH] = "'\?\?'", + [TK_PLACEH] = "'${'", + + [TK_TEXT] = "Text", + [TK_LABEL] = "Label", + [TK_NUMBER] = "Number", + [TK_DOUBLE] = "Double", + [TK_STRING] = "String", + [TK_REGEXP] = "Regexp", + [TK_TEMPLATE] = "Template", + [TK_ERROR] = "Error", + [TK_EOF] = "End of file", + }; - snprintf(buf, sizeof(buf), "'%s'", tokens[i].u.pat); - - return buf; - } + size_t i; for (i = 0; i < ARRAY_SIZE(reserved_words); i++) { if (reserved_words[i].type != type) @@ -1213,7 +1149,7 @@ uc_tokenname(unsigned type) return buf; } - return "?"; + return tokennames[type] ? tokennames[type] : "?"; } bool |