summaryrefslogtreecommitdiffhomepage
path: root/lexer.c
diff options
context:
space:
mode:
authorJo-Philipp Wich <jo@mein.io>2022-07-14 14:33:12 +0200
committerJo-Philipp Wich <jo@mein.io>2022-07-28 13:18:30 +0200
commit03c8e4b465c8cffd2596d2741b29ad2ba4ec1765 (patch)
tree6a43c9f54be5e3de4fcbc73b5ebaa518e642d3ad /lexer.c
parent1219d7efa170bf38fb1bf6a10fa0d1f96e62f091 (diff)
lexer: rewrite token scanner
- Use nested switches instead of lookup tables to detect tokens
- Simplify input buffer logic
- Reduce amount of intermediate states

Signed-off-by: Jo-Philipp Wich <jo@mein.io>
Diffstat (limited to 'lexer.c')
-rw-r--r--lexer.c1464
1 files changed, 700 insertions, 764 deletions
diff --git a/lexer.c b/lexer.c
index 5be8ece..574c051 100644
--- a/lexer.c
+++ b/lexer.c
@@ -29,24 +29,12 @@
#include "ucode/lib.h"
#include "ucode/lexer.h"
-#define UC_LEX_CONTINUE_PARSING (void *)1
-
struct keyword {
unsigned type;
const char *pat;
unsigned plen;
};
-struct token {
- unsigned type;
- union {
- uint32_t patn;
- char pat[4];
- } u;
- unsigned plen;
- uc_token_t *(*parse)(uc_lexer_t *);
-};
-
#define dec(o) \
((o) - '0')
@@ -56,94 +44,6 @@ struct token {
#ifndef NO_COMPILE
-static uc_token_t *parse_comment(uc_lexer_t *);
-static uc_token_t *parse_string(uc_lexer_t *);
-static uc_token_t *parse_regexp(uc_lexer_t *);
-static uc_token_t *parse_number(uc_lexer_t *);
-static uc_token_t *parse_label(uc_lexer_t *);
-
-static const struct token tokens[] = {
- { TK_ASLEFT, { .pat = "<<=" }, 3, NULL },
- { TK_ASRIGHT, { .pat = ">>=" }, 3, NULL },
- { TK_LEXP, { .pat = "{{-" }, 3, NULL },
- { TK_REXP, { .pat = "-}}" }, 3, NULL },
- { TK_LSTM, { .pat = "{%+" }, 3, NULL },
- { TK_LSTM, { .pat = "{%-" }, 3, NULL },
- { TK_RSTM, { .pat = "-%}" }, 3, NULL },
- { TK_EQS, { .pat = "===" }, 3, NULL },
- { TK_NES, { .pat = "!==" }, 3, NULL },
- { TK_ELLIP, { .pat = "..." }, 3, NULL },
- { TK_QLBRACK, { .pat = "?.[" }, 3, NULL },
- { TK_QLPAREN, { .pat = "?.(" }, 3, NULL },
- { TK_ASEXP, { .pat = "**=" }, 3, NULL },
- { TK_ASAND, { .pat = "&&=" }, 3, NULL },
- { TK_ASOR, { .pat = "||=" }, 3, NULL },
- { TK_ASNULLISH, { .pat = "\?\?=" }, 3, NULL },
- { TK_AND, { .pat = "&&" }, 2, NULL },
- { TK_ASADD, { .pat = "+=" }, 2, NULL },
- { TK_ASBAND, { .pat = "&=" }, 2, NULL },
- { TK_ASBOR, { .pat = "|=" }, 2, NULL },
- { TK_ASBXOR, { .pat = "^=" }, 2, NULL },
- //{ TK_ASDIV, { .pat = "/=" }, 2, NULL },
- { TK_ASMOD, { .pat = "%=" }, 2, NULL },
- { TK_ASMUL, { .pat = "*=" }, 2, NULL },
- { TK_ASSUB, { .pat = "-=" }, 2, NULL },
- { TK_EXP, { .pat = "**" }, 2, NULL },
- { TK_DEC, { .pat = "--" }, 2, NULL },
- { TK_INC, { .pat = "++" }, 2, NULL },
- { TK_EQ, { .pat = "==" }, 2, NULL },
- { TK_NE, { .pat = "!=" }, 2, NULL },
- { TK_LE, { .pat = "<=" }, 2, NULL },
- { TK_GE, { .pat = ">=" }, 2, NULL },
- { TK_LSHIFT, { .pat = "<<" }, 2, NULL },
- { TK_RSHIFT, { .pat = ">>" }, 2, NULL },
- { 0, { .pat = "//" }, 2, parse_comment },
- { 0, { .pat = "/*" }, 2, parse_comment },
- { TK_OR, { .pat = "||" }, 2, NULL },
- { TK_LEXP, { .pat = "{{" }, 2, NULL },
- { TK_REXP, { .pat = "}}" }, 2, NULL },
- { TK_LSTM, { .pat = "{%" }, 2, NULL },
- { TK_RSTM, { .pat = "%}" }, 2, NULL },
- { TK_ARROW, { .pat = "=>" }, 2, NULL },
- { TK_NULLISH, { .pat = "??" }, 2, NULL },
- { TK_QDOT, { .pat = "?." }, 2, NULL },
- { TK_PLACEH, { .pat = "${" }, 2, NULL },
- { TK_ADD, { .pat = "+" }, 1, NULL },
- { TK_ASSIGN, { .pat = "=" }, 1, NULL },
- { TK_BAND, { .pat = "&" }, 1, NULL },
- { TK_BOR, { .pat = "|" }, 1, NULL },
- { TK_LBRACK, { .pat = "[" }, 1, NULL },
- { TK_RBRACK, { .pat = "]" }, 1, NULL },
- { TK_BXOR, { .pat = "^" }, 1, NULL },
- { TK_LBRACE, { .pat = "{" }, 1, NULL },
- { TK_RBRACE, { .pat = "}" }, 1, NULL },
- { TK_COLON, { .pat = ":" }, 1, NULL },
- { TK_COMMA, { .pat = "," }, 1, NULL },
- { TK_COMPL, { .pat = "~" }, 1, NULL },
- //{ TK_DIV, { .pat = "/" }, 1, NULL },
- { TK_GT, { .pat = ">" }, 1, NULL },
- { TK_NOT, { .pat = "!" }, 1, NULL },
- { TK_LT, { .pat = "<" }, 1, NULL },
- { TK_MOD, { .pat = "%" }, 1, NULL },
- { TK_MUL, { .pat = "*" }, 1, NULL },
- { TK_LPAREN, { .pat = "(" }, 1, NULL },
- { TK_RPAREN, { .pat = ")" }, 1, NULL },
- { TK_QMARK, { .pat = "?" }, 1, NULL },
- { TK_SCOL, { .pat = ";" }, 1, NULL },
- { TK_SUB, { .pat = "-" }, 1, NULL },
- { TK_DOT, { .pat = "." }, 1, NULL },
- { TK_STRING, { .pat = "'" }, 1, parse_string },
- { TK_STRING, { .pat = "\"" }, 1, parse_string },
- { TK_REGEXP, { .pat = "/" }, 1, parse_regexp },
- { TK_LABEL, { .pat = "_" }, 1, parse_label },
- { TK_LABEL, { .pat = "az" }, 0, parse_label },
- { TK_LABEL, { .pat = "AZ" }, 0, parse_label },
- { TK_NUMBER, { .pat = "09" }, 0, parse_number },
-
- /* NB: this must be last for simple retrieval */
- { TK_TEMPLATE, { .pat = "`" }, 1, parse_string }
-};
-
static const struct keyword reserved_words[] = {
{ TK_ENDFUNC, "endfunction", 11 },
{ TK_CONTINUE, "continue", 8 },
@@ -174,119 +74,118 @@ static const struct keyword reserved_words[] = {
};
-/* length of the longest token in our lookup table */
-#define UC_LEX_MAX_TOKEN_LEN 3
+static int
+fill_buf(uc_lexer_t *lex) {
+ lex->rbuf = xrealloc(lex->rbuf, 128);
+ lex->rlen = fread(lex->rbuf, 1, 128, lex->source->fp);
+ lex->rpos = 0;
-static uc_token_t *
-emit_op(uc_lexer_t *lex, uint32_t pos, int type, uc_value_t *uv)
-{
- lex->curr.type = type;
- lex->curr.uv = uv;
- lex->curr.pos = pos;
+ if (!lex->rlen)
+ return EOF;
- return &lex->curr;
-}
+ lex->rpos++;
-static void lookbehind_append(uc_lexer_t *lex, const char *data, size_t len)
-{
- if (len) {
- lex->lookbehind = xrealloc(lex->lookbehind, lex->lookbehindlen + len);
- memcpy(lex->lookbehind + lex->lookbehindlen, data, len);
- lex->lookbehindlen += len;
- }
+ return (int)lex->rbuf[0];
}
-static void lookbehind_reset(uc_lexer_t *lex) {
- free(lex->lookbehind);
- lex->lookbehind = NULL;
- lex->lookbehindlen = 0;
-}
+static int
+update_line(uc_lexer_t *lex, int ch) {
+ if (ch == '\n' || ch == EOF)
+ uc_source_line_next(lex->source);
+ else
+ uc_source_line_update(lex->source, 1);
-static uc_token_t *
-lookbehind_to_text(uc_lexer_t *lex, uint32_t pos, int type, const char *strip_trailing_chars) {
- uc_token_t *rv = NULL;
+ lex->source->off++;
- if (lex->lookbehind) {
- if (strip_trailing_chars) {
- while (lex->lookbehindlen > 0 && strchr(strip_trailing_chars, lex->lookbehind[lex->lookbehindlen-1]))
- lex->lookbehindlen--;
- }
+ return ch;
+}
- rv = emit_op(lex, pos, type, ucv_string_new_length(lex->lookbehind, lex->lookbehindlen));
+static int
+lookahead_char(uc_lexer_t *lex) {
+ int c;
- lookbehind_reset(lex);
- }
+ if (lex->rpos < lex->rlen)
+ return (int)lex->rbuf[lex->rpos];
- return rv;
-}
+ c = fill_buf(lex);
+ lex->rpos = 0;
-static inline size_t
-buf_remaining(uc_lexer_t *lex) {
- return (lex->bufend - lex->bufstart);
+ return c;
}
-static inline bool
-_buf_startswith(uc_lexer_t *lex, const char *str, size_t len) {
- return (buf_remaining(lex) >= len && !strncmp(lex->bufstart, str, len));
-}
+static bool
+check_char(uc_lexer_t *lex, int ch) {
+ if (lookahead_char(lex) != ch)
+ return false;
-#define buf_startswith(s, str) _buf_startswith(s, str, sizeof(str) - 1)
+ lex->rpos++;
+ update_line(lex, ch);
-static void
-buf_consume(uc_lexer_t *lex, size_t len) {
- size_t i, linelen;
+ return true;
+}
- for (i = 0, linelen = 0; i < len; i++) {
- if (lex->bufstart[i] == '\n') {
- uc_source_line_update(lex->source, linelen);
- uc_source_line_next(lex->source);
+static int
+next_char(uc_lexer_t *lex) {
+ int ch = (lex->rpos < lex->rlen) ? (int)lex->rbuf[lex->rpos++] : fill_buf(lex);
- linelen = 0;
- }
- else {
- linelen++;
- }
- }
+ return update_line(lex, ch);
+}
+
+static uc_token_t *
+emit_op(uc_lexer_t *lex, ssize_t pos, int type, uc_value_t *uv)
+{
+ lex->curr.type = type;
+ lex->curr.uv = uv;
- if (linelen)
- uc_source_line_update(lex->source, linelen);
+ if (pos < 0)
+ lex->curr.pos = lex->source->off + pos;
+ else
+ lex->curr.pos = (size_t)pos;
- lex->bufstart += len;
- lex->source->off += len;
+ return &lex->curr;
}
static uc_token_t *
-parse_comment(uc_lexer_t *lex)
-{
- const struct token *tok = lex->tok;
- const char *ptr, *end;
- size_t elen;
+emit_buffer(uc_lexer_t *lex, ssize_t pos, int type, const char *strip_trailing_chars) {
+ uc_token_t *rv = NULL;
+
+ if (lex->buffer.count) {
+ if (strip_trailing_chars)
+ while (lex->buffer.count > 0 && strchr(strip_trailing_chars, *uc_vector_last(&lex->buffer)))
+ lex->buffer.count--;
+
+ rv = emit_op(lex, pos, type, ucv_string_new_length(uc_vector_first(&lex->buffer), lex->buffer.count));
- if (!strcmp(tok->u.pat, "//")) {
- end = "\n";
- elen = 1;
+ uc_vector_clear(&lex->buffer);
}
- else {
- end = "*/";
- elen = 2;
+ else if (type != TK_TEXT) {
+ rv = emit_op(lex, pos, type, ucv_string_new_length("", 0));
}
- for (ptr = lex->bufstart; ptr < lex->bufend - elen; ptr++) {
- if (!strncmp(ptr, end, elen)) {
- buf_consume(lex, (ptr - lex->bufstart) + elen);
+ return rv;
+}
- return UC_LEX_CONTINUE_PARSING;
- }
- }
- buf_consume(lex, ptr - lex->bufstart);
+static uc_token_t *
+parse_comment(uc_lexer_t *lex, int kind)
+{
+ int ch;
+
+ while (true) {
+ ch = next_char(lex);
- if (lex->eof) {
- lex->state = UC_LEX_EOF;
+ if (kind == '/' && (ch == '\n' || ch == EOF))
+ break;
+
+ if (kind == '*' && ch == '*' && check_char(lex, '/'))
+ break;
+
+ if (ch == EOF) {
+ lex->state = UC_LEX_EOF;
- if (elen == 2)
return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated comment"));
+ }
}
return NULL;
@@ -301,238 +200,157 @@ append_utf8(uc_lexer_t *lex, int code) {
rem = sizeof(ustr);
if (utf8enc(&up, &rem, code))
- lookbehind_append(lex, ustr, up - ustr);
+ for (up = ustr; rem < (int)sizeof(ustr); rem++)
+ uc_vector_push(&lex->buffer, *up++);
}
static uc_token_t *
-parse_string(uc_lexer_t *lex)
+parse_string(uc_lexer_t *lex, int kind)
{
- const struct token *tok = lex->tok;
- char q = tok->u.pat[0];
- char *ptr, *c;
- uc_token_t *rv;
- int code;
+ int code, ch, i;
+ unsigned type;
+ size_t off;
- if (!buf_remaining(lex))
- return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated string"));
+ if (kind == '`')
+ type = TK_TEMPLATE;
+ else if (kind == '/')
+ type = TK_REGEXP;
+ else
+ type = TK_STRING;
- for (ptr = lex->bufstart; ptr < lex->bufend; ptr++) {
- /* continuation of placeholder start */
- if (lex->is_placeholder) {
- if (*ptr == '{') {
- buf_consume(lex, 1);
- rv = lookbehind_to_text(lex, lex->lastoff, tok->type, NULL);
+ off = lex->source->off - 1;
- if (!rv)
- rv = emit_op(lex, lex->lastoff, tok->type, ucv_string_new_length("", 0));
+ for (ch = next_char(lex); ch != EOF; ch = next_char(lex)) {
+ switch (ch) {
+ /* placeholder */
+ case '$':
+ if (type == TK_TEMPLATE && check_char(lex, '{')) {
+ lex->state = UC_LEX_PLACEHOLDER_START;
- return rv;
+ return emit_buffer(lex, off, type, NULL);
}
- lex->is_placeholder = false;
- lookbehind_append(lex, "$", 1);
- }
+ uc_vector_push(&lex->buffer, '$');
+ break;
- /* continuation of escape sequence */
- if (lex->is_escape) {
- if (lex->esclen == 0) {
- /* non-unicode escape following a lead surrogate, emit replacement... */
- if (lex->lead_surrogate && *ptr != 'u') {
- append_utf8(lex, 0xFFFD);
- lex->lead_surrogate = 0;
- }
+ /* escape sequence */
+ case '\\':
+ /* unicode escape sequence */
+ if (type != TK_REGEXP && check_char(lex, 'u')) {
+ for (i = 0, code = 0; i < 4; i++) {
+ ch = next_char(lex);
- switch ((q == '/') ? 0 : *ptr) {
- case 'u':
- case 'x':
- lex->esc[lex->esclen++] = *ptr;
- break;
+ if (!isxdigit(ch))
+ return emit_op(lex, -1, TK_ERROR, ucv_string_new("Invalid escape sequence"));
- case '0':
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- lex->esc[lex->esclen++] = 'o';
- lex->esc[lex->esclen++] = *ptr;
- break;
+ code = code * 16 + hex(ch);
+ }
- default:
- lex->is_escape = false;
- c = strchr("a\ab\be\033f\fn\nr\rt\tv\v", *ptr);
+ /* is a leading surrogate value */
+ if ((code & 0xFC00) == 0xD800) {
+ /* found a subsequent leading surrogate, ignore and emit replacement char for previous one */
+ if (lex->lead_surrogate)
+ append_utf8(lex, 0xFFFD);
- if (c && *c >= 'a') {
- lookbehind_append(lex, c + 1, 1);
+ /* store surrogate value and advance to next escape sequence */
+ lex->lead_surrogate = code;
+ }
+
+ /* is a trailing surrogate value */
+ else if ((code & 0xFC00) == 0xDC00) {
+ /* found a trailing surrogate following a leading one, combine and encode */
+ if (lex->lead_surrogate) {
+ code = 0x10000 + ((lex->lead_surrogate & 0x3FF) << 10) + (code & 0x3FF);
+ lex->lead_surrogate = 0;
}
- else {
- /* regex mode => retain backslash */
- if (q == '/')
- lookbehind_append(lex, "\\", 1);
- lookbehind_append(lex, ptr, 1);
+ /* trailing surrogate not following a leading one, ignore and use replacement char */
+ else {
+ code = 0xFFFD;
}
- buf_consume(lex, (ptr + 1) - lex->bufstart);
+ append_utf8(lex, code);
+ }
- break;
+ /* is a normal codepoint */
+ else {
+ append_utf8(lex, code);
}
}
- else {
- switch (lex->esc[0]) {
- case 'u':
- if (lex->esclen < 5) {
- if (!isxdigit(*ptr))
- return emit_op(lex, lex->source->off + lex->esclen + 1, TK_ERROR, ucv_string_new("Invalid escape sequence"));
- lex->esc[lex->esclen++] = *ptr;
- }
-
- if (lex->esclen == 5) {
- code = hex(lex->esc[1]) * 16 * 16 * 16 +
- hex(lex->esc[2]) * 16 * 16 +
- hex(lex->esc[3]) * 16 +
- hex(lex->esc[4]);
-
- /* is a leading surrogate value */
- if ((code & 0xFC00) == 0xD800) {
- /* found a subsequent leading surrogate, ignore and emit replacement char for previous one */
- if (lex->lead_surrogate)
- append_utf8(lex, 0xFFFD);
-
- /* store surrogate value and advance to next escape sequence */
- lex->lead_surrogate = code;
- }
-
- /* is a trailing surrogate value */
- else if ((code & 0xFC00) == 0xDC00) {
- /* found a trailing surrogate following a leading one, combine and encode */
- if (lex->lead_surrogate) {
- code = 0x10000 + ((lex->lead_surrogate & 0x3FF) << 10) + (code & 0x3FF);
- lex->lead_surrogate = 0;
- }
-
- /* trailing surrogate not following a leading one, ignore and use replacement char */
- else {
- code = 0xFFFD;
- }
-
- append_utf8(lex, code);
- }
-
- /* is a normal codepoint */
- else {
- append_utf8(lex, code);
- }
-
- lex->esclen = 0;
- lex->is_escape = false;
- buf_consume(lex, (ptr + 1) - lex->bufstart);
- }
+ /* hex escape sequence */
+ else if (type != TK_REGEXP && check_char(lex, 'x')) {
+ for (i = 0, code = 0; i < 2; i++) {
+ ch = next_char(lex);
- break;
+ if (!isxdigit(ch))
+ return emit_op(lex, -1, TK_ERROR, ucv_string_new("Invalid escape sequence"));
- case 'x':
- if (lex->esclen < 3) {
- if (!isxdigit(*ptr))
- return emit_op(lex, lex->source->off + lex->esclen + 1, TK_ERROR, ucv_string_new("Invalid escape sequence"));
+ code = code * 16 + hex(ch);
+ }
- lex->esc[lex->esclen++] = *ptr;
- }
+ append_utf8(lex, code);
+ }
- if (lex->esclen == 3) {
- append_utf8(lex, hex(lex->esc[1]) * 16 + hex(lex->esc[2]));
+ /* octal or letter */
+ else {
+ /* try to parse octal sequence... */
+ for (i = 0, code = 0, ch = lookahead_char(lex);
+ kind != '/' && i < 3 && ch >= '0' && ch <= '7';
+ i++, next_char(lex), ch = lookahead_char(lex)) {
+ code = code * 8 + dec(ch);
+ }
- lex->esclen = 0;
- lex->is_escape = false;
- buf_consume(lex, (ptr + 1) - lex->bufstart);
- }
+ if (i) {
+ if (code > 255)
+ return emit_op(lex, -3, TK_ERROR, ucv_string_new("Invalid escape sequence"));
- break;
+ append_utf8(lex, code);
+ }
- case 'o':
- if (lex->esclen < 4) {
- /* found a non-octal char */
- if (*ptr < '0' || *ptr > '7') {
- /* pad sequence to three chars */
- switch (lex->esclen) {
- case 3:
- lex->esc[3] = lex->esc[2];
- lex->esc[2] = lex->esc[1];
- lex->esc[1] = '0';
- break;
-
- case 2:
- lex->esc[3] = lex->esc[1];
- lex->esc[2] = '0';
- lex->esc[1] = '0';
- break;
- }
-
- lex->esclen = 4;
- buf_consume(lex, ptr-- - lex->bufstart);
- }
-
- /* append */
- else {
- lex->esc[lex->esclen++] = *ptr;
- buf_consume(lex, (ptr + 1) - lex->bufstart);
- }
- }
+ /* ... no octal sequence, handle other escape */
+ else {
+ ch = next_char(lex);
- if (lex->esclen == 4) {
- code = dec(lex->esc[1]) * 8 * 8 +
- dec(lex->esc[2]) * 8 +
- dec(lex->esc[3]);
+ switch (ch) {
+ case 'a': uc_vector_push(&lex->buffer, '\a'); break;
+ case 'b': uc_vector_push(&lex->buffer, '\b'); break;
+ case 'e': uc_vector_push(&lex->buffer, '\033'); break;
+ case 'f': uc_vector_push(&lex->buffer, '\f'); break;
+ case 'n': uc_vector_push(&lex->buffer, '\n'); break;
+ case 'r': uc_vector_push(&lex->buffer, '\r'); break;
+ case 't': uc_vector_push(&lex->buffer, '\t'); break;
+ case 'v': uc_vector_push(&lex->buffer, '\v'); break;
- if (code > 255)
- return emit_op(lex, lex->source->off + lex->esclen + 1, TK_ERROR, ucv_string_new("Invalid escape sequence"));
+ case EOF:
+ return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unterminated string"));
- append_utf8(lex, code);
+ default:
+ /* regex mode => retain backslash */
+ if (type == TK_REGEXP)
+ uc_vector_push(&lex->buffer, '\\');
- lex->esclen = 0;
- lex->is_escape = false;
+ uc_vector_push(&lex->buffer, ch);
}
-
- break;
}
}
- }
-
- /* terminating char */
- else if (*ptr == q) {
- lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
- buf_consume(lex, (ptr + 1) - lex->bufstart);
-
- rv = lookbehind_to_text(lex, lex->lastoff, tok->type, NULL);
- if (!rv)
- rv = emit_op(lex, lex->lastoff, tok->type, ucv_string_new_length("", 0));
-
- return rv;
- }
+ break;
- /* escape sequence start */
- else if (*ptr == '\\') {
- lex->is_escape = true;
- lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
- buf_consume(lex, (ptr - lex->bufstart) + 1);
- }
+ /* other character */
+ default:
+ /* terminating delimiter */
+ if (ch == kind)
+ return emit_buffer(lex, off, type, NULL);
- /* potential placeholder start */
- else if (q == '`' && *ptr == '$') {
- lex->is_placeholder = true;
- lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
- buf_consume(lex, (ptr - lex->bufstart) + 1);
+ uc_vector_push(&lex->buffer, ch);
}
}
- lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
- buf_consume(lex, ptr - lex->bufstart);
+ // FIXME
+ lex->state = UC_LEX_EOF;
- return NULL;
+ return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated string"));
}
@@ -563,76 +381,31 @@ parse_regexp(uc_lexer_t *lex)
size_t len;
char *s;
- switch (lex->esc[0]) {
- case UC_LEX_PARSE_REGEX_INIT:
- if (lex->no_regexp) {
- if (buf_startswith(lex, "=")) {
- buf_consume(lex, 1);
-
- return emit_op(lex, lex->source->off, TK_ASDIV, NULL);
- }
-
- return emit_op(lex, lex->source->off, TK_DIV, NULL);
- }
-
- lex->esc[0] = UC_LEX_PARSE_REGEX_PATTERN;
- break;
-
- case UC_LEX_PARSE_REGEX_PATTERN:
- rv = parse_string(lex);
-
- if (rv && rv->type == TK_ERROR)
- return rv;
+ rv = parse_string(lex, '/');
- if (rv != NULL && rv != UC_LEX_CONTINUE_PARSING) {
- lex->lookbehind = (char *)rv;
- lex->esc[0] = UC_LEX_PARSE_REGEX_FLAGS;
- }
-
- break;
-
- case UC_LEX_PARSE_REGEX_FLAGS:
- rv = (uc_token_t *)lex->lookbehind;
-
- while (lex->bufstart < lex->bufend || lex->eof) {
- switch (lex->eof ? EOF : lex->bufstart[0]) {
- case 'g':
- buf_consume(lex, 1);
+ if (rv->type == TK_REGEXP) {
+ while (true) {
+ if (check_char(lex, 'g'))
is_reg_global = true;
- break;
-
- case 'i':
- buf_consume(lex, 1);
+ else if (check_char(lex, 'i'))
is_reg_icase = true;
- break;
-
- case 's':
- buf_consume(lex, 1);
+ else if (check_char(lex, 's'))
is_reg_newline = true;
+ else
break;
-
- default:
- lex->lookbehind = NULL;
-
- len = xasprintf(&s, "%c%*s",
- (is_reg_global << 0) | (is_reg_icase << 1) | (is_reg_newline << 2),
- ucv_string_length(rv->uv),
- ucv_string_get(rv->uv));
-
- ucv_free(rv->uv, false);
- rv->uv = ucv_string_new_length(s, len);
- free(s);
-
- rv->type = TK_REGEXP;
-
- return rv;
- }
}
- break;
+ len = xasprintf(&s, "%c%*s",
+ (is_reg_global << 0) | (is_reg_icase << 1) | (is_reg_newline << 2),
+ ucv_string_length(rv->uv),
+ ucv_string_get(rv->uv));
+
+ ucv_free(rv->uv, false);
+ rv->uv = ucv_string_new_length(s, len);
+ free(s);
}
- return NULL;
+ return rv;
}
@@ -647,37 +420,34 @@ parse_regexp(uc_lexer_t *lex)
*/
static uc_token_t *
-parse_label(uc_lexer_t *lex)
+parse_label(uc_lexer_t *lex, int ch)
{
- const struct token *tok = lex->tok;
const struct keyword *word;
- char *ptr;
- size_t i;
-
- if (!lex->lookbehind && tok->plen)
- lookbehind_append(lex, tok->u.pat, tok->plen);
+ size_t i, len;
- if (!buf_remaining(lex) || (lex->bufstart[0] != '_' && !isalnum(lex->bufstart[0]))) {
- if (lex->no_keyword == false) {
- for (i = 0, word = &reserved_words[0]; i < ARRAY_SIZE(reserved_words); i++, word = &reserved_words[i]) {
- if (lex->lookbehind && lex->lookbehindlen == word->plen && !strncmp(lex->lookbehind, word->pat, word->plen)) {
- lookbehind_reset(lex);
+ while (true) {
+ uc_vector_push(&lex->buffer, ch);
+ ch = lookahead_char(lex);
- return emit_op(lex, lex->source->off - word->plen, word->type, NULL);
- }
- }
- }
+ if (!isalnum(ch) && ch != '_')
+ break;
- return lookbehind_to_text(lex, lex->source->off - lex->lookbehindlen, TK_LABEL, NULL);
+ next_char(lex);
}
- for (ptr = lex->bufstart; ptr < lex->bufend && (*ptr == '_' || isalnum(*ptr)); ptr++)
- ;
+ len = lex->buffer.count;
- lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
- buf_consume(lex, ptr - lex->bufstart);
+ if (!lex->no_keyword) {
+ for (i = 0, word = &reserved_words[0]; i < ARRAY_SIZE(reserved_words); i++, word = &reserved_words[i]) {
+ if (lex->buffer.count == word->plen && !strncmp(uc_vector_first(&lex->buffer), word->pat, word->plen)) {
+ uc_vector_clear(&lex->buffer);
- return NULL;
+ return emit_op(lex, -len, word->type, NULL);
+ }
+ }
+ }
+
+ return emit_buffer(lex, -len, TK_LABEL, NULL);
}
@@ -694,7 +464,7 @@ parse_label(uc_lexer_t *lex)
static inline bool
is_numeric_char(uc_lexer_t *lex, char c)
{
- char prev = lex->lookbehindlen ? lex->lookbehind[lex->lookbehindlen-1] : 0;
+ char prev = lex->buffer.count ? *uc_vector_last(&lex->buffer) : 0;
switch (c|32) {
case '.':
@@ -731,380 +501,507 @@ is_numeric_char(uc_lexer_t *lex, char c)
}
static uc_token_t *
-parse_number(uc_lexer_t *lex)
+parse_number(uc_lexer_t *lex, int ch)
{
- uc_token_t *rv = NULL;
uc_value_t *nv = NULL;
- const char *ptr;
+ size_t len;
char *e;
- if (!buf_remaining(lex) || !is_numeric_char(lex, lex->bufstart[0])) {
- lookbehind_append(lex, "\0", 1);
-
- nv = uc_number_parse_octal(lex->lookbehind, &e);
+ while (true) {
+ uc_vector_push(&lex->buffer, ch);
+ ch = lookahead_char(lex);
- switch (ucv_type(nv)) {
- case UC_DOUBLE:
- rv = emit_op(lex, lex->source->off - (e - lex->lookbehind), TK_DOUBLE, nv);
+ if (!is_numeric_char(lex, ch))
break;
- case UC_INTEGER:
- rv = emit_op(lex, lex->source->off - (e - lex->lookbehind), TK_NUMBER, nv);
- break;
+ next_char(lex);
+ }
- default:
- rv = emit_op(lex, lex->source->off - (lex->lookbehindlen - (e - lex->lookbehind) - 1), TK_ERROR, ucv_string_new("Invalid number literal"));
- }
+ len = lex->buffer.count;
- lookbehind_reset(lex);
+ uc_vector_push(&lex->buffer, '\0');
- return rv;
- }
+ nv = uc_number_parse_octal(uc_vector_first(&lex->buffer), &e);
- for (ptr = lex->bufstart; ptr < lex->bufend && is_numeric_char(lex, *ptr); ptr++)
- ;
+ uc_vector_clear(&lex->buffer);
- lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
- buf_consume(lex, ptr - lex->bufstart);
+ switch (ucv_type(nv)) {
+ case UC_DOUBLE:
+ return emit_op(lex, -len, TK_DOUBLE, nv);
- return NULL;
+ case UC_INTEGER:
+ return emit_op(lex, -len, TK_NUMBER, nv);
+
+ default:
+ return emit_op(lex, -len, TK_ERROR, ucv_string_new("Invalid number literal"));
+ }
}
static uc_token_t *
-lex_step(uc_lexer_t *lex, FILE *fp)
+lex_find_token(uc_lexer_t *lex)
{
- uint32_t masks[] = { 0, le32toh(0x000000ff), le32toh(0x0000ffff), le32toh(0x00ffffff), le32toh(0xffffffff) };
- union { uint32_t n; char str[4]; } search;
- const struct token *tok;
- size_t rlen, rem, *nest;
- char *ptr, c;
- uc_token_t *rv;
- size_t i;
+ bool tpl = !(lex->config && lex->config->raw_mode);
+ int ch = next_char(lex);
+
+ while (isspace(ch))
+ ch = next_char(lex);
+
+ switch (ch) {
+ case '~':
+ return emit_op(lex, -1, TK_COMPL, NULL);
- /* only less than UC_LEX_MAX_TOKEN_LEN unread buffer chars remaining,
- * move the remaining bytes to the beginning and read more data */
- if (buf_remaining(lex) < UC_LEX_MAX_TOKEN_LEN) {
- if (!lex->buf) {
- lex->buflen = 128;
- lex->buf = xalloc(lex->buflen);
+ case '}':
+ if (tpl && check_char(lex, '}'))
+ return emit_op(lex, -2, TK_REXP, NULL);
+
+ return emit_op(lex, -1, TK_RBRACE, NULL);
+
+ case '|':
+ if (check_char(lex, '|')) {
+ if (check_char(lex, '='))
+ return emit_op(lex, -3, TK_ASOR, NULL);
+
+ return emit_op(lex, -2, TK_OR, NULL);
}
- rem = lex->bufend - lex->bufstart;
+ if (check_char(lex, '='))
+ return emit_op(lex, -2, TK_ASBOR, NULL);
- if (rem)
- memcpy(lex->buf, lex->bufstart, rem);
+ return emit_op(lex, -1, TK_BOR, NULL);
- rlen = fread(lex->buf + rem, 1, lex->buflen - rem, fp);
+ case '{':
+ if (tpl && check_char(lex, '{'))
+ return emit_op(lex, -2, TK_LEXP, NULL);
- lex->bufstart = lex->buf;
- lex->bufend = lex->buf + rlen + rem;
+ if (tpl && check_char(lex, '%'))
+ return emit_op(lex, -2, TK_LSTM, NULL);
- if (rlen == 0 && (ferror(fp) || feof(fp)))
- lex->eof = 1;
- }
+ return emit_op(lex, -1, TK_LBRACE, NULL);
- switch (lex->state) {
- case UC_LEX_IDENTIFY_BLOCK:
- /* previous block had strip trailing whitespace flag, skip leading whitespace */
- if (lex->modifier == MINUS) {
- while (buf_remaining(lex) && isspace(lex->bufstart[0]))
- buf_consume(lex, 1);
+ case '^':
+ if (check_char(lex, '='))
+ return emit_op(lex, -2, TK_ASBXOR, NULL);
- lex->modifier = UNSPEC;
+ return emit_op(lex, -1, TK_BXOR, NULL);
+
+ case '[':
+ return emit_op(lex, -1, TK_LBRACK, NULL);
+
+ case ']':
+ return emit_op(lex, -1, TK_RBRACK, NULL);
+
+ case '?':
+ if (check_char(lex, '?')) {
+ if (check_char(lex, '='))
+ return emit_op(lex, -3, TK_ASNULLISH, NULL);
+
+ return emit_op(lex, -2, TK_NULLISH, NULL);
}
- /* previous block was a statement block and trim_blocks is enabld, skip leading newline */
- else if (lex->modifier == NEWLINE) {
- if (buf_startswith(lex, "\n"))
- buf_consume(lex, 1);
+ if (check_char(lex, '.')) {
+ if (check_char(lex, '['))
+ return emit_op(lex, -3, TK_QLBRACK, NULL);
+
+ if (check_char(lex, '('))
+ return emit_op(lex, -3, TK_QLPAREN, NULL);
- lex->modifier = UNSPEC;
+ return emit_op(lex, -2, TK_QDOT, NULL);
}
- /* scan forward through buffer to identify start token */
- for (ptr = lex->bufstart; ptr < lex->bufend - strlen("{#"); ptr++) {
- /* found start of comment block */
- if (!strncmp(ptr, "{#", 2)) {
- lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
- buf_consume(lex, (ptr + 2) - lex->bufstart);
- lex->lastoff = lex->source->off - 2;
- lex->state = UC_LEX_BLOCK_COMMENT_START;
+ return emit_op(lex, lex->source->off, TK_QMARK, NULL);
- return NULL;
- }
+ case '>':
+ if (check_char(lex, '>')) {
+ if (check_char(lex, '='))
+ return emit_op(lex, -3, TK_ASRIGHT, NULL);
- /* found start of expression block */
- else if (!strncmp(ptr, "{{", 2)) {
- lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
- buf_consume(lex, (ptr + 2) - lex->bufstart);
- lex->lastoff = lex->source->off - 2;
- lex->state = UC_LEX_BLOCK_EXPRESSION_START;
+ return emit_op(lex, -2, TK_RSHIFT, NULL);
+ }
- return NULL;
- }
+ if (check_char(lex, '='))
+ return emit_op(lex, -2, TK_GE, NULL);
- /* found start of statement block */
- else if (!strncmp(ptr, "{%", 2)) {
- lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
- buf_consume(lex, (ptr + 2) - lex->bufstart);
- lex->lastoff = lex->source->off - 2;
- lex->state = UC_LEX_BLOCK_STATEMENT_START;
+ return emit_op(lex, -1, TK_GT, NULL);
- return NULL;
- }
+ case '=':
+ if (check_char(lex, '=')) {
+ if (check_char(lex, '='))
+ return emit_op(lex, -3, TK_EQS, NULL);
+
+ return emit_op(lex, -2, TK_EQ, NULL);
}
- /* we're at eof */
- if (lex->eof) {
- lookbehind_append(lex, ptr, lex->bufend - ptr);
- lex->state = UC_LEX_EOF;
+ if (check_char(lex, '>'))
+ return emit_op(lex, -2, TK_ARROW, NULL);
+
+ return emit_op(lex, -1, TK_ASSIGN, NULL);
+
+ case '<':
+ if (check_char(lex, '<')) {
+ if (check_char(lex, '='))
+ return emit_op(lex, -3, TK_ASLEFT, NULL);
- return lookbehind_to_text(lex, lex->lastoff, TK_TEXT, NULL);
+ return emit_op(lex, -2, TK_LSHIFT, NULL);
}
- lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
- buf_consume(lex, ptr - lex->bufstart);
- break;
+ if (check_char(lex, '='))
+ return emit_op(lex, -2, TK_LE, NULL);
+
+ return emit_op(lex, -1, TK_LT, NULL);
+
+ case ';':
+ return emit_op(lex, -1, TK_SCOL, NULL);
+ case ':':
+ return emit_op(lex, -1, TK_COLON, NULL);
- case UC_LEX_BLOCK_COMMENT_START:
- case UC_LEX_BLOCK_EXPRESSION_START:
- case UC_LEX_BLOCK_STATEMENT_START:
- rv = NULL;
- lex->modifier = UNSPEC;
+ case '/':
+ ch = lookahead_char(lex);
+ lex->lastoff = lex->source->off - 1;
- /* strip whitespace before block */
- if (buf_startswith(lex, "-")) {
- rv = lookbehind_to_text(lex, lex->source->off, TK_TEXT, " \n\t\v\f\r");
- buf_consume(lex, 1);
+ if (ch == '/' || ch == '*')
+ return parse_comment(lex, ch);
+
+ if (lex->no_regexp) {
+ if (check_char(lex, '='))
+ return emit_op(lex, -2, TK_ASDIV, NULL);
+
+ return emit_op(lex, -1, TK_DIV, NULL);
}
- /* disable lstrip flag (only valid for statement blocks) */
- else if (lex->state == UC_LEX_BLOCK_STATEMENT_START) {
- /* disable lstrip flag */
- if (buf_startswith(lex, "+")) {
- rv = lookbehind_to_text(lex, lex->source->off, TK_TEXT, NULL);
- buf_consume(lex, 1);
- }
+ return parse_regexp(lex);
- /* put out text leading up to the opening tag and potentially
- * strip trailing white space from it depending on the global
- * block lstrip setting */
- else {
- rv = lookbehind_to_text(lex, lex->source->off, TK_TEXT,
- (lex->config && lex->config->lstrip_blocks) ? " \t\v\f\r" : NULL);
+ case '.':
+ if (check_char(lex, '.')) {
+ if (check_char(lex, '.'))
+ return emit_op(lex, -3, TK_ELLIP, NULL);
+
+ /* The sequence ".." cannot be a valid token */
+ return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unexpected character"));
+ }
+
+ return emit_op(lex, -1, TK_DOT, NULL);
+
+ case '-':
+ if (tpl && check_char(lex, '}')) {
+ if (check_char(lex, '}')) {
+ lex->modifier = MINUS;
+
+ return emit_op(lex, -3, TK_REXP, NULL);
}
+
+ /* The sequence "-}" cannot be a valid token */
+ return emit_op(lex, -1, TK_ERROR, ucv_string_new("Unexpected character"));
}
- else {
- rv = lookbehind_to_text(lex, lex->source->off, TK_TEXT, NULL);
+
+ if (tpl && check_char(lex, '%')) {
+ if (check_char(lex, '}')) {
+ lex->modifier = MINUS;
+
+ return emit_op(lex, -3, TK_RSTM, NULL);
+ }
+
+ /* The sequence "-%" cannot be a valid token */
+ return emit_op(lex, -1, TK_ERROR, ucv_string_new("Unexpected character"));
}
- switch (lex->state) {
- case UC_LEX_BLOCK_COMMENT_START:
- lex->state = UC_LEX_BLOCK_COMMENT;
- lex->block = COMMENT;
- break;
+ if (check_char(lex, '='))
+ return emit_op(lex, -2, TK_ASSUB, NULL);
- case UC_LEX_BLOCK_STATEMENT_START:
- lex->state = UC_LEX_IDENTIFY_TOKEN;
- lex->block = STATEMENTS;
- break;
+ if (check_char(lex, '-'))
+ return emit_op(lex, -2, TK_DEC, NULL);
- case UC_LEX_BLOCK_EXPRESSION_START:
- lex->state = UC_LEX_BLOCK_EXPRESSION_EMIT_TAG;
- break;
+ return emit_op(lex, -1, TK_SUB, NULL);
- default:
- break;
+ case ',':
+ return emit_op(lex, -1, TK_COMMA, NULL);
+
+ case '+':
+ if (check_char(lex, '='))
+ return emit_op(lex, -2, TK_ASADD, NULL);
+
+ if (check_char(lex, '+'))
+ return emit_op(lex, -2, TK_INC, NULL);
+
+ return emit_op(lex, -1, TK_ADD, NULL);
+
+ case '*':
+ if (check_char(lex, '*')) {
+ if (check_char(lex, '='))
+ return emit_op(lex, -3, TK_ASEXP, NULL);
+
+ return emit_op(lex, -2, TK_EXP, NULL);
}
- return rv;
+ if (check_char(lex, '='))
+ return emit_op(lex, -2, TK_ASMUL, NULL);
+ return emit_op(lex, -1, TK_MUL, NULL);
- case UC_LEX_BLOCK_COMMENT:
- /* scan forward through buffer to identify end token */
- while (lex->bufstart < lex->bufend - 2) {
- if (buf_startswith(lex, "-#}")) {
- lex->state = UC_LEX_IDENTIFY_BLOCK;
- lex->modifier = MINUS;
- buf_consume(lex, 3);
- lex->lastoff = lex->source->off;
- break;
- }
- else if (buf_startswith(lex, "#}")) {
- lex->state = UC_LEX_IDENTIFY_BLOCK;
- buf_consume(lex, 2);
- lex->lastoff = lex->source->off;
- break;
- }
+ case '(':
+ return emit_op(lex, -1, TK_LPAREN, NULL);
+
+ case ')':
+ return emit_op(lex, -1, TK_RPAREN, NULL);
- buf_consume(lex, 1);
+ case '\'':
+ case '"':
+ case '`':
+ lex->lastoff = lex->source->off - 1;
+
+ return parse_string(lex, ch);
+
+ case '&':
+ if (check_char(lex, '&')) {
+ if (check_char(lex, '='))
+ return emit_op(lex, -3, TK_ASAND, NULL);
+
+ return emit_op(lex, -2, TK_AND, NULL);
}
- /* we're at eof */
- if (lex->eof) {
- lex->state = UC_LEX_EOF;
+ if (check_char(lex, '='))
+ return emit_op(lex, -2, TK_ASBAND, NULL);
+
+ return emit_op(lex, -1, TK_BAND, NULL);
- buf_consume(lex, lex->bufend - lex->bufstart);
+ case '%':
+ if (tpl && check_char(lex, '}'))
+ return emit_op(lex, -2, TK_RSTM, NULL);
- return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated template block"));
+ if (check_char(lex, '='))
+ return emit_op(lex, -2, TK_ASMOD, NULL);
+
+ return emit_op(lex, -1, TK_MOD, NULL);
+
+ case '!':
+ if (check_char(lex, '=')) {
+ if (check_char(lex, '='))
+ return emit_op(lex, -3, TK_NES, NULL);
+
+ return emit_op(lex, -2, TK_NE, NULL);
}
- break;
+ return emit_op(lex, -1, TK_NOT, NULL);
+ case EOF:
+ return emit_op(lex, -1, TK_EOF, NULL);
- case UC_LEX_BLOCK_EXPRESSION_EMIT_TAG:
- lex->state = UC_LEX_IDENTIFY_TOKEN;
- lex->block = EXPRESSION;
+ default:
+ if (isalpha(ch) || ch == '_')
+ return parse_label(lex, ch);
- return emit_op(lex, lex->source->off, TK_LEXP, NULL);
+ if (isdigit(ch))
+ return parse_number(lex, ch);
+ return emit_op(lex, -1, TK_ERROR, ucv_string_new("Unexpected character"));
+ }
+}
- case UC_LEX_IDENTIFY_TOKEN:
- /* skip leading whitespace */
- for (i = 0; i < buf_remaining(lex) && isspace(lex->bufstart[i]); i++)
- ;
+static uc_token_t *
+lex_step(uc_lexer_t *lex)
+{
+ const char *strip = NULL;
+ uc_token_t *tok;
+ size_t *nest;
+ int ch;
- buf_consume(lex, i);
+ while (lex->state != UC_LEX_EOF) {
+ switch (lex->state) {
+ case UC_LEX_IDENTIFY_BLOCK:
+ ch = next_char(lex);
- if (i > 0 && buf_remaining(lex) < UC_LEX_MAX_TOKEN_LEN)
- return NULL;
+ /* previous block had the strip-trailing-whitespace flag set, skip leading whitespace */
+ if (lex->modifier == MINUS) {
+ while (isspace(ch))
+ ch = next_char(lex);
- for (i = 0; i < sizeof(search.str); i++)
- search.str[i] = (i < buf_remaining(lex)) ? lex->bufstart[i] : 0;
+ lex->modifier = UNSPEC;
+ }
- for (i = 0, tok = tokens; i < ARRAY_SIZE(tokens); tok = &tokens[++i]) {
- /* remaining buffer data is shorter than token, skip */
- if (tok->plen > buf_remaining(lex))
- continue;
+ /* previous block was a statement block and trim_blocks is enabled, skip leading newline */
+ else if (lex->modifier == NEWLINE) {
+ if (ch == '\n')
+ ch = next_char(lex);
- c = buf_remaining(lex) ? lex->bufstart[0] : 0;
+ lex->modifier = UNSPEC;
+ }
- if (tok->plen ? ((search.n & masks[tok->plen]) == tok->u.patn)
- : (c >= tok->u.pat[0] && c <= tok->u.pat[1])) {
- lex->lastoff = lex->source->off;
+ /* scan forward through buffer to identify block start token */
+ while (ch != EOF) {
+ if (ch == '{') {
+ ch = next_char(lex);
- /* token has a parse method, switch state */
- if (tok->parse) {
- lex->tok = tok;
- lex->state = UC_LEX_PARSE_TOKEN;
+ switch (ch) {
+ /* found start of comment block */
+ case '#':
+ lex->state = UC_LEX_BLOCK_COMMENT;
+ lex->block = COMMENT;
- buf_consume(lex, tok->plen);
+ if (check_char(lex, '-'))
+ strip = " \n\t\v\f\r";
- return NULL;
- }
+ break;
- /* in raw code mode, ignore template tag tokens */
- if (lex->config && lex->config->raw_mode &&
- (tok->type == TK_LSTM || tok->type == TK_RSTM ||
- tok->type == TK_LEXP || tok->type == TK_REXP)) {
- continue;
- }
+ /* found start of expression block */
+ case '{':
+ lex->state = UC_LEX_BLOCK_EXPRESSION_EMIT_TAG;
- /* disallow nesting blocks */
- if (tok->type == TK_LSTM || tok->type == TK_LEXP) {
- buf_consume(lex, tok->plen);
+ if (check_char(lex, '-'))
+ strip = " \n\t\v\f\r";
- return emit_op(lex, lex->source->off - tok->plen, TK_ERROR, ucv_string_new("Template blocks may not be nested"));
- }
+ break;
- /* found end of block */
- else if ((lex->block == STATEMENTS && tok->type == TK_RSTM) ||
- (lex->block == EXPRESSION && tok->type == TK_REXP)) {
- /* strip whitespace after block */
- if (tok->u.pat[0] == '-')
- lex->modifier = MINUS;
+ /* found start of statement block */
+ case '%':
+ lex->state = UC_LEX_IDENTIFY_TOKEN;
+ lex->block = STATEMENTS;
- /* strip newline after statement block */
- else if (lex->block == STATEMENTS &&
- lex->config && lex->config->trim_blocks)
- lex->modifier = NEWLINE;
+ if (check_char(lex, '-'))
+ strip = " \n\t\v\f\r";
+ else if (check_char(lex, '+'))
+ strip = NULL;
+ else if (lex->config && lex->config->lstrip_blocks)
+ strip = " \t\v\f\r";
- lex->state = UC_LEX_IDENTIFY_BLOCK;
- lex->block = NONE;
- }
+ break;
+
+ default:
+ /* not a start tag, remember char and move on */
+ uc_vector_push(&lex->buffer, '{');
+ continue;
+ }
- /* track opening braces */
- else if (tok->type == TK_LBRACE && lex->templates.count > 0) {
- nest = uc_vector_last(&lex->templates);
- (*nest)++;
+ break;
}
- /* check end of placeholder expression */
- else if (tok->type == TK_RBRACE && lex->templates.count > 0) {
- nest = uc_vector_last(&lex->templates);
+ uc_vector_push(&lex->buffer, ch);
+ ch = next_char(lex);
+ }
- if (*nest == 0) {
- lex->templates.count--;
- lex->state = UC_LEX_PARSE_TOKEN;
- lex->tok = &tokens[ARRAY_SIZE(tokens) - 1]; /* NB: TK_TEMPLATE token spec */
- }
- else {
- (*nest)--;
- }
+ if (ch == EOF)
+ lex->state = UC_LEX_EOF;
+
+ /* push out leading text */
+ tok = emit_buffer(lex, lex->lastoff, TK_TEXT, strip);
+ lex->lastoff = lex->source->off - 2;
+
+ if (!tok)
+ continue;
+
+ return tok;
+
+
+ case UC_LEX_BLOCK_COMMENT:
+ ch = next_char(lex);
+
+ /* scan forward through buffer to identify end token */
+ while (ch != EOF) {
+ if (ch == '-' && check_char(lex, '#') && check_char(lex, '}')) {
+ lex->modifier = MINUS;
+ break;
}
- /* do not report statement tags to the parser */
- if (tok->type != 0 && tok->type != TK_LSTM)
- rv = emit_op(lex, lex->source->off,
- (tok->type == TK_RSTM) ? TK_SCOL : tok->type, NULL);
- else
- rv = NULL;
+ if (ch == '#' && check_char(lex, '}'))
+ break;
+
+ ch = next_char(lex);
+ }
- buf_consume(lex, tok->plen);
+ if (ch == EOF) {
+ lex->state = UC_LEX_EOF;
- return rv;
+ return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated template block"));
}
- }
- /* no possible return beyond this point can advance,
- mark lex state as eof */
- lex->state = UC_LEX_EOF;
+ lex->lastoff = lex->source->off;
+ lex->state = UC_LEX_IDENTIFY_BLOCK;
- /* no token matched and we do have remaining data, junk */
- if (buf_remaining(lex))
- return emit_op(lex, lex->source->off, TK_ERROR, ucv_string_new("Unexpected character"));
+ continue;
- /* we're at eof, allow unclosed statement blocks */
- if (lex->block == STATEMENTS)
- return NULL;
- /* premature EOF */
- return emit_op(lex, lex->source->off, TK_ERROR, ucv_string_new("Unterminated template block"));
+ case UC_LEX_BLOCK_EXPRESSION_EMIT_TAG:
+ lex->state = UC_LEX_IDENTIFY_TOKEN;
+ lex->block = EXPRESSION;
+ return emit_op(lex, lex->source->off, TK_LEXP, NULL);
- case UC_LEX_PARSE_TOKEN:
- tok = lex->tok;
- rv = tok->parse(lex);
- if (rv) {
- memset(lex->esc, 0, sizeof(lex->esc));
- lex->state = lex->is_placeholder ? UC_LEX_PLACEHOLDER : UC_LEX_IDENTIFY_TOKEN;
- lex->is_placeholder = false;
- lex->tok = NULL;
+ case UC_LEX_IDENTIFY_TOKEN:
+ do { tok = lex_find_token(lex); } while (tok == NULL);
- if (rv == UC_LEX_CONTINUE_PARSING)
- rv = NULL;
+ /* disallow nesting blocks */
+ if (tok->type == TK_LSTM || tok->type == TK_LEXP)
+ return emit_op(lex, -2, TK_ERROR, ucv_string_new("Template blocks may not be nested"));
- return rv;
- }
+ /* found end of statement block */
+ if (lex->block == STATEMENTS && tok->type == TK_RSTM) {
+ /* strip newline after statement block? */
+ if (lex->modifier == UNSPEC && lex->config && lex->config->trim_blocks)
+ lex->modifier = NEWLINE;
- break;
+ lex->lastoff = lex->source->off;
+ lex->state = UC_LEX_IDENTIFY_BLOCK;
+ lex->block = NONE;
+ tok = emit_op(lex, -2, TK_SCOL, NULL);
+ }
- case UC_LEX_PLACEHOLDER:
- lex->state = UC_LEX_IDENTIFY_TOKEN;
+ /* found end of expression block */
+ else if (lex->block == EXPRESSION && tok->type == TK_REXP) {
+ lex->lastoff = lex->source->off;
+ lex->state = UC_LEX_IDENTIFY_BLOCK;
+ lex->block = NONE;
+ }
+
+ /* track opening braces */
+ else if (tok->type == TK_LBRACE && lex->templates.count > 0) {
+ nest = uc_vector_last(&lex->templates);
+ (*nest)++;
+ }
+
+ /* check end of placeholder expression */
+ else if (tok->type == TK_RBRACE && lex->templates.count > 0) {
+ nest = uc_vector_last(&lex->templates);
+
+ if (*nest == 0) {
+ lex->templates.count--;
+ lex->state = UC_LEX_PLACEHOLDER_END;
+ }
+ else {
+ (*nest)--;
+ }
+ }
+
+ /* premature EOF? */
+ else if (tok->type == TK_EOF && lex->block != STATEMENTS) {
+ lex->state = UC_LEX_EOF;
+
+ return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unterminated template block"));
+ }
- uc_vector_push(&lex->templates, 0);
+ return tok;
- return emit_op(lex, lex->source->off, TK_PLACEH, NULL);
+ case UC_LEX_PLACEHOLDER_START:
+ lex->state = UC_LEX_IDENTIFY_TOKEN;
+
+ uc_vector_push(&lex->templates, 0);
+
+ return emit_op(lex, -2, TK_PLACEH, NULL);
- case UC_LEX_EOF:
- break;
+
+ case UC_LEX_PLACEHOLDER_END:
+ lex->state = UC_LEX_IDENTIFY_TOKEN;
+
+ return parse_string(lex, '`');
+
+
+ case UC_LEX_EOF:
+ break;
+ }
}
- return NULL;
+ return emit_op(lex, lex->source->off, TK_EOF, NULL);
}
void
@@ -1115,24 +1012,15 @@ uc_lexer_init(uc_lexer_t *lex, uc_parse_config_t *config, uc_source_t *source)
lex->config = config;
lex->source = uc_source_get(source);
- lex->eof = 0;
- lex->is_escape = 0;
-
lex->block = NONE;
lex->modifier = UNSPEC;
- lex->buflen = 0;
- lex->buf = NULL;
- lex->bufstart = NULL;
- lex->bufend = NULL;
-
- lex->lookbehindlen = 0;
- lex->lookbehind = NULL;
+ lex->rlen = 0;
+ lex->rpos = 0;
+ lex->rbuf = NULL;
- lex->tok = NULL;
-
- lex->esclen = 0;
- memset(lex->esc, 0, sizeof(lex->esc));
+ lex->buffer.count = 0;
+ lex->buffer.entries = NULL;
lex->lead_surrogate = 0;
@@ -1150,11 +1038,12 @@ uc_lexer_init(uc_lexer_t *lex, uc_parse_config_t *config, uc_source_t *source)
void
uc_lexer_free(uc_lexer_t *lex)
{
+ uc_vector_clear(&lex->buffer);
uc_vector_clear(&lex->templates);
+
uc_source_put(lex->source);
- free(lex->lookbehind);
- free(lex->buf);
+ free(lex->rbuf);
}
uc_token_t *
@@ -1162,47 +1051,94 @@ uc_lexer_next_token(uc_lexer_t *lex)
{
uc_token_t *rv = NULL;
- while (lex->state != UC_LEX_EOF) {
- rv = lex_step(lex, lex->source->fp);
-
- if (rv != NULL)
- break;
- }
-
- if (rv) {
- lex->no_keyword = false;
- lex->no_regexp = false;
+ rv = lex_step(lex);
- return rv;
- }
+ lex->no_keyword = false;
+ lex->no_regexp = false;
- return emit_op(lex, lex->source->off, TK_EOF, NULL);
+ return rv;
}
const char *
uc_tokenname(unsigned type)
{
static char buf[sizeof("'endfunction'")];
- size_t i;
-
- switch (type) {
- case 0: return "End of file";
- case TK_TEMPLATE: return "Template";
- case TK_STRING: return "String";
- case TK_LABEL: return "Label";
- case TK_NUMBER: return "Number";
- case TK_DOUBLE: return "Double";
- case TK_REGEXP: return "Regexp";
- }
-
- for (i = 0; i < ARRAY_SIZE(tokens); i++) {
- if (tokens[i].type != type)
- continue;
+ const char *tokennames[] = {
+ [TK_LEXP] = "'{{'",
+ [TK_REXP] = "'}}'",
+ [TK_LSTM] = "'{%'",
+ [TK_RSTM] = "'%}'",
+ [TK_COMMA] = "','",
+ [TK_ASSIGN] = "'='",
+ [TK_ASADD] = "'+='",
+ [TK_ASSUB] = "'-='",
+ [TK_ASMUL] = "'*='",
+ [TK_ASDIV] = "'/='",
+ [TK_ASMOD] = "'%='",
+ [TK_ASLEFT] = "'<<='",
+ [TK_ASRIGHT] = "'>>='",
+ [TK_ASBAND] = "'&='",
+ [TK_ASBXOR] = "'^='",
+ [TK_ASBOR] = "'|='",
+ [TK_QMARK] = "'?'",
+ [TK_COLON] = "':'",
+ [TK_OR] = "'||'",
+ [TK_AND] = "'&&'",
+ [TK_BOR] = "'|'",
+ [TK_BXOR] = "'^'",
+ [TK_BAND] = "'&'",
+ [TK_EQS] = "'==='",
+ [TK_NES] = "'!=='",
+ [TK_EQ] = "'=='",
+ [TK_NE] = "'!='",
+ [TK_LT] = "'<'",
+ [TK_LE] = "'<='",
+ [TK_GT] = "'>'",
+ [TK_GE] = "'>='",
+ [TK_LSHIFT] = "'<<'",
+ [TK_RSHIFT] = "'>>'",
+ [TK_ADD] = "'+'",
+ [TK_SUB] = "'-'",
+ [TK_MUL] = "'*'",
+ [TK_DIV] = "'/'",
+ [TK_MOD] = "'%'",
+ [TK_EXP] = "'**'",
+ [TK_NOT] = "'!'",
+ [TK_COMPL] = "'~'",
+ [TK_INC] = "'++'",
+ [TK_DEC] = "'--'",
+ [TK_DOT] = "'.'",
+ [TK_LBRACK] = "'['",
+ [TK_RBRACK] = "']'",
+ [TK_LPAREN] = "'('",
+ [TK_RPAREN] = "')'",
+ [TK_LBRACE] = "'{'",
+ [TK_RBRACE] = "'}'",
+ [TK_SCOL] = "';'",
+ [TK_ELLIP] = "'...'",
+ [TK_ARROW] = "'=>'",
+ [TK_QLBRACK] = "'?.['",
+ [TK_QLPAREN] = "'?.('",
+ [TK_QDOT] = "'?.'",
+ [TK_ASEXP] = "'**='",
+ [TK_ASAND] = "'&&='",
+ [TK_ASOR] = "'||='",
+ [TK_ASNULLISH] = "'\?\?='",
+ [TK_NULLISH] = "'\?\?'",
+ [TK_PLACEH] = "'${'",
+
+ [TK_TEXT] = "Text",
+ [TK_LABEL] = "Label",
+ [TK_NUMBER] = "Number",
+ [TK_DOUBLE] = "Double",
+ [TK_STRING] = "String",
+ [TK_REGEXP] = "Regexp",
+ [TK_TEMPLATE] = "Template",
+ [TK_ERROR] = "Error",
+ [TK_EOF] = "End of file",
+ };
- snprintf(buf, sizeof(buf), "'%s'", tokens[i].u.pat);
-
- return buf;
- }
+ size_t i;
for (i = 0; i < ARRAY_SIZE(reserved_words); i++) {
if (reserved_words[i].type != type)
@@ -1213,7 +1149,7 @@ uc_tokenname(unsigned type)
return buf;
}
- return "?";
+ return tokennames[type] ? tokennames[type] : "?";
}
bool