summary | refs | log | tree | commit | diff | homepage
path: root/lexer.c
diff options
context:
space:
mode:
author: Jo-Philipp Wich <jo@mein.io> 2020-10-09 16:01:31 +0200
committer: Jo-Philipp Wich <jo@mein.io> 2020-10-14 12:09:28 +0200
commit: 6ad05263426e6f4aae4665d52b9ed1962ab4cd24 (patch)
tree: 7b73f563e291eeab944071e0c9a3b9128e924c6b /lexer.c
parent: 4d1c4e28b8d8368a105717e142f8e920cbf4ea0f (diff)
lexer: rewrite
Rewrite the lexer into a restartable state machine to support parsing from file streams without the need to read the entire source text into memory first. As a side effect, the length of labels and strings is unlimited now.

Signed-off-by: Jo-Philipp Wich <jo@mein.io>
Diffstat (limited to 'lexer.c')
-rw-r--r-- lexer.c 1380
1 files changed, 783 insertions, 597 deletions
diff --git a/lexer.c b/lexer.c
index 810e2eb..2ec1ea0 100644
--- a/lexer.c
+++ b/lexer.c
@@ -25,6 +25,7 @@
#include <errno.h>
#include "ast.h"
+#include "lib.h"
#include "lexer.h"
#include "parser.h"
@@ -33,7 +34,11 @@ struct token {
int type;
const char *pat;
int plen;
- int (*parse)(const char *buf, struct ut_op *op, struct ut_state *s);
+ union {
+ uint32_t (*parse)(struct ut_state *s);
+ double d;
+ bool b;
+ };
};
#define dec(o) \
@@ -43,12 +48,11 @@ struct token {
(((x) >= 'a') ? (10 + (x) - 'a') : \
(((x) >= 'A') ? (10 + (x) - 'A') : dec(x)))
-static int parse_comment(const char *, struct ut_op *, struct ut_state *);
-static int parse_string(const char *, struct ut_op *, struct ut_state *);
-static int parse_regexp(const char *, struct ut_op *, struct ut_state *);
-static int parse_number(const char *, struct ut_op *, struct ut_state *);
-static int parse_label(const char *, struct ut_op *, struct ut_state *);
-static int parse_bool(const char *, struct ut_op *, struct ut_state *);
+static uint32_t parse_comment(struct ut_state *);
+static uint32_t parse_string(struct ut_state *);
+static uint32_t parse_regexp(struct ut_state *);
+static uint32_t parse_number(struct ut_state *);
+static uint32_t parse_label(struct ut_state *);
static const struct token tokens[] = {
{ 0, " ", 1 },
@@ -81,8 +85,8 @@ static const struct token tokens[] = {
{ T_GE, ">=", 2 },
{ T_LSHIFT, "<<", 2 },
{ T_RSHIFT, ">>", 2 },
- { 0, "//", 2, parse_comment },
- { 0, "/*", 2, parse_comment },
+ { 0, "//", 2, { .parse = parse_comment } },
+ { 0, "/*", 2, { .parse = parse_comment } },
{ T_OR, "||", 2 },
{ T_LEXP, "{{", 2 },
{ T_REXP, "}}", 2 },
@@ -112,18 +116,18 @@ static const struct token tokens[] = {
{ T_SCOL, ";", 1 },
{ T_SUB, "-", 1 },
{ T_DOT, ".", 1 },
- { T_STRING, "'", 1, parse_string },
- { T_STRING, "\"", 1, parse_string },
- { T_REGEXP, "/", 1, parse_regexp },
- { T_LABEL, "_", 1, parse_label },
- { T_LABEL, "az", 0, parse_label },
- { T_LABEL, "AZ", 0, parse_label },
- { T_NUMBER, "09", 0, parse_number },
+ { T_STRING, "'", 1, { .parse = parse_string } },
+ { T_STRING, "\"", 1, { .parse = parse_string } },
+ { T_REGEXP, "/", 1, { .parse = parse_regexp } },
+ { T_LABEL, "_", 1, { .parse = parse_label } },
+ { T_LABEL, "az", 0, { .parse = parse_label } },
+ { T_LABEL, "AZ", 0, { .parse = parse_label } },
+ { T_NUMBER, "09", 0, { .parse = parse_number } },
};
static const struct token reserved_words[] = {
{ T_ENDFUNC, "endfunction", 11 },
- { T_NUMBER, "Infinity", 8, parse_number },
+ { T_DOUBLE, "Infinity", 8, { .d = INFINITY } },
{ T_CONTINUE, "continue", 8 },
{ T_ENDWHILE, "endwhile", 8 },
{ T_FUNC, "function", 8 },
@@ -136,101 +140,19 @@ static const struct token reserved_words[] = {
{ T_WHILE, "while", 5 },
{ T_BREAK, "break", 5 },
{ T_CATCH, "catch", 5 },
- { T_BOOL, "false", 5, parse_bool },
- { T_BOOL, "true", 4, parse_bool },
+ { T_BOOL, "false", 5, { .b = false } },
+ { T_BOOL, "true", 4, { .b = true } },
{ T_ELSE, "else", 4 },
{ T_THIS, "this", 4 },
{ T_NULL, "null", 4 },
{ T_CASE, "case", 4 },
- { T_NUMBER, "NaN", 3, parse_number },
+ { T_DOUBLE, "NaN", 3, { .d = NAN } },
{ T_TRY, "try", 3 },
{ T_FOR, "for", 3 },
{ T_IF, "if", 2 },
{ T_IN, "in", 2 },
};
-const char *tokennames[__T_MAX] = {
- [0] = "End of file",
- [T_FUNC] = "'function'",
- [T_LOCAL] = "'local'",
- [T_WHILE] = "'while",
- [T_ELSE] = "'else'",
- [T_FOR] = "'for'",
- [T_IF] = "'if'",
- [T_IN] = "'in'",
- [T_ASLEFT] = "'x<<=y'",
- [T_ASRIGHT] = "'x>>=y'",
- [T_AND] = "'x&&y'",
- [T_ASADD] = "'x+=y'",
- [T_ASBAND] = "'x&=y'",
- [T_ASBOR] = "'x|=y'",
- [T_ASBXOR] = "'x^=y'",
- [T_ASDIV] = "'x/=y'",
- [T_ASMOD] = "'x%=y'",
- [T_ASMUL] = "'x*=y'",
- [T_ASSUB] = "'x-=y'",
- [T_DEC] = "'x--'",
- [T_INC] = "'x++'",
- [T_EQ] = "'x==y'",
- [T_NE] = "'x!=y'",
- [T_EQS] = "'x===y'",
- [T_NES] = "'x!==y'",
- [T_LE] = "'x<=y'",
- [T_GE] = "'x>=y'",
- [T_LSHIFT] = "'x<<y'",
- [T_RSHIFT] = "'x>>y'",
- [T_LEXP] = "'{{'",
- [T_REXP] = "'}}'",
- [T_OR] = "'x||y'",
- [T_ADD] = "'x+y'",
- [T_ASSIGN] = "'x=y'",
- [T_BAND] = "'x&y'",
- [T_BOR] = "'x|y'",
- [T_LBRACK] = "'['",
- [T_RBRACK] = "']'",
- [T_BXOR] = "'x^y'",
- [T_LBRACE] = "'{'",
- [T_RBRACE] = "'}'",
- [T_COLON] = "':'",
- [T_COMMA] = "','",
- [T_COMPL] = "'~x'",
- [T_DIV] = "'x/y'",
- [T_GT] = "'x>y'",
- [T_NOT] = "'!x'",
- [T_LT] = "'x<y'",
- [T_MOD] = "'x%y'",
- [T_MUL] = "'x*y'",
- [T_LPAREN] = "'('",
- [T_RPAREN] = "')'",
- [T_QMARK] = "'?'",
- [T_SCOL] = "';'",
- [T_SUB] = "'x-y'",
- [T_DOT] = "'.'",
- [T_STRING] = "String",
- [T_LABEL] = "Label",
- [T_NUMBER] = "Number",
- [T_DOUBLE] = "Double",
- [T_BOOL] = "Bool",
- [T_REGEXP] = "Regexp",
- [T_TEXT] = "Text",
- [T_ENDIF] = "'endif'",
- [T_ENDFOR] = "'endfor'",
- [T_ENDWHILE] = "'endwhile'",
- [T_ENDFUNC] = "'endfuncton'",
- [T_RETURN] = "'return'",
- [T_BREAK] = "'break'",
- [T_CONTINUE] = "'continue'",
- [T_NULL] = "'null'",
- [T_THIS] = "'this'",
- [T_TRY] = "'try'",
- [T_CATCH] = "'catch'",
- [T_SWITCH] = "'switch'",
- [T_CASE] = "'case'",
- [T_DEFAULT] = "'default'",
- //[T_LSTM] = "'{%'",
- //[T_RSTM] = "'%}'"
-};
-
/*
* Stores the given codepoint as a utf8 multibyte sequence into the given
@@ -286,6 +208,117 @@ utf8enc(char **out, int *rem, int code)
return true;
}
+/* length of the longest token in our lookup table */
+#define UT_LEX_MAX_TOKEN_LEN 3
+
+static uint32_t emit_op(struct ut_state *s, uint32_t pos, int type, struct json_object *val)
+{
+ uint32_t off = ut_new_op(s, type, val, UINT32_MAX);
+ struct ut_op *op = ut_get_op(s, off);
+
+ op->off = pos;
+
+ /* Follow JSLint logic and treat a slash after any of the
+ * `(,=:[!&|?{};` characters as the beginning of a regex
+ * literal... */
+ switch (type) {
+ case T_LPAREN:
+ case T_COMMA:
+
+ case T_ASADD:
+ case T_ASBAND:
+ case T_ASBOR:
+ case T_ASBXOR:
+ case T_ASDIV:
+ case T_ASLEFT:
+ case T_ASMOD:
+ case T_ASMUL:
+ case T_ASRIGHT:
+ case T_ASSIGN:
+ case T_ASSUB:
+ case T_EQ:
+ case T_EQS:
+ case T_GE:
+ case T_LE:
+ case T_NE:
+ case T_NES:
+
+ case T_COLON:
+ case T_LBRACK:
+ case T_NOT:
+
+ case T_AND:
+ case T_BAND:
+
+ case T_OR:
+ case T_BOR:
+
+ case T_QMARK:
+
+ case T_LBRACE:
+ case T_RBRACE:
+
+ case T_LSTM:
+ case T_LEXP:
+
+ case T_SCOL:
+ s->lex.expect_div = false;
+ break;
+
+ default:
+ s->lex.expect_div = true;
+ }
+
+ return off;
+}
+
+static void lookbehind_append(struct ut_state *s, const char *data, size_t len)
+{
+ if (len) {
+ s->lex.lookbehind = xrealloc(s->lex.lookbehind, s->lex.lookbehindlen + len);
+ memcpy(s->lex.lookbehind + s->lex.lookbehindlen, data, len);
+ s->lex.lookbehindlen += len;
+ }
+}
+
+static void lookbehind_reset(struct ut_state *s) {
+ free(s->lex.lookbehind);
+ s->lex.lookbehind = NULL;
+ s->lex.lookbehindlen = 0;
+}
+
+static uint32_t lookbehind_to_text(struct ut_state *s, uint32_t pos, int type, const char *strip_trailing_chars) {
+ uint32_t rv = 0;
+
+ if (s->lex.lookbehind) {
+ if (strip_trailing_chars) {
+ while (s->lex.lookbehindlen > 0 && strchr(strip_trailing_chars, s->lex.lookbehind[s->lex.lookbehindlen-1]))
+ s->lex.lookbehindlen--;
+ }
+
+ rv = emit_op(s, pos, type, xjs_new_string_len(s->lex.lookbehind, s->lex.lookbehindlen));
+
+ lookbehind_reset(s);
+ }
+
+ return rv;
+}
+
+static inline size_t buf_remaining(struct ut_state *s) {
+ return (s->lex.bufend - s->lex.bufstart);
+}
+
+static inline bool _buf_startswith(struct ut_state *s, const char *str, size_t len) {
+ return (buf_remaining(s) >= len && !strncmp(s->lex.bufstart, str, len));
+}
+
+#define buf_startswith(s, str) _buf_startswith(s, str, sizeof(str) - 1)
+
+static void buf_consume(struct ut_state *s, ssize_t len) {
+ s->lex.bufstart += len;
+ s->lex.off += len;
+}
+
/*
* Parses a comment from the given buffer.
*
@@ -296,28 +329,50 @@ utf8enc(char **out, int *rem, int code)
* -UT_ERROR_UNTERMINATED_COMMENT Unterminated comment
*/
-static int
-parse_comment(const char *buf, struct ut_op *op, struct ut_state *s)
+static uint32_t
+parse_comment(struct ut_state *s)
{
- const char *p = buf;
+ const struct token *tok = s->lex.tok;
+ const char *ptr, *end;
+ size_t elen;
- /* single line comment */
- if (p[0] == '/' && p[1] == '/') {
- while (*p != 0 && *p != '\n')
- p++;
+ if (!buf_remaining(s)) {
+ s->error.code = UT_ERROR_UNTERMINATED_COMMENT;
- return (p - buf);
+ return 0;
}
- /* multi line comment */
- while (*p) {
- if (p[0] == '*' && p[1] == '/')
- break;
+ if (!strcmp(tok->pat, "//")) {
+ end = "\n";
+ elen = 1;
+ }
+ else {
+ end = "*/";
+ elen = 2;
+ }
+
+ for (ptr = s->lex.bufstart; ptr < s->lex.bufend - elen; ptr++) {
+ if (!strncmp(ptr, end, elen)) {
+ buf_consume(s, (ptr - s->lex.bufstart) + elen);
- p++;
+ return UINT32_MAX;
+ }
}
- return *p ? (p - buf) + 2 : -UT_ERROR_UNTERMINATED_COMMENT;
+ buf_consume(s, ptr - s->lex.bufstart);
+
+ return 0;
+}
+
+static void append_utf8(struct ut_state *s, int code) {
+ char ustr[8], *up;
+ int rem;
+
+ up = ustr;
+ rem = sizeof(ustr);
+
+ if (utf8enc(&up, &rem, code))
+ lookbehind_append(s, ustr, up - ustr);
}
/*
@@ -332,269 +387,236 @@ parse_comment(const char *buf, struct ut_op *op, struct ut_state *s)
* -UT_ERROR_OVERLONG_STRING String literal too long
*/
-static int
-parse_string(const char *buf, struct ut_op *op, struct ut_state *s)
+static uint32_t
+parse_string(struct ut_state *s)
{
- char q = *(buf++);
- char str[128] = { 0 };
- char *out = str;
- const char *in = buf;
- bool esc = false;
- int rem = sizeof(str) - 1;
- int lead_surrogate = 0;
+ const struct token *tok = s->lex.tok;
+ char q = tok->pat[0];
+ char *ptr, *c;
+ uint32_t rv;
int code;
- while (*in) {
- /* continuation of escape sequence */
- if (esc) {
- /* \uFFFF */
- if (in[0] == 'u') {
- if (isxdigit(in[1]) && isxdigit(in[2]) &&
- isxdigit(in[3]) && isxdigit(in[4])) {
- code = hex(in[1]) * 16 * 16 * 16 +
- hex(in[2]) * 16 * 16 +
- hex(in[3]) * 16 +
- hex(in[4]);
-
- /* is a leading surrogate value */
- if ((code & 0xFC00) == 0xD800) {
- /* found a subsequent leading surrogate, ignore and emit replacement char for previous one */
- if (lead_surrogate) {
- if (!utf8enc(&out, &rem, 0xFFFD)) {
- s->off += (in - buf);
-
- return -UT_ERROR_OVERLONG_STRING;
- }
- }
+ if (!buf_remaining(s)) {
+ s->error.code = UT_ERROR_UNTERMINATED_STRING;
+ s->lex.off = s->lex.lastoff;
- /* store surrogate value and advance to next escape sequence */
- lead_surrogate = code;
- goto next;
- }
+ return 0;
+ }
- /* is a trailing surrogate value */
- else if ((code & 0xFC00) == 0xDC00) {
- /* found a trailing surrogate following a leading one, combine and encode */
- if (lead_surrogate) {
- code = 0x10000 + ((lead_surrogate & 0x3FF) << 10) + (code & 0x3FF);
- lead_surrogate = 0;
- }
+ for (ptr = s->lex.bufstart; ptr < s->lex.bufend; ptr++) {
+ /* continuation of escape sequence */
+ if (s->lex.is_escape) {
+ if (s->lex.esclen == 0) {
+ /* non-unicode escape following a lead surrogate, emit replacement... */
+ if (s->lex.lead_surrogate && *ptr != 'u') {
+ append_utf8(s, 0xFFFD);
+ s->lex.lead_surrogate = 0;
+ }
- /* trailing surrogate not following a leading one, ignore and use replacement char */
- else {
- code = 0xFFFD;
- }
+ switch (*ptr) {
+ case 'u':
+ case 'x':
+ s->lex.esc[s->lex.esclen++] = *ptr;
+ break;
+
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ /* regex mode => backref, retain literally */
+ if (q == '/') {
+ s->lex.is_escape = false;
+ lookbehind_append(s, "\\", 1);
+ lookbehind_append(s, ptr, 1);
+ buf_consume(s, (ptr + 1) - s->lex.bufstart);
}
- if (!utf8enc(&out, &rem, code)) {
- s->off += (in - buf);
+ /* string mode => likely octal */
+ else if (*ptr < '8') {
+ s->lex.esc[s->lex.esclen++] = 'o';
+ s->lex.esc[s->lex.esclen++] = *ptr;
+ }
- return -UT_ERROR_OVERLONG_STRING;
+ /* non-octal char, add verbatim */
+ else {
+ s->lex.is_escape = false;
+ lookbehind_append(s, ptr, 1);
+ buf_consume(s, (ptr + 1) - s->lex.bufstart);
}
-next:
- in += 5;
- }
- else {
- s->off += (in - buf);
+ break;
- return -UT_ERROR_INVALID_ESCAPE;
+ default:
+ s->lex.is_escape = false;
+ c = strchr("a\ab\be\ef\fn\nr\rt\tv\v", *ptr);
+ lookbehind_append(s, c ? c + 1 : ptr, 1);
+ buf_consume(s, (ptr + 1) - s->lex.bufstart);
+ break;
}
}
-
- /* other escape sequences */
else {
- /* found any non-utf8 escape sequence following a leading unicode surrogate,
- emit replacement character and skip surrogate. */
- if (lead_surrogate) {
- if (!utf8enc(&out, &rem, 0xFFFD)) {
- s->off += (in - buf);
-
- return -UT_ERROR_OVERLONG_STRING;
- }
-
- lead_surrogate = 0;
- }
-
- /* \xFF */
- if (in[0] == 'x') {
- if (isxdigit(in[1]) && isxdigit(in[2])) {
- if (!utf8enc(&out, &rem, hex(in[1]) * 16 + hex(in[2]))) {
- s->off += (in - buf);
-
- return -UT_ERROR_OVERLONG_STRING;
+ switch (s->lex.esc[0]) {
+ case 'u':
+ if (s->lex.esclen < 5) {
+ if (!isxdigit(*ptr)) {
+ s->lex.off += s->lex.esclen + 1;
+ s->error.code = UT_ERROR_INVALID_ESCAPE;
+
+ return 0;
}
- in += 3;
+ s->lex.esc[s->lex.esclen++] = *ptr;
}
- else {
- s->off += (in - buf);
-
- return -UT_ERROR_INVALID_ESCAPE;
- }
- }
- /* \1 .. \9 (regex backreference) */
- else if (q == '/' && in[0] >= '0' && in[0] <= '9') {
- /* in regexp mode, retain backslash */
- if (rem-- < 1) {
- s->off += (in - buf);
-
- return -UT_ERROR_OVERLONG_STRING;
- }
-
- *out++ = '\\';
- *out = *in;
- }
+ if (s->lex.esclen == 5) {
+ code = hex(s->lex.esc[1]) * 16 * 16 * 16 +
+ hex(s->lex.esc[2]) * 16 * 16 +
+ hex(s->lex.esc[3]) * 16 +
+ hex(s->lex.esc[4]);
- /* \377, \77 or \7 */
- else if (in[0] >= '0' && in[0] <= '7') {
- if (lead_surrogate) {
- if (!utf8enc(&out, &rem, 0xFFFD)) {
- s->off += (in - buf);
+ /* is a leading surrogate value */
+ if ((code & 0xFC00) == 0xD800) {
+ /* found a subsequent leading surrogate, ignore and emit replacement char for previous one */
+ if (s->lex.lead_surrogate)
+ append_utf8(s, 0xFFFD);
- return -UT_ERROR_OVERLONG_STRING;
+ /* store surrogate value and advance to next escape sequence */
+ s->lex.lead_surrogate = code;
}
- lead_surrogate = 0;
- }
-
- /* \377 */
- if (in[1] >= '0' && in[1] <= '7' &&
- in[2] >= '0' && in[2] <= '7') {
- code = dec(in[0]) * 8 * 8 +
- dec(in[1]) * 8 +
- dec(in[2]);
+ /* is a trailing surrogate value */
+ else if ((code & 0xFC00) == 0xDC00) {
+ /* found a trailing surrogate following a leading one, combine and encode */
+ if (s->lex.lead_surrogate) {
+ code = 0x10000 + ((s->lex.lead_surrogate & 0x3FF) << 10) + (code & 0x3FF);
+ s->lex.lead_surrogate = 0;
+ }
- if (code > 255) {
- s->off += (in - buf);
+ /* trailing surrogate not following a leading one, ignore and use replacement char */
+ else {
+ code = 0xFFFD;
+ }
- return -UT_ERROR_INVALID_ESCAPE;
+ append_utf8(s, code);
}
- if (!utf8enc(&out, &rem, code)) {
- s->off += (in - buf);
-
- return -UT_ERROR_OVERLONG_STRING;
+ /* is a normal codepoint */
+ else {
+ append_utf8(s, code);
}
- in += 3;
+ s->lex.esclen = 0;
+ s->lex.is_escape = false;
+ buf_consume(s, (ptr + 1) - s->lex.bufstart);
}
- /* \77 */
- else if (in[1] >= '0' && in[1] <= '7') {
- if (!utf8enc(&out, &rem, dec(in[0]) * 8 + dec(in[1]))) {
- s->off += (in - buf);
+ break;
- return -UT_ERROR_OVERLONG_STRING;
+ case 'x':
+ if (s->lex.esclen < 3) {
+ if (!isxdigit(*ptr)) {
+ s->lex.off += s->lex.esclen + 1;
+ s->error.code = UT_ERROR_INVALID_ESCAPE;
+ return 0;
}
- in += 2;
+ s->lex.esc[s->lex.esclen++] = *ptr;
}
- /* \7 */
- else {
- if (!utf8enc(&out, &rem, dec(in[0]))) {
- s->off += (in - buf);
-
- return -UT_ERROR_OVERLONG_STRING;
- }
+ if (s->lex.esclen == 3) {
+ append_utf8(s, hex(s->lex.esc[1]) * 16 + hex(s->lex.esc[2]));
- in += 1;
+ s->lex.esclen = 0;
+ s->lex.is_escape = false;
+ buf_consume(s, (ptr + 1) - s->lex.bufstart);
}
- }
- /* single character escape */
- else {
- if (lead_surrogate) {
- if (!utf8enc(&out, &rem, 0xFFFD)) {
- s->off += (in - buf);
+ break;
+
+ case 'o':
+ if (s->lex.esclen < 4) {
+ /* found a non-octal char */
+ if (*ptr < '0' || *ptr > '7') {
+ /* pad sequence to three chars */
+ switch (s->lex.esclen) {
+ case 3:
+ s->lex.esc[3] = s->lex.esc[2];
+ s->lex.esc[2] = s->lex.esc[1];
+ s->lex.esc[1] = '0';
+ break;
+
+ case 2:
+ s->lex.esc[3] = s->lex.esc[1];
+ s->lex.esc[2] = '0';
+ s->lex.esc[1] = '0';
+ break;
+ }
- return -UT_ERROR_OVERLONG_STRING;
+ s->lex.esclen = 4;
+ buf_consume(s, ptr - s->lex.bufstart);
}
- lead_surrogate = 0;
+ /* append */
+ else {
+ s->lex.esc[s->lex.esclen++] = *ptr;
+ buf_consume(s, (ptr + 1) - s->lex.bufstart);
+ }
}
- if (rem-- < 1) {
- s->off += (in - buf);
+ if (s->lex.esclen == 4) {
+ code = dec(s->lex.esc[1]) * 8 * 8 +
+ dec(s->lex.esc[2]) * 8 +
+ dec(s->lex.esc[3]);
- return -UT_ERROR_OVERLONG_STRING;
- }
-
- switch (in[0]) {
- case 'a': *out = '\a'; break;
- case 'b': *out = '\b'; break;
- case 'e': *out = '\e'; break;
- case 'f': *out = '\f'; break;
- case 'n': *out = '\n'; break;
- case 'r': *out = '\r'; break;
- case 't': *out = '\t'; break;
- case 'v': *out = '\v'; break;
- default:
- /* in regexp mode, retain backslash */
- if (q == '/') {
- if (rem-- < 1) {
- s->off += (in - buf);
-
- return -UT_ERROR_OVERLONG_STRING;
- }
+ if (code > 255) {
+ s->lex.off += s->lex.esclen + 1;
+ s->error.code = UT_ERROR_INVALID_ESCAPE;
- *out++ = '\\';
+ return 0;
}
- *out = *in;
- break;
+ append_utf8(s, code);
+
+ s->lex.esclen = 0;
+ s->lex.is_escape = false;
}
- in++;
- out++;
+ break;
}
}
-
- esc = false;
- continue;
}
- /* begin of escape sequence */
- if (*in == '\\') {
- in++;
- esc = true;
- continue;
- }
-
-
- /* there's a non-escape following a previous leading unicode surrogate,
- * ignore surrogate and emit replacement char */
- if (lead_surrogate) {
- if (!utf8enc(&out, &rem, 0xFFFD)) {
- s->off += (in - buf);
-
- return -UT_ERROR_OVERLONG_STRING;
- }
-
- lead_surrogate = 0;
- }
+ /* terminating char */
+ else if (*ptr == q) {
+ lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart);
+ buf_consume(s, (ptr + 1) - s->lex.bufstart);
+ rv = lookbehind_to_text(s, s->lex.lastoff, T_STRING, NULL);
- /* terminating quote */
- if (*in == q) {
- op->val = xjs_new_string_len(str, sizeof(str) - 1 - rem);
+ if (!rv)
+ rv = emit_op(s, s->lex.lastoff, T_STRING, xjs_new_string_len("", 0));
- return (in - buf) + 2;
+ return rv;
}
- /* ordinary char */
- if (rem-- < 1) {
- s->off += (in - buf);
-
- return -UT_ERROR_OVERLONG_STRING;
+ /* escape sequence start */
+ else if (*ptr == '\\') {
+ s->lex.is_escape = true;
+ lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart);
+ buf_consume(s, ptr - s->lex.bufstart);
}
-
- *out++ = *in++;
}
- return -UT_ERROR_UNTERMINATED_STRING;
+ lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart);
+ buf_consume(s, ptr - s->lex.bufstart);
+
+ return 0;
}
@@ -611,65 +633,97 @@ next:
* -UT_ERROR_INVALID_REGEXP Could not compile regexp
*/
-static int
-parse_regexp(const char *buf, struct ut_op *op, struct ut_state *s)
+enum {
+ UT_LEX_PARSE_REGEX_INIT,
+ UT_LEX_PARSE_REGEX_PATTERN,
+ UT_LEX_PARSE_REGEX_FLAGS
+};
+
+static uint32_t
+parse_regexp(struct ut_state *s)
{
- struct json_object *rv;
- const char *p;
+ struct json_object *pattern;
+ struct ut_op *op;
+ uint32_t rv;
char *err;
- int len;
- if (s->expect_div == 1) {
- if (!strncmp(buf, "/=", 2)) {
- op->type = T_ASDIV;
- return 2;
+ switch (s->lex.esc[0]) {
+ case UT_LEX_PARSE_REGEX_INIT:
+ if (s->lex.expect_div) {
+ s->lex.expect_div = false;
+
+ if (buf_startswith(s, "=")) {
+ buf_consume(s, 1);
+
+ return emit_op(s, s->lex.off, T_ASDIV, NULL);
+ }
+
+ return emit_op(s, s->lex.off, T_DIV, NULL);
}
- else {
- op->type = T_DIV;
- return 1;
+
+ s->lex.esc[0] = UT_LEX_PARSE_REGEX_PATTERN;
+ break;
+
+ case UT_LEX_PARSE_REGEX_PATTERN:
+ rv = parse_string(s);
+
+ if (rv != 0 && rv != UINT32_MAX) {
+ s->lex.lookbehind = (char *)ut_get_op(s, rv);
+ s->lex.esc[0] = UT_LEX_PARSE_REGEX_FLAGS;
}
- }
- len = parse_string(buf, op, s);
+ break;
- if (len < 2) {
- json_object_put(op->val);
+ case UT_LEX_PARSE_REGEX_FLAGS:
+ op = (struct ut_op *)s->lex.lookbehind;
- return (len < 0) ? len : -UT_ERROR_UNTERMINATED_STRING;
- }
+ while (s->lex.bufstart < s->lex.bufend) {
+ switch (s->lex.bufstart[0]) {
+ case 'g':
+ buf_consume(s, 1);
+ op->is_reg_global = true;
+ break;
- for (p = buf + len; strchr("gis", *p); p++) {
- switch (*p) {
- case 'g':
- op->is_reg_global = 1;
- len++;
- break;
+ case 'i':
+ buf_consume(s, 1);
+ op->is_reg_icase = true;
+ break;
- case 'i':
- op->is_reg_icase = 1;
- len++;
- break;
+ case 's':
+ buf_consume(s, 1);
+ op->is_reg_newline = true;
+ break;
- case 's':
- op->is_reg_newline = 1;
- len++;
- break;
- }
- }
+ default:
+ s->lex.lookbehind = NULL;
+
+ pattern = ut_new_regexp(json_object_get_string(op->val),
+ op->is_reg_icase,
+ op->is_reg_newline,
+ op->is_reg_global,
+ &err);
- p = json_object_get_string(op->val);
- rv = ut_new_regexp(p, op->is_reg_icase, op->is_reg_newline, op->is_reg_global, &err);
+ json_object_put(op->val);
- json_object_put(op->val);
- op->val = rv;
+ op->type = T_REGEXP;
+ op->val = pattern;
- if (!rv) {
- s->error.info.regexp_error = err;
+ if (!pattern) {
+ s->error.info.regexp_error = err;
+ s->error.code = UT_ERROR_INVALID_REGEXP;
+ s->lex.off = s->lex.lastoff;
+
+ return 0;
+ }
+
+ return ut_get_off(s, op);
+ }
+ }
- return -UT_ERROR_INVALID_REGEXP;
+ break;
}
- return len;
+ return 0;
}
@@ -683,41 +737,50 @@ parse_regexp(const char *buf, struct ut_op *op, struct ut_state *s)
* -UT_ERROR_OVERLONG_STRING Label too long
*/
-static int
-parse_label(const char *buf, struct ut_op *op, struct ut_state *s)
+static uint32_t
+parse_label(struct ut_state *s)
{
+ const struct token *tok = s->lex.tok;
const struct token *word;
- char str[128] = { 0 };
- char *out = str;
- const char *in = buf;
- int rem = sizeof(str) - 1;
- int i;
-
- while (*in == '_' || isalnum(*in)) {
- if (rem-- < 1) {
- s->off += (in - buf);
- return -UT_ERROR_OVERLONG_STRING;
- }
+ uint32_t rv;
+ char *ptr;
+ size_t i;
- *out++ = *in++;
- }
+ if (!s->lex.lookbehind && tok->plen)
+ lookbehind_append(s, tok->pat, tok->plen);
- for (i = 0, word = &reserved_words[0];
- i < sizeof(reserved_words) / sizeof(reserved_words[0]);
- i++, word = &reserved_words[i]) {
- if (!strcmp(str, word->pat)) {
- op->type = word->type;
+ if (!buf_remaining(s) || (s->lex.bufstart[0] != '_' && !isalnum(s->lex.bufstart[0]))) {
+ for (i = 0, word = &reserved_words[0]; i < ARRAY_SIZE(reserved_words); i++, word = &reserved_words[i]) {
+ if (s->lex.lookbehindlen == word->plen && !strncmp(s->lex.lookbehind, word->pat, word->plen)) {
+ lookbehind_reset(s);
- if (word->parse)
- word->parse(str, op, s);
+ switch (word->type) {
+ case T_DOUBLE:
+ rv = emit_op(s, s->lex.off - word->plen, word->type, ut_new_double(word->d));
+ break;
- return (in - buf);
+ case T_BOOL:
+ rv = emit_op(s, s->lex.off - word->plen, word->type, xjs_new_boolean(word->b));
+ break;
+
+ default:
+ rv = emit_op(s, s->lex.off - word->plen, word->type, NULL);
+ }
+
+ return rv;
+ }
}
+
+ return lookbehind_to_text(s, s->lex.off - s->lex.lookbehindlen, T_LABEL, NULL);
}
- op->val = xjs_new_string(str);
+ for (ptr = s->lex.bufstart; ptr < s->lex.bufend && (*ptr == '_' || isalnum(*ptr)); ptr++)
+ ;
+
+ lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart);
+ buf_consume(s, ptr - s->lex.bufstart);
- return (in - buf);
+ return 0;
}
@@ -731,287 +794,410 @@ parse_label(const char *buf, struct ut_op *op, struct ut_state *s)
* -UT_ERROR_INVALID_ESCAPE Invalid number character
*/
-static int
-parse_number(const char *buf, struct ut_op *op, struct ut_state *s)
+static inline bool
+is_numeric_char(struct ut_state *s, char c)
+{
+ char prev = s->lex.lookbehindlen ? s->lex.lookbehind[s->lex.lookbehindlen-1] : 0;
+
+ if ((prev == 'e' || prev == 'E') && (c == '-' || c == '+'))
+ return true;
+
+ return (isxdigit(c) || c == 'x' || c == 'X' || c == '.');
+}
+
+static uint32_t
+parse_number(struct ut_state *s)
{
+ uint32_t rv = 0;
long long int n;
+ char *ptr, *e;
double d;
- char *e;
- if (!strncmp(buf, "Infinity", 8)) {
- op->type = T_DOUBLE;
- op->val = ut_new_double(INFINITY);
+ if (!buf_remaining(s) || !is_numeric_char(s, s->lex.bufstart[0])) {
+ lookbehind_append(s, "\0", 1);
- return 8;
- }
- else if (!strncmp(buf, "NaN", 3)) {
- op->type = T_DOUBLE;
- op->val = ut_new_double(NAN);
+ n = strtoll(s->lex.lookbehind, &e, 0);
+
+ if (*e == '.' || *e == 'e' || *e == 'E') {
+ d = strtod(s->lex.lookbehind, &e);
+
+ if (e > s->lex.lookbehind && *e == 0) {
+ rv = emit_op(s, s->lex.off - (e - s->lex.lookbehind), T_DOUBLE, ut_new_double(d));
+ }
+ else {
+ s->error.code = UT_ERROR_INVALID_ESCAPE;
+ s->lex.off -= s->lex.lookbehindlen - (e - s->lex.lookbehind) - 1;
+ }
+ }
+ else if (*e == 0) {
+ rv = emit_op(s, s->lex.off - (e - s->lex.lookbehind), T_NUMBER, xjs_new_int64(n));
+ ut_get_op(s, rv)->is_overflow = (errno == ERANGE);
+ }
+ else {
+ s->error.code = UT_ERROR_INVALID_ESCAPE;
+ s->lex.off -= s->lex.lookbehindlen - (e - s->lex.lookbehind) - 1;
+ }
+
+ lookbehind_reset(s);
- return 3;
+ return rv;
}
- n = strtoll(buf, &e, 0);
+ for (ptr = s->lex.bufstart; ptr < s->lex.bufend && is_numeric_char(s, *ptr); ptr++)
+ ;
- if (e > buf) {
- if (*e == '.') {
- d = strtod(buf, &e);
+ lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart);
+ buf_consume(s, ptr - s->lex.bufstart);
- if (e > buf) {
- op->type = T_DOUBLE;
- op->val = ut_new_double(d);
+ return 0;
+}
- return (e - buf);
- }
+static uint32_t
+lex_step(struct ut_state *s, FILE *fp)
+{
+ const struct token *tok;
+ size_t rlen, rem;
+ char *ptr, c;
+ uint32_t rv;
+ size_t i;
+
+ /* fewer than UT_LEX_MAX_TOKEN_LEN unread buffer chars remaining,
+ * move the remaining bytes to the beginning and read more data */
+ if (buf_remaining(s) < UT_LEX_MAX_TOKEN_LEN) {
+ if (!s->lex.buf) {
+ s->lex.buflen = 128;
+ s->lex.buf = xalloc(s->lex.buflen);
}
+ rem = s->lex.bufend - s->lex.bufstart;
+
+ memcpy(s->lex.buf, s->lex.bufstart, rem);
- op->type = T_NUMBER;
- op->val = xjs_new_int64(n);
- op->is_overflow = (errno == ERANGE);
+ rlen = fread(s->lex.buf + rem, 1, s->lex.buflen - rem, fp);
- return (e - buf);
+ s->lex.bufstart = s->lex.buf;
+ s->lex.bufend = s->lex.buf + rlen + rem;
+
+ if (rlen == 0 && (ferror(fp) || feof(fp)))
+ s->lex.eof = 1;
}
- return -UT_ERROR_INVALID_ESCAPE;
-}
+ switch (s->lex.state) {
+ case UT_LEX_IDENTIFY_BLOCK:
+ /* previous block had strip trailing whitespace flag, skip leading whitespace */
+ if (s->lex.skip_leading_whitespace) {
+ while (buf_remaining(s) && isspace(s->lex.bufstart[0]))
+ buf_consume(s, 1);
+ s->lex.skip_leading_whitespace = false;
+ }
-/*
- * Parses a bool literal from the given buffer.
- *
- * Returns the amount of consumed characters from the given buffer.
- */
+ /* previous block was a statement block and trim_blocks is enabled, skip leading newline */
+ else if (s->lex.skip_leading_newline) {
+ if (buf_startswith(s, "\n"))
+ buf_consume(s, 1);
-static int
-parse_bool(const char *buf, struct ut_op *op, struct ut_state *s)
-{
- if (!strncmp(buf, "false", 5)) {
- op->val = xjs_new_boolean(false);
+ s->lex.skip_leading_newline = false;
+ }
- return 5;
- }
- else if (!strncmp(buf, "true", 4)) {
- op->val = xjs_new_boolean(true);
+ /* scan forward through buffer to identify start token */
+ for (ptr = s->lex.bufstart; ptr < s->lex.bufend - strlen("{#"); ptr++) {
+ /* found start of comment block */
+ if (!strncmp(ptr, "{#", 2)) {
+ lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart);
+ buf_consume(s, (ptr + 2) - s->lex.bufstart);
+ s->lex.lastoff = s->lex.off - 2;
+ s->lex.state = UT_LEX_BLOCK_COMMENT_START;
- return 4;
- }
+ return 0;
+ }
- return 0;
-}
+ /* found start of expression block */
+ else if (!strncmp(ptr, "{{", 2)) {
+ lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart);
+ buf_consume(s, (ptr + 2) - s->lex.bufstart);
+ s->lex.lastoff = s->lex.off - 2;
+ s->lex.state = UT_LEX_BLOCK_EXPRESSION_START;
+ return 0;
+ }
-static int
-match_token(const char *ptr, struct ut_op *op, struct ut_state *s)
-{
- int i;
- const struct token *tok;
+ /* found start of statement block */
+ else if (!strncmp(ptr, "{%", 2)) {
+ lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart);
+ buf_consume(s, (ptr + 2) - s->lex.bufstart);
+ s->lex.lastoff = s->lex.off - 2;
+ s->lex.state = UT_LEX_BLOCK_STATEMENT_START;
- for (i = 0, tok = &tokens[0];
- i < sizeof(tokens) / sizeof(tokens[0]);
- i++, tok = &tokens[i]) {
- if ((tok->plen > 0 && !strncmp(ptr, tok->pat, tok->plen)) ||
- (tok->plen == 0 && *ptr >= tok->pat[0] && *ptr <= tok->pat[1])) {
- op->type = tok->type;
+ return 0;
+ }
+ }
- if (tok->parse)
- return tok->parse(ptr, op, s);
+ /* we're at eof */
+ if (s->lex.eof) {
+ lookbehind_append(s, ptr, s->lex.bufend - ptr);
+ s->lex.state = UT_LEX_EOF;
- return tok->plen;
+ return lookbehind_to_text(s, s->lex.lastoff, T_TEXT, NULL);
}
- }
- return -UT_ERROR_UNEXPECTED_CHAR;
-}
+ lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart);
+ buf_consume(s, ptr - s->lex.bufstart);
+ break;
-uint32_t
-ut_get_token(struct ut_state *s, const char *input, int *mlen)
-{
- struct ut_op op = { 0 };
- const char *o, *p;
- uint32_t rv;
- for (o = p = input; *p; p++) {
- if (s->blocktype == UT_BLOCK_NONE) {
- if (!strncmp(p, "{#", 2))
- s->blocktype = UT_BLOCK_COMMENT;
- else if (!strncmp(p, "{{", 2))
- s->blocktype = UT_BLOCK_EXPRESSION;
- else if (!strncmp(p, "{%", 2))
- s->blocktype = UT_BLOCK_STATEMENT;
-
- if (s->blocktype) {
- *mlen = p - input;
- s->start_tag_seen = 0;
- s->off += *mlen;
-
- /* strip whitespace before block */
- if (p[2] == '-') {
- while (p > o && isspace(p[-1]))
- p--;
- }
+ case UT_LEX_BLOCK_COMMENT_START:
+ case UT_LEX_BLOCK_EXPRESSION_START:
+ case UT_LEX_BLOCK_STATEMENT_START:
+ rv = 0;
+ s->lex.skip_leading_whitespace = 0;
- /* lstrip */
- else if (s->lstrip_blocks && s->blocktype == UT_BLOCK_STATEMENT && p[2] != '+') {
- while (p > o && p[-1] != '\n' && isspace(p[-1]))
- p--;
- }
+ /* strip whitespace before block */
+ if (buf_startswith(s, "-")) {
+ rv = lookbehind_to_text(s, s->lex.off, T_TEXT, " \n\t\v\f\r");
+ buf_consume(s, 1);
+ }
- if (p == o)
- return 0;
+ /* lstrip handling (only valid for statement blocks) */
+ else if (s->lex.state == UT_LEX_BLOCK_STATEMENT_START) {
+ /* disable lstrip flag */
+ if (buf_startswith(s, "+")) {
+ rv = lookbehind_to_text(s, s->lex.off, T_TEXT, NULL);
+ buf_consume(s, 1);
+ }
- return ut_new_op(s, T_TEXT, xjs_new_string_len(o, p - o), UINT32_MAX);
+ /* global block lstrip */
+ else if (s->lstrip_blocks) {
+ rv = lookbehind_to_text(s, s->lex.off, T_TEXT, " \t\v\f\r");
}
}
- else if (s->blocktype == UT_BLOCK_COMMENT) {
- if (!strncmp(p, "#}", 2) || !strncmp(p, "-#}", 3)) {
- *mlen = (p - input) + 2;
+ else {
+ rv = lookbehind_to_text(s, s->lex.off, T_TEXT, NULL);
+ }
- /* strip whitespace after block */
- if (*p == '-') {
- (*mlen)++;
+ switch (s->lex.state) {
+ case UT_LEX_BLOCK_COMMENT_START:
+ s->lex.state = UT_LEX_BLOCK_COMMENT;
+ break;
- while (isspace(p[3])) {
- (*mlen)++;
- p++;
- }
- }
+ case UT_LEX_BLOCK_STATEMENT_START:
+ s->lex.within_statement_block = 1;
+ s->lex.state = UT_LEX_IDENTIFY_TOKEN;
+ break;
- s->blocktype = UT_BLOCK_NONE;
- s->off += *mlen;
+ case UT_LEX_BLOCK_EXPRESSION_START:
+ s->lex.state = UT_LEX_BLOCK_EXPRESSION_EMIT_TAG;
+ break;
- return 0;
- }
+ default:
+ break;
}
- else if (s->blocktype == UT_BLOCK_STATEMENT || s->blocktype == UT_BLOCK_EXPRESSION) {
- *mlen = match_token(p, &op, s);
- if (*mlen < 0) {
- s->error.code = -*mlen;
+ return rv;
- return 0;
+
+ case UT_LEX_BLOCK_COMMENT:
+ /* scan forward through buffer to identify end token */
+ while (s->lex.bufstart < s->lex.bufend - 2) {
+ if (buf_startswith(s, "-#}")) {
+ s->lex.state = UT_LEX_IDENTIFY_BLOCK;
+ s->lex.skip_leading_whitespace = 1;
+ buf_consume(s, 3);
+ s->lex.lastoff = s->lex.off;
+ break;
+ }
+ else if (buf_startswith(s, "#}")) {
+ s->lex.state = UT_LEX_IDENTIFY_BLOCK;
+ s->lex.skip_leading_whitespace = 0;
+ buf_consume(s, 2);
+ s->lex.lastoff = s->lex.off;
+ break;
}
- /* disallow nesting blocks */
- else if ((s->start_tag_seen && s->blocktype == UT_BLOCK_STATEMENT &&
- (op.type == T_LEXP || op.type == T_REXP || op.type == T_LSTM)) ||
- (s->start_tag_seen && s->blocktype == UT_BLOCK_EXPRESSION &&
- (op.type == T_LSTM || op.type == T_RSTM || op.type == T_LEXP))) {
- s->error.code = UT_ERROR_NESTED_BLOCKS;
+ buf_consume(s, 1);
+ }
- return 0;
- }
+ /* we're at eof */
+ if (s->lex.eof) {
+ s->lex.off = s->lex.lastoff;
+ s->error.code = UT_ERROR_UNTERMINATED_BLOCK;
+ }
+
+ break;
+
+
+ case UT_LEX_BLOCK_EXPRESSION_EMIT_TAG:
+ s->lex.within_expression_block = 1;
+ s->lex.state = UT_LEX_IDENTIFY_TOKEN;
- /* emit additional empty statement (semicolon) at end of template block */
- else if ((s->blocktype == UT_BLOCK_STATEMENT && op.type == T_RSTM) ||
- (s->blocktype == UT_BLOCK_EXPRESSION && op.type == T_REXP)) {
- if (!s->semicolon_emitted) {
- s->semicolon_emitted = true;
- op.type = T_SCOL;
- *mlen = 0;
+ return emit_op(s, s->lex.off, T_LEXP, NULL);
+
+
+ case UT_LEX_IDENTIFY_TOKEN:
+ for (i = 0, tok = tokens; i < ARRAY_SIZE(tokens); tok = &tokens[++i]) {
+ /* remaining buffer data is shorter than token, skip */
+ if (tok->plen > buf_remaining(s))
+ continue;
+
+ c = s->lex.bufstart[0];
+
+ if (tok->plen ? !strncmp(s->lex.bufstart, tok->pat, tok->plen)
+ : (c >= tok->pat[0] && c <= tok->pat[1])) {
+ buf_consume(s, tok->plen);
+
+ /* token has a parse method, switch state */
+ if (tok->parse) {
+ s->lex.tok = tok;
+ s->lex.state = UT_LEX_PARSE_TOKEN;
+ s->lex.lastoff = s->lex.off - tok->plen;
+
+ return 0;
}
- else {
- /* strip whitespace after block */
- if (*p == '-') {
- while (isspace(p[3])) {
- (*mlen)++;
- p++;
- }
- }
- else if (s->blocktype == UT_BLOCK_STATEMENT &&
- s->trim_blocks && p[2] == '\n') {
- (*mlen)++;
+
+ /* disallow nesting blocks */
+ if ((s->lex.within_expression_block &&
+ (tok->type == T_LSTM || tok->type == T_RSTM || tok->type == T_LEXP)) ||
+ (s->lex.within_statement_block &&
+ (tok->type == T_LEXP || tok->type == T_REXP || tok->type == T_LSTM))) {
+ s->error.code = UT_ERROR_NESTED_BLOCKS;
+ s->lex.off -= tok->plen;
+
+ return 0;
+ }
+
+ /* found end of block */
+ else if ((s->lex.within_statement_block && tok->type == T_RSTM) ||
+ (s->lex.within_expression_block && tok->type == T_REXP)) {
+ /* emit additional empty statement (semicolon) at end of template block */
+ if (!s->lex.semicolon_emitted) {
+ s->lex.semicolon_emitted = true;
+
+ /* rewind */
+ buf_consume(s, -tok->plen);
+
+ return emit_op(s, s->lex.off, T_SCOL, NULL);
}
- s->semicolon_emitted = false;
- s->blocktype = UT_BLOCK_NONE;
+ /* strip whitespace after block */
+ if (tok->pat[0] == '-')
+ s->lex.skip_leading_whitespace = true;
+
+ /* strip newline after statement block */
+ else if (s->lex.within_statement_block && s->trim_blocks)
+ s->lex.skip_leading_newline = true;
+
+ s->lex.semicolon_emitted = false;
+ s->lex.within_statement_block = false;
+ s->lex.within_expression_block = false;
+ s->lex.state = UT_LEX_IDENTIFY_BLOCK;
+ s->lex.lastoff = s->lex.off;
}
+
+ /* do not report statement tags to the parser */
+ if (tok->type != 0 && tok->type != T_LSTM && tok->type != T_RSTM)
+ rv = emit_op(s, s->lex.off - tok->plen, tok->type, NULL);
+ else
+ rv = 0;
+
+ return rv;
}
+ }
- s->start_tag_seen = 1;
- s->off += *mlen;
+ /* no token matched and we do have remaining data, junk */
+ if (buf_remaining(s)) {
+ s->error.code = UT_ERROR_UNEXPECTED_CHAR;
- /* do not report '{%' and '%}' tags to parser */
- if (op.type == T_LSTM || op.type == T_RSTM || op.type == 0)
- return 0;
+ return 0;
+ }
- rv = ut_new_op(s, op.type, op.val, UINT32_MAX);
+ /* we're at eof, allow unclosed statement blocks */
+ if (s->lex.within_statement_block) {
+ s->lex.state = UT_LEX_EOF;
- if (rv) {
- s->pool[rv - 1].is_overflow = op.is_overflow;
- s->pool[rv - 1].is_reg_icase = op.is_reg_icase;
- s->pool[rv - 1].is_reg_global = op.is_reg_global;
- s->pool[rv - 1].is_reg_newline = op.is_reg_newline;
- }
+ return 0;
+ }
- /* Follow JSLint logic and treat a slash after any of the
- * `(,=:[!&|?{};` characters as the beginning of a regex
- * literal... */
- switch (op.type) {
- case T_LPAREN:
- case T_COMMA:
-
- case T_ASADD:
- case T_ASBAND:
- case T_ASBOR:
- case T_ASBXOR:
- case T_ASDIV:
- case T_ASLEFT:
- case T_ASMOD:
- case T_ASMUL:
- case T_ASRIGHT:
- case T_ASSIGN:
- case T_ASSUB:
- case T_EQ:
- case T_EQS:
- case T_GE:
- case T_LE:
- case T_NE:
- case T_NES:
-
- case T_COLON:
- case T_LBRACK:
- case T_NOT:
-
- case T_AND:
- case T_BAND:
-
- case T_OR:
- case T_BOR:
-
- case T_QMARK:
-
- case T_LBRACE:
- case T_RBRACE:
-
- case T_LSTM:
- case T_LEXP:
-
- case T_SCOL:
- s->expect_div = 0;
- break;
+ /* premature EOF */
+ s->error.code = UT_ERROR_UNTERMINATED_BLOCK;
- default:
- s->expect_div = 1;
- }
+ break;
+
+
+ case UT_LEX_PARSE_TOKEN:
+ tok = s->lex.tok;
+ rv = tok->parse(s);
+
+ if (rv) {
+ memset(s->lex.esc, 0, sizeof(s->lex.esc));
+ s->lex.state = UT_LEX_IDENTIFY_TOKEN;
+ s->lex.tok = NULL;
+
+ if (rv == UINT32_MAX)
+ rv = 0;
return rv;
}
- }
- /* allow unclosed '{%' blocks */
- if (s->blocktype == UT_BLOCK_EXPRESSION || s->blocktype == UT_BLOCK_COMMENT) {
- s->error.code = UT_ERROR_UNTERMINATED_BLOCK;
+ break;
- return 0;
+
+ case UT_LEX_EOF:
+ break;
}
- if (p > input) {
- *mlen = p - input;
- s->off += *mlen;
+ return 0;
+}
- return ut_new_op(s, T_TEXT, xjs_new_string_len(o, p - o), UINT32_MAX);
+uint32_t
+ut_get_token(struct ut_state *s, FILE *fp)
+{
+ uint32_t rv;
+/*
+ * Run lex_step() repeatedly until it yields a non-zero value, reports an
+ * error, or the lexer state becomes UT_LEX_EOF. The non-zero value is
+ * returned as is; 0 is returned when an error code was set or when the
+ * state is UT_LEX_EOF. fp is only passed through to lex_step() here,
+ * presumably the stream the source text is read from (confirm there).
+ */
+ while (s->lex.state != UT_LEX_EOF) {
+ rv = lex_step(s, fp);
+/* a zero result together with a set error code ends the loop early */
+ if (rv == 0 && s->error.code)
+ break;
+/* zero without an error means nothing to hand out yet, so step again */
+ if (rv > 0)
+ return rv;
}
return 0;
}
+
+const char *
+ut_get_tokenname(int type)
+{
+ static char buf[sizeof("'endfunction'")];
+ size_t i;
+/*
+ * Translate a token type into a printable name. Type 0 and the five
+ * types listed in the switch below map to fixed strings. Any other type
+ * is looked up in tokens[] and then in reserved_words[]; the pattern of
+ * the first matching entry is formatted, wrapped in single quotes, into
+ * the static buffer buf. A type found in neither table yields a
+ * question mark string.
+ *
+ * Because buf is static, a quoted name is overwritten by the next call
+ * that formats one. snprintf() cuts off anything that does not fit into
+ * buf; presumably the sizing literal is the longest pattern in either
+ * table (TODO confirm).
+ */
+ switch (type) {
+ case 0: return "End of file";
+ case T_STRING: return "String";
+ case T_LABEL: return "Label";
+ case T_NUMBER: return "Number";
+ case T_DOUBLE: return "Double";
+ case T_REGEXP: return "Regexp";
+ }
+/*
+ * NOTE(review): lex_step() treats pat of entries with plen == 0 as a
+ * two character range rather than literal text; confirm no such entry
+ * carries a type that can reach this loop.
+ */
+ for (i = 0; i < ARRAY_SIZE(tokens); i++) {
+ if (tokens[i].type != type)
+ continue;
+
+ snprintf(buf, sizeof(buf), "'%s'", tokens[i].pat);
+
+ return buf;
+ }
+/* nothing matched in tokens[], repeat the lookup in reserved_words[] */
+ for (i = 0; i < ARRAY_SIZE(reserved_words); i++) {
+ if (reserved_words[i].type != type)
+ continue;
+
+ snprintf(buf, sizeof(buf), "'%s'", reserved_words[i].pat);
+
+ return buf;
+ }
+/* type is not present in either table */
+ return "?";
+}