/* * Copyright (C) 2020 Jo-Philipp Wich * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include #include #include #include #include #include #include #include #include "ast.h" #include "lib.h" #include "lexer.h" #include "parser.h" struct token { int type; const char *pat; int plen; union { uint32_t (*parse)(struct ut_state *s); double d; bool b; }; }; #define dec(o) \ ((o) - '0') #define hex(x) \ (((x) >= 'a') ? (10 + (x) - 'a') : \ (((x) >= 'A') ? (10 + (x) - 'A') : dec(x))) static uint32_t parse_comment(struct ut_state *); static uint32_t parse_string(struct ut_state *); static uint32_t parse_regexp(struct ut_state *); static uint32_t parse_number(struct ut_state *); static uint32_t parse_label(struct ut_state *); static const struct token tokens[] = { { 0, " ", 1 }, { 0, "\t", 1 }, { 0, "\r", 1 }, { 0, "\n", 1 }, { T_ASLEFT, "<<=", 3 }, { T_ASRIGHT, ">>=", 3 }, { T_LEXP, "{{-", 3 }, { T_REXP, "-}}", 3 }, { T_LSTM, "{%+", 3 }, { T_LSTM, "{%-", 3 }, { T_RSTM, "-%}", 3 }, { T_EQS, "===", 3 }, { T_NES, "!==", 3 }, { T_ELLIP, "...", 3 }, { T_AND, "&&", 2 }, { T_ASADD, "+=", 2 }, { T_ASBAND, "&=", 2 }, { T_ASBOR, "|=", 2 }, { T_ASBXOR, "^=", 2 }, //{ T_ASDIV, "/=", 2 }, { T_ASMOD, "%=", 2 }, { T_ASMUL, "*=", 2 }, { T_ASSUB, "-=", 2 }, { T_DEC, "--", 2 }, { T_INC, "++", 2 }, { T_EQ, "==", 2 }, { T_NE, "!=", 2 }, { T_LE, "<=", 2 }, { T_GE, ">=", 2 }, { T_LSHIFT, "<<", 2 }, { T_RSHIFT, ">>", 2 }, { 0, "//", 2, { .parse = parse_comment } }, { 0, "/*", 2, { .parse = parse_comment } }, { T_OR, "||", 2 }, { T_LEXP, "{{", 2 }, { T_REXP, "}}", 2 }, { T_LSTM, "{%", 2 }, { T_RSTM, "%}", 2 }, { T_ARROW, "=>", 2 }, { T_ADD, "+", 1 }, { T_ASSIGN, "=", 1 }, { T_BAND, "&", 1 }, { T_BOR, "|", 1 }, { T_LBRACK, "[", 1 }, { T_RBRACK, "]", 1 }, { T_BXOR, "^", 1 }, { T_LBRACE, "{", 1 }, { T_RBRACE, "}", 1 }, { T_COLON, ":", 1 }, { T_COMMA, ",", 1 }, { T_COMPL, "~", 1 }, //{ T_DIV, "/", 1 }, { T_GT, ">", 1 }, { T_NOT, "!", 1 }, { T_LT, "<", 1 }, { T_MOD, "%", 1 }, { T_MUL, "*", 1 }, { T_LPAREN, "(", 1 }, { T_RPAREN, ")", 1 }, { T_QMARK, "?", 1 }, { T_SCOL, ";", 1 }, { T_SUB, "-", 1 }, { T_DOT, ".", 1 }, { T_STRING, "'", 1, { .parse = parse_string } }, { T_STRING, "\"", 1, { .parse = parse_string } }, { T_REGEXP, "/", 1, { .parse = parse_regexp } }, { T_LABEL, "_", 1, { .parse = parse_label } }, { T_LABEL, "az", 0, { .parse = parse_label } }, { T_LABEL, "AZ", 0, { .parse = parse_label } }, { T_NUMBER, "09", 0, { .parse = parse_number } }, }; static const struct token reserved_words[] = { { T_ENDFUNC, "endfunction", 11 }, { T_DOUBLE, "Infinity", 8, { .d = INFINITY } }, { T_CONTINUE, "continue", 8 }, { T_ENDWHILE, "endwhile", 8 }, { T_FUNC, "function", 8 }, { T_DEFAULT, "default", 7 }, { T_RETURN, "return", 6 }, { T_ENDFOR, "endfor", 6 }, { T_SWITCH, "switch", 6 }, { T_LOCAL, "local", 5 }, { T_ENDIF, "endif", 5 }, { T_WHILE, "while", 5 }, { T_BREAK, "break", 5 }, { T_CATCH, "catch", 5 }, { T_BOOL, "false", 5, { .b = false } }, { T_BOOL, "true", 4, { .b = true } }, { T_ELIF, "elif", 4 }, { T_ELSE, "else", 4 }, { T_THIS, "this", 4 }, { T_NULL, "null", 4 }, { T_CASE, "case", 4 }, { T_DOUBLE, "NaN", 3, { .d = NAN } }, { T_TRY, "try", 3 }, { T_FOR, "for", 3 }, { T_LOCAL, "let", 3 }, { T_IF, "if", 2 }, { T_IN, "in", 2 }, }; /* * Stores the given codepoint as a utf8 multibyte sequence into the given * output buffer and substracts the required amount of bytes from the given * length pointer. * * Returns false if the multibyte sequence would not fit into the buffer, * otherwise true. */ bool utf8enc(char **out, int *rem, int code) { if (code >= 0 && code <= 0x7F) { if (*rem < 1) return false; *(*out)++ = code; (*rem)--; return true; } else if (code > 0 && code <= 0x7FF) { if (*rem < 2) return false; *(*out)++ = ((code >> 6) & 0x1F) | 0xC0; (*rem)--; *(*out)++ = ( code & 0x3F) | 0x80; (*rem)--; return true; } else if (code > 0 && code <= 0xFFFF) { if (*rem < 3) return false; *(*out)++ = ((code >> 12) & 0x0F) | 0xE0; (*rem)--; *(*out)++ = ((code >> 6) & 0x3F) | 0x80; (*rem)--; *(*out)++ = ( code & 0x3F) | 0x80; (*rem)--; return true; } else if (code > 0 && code <= 0x10FFFF) { if (*rem < 4) return false; *(*out)++ = ((code >> 18) & 0x07) | 0xF0; (*rem)--; *(*out)++ = ((code >> 12) & 0x3F) | 0x80; (*rem)--; *(*out)++ = ((code >> 6) & 0x3F) | 0x80; (*rem)--; *(*out)++ = ( code & 0x3F) | 0x80; (*rem)--; return true; } return true; } /* length of the longest token in our lookup table */ #define UT_LEX_MAX_TOKEN_LEN 3 static uint32_t emit_op(struct ut_state *s, uint32_t pos, int type, struct json_object *val) { uint32_t off = ut_new_op(s, type, val, UINT32_MAX); struct ut_op *op = ut_get_op(s, off); op->off = pos; /* Follow JSLint logic and treat a slash after any of the * `(,=:[!&|?{};` characters as the beginning of a regex * literal... */ switch (type) { case T_LPAREN: case T_COMMA: case T_ASADD: case T_ASBAND: case T_ASBOR: case T_ASBXOR: case T_ASDIV: case T_ASLEFT: case T_ASMOD: case T_ASMUL: case T_ASRIGHT: case T_ASSIGN: case T_ASSUB: case T_EQ: case T_EQS: case T_GE: case T_LE: case T_NE: case T_NES: case T_COLON: case T_LBRACK: case T_NOT: case T_AND: case T_BAND: case T_OR: case T_BOR: case T_QMARK: case T_LBRACE: case T_RBRACE: case T_LSTM: case T_LEXP: case T_SCOL: s->lex.expect_div = false; break; default: s->lex.expect_div = true; } return off; } static void lookbehind_append(struct ut_state *s, const char *data, size_t len) { if (len) { s->lex.lookbehind = xrealloc(s->lex.lookbehind, s->lex.lookbehindlen + len); memcpy(s->lex.lookbehind + s->lex.lookbehindlen, data, len); s->lex.lookbehindlen += len; } } static void lookbehind_reset(struct ut_state *s) { free(s->lex.lookbehind); s->lex.lookbehind = NULL; s->lex.lookbehindlen = 0; } static uint32_t lookbehind_to_text(struct ut_state *s, uint32_t pos, int type, const char *strip_trailing_chars) { uint32_t rv = 0; if (s->lex.lookbehind) { if (strip_trailing_chars) { while (s->lex.lookbehindlen > 0 && strchr(strip_trailing_chars, s->lex.lookbehind[s->lex.lookbehindlen-1])) s->lex.lookbehindlen--; } rv = emit_op(s, pos, type, xjs_new_string_len(s->lex.lookbehind, s->lex.lookbehindlen)); lookbehind_reset(s); } return rv; } static inline size_t buf_remaining(struct ut_state *s) { return (s->lex.bufend - s->lex.bufstart); } static inline bool _buf_startswith(struct ut_state *s, const char *str, size_t len) { return (buf_remaining(s) >= len && !strncmp(s->lex.bufstart, str, len)); } #define buf_startswith(s, str) _buf_startswith(s, str, sizeof(str) - 1) static void buf_consume(struct ut_state *s, ssize_t len) { s->lex.bufstart += len; s->source->off += len; } static uint32_t parse_comment(struct ut_state *s) { const struct token *tok = s->lex.tok; const char *ptr, *end; size_t elen; if (!buf_remaining(s)) { ut_new_exception(s, s->lex.lastoff, "Syntax error: Unterminated comment"); return 0; } if (!strcmp(tok->pat, "//")) { end = "\n"; elen = 1; } else { end = "*/"; elen = 2; } for (ptr = s->lex.bufstart; ptr < s->lex.bufend - elen; ptr++) { if (!strncmp(ptr, end, elen)) { buf_consume(s, (ptr - s->lex.bufstart) + elen); return UINT32_MAX; } } buf_consume(s, ptr - s->lex.bufstart); return 0; } static void append_utf8(struct ut_state *s, int code) { char ustr[8], *up; int rem; up = ustr; rem = sizeof(ustr); if (utf8enc(&up, &rem, code)) lookbehind_append(s, ustr, up - ustr); } static uint32_t parse_string(struct ut_state *s) { const struct token *tok = s->lex.tok; char q = tok->pat[0]; char *ptr, *c; uint32_t rv; int code; if (!buf_remaining(s)) { ut_new_exception(s, s->lex.lastoff, "Syntax error: Unterminated string"); return 0; } for (ptr = s->lex.bufstart; ptr < s->lex.bufend; ptr++) { /* continuation of escape sequence */ if (s->lex.is_escape) { if (s->lex.esclen == 0) { /* non-unicode escape following a lead surrogate, emit replacement... */ if (s->lex.lead_surrogate && *ptr != 'u') { append_utf8(s, 0xFFFD); s->lex.lead_surrogate = 0; } switch (*ptr) { case 'u': case 'x': s->lex.esc[s->lex.esclen++] = *ptr; break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': /* regex mode => backref, retain literally */ if (q == '/') { s->lex.is_escape = false; lookbehind_append(s, "\\", 1); lookbehind_append(s, ptr, 1); buf_consume(s, (ptr + 1) - s->lex.bufstart); } /* string mode => likely octal */ else if (*ptr < '8') { s->lex.esc[s->lex.esclen++] = 'o'; s->lex.esc[s->lex.esclen++] = *ptr; } /* non-octal char, add verbatim */ else { s->lex.is_escape = false; lookbehind_append(s, ptr, 1); buf_consume(s, (ptr + 1) - s->lex.bufstart); } break; default: s->lex.is_escape = false; c = strchr("a\ab\be\ef\fn\nr\rt\tv\v", *ptr); lookbehind_append(s, c ? c + 1 : ptr, 1); buf_consume(s, (ptr + 1) - s->lex.bufstart); break; } } else { switch (s->lex.esc[0]) { case 'u': if (s->lex.esclen < 5) { if (!isxdigit(*ptr)) { ut_new_exception(s, s->source->off + s->lex.esclen + 1, "Syntax error: Invalid escape sequence"); return 0; } s->lex.esc[s->lex.esclen++] = *ptr; } if (s->lex.esclen == 5) { code = hex(s->lex.esc[1]) * 16 * 16 * 16 + hex(s->lex.esc[2]) * 16 * 16 + hex(s->lex.esc[3]) * 16 + hex(s->lex.esc[4]); /* is a leading surrogate value */ if ((code & 0xFC00) == 0xD800) { /* found a subsequent leading surrogate, ignore and emit replacement char for previous one */ if (s->lex.lead_surrogate) append_utf8(s, 0xFFFD); /* store surrogate value and advance to next escape sequence */ s->lex.lead_surrogate = code; } /* is a trailing surrogate value */ else if ((code & 0xFC00) == 0xDC00) { /* found a trailing surrogate following a leading one, combine and encode */ if (s->lex.lead_surrogate) { code = 0x10000 + ((s->lex.lead_surrogate & 0x3FF) << 10) + (code & 0x3FF); s->lex.lead_surrogate = 0; } /* trailing surrogate not following a leading one, ignore and use replacement char */ else { code = 0xFFFD; } append_utf8(s, code); } /* is a normal codepoint */ else { append_utf8(s, code); } s->lex.esclen = 0; s->lex.is_escape = false; buf_consume(s, (ptr + 1) - s->lex.bufstart); } break; case 'x': if (s->lex.esclen < 3) { if (!isxdigit(*ptr)) { ut_new_exception(s, s->source->off + s->lex.esclen + 1, "Syntax error: Invalid escape sequence"); return 0; } s->lex.esc[s->lex.esclen++] = *ptr; } if (s->lex.esclen == 3) { append_utf8(s, hex(s->lex.esc[1]) * 16 + hex(s->lex.esc[2])); s->lex.esclen = 0; s->lex.is_escape = false; buf_consume(s, (ptr + 1) - s->lex.bufstart); } break; case 'o': if (s->lex.esclen < 4) { /* found a non-octal char */ if (*ptr < '0' || *ptr > '7') { /* pad sequence to three chars */ switch (s->lex.esclen) { case 3: s->lex.esc[3] = s->lex.esc[2]; s->lex.esc[2] = s->lex.esc[1]; s->lex.esc[1] = '0'; break; case 2: s->lex.esc[3] = s->lex.esc[1]; s->lex.esc[2] = '0'; s->lex.esc[1] = '0'; break; } s->lex.esclen = 4; buf_consume(s, ptr - s->lex.bufstart); } /* append */ else { s->lex.esc[s->lex.esclen++] = *ptr; buf_consume(s, (ptr + 1) - s->lex.bufstart); } } if (s->lex.esclen == 4) { code = dec(s->lex.esc[1]) * 8 * 8 + dec(s->lex.esc[2]) * 8 + dec(s->lex.esc[3]); if (code > 255) { ut_new_exception(s, s->source->off + s->lex.esclen + 1, "Syntax error: Invalid escape sequence"); return 0; } append_utf8(s, code); s->lex.esclen = 0; s->lex.is_escape = false; } break; } } } /* terminating char */ else if (*ptr == q) { lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart); buf_consume(s, (ptr + 1) - s->lex.bufstart); rv = lookbehind_to_text(s, s->lex.lastoff, T_STRING, NULL); if (!rv) rv = emit_op(s, s->lex.lastoff, T_STRING, xjs_new_string_len("", 0)); return rv; } /* escape sequence start */ else if (*ptr == '\\') { s->lex.is_escape = true; lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart); buf_consume(s, ptr - s->lex.bufstart); } } lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart); buf_consume(s, ptr - s->lex.bufstart); return 0; } /* * Parses a regexp literal from the given buffer. * * Returns a negative value on error, otherwise the amount of consumed * characters from the given buffer. * * Error values: * -UT_ERROR_UNTERMINATED_STRING Unterminated regexp * -UT_ERROR_INVALID_ESCAPE Invalid escape sequence * -UT_ERROR_OVERLONG_STRING Regexp literal too long * -UT_ERROR_INVALID_REGEXP Could not compile regexp */ enum { UT_LEX_PARSE_REGEX_INIT, UT_LEX_PARSE_REGEX_PATTERN, UT_LEX_PARSE_REGEX_FLAGS }; static uint32_t parse_regexp(struct ut_state *s) { struct json_object *pattern; struct ut_op *op; uint32_t rv; char *err; switch (s->lex.esc[0]) { case UT_LEX_PARSE_REGEX_INIT: if (s->lex.expect_div) { s->lex.expect_div = false; if (buf_startswith(s, "=")) { buf_consume(s, 1); return emit_op(s, s->source->off, T_ASDIV, NULL); } return emit_op(s, s->source->off, T_DIV, NULL); } s->lex.esc[0] = UT_LEX_PARSE_REGEX_PATTERN; break; case UT_LEX_PARSE_REGEX_PATTERN: rv = parse_string(s); if (rv != 0 && rv != UINT32_MAX) { s->lex.lookbehind = (char *)ut_get_op(s, rv); s->lex.esc[0] = UT_LEX_PARSE_REGEX_FLAGS; } break; case UT_LEX_PARSE_REGEX_FLAGS: op = (struct ut_op *)s->lex.lookbehind; while (s->lex.bufstart < s->lex.bufend) { switch (s->lex.bufstart[0]) { case 'g': buf_consume(s, 1); op->is_reg_global = true; break; case 'i': buf_consume(s, 1); op->is_reg_icase = true; break; case 's': buf_consume(s, 1); op->is_reg_newline = true; break; default: s->lex.lookbehind = NULL; pattern = ut_new_regexp(json_object_get_string(op->val), op->is_reg_icase, op->is_reg_newline, op->is_reg_global, &err); json_object_put(op->val); op->type = T_REGEXP; op->val = pattern; if (!pattern) { ut_new_exception(s, op->off, "Syntax error: %s", err); free(err); return 0; } return ut_get_off(s, op); } } break; } return 0; } /* * Parses a label from the given buffer. * * Returns a negative value on error, otherwise the amount of consumed * characters from the given buffer. * * Error values: * -UT_ERROR_OVERLONG_STRING Label too long */ static uint32_t parse_label(struct ut_state *s) { const struct token *tok = s->lex.tok; const struct token *word; uint32_t rv; char *ptr; size_t i; if (!s->lex.lookbehind && tok->plen) lookbehind_append(s, tok->pat, tok->plen); if (!buf_remaining(s) || (s->lex.bufstart[0] != '_' && !isalnum(s->lex.bufstart[0]))) { for (i = 0, word = &reserved_words[0]; i < ARRAY_SIZE(reserved_words); i++, word = &reserved_words[i]) { if (s->lex.lookbehindlen == word->plen && !strncmp(s->lex.lookbehind, word->pat, word->plen)) { lookbehind_reset(s); switch (word->type) { case T_DOUBLE: rv = emit_op(s, s->source->off - word->plen, word->type, ut_new_double(word->d)); break; case T_BOOL: rv = emit_op(s, s->source->off - word->plen, word->type, xjs_new_boolean(word->b)); break; default: rv = emit_op(s, s->source->off - word->plen, word->type, NULL); } return rv; } } return lookbehind_to_text(s, s->source->off - s->lex.lookbehindlen, T_LABEL, NULL); } for (ptr = s->lex.bufstart; ptr < s->lex.bufend && (*ptr == '_' || isalnum(*ptr)); ptr++) ; lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart); buf_consume(s, ptr - s->lex.bufstart); return 0; } /* * Parses a number literal from the given buffer. * * Returns a negative value on error, otherwise the amount of consumed * characters from the given buffer. * * Error values: * -UT_ERROR_INVALID_ESCAPE Invalid number character */ static inline bool is_numeric_char(struct ut_state *s, char c) { char prev = s->lex.lookbehindlen ? s->lex.lookbehind[s->lex.lookbehindlen-1] : 0; if ((prev == 'e' || prev == 'E') && (c == '-' || c == '+')) return true; return (isxdigit(c) || c == 'x' || c == 'X' || c == '.'); } static uint32_t parse_number(struct ut_state *s) { uint32_t rv = 0; long long int n; char *ptr, *e; double d; if (!buf_remaining(s) || !is_numeric_char(s, s->lex.bufstart[0])) { lookbehind_append(s, "\0", 1); n = strtoll(s->lex.lookbehind, &e, 0); if (*e == '.' || *e == 'e' || *e == 'E') { d = strtod(s->lex.lookbehind, &e); if (e > s->lex.lookbehind && *e == 0) rv = emit_op(s, s->source->off - (e - s->lex.lookbehind), T_DOUBLE, ut_new_double(d)); else ut_new_exception(s, s->source->off - (s->lex.lookbehindlen - (e - s->lex.lookbehind) - 1), "Syntax error: Invalid number literal"); } else if (*e == 0) { rv = emit_op(s, s->source->off - (e - s->lex.lookbehind), T_NUMBER, xjs_new_int64(n)); ut_get_op(s, rv)->is_overflow = (errno == ERANGE); } else { ut_new_exception(s, s->source->off - (s->lex.lookbehindlen - (e - s->lex.lookbehind) - 1), "Syntax error: Invalid number literal"); } lookbehind_reset(s); return rv; } for (ptr = s->lex.bufstart; ptr < s->lex.bufend && is_numeric_char(s, *ptr); ptr++) ; lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart); buf_consume(s, ptr - s->lex.bufstart); return 0; } static uint32_t lex_step(struct ut_state *s, FILE *fp) { const struct token *tok; size_t rlen, rem; char *ptr, c; uint32_t rv; size_t i; /* only less than UT_LEX_MAX_TOKEN_LEN unreach buffer chars remaining, * move the remaining bytes to the beginning and read more data */ if (buf_remaining(s) < UT_LEX_MAX_TOKEN_LEN) { if (!s->lex.buf) { s->lex.buflen = 128; s->lex.buf = xalloc(s->lex.buflen); } rem = s->lex.bufend - s->lex.bufstart; memcpy(s->lex.buf, s->lex.bufstart, rem); rlen = fread(s->lex.buf + rem, 1, s->lex.buflen - rem, fp); s->lex.bufstart = s->lex.buf; s->lex.bufend = s->lex.buf + rlen + rem; if (rlen == 0 && (ferror(fp) || feof(fp))) s->lex.eof = 1; } switch (s->lex.state) { case UT_LEX_IDENTIFY_BLOCK: /* previous block had strip trailing whitespace flag, skip leading whitespace */ if (s->lex.skip_leading_whitespace) { while (buf_remaining(s) && isspace(s->lex.bufstart[0])) buf_consume(s, 1); s->lex.skip_leading_whitespace = false; } /* previous block was a statement block and trim_blocks is enabld, skip leading newline */ else if (s->lex.skip_leading_newline) { if (buf_startswith(s, "\n")) buf_consume(s, 1); s->lex.skip_leading_newline = false; } /* scan forward through buffer to identify start token */ for (ptr = s->lex.bufstart; ptr < s->lex.bufend - strlen("{#"); ptr++) { /* found start of comment block */ if (!strncmp(ptr, "{#", 2)) { lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart); buf_consume(s, (ptr + 2) - s->lex.bufstart); s->lex.lastoff = s->source->off - 2; s->lex.state = UT_LEX_BLOCK_COMMENT_START; return 0; } /* found start of expression block */ else if (!strncmp(ptr, "{{", 2)) { lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart); buf_consume(s, (ptr + 2) - s->lex.bufstart); s->lex.lastoff = s->source->off - 2; s->lex.state = UT_LEX_BLOCK_EXPRESSION_START; return 0; } /* found start of statement block */ else if (!strncmp(ptr, "{%", 2)) { lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart); buf_consume(s, (ptr + 2) - s->lex.bufstart); s->lex.lastoff = s->source->off - 2; s->lex.state = UT_LEX_BLOCK_STATEMENT_START; return 0; } } /* we're at eof */ if (s->lex.eof) { lookbehind_append(s, ptr, s->lex.bufend - ptr); s->lex.state = UT_LEX_EOF; return lookbehind_to_text(s, s->lex.lastoff, T_TEXT, NULL); } lookbehind_append(s, s->lex.bufstart, ptr - s->lex.bufstart); buf_consume(s, ptr - s->lex.bufstart); break; case UT_LEX_BLOCK_COMMENT_START: case UT_LEX_BLOCK_EXPRESSION_START: case UT_LEX_BLOCK_STATEMENT_START: rv = 0; s->lex.skip_leading_whitespace = 0; /* strip whitespace before block */ if (buf_startswith(s, "-")) { rv = lookbehind_to_text(s, s->source->off, T_TEXT, " \n\t\v\f\r"); buf_consume(s, 1); } /* disable lstrip flag (only valid for statement blocks) */ else if (s->lex.state == UT_LEX_BLOCK_STATEMENT_START) { /* disable lstrip flag */ if (buf_startswith(s, "+")) { rv = lookbehind_to_text(s, s->source->off, T_TEXT, NULL); buf_consume(s, 1); } /* global block lstrip */ else if (s->lstrip_blocks) { rv = lookbehind_to_text(s, s->source->off, T_TEXT, " \t\v\f\r"); } } else { rv = lookbehind_to_text(s, s->source->off, T_TEXT, NULL); } switch (s->lex.state) { case UT_LEX_BLOCK_COMMENT_START: s->lex.state = UT_LEX_BLOCK_COMMENT; break; case UT_LEX_BLOCK_STATEMENT_START: s->lex.within_statement_block = 1; s->lex.state = UT_LEX_IDENTIFY_TOKEN; break; case UT_LEX_BLOCK_EXPRESSION_START: s->lex.state = UT_LEX_BLOCK_EXPRESSION_EMIT_TAG; break; default: break; } return rv; case UT_LEX_BLOCK_COMMENT: /* scan forward through buffer to identify end token */ while (s->lex.bufstart < s->lex.bufend - 2) { if (buf_startswith(s, "-#}")) { s->lex.state = UT_LEX_IDENTIFY_BLOCK; s->lex.skip_leading_whitespace = 1; buf_consume(s, 3); s->lex.lastoff = s->source->off; break; } else if (buf_startswith(s, "#}")) { s->lex.state = UT_LEX_IDENTIFY_BLOCK; s->lex.skip_leading_whitespace = 0; buf_consume(s, 2); s->lex.lastoff = s->source->off; break; } buf_consume(s, 1); } /* we're at eof */ if (s->lex.eof) ut_new_exception(s, s->lex.lastoff, "Syntax error: Unterminated template block"); break; case UT_LEX_BLOCK_EXPRESSION_EMIT_TAG: s->lex.within_expression_block = 1; s->lex.state = UT_LEX_IDENTIFY_TOKEN; return emit_op(s, s->source->off, T_LEXP, NULL); case UT_LEX_IDENTIFY_TOKEN: for (i = 0, tok = tokens; i < ARRAY_SIZE(tokens); tok = &tokens[++i]) { /* remaining buffer data is shorter than token, skip */ if (tok->plen > buf_remaining(s)) continue; c = s->lex.bufstart[0]; if (tok->plen ? !strncmp(s->lex.bufstart, tok->pat, tok->plen) : (c >= tok->pat[0] && c <= tok->pat[1])) { buf_consume(s, tok->plen); s->lex.lastoff = s->source->off - tok->plen; /* token has a parse method, switch state */ if (tok->parse) { s->lex.tok = tok; s->lex.state = UT_LEX_PARSE_TOKEN; return 0; } /* disallow nesting blocks */ if ((s->lex.within_expression_block && (tok->type == T_LSTM || tok->type == T_RSTM || tok->type == T_LEXP)) || (s->lex.within_statement_block && (tok->type == T_LEXP || tok->type == T_REXP || tok->type == T_LSTM))) { ut_new_exception(s, s->source->off - tok->plen, "Syntax error: Template blocks may not be nested"); return 0; } /* found end of block */ else if ((s->lex.within_statement_block && tok->type == T_RSTM) || (s->lex.within_expression_block && tok->type == T_REXP)) { /* emit additional empty statement (semicolon) at end of template block */ if (!s->lex.semicolon_emitted) { s->lex.semicolon_emitted = true; /* rewind */ buf_consume(s, -tok->plen); return emit_op(s, s->source->off, T_SCOL, NULL); } /* strip whitespace after block */ if (tok->pat[0] == '-') s->lex.skip_leading_whitespace = true; /* strip newline after statement block */ else if (s->lex.within_statement_block && s->trim_blocks) s->lex.skip_leading_newline = true; s->lex.semicolon_emitted = false; s->lex.within_statement_block = false; s->lex.within_expression_block = false; s->lex.state = UT_LEX_IDENTIFY_BLOCK; s->lex.lastoff = s->source->off; } /* do not report statement tags to the parser */ if (tok->type != 0 && tok->type != T_LSTM && tok->type != T_RSTM) rv = emit_op(s, s->source->off - tok->plen, tok->type, NULL); else rv = 0; return rv; } } /* no token matched and we do have remaining data, junk */ if (buf_remaining(s)) { ut_new_exception(s, s->source->off, "Syntax error: Unexpected character"); return 0; } /* we're at eof, allow unclosed statement blocks */ if (s->lex.within_statement_block) { s->lex.state = UT_LEX_EOF; return 0; } /* premature EOF */ ut_new_exception(s, s->source->off, "Syntax error: Unterminated template block"); break; case UT_LEX_PARSE_TOKEN: tok = s->lex.tok; rv = tok->parse(s); if (rv) { memset(s->lex.esc, 0, sizeof(s->lex.esc)); s->lex.state = UT_LEX_IDENTIFY_TOKEN; s->lex.tok = NULL; if (rv == UINT32_MAX) rv = 0; return rv; } break; case UT_LEX_EOF: break; } return 0; } uint32_t ut_get_token(struct ut_state *s, FILE *fp) { uint32_t rv; while (s->lex.state != UT_LEX_EOF) { rv = lex_step(s, fp); if (rv == 0 && s->exception) break; if (rv > 0) return rv; } return 0; } const char * ut_get_tokenname(int type) { static char buf[sizeof("'endfunction'")]; size_t i; switch (type) { case 0: return "End of file"; case T_STRING: return "String"; case T_LABEL: return "Label"; case T_NUMBER: return "Number"; case T_DOUBLE: return "Double"; case T_REGEXP: return "Regexp"; } for (i = 0; i < ARRAY_SIZE(tokens); i++) { if (tokens[i].type != type) continue; snprintf(buf, sizeof(buf), "'%s'", tokens[i].pat); return buf; } for (i = 0; i < ARRAY_SIZE(reserved_words); i++) { if (reserved_words[i].type != type) continue; snprintf(buf, sizeof(buf), "'%s'", reserved_words[i].pat); return buf; } return "?"; }