diff options
author | Jo-Philipp Wich <jo@mein.io> | 2020-08-10 17:05:03 +0200 |
---|---|---|
committer | Jo-Philipp Wich <jo@mein.io> | 2020-08-21 23:04:45 +0200 |
commit | a56887df2a0f51b42d9d4013515e847b1a050c58 (patch) | |
tree | 3416726feacc13d8a94e4fda90edc1a6773fa71e /lexer.c |
Initial commit
Signed-off-by: Jo-Philipp Wich <jo@mein.io>
Diffstat (limited to 'lexer.c')
-rw-r--r-- | lexer.c | 738 |
1 files changed, 738 insertions, 0 deletions
@@ -0,0 +1,738 @@ +/* + * Copyright (C) 2020 Jo-Philipp Wich <jo@mein.io> + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include <stdio.h> + +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> +#include <ctype.h> +#include <regex.h> +#include <math.h> + +#include "ast.h" +#include "lexer.h" +#include "parser.h" + + +struct token { + int type; + const char *pat; + int plen; + int (*parse)(const char *buf, struct ut_opcode *op, struct ut_state *s); +}; + +#define dec(o) \ + ((o) - '0') + +#define hex(x) \ + (((x) >= 'a') ? (10 + (x) - 'a') : \ + (((x) >= 'A') ? (10 + (x) - 'A') : dec(x))) + +static int parse_comment(const char *, struct ut_opcode *, struct ut_state *); +static int parse_string(const char *, struct ut_opcode *, struct ut_state *); +static int parse_number(const char *, struct ut_opcode *, struct ut_state *); +static int parse_label(const char *, struct ut_opcode *, struct ut_state *); +static int parse_bool(const char *, struct ut_opcode *, struct ut_state *); + +static const struct token tokens[] = { + { 0, " ", 1 }, + { 0, "\t", 1 }, + { 0, "\r", 1 }, + { 0, "\n", 1 }, + { T_ASLEFT, "<<=", 3 }, + { T_ASRIGHT, ">>=", 3 }, + { T_LEXP, "{{-", 3 }, + { T_REXP, "-}}", 3 }, + { T_LSTM, "{%-", 3 }, + { T_RSTM, "-%}", 3 }, + { T_AND, "&&", 2 }, + { T_ASADD, "+=", 2 }, + { T_ASBAND, "&=", 2 }, + { T_ASBOR, "|=", 2 }, + { T_ASBXOR, "^=", 2 }, + { T_ASDIV, "/=", 2 }, + { T_ASMOD, "%=", 2 }, + { T_ASMUL, "*=", 2 }, + { T_ASSUB, "-=", 2 }, + { T_DEC, "--", 2 }, + { T_INC, "++", 2 }, + { T_IF, "if", 2 }, + { T_EQ, "==", 2 }, + { T_NE, "!=", 2 }, + { T_LE, "<=", 2 }, + { T_GE, ">=", 2 }, + { T_LSHIFT, "<<", 2 }, + { T_RSHIFT, ">>", 2 }, + { 0, "//", 2, parse_comment }, + { 0, "/*", 2, parse_comment }, + { T_OR, "||", 2 }, + { T_LEXP, "{{", 2 }, + { T_REXP, "}}", 2 }, + { T_LSTM, "{%", 2 }, + { T_RSTM, "%}", 2 }, + { T_ADD, "+", 1 }, + { T_ASSIGN, "=", 1 }, + { T_BAND, "&", 1 }, + { T_BOR, "|", 1 }, + { T_LBRACK, "[", 1 }, + { T_RBRACK, "]", 1 }, + { T_BXOR, "^", 1 }, + { T_LBRACE, "{", 1 }, + { T_RBRACE, "}", 1 }, + { T_COLON, ":", 1 }, + { T_COMMA, ",", 1 }, + { T_COMPL, "~", 1 }, + { T_DIV, "/", 1 }, + { T_GT, ">", 1 }, + { T_NOT, "!", 1 }, + { T_LT, "<", 1 }, + { T_MOD, "%", 1 }, + { T_MUL, "*", 1 }, + { T_LPAREN, "(", 1 }, + { T_RPAREN, ")", 1 }, + { T_QMARK, "?", 1 }, + { T_SCOL, ";", 1 }, + { T_SUB, "-", 1 }, + { T_DOT, ".", 1 }, + { T_STRING, "'", 1, parse_string }, + { T_STRING, "\"", 1, parse_string }, + { T_LABEL, "_", 1, parse_label }, + { T_LABEL, "az", 0, parse_label }, + { T_LABEL, "AZ", 0, parse_label }, + { T_NUMBER, "09", 0, parse_number }, +}; + +static const struct token reserved_words[] = { + { T_ENDFUNC, "endfunction", 11 }, + { T_NUMBER, "Infinity", 8, parse_number }, + { T_CONTINUE, "continue", 8 }, + { T_ENDWHILE, "endwhile", 8 }, + { T_FUNC, "function", 8 }, + { T_RETURN, "return", 6 }, + { T_ENDFOR, "endfor", 6 }, + { T_LOCAL, "local", 5 }, + { T_ENDIF, "endif", 5 }, + { T_WHILE, "while", 5 }, + { T_BREAK, "break", 5 }, + { T_BOOL, "false", 5, parse_bool }, + { T_BOOL, "true", 4, parse_bool }, + { T_ELSE, "else", 4 }, + { T_NUMBER, "NaN", 3, parse_number }, + { T_FOR, "for", 3 }, + { T_IN, "in", 2 }, +}; + +const char *tokennames[69] = { + [0] = "End of file", + [T_FUNC] = "'function'", + [T_LOCAL] = "'local'", + [T_WHILE] = "'while", + [T_ELSE] = "'else'", + [T_FOR] = "'for'", + [T_IF] = "'if'", + [T_IN] = "'in'", + [T_ASLEFT] = "'x<<=y'", + [T_ASRIGHT] = "'x>>=y'", + [T_AND] = "'x&&y'", + [T_ASADD] = "'x+=y'", + [T_ASBAND] = "'x&=y'", + [T_ASBOR] = "'x|=y'", + [T_ASBXOR] = "'x^=y'", + [T_ASDIV] = "'x/=y'", + [T_ASMOD] = "'x%=y'", + [T_ASMUL] = "'x*=y'", + [T_ASSUB] = "'x-=y'", + [T_DEC] = "'x--'", + [T_INC] = "'x++'", + [T_EQ] = "'x==y'", + [T_NE] = "'x!=y'", + [T_LE] = "'x<=y'", + [T_GE] = "'x>=y'", + [T_LSHIFT] = "'x<<y'", + [T_RSHIFT] = "'x>>y'", + [T_LEXP] = "'{{'", + [T_REXP] = "'}}'", + [T_OR] = "'x||y'", + [T_ADD] = "'x+y'", + [T_ASSIGN] = "'x=y'", + [T_BAND] = "'x&y'", + [T_BOR] = "'x|y'", + [T_LBRACK] = "'['", + [T_RBRACK] = "']'", + [T_BXOR] = "'x^y'", + [T_LBRACE] = "'{'", + [T_RBRACE] = "'}'", + [T_COLON] = "':'", + [T_COMMA] = "','", + [T_COMPL] = "'~x'", + [T_DIV] = "'x/y'", + [T_GT] = "'x>y'", + [T_NOT] = "'!x'", + [T_LT] = "'x<y'", + [T_MOD] = "'x%y'", + [T_MUL] = "'x*y'", + [T_LPAREN] = "'('", + [T_RPAREN] = "')'", + [T_QMARK] = "'?'", + [T_SCOL] = "';'", + [T_SUB] = "'x-y'", + [T_DOT] = "'.'", + [T_STRING] = "String", + [T_LABEL] = "Label", + [T_NUMBER] = "Number", + [T_DOUBLE] = "Double", + [T_BOOL] = "Bool", + [T_TEXT] = "Text", + [T_ENDIF] = "'endif'", + [T_ENDFOR] = "'endfor'", + [T_ENDWHILE] = "'endwhile'", + [T_ENDFUNC] = "'endfuncton'", + [T_RETURN] = "'return'", + [T_BREAK] = "'break'", + [T_CONTINUE] = "'continue'", + //[T_LSTM] = "'{%'", + //[T_RSTM] = "'%}'" +}; + + +/* + * Stores the given codepoint as a utf8 multibyte sequence into the given + * output buffer and substracts the required amount of bytes from the given + * length pointer. + * + * Returns false if the multibyte sequence would not fit into the buffer, + * otherwise true. + */ + +bool +utf8enc(char **out, int *rem, int code) +{ + if (code >= 0 && code <= 0x7F) { + if (*rem < 1) + return false; + + *(*out)++ = code; (*rem)--; + + return true; + } + else if (code > 0 && code <= 0x7FF) { + if (*rem < 2) + return false; + + *(*out)++ = ((code >> 6) & 0x1F) | 0xC0; (*rem)--; + *(*out)++ = ( code & 0x3F) | 0x80; (*rem)--; + + return true; + } + else if (code > 0 && code <= 0xFFFF) { + if (*rem < 3) + return false; + + *(*out)++ = ((code >> 12) & 0x0F) | 0xE0; (*rem)--; + *(*out)++ = ((code >> 6) & 0x3F) | 0x80; (*rem)--; + *(*out)++ = ( code & 0x3F) | 0x80; (*rem)--; + + return true; + } + else if (code > 0 && code <= 0x10FFFF) { + if (*rem < 4) + return false; + + *(*out)++ = ((code >> 18) & 0x07) | 0xF0; (*rem)--; + *(*out)++ = ((code >> 12) & 0x3F) | 0x80; (*rem)--; + *(*out)++ = ((code >> 6) & 0x3F) | 0x80; (*rem)--; + *(*out)++ = ( code & 0x3F) | 0x80; (*rem)--; + + return true; + } + + return true; +} + +/* + * Parses a comment from the given buffer. + * + * Returns a negative value on error, otherwise the amount of consumed + * characters from the given buffer. + * + * Error values: + * -UT_ERROR_UNTERMINATED_COMMENT Unterminated string + */ + +static int +parse_comment(const char *buf, struct ut_opcode *op, struct ut_state *s) +{ + const char *p = buf; + + /* single line comment */ + if (p[0] == '/' && p[1] == '/') { + while (*p != 0 && *p != '\n') + p++; + + return (p - buf); + } + + /* multi line comment */ + while (*p) { + if (p[0] == '*' && p[1] == '/') + break; + + p++; + } + + return *p ? (p - buf) + 2 : -UT_ERROR_UNTERMINATED_COMMENT; +} + +/* + * Parses a string literal from the given buffer. + * + * Returns a negative value on error, otherwise the amount of consumed + * characters from the given buffer. + * + * Error values: + * -UT_ERROR_UNTERMINATED_STRING Unterminated string + * -UT_ERROR_INVALID_ESCAPE Invalid escape sequence + * -UT_ERROR_OVERLONG_STRING String literal too long + */ + +static int +parse_string(const char *buf, struct ut_opcode *op, struct ut_state *s) +{ + char q = *(buf++); + char str[128] = { 0 }; + char *out = str; + const char *in = buf; + bool esc = false; + int rem = sizeof(str) - 1; + int code; + + while (*in) { + /* continuation of escape sequence */ + if (esc) { + /* \uFFFF */ + if (in[0] == 'u') { + if (isxdigit(in[1]) && isxdigit(in[2]) && + isxdigit(in[3]) && isxdigit(in[4])) { + if (!utf8enc(&out, &rem, + hex(in[1]) * 16 * 16 * 16 + + hex(in[2]) * 16 * 16 + + hex(in[3]) * 16 + + hex(in[4]))) { + s->off += (in - buf); + + return -UT_ERROR_OVERLONG_STRING; + } + + in += 5; + } + else { + s->off += (in - buf); + + return -UT_ERROR_INVALID_ESCAPE; + } + } + + /* \xFF */ + else if (in[0] == 'x') { + if (isxdigit(in[1]) && isxdigit(in[2])) { + if (!utf8enc(&out, &rem, hex(in[1]) * 16 + hex(in[2]))) { + s->off += (in - buf); + + return -UT_ERROR_OVERLONG_STRING; + } + + in += 3; + } + else { + s->off += (in - buf); + return -UT_ERROR_INVALID_ESCAPE; + } + } + + /* \377, \77 or \7 */ + else if (in[0] >= '0' && in[0] <= '7') { + /* \377 */ + if (in[1] >= '0' && in[1] <= '7' && + in[2] >= '0' && in[2] <= '7') { + code = dec(in[0]) * 8 * 8 + + dec(in[1]) * 8 + + dec(in[2]); + + if (code > 255) { + s->off += (in - buf); + + return -UT_ERROR_INVALID_ESCAPE; + } + + if (!utf8enc(&out, &rem, code)) { + s->off += (in - buf); + + return -UT_ERROR_OVERLONG_STRING; + } + + in += 3; + } + + /* \77 */ + else if (in[1] >= '0' && in[1] <= '7') { + if (!utf8enc(&out, &rem, dec(in[0]) * 8 + dec(in[1]))) { + s->off += (in - buf); + + return -UT_ERROR_OVERLONG_STRING; + } + + in += 2; + } + + /* \7 */ + else { + if (!utf8enc(&out, &rem, dec(in[0]))) { + s->off += (in - buf); + + return -UT_ERROR_OVERLONG_STRING; + } + + in += 1; + } + } + + /* single character escape */ + else { + if (rem-- < 1) { + s->off += (in - buf); + + return -UT_ERROR_OVERLONG_STRING; + } + + switch (in[0]) { + case 'a': *out = '\a'; break; + case 'b': *out = '\b'; break; + case 'e': *out = '\e'; break; + case 'f': *out = '\f'; break; + case 'n': *out = '\n'; break; + case 'r': *out = '\r'; break; + case 't': *out = '\t'; break; + case 'v': *out = '\v'; break; + default: + *out = *in; + break; + } + + in++; + out++; + } + + esc = false; + } + + /* begin of escape sequence */ + else if (*in == '\\') { + in++; + esc = true; + } + + /* terminating quote */ + else if (*in == q) { + op->val = json_object_new_string_len(str, sizeof(str) - 1 - rem); + + return (in - buf) + 2; + } + + /* ordinary char */ + else { + if (rem-- < 1) { + s->off += (in - buf); + + return -UT_ERROR_OVERLONG_STRING; + } + + *out++ = *in++; + } + } + + return -UT_ERROR_UNTERMINATED_STRING; +} + + +/* + * Parses a label from the given buffer. + * + * Returns a negative value on error, otherwise the amount of consumed + * characters from the given buffer. + * + * Error values: + * -UT_ERROR_OVERLONG_STRING Label too long + */ + +static int +parse_label(const char *buf, struct ut_opcode *op, struct ut_state *s) +{ + const struct token *word; + char str[128] = { 0 }; + char *out = str; + const char *in = buf; + int rem = sizeof(str) - 1; + int i; + + while (*in == '_' || isalnum(*in)) { + if (rem-- < 1) { + s->off += (in - buf); + return -UT_ERROR_OVERLONG_STRING; + } + + *out++ = *in++; + } + + for (i = 0, word = &reserved_words[0]; + i < sizeof(reserved_words) / sizeof(reserved_words[0]); + i++, word = &reserved_words[i]) { + if (!strcmp(str, word->pat)) { + op->type = word->type; + + if (word->parse) + word->parse(str, op, s); + + return (in - buf); + } + } + + op->val = json_object_new_string(str); + + return (in - buf); +} + + +/* + * Parses a number literal from the given buffer. + * + * Returns a negative value on error, otherwise the amount of consumed + * characters from the given buffer. + * + * Error values: + * -UT_ERROR_INVALID_ESCAPE Invalid number character + */ + +static int +parse_number(const char *buf, struct ut_opcode *op, struct ut_state *s) +{ + double d; + char *e; + int n; + + if (!strncmp(buf, "Infinity", 8)) { + op->type = T_DOUBLE; + op->val = json_object_new_double_rounded(INFINITY); + + return 8; + } + else if (!strncmp(buf, "NaN", 3)) { + op->type = T_DOUBLE; + op->val = json_object_new_double_rounded(NAN); + + return 3; + } + + n = strtol(buf, &e, 0); + + if (e > buf) { + if (*e == '.') { + d = strtod(buf, &e); + + if (e > buf) { + op->type = T_DOUBLE; + op->val = json_object_new_double_rounded(d); + + return (e - buf); + } + } + + op->type = T_NUMBER; + op->val = json_object_new_int64(n); + + return (e - buf); + } + + return -UT_ERROR_INVALID_ESCAPE; +} + + +/* + * Parses a bool literal from the given buffer. + * + * Returns the amount of consumed characters from the given buffer. + */ + +static int +parse_bool(const char *buf, struct ut_opcode *op, struct ut_state *s) +{ + if (!strncmp(buf, "false", 5)) { + op->val = json_object_new_boolean(false); + + return 5; + } + else if (!strncmp(buf, "true", 4)) { + op->val = json_object_new_boolean(true); + + return 4; + } + + return 0; +} + + +static int +match_token(const char *ptr, struct ut_opcode *op, struct ut_state *s) +{ + int i; + const struct token *tok; + + for (i = 0, tok = &tokens[0]; + i < sizeof(tokens) / sizeof(tokens[0]); + i++, tok = &tokens[i]) { + if ((tok->plen > 0 && !strncmp(ptr, tok->pat, tok->plen)) || + (tok->plen == 0 && *ptr >= tok->pat[0] && *ptr <= tok->pat[1])) { + op->type = tok->type; + + if (tok->parse) + return tok->parse(ptr, op, s); + + return tok->plen; + } + } + + return -UT_ERROR_UNEXPECTED_CHAR; +} + +struct ut_opcode * +ut_get_token(struct ut_state *s, const char *input, int *mlen) +{ + struct ut_opcode op = { 0 }; + const char *o, *p; + + for (o = p = input; *p; p++) { + if (s->blocktype == UT_BLOCK_NONE) { + if (!strncmp(p, "{#", 2)) + s->blocktype = UT_BLOCK_COMMENT; + else if (!strncmp(p, "{{", 2)) + s->blocktype = UT_BLOCK_EXPRESSION; + else if (!strncmp(p, "{%", 2)) + s->blocktype = UT_BLOCK_STATEMENT; + + if (s->blocktype) { + *mlen = p - input; + s->start_tag_seen = 0; + s->off += *mlen; + + /* strip whitespace before block */ + if (p[2] == '-') + while (p > o && isspace(p[-1])) + p--; + + if (p == o) + return NULL; + + return ut_new_op(s, T_TEXT, json_object_new_string_len(o, p - o), (void *)1); + } + } + else if (s->blocktype == UT_BLOCK_COMMENT) { + if (!strncmp(p, "#}", 2) || !strncmp(p, "-#}", 3)) { + *mlen = (p - input) + 2; + + /* strip whitespace after block */ + if (*p == '-') { + (*mlen)++; + + while (isspace(p[3])) { + (*mlen)++; + p++; + } + } + + s->blocktype = UT_BLOCK_NONE; + s->off += *mlen; + + return NULL; + } + } + else if (s->blocktype == UT_BLOCK_STATEMENT || s->blocktype == UT_BLOCK_EXPRESSION) { + *mlen = match_token(p, &op, s); + + if (*mlen < 0) { + s->error.code = -*mlen; + + return NULL; + } + + /* disallow nesting blocks */ + else if ((s->start_tag_seen && s->blocktype == UT_BLOCK_STATEMENT && + (op.type == T_LEXP || op.type == T_REXP || op.type == T_LSTM)) || + (s->start_tag_seen && s->blocktype == UT_BLOCK_EXPRESSION && + (op.type == T_LSTM || op.type == T_RSTM || op.type == T_LEXP))) { + s->error.code = UT_ERROR_NESTED_BLOCKS; + + return NULL; + } + + /* emit additional empty statement (semicolon) at end of template block */ + else if ((s->blocktype == UT_BLOCK_STATEMENT && op.type == T_RSTM) || + (s->blocktype == UT_BLOCK_EXPRESSION && op.type == T_REXP)) { + if (!s->semicolon_emitted) { + s->semicolon_emitted = true; + op.type = T_SCOL; + *mlen = 0; + } + else { + s->semicolon_emitted = false; + s->blocktype = UT_BLOCK_NONE; + + /* strip whitespace after block */ + if (*p == '-') { + while (isspace(p[3])) { + (*mlen)++; + p++; + } + } + } + } + + s->start_tag_seen = 1; + s->off += *mlen; + + /* do not report '{%' and '%}' tags to parser */ + if (op.type == T_LSTM || op.type == T_RSTM || op.type == 0) + return NULL; + + return ut_new_op(s, op.type, op.val, (void *)1); + } + } + + /* allow unclosed '{%' blocks */ + if (s->blocktype == UT_BLOCK_EXPRESSION || s->blocktype == UT_BLOCK_COMMENT) { + s->error.code = UT_ERROR_UNTERMINATED_BLOCK; + + return NULL; + } + + if (p > input) { + *mlen = p - input; + s->off += *mlen; + + return ut_new_op(s, T_TEXT, json_object_new_string_len(o, p - o), (void *)1); + } + + return NULL; +} |