From 3756806674da909ec6dc10ad25862b592792604e Mon Sep 17 00:00:00 2001 From: Jo-Philipp Wich Date: Wed, 23 Dec 2020 20:54:05 +0100 Subject: treewide: rewrite ucode interpreter Replace the former AST walking interpreter implementation with a single pass bytecode compiler and a corresponding virtual machine. The rewrite lays the groundwork for a couple of improvements which will be subsequently implemented: - Ability to precompile ucode sources into binary byte code - Strippable debug information - Reduced runtime memory usage Signed-off-by: Jo-Philipp Wich --- lexer.h | 150 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 144 insertions(+), 6 deletions(-) (limited to 'lexer.h') diff --git a/lexer.h b/lexer.h index e45d3e3..fd462ff 100644 --- a/lexer.h +++ b/lexer.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020 Jo-Philipp Wich + * Copyright (C) 2020-2021 Jo-Philipp Wich * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -17,18 +17,156 @@ #ifndef __LEXER_H_ #define __LEXER_H_ -#include "ast.h" +#include "source.h" #define __T_MAX 82 #define T_EXCEPTION (__T_MAX + 0) #define T_CFUNC (__T_MAX + 1) #define T_RESSOURCE (__T_MAX + 2) -bool -utf8enc(char **out, int *rem, int code); -uint32_t -uc_get_token(struct uc_state *s, FILE *fp); +typedef enum { + TK_LEXP = 1, + TK_REXP, + TK_LSTM, + TK_RSTM, + TK_IF, + TK_ELSE, + TK_COMMA, + TK_ASBAND, + TK_ASBXOR, + TK_ASBOR, + TK_ASLEFT, + TK_ASRIGHT, + TK_ASMUL, + TK_ASDIV, + TK_ASMOD, + TK_ASADD, + TK_ASSUB, + TK_ASSIGN, + TK_QMARK, + TK_COLON, + TK_OR, + TK_AND, + TK_BOR, + TK_BXOR, + TK_BAND, + TK_EQ, + TK_NE, + TK_EQS, + TK_NES, + TK_LT, + TK_LE, + TK_GT, + TK_GE, + TK_IN, + TK_LSHIFT, + TK_RSHIFT, + TK_ADD, + TK_SUB, + TK_MUL, + TK_DIV, + TK_MOD, + TK_NOT, + TK_COMPL, + TK_INC, + TK_DEC, + TK_LPAREN, + TK_LBRACK, + TK_TEXT, + TK_LBRACE, + TK_RBRACE, + TK_SCOL, + TK_RPAREN, + TK_ENDIF, + TK_ELIF, + 
TK_WHILE, + TK_ENDWHILE, + TK_FOR, + TK_ENDFOR, + TK_FUNC, + TK_LABEL, + TK_ENDFUNC, + TK_TRY, + TK_CATCH, + TK_SWITCH, + TK_CASE, + TK_DEFAULT, + TK_ELLIP, + TK_RETURN, + TK_BREAK, + TK_CONTINUE, + TK_LOCAL, + TK_ARROW, + TK_DOT, + TK_RBRACK, + TK_BOOL, + TK_NUMBER, + TK_DOUBLE, + TK_STRING, + TK_REGEXP, + TK_NULL, + TK_THIS, + + TK_EOF, + TK_ERROR +} uc_tokentype_t; + +typedef enum { + UT_LEX_IDENTIFY_BLOCK, + UT_LEX_BLOCK_COMMENT_START, + UT_LEX_BLOCK_EXPRESSION_START, + UT_LEX_BLOCK_EXPRESSION_EMIT_TAG, + UT_LEX_BLOCK_STATEMENT_START, + UT_LEX_BLOCK_COMMENT, + UT_LEX_IDENTIFY_TOKEN, + UT_LEX_PARSE_TOKEN, + UT_LEX_EOF +} uc_lex_state_t; + +typedef struct { + uc_tokentype_t type; + json_object *val; + size_t pos; +} uc_token; + +typedef struct { + bool lstrip_blocks; + bool trim_blocks; + bool strict_declarations; +} uc_parse_config; + +typedef struct { + uc_lex_state_t state; + uc_parse_config *config; + uc_source *source; + uint8_t eof:1; + uint8_t skip_leading_whitespace:1; + uint8_t skip_leading_newline:1; + uint8_t within_expression_block:1; + uint8_t within_statement_block:1; + uint8_t semicolon_emitted:1; + uint8_t expect_div:1; + uint8_t is_escape:1; + size_t buflen; + char *buf, *bufstart, *bufend; + size_t lookbehindlen; + char *lookbehind; + const void *tok; + uc_token curr; + char esc[5]; + uint8_t esclen; + int lead_surrogate; + size_t lastoff; +} uc_lexer; + + +void uc_lexer_init(uc_lexer *lex, uc_parse_config *config, uc_source *source); +void uc_lexer_free(uc_lexer *lex); + +uc_token *uc_lexer_next_token(uc_lexer *lex); + +bool utf8enc(char **out, int *rem, int code); const char * uc_get_tokenname(int type); -- cgit v1.2.3