diff options
author | Jo-Philipp Wich <jo@mein.io> | 2021-02-26 21:19:42 +0100 |
---|---|---|
committer | Jo-Philipp Wich <jo@mein.io> | 2021-02-26 21:19:42 +0100 |
commit | 14aace988c32faa1829b6839825ab40d8efa3968 (patch) | |
tree | 4977ea6596de176cf6cdc1612e81e30c725c7cd8 | |
parent | e416740e189f4fec06da2db5ebcd30092eb50270 (diff) |
lexer: improvements
Signed-off-by: Jo-Philipp Wich <jo@mein.io>
-rw-r--r-- | compiler.c | 55 | ||||
-rw-r--r-- | lexer.c | 149 | ||||
-rw-r--r-- | lexer.h | 25 |
3 files changed, 108 insertions, 121 deletions
@@ -199,11 +199,64 @@ uc_compiler_set_srcpos(uc_compiler *compiler, size_t srcpos) static void uc_compiler_parse_advance(uc_compiler *compiler) { + bool no_regexp; + uc_value_put(compiler->parser->prev.val); compiler->parser->prev = compiler->parser->curr; while (true) { - compiler->parser->curr = *uc_lexer_next_token(&compiler->parser->lex); + /* Follow JSLint logic and treat a slash after any of the + * `(,=:[!&|?{};` characters as the beginning of a regex + * literal... */ + switch (compiler->parser->prev.type) { + case TK_LPAREN: + case TK_COMMA: + + case TK_ASADD: + case TK_ASBAND: + case TK_ASBOR: + case TK_ASBXOR: + case TK_ASDIV: + case TK_ASLEFT: + case TK_ASMOD: + case TK_ASMUL: + case TK_ASRIGHT: + case TK_ASSIGN: + case TK_ASSUB: + case TK_EQ: + case TK_EQS: + case TK_GE: + case TK_LE: + case TK_NE: + case TK_NES: + + case TK_COLON: + case TK_LBRACK: + case TK_NOT: + + case TK_AND: + case TK_BAND: + + case TK_OR: + case TK_BOR: + + case TK_QMARK: + + case TK_LBRACE: + case TK_RBRACE: + + case TK_LSTM: + case TK_LEXP: + + case TK_SCOL: + no_regexp = false; + break; + + default: + no_regexp = (compiler->parser->prev.type != 0); + } + + compiler->parser->curr = *uc_lexer_next_token(&compiler->parser->lex, no_regexp); if (compiler->parser->curr.type != TK_ERROR) break; @@ -48,7 +48,7 @@ struct token { char pat[4]; }; int plen; - uc_token *(*parse)(uc_lexer *); + uc_token *(*parse)(uc_lexer *, bool); }; #define dec(o) \ @@ -58,11 +58,11 @@ struct token { (((x) >= 'a') ? (10 + (x) - 'a') : \ (((x) >= 'A') ? (10 + (x) - 'A') : dec(x))) -static uc_token *parse_comment(uc_lexer *); -static uc_token *parse_string(uc_lexer *); -static uc_token *parse_regexp(uc_lexer *); -static uc_token *parse_number(uc_lexer *); -static uc_token *parse_label(uc_lexer *); +static uc_token *parse_comment(uc_lexer *, bool); +static uc_token *parse_string(uc_lexer *, bool); +static uc_token *parse_regexp(uc_lexer *, bool); +static uc_token *parse_number(uc_lexer *, bool); +static uc_token *parse_label(uc_lexer *, bool); static const struct token tokens[] = { { TK_ASLEFT, { .pat = "<<=" }, 3 }, @@ -229,57 +229,6 @@ emit_op(uc_lexer *lex, uint32_t pos, int type, struct json_object *val) lex->curr.val = val; lex->curr.pos = pos; - /* Follow JSLint logic and treat a slash after any of the - * `(,=:[!&|?{};` characters as the beginning of a regex - * literal... */ - switch (type) { - case TK_LPAREN: - case TK_COMMA: - - case TK_ASADD: - case TK_ASBAND: - case TK_ASBOR: - case TK_ASBXOR: - case TK_ASDIV: - case TK_ASLEFT: - case TK_ASMOD: - case TK_ASMUL: - case TK_ASRIGHT: - case TK_ASSIGN: - case TK_ASSUB: - case TK_EQ: - case TK_EQS: - case TK_GE: - case TK_LE: - case TK_NE: - case TK_NES: - - case TK_COLON: - case TK_LBRACK: - case TK_NOT: - - case TK_AND: - case TK_BAND: - - case TK_OR: - case TK_BOR: - - case TK_QMARK: - - case TK_LBRACE: - case TK_RBRACE: - - case TK_LSTM: - case TK_LEXP: - - case TK_SCOL: - lex->expect_div = false; - break; - - default: - lex->expect_div = true; - } - return &lex->curr; } @@ -424,7 +373,7 @@ buf_consume(uc_lexer *lex, size_t len) { } static uc_token * -parse_comment(uc_lexer *lex) +parse_comment(uc_lexer *lex, bool no_regexp) { const struct token *tok = lex->tok; const char *ptr, *end; @@ -468,7 +417,7 @@ append_utf8(uc_lexer *lex, int code) { } static uc_token * -parse_string(uc_lexer *lex) +parse_string(uc_lexer *lex, bool no_regexp) { const struct token *tok = lex->tok; char q = tok->pat[0]; @@ -696,7 +645,7 @@ enum { }; static uc_token * -parse_regexp(uc_lexer *lex) +parse_regexp(uc_lexer *lex, bool no_regexp) { bool is_reg_global = false, is_reg_icase = false, is_reg_newline = false; uc_token *rv; @@ -705,9 +654,7 @@ parse_regexp(uc_lexer *lex) switch (lex->esc[0]) { case UT_LEX_PARSE_REGEX_INIT: - if (lex->expect_div) { - lex->expect_div = false; - + if (no_regexp) { if (buf_startswith(lex, "=")) { buf_consume(lex, 1); @@ -721,7 +668,7 @@ parse_regexp(uc_lexer *lex) break; case UT_LEX_PARSE_REGEX_PATTERN: - rv = parse_string(lex); + rv = parse_string(lex, no_regexp); if (rv && rv->type == TK_ERROR) return rv; @@ -788,7 +735,7 @@ parse_regexp(uc_lexer *lex) */ static uc_token * -parse_label(uc_lexer *lex) +parse_label(uc_lexer *lex, bool no_regexp) { const struct token *tok = lex->tok; const struct keyword *word; @@ -856,7 +803,7 @@ is_numeric_char(uc_lexer *lex, char c) } static uc_token * -parse_number(uc_lexer *lex) +parse_number(uc_lexer *lex, bool no_regexp) { const struct token *tok = lex->tok; uc_token *rv = NULL; @@ -909,7 +856,7 @@ parse_number(uc_lexer *lex) } static uc_token * -lex_step(uc_lexer *lex, FILE *fp) +lex_step(uc_lexer *lex, FILE *fp, bool no_regexp) { uint32_t masks[] = { 0, le32toh(0x000000ff), le32toh(0x0000ffff), le32toh(0x00ffffff), le32toh(0xffffffff) }; union { uint32_t n; char str[4]; } search; @@ -919,7 +866,7 @@ lex_step(uc_lexer *lex, FILE *fp) uc_token *rv; size_t i; - /* only less than UT_LEX_MAX_TOKEN_LEN unreach buffer chars remaining, + /* only less than UT_LEX_MAX_TOKEN_LEN unread buffer chars remaining, * move the remaining bytes to the beginning and read more data */ if (buf_remaining(lex) < UT_LEX_MAX_TOKEN_LEN) { if (!lex->buf) { @@ -943,19 +890,19 @@ lex_step(uc_lexer *lex, FILE *fp) switch (lex->state) { case UT_LEX_IDENTIFY_BLOCK: /* previous block had strip trailing whitespace flag, skip leading whitespace */ - if (lex->skip_leading_whitespace) { + if (lex->modifier == MINUS) { while (buf_remaining(lex) && isspace(lex->bufstart[0])) buf_consume(lex, 1); - lex->skip_leading_whitespace = false; + lex->modifier = UNSPEC; } /* previous block was a statement block and trim_blocks is enabld, skip leading newline */ - else if (lex->skip_leading_newline) { + else if (lex->modifier == NEWLINE) { if (buf_startswith(lex, "\n")) buf_consume(lex, 1); - lex->skip_leading_newline = false; + lex->modifier = UNSPEC; } /* scan forward through buffer to identify start token */ @@ -1008,7 +955,7 @@ lex_step(uc_lexer *lex, FILE *fp) case UT_LEX_BLOCK_EXPRESSION_START: case UT_LEX_BLOCK_STATEMENT_START: rv = NULL; - lex->skip_leading_whitespace = 0; + lex->modifier = UNSPEC; /* strip whitespace before block */ if (buf_startswith(lex, "-")) { @@ -1036,11 +983,12 @@ lex_step(uc_lexer *lex, FILE *fp) switch (lex->state) { case UT_LEX_BLOCK_COMMENT_START: lex->state = UT_LEX_BLOCK_COMMENT; + lex->block = COMMENT; break; case UT_LEX_BLOCK_STATEMENT_START: - lex->within_statement_block = 1; lex->state = UT_LEX_IDENTIFY_TOKEN; + lex->block = STATEMENTS; break; case UT_LEX_BLOCK_EXPRESSION_START: @@ -1059,14 +1007,13 @@ lex_step(uc_lexer *lex, FILE *fp) while (lex->bufstart < lex->bufend - 2) { if (buf_startswith(lex, "-#}")) { lex->state = UT_LEX_IDENTIFY_BLOCK; - lex->skip_leading_whitespace = 1; + lex->modifier = MINUS; buf_consume(lex, 3); lex->lastoff = lex->source->off; break; } else if (buf_startswith(lex, "#}")) { lex->state = UT_LEX_IDENTIFY_BLOCK; - lex->skip_leading_whitespace = 0; buf_consume(lex, 2); lex->lastoff = lex->source->off; break; @@ -1088,8 +1035,8 @@ lex_step(uc_lexer *lex, FILE *fp) case UT_LEX_BLOCK_EXPRESSION_EMIT_TAG: - lex->within_expression_block = 1; lex->state = UT_LEX_IDENTIFY_TOKEN; + lex->block = EXPRESSION; return emit_op(lex, lex->source->off, TK_LEXP, NULL); @@ -1129,43 +1076,32 @@ lex_step(uc_lexer *lex, FILE *fp) } /* disallow nesting blocks */ - if ((lex->within_expression_block && - (tok->type == TK_LSTM || tok->type == TK_RSTM || tok->type == TK_LEXP)) || - (lex->within_statement_block && - (tok->type == TK_LEXP || tok->type == TK_REXP || tok->type == TK_LSTM))) { + if (tok->type == TK_LSTM || tok->type == TK_LEXP) { buf_consume(lex, tok->plen); return emit_op(lex, lex->source->off - tok->plen, TK_ERROR, xjs_new_string("Template blocks may not be nested")); } /* found end of block */ - else if ((lex->within_statement_block && tok->type == TK_RSTM) || - (lex->within_expression_block && tok->type == TK_REXP)) { - /* emit additional empty statement (semicolon) at end of template block */ - if (!lex->semicolon_emitted) { - lex->semicolon_emitted = true; - - return emit_op(lex, lex->source->off, TK_SCOL, NULL); - } - + else if ((lex->block == STATEMENTS && tok->type == TK_RSTM) || + (lex->block == EXPRESSION && tok->type == TK_REXP)) { /* strip whitespace after block */ if (tok->pat[0] == '-') - lex->skip_leading_whitespace = true; + lex->modifier = MINUS; /* strip newline after statement block */ - else if (lex->within_statement_block && + else if (lex->block == STATEMENTS && lex->config && lex->config->trim_blocks) - lex->skip_leading_newline = true; + lex->modifier = NEWLINE; - lex->semicolon_emitted = false; - lex->within_statement_block = false; - lex->within_expression_block = false; lex->state = UT_LEX_IDENTIFY_BLOCK; + lex->block = NONE; } /* do not report statement tags to the parser */ - if (tok->type != 0 && tok->type != TK_LSTM && tok->type != TK_RSTM) - rv = emit_op(lex, lex->source->off, tok->type, NULL); + if (tok->type != 0 && tok->type != TK_LSTM) + rv = emit_op(lex, lex->source->off, + (tok->type == TK_RSTM) ? TK_SCOL : tok->type, NULL); else rv = NULL; @@ -1180,7 +1116,7 @@ lex_step(uc_lexer *lex, FILE *fp) return emit_op(lex, lex->source->off, TK_ERROR, xjs_new_string("Unexpected character")); /* we're at eof, allow unclosed statement blocks */ - if (lex->within_statement_block) { + if (lex->block == STATEMENTS) { lex->state = UT_LEX_EOF; return NULL; @@ -1192,7 +1128,7 @@ lex_step(uc_lexer *lex, FILE *fp) case UT_LEX_PARSE_TOKEN: tok = lex->tok; - rv = tok->parse(lex); + rv = tok->parse(lex, no_regexp); if (rv) { memset(lex->esc, 0, sizeof(lex->esc)); @@ -1224,14 +1160,11 @@ uc_lexer_init(uc_lexer *lex, uc_parse_config *config, uc_source *source) lex->source = uc_source_get(source); lex->eof = 0; - lex->skip_leading_whitespace = 0; - lex->skip_leading_newline = 0; - lex->within_statement_block = 0; - lex->within_statement_block = 0; - lex->semicolon_emitted = 0; - lex->expect_div = 0; lex->is_escape = 0; + lex->block = NONE; + lex->modifier = UNSPEC; + lex->buflen = 0; lex->buf = NULL; lex->bufstart = NULL; @@ -1260,12 +1193,12 @@ uc_lexer_free(uc_lexer *lex) } uc_token * -uc_lexer_next_token(uc_lexer *lex) +uc_lexer_next_token(uc_lexer *lex, bool no_regexp) { uc_token *rv; while (lex->state != UT_LEX_EOF) { - rv = lex_step(lex, lex->source->fp); + rv = lex_step(lex, lex->source->fp, no_regexp); if (rv != NULL) return rv; @@ -19,11 +19,6 @@ #include "source.h" -#define __T_MAX 82 -#define T_EXCEPTION (__T_MAX + 0) -#define T_CFUNC (__T_MAX + 1) -#define T_RESSOURCE (__T_MAX + 2) - typedef enum { TK_LEXP = 1, @@ -141,12 +136,6 @@ typedef struct { uc_parse_config *config; uc_source *source; uint8_t eof:1; - uint8_t skip_leading_whitespace:1; - uint8_t skip_leading_newline:1; - uint8_t within_expression_block:1; - uint8_t within_statement_block:1; - uint8_t semicolon_emitted:1; - uint8_t expect_div:1; uint8_t is_escape:1; size_t buflen; char *buf, *bufstart, *bufend; @@ -158,13 +147,25 @@ typedef struct { uint8_t esclen; int lead_surrogate; size_t lastoff; + enum { + UNSPEC, + PLUS, + MINUS, + NEWLINE + } modifier; + enum { + NONE, + EXPRESSION = '{', + STATEMENTS = '%', + COMMENT = '#' + } block; } uc_lexer; void uc_lexer_init(uc_lexer *lex, uc_parse_config *config, uc_source *source); void uc_lexer_free(uc_lexer *lex); -uc_token *uc_lexer_next_token(uc_lexer *lex); +uc_token *uc_lexer_next_token(uc_lexer *lex, bool no_regexp); bool utf8enc(char **out, int *rem, int code); |