diff options
author | Jo-Philipp Wich <jo@mein.io> | 2021-04-28 23:52:37 +0200 |
---|---|---|
committer | Jo-Philipp Wich <jo@mein.io> | 2021-04-29 00:10:56 +0200 |
commit | e66b2ad400203c27d7a17edea2b9952f110e9020 (patch) | |
tree | b3ed1d3e4e8141dd215ac3b116a0440bc93b5cbe | |
parent | e29b5744132d7dfb2989c70d4255840126d6ad19 (diff) |
compiler, lexer: improve lexical state handling
- Instead of disambiguating division operator vs. regexp literal by looking
at the preceding token, raise a "no regexp" flag within the appropriate
parser states to tell the lexer how to treat a forward slash when parsing
the next token
- Introduce another "no keyword" flag which disables parsing labels into
keywords when reading the next token and set it in the appropriate parser
states. This allows using reserved names in object declarations and
property access expressions
Signed-off-by: Jo-Philipp Wich <jo@mein.io>
-rw-r--r-- | compiler.c | 110 | ||||
-rw-r--r-- | lexer.c | 79 | ||||
-rw-r--r-- | lexer.h | 4 |
3 files changed, 97 insertions, 96 deletions
@@ -195,64 +195,11 @@ uc_compiler_set_srcpos(uc_compiler *compiler, size_t srcpos) static void uc_compiler_parse_advance(uc_compiler *compiler) { - bool no_regexp; - ucv_put(compiler->parser->prev.uv); compiler->parser->prev = compiler->parser->curr; while (true) { - /* Follow JSLint logic and treat a slash after any of the - * `(,=:[!&|?{};` characters as the beginning of a regex - * literal... */ - switch (compiler->parser->prev.type) { - case TK_LPAREN: - case TK_COMMA: - - case TK_ASADD: - case TK_ASBAND: - case TK_ASBOR: - case TK_ASBXOR: - case TK_ASDIV: - case TK_ASLEFT: - case TK_ASMOD: - case TK_ASMUL: - case TK_ASRIGHT: - case TK_ASSIGN: - case TK_ASSUB: - case TK_EQ: - case TK_EQS: - case TK_GE: - case TK_LE: - case TK_NE: - case TK_NES: - - case TK_COLON: - case TK_LBRACK: - case TK_NOT: - - case TK_AND: - case TK_BAND: - - case TK_OR: - case TK_BOR: - - case TK_QMARK: - - case TK_LBRACE: - case TK_RBRACE: - - case TK_LSTM: - case TK_LEXP: - - case TK_SCOL: - no_regexp = false; - break; - - default: - no_regexp = (compiler->parser->prev.type != 0); - } - - compiler->parser->curr = *uc_lexer_next_token(&compiler->parser->lex, no_regexp); + compiler->parser->curr = *uc_lexer_next_token(&compiler->parser->lex); if (compiler->parser->curr.type != TK_ERROR) break; @@ -359,22 +306,41 @@ uc_compiler_parse_precedence(uc_compiler *compiler, uc_precedence_t precedence) uc_parse_rule *rule; bool assignable; - uc_compiler_parse_advance(compiler); - - rule = uc_compiler_parse_rule(compiler->parser->prev.type); + rule = uc_compiler_parse_rule(compiler->parser->curr.type); if (!rule->prefix) { - uc_compiler_syntax_error(compiler, compiler->parser->prev.pos, "Expecting expression"); + uc_compiler_syntax_error(compiler, compiler->parser->curr.pos, "Expecting expression"); + uc_compiler_parse_advance(compiler); return; } + /* allow reserved words as property names in object literals */ + if (rule->prefix == uc_compiler_compile_object) + compiler->parser->lex.no_keyword = 
true; + + /* unless a sub-expression follows, treat subsequent slash as division + * operator and not as beginning of regexp literal */ + if (rule->prefix != uc_compiler_compile_paren && + rule->prefix != uc_compiler_compile_unary && + rule->prefix != uc_compiler_compile_array) + compiler->parser->lex.no_regexp = true; + + uc_compiler_parse_advance(compiler); + assignable = (precedence <= P_ASSIGN); rule->prefix(compiler, assignable); while (precedence <= uc_compiler_parse_rule(compiler->parser->curr.type)->precedence) { + rule = uc_compiler_parse_rule(compiler->parser->curr.type); + + /* allow reserved words in property accessors */ + if (rule->infix == uc_compiler_compile_dot) + compiler->parser->lex.no_keyword = true; + uc_compiler_parse_advance(compiler); - uc_compiler_parse_rule(compiler->parser->prev.type)->infix(compiler, assignable); + + rule->infix(compiler, assignable); } if (assignable && uc_compiler_parse_at_assignment_op(compiler)) @@ -1207,7 +1173,14 @@ uc_compiler_compile_paren(uc_compiler *compiler, bool assignable) continue; } else { - maybe_arrowfn = uc_compiler_parse_match(compiler, TK_RPAREN); + maybe_arrowfn = uc_compiler_parse_check(compiler, TK_RPAREN); + + if (maybe_arrowfn) { + /* A subsequent slash cannot be a regular expression literal */ + compiler->parser->lex.no_regexp = true; + uc_compiler_parse_advance(compiler); + } + break; } } @@ -1276,6 +1249,9 @@ uc_compiler_compile_paren(uc_compiler *compiler, bool assignable) if (!uc_compiler_parse_check(compiler, TK_RPAREN)) uc_compiler_compile_expression(compiler); + /* A subsequent slash cannot be a regular expression literal */ + compiler->parser->lex.no_regexp = true; + /* At this point we expect the end of the parenthesized expression, anything * else is a syntax error */ uc_compiler_parse_consume(compiler, TK_RPAREN); @@ -1315,6 +1291,8 @@ uc_compiler_compile_call(uc_compiler *compiler, bool assignable) while (uc_compiler_parse_match(compiler, TK_COMMA)); } + /* after a function call 
expression, no regexp literal can follow */ + compiler->parser->lex.no_regexp = true; uc_compiler_parse_consume(compiler, TK_RPAREN); /* if lhs is a dot or bracket expression, emit a method call */ @@ -1564,6 +1542,9 @@ uc_compiler_compile_or(uc_compiler *compiler, bool assignable) static void uc_compiler_compile_dot(uc_compiler *compiler, bool assignable) { + /* no regexp literal possible after property access */ + compiler->parser->lex.no_regexp = true; + /* parse label lhs */ uc_compiler_parse_consume(compiler, TK_LABEL); uc_compiler_emit_constant(compiler, compiler->parser->prev.pos, compiler->parser->prev.uv); @@ -1578,6 +1559,9 @@ uc_compiler_compile_subscript(uc_compiler *compiler, bool assignable) { /* compile lhs */ uc_compiler_compile_expression(compiler); + + /* no regexp literal possible after computed property access */ + compiler->parser->lex.no_regexp = true; uc_compiler_parse_consume(compiler, TK_RBRACK); /* depending on context, compile into I_UVAL, I_SVAL or I_LVAL operation */ @@ -1653,6 +1637,8 @@ uc_compiler_compile_array(uc_compiler *compiler, bool assignable) } while (uc_compiler_parse_match(compiler, TK_COMMA)); + /* no regexp literal possible after array literal */ + compiler->parser->lex.no_regexp = true; uc_compiler_parse_consume(compiler, TK_RBRACK); /* push items on stack */ @@ -1751,9 +1737,13 @@ uc_compiler_compile_object(uc_compiler *compiler, bool assignable) hint_count += 2; len += 2; + + compiler->parser->lex.no_keyword = true; } while (uc_compiler_parse_match(compiler, TK_COMMA)); + /* no regexp literal possible after object literal */ + compiler->parser->lex.no_regexp = true; uc_compiler_parse_consume(compiler, TK_RBRACE); /* set items on stack */ @@ -48,7 +48,7 @@ struct token { char pat[4]; } u; unsigned plen; - uc_token *(*parse)(uc_lexer *, bool); + uc_token *(*parse)(uc_lexer *); }; #define dec(o) \ @@ -58,11 +58,11 @@ struct token { (((x) >= 'a') ? (10 + (x) - 'a') : \ (((x) >= 'A') ? 
(10 + (x) - 'A') : dec(x))) -static uc_token *parse_comment(uc_lexer *, bool); -static uc_token *parse_string(uc_lexer *, bool); -static uc_token *parse_regexp(uc_lexer *, bool); -static uc_token *parse_number(uc_lexer *, bool); -static uc_token *parse_label(uc_lexer *, bool); +static uc_token *parse_comment(uc_lexer *); +static uc_token *parse_string(uc_lexer *); +static uc_token *parse_regexp(uc_lexer *); +static uc_token *parse_number(uc_lexer *); +static uc_token *parse_label(uc_lexer *); static const struct token tokens[] = { { TK_ASLEFT, { .pat = "<<=" }, 3, NULL }, @@ -353,7 +353,7 @@ buf_consume(uc_lexer *lex, size_t len) { } static uc_token * -parse_comment(uc_lexer *lex, bool no_regexp) +parse_comment(uc_lexer *lex) { const struct token *tok = lex->tok; const char *ptr, *end; @@ -397,7 +397,7 @@ append_utf8(uc_lexer *lex, int code) { } static uc_token * -parse_string(uc_lexer *lex, bool no_regexp) +parse_string(uc_lexer *lex) { const struct token *tok = lex->tok; char q = tok->u.pat[0]; @@ -625,7 +625,7 @@ enum { }; static uc_token * -parse_regexp(uc_lexer *lex, bool no_regexp) +parse_regexp(uc_lexer *lex) { bool is_reg_global = false, is_reg_icase = false, is_reg_newline = false; uc_token *rv; @@ -634,7 +634,7 @@ parse_regexp(uc_lexer *lex, bool no_regexp) switch (lex->esc[0]) { case UT_LEX_PARSE_REGEX_INIT: - if (no_regexp) { + if (lex->no_regexp) { if (buf_startswith(lex, "=")) { buf_consume(lex, 1); @@ -648,7 +648,7 @@ parse_regexp(uc_lexer *lex, bool no_regexp) break; case UT_LEX_PARSE_REGEX_PATTERN: - rv = parse_string(lex, no_regexp); + rv = parse_string(lex); if (rv && rv->type == TK_ERROR) return rv; @@ -716,7 +716,7 @@ parse_regexp(uc_lexer *lex, bool no_regexp) */ static uc_token * -parse_label(uc_lexer *lex, bool no_regexp) +parse_label(uc_lexer *lex) { const struct token *tok = lex->tok; const struct keyword *word; @@ -728,24 +728,26 @@ parse_label(uc_lexer *lex, bool no_regexp) lookbehind_append(lex, tok->u.pat, tok->plen); if 
(!buf_remaining(lex) || (lex->bufstart[0] != '_' && !isalnum(lex->bufstart[0]))) { - for (i = 0, word = &reserved_words[0]; i < ARRAY_SIZE(reserved_words); i++, word = &reserved_words[i]) { - if (lex->lookbehind && lex->lookbehindlen == word->plen && !strncmp(lex->lookbehind, word->pat, word->plen)) { - lookbehind_reset(lex); - - switch (word->type) { - case TK_DOUBLE: - rv = emit_op(lex, lex->source->off - word->plen, word->type, ucv_double_new(word->u.d)); - break; - - case TK_BOOL: - rv = emit_op(lex, lex->source->off - word->plen, word->type, ucv_boolean_new(word->u.b)); - break; + if (lex->no_keyword == false) { + for (i = 0, word = &reserved_words[0]; i < ARRAY_SIZE(reserved_words); i++, word = &reserved_words[i]) { + if (lex->lookbehind && lex->lookbehindlen == word->plen && !strncmp(lex->lookbehind, word->pat, word->plen)) { + lookbehind_reset(lex); + + switch (word->type) { + case TK_DOUBLE: + rv = emit_op(lex, lex->source->off - word->plen, word->type, ucv_double_new(word->u.d)); + break; + + case TK_BOOL: + rv = emit_op(lex, lex->source->off - word->plen, word->type, ucv_boolean_new(word->u.b)); + break; + + default: + rv = emit_op(lex, lex->source->off - word->plen, word->type, NULL); + } - default: - rv = emit_op(lex, lex->source->off - word->plen, word->type, NULL); + return rv; } - - return rv; } } @@ -784,7 +786,7 @@ is_numeric_char(uc_lexer *lex, char c) } static uc_token * -parse_number(uc_lexer *lex, bool no_regexp) +parse_number(uc_lexer *lex) { const struct token *tok = lex->tok; uc_token *rv = NULL; @@ -837,7 +839,7 @@ parse_number(uc_lexer *lex, bool no_regexp) } static uc_token * -lex_step(uc_lexer *lex, FILE *fp, bool no_regexp) +lex_step(uc_lexer *lex, FILE *fp) { uint32_t masks[] = { 0, le32toh(0x000000ff), le32toh(0x0000ffff), le32toh(0x00ffffff), le32toh(0xffffffff) }; union { uint32_t n; char str[4]; } search; @@ -1110,7 +1112,7 @@ lex_step(uc_lexer *lex, FILE *fp, bool no_regexp) case UT_LEX_PARSE_TOKEN: tok = lex->tok; - rv = 
tok->parse(lex, no_regexp); + rv = tok->parse(lex); if (rv) { memset(lex->esc, 0, sizeof(lex->esc)); @@ -1175,15 +1177,22 @@ uc_lexer_free(uc_lexer *lex) } uc_token * -uc_lexer_next_token(uc_lexer *lex, bool no_regexp) +uc_lexer_next_token(uc_lexer *lex) { - uc_token *rv; + uc_token *rv = NULL; while (lex->state != UT_LEX_EOF) { - rv = lex_step(lex, lex->source->fp, no_regexp); + rv = lex_step(lex, lex->source->fp); if (rv != NULL) - return rv; + break; + } + + if (rv) { + lex->no_keyword = false; + lex->no_regexp = false; + + return rv; } return emit_op(lex, lex->source->off, TK_EOF, NULL); @@ -132,6 +132,8 @@ typedef struct { uc_source *source; uint8_t eof:1; uint8_t is_escape:1; + uint8_t no_regexp:1; + uint8_t no_keyword:1; size_t buflen; char *buf, *bufstart, *bufend; size_t lookbehindlen; @@ -160,7 +162,7 @@ typedef struct { void uc_lexer_init(uc_lexer *lex, uc_parse_config *config, uc_source *source); void uc_lexer_free(uc_lexer *lex); -uc_token *uc_lexer_next_token(uc_lexer *lex, bool no_regexp); +uc_token *uc_lexer_next_token(uc_lexer *lex); bool utf8enc(char **out, int *rem, int code); |