compiler, lexer: improve lexical state handling

- Instead of disambiguating division operator vs. regexp literal by looking at the preceding token, raise a "no regexp" flag within the appropriate parser states to tell the lexer how to treat a forward slash when parsing the next token. - Introduce another "no keyword" flag which disables parsing labels into keywords when reading the next token and set it in the appropriate parser states. This allows using reserved names in object declarations and property access expressions. Signed-off-by: Jo-Philipp Wich <jo@mein.io>
author: Jo-Philipp Wich <jo@mein.io> 2021-04-28 23:52:37 +0200
committer: Jo-Philipp Wich <jo@mein.io> 2021-04-29 00:10:56 +0200
commit: e66b2ad400203c27d7a17edea2b9952f110e9020 (patch)
tree: b3ed1d3e4e8141dd215ac3b116a0440bc93b5cbe
parent: e29b5744132d7dfb2989c70d4255840126d6ad19 (diff)
3 files changed, 97 insertions, 96 deletions
diff --git a/compiler.c b/compiler.c
index fd5b3af..c5cfef5 100644
--- a/compiler.c
+++ b/compiler.c
@@ -195,64 +195,11 @@ uc_compiler_set_srcpos(uc_compiler *compiler, size_t srcpos)
 static void
 uc_compiler_parse_advance(uc_compiler *compiler)
 {
-	bool no_regexp;
-
 	ucv_put(compiler->parser->prev.uv);
 	compiler->parser->prev = compiler->parser->curr;
 
 	while (true) {
-		/* Follow JSLint logic and treat a slash after any of the
-		* `(,=:[!&|?{};` characters as the beginning of a regex
-		* literal... */
-		switch (compiler->parser->prev.type) {
-		case TK_LPAREN:
-		case TK_COMMA:
-
-		case TK_ASADD:
-		case TK_ASBAND:
-		case TK_ASBOR:
-		case TK_ASBXOR:
-		case TK_ASDIV:
-		case TK_ASLEFT:
-		case TK_ASMOD:
-		case TK_ASMUL:
-		case TK_ASRIGHT:
-		case TK_ASSIGN:
-		case TK_ASSUB:
-		case TK_EQ:
-		case TK_EQS:
-		case TK_GE:
-		case TK_LE:
-		case TK_NE:
-		case TK_NES:
-
-		case TK_COLON:
-		case TK_LBRACK:
-		case TK_NOT:
-
-		case TK_AND:
-		case TK_BAND:
-
-		case TK_OR:
-		case TK_BOR:
-
-		case TK_QMARK:
-
-		case TK_LBRACE:
-		case TK_RBRACE:
-
-		case TK_LSTM:
-		case TK_LEXP:
-
-		case TK_SCOL:
-			no_regexp = false;
-			break;
-
-		default:
-			no_regexp = (compiler->parser->prev.type != 0);
-		}
-
-		compiler->parser->curr = *uc_lexer_next_token(&compiler->parser->lex, no_regexp);
+		compiler->parser->curr = *uc_lexer_next_token(&compiler->parser->lex);
 
 		if (compiler->parser->curr.type != TK_ERROR)
 			break;
@@ -359,22 +306,41 @@ uc_compiler_parse_precedence(uc_compiler *compiler, uc_precedence_t precedence)
 	uc_parse_rule *rule;
 	bool assignable;
 
-	uc_compiler_parse_advance(compiler);
-
-	rule = uc_compiler_parse_rule(compiler->parser->prev.type);
+	rule = uc_compiler_parse_rule(compiler->parser->curr.type);
 
 	if (!rule->prefix) {
-		uc_compiler_syntax_error(compiler, compiler->parser->prev.pos, "Expecting expression");
+		uc_compiler_syntax_error(compiler, compiler->parser->curr.pos, "Expecting expression");
+		uc_compiler_parse_advance(compiler);
 
 		return;
 	}
 
+	/* allow reserved words as property names in object literals */
+	if (rule->prefix == uc_compiler_compile_object)
+		compiler->parser->lex.no_keyword = true;
+
+	/* unless a sub-expression follows, treat subsequent slash as division
+	 * operator and not as beginning of regexp literal */
+	if (rule->prefix != uc_compiler_compile_paren &&
+	    rule->prefix != uc_compiler_compile_unary &&
+	    rule->prefix != uc_compiler_compile_array)
+		compiler->parser->lex.no_regexp = true;
+
+	uc_compiler_parse_advance(compiler);
+
 	assignable = (precedence <= P_ASSIGN);
 	rule->prefix(compiler, assignable);
 
 	while (precedence <= uc_compiler_parse_rule(compiler->parser->curr.type)->precedence) {
+		rule = uc_compiler_parse_rule(compiler->parser->curr.type);
+
+		/* allow reserved words in property accessors */
+		if (rule->infix == uc_compiler_compile_dot)
+			compiler->parser->lex.no_keyword = true;
+
 		uc_compiler_parse_advance(compiler);
-		uc_compiler_parse_rule(compiler->parser->prev.type)->infix(compiler, assignable);
+
+		rule->infix(compiler, assignable);
 	}
 
 	if (assignable && uc_compiler_parse_at_assignment_op(compiler))
@@ -1207,7 +1173,14 @@ uc_compiler_compile_paren(uc_compiler *compiler, bool assignable)
 			continue;
 		}
 		else {
-			maybe_arrowfn = uc_compiler_parse_match(compiler, TK_RPAREN);
+			maybe_arrowfn = uc_compiler_parse_check(compiler, TK_RPAREN);
+
+			if (maybe_arrowfn) {
+				/* A subsequent slash cannot be a regular expression literal */
+				compiler->parser->lex.no_regexp = true;
+				uc_compiler_parse_advance(compiler);
+			}
+
 			break;
 		}
 	}
@@ -1276,6 +1249,9 @@ uc_compiler_compile_paren(uc_compiler *compiler, bool assignable)
 	if (!uc_compiler_parse_check(compiler, TK_RPAREN))
 		uc_compiler_compile_expression(compiler);
 
+	/* A subsequent slash cannot be a regular expression literal */
+	compiler->parser->lex.no_regexp = true;
+
 	/* At this point we expect the end of the parenthesized expression, anything
 	 * else is a syntax error */
 	uc_compiler_parse_consume(compiler, TK_RPAREN);
@@ -1315,6 +1291,8 @@ uc_compiler_compile_call(uc_compiler *compiler, bool assignable)
 		while (uc_compiler_parse_match(compiler, TK_COMMA));
 	}
 
+	/* after a function call expression, no regexp literal can follow */
+	compiler->parser->lex.no_regexp = true;
 	uc_compiler_parse_consume(compiler, TK_RPAREN);
 
 	/* if lhs is a dot or bracket expression, emit a method call */
@@ -1564,6 +1542,9 @@ uc_compiler_compile_or(uc_compiler *compiler, bool assignable)
 static void
 uc_compiler_compile_dot(uc_compiler *compiler, bool assignable)
 {
+	/* no regexp literal possible after property access */
+	compiler->parser->lex.no_regexp = true;
+
 	/* parse label lhs */
 	uc_compiler_parse_consume(compiler, TK_LABEL);
 	uc_compiler_emit_constant(compiler, compiler->parser->prev.pos, compiler->parser->prev.uv);
@@ -1578,6 +1559,9 @@ uc_compiler_compile_subscript(uc_compiler *compiler, bool assignable)
 {
 	/* compile lhs */
 	uc_compiler_compile_expression(compiler);
+
+	/* no regexp literal possible after computed property access */
+	compiler->parser->lex.no_regexp = true;
 	uc_compiler_parse_consume(compiler, TK_RBRACK);
 
 	/* depending on context, compile into I_UVAL, I_SVAL or I_LVAL operation */
@@ -1653,6 +1637,8 @@ uc_compiler_compile_array(uc_compiler *compiler, bool assignable)
 	}
 	while (uc_compiler_parse_match(compiler, TK_COMMA));
 
+	/* no regexp literal possible after array literal */
+	compiler->parser->lex.no_regexp = true;
 	uc_compiler_parse_consume(compiler, TK_RBRACK);
 
 	/* push items on stack */
@@ -1751,9 +1737,13 @@ uc_compiler_compile_object(uc_compiler *compiler, bool assignable)
 
 		hint_count += 2;
 		len += 2;
+
+		compiler->parser->lex.no_keyword = true;
 	}
 	while (uc_compiler_parse_match(compiler, TK_COMMA));
 
+	/* no regexp literal possible after object literal */
+	compiler->parser->lex.no_regexp = true;
 	uc_compiler_parse_consume(compiler, TK_RBRACE);
 
 	/* set items on stack */
diff --git a/lexer.c b/lexer.c
index aaba314..0fbea79 100644
--- a/lexer.c
+++ b/lexer.c
@@ -48,7 +48,7 @@ struct token {
 		char pat[4];
 	} u;
 	unsigned plen;
-	uc_token *(*parse)(uc_lexer *, bool);
+	uc_token *(*parse)(uc_lexer *);
 };
 
 #define dec(o) \
@@ -58,11 +58,11 @@ struct token {
 	(((x) >= 'a') ? (10 + (x) - 'a') : \
 		(((x) >= 'A') ? (10 + (x) - 'A') : dec(x)))
 
-static uc_token *parse_comment(uc_lexer *, bool);
-static uc_token *parse_string(uc_lexer *, bool);
-static uc_token *parse_regexp(uc_lexer *, bool);
-static uc_token *parse_number(uc_lexer *, bool);
-static uc_token *parse_label(uc_lexer *, bool);
+static uc_token *parse_comment(uc_lexer *);
+static uc_token *parse_string(uc_lexer *);
+static uc_token *parse_regexp(uc_lexer *);
+static uc_token *parse_number(uc_lexer *);
+static uc_token *parse_label(uc_lexer *);
 
 static const struct token tokens[] = {
 	{ TK_ASLEFT,	{ .pat = "<<=" },   3, NULL },
@@ -353,7 +353,7 @@ buf_consume(uc_lexer *lex, size_t len) {
 }
 
 static uc_token *
-parse_comment(uc_lexer *lex, bool no_regexp)
+parse_comment(uc_lexer *lex)
 {
 	const struct token *tok = lex->tok;
 	const char *ptr, *end;
@@ -397,7 +397,7 @@ append_utf8(uc_lexer *lex, int code) {
 }
 
 static uc_token *
-parse_string(uc_lexer *lex, bool no_regexp)
+parse_string(uc_lexer *lex)
 {
 	const struct token *tok = lex->tok;
 	char q = tok->u.pat[0];
@@ -625,7 +625,7 @@ enum {
 };
 
 static uc_token *
-parse_regexp(uc_lexer *lex, bool no_regexp)
+parse_regexp(uc_lexer *lex)
 {
 	bool is_reg_global = false, is_reg_icase = false, is_reg_newline = false;
 	uc_token *rv;
@@ -634,7 +634,7 @@ parse_regexp(uc_lexer *lex, bool no_regexp)
 
 	switch (lex->esc[0]) {
 	case UT_LEX_PARSE_REGEX_INIT:
-		if (no_regexp) {
+		if (lex->no_regexp) {
 			if (buf_startswith(lex, "=")) {
 				buf_consume(lex, 1);
 
@@ -648,7 +648,7 @@ parse_regexp(uc_lexer *lex, bool no_regexp)
 		break;
 
 	case UT_LEX_PARSE_REGEX_PATTERN:
-		rv = parse_string(lex, no_regexp);
+		rv = parse_string(lex);
 
 		if (rv && rv->type == TK_ERROR)
 			return rv;
@@ -716,7 +716,7 @@ parse_regexp(uc_lexer *lex, bool no_regexp)
  */
 
 static uc_token *
-parse_label(uc_lexer *lex, bool no_regexp)
+parse_label(uc_lexer *lex)
 {
 	const struct token *tok = lex->tok;
 	const struct keyword *word;
@@ -728,24 +728,26 @@ parse_label(uc_lexer *lex, bool no_regexp)
 		lookbehind_append(lex, tok->u.pat, tok->plen);
 
 	if (!buf_remaining(lex) || (lex->bufstart[0] != '_' && !isalnum(lex->bufstart[0]))) {
-		for (i = 0, word = &reserved_words[0]; i < ARRAY_SIZE(reserved_words); i++, word = &reserved_words[i]) {
-			if (lex->lookbehind && lex->lookbehindlen == word->plen && !strncmp(lex->lookbehind, word->pat, word->plen)) {
-				lookbehind_reset(lex);
-
-				switch (word->type) {
-				case TK_DOUBLE:
-					rv = emit_op(lex, lex->source->off - word->plen, word->type, ucv_double_new(word->u.d));
-					break;
-
-				case TK_BOOL:
-					rv = emit_op(lex, lex->source->off - word->plen, word->type, ucv_boolean_new(word->u.b));
-					break;
+		if (lex->no_keyword == false) {
+			for (i = 0, word = &reserved_words[0]; i < ARRAY_SIZE(reserved_words); i++, word = &reserved_words[i]) {
+				if (lex->lookbehind && lex->lookbehindlen == word->plen && !strncmp(lex->lookbehind, word->pat, word->plen)) {
+					lookbehind_reset(lex);
+
+					switch (word->type) {
+					case TK_DOUBLE:
+						rv = emit_op(lex, lex->source->off - word->plen, word->type, ucv_double_new(word->u.d));
+						break;
+
+					case TK_BOOL:
+						rv = emit_op(lex, lex->source->off - word->plen, word->type, ucv_boolean_new(word->u.b));
+						break;
+
+					default:
+						rv = emit_op(lex, lex->source->off - word->plen, word->type, NULL);
+					}
 
-				default:
-					rv = emit_op(lex, lex->source->off - word->plen, word->type, NULL);
+					return rv;
 				}
-
-				return rv;
 			}
 		}
 
@@ -784,7 +786,7 @@ is_numeric_char(uc_lexer *lex, char c)
 }
 
 static uc_token *
-parse_number(uc_lexer *lex, bool no_regexp)
+parse_number(uc_lexer *lex)
 {
 	const struct token *tok = lex->tok;
 	uc_token *rv = NULL;
@@ -837,7 +839,7 @@ parse_number(uc_lexer *lex, bool no_regexp)
 }
 
 static uc_token *
-lex_step(uc_lexer *lex, FILE *fp, bool no_regexp)
+lex_step(uc_lexer *lex, FILE *fp)
 {
 	uint32_t masks[] = { 0, le32toh(0x000000ff), le32toh(0x0000ffff), le32toh(0x00ffffff), le32toh(0xffffffff) };
 	union { uint32_t n; char str[4]; } search;
@@ -1110,7 +1112,7 @@ lex_step(uc_lexer *lex, FILE *fp, bool no_regexp)
 
 	case UT_LEX_PARSE_TOKEN:
 		tok = lex->tok;
-		rv = tok->parse(lex, no_regexp);
+		rv = tok->parse(lex);
 
 		if (rv) {
 			memset(lex->esc, 0, sizeof(lex->esc));
@@ -1175,15 +1177,22 @@ uc_lexer_free(uc_lexer *lex)
 }
 
 uc_token *
-uc_lexer_next_token(uc_lexer *lex, bool no_regexp)
+uc_lexer_next_token(uc_lexer *lex)
 {
-	uc_token *rv;
+	uc_token *rv = NULL;
 
 	while (lex->state != UT_LEX_EOF) {
-		rv = lex_step(lex, lex->source->fp, no_regexp);
+		rv = lex_step(lex, lex->source->fp);
 
 		if (rv != NULL)
-			return rv;
+			break;
+	}
+
+	if (rv) {
+		lex->no_keyword = false;
+		lex->no_regexp = false;
+
+		return rv;
 	}
 
 	return emit_op(lex, lex->source->off, TK_EOF, NULL);
diff --git a/lexer.h b/lexer.h
index 069e9e0..60f6ce0 100644
--- a/lexer.h
+++ b/lexer.h
@@ -132,6 +132,8 @@ typedef struct {
 	uc_source *source;
 	uint8_t eof:1;
 	uint8_t is_escape:1;
+	uint8_t no_regexp:1;
+	uint8_t no_keyword:1;
 	size_t buflen;
 	char *buf, *bufstart, *bufend;
 	size_t lookbehindlen;
@@ -160,7 +162,7 @@ typedef struct {
 void uc_lexer_init(uc_lexer *lex, uc_parse_config *config, uc_source *source);
 void uc_lexer_free(uc_lexer *lex);
 
-uc_token *uc_lexer_next_token(uc_lexer *lex, bool no_regexp);
+uc_token *uc_lexer_next_token(uc_lexer *lex);
 
 bool utf8enc(char **out, int *rem, int code);
author	Jo-Philipp Wich <jo@mein.io>	2021-04-28 23:52:37 +0200
committer	Jo-Philipp Wich <jo@mein.io>	2021-04-29 00:10:56 +0200
commit	e66b2ad400203c27d7a17edea2b9952f110e9020 (patch)
tree	b3ed1d3e4e8141dd215ac3b116a0440bc93b5cbe
parent	e29b5744132d7dfb2989c70d4255840126d6ad19 (diff)