lexer: emit comment and template statement block tokens

Tweak the token stream reported by the lexer in order to make it more useful for alternative, non-compilation downstream parse processes such as code intelligence gathering within a language server implementation. - Instead of silently discarding source code comments in the lexing phase, emit TK_COMMENT tokens, which are useful e.g. for parsing type annotations and other structured information. - Do not silently discard TK_LSTM tokens but report them to downstream parsers instead. - Do not silently emit TK_RSTM tokens as TK_SCOL but report them as-is to downstream parsers. - Adjust the byte code compiler to properly deal with the changed token reporting by discarding incoming TK_COMMENT and TK_LSTM tokens and by remapping read TK_RSTM tokens to the TK_SCOL type. Signed-off-by: Jo-Philipp Wich <jo@mein.io>
author: Jo-Philipp Wich <jo@mein.io> 2024-09-23 23:33:47 +0200
committer: Jo-Philipp Wich <jo@mein.io> 2024-09-23 23:33:47 +0200
commit: 855854f6c2ae2e667f6bbfd5f67caab32b4ebf86 (patch)
tree: 11d99de43b268fa1b43d278299eb5f2f4ced8963 /lexer.c
parent: 328a50ff82c9bf089dcd381d404dece683ef54d2 (diff)
1 files changed, 19 insertions, 7 deletions
diff --git a/lexer.c b/lexer.c
index 3e640c6..52945dc 100644
--- a/lexer.c
+++ b/lexer.c
@@ -174,16 +174,23 @@ emit_buffer(uc_lexer_t *lex, ssize_t pos, int type, const char *strip_trailing_c
 static uc_token_t *
 parse_comment(uc_lexer_t *lex, int kind)
 {
+	size_t off = lex->source->off - 1;
 	int ch;
 
+	uc_vector_push(&lex->buffer, '/');
+
 	while (true) {
 		ch = next_char(lex);
 
+		uc_vector_push(&lex->buffer, ch);
+
 		if (kind == '/' && (ch == '\n' || ch == EOF))
 			break;
 
-		if (kind == '*' && ch == '*' && check_char(lex, '/'))
+		if (kind == '*' && ch == '*' && check_char(lex, '/')) {
+			uc_vector_push(&lex->buffer, '/');
 			break;
+		}
 
 		if (ch == EOF) {
 			lex->state = UC_LEX_EOF;
@@ -192,7 +199,7 @@ parse_comment(uc_lexer_t *lex, int kind)
 		}
 	}
 
-	return NULL;
+	return emit_buffer(lex, off, TK_COMMENT, NULL);
 }
 
 static void
@@ -957,8 +964,7 @@ lex_step(uc_lexer_t *lex)
 
 					/* found start of statement block */
 					case '%':
-						lex->state = UC_LEX_IDENTIFY_TOKEN;
-						lex->block = STATEMENTS;
+						lex->state = UC_LEX_BLOCK_STATEMENT_EMIT_TAG;
 
 						if (check_char(lex, '-'))
 							strip = " \n\t\v\f\r";
@@ -1019,11 +1025,12 @@ lex_step(uc_lexer_t *lex)
 				return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated template block"));
 			}
 
+			tok = emit_op(lex, lex->lastoff, TK_COMMENT, NULL);
+
 			lex->lastoff = lex->source->off;
 			lex->state = UC_LEX_IDENTIFY_BLOCK;
 
-			continue;
-
+			return tok;
 
 		case UC_LEX_BLOCK_EXPRESSION_EMIT_TAG:
 			lex->state = UC_LEX_IDENTIFY_TOKEN;
@@ -1031,6 +1038,11 @@ lex_step(uc_lexer_t *lex)
 
 			return emit_op(lex, lex->source->off - 2, TK_LEXP, NULL);
 
+		case UC_LEX_BLOCK_STATEMENT_EMIT_TAG:
+			lex->state = UC_LEX_IDENTIFY_TOKEN;
+			lex->block = STATEMENTS;
+
+			return emit_op(lex, lex->source->off - 2, TK_LSTM, NULL);
 
 		case UC_LEX_IDENTIFY_TOKEN:
 			do { tok = lex_find_token(lex); } while (tok == NULL);
@@ -1049,7 +1061,7 @@ lex_step(uc_lexer_t *lex)
 				lex->state = UC_LEX_IDENTIFY_BLOCK;
 				lex->block = NONE;
 
-				tok = emit_op(lex, -2, TK_SCOL, NULL);
+				tok = emit_op(lex, -2, TK_RSTM, NULL);
 			}
 
 			/* found end of expression block */
author	Jo-Philipp Wich <jo@mein.io>	2024-09-23 23:33:47 +0200
committer	Jo-Philipp Wich <jo@mein.io>	2024-09-23 23:33:47 +0200
commit	855854f6c2ae2e667f6bbfd5f67caab32b4ebf86 (patch)
tree	11d99de43b268fa1b43d278299eb5f2f4ced8963 /lexer.c
parent	328a50ff82c9bf089dcd381d404dece683ef54d2 (diff)