From 328a50ff82c9bf089dcd381d404dece683ef54d2 Mon Sep 17 00:00:00 2001
From: Jo-Philipp Wich <jo@mein.io>
Date: Mon, 23 Sep 2024 23:20:12 +0200
Subject: lexer: improve token position reporting

 - Report end position for emitted tokens. This is required to reliably
   determine the token length, e.g. for downstream code intelligence
   use cases

 - Fix start offset of continued template literal string tokens.
   Previously the start offset of a literal string following a `${...}`
   placeholder expression was shifted by one byte

 - Report proper start offset of `TK_LEXP` tokens.

Signed-off-by: Jo-Philipp Wich <jo@mein.io>
---
 include/ucode/lexer.h |  1 +
 lexer.c               | 18 ++++++++++++++----
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/include/ucode/lexer.h b/include/ucode/lexer.h
index 1728aa3..8929731 100644
--- a/include/ucode/lexer.h
+++ b/include/ucode/lexer.h
@@ -138,6 +138,7 @@ typedef struct {
 	uc_tokentype_t type;
 	uc_value_t *uv;
 	size_t pos;
+	size_t end;
 } uc_token_t;
 
 typedef struct {
diff --git a/lexer.c b/lexer.c
index 53f00f5..3e640c6 100644
--- a/lexer.c
+++ b/lexer.c
@@ -145,6 +145,8 @@ emit_op(uc_lexer_t *lex, ssize_t pos, int type, uc_value_t *uv)
 	else
 		lex->curr.pos = (size_t)pos;
 
+	lex->curr.end = lex->source->off;
+
 	return &lex->curr;
 }
 
@@ -338,7 +340,7 @@ parse_escape(uc_lexer_t *lex, const char *regex_macros)
 static uc_token_t *
 parse_string(uc_lexer_t *lex, int kind)
 {
-	uc_token_t *err;
+	uc_token_t *err, *tok;
 	unsigned type;
 	int code, ch;
 	size_t off;
@@ -359,7 +361,10 @@ parse_string(uc_lexer_t *lex, int kind)
 			if (type == TK_TEMPLATE && check_char(lex, '{')) {
 				lex->state = UC_LEX_PLACEHOLDER_START;
 
-				return emit_buffer(lex, off, type, NULL);
+				tok = emit_buffer(lex, off, type, NULL);
+				tok->end -= 2;
+
+				return tok;
 			}
 
 			uc_vector_push(&lex->buffer, '$');
@@ -987,6 +992,8 @@ lex_step(uc_lexer_t *lex)
 			if (!tok)
 				continue;
 
+			tok->end -= 2;
+
 			return tok;
 
 
@@ -1022,7 +1029,7 @@ lex_step(uc_lexer_t *lex)
 			lex->state = UC_LEX_IDENTIFY_TOKEN;
 			lex->block = EXPRESSION;
 
-			return emit_op(lex, lex->source->off, TK_LEXP, NULL);
+			return emit_op(lex, lex->source->off - 2, TK_LEXP, NULL);
 
 
 		case UC_LEX_IDENTIFY_TOKEN:
@@ -1092,7 +1099,10 @@ lex_step(uc_lexer_t *lex)
 		case UC_LEX_PLACEHOLDER_END:
 			lex->state = UC_LEX_IDENTIFY_TOKEN;
 
-			return parse_string(lex, '`');
+			tok = parse_string(lex, '`');
+			tok->pos++;
+
+			return tok;
 
 
 		case UC_LEX_EOF:
-- 
cgit v1.2.3


From 855854f6c2ae2e667f6bbfd5f67caab32b4ebf86 Mon Sep 17 00:00:00 2001
From: Jo-Philipp Wich <jo@mein.io>
Date: Mon, 23 Sep 2024 23:33:47 +0200
Subject: lexer: emit comment and template statement block tokens

Tweak the token stream reported by the lexer in order to make it more useful
for alternative, non-compilation downstream parse processes such as code
intelligence gathering within a language server implementation.

 - Instead of silently discarding source code comments in the lexing phase,
   emit TK_COMMENT tokens, which are useful e.g. for parsing type annotations
   and other structured information.

 - Do not silently discard TK_LSTM tokens but report them to downstream
   parsers instead.

 - Do not silently emit TK_RSTM tokens as TK_SCOL but report them as-is to
   downstream parsers.

 - Adjust the byte code compiler to properly deal with the changed token
   reporting by discarding incoming TK_COMMENT and TK_LSTM tokens and by
   remapping read TK_RSTM tokens to the TK_SCOL type.

Signed-off-by: Jo-Philipp Wich <jo@mein.io>
---
 compiler.c            | 12 +++++++++++-
 include/ucode/lexer.h |  2 ++
 lexer.c               | 26 +++++++++++++++++++-------
 3 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/compiler.c b/compiler.c
index 53e61e7..b64537a 100644
--- a/compiler.c
+++ b/compiler.c
@@ -242,7 +242,17 @@ uc_compiler_parse_advance(uc_compiler_t *compiler)
 	compiler->parser->prev = compiler->parser->curr;
 
 	while (true) {
-		compiler->parser->curr = *uc_lexer_next_token(&compiler->parser->lex);
+		uc_token_t *tok = uc_lexer_next_token(&compiler->parser->lex);
+
+		if (tok->type == TK_COMMENT || tok->type == TK_LSTM) {
+			ucv_put(tok->uv);
+			continue;
+		}
+		else if (tok->type == TK_RSTM) {
+			tok->type = TK_SCOL;
+		}
+
+		compiler->parser->curr = *tok;
 
 		if (compiler->parser->curr.type != TK_ERROR)
 			break;
diff --git a/include/ucode/lexer.h b/include/ucode/lexer.h
index 8929731..8dcba7a 100644
--- a/include/ucode/lexer.h
+++ b/include/ucode/lexer.h
@@ -121,12 +121,14 @@ typedef enum {
 	TK_EXPORT,
 
 	TK_EOF,
+	TK_COMMENT,
 	TK_ERROR
 } uc_tokentype_t;
 
 typedef enum {
 	UC_LEX_IDENTIFY_BLOCK,
 	UC_LEX_BLOCK_EXPRESSION_EMIT_TAG,
+	UC_LEX_BLOCK_STATEMENT_EMIT_TAG,
 	UC_LEX_BLOCK_COMMENT,
 	UC_LEX_IDENTIFY_TOKEN,
 	UC_LEX_PLACEHOLDER_START,
diff --git a/lexer.c b/lexer.c
index 3e640c6..52945dc 100644
--- a/lexer.c
+++ b/lexer.c
@@ -174,16 +174,23 @@ emit_buffer(uc_lexer_t *lex, ssize_t pos, int type, const char *strip_trailing_c
 static uc_token_t *
 parse_comment(uc_lexer_t *lex, int kind)
 {
+	size_t off = lex->source->off - 1;
 	int ch;
 
+	uc_vector_push(&lex->buffer, '/');
+
 	while (true) {
 		ch = next_char(lex);
 
+		uc_vector_push(&lex->buffer, ch);
+
 		if (kind == '/' && (ch == '\n' || ch == EOF))
 			break;
 
-		if (kind == '*' && ch == '*' && check_char(lex, '/'))
+		if (kind == '*' && ch == '*' && check_char(lex, '/')) {
+			uc_vector_push(&lex->buffer, '/');
 			break;
+		}
 
 		if (ch == EOF) {
 			lex->state = UC_LEX_EOF;
@@ -192,7 +199,7 @@ parse_comment(uc_lexer_t *lex, int kind)
 		}
 	}
 
-	return NULL;
+	return emit_buffer(lex, off, TK_COMMENT, NULL);
 }
 
 static void
@@ -957,8 +964,7 @@ lex_step(uc_lexer_t *lex)
 
 					/* found start of statement block */
 					case '%':
-						lex->state = UC_LEX_IDENTIFY_TOKEN;
-						lex->block = STATEMENTS;
+						lex->state = UC_LEX_BLOCK_STATEMENT_EMIT_TAG;
 
 						if (check_char(lex, '-'))
 							strip = " \n\t\v\f\r";
@@ -1019,11 +1025,12 @@ lex_step(uc_lexer_t *lex)
 				return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated template block"));
 			}
 
+			tok = emit_op(lex, lex->lastoff, TK_COMMENT, NULL);
+
 			lex->lastoff = lex->source->off;
 			lex->state = UC_LEX_IDENTIFY_BLOCK;
 
-			continue;
-
+			return tok;
 
 		case UC_LEX_BLOCK_EXPRESSION_EMIT_TAG:
 			lex->state = UC_LEX_IDENTIFY_TOKEN;
@@ -1031,6 +1038,11 @@ lex_step(uc_lexer_t *lex)
 
 			return emit_op(lex, lex->source->off - 2, TK_LEXP, NULL);
 
+		case UC_LEX_BLOCK_STATEMENT_EMIT_TAG:
+			lex->state = UC_LEX_IDENTIFY_TOKEN;
+			lex->block = STATEMENTS;
+
+			return emit_op(lex, lex->source->off - 2, TK_LSTM, NULL);
 
 		case UC_LEX_IDENTIFY_TOKEN:
 			do { tok = lex_find_token(lex); } while (tok == NULL);
@@ -1049,7 +1061,7 @@ lex_step(uc_lexer_t *lex)
 				lex->state = UC_LEX_IDENTIFY_BLOCK;
 				lex->block = NONE;
 
-				tok = emit_op(lex, -2, TK_SCOL, NULL);
+				tok = emit_op(lex, -2, TK_RSTM, NULL);
 			}
 
 			/* found end of expression block */
-- 
cgit v1.2.3


From 2b2e732b6081afb473a2cc698fd4397260b0960c Mon Sep 17 00:00:00 2001
From: Jo-Philipp Wich <jo@mein.io>
Date: Mon, 23 Sep 2024 23:52:10 +0200
Subject: lexer: make api functions public

Make the lexer API functions `uc_lexer_init()`, `uc_lexer_free()` and
`uc_lexer_next_token()` public for use in loadable extensions.

Signed-off-by: Jo-Philipp Wich <jo@mein.io>
---
 include/ucode/lexer.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/ucode/lexer.h b/include/ucode/lexer.h
index 8dcba7a..fd375b8 100644
--- a/include/ucode/lexer.h
+++ b/include/ucode/lexer.h
@@ -177,10 +177,10 @@ typedef struct {
 } uc_lexer_t;
 
 
-__hidden void uc_lexer_init(uc_lexer_t *lex, uc_parse_config_t *config, uc_source_t *source);
-__hidden void uc_lexer_free(uc_lexer_t *lex);
+void uc_lexer_init(uc_lexer_t *lex, uc_parse_config_t *config, uc_source_t *source);
+void uc_lexer_free(uc_lexer_t *lex);
 
-__hidden uc_token_t *uc_lexer_next_token(uc_lexer_t *lex);
+uc_token_t *uc_lexer_next_token(uc_lexer_t *lex);
 
 __hidden bool uc_lexer_is_keyword(uc_value_t *label);
 
-- 
cgit v1.2.3