From 328a50ff82c9bf089dcd381d404dece683ef54d2 Mon Sep 17 00:00:00 2001 From: Jo-Philipp Wich Date: Mon, 23 Sep 2024 23:20:12 +0200 Subject: lexer: improve token position reporting - Report end position for emitted tokens. This is required to reliably determine the token length, e.g. for downstream code intelligence use cases - Fix start offset of continued template literal string tokens. Previously the start offset of a literal string following a `${...}` placeholder expression was shifted by one byte - Report proper start offset of `TK_LEXP` tokens. Signed-off-by: Jo-Philipp Wich --- include/ucode/lexer.h | 1 + lexer.c | 18 ++++++++++++++---- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/include/ucode/lexer.h b/include/ucode/lexer.h index 1728aa3..8929731 100644 --- a/include/ucode/lexer.h +++ b/include/ucode/lexer.h @@ -138,6 +138,7 @@ typedef struct { uc_tokentype_t type; uc_value_t *uv; size_t pos; + size_t end; } uc_token_t; typedef struct { diff --git a/lexer.c b/lexer.c index 53f00f5..3e640c6 100644 --- a/lexer.c +++ b/lexer.c @@ -145,6 +145,8 @@ emit_op(uc_lexer_t *lex, ssize_t pos, int type, uc_value_t *uv) else lex->curr.pos = (size_t)pos; + lex->curr.end = lex->source->off; + return &lex->curr; } @@ -338,7 +340,7 @@ parse_escape(uc_lexer_t *lex, const char *regex_macros) static uc_token_t * parse_string(uc_lexer_t *lex, int kind) { - uc_token_t *err; + uc_token_t *err, *tok; unsigned type; int code, ch; size_t off; @@ -359,7 +361,10 @@ parse_string(uc_lexer_t *lex, int kind) if (type == TK_TEMPLATE && check_char(lex, '{')) { lex->state = UC_LEX_PLACEHOLDER_START; - return emit_buffer(lex, off, type, NULL); + tok = emit_buffer(lex, off, type, NULL); + tok->end -= 2; + + return tok; } uc_vector_push(&lex->buffer, '$'); @@ -987,6 +992,8 @@ lex_step(uc_lexer_t *lex) if (!tok) continue; + tok->end -= 2; + return tok; @@ -1022,7 +1029,7 @@ lex_step(uc_lexer_t *lex) lex->state = UC_LEX_IDENTIFY_TOKEN; lex->block = EXPRESSION; - return 
emit_op(lex, lex->source->off, TK_LEXP, NULL); + return emit_op(lex, lex->source->off - 2, TK_LEXP, NULL); case UC_LEX_IDENTIFY_TOKEN: @@ -1092,7 +1099,10 @@ lex_step(uc_lexer_t *lex) case UC_LEX_PLACEHOLDER_END: lex->state = UC_LEX_IDENTIFY_TOKEN; - return parse_string(lex, '`'); + tok = parse_string(lex, '`'); + tok->pos++; + + return tok; case UC_LEX_EOF: -- cgit v1.2.3 From 855854f6c2ae2e667f6bbfd5f67caab32b4ebf86 Mon Sep 17 00:00:00 2001 From: Jo-Philipp Wich Date: Mon, 23 Sep 2024 23:33:47 +0200 Subject: lexer: emit comment and template statement block tokens Tweak the token stream reported by the lexer in order to make it more useful for alternative, non-compilation downstream parse processes such as code intelligence gathering within a language server implementation. - Instead of silently discarding source code comments in the lexing phase, emit TK_COMMENT tokens which is useful to e.g. parse type annotations and other structured information. - Do not silently discard TK_LSTM tokens but report them to downstream parsers instead. - Do not silently emit TK_RSTM tokens as TK_SCOL but report them as-is to downstream parsers. - Adjust the byte code compiler to properly deal with the changed token reporting by discarding incoming TK_COMMENT and TK_LSTM tokens and by remapping read TK_RSTM tokens to the TK_SCOL type. 
Signed-off-by: Jo-Philipp Wich --- compiler.c | 12 +++++++++++- include/ucode/lexer.h | 2 ++ lexer.c | 26 +++++++++++++++++++------- 3 files changed, 32 insertions(+), 8 deletions(-) diff --git a/compiler.c b/compiler.c index 53e61e7..b64537a 100644 --- a/compiler.c +++ b/compiler.c @@ -242,7 +242,17 @@ uc_compiler_parse_advance(uc_compiler_t *compiler) compiler->parser->prev = compiler->parser->curr; while (true) { - compiler->parser->curr = *uc_lexer_next_token(&compiler->parser->lex); + uc_token_t *tok = uc_lexer_next_token(&compiler->parser->lex); + + if (tok->type == TK_COMMENT || tok->type == TK_LSTM) { + ucv_put(tok->uv); + continue; + } + else if (tok->type == TK_RSTM) { + tok->type = TK_SCOL; + } + + compiler->parser->curr = *tok; if (compiler->parser->curr.type != TK_ERROR) break; diff --git a/include/ucode/lexer.h b/include/ucode/lexer.h index 8929731..8dcba7a 100644 --- a/include/ucode/lexer.h +++ b/include/ucode/lexer.h @@ -121,12 +121,14 @@ typedef enum { TK_EXPORT, TK_EOF, + TK_COMMENT, TK_ERROR } uc_tokentype_t; typedef enum { UC_LEX_IDENTIFY_BLOCK, UC_LEX_BLOCK_EXPRESSION_EMIT_TAG, + UC_LEX_BLOCK_STATEMENT_EMIT_TAG, UC_LEX_BLOCK_COMMENT, UC_LEX_IDENTIFY_TOKEN, UC_LEX_PLACEHOLDER_START, diff --git a/lexer.c b/lexer.c index 3e640c6..52945dc 100644 --- a/lexer.c +++ b/lexer.c @@ -174,16 +174,23 @@ emit_buffer(uc_lexer_t *lex, ssize_t pos, int type, const char *strip_trailing_c static uc_token_t * parse_comment(uc_lexer_t *lex, int kind) { + size_t off = lex->source->off - 1; int ch; + uc_vector_push(&lex->buffer, '/'); + while (true) { ch = next_char(lex); + uc_vector_push(&lex->buffer, ch); + if (kind == '/' && (ch == '\n' || ch == EOF)) break; - if (kind == '*' && ch == '*' && check_char(lex, '/')) + if (kind == '*' && ch == '*' && check_char(lex, '/')) { + uc_vector_push(&lex->buffer, '/'); break; + } if (ch == EOF) { lex->state = UC_LEX_EOF; @@ -192,7 +199,7 @@ parse_comment(uc_lexer_t *lex, int kind) } } - return NULL; + return emit_buffer(lex, 
off, TK_COMMENT, NULL); } static void @@ -957,8 +964,7 @@ lex_step(uc_lexer_t *lex) /* found start of statement block */ case '%': - lex->state = UC_LEX_IDENTIFY_TOKEN; - lex->block = STATEMENTS; + lex->state = UC_LEX_BLOCK_STATEMENT_EMIT_TAG; if (check_char(lex, '-')) strip = " \n\t\v\f\r"; @@ -1019,11 +1025,12 @@ lex_step(uc_lexer_t *lex) return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated template block")); } + tok = emit_op(lex, lex->lastoff, TK_COMMENT, NULL); + lex->lastoff = lex->source->off; lex->state = UC_LEX_IDENTIFY_BLOCK; - continue; - + return tok; case UC_LEX_BLOCK_EXPRESSION_EMIT_TAG: lex->state = UC_LEX_IDENTIFY_TOKEN; @@ -1031,6 +1038,11 @@ lex_step(uc_lexer_t *lex) return emit_op(lex, lex->source->off - 2, TK_LEXP, NULL); + case UC_LEX_BLOCK_STATEMENT_EMIT_TAG: + lex->state = UC_LEX_IDENTIFY_TOKEN; + lex->block = STATEMENTS; + + return emit_op(lex, lex->source->off - 2, TK_LSTM, NULL); case UC_LEX_IDENTIFY_TOKEN: do { tok = lex_find_token(lex); } while (tok == NULL); @@ -1049,7 +1061,7 @@ lex_step(uc_lexer_t *lex) lex->state = UC_LEX_IDENTIFY_BLOCK; lex->block = NONE; - tok = emit_op(lex, -2, TK_SCOL, NULL); + tok = emit_op(lex, -2, TK_RSTM, NULL); } /* found end of expression block */ -- cgit v1.2.3 From 2b2e732b6081afb473a2cc698fd4397260b0960c Mon Sep 17 00:00:00 2001 From: Jo-Philipp Wich Date: Mon, 23 Sep 2024 23:52:10 +0200 Subject: lexer: make api functions public Make the lexer API functions `uc_lexer_init()`, `uc_lexer_free()` and `uc_lexer_next_token()` public for use in loadable extensions. 
Signed-off-by: Jo-Philipp Wich --- include/ucode/lexer.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/ucode/lexer.h b/include/ucode/lexer.h index 8dcba7a..fd375b8 100644 --- a/include/ucode/lexer.h +++ b/include/ucode/lexer.h @@ -177,10 +177,10 @@ typedef struct { } uc_lexer_t; -__hidden void uc_lexer_init(uc_lexer_t *lex, uc_parse_config_t *config, uc_source_t *source); -__hidden void uc_lexer_free(uc_lexer_t *lex); +void uc_lexer_init(uc_lexer_t *lex, uc_parse_config_t *config, uc_source_t *source); +void uc_lexer_free(uc_lexer_t *lex); -__hidden uc_token_t *uc_lexer_next_token(uc_lexer_t *lex); +uc_token_t *uc_lexer_next_token(uc_lexer_t *lex); __hidden bool uc_lexer_is_keyword(uc_value_t *label); -- cgit v1.2.3