Merge pull request #73 from jow-/syntax-add-template-strings

syntax: implement support for ES6 template literals
author: Jo-Philipp Wich <jo@mein.io> 2022-04-13 16:37:28 +0200
committer: GitHub <noreply@github.com> 2022-04-13 16:37:28 +0200
commit: 568adbe8d465e200e7d1a5958bbe215e1e04d427 (patch)
tree: c23ec0351ac1f08917604bfa824fa83d0ab5668c
parent: 23ddf91d6380da392c8eea7b7fe12c2cd687b6de (diff)
parent: e14b0993b101839d2d40b5c4f184e6b0c2083b65 (diff)
5 files changed, 209 insertions, 10 deletions
diff --git a/compiler.c b/compiler.c
index 80b873d..d4725b1 100644
--- a/compiler.c
+++ b/compiler.c
@@ -33,6 +33,7 @@ static void uc_compiler_compile_paren(uc_compiler_t *compiler);
 static void uc_compiler_compile_call(uc_compiler_t *compiler);
 static void uc_compiler_compile_post_inc(uc_compiler_t *compiler);
 static void uc_compiler_compile_constant(uc_compiler_t *compiler);
+static void uc_compiler_compile_template(uc_compiler_t *compiler);
 static void uc_compiler_compile_comma(uc_compiler_t *compiler);
 static void uc_compiler_compile_labelexpr(uc_compiler_t *compiler);
 static void uc_compiler_compile_function(uc_compiler_t *compiler);
@@ -72,6 +73,7 @@ uc_compiler_parse_rules[TK_ERROR + 1] = {
 	[TK_NULL]		= { uc_compiler_compile_constant, NULL, P_NONE },
 	[TK_THIS]		= { uc_compiler_compile_constant, NULL, P_NONE },
 	[TK_REGEXP]		= { uc_compiler_compile_constant, NULL, P_NONE },
+	[TK_TEMPLATE]	= { uc_compiler_compile_template, NULL, P_NONE },
 	[TK_COMMA]		= { NULL, uc_compiler_compile_comma, P_COMMA },
 	[TK_LABEL]		= { uc_compiler_compile_labelexpr, NULL, P_NONE },
 	[TK_FUNC]		= { uc_compiler_compile_function, NULL, P_NONE },
@@ -1484,6 +1486,27 @@ uc_compiler_compile_constant(uc_compiler_t *compiler)
 }
 
 static void
+uc_compiler_compile_template(uc_compiler_t *compiler)
+{
+	uc_compiler_emit_constant(compiler, compiler->parser->prev.pos, compiler->parser->prev.uv);
+
+	while (true) {
+		if (uc_compiler_parse_match(compiler, TK_TEMPLATE)) {
+			uc_compiler_emit_constant(compiler, compiler->parser->prev.pos, compiler->parser->prev.uv);
+			uc_compiler_emit_insn(compiler, 0, I_ADD);
+		}
+		else if (uc_compiler_parse_match(compiler, TK_PLACEH)) {
+			uc_compiler_compile_expression(compiler);
+			uc_compiler_emit_insn(compiler, 0, I_ADD);
+			uc_compiler_parse_consume(compiler, TK_RBRACE);
+		}
+		else {
+			break;
+		}
+	}
+}
+
+static void
 uc_compiler_compile_comma(uc_compiler_t *compiler)
 {
 	uc_compiler_emit_insn(compiler, 0, I_POP);
diff --git a/include/ucode/lexer.h b/include/ucode/lexer.h
index 134f5ef..835bc2b 100644
--- a/include/ucode/lexer.h
+++ b/include/ucode/lexer.h
@@ -115,6 +115,8 @@ typedef enum {
 	TK_ASOR,
 	TK_ASNULLISH,
 	TK_NULLISH,
+	TK_PLACEH,
+	TK_TEMPLATE,
 
 	TK_EOF,
 	TK_ERROR
@@ -129,6 +131,7 @@ typedef enum {
 	UC_LEX_BLOCK_COMMENT,
 	UC_LEX_IDENTIFY_TOKEN,
 	UC_LEX_PARSE_TOKEN,
+	UC_LEX_PLACEHOLDER,
 	UC_LEX_EOF
 } uc_lex_state_t;
 
@@ -144,6 +147,7 @@ typedef struct {
 	uc_source_t *source;
 	uint8_t eof:1;
 	uint8_t is_escape:1;
+	uint8_t is_placeholder:1;
 	uint8_t no_regexp:1;
 	uint8_t no_keyword:1;
 	size_t buflen;
@@ -168,6 +172,10 @@ typedef struct {
 		STATEMENTS = '%',
 		COMMENT = '#'
 	} block;
+	struct {
+		size_t count;
+		size_t *entries;
+	} templates;
 } uc_lexer_t;
 
 
diff --git a/include/ucode/util.h b/include/ucode/util.h
index 3203499..093951e 100644
--- a/include/ucode/util.h
+++ b/include/ucode/util.h
@@ -68,6 +68,11 @@
 #define uc_vector_last(vec) \
 	(&((vec)->entries[(vec)->count - 1]))
 
+#define uc_vector_push(vec, val) do { \
+	uc_vector_grow(vec); \
+	(vec)->entries[(vec)->count++] = (val); \
+} while(0)
+
 
 /* "failsafe" utility functions */
 
diff --git a/lexer.c b/lexer.c
index 5fe7f6b..9ccc3ae 100644
--- a/lexer.c
+++ b/lexer.c
@@ -107,6 +107,7 @@ static const struct token tokens[] = {
 	{ TK_ARROW,		{ .pat = "=>" },    2, NULL },
 	{ TK_NULLISH,	{ .pat = "??" },    2, NULL },
 	{ TK_QDOT,		{ .pat = "?." },    2, NULL },
+	{ TK_PLACEH,	{ .pat = "${" },    2, NULL },
 	{ TK_ADD,		{ .pat = "+" },     1, NULL },
 	{ TK_ASSIGN,	{ .pat = "=" },     1, NULL },
 	{ TK_BAND,		{ .pat = "&" },     1, NULL },
@@ -138,6 +139,9 @@ static const struct token tokens[] = {
 	{ TK_LABEL,		{ .pat = "az" },    0, parse_label },
 	{ TK_LABEL,		{ .pat = "AZ" },    0, parse_label },
 	{ TK_NUMBER,	{ .pat = "09" },    0, parse_number },
+
+	/* NB: this must be last for simple retrieval */
+	{ TK_TEMPLATE,	{ .pat = "`" },     1, parse_string }
 };
 
 static const struct keyword reserved_words[] = {
@@ -313,6 +317,22 @@ parse_string(uc_lexer_t *lex)
 		return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated string"));
 
 	for (ptr = lex->bufstart; ptr < lex->bufend; ptr++) {
+		/* continuation of placeholder start */
+		if (lex->is_placeholder) {
+			if (*ptr == '{') {
+				buf_consume(lex, 1);
+				rv = lookbehind_to_text(lex, lex->lastoff, tok->type, NULL);
+
+				if (!rv)
+					rv = emit_op(lex, lex->lastoff, tok->type, ucv_string_new_length("", 0));
+
+				return rv;
+			}
+
+			lex->is_placeholder = false;
+			lookbehind_append(lex, "$", 1);
+		}
+
 		/* continuation of escape sequence */
 		if (lex->is_escape) {
 			if (lex->esclen == 0) {
@@ -486,10 +506,10 @@ parse_string(uc_lexer_t *lex)
 			lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
 			buf_consume(lex, (ptr + 1) - lex->bufstart);
 
-			rv = lookbehind_to_text(lex, lex->lastoff, TK_STRING, NULL);
+			rv = lookbehind_to_text(lex, lex->lastoff, tok->type, NULL);
 
 			if (!rv)
-				rv = emit_op(lex, lex->lastoff, TK_STRING, ucv_string_new_length("", 0));
+				rv = emit_op(lex, lex->lastoff, tok->type, ucv_string_new_length("", 0));
 
 			return rv;
 		}
@@ -500,6 +520,13 @@ parse_string(uc_lexer_t *lex)
 			lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
 			buf_consume(lex, (ptr - lex->bufstart) + 1);
 		}
+
+		/* potential placeholder start */
+		else if (q == '`' && *ptr == '$') {
+			lex->is_placeholder = true;
+			lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
+			buf_consume(lex, (ptr - lex->bufstart) + 1);
+		}
 	}
 
 	lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
@@ -721,7 +748,7 @@ lex_step(uc_lexer_t *lex, FILE *fp)
 	uint32_t masks[] = { 0, le32toh(0x000000ff), le32toh(0x0000ffff), le32toh(0x00ffffff), le32toh(0xffffffff) };
 	union { uint32_t n; char str[4]; } search;
 	const struct token *tok;
-	size_t rlen, rem;
+	size_t rlen, rem, *nest;
 	char *ptr, c;
 	uc_token_t *rv;
 	size_t i;
@@ -966,6 +993,26 @@ lex_step(uc_lexer_t *lex, FILE *fp)
 					lex->block = NONE;
 				}
 
+				/* track opening braces */
+				else if (tok->type == TK_LBRACE && lex->templates.count > 0) {
+					nest = uc_vector_last(&lex->templates);
+					(*nest)++;
+				}
+
+				/* check end of placeholder expression */
+				else if (tok->type == TK_RBRACE && lex->templates.count > 0) {
+					nest = uc_vector_last(&lex->templates);
+
+					if (*nest == 0) {
+						lex->templates.count--;
+						lex->state = UC_LEX_PARSE_TOKEN;
+						lex->tok = &tokens[ARRAY_SIZE(tokens) - 1]; /* NB: TK_TEMPLATE token spec */
+					}
+					else {
+						(*nest)--;
+					}
+				}
+
 				/* do not report statement tags to the parser */
 				if (tok->type != 0 && tok->type != TK_LSTM)
 					rv = emit_op(lex, lex->source->off,
@@ -1001,7 +1048,8 @@ lex_step(uc_lexer_t *lex, FILE *fp)
 
 		if (rv) {
 			memset(lex->esc, 0, sizeof(lex->esc));
-			lex->state = UC_LEX_IDENTIFY_TOKEN;
+			lex->state = lex->is_placeholder ? UC_LEX_PLACEHOLDER : UC_LEX_IDENTIFY_TOKEN;
+			lex->is_placeholder = false;
 			lex->tok = NULL;
 
 			if (rv == UC_LEX_CONTINUE_PARSING)
@@ -1013,6 +1061,14 @@ lex_step(uc_lexer_t *lex, FILE *fp)
 		break;
 
 
+	case UC_LEX_PLACEHOLDER:
+		lex->state = UC_LEX_IDENTIFY_TOKEN;
+
+		uc_vector_push(&lex->templates, 0);
+
+		return emit_op(lex, lex->source->off, TK_PLACEH, NULL);
+
+
 	case UC_LEX_EOF:
 		break;
 	}
@@ -1051,6 +1107,9 @@ uc_lexer_init(uc_lexer_t *lex, uc_parse_config_t *config, uc_source_t *source)
 
 	lex->lastoff = 0;
 
+	lex->templates.count = 0;
+	lex->templates.entries = NULL;
+
 	if (config && config->raw_mode) {
 		lex->state = UC_LEX_IDENTIFY_TOKEN;
 		lex->block = STATEMENTS;
@@ -1060,6 +1119,7 @@ uc_lexer_init(uc_lexer_t *lex, uc_parse_config_t *config, uc_source_t *source)
 void
 uc_lexer_free(uc_lexer_t *lex)
 {
+	uc_vector_clear(&lex->templates);
 	uc_source_put(lex->source);
 
 	free(lex->lookbehind);
@@ -1095,12 +1155,13 @@ uc_tokenname(unsigned type)
 	size_t i;
 
 	switch (type) {
-	case 0:        return "End of file";
-	case TK_STRING: return "String";
-	case TK_LABEL:  return "Label";
-	case TK_NUMBER: return "Number";
-	case TK_DOUBLE: return "Double";
-	case TK_REGEXP: return "Regexp";
+	case 0:           return "End of file";
+	case TK_TEMPLATE: return "Template";
+	case TK_STRING:   return "String";
+	case TK_LABEL:    return "Label";
+	case TK_NUMBER:   return "Number";
+	case TK_DOUBLE:   return "Double";
+	case TK_REGEXP:   return "Regexp";
 	}
 
 	for (i = 0; i < ARRAY_SIZE(tokens); i++) {
diff --git a/tests/custom/00_syntax/27_template_literals b/tests/custom/00_syntax/27_template_literals
new file mode 100644
index 0000000..40fa9ce
--- /dev/null
+++ b/tests/custom/00_syntax/27_template_literals
@@ -0,0 +1,102 @@
+The ucode language supports ES6 template literals for easy interpolation
+of expression results into strings.
+
+
+1. Simple template literals are equivalent to strings.
+
+-- Testcase --
+{{ `foo` === 'foo' }}
+-- End --
+
+-- Expect stdout --
+true
+-- End --
+
+
+2. Template literals may embed expressions using `${...}` placeholder notation.
+
+-- Testcase --
+{%
+	let x = 2;
+	let y = 4;
+
+	print(`The result of ${x} * ${y} is ${x * y}\n`);
+%}
+-- End --
+
+-- Expect stdout --
+The result of 2 * 4 is 8
+-- End --
+
+
+3. Template literals may be nested.
+
+-- Testcase --
+{%
+	let isFoo = false;
+	let isBar = true;
+
+	print(`Foo is ${isFoo} and ${isBar ? `bar is ${isBar}` : `nothing else`}!\n`);
+%}
+-- End --
+
+-- Expect stdout --
+Foo is false and bar is true!
+-- End --
+
+
+4. Placeholder expression results are implicitly stringified.
+
+-- Testcase --
+{%
+	let o1 = { foo: true };
+	let o2 = proto({ color: "red" }, { tostring: function() { return `I am a ${this.color} object` } });
+
+	print(`The first object is ${o1} and the second says "${o2}".\n`);
+%}
+-- End --
+
+-- Expect stdout --
+The first object is { "foo": true } and the second says "I am a red object".
+-- End --
+
+
+5. Escaping either `$` or `{` prevents interpolation as a placeholder; sole `$`
+   characters bear no special meaning.
+
+-- Testcase --
+{%
+	printf("%.J\n", [
+		`foo \${bar} baz`,
+		`foo $\{bar} baz`,
+		`foo $bar baz`
+	]);
+%}
+-- End --
+
+-- Expect stdout --
+[
+	"foo ${bar} baz",
+	"foo ${bar} baz",
+	"foo $bar baz"
+]
+-- End --
+
+
+6. Unterminated placeholder expressions are a syntax error.
+
+-- Testcase --
+{{
+	`foo ${ bar`
+}}
+-- End --
+
+-- Expect stderr --
+Syntax error: Unterminated string
+In line 2, byte 13:
+
+ `    `foo ${ bar``
+  Near here -----^
+
+
+-- End --
author	Jo-Philipp Wich <jo@mein.io>	2022-04-13 16:37:28 +0200
committer	GitHub <noreply@github.com>	2022-04-13 16:37:28 +0200
commit	568adbe8d465e200e7d1a5958bbe215e1e04d427 (patch)
tree	c23ec0351ac1f08917604bfa824fa83d0ab5668c
parent	23ddf91d6380da392c8eea7b7fe12c2cd687b6de (diff)
parent	e14b0993b101839d2d40b5c4f184e6b0c2083b65 (diff)