lexer: improve regex literal handling

- Do not treat slashes within bracket expressions as delimiters - Do not escape slashes when stringifying regex sources - Allow all escape sequence types in regex literals Signed-off-by: Jo-Philipp Wich <jo@mein.io>
author: Jo-Philipp Wich <jo@mein.io> 2022-10-02 12:57:50 +0200
committer: Jo-Philipp Wich <jo@mein.io> 2022-10-04 21:14:31 +0200
commit: a45f2a388efb649e0373a45c6db1d009dc18072d (patch)
tree: fceccd7bb14bc6cca3021817530b1f52f88fae2f /lexer.c
parent: d64d5d685d86b38dda8a314b7d1404633e26b346 (diff)
1 files changed, 158 insertions, 83 deletions
diff --git a/lexer.c b/lexer.c
index 7c7788a..786e495 100644
--- a/lexer.c
+++ b/lexer.c
@@ -209,10 +209,118 @@ append_utf8(uc_lexer_t *lex, int code) {
 }
 
 static uc_token_t *
-parse_string(uc_lexer_t *lex, int kind)
+parse_escape(uc_lexer_t *lex, const char *retain)
 {
 	int code, ch, i;
+	/* decode the escape that follows an already consumed backslash and append the result to lex->buffer; returns NULL on success, otherwise the TK_ERROR result of emit_op() */
+	/* unicode escape sequence: 'u' followed by exactly four hex digits */
+	if (check_char(lex, 'u')) {
+		for (i = 0, code = 0; i < 4; i++) {
+			ch = next_char(lex);
+
+			if (!isxdigit(ch))
+				return emit_op(lex, -1, TK_ERROR, ucv_string_new("Invalid escape sequence"));
+
+			code = code * 16 + hex(ch);
+		}
+
+		/* is a leading surrogate value (0xD800..0xDBFF) */
+		if ((code & 0xFC00) == 0xD800) {
+			/* found a subsequent leading surrogate, ignore and emit replacement char for previous one */
+			if (lex->lead_surrogate)
+				append_utf8(lex, 0xFFFD);
+
+			/* store surrogate value and advance to next escape sequence; the new lead surrogate itself is not appended here */
+			lex->lead_surrogate = code;
+		}
+
+		/* is a trailing surrogate value (0xDC00..0xDFFF) */
+		else if ((code & 0xFC00) == 0xDC00) {
+			/* found a trailing surrogate following a leading one, combine both 10 bit halves into one codepoint and clear the pending lead */
+			if (lex->lead_surrogate) {
+				code = 0x10000 + ((lex->lead_surrogate & 0x3FF) << 10) + (code & 0x3FF);
+				lex->lead_surrogate = 0;
+			}
+
+			/* trailing surrogate not following a leading one, ignore and use replacement char */
+			else {
+				code = 0xFFFD;
+			}
+
+			append_utf8(lex, code);
+		}
+
+		/* is a normal codepoint; NOTE(review): a pending lex->lead_surrogate is neither flushed nor cleared on this path - confirm it is handled by the caller */
+		else {
+			append_utf8(lex, code);
+		}
+	}
+
+	/* hex escape sequence: 'x' followed by exactly two hex digits */
+	else if (check_char(lex, 'x')) {
+		for (i = 0, code = 0; i < 2; i++) {
+			ch = next_char(lex);
+
+			if (!isxdigit(ch))
+				return emit_op(lex, -1, TK_ERROR, ucv_string_new("Invalid escape sequence"));
+
+			code = code * 16 + hex(ch);
+		}
+
+		append_utf8(lex, code);
+	}
+
+	/* octal or letter */
+	else {
+		/* try to parse octal sequence: up to three digits 0-7, each one is only consumed after lookahead confirmed it */
+		for (i = 0, code = 0, ch = lookahead_char(lex);
+		     i < 3 && ch >= '0' && ch <= '7';
+		     i++, next_char(lex), ch = lookahead_char(lex)) {
+			code = code * 8 + dec(ch);
+		}
+
+		if (i) { /* at least one octal digit was consumed */
+			if (code > 255) /* three digits can encode up to 0777, values above 255 are rejected */
+				return emit_op(lex, -3, TK_ERROR, ucv_string_new("Invalid escape sequence"));
+
+			append_utf8(lex, code);
+		}
+
+		/* ... no octal sequence, handle other escape: consume one char and map known letters to their control char */
+		else {
+			ch = next_char(lex);
+
+			switch (ch) {
+			case 'a': uc_vector_push(&lex->buffer, '\a'); break;
+			case 'b': uc_vector_push(&lex->buffer, '\b'); break;
+			case 'e': uc_vector_push(&lex->buffer, '\033'); break;
+			case 'f': uc_vector_push(&lex->buffer, '\f'); break;
+			case 'n': uc_vector_push(&lex->buffer, '\n'); break;
+			case 'r': uc_vector_push(&lex->buffer, '\r'); break;
+			case 't': uc_vector_push(&lex->buffer, '\t'); break;
+			case 'v': uc_vector_push(&lex->buffer, '\v'); break;
+
+			case EOF: /* input ended right after the backslash */
+				return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unterminated string"));
+
+			default: /* any other char is taken literally; it keeps its backslash only when listed in retain */
+				if (strchr(retain, ch)) /* NOTE(review): strchr() also matches the terminating NUL, so ch == 0 would retain the backslash - confirm whether next_char() can yield 0 */
+					uc_vector_push(&lex->buffer, '\\');
+
+				uc_vector_push(&lex->buffer, ch);
+			}
+		}
+	}
+
+	return NULL;
+}
+
+static uc_token_t *
+parse_string(uc_lexer_t *lex, int kind)
+{
+	uc_token_t *err;
 	unsigned type;
+	int code, ch;
 	size_t off;
 
 	if (kind == '`')
@@ -237,107 +345,74 @@ parse_string(uc_lexer_t *lex, int kind)
 			uc_vector_push(&lex->buffer, '$');
 			break;
 
-		/* escape sequence */
-		case '\\':
-			/* unicode escape sequence */
-			if (type != TK_REGEXP && check_char(lex, 'u')) {
-				for (i = 0, code = 0; i < 4; i++) {
-					ch = next_char(lex);
-
-					if (!isxdigit(ch))
-						return emit_op(lex, -1, TK_ERROR, ucv_string_new("Invalid escape sequence"));
+		/* regexp bracket expression */
+		case '[':
+			uc_vector_push(&lex->buffer, '[');
 
-					code = code * 16 + hex(ch);
-				}
+			if (type == TK_REGEXP) {
+				/* skip leading negation (^) */
+				if (check_char(lex, '^'))
+					uc_vector_push(&lex->buffer, '^');
 
-				/* is a leading surrogate value */
-				if ((code & 0xFC00) == 0xD800) {
-					/* found a subsequent leading surrogate, ignore and emit replacement char for previous one */
-					if (lex->lead_surrogate)
-						append_utf8(lex, 0xFFFD);
+				/* skip leading `]` - it is literal and not closing the bracket expr */
+				if (check_char(lex, ']'))
+					uc_vector_push(&lex->buffer, ']');
 
-					/* store surrogate value and advance to next escape sequence */
-					lex->lead_surrogate = code;
-				}
+				/* read until closing `]` */
+				for (ch = next_char(lex); ch != EOF; ch = next_char(lex)) {
+					if (ch == '\\') {
+						err = parse_escape(lex, "^");
 
-				/* is a trailing surrogate value */
-				else if ((code & 0xFC00) == 0xDC00) {
-					/* found a trailing surrogate following a leading one, combine and encode */
-					if (lex->lead_surrogate) {
-						code = 0x10000 + ((lex->lead_surrogate & 0x3FF) << 10) + (code & 0x3FF);
-						lex->lead_surrogate = 0;
-					}
+						if (err)
+							return err;
 
-					/* trailing surrogate not following a leading one, ignore and use replacement char */
-					else {
-						code = 0xFFFD;
+						continue;
 					}
 
-					append_utf8(lex, code);
-				}
+					uc_vector_push(&lex->buffer, ch);
 
-				/* is a normal codepoint */
-				else {
-					append_utf8(lex, code);
-				}
-			}
+					if (ch == ']')
+						break;
 
-			/* hex escape sequence */
-			else if (type != TK_REGEXP && check_char(lex, 'x')) {
-				for (i = 0, code = 0; i < 2; i++) {
-					ch = next_char(lex);
+					/* skip nested char classes / equivalence classes / collating chars */
+					if (ch == '[') {
+						code = lookahead_char(lex);
 
-					if (!isxdigit(ch))
-						return emit_op(lex, -1, TK_ERROR, ucv_string_new("Invalid escape sequence"));
+						if (code == ':' || code == '.' || code == '=') {
+							uc_vector_push(&lex->buffer, code);
+							next_char(lex);
 
-					code = code * 16 + hex(ch);
-				}
+							for (ch = next_char(lex); ch != EOF; ch = next_char(lex)) {
+								if (ch == '\\') {
+									err = parse_escape(lex, "");
 
-				append_utf8(lex, code);
-			}
+									if (err)
+										return err;
 
-			/* octal or letter */
-			else {
-				/* try to parse octal sequence... */
-				for (i = 0, code = 0, ch = lookahead_char(lex);
-				     kind != '/' && i < 3 && ch >= '0' && ch <= '7';
-				     i++, next_char(lex), ch = lookahead_char(lex)) {
-					code = code * 8 + dec(ch);
-				}
+									continue;
+								}
 
-				if (i) {
-					if (code > 255)
-						return emit_op(lex, -3, TK_ERROR, ucv_string_new("Invalid escape sequence"));
+								uc_vector_push(&lex->buffer, ch);
 
-					append_utf8(lex, code);
+								if (ch == code && check_char(lex, ']')) {
+									uc_vector_push(&lex->buffer, ']');
+									break;
+								}
+							}
+						}
+					}
 				}
+			}
 
-				/* ... no octal sequence, handle other escape */
-				else {
-					ch = next_char(lex);
-
-					switch (ch) {
-					case 'a': uc_vector_push(&lex->buffer, '\a'); break;
-					case 'b': uc_vector_push(&lex->buffer, '\b'); break;
-					case 'e': uc_vector_push(&lex->buffer, '\033'); break;
-					case 'f': uc_vector_push(&lex->buffer, '\f'); break;
-					case 'n': uc_vector_push(&lex->buffer, '\n'); break;
-					case 'r': uc_vector_push(&lex->buffer, '\r'); break;
-					case 't': uc_vector_push(&lex->buffer, '\t'); break;
-					case 'v': uc_vector_push(&lex->buffer, '\v'); break;
-
-					case EOF:
-						return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unterminated string"));
+			break;
 
-					default:
-						/* regex mode => retain backslash */
-						if (type == TK_REGEXP)
-							uc_vector_push(&lex->buffer, '\\');
+		/* escape sequence */
+		case '\\':
+			err = parse_escape(lex,
+				(type == TK_REGEXP) ? "^.[$()|*+?{\\" : "");
 
-						uc_vector_push(&lex->buffer, ch);
-					}
-				}
-			}
+			if (err)
+				return err;
 
 			break;
author	Jo-Philipp Wich <jo@mein.io>	2022-10-02 12:57:50 +0200
committer	Jo-Philipp Wich <jo@mein.io>	2022-10-04 21:14:31 +0200
commit	a45f2a388efb649e0373a45c6db1d009dc18072d (patch)
tree	fceccd7bb14bc6cca3021817530b1f52f88fae2f /lexer.c
parent	d64d5d685d86b38dda8a314b7d1404633e26b346 (diff)