From 21ace5e5c7c98271d78ff9cdf2b61e3ac70704d8 Mon Sep 17 00:00:00 2001
From: Jo-Philipp Wich <jo@mein.io>
Date: Wed, 5 Oct 2022 21:34:59 +0200
Subject: lexer: fixes for regex literal parsing

 - Ensure that regexp extension escapes are handled consistently:
   substitute `\d`, `\D`, `\s`, `\S`, `\w` and `\W` with the bracket
   expressions `[[:digit:]]`, `[^[:digit:]]`, `[[:space:]]`,
   `[^[:space:]]`, `[[:alnum:]_]` and `[^[:alnum:]_]` respectively,
   since not all POSIX regexp implementations support these extensions.
 - Preserve the `\b`, `\B`, `\<` and `\>` boundary matches by keeping
   their leading backslash in the regexp source.

Fixes: a45f2a3 ("lexer: improve regex literal handling")
Signed-off-by: Jo-Philipp Wich <jo@mein.io>
---
 lexer.c                                  | 34 ++++++++++++++++++++++++++------
 tests/custom/00_syntax/21_regex_literals | 29 ++++++++++++++++++++++++++-
 2 files changed, 56 insertions(+), 7 deletions(-)

diff --git a/lexer.c b/lexer.c
index 786e495..b6f082d 100644
--- a/lexer.c
+++ b/lexer.c
@@ -209,9 +209,10 @@ append_utf8(uc_lexer_t *lex, int code) {
 }
 
 static uc_token_t *
-parse_escape(uc_lexer_t *lex, const char *retain)
+parse_escape(uc_lexer_t *lex, const char *regex_macros)
 {
 	int code, ch, i;
+	const char *p;
 
 	/* unicode escape sequence */
 	if (check_char(lex, 'u')) {
@@ -286,7 +287,31 @@ parse_escape(uc_lexer_t *lex, const char *retain)
 			append_utf8(lex, code);
 		}
 
-		/* ... no octal sequence, handle other escape */
+		/* ... no octal sequence, handle potential regex macros */
+		else if (strchr(regex_macros, ch)) {
+			ch = next_char(lex);
+
+			switch (ch) {
+			case 'd': p = "[[:digit:]]";   break;
+			case 'D': p = "[^[:digit:]]";  break;
+			case 'w': p = "[[:alnum:]_]";  break;
+			case 'W': p = "[^[:alnum:]_]"; break;
+			case 's': p = "[[:space:]]";   break;
+			case 'S': p = "[^[:space:]]";  break;
+			default:  p = NULL;
+			}
+
+			if (p) {
+				while (*p)
+					uc_vector_push(&lex->buffer, *p++);
+			}
+			else {
+				uc_vector_push(&lex->buffer, '\\');
+				uc_vector_push(&lex->buffer, ch);
+			}
+		}
+
+		/* ... handle other escape */
 		else {
 			ch = next_char(lex);
 
@@ -304,9 +329,6 @@ parse_escape(uc_lexer_t *lex, const char *retain)
 				return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unterminated string"));
 
 			default:
-				if (strchr(retain, ch))
-					uc_vector_push(&lex->buffer, '\\');
-
 				uc_vector_push(&lex->buffer, ch);
 			}
 		}
@@ -409,7 +431,7 @@ parse_string(uc_lexer_t *lex, int kind)
 		/* escape sequence */
 		case '\\':
 			err = parse_escape(lex,
-				(type == TK_REGEXP) ? "^.[$()|*+?{\\" : "");
+				(type == TK_REGEXP) ? "^bBdDsSwW<>.[$()|*+?{\\" : "");
 
 			if (err)
 				return err;
diff --git a/tests/custom/00_syntax/21_regex_literals b/tests/custom/00_syntax/21_regex_literals
index 7466a2e..44f0079 100644
--- a/tests/custom/00_syntax/21_regex_literals
+++ b/tests/custom/00_syntax/21_regex_literals
@@ -4,7 +4,7 @@ within regular expression literals is subject of the underlying
 regular expression engine.
 
 -- Expect stdout --
-[ "/Hello world/", "/test/gis", "/test/g", "/test1 / test2/", "/1\n\\.\u0007\bc☀\\\\/" ]
+[ "/Hello world/", "/test/gis", "/test/g", "/test1 / test2/", "/1\n\\.\u0007\\bc☀\\\\/" ]
 -- End --
 
 -- Testcase --
@@ -117,3 +117,30 @@ literal delimitters.
 	]);
 %}
 -- End --
+
+
+Testing that regex extension macros are substituted only outside of
+bracket set expressions.
+
+-- Expect stdout --
+[
+	"/ \\b \\B [\b B] /",
+	"/ \\< \\> [< >] /",
+	"/ [[:digit:]] [^[:digit:]] [d D] /",
+	"/ [[:space:]] [^[:space:]] [s S] /",
+	"/ [[:alnum:]_] [^[:alnum:]_] [w W] /"
+]
+-- End --
+
+-- Testcase --
+{%
+	printf("%.J\n", [
+		/ \b \B [\b \B] /,   // \b outside brackets is a word boundary,
+		                     // \b within brackets is backspace
+		/ \< \> [\< \>] /,
+		/ \d \D [\d \D] /,
+		/ \s \S [\s \S] /,
+		/ \w \W [\w \W] /
+	]);
+%}
+-- End --
-- 
cgit v1.2.3