summary | refs | log | tree | commit | diff | homepage
diff options
context:
space:
mode:
-rw-r--r-- lexer.c 34
-rw-r--r-- tests/custom/00_syntax/21_regex_literals 29
2 files changed, 56 insertions, 7 deletions
diff --git a/lexer.c b/lexer.c
index 786e495..b6f082d 100644
--- a/lexer.c
+++ b/lexer.c
@@ -209,9 +209,10 @@ append_utf8(uc_lexer_t *lex, int code) {
}
static uc_token_t *
-parse_escape(uc_lexer_t *lex, const char *retain)
+parse_escape(uc_lexer_t *lex, const char *regex_macros)
{
int code, ch, i;
+ const char *p;
/* unicode escape sequence */
if (check_char(lex, 'u')) {
@@ -286,7 +287,31 @@ parse_escape(uc_lexer_t *lex, const char *retain)
append_utf8(lex, code);
}
- /* ... no octal sequence, handle other escape */
+ /* ... no octal sequence, handle potential regex macros */
+ else if (strchr(regex_macros, ch)) {
+ ch = next_char(lex);
+
+ switch (ch) {
+ case 'd': p = "[[:digit:]]"; break;
+ case 'D': p = "[^[:digit:]]"; break;
+ case 'w': p = "[[:alnum:]_]"; break;
+ case 'W': p = "[^[:alnum:]_]"; break;
+ case 's': p = "[[:space:]]"; break;
+ case 'S': p = "[^[:space:]]"; break;
+ default: p = NULL;
+ }
+
+ if (p) {
+ while (*p)
+ uc_vector_push(&lex->buffer, *p++);
+ }
+ else {
+ uc_vector_push(&lex->buffer, '\\');
+ uc_vector_push(&lex->buffer, ch);
+ }
+ }
+
+ /* ... handle other escape */
else {
ch = next_char(lex);
@@ -304,9 +329,6 @@ parse_escape(uc_lexer_t *lex, const char *retain)
return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unterminated string"));
default:
- if (strchr(retain, ch))
- uc_vector_push(&lex->buffer, '\\');
-
uc_vector_push(&lex->buffer, ch);
}
}
@@ -409,7 +431,7 @@ parse_string(uc_lexer_t *lex, int kind)
/* escape sequence */
case '\\':
err = parse_escape(lex,
- (type == TK_REGEXP) ? "^.[$()|*+?{\\" : "");
+ (type == TK_REGEXP) ? "^bBdDsSwW<>.[$()|*+?{\\" : "");
if (err)
return err;
diff --git a/tests/custom/00_syntax/21_regex_literals b/tests/custom/00_syntax/21_regex_literals
index 7466a2e..44f0079 100644
--- a/tests/custom/00_syntax/21_regex_literals
+++ b/tests/custom/00_syntax/21_regex_literals
@@ -4,7 +4,7 @@ within regular expression literals is subject of the underlying
regular expression engine.
-- Expect stdout --
-[ "/Hello world/", "/test/gis", "/test/g", "/test1 / test2/", "/1\n\\.\u0007\bc☀\\\\/" ]
+[ "/Hello world/", "/test/gis", "/test/g", "/test1 / test2/", "/1\n\\.\u0007\\bc☀\\\\/" ]
-- End --
-- Testcase --
@@ -117,3 +117,30 @@ literal delimitters.
]);
%}
-- End --
+
+
+Testing that regex extension macros are substituted only outside of
+bracket set expressions.
+
+-- Expect stdout --
+[
+ "/ \\b \\B [\b B] /",
+ "/ \\< \\> [< >] /",
+ "/ [[:digit:]] [^[:digit:]] [d D] /",
+ "/ [[:space:]] [^[:space:]] [s S] /",
+ "/ [[:alnum:]_] [^[:alnum:]_] [w W] /"
+]
+-- End --
+
+-- Testcase --
+{%
+ printf("%.J\n", [
+ / \b \B [\b \B] /, // \b outside brackets is a word boundary,
+ // \b within brackets is backspace
+ / \< \> [\< \>] /,
+ / \d \D [\d \D] /,
+ / \s \S [\s \S] /,
+ / \w \W [\w \W] /
+ ]);
+%}
+-- End --