summaryrefslogtreecommitdiffhomepage
path: root/lexer.c
diff options
context:
space:
mode:
author: Jo-Philipp Wich <jo@mein.io>, 2022-10-05 21:34:59 +0200
committer: Jo-Philipp Wich <jo@mein.io>, 2022-10-05 23:01:05 +0200
commit: 21ace5e5c7c98271d78ff9cdf2b61e3ac70704d8 (patch)
tree: bc2f084916c574acb4c6e10ddf5146a796c4b7f7 /lexer.c
parent: f8e00b4b83dad76e183b8293870cfe3110f1fa94 (diff)
lexer: fixes for regex literal parsing
- Ensure that regexp extension escapes are handled consistently: substitute
  `\d`, `\D`, `\s`, `\S`, `\w` and `\W` with the `[[:digit:]]`, `[^[:digit:]]`,
  `[[:space:]]`, `[^[:space:]]`, `[[:alnum:]_]` and `[^[:alnum:]_]` character
  classes respectively, since not all POSIX regexp implementations support
  all of those extensions.
- Preserve the `\b`, `\B`, `\<` and `\>` boundary matches.

Fixes: a45f2a3 ("lexer: improve regex literal handling")
Signed-off-by: Jo-Philipp Wich <jo@mein.io>
Diffstat (limited to 'lexer.c')
-rw-r--r--lexer.c34
1 files changed, 28 insertions, 6 deletions
diff --git a/lexer.c b/lexer.c
index 786e495..b6f082d 100644
--- a/lexer.c
+++ b/lexer.c
@@ -209,9 +209,10 @@ append_utf8(uc_lexer_t *lex, int code) {
}
static uc_token_t *
-parse_escape(uc_lexer_t *lex, const char *retain)
+parse_escape(uc_lexer_t *lex, const char *regex_macros)
{
int code, ch, i;
+ const char *p;
/* unicode escape sequence */
if (check_char(lex, 'u')) {
@@ -286,7 +287,31 @@ parse_escape(uc_lexer_t *lex, const char *retain)
append_utf8(lex, code);
}
- /* ... no octal sequence, handle other escape */
+ /* ... no octal sequence, handle potential regex macros */
+ else if (strchr(regex_macros, ch)) {
+ ch = next_char(lex);
+
+ switch (ch) {
+ case 'd': p = "[[:digit:]]"; break;
+ case 'D': p = "[^[:digit:]]"; break;
+ case 'w': p = "[[:alnum:]_]"; break;
+ case 'W': p = "[^[:alnum:]_]"; break;
+ case 's': p = "[[:space:]]"; break;
+ case 'S': p = "[^[:space:]]"; break;
+ default: p = NULL;
+ }
+
+ if (p) {
+ while (*p)
+ uc_vector_push(&lex->buffer, *p++);
+ }
+ else {
+ uc_vector_push(&lex->buffer, '\\');
+ uc_vector_push(&lex->buffer, ch);
+ }
+ }
+
+ /* ... handle other escape */
else {
ch = next_char(lex);
@@ -304,9 +329,6 @@ parse_escape(uc_lexer_t *lex, const char *retain)
return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unterminated string"));
default:
- if (strchr(retain, ch))
- uc_vector_push(&lex->buffer, '\\');
-
uc_vector_push(&lex->buffer, ch);
}
}
@@ -409,7 +431,7 @@ parse_string(uc_lexer_t *lex, int kind)
/* escape sequence */
case '\\':
err = parse_escape(lex,
- (type == TK_REGEXP) ? "^.[$()|*+?{\\" : "");
+ (type == TK_REGEXP) ? "^bBdDsSwW<>.[$()|*+?{\\" : "");
if (err)
return err;