diff options
author | Jo-Philipp Wich <jo@mein.io> | 2022-10-05 21:34:59 +0200 |
---|---|---|
committer | Jo-Philipp Wich <jo@mein.io> | 2022-10-05 23:01:05 +0200 |
commit | 21ace5e5c7c98271d78ff9cdf2b61e3ac70704d8 (patch) | |
tree | bc2f084916c574acb4c6e10ddf5146a796c4b7f7 /lexer.c | |
parent | f8e00b4b83dad76e183b8293870cfe3110f1fa94 (diff) |
lexer: fixes for regex literal parsing
- Ensure that regexp extension escapes are handled consistently:
substitute `\d`, `\D`, `\s`, `\S`, `\w` and `\W` with the `[[:digit:]]`,
`[^[:digit:]]`, `[[:space:]]`, `[^[:space:]]`, `[[:alnum:]_]` and
`[^[:alnum:]_]` character classes respectively, since not all POSIX
regexp implementations support all of those extensions.
- Preserve `\b`, `\B`, `\<` and `\>` boundary matches
Fixes: a45f2a3 ("lexer: improve regex literal handling")
Signed-off-by: Jo-Philipp Wich <jo@mein.io>
Diffstat (limited to 'lexer.c')
-rw-r--r-- | lexer.c | 34 |
1 file changed, 28 insertions, 6 deletions
@@ -209,9 +209,10 @@ append_utf8(uc_lexer_t *lex, int code) {
 }
 
 static uc_token_t *
-parse_escape(uc_lexer_t *lex, const char *retain)
+parse_escape(uc_lexer_t *lex, const char *regex_macros)
 {
 	int code, ch, i;
+	const char *p;
 
 	/* unicode escape sequence */
 	if (check_char(lex, 'u')) {
@@ -286,7 +287,31 @@ parse_escape(uc_lexer_t *lex, const char *retain)
 		append_utf8(lex, code);
 	}
 
-	/* ... no octal sequence, handle other escape */
+	/* ... no octal sequence, handle potential regex macros */
+	else if (strchr(regex_macros, ch)) {
+		ch = next_char(lex);
+
+		switch (ch) {
+		case 'd': p = "[[:digit:]]"; break;
+		case 'D': p = "[^[:digit:]]"; break;
+		case 'w': p = "[[:alnum:]_]"; break;
+		case 'W': p = "[^[:alnum:]_]"; break;
+		case 's': p = "[[:space:]]"; break;
+		case 'S': p = "[^[:space:]]"; break;
+		default: p = NULL;
+		}
+
+		if (p) {
+			while (*p)
+				uc_vector_push(&lex->buffer, *p++);
+		}
+		else {
+			uc_vector_push(&lex->buffer, '\\');
+			uc_vector_push(&lex->buffer, ch);
+		}
+	}
+
+	/* ... handle other escape */
 	else {
 		ch = next_char(lex);
 
@@ -304,9 +329,6 @@ parse_escape(uc_lexer_t *lex, const char *retain)
 			return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unterminated string"));
 
 		default:
-			if (strchr(retain, ch))
-				uc_vector_push(&lex->buffer, '\\');
-
 			uc_vector_push(&lex->buffer, ch);
 		}
 	}
@@ -409,7 +431,7 @@ parse_string(uc_lexer_t *lex, int kind)
 		/* escape sequence */
 		case '\\':
 			err = parse_escape(lex,
-				(type == TK_REGEXP) ? "^.[$()|*+?{\\" : "");
+				(type == TK_REGEXP) ? "^bBdDsSwW<>.[$()|*+?{\\" : "");
 
 			if (err)
 				return err;