summaryrefslogtreecommitdiffhomepage
path: root/lexer.c
diff options
context:
space:
mode:
author: Jo-Philipp Wich <jo@mein.io> 2022-10-02 12:57:50 +0200
committer: Jo-Philipp Wich <jo@mein.io> 2022-10-04 21:14:31 +0200
commit: a45f2a388efb649e0373a45c6db1d009dc18072d (patch)
tree: fceccd7bb14bc6cca3021817530b1f52f88fae2f /lexer.c
parent: d64d5d685d86b38dda8a314b7d1404633e26b346 (diff)
lexer: improve regex literal handling
- Do not treat slashes within bracket expressions as delimiters
- Do not escape slashes when stringifying regex sources
- Allow all escape sequence types in regex literals

Signed-off-by: Jo-Philipp Wich <jo@mein.io>
Diffstat (limited to 'lexer.c')
-rw-r--r-- lexer.c 241
1 files changed, 158 insertions, 83 deletions
diff --git a/lexer.c b/lexer.c
index 7c7788a..786e495 100644
--- a/lexer.c
+++ b/lexer.c
@@ -209,10 +209,118 @@ append_utf8(uc_lexer_t *lex, int code) {
}
static uc_token_t *
-parse_string(uc_lexer_t *lex, int kind)
+parse_escape(uc_lexer_t *lex, const char *retain)
{
int code, ch, i;
+
+ /* unicode escape sequence */
+ if (check_char(lex, 'u')) {
+ for (i = 0, code = 0; i < 4; i++) {
+ ch = next_char(lex);
+
+ if (!isxdigit(ch))
+ return emit_op(lex, -1, TK_ERROR, ucv_string_new("Invalid escape sequence"));
+
+ code = code * 16 + hex(ch);
+ }
+
+ /* is a leading surrogate value */
+ if ((code & 0xFC00) == 0xD800) {
+ /* found a subsequent leading surrogate, ignore and emit replacement char for previous one */
+ if (lex->lead_surrogate)
+ append_utf8(lex, 0xFFFD);
+
+ /* store surrogate value and advance to next escape sequence */
+ lex->lead_surrogate = code;
+ }
+
+ /* is a trailing surrogate value */
+ else if ((code & 0xFC00) == 0xDC00) {
+ /* found a trailing surrogate following a leading one, combine and encode */
+ if (lex->lead_surrogate) {
+ code = 0x10000 + ((lex->lead_surrogate & 0x3FF) << 10) + (code & 0x3FF);
+ lex->lead_surrogate = 0;
+ }
+
+ /* trailing surrogate not following a leading one, ignore and use replacement char */
+ else {
+ code = 0xFFFD;
+ }
+
+ append_utf8(lex, code);
+ }
+
+ /* is a normal codepoint */
+ else {
+ append_utf8(lex, code);
+ }
+ }
+
+ /* hex escape sequence */
+ else if (check_char(lex, 'x')) {
+ for (i = 0, code = 0; i < 2; i++) {
+ ch = next_char(lex);
+
+ if (!isxdigit(ch))
+ return emit_op(lex, -1, TK_ERROR, ucv_string_new("Invalid escape sequence"));
+
+ code = code * 16 + hex(ch);
+ }
+
+ append_utf8(lex, code);
+ }
+
+ /* octal or letter */
+ else {
+ /* try to parse octal sequence... */
+ for (i = 0, code = 0, ch = lookahead_char(lex);
+ i < 3 && ch >= '0' && ch <= '7';
+ i++, next_char(lex), ch = lookahead_char(lex)) {
+ code = code * 8 + dec(ch);
+ }
+
+ if (i) {
+ if (code > 255)
+ return emit_op(lex, -3, TK_ERROR, ucv_string_new("Invalid escape sequence"));
+
+ append_utf8(lex, code);
+ }
+
+ /* ... no octal sequence, handle other escape */
+ else {
+ ch = next_char(lex);
+
+ switch (ch) {
+ case 'a': uc_vector_push(&lex->buffer, '\a'); break;
+ case 'b': uc_vector_push(&lex->buffer, '\b'); break;
+ case 'e': uc_vector_push(&lex->buffer, '\033'); break;
+ case 'f': uc_vector_push(&lex->buffer, '\f'); break;
+ case 'n': uc_vector_push(&lex->buffer, '\n'); break;
+ case 'r': uc_vector_push(&lex->buffer, '\r'); break;
+ case 't': uc_vector_push(&lex->buffer, '\t'); break;
+ case 'v': uc_vector_push(&lex->buffer, '\v'); break;
+
+ case EOF:
+ return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unterminated string"));
+
+ default:
+ if (strchr(retain, ch))
+ uc_vector_push(&lex->buffer, '\\');
+
+ uc_vector_push(&lex->buffer, ch);
+ }
+ }
+ }
+
+ return NULL;
+}
+
+static uc_token_t *
+parse_string(uc_lexer_t *lex, int kind)
+{
+ uc_token_t *err;
unsigned type;
+ int code, ch;
size_t off;
if (kind == '`')
@@ -237,107 +345,74 @@ parse_string(uc_lexer_t *lex, int kind)
uc_vector_push(&lex->buffer, '$');
break;
- /* escape sequence */
- case '\\':
- /* unicode escape sequence */
- if (type != TK_REGEXP && check_char(lex, 'u')) {
- for (i = 0, code = 0; i < 4; i++) {
- ch = next_char(lex);
-
- if (!isxdigit(ch))
- return emit_op(lex, -1, TK_ERROR, ucv_string_new("Invalid escape sequence"));
+ /* regexp bracket expression */
+ case '[':
+ uc_vector_push(&lex->buffer, '[');
- code = code * 16 + hex(ch);
- }
+ if (type == TK_REGEXP) {
+ /* skip leading negation (^) */
+ if (check_char(lex, '^'))
+ uc_vector_push(&lex->buffer, '^');
- /* is a leading surrogate value */
- if ((code & 0xFC00) == 0xD800) {
- /* found a subsequent leading surrogate, ignore and emit replacement char for previous one */
- if (lex->lead_surrogate)
- append_utf8(lex, 0xFFFD);
+ /* skip leading `]` - it is literal and not closing the bracket expr */
+ if (check_char(lex, ']'))
+ uc_vector_push(&lex->buffer, ']');
- /* store surrogate value and advance to next escape sequence */
- lex->lead_surrogate = code;
- }
+ /* read until closing `]` */
+ for (ch = next_char(lex); ch != EOF; ch = next_char(lex)) {
+ if (ch == '\\') {
+ err = parse_escape(lex, "^");
- /* is a trailing surrogate value */
- else if ((code & 0xFC00) == 0xDC00) {
- /* found a trailing surrogate following a leading one, combine and encode */
- if (lex->lead_surrogate) {
- code = 0x10000 + ((lex->lead_surrogate & 0x3FF) << 10) + (code & 0x3FF);
- lex->lead_surrogate = 0;
- }
+ if (err)
+ return err;
- /* trailing surrogate not following a leading one, ignore and use replacement char */
- else {
- code = 0xFFFD;
+ continue;
}
- append_utf8(lex, code);
- }
+ uc_vector_push(&lex->buffer, ch);
- /* is a normal codepoint */
- else {
- append_utf8(lex, code);
- }
- }
+ if (ch == ']')
+ break;
- /* hex escape sequence */
- else if (type != TK_REGEXP && check_char(lex, 'x')) {
- for (i = 0, code = 0; i < 2; i++) {
- ch = next_char(lex);
+ /* skip nested char classes / equivalence classes / collating chars */
+ if (ch == '[') {
+ code = lookahead_char(lex);
- if (!isxdigit(ch))
- return emit_op(lex, -1, TK_ERROR, ucv_string_new("Invalid escape sequence"));
+ if (code == ':' || code == '.' || code == '=') {
+ uc_vector_push(&lex->buffer, code);
+ next_char(lex);
- code = code * 16 + hex(ch);
- }
+ for (ch = next_char(lex); ch != EOF; ch = next_char(lex)) {
+ if (ch == '\\') {
+ err = parse_escape(lex, "");
- append_utf8(lex, code);
- }
+ if (err)
+ return err;
- /* octal or letter */
- else {
- /* try to parse octal sequence... */
- for (i = 0, code = 0, ch = lookahead_char(lex);
- kind != '/' && i < 3 && ch >= '0' && ch <= '7';
- i++, next_char(lex), ch = lookahead_char(lex)) {
- code = code * 8 + dec(ch);
- }
+ continue;
+ }
- if (i) {
- if (code > 255)
- return emit_op(lex, -3, TK_ERROR, ucv_string_new("Invalid escape sequence"));
+ uc_vector_push(&lex->buffer, ch);
- append_utf8(lex, code);
+ if (ch == code && check_char(lex, ']')) {
+ uc_vector_push(&lex->buffer, ']');
+ break;
+ }
+ }
+ }
+ }
}
+ }
- /* ... no octal sequence, handle other escape */
- else {
- ch = next_char(lex);
-
- switch (ch) {
- case 'a': uc_vector_push(&lex->buffer, '\a'); break;
- case 'b': uc_vector_push(&lex->buffer, '\b'); break;
- case 'e': uc_vector_push(&lex->buffer, '\033'); break;
- case 'f': uc_vector_push(&lex->buffer, '\f'); break;
- case 'n': uc_vector_push(&lex->buffer, '\n'); break;
- case 'r': uc_vector_push(&lex->buffer, '\r'); break;
- case 't': uc_vector_push(&lex->buffer, '\t'); break;
- case 'v': uc_vector_push(&lex->buffer, '\v'); break;
-
- case EOF:
- return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unterminated string"));
+ break;
- default:
- /* regex mode => retain backslash */
- if (type == TK_REGEXP)
- uc_vector_push(&lex->buffer, '\\');
+ /* escape sequence */
+ case '\\':
+ err = parse_escape(lex,
+ (type == TK_REGEXP) ? "^.[$()|*+?{\\" : "");
- uc_vector_push(&lex->buffer, ch);
- }
- }
- }
+ if (err)
+ return err;
break;