diff options
author | Jo-Philipp Wich <jo@mein.io> | 2022-01-13 16:06:17 +0100 |
---|---|---|
committer | Jo-Philipp Wich <jo@mein.io> | 2022-01-18 10:58:11 +0100 |
commit | 6c2caf9fbb9d346cfb20cd5c83875fdff77e584c (patch) | |
tree | 4d0fe816584e8f351ed0f1da8be0b9ccf1c5635f /lexer.c | |
parent | 725bb75b7b66dd1e0a381908e831cede0402cb6e (diff) |
source: refactor source file handling
- Move source object pointer into program entity which is referenced by
each function
- Move lineinfo related routines into source.c and use them from lexer.c
since lineinfo encoding does not belong into the lexical analyzer.
- Implement initial infrastructure for detecting source file type,
this is required later to differentiate between plaintext and
precompiled bytecode files
Signed-off-by: Jo-Philipp Wich <jo@mein.io>
Diffstat (limited to 'lexer.c')
-rw-r--r-- | lexer.c | 93 |
1 files changed, 3 insertions, 90 deletions
@@ -278,65 +278,14 @@ _buf_startswith(uc_lexer_t *lex, const char *str, size_t len) { #define buf_startswith(s, str) _buf_startswith(s, str, sizeof(str) - 1) -/* lineinfo is encoded in bytes: the most significant bit specifies whether - * to advance the line count by one or not, while the remaining 7 bits encode - * the amounts of bytes on the current line. - * - * If a line has more than 127 characters, the first byte will be set to - * 0xff (1 1111111) and subsequent bytes will encode the remaining characters - * in bits 1..7 while setting bit 8 to 0. A line with 400 characters will thus - * be encoded as 0xff 0x7f 0x7f 0x13 (1:1111111 + 0:1111111 + 0:1111111 + 0:1111111). - * - * The newline character itself is not counted, so an empty line is encoded as - * 0x80 (1:0000000). - */ - -static void -next_lineinfo(uc_lexer_t *lex) -{ - uc_lineinfo_t *lines = &lex->source->lineinfo; - - uc_vector_grow(lines); - lines->entries[lines->count++] = 0x80; -} - -static void -update_lineinfo(uc_lexer_t *lex, size_t off) -{ - uc_lineinfo_t *lines = &lex->source->lineinfo; - uint8_t *entry, n; - - entry = uc_vector_last(lines); - - if ((entry[0] & 0x7f) + off <= 0x7f) { - entry[0] += off; - } - else { - off -= (0x7f - (entry[0] & 0x7f)); - entry[0] |= 0x7f; - - while (off > 0) { - n = (off > 0x7f) ? 0x7f : off; - uc_vector_grow(lines); - entry = uc_vector_last(lines); - entry[1] = n; - off -= n; - lines->count++; - } - } -} - static void buf_consume(uc_lexer_t *lex, size_t len) { size_t i, linelen; - if (!lex->source->lineinfo.count) - next_lineinfo(lex); - for (i = 0, linelen = 0; i < len; i++) { if (lex->bufstart[i] == '\n') { - update_lineinfo(lex, linelen); - next_lineinfo(lex); + uc_source_line_update(lex->source, linelen); + uc_source_line_next(lex->source); linelen = 0; } @@ -346,7 +295,7 @@ buf_consume(uc_lexer_t *lex, size_t len) { } if (linelen) - update_lineinfo(lex, linelen); + uc_source_line_update(lex->source, linelen); lex->bufstart += len; lex->source->off += len; @@ -1120,38 +1069,6 @@ lex_step(uc_lexer_t *lex, FILE *fp) return NULL; } -static void -uc_lexer_skip_shebang(uc_lexer_t *lex) -{ - uc_source_t *source = lex->source; - FILE *fp = source->fp; - int c1, c2; - - c1 = fgetc(fp); - c2 = fgetc(fp); - - if (c1 == '#' && c2 == '!') { - next_lineinfo(lex); - - source->off += 2; - - while ((c1 = fgetc(fp)) != EOF) { - source->off++; - - if (c1 == '\n') { - update_lineinfo(lex, source->off); - next_lineinfo(lex); - - break; - } - } - } - else { - ungetc(c2, fp); - ungetc(c1, fp); - } -} - void uc_lexer_init(uc_lexer_t *lex, uc_parse_config_t *config, uc_source_t *source) { @@ -1187,10 +1104,6 @@ uc_lexer_init(uc_lexer_t *lex, uc_parse_config_t *config, uc_source_t *source) lex->state = UC_LEX_IDENTIFY_TOKEN; lex->block = STATEMENTS; } - - /* Skip any potential interpreter line */ - if (lex->source->off == 0) - uc_lexer_skip_shebang(lex); } void |