summaryrefslogtreecommitdiffhomepage
path: root/lexer.c
diff options
context:
space:
mode:
authorJo-Philipp Wich <jo@mein.io>2022-01-13 16:06:17 +0100
committerJo-Philipp Wich <jo@mein.io>2022-01-18 10:58:11 +0100
commit6c2caf9fbb9d346cfb20cd5c83875fdff77e584c (patch)
tree4d0fe816584e8f351ed0f1da8be0b9ccf1c5635f /lexer.c
parent725bb75b7b66dd1e0a381908e831cede0402cb6e (diff)
source: refactor source file handling
- Move source object pointer into program entity which is referenced by each function - Move lineinfo related routines into source.c and use them from lexer.c since lineinfo encoding does not belong into the lexical analyzer. - Implement initial infrastructure for detecting source file type, this is required later to differentiate between plaintext and precompiled bytecode files Signed-off-by: Jo-Philipp Wich <jo@mein.io>
Diffstat (limited to 'lexer.c')
-rw-r--r--lexer.c93
1 files changed, 3 insertions, 90 deletions
diff --git a/lexer.c b/lexer.c
index 7637306..d554c0d 100644
--- a/lexer.c
+++ b/lexer.c
@@ -278,65 +278,14 @@ _buf_startswith(uc_lexer_t *lex, const char *str, size_t len) {
#define buf_startswith(s, str) _buf_startswith(s, str, sizeof(str) - 1)
-/* lineinfo is encoded in bytes: the most significant bit specifies whether
- * to advance the line count by one or not, while the remaining 7 bits encode
- * the amounts of bytes on the current line.
- *
- * If a line has more than 127 characters, the first byte will be set to
- * 0xff (1 1111111) and subsequent bytes will encode the remaining characters
- * in bits 1..7 while setting bit 8 to 0. A line with 400 characters will thus
- * be encoded as 0xff 0x7f 0x7f 0x13 (1:1111111 + 0:1111111 + 0:1111111 + 0:1111111).
- *
- * The newline character itself is not counted, so an empty line is encoded as
- * 0x80 (1:0000000).
- */
-
-static void
-next_lineinfo(uc_lexer_t *lex)
-{
- uc_lineinfo_t *lines = &lex->source->lineinfo;
-
- uc_vector_grow(lines);
- lines->entries[lines->count++] = 0x80;
-}
-
-static void
-update_lineinfo(uc_lexer_t *lex, size_t off)
-{
- uc_lineinfo_t *lines = &lex->source->lineinfo;
- uint8_t *entry, n;
-
- entry = uc_vector_last(lines);
-
- if ((entry[0] & 0x7f) + off <= 0x7f) {
- entry[0] += off;
- }
- else {
- off -= (0x7f - (entry[0] & 0x7f));
- entry[0] |= 0x7f;
-
- while (off > 0) {
- n = (off > 0x7f) ? 0x7f : off;
- uc_vector_grow(lines);
- entry = uc_vector_last(lines);
- entry[1] = n;
- off -= n;
- lines->count++;
- }
- }
-}
-
static void
buf_consume(uc_lexer_t *lex, size_t len) {
size_t i, linelen;
- if (!lex->source->lineinfo.count)
- next_lineinfo(lex);
-
for (i = 0, linelen = 0; i < len; i++) {
if (lex->bufstart[i] == '\n') {
- update_lineinfo(lex, linelen);
- next_lineinfo(lex);
+ uc_source_line_update(lex->source, linelen);
+ uc_source_line_next(lex->source);
linelen = 0;
}
@@ -346,7 +295,7 @@ buf_consume(uc_lexer_t *lex, size_t len) {
}
if (linelen)
- update_lineinfo(lex, linelen);
+ uc_source_line_update(lex->source, linelen);
lex->bufstart += len;
lex->source->off += len;
@@ -1120,38 +1069,6 @@ lex_step(uc_lexer_t *lex, FILE *fp)
return NULL;
}
-static void
-uc_lexer_skip_shebang(uc_lexer_t *lex)
-{
- uc_source_t *source = lex->source;
- FILE *fp = source->fp;
- int c1, c2;
-
- c1 = fgetc(fp);
- c2 = fgetc(fp);
-
- if (c1 == '#' && c2 == '!') {
- next_lineinfo(lex);
-
- source->off += 2;
-
- while ((c1 = fgetc(fp)) != EOF) {
- source->off++;
-
- if (c1 == '\n') {
- update_lineinfo(lex, source->off);
- next_lineinfo(lex);
-
- break;
- }
- }
- }
- else {
- ungetc(c2, fp);
- ungetc(c1, fp);
- }
-}
-
void
uc_lexer_init(uc_lexer_t *lex, uc_parse_config_t *config, uc_source_t *source)
{
@@ -1187,10 +1104,6 @@ uc_lexer_init(uc_lexer_t *lex, uc_parse_config_t *config, uc_source_t *source)
lex->state = UC_LEX_IDENTIFY_TOKEN;
lex->block = STATEMENTS;
}
-
- /* Skip any potential interpreter line */
- if (lex->source->off == 0)
- uc_lexer_skip_shebang(lex);
}
void