summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorJo-Philipp Wich <jo@mein.io>2020-11-12 23:34:56 +0100
committerJo-Philipp Wich <jo@mein.io>2020-11-15 17:10:20 +0100
commit6cec0d369192f43d87680a3f807054659206f713 (patch)
treeeec57d24891fd5da2439eaf612fffb11a0d58b03
parentcfb9b55964ff7074e1b35883b4b6009060151b26 (diff)
lexer: improve scanner performance
Optimize the strncmp() based token lookup with an integer comparison approach which roughly cuts the time of the source code parsing phase in half. Signed-off-by: Jo-Philipp Wich <jo@mein.io>
-rw-r--r--ast.c20
-rw-r--r--ast.h23
-rw-r--r--lexer.c170
-rw-r--r--lib.h4
4 files changed, 116 insertions, 101 deletions
diff --git a/ast.c b/ast.c
index 823b2a7..562a87b 100644
--- a/ast.c
+++ b/ast.c
@@ -29,26 +29,6 @@
static size_t ut_ext_types_count = 0;
static struct ut_extended_type *ut_ext_types = NULL;
-struct ut_op *
-ut_get_op(struct ut_state *s, uint32_t off)
-{
- if (off == 0 || off > s->poolsize)
- return NULL;
-
- return &s->pool[off - 1];
-}
-
-struct ut_op *
-ut_get_child(struct ut_state *s, uint32_t off, int n)
-{
- struct ut_op *op = ut_get_op(s, off);
-
- if (!op || n >= ARRAY_SIZE(op->tree.operand) || !op->tree.operand[n])
- return NULL;
-
- return ut_get_op(s, op->tree.operand[n]);
-}
-
uint32_t
ut_new_op(struct ut_state *s, int type, struct json_object *val, ...)
{
diff --git a/ast.h b/ast.h
index 864e5f9..337b81a 100644
--- a/ast.h
+++ b/ast.h
@@ -33,6 +33,10 @@
#define ALIGN(x) (((x) + sizeof(size_t) - 1) & -sizeof(size_t))
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
+#endif
+
#define JSON_C_TO_STRING_STRICT (1<<31)
enum ut_lex_state {
@@ -150,8 +154,23 @@ struct ut_extended_type {
void (*free)(void *);
};
-struct ut_op *ut_get_op(struct ut_state *s, uint32_t off);
-struct ut_op *ut_get_child(struct ut_state *s, uint32_t off, int n);
+static inline struct ut_op *ut_get_op(struct ut_state *s, uint32_t off)
+{
+ if (off == 0 || off > s->poolsize)
+ return NULL;
+
+ return &s->pool[off - 1];
+}
+
+static inline struct ut_op *ut_get_child(struct ut_state *s, uint32_t off, int n)
+{
+ struct ut_op *op = ut_get_op(s, off);
+
+ if (!op || n >= ARRAY_SIZE(op->tree.operand) || !op->tree.operand[n])
+ return NULL;
+
+ return ut_get_op(s, op->tree.operand[n]);
+}
static inline uint32_t ut_get_off(struct ut_state *s, struct ut_op *op) {
return op ? (op - s->pool + 1) : 0;
diff --git a/lexer.c b/lexer.c
index 31a4daf..1982dea 100644
--- a/lexer.c
+++ b/lexer.c
@@ -23,6 +23,7 @@
#include <regex.h>
#include <math.h>
#include <errno.h>
+#include <endian.h>
#include "ast.h"
#include "lib.h"
@@ -30,17 +31,26 @@
#include "parser.h"
-struct token {
+struct keyword {
int type;
const char *pat;
int plen;
union {
- uint32_t (*parse)(struct ut_state *s);
double d;
bool b;
};
};
+struct token {
+ int type;
+ union {
+ uint32_t patn;
+ char pat[4];
+ };
+ int plen;
+ uint32_t (*parse)(struct ut_state *s);
+};
+
#define dec(o) \
((o) - '0')
@@ -55,79 +65,75 @@ static uint32_t parse_number(struct ut_state *);
static uint32_t parse_label(struct ut_state *);
static const struct token tokens[] = {
- { 0, " ", 1 },
- { 0, "\t", 1 },
- { 0, "\r", 1 },
- { 0, "\n", 1 },
- { T_ASLEFT, "<<=", 3 },
- { T_ASRIGHT, ">>=", 3 },
- { T_LEXP, "{{-", 3 },
- { T_REXP, "-}}", 3 },
- { T_LSTM, "{%+", 3 },
- { T_LSTM, "{%-", 3 },
- { T_RSTM, "-%}", 3 },
- { T_EQS, "===", 3 },
- { T_NES, "!==", 3 },
- { T_ELLIP, "...", 3 },
- { T_AND, "&&", 2 },
- { T_ASADD, "+=", 2 },
- { T_ASBAND, "&=", 2 },
- { T_ASBOR, "|=", 2 },
- { T_ASBXOR, "^=", 2 },
- //{ T_ASDIV, "/=", 2 },
- { T_ASMOD, "%=", 2 },
- { T_ASMUL, "*=", 2 },
- { T_ASSUB, "-=", 2 },
- { T_DEC, "--", 2 },
- { T_INC, "++", 2 },
- { T_EQ, "==", 2 },
- { T_NE, "!=", 2 },
- { T_LE, "<=", 2 },
- { T_GE, ">=", 2 },
- { T_LSHIFT, "<<", 2 },
- { T_RSHIFT, ">>", 2 },
- { 0, "//", 2, { .parse = parse_comment } },
- { 0, "/*", 2, { .parse = parse_comment } },
- { T_OR, "||", 2 },
- { T_LEXP, "{{", 2 },
- { T_REXP, "}}", 2 },
- { T_LSTM, "{%", 2 },
- { T_RSTM, "%}", 2 },
- { T_ARROW, "=>", 2 },
- { T_ADD, "+", 1 },
- { T_ASSIGN, "=", 1 },
- { T_BAND, "&", 1 },
- { T_BOR, "|", 1 },
- { T_LBRACK, "[", 1 },
- { T_RBRACK, "]", 1 },
- { T_BXOR, "^", 1 },
- { T_LBRACE, "{", 1 },
- { T_RBRACE, "}", 1 },
- { T_COLON, ":", 1 },
- { T_COMMA, ",", 1 },
- { T_COMPL, "~", 1 },
- //{ T_DIV, "/", 1 },
- { T_GT, ">", 1 },
- { T_NOT, "!", 1 },
- { T_LT, "<", 1 },
- { T_MOD, "%", 1 },
- { T_MUL, "*", 1 },
- { T_LPAREN, "(", 1 },
- { T_RPAREN, ")", 1 },
- { T_QMARK, "?", 1 },
- { T_SCOL, ";", 1 },
- { T_SUB, "-", 1 },
- { T_DOT, ".", 1 },
- { T_STRING, "'", 1, { .parse = parse_string } },
- { T_STRING, "\"", 1, { .parse = parse_string } },
- { T_REGEXP, "/", 1, { .parse = parse_regexp } },
- { T_LABEL, "_", 1, { .parse = parse_label } },
- { T_LABEL, "az", 0, { .parse = parse_label } },
- { T_LABEL, "AZ", 0, { .parse = parse_label } },
- { T_NUMBER, "09", 0, { .parse = parse_number } },
+ { T_ASLEFT, { .pat = "<<=" }, 3 },
+ { T_ASRIGHT, { .pat = ">>=" }, 3 },
+ { T_LEXP, { .pat = "{{-" }, 3 },
+ { T_REXP, { .pat = "-}}" }, 3 },
+ { T_LSTM, { .pat = "{%+" }, 3 },
+ { T_LSTM, { .pat = "{%-" }, 3 },
+ { T_RSTM, { .pat = "-%}" }, 3 },
+ { T_EQS, { .pat = "===" }, 3 },
+ { T_NES, { .pat = "!==" }, 3 },
+ { T_ELLIP, { .pat = "..." }, 3 },
+ { T_AND, { .pat = "&&" }, 2 },
+ { T_ASADD, { .pat = "+=" }, 2 },
+ { T_ASBAND, { .pat = "&=" }, 2 },
+ { T_ASBOR, { .pat = "|=" }, 2 },
+ { T_ASBXOR, { .pat = "^=" }, 2 },
+ //{ T_ASDIV, { .pat = "/=" }, 2 },
+ { T_ASMOD, { .pat = "%=" }, 2 },
+ { T_ASMUL, { .pat = "*=" }, 2 },
+ { T_ASSUB, { .pat = "-=" }, 2 },
+ { T_DEC, { .pat = "--" }, 2 },
+ { T_INC, { .pat = "++" }, 2 },
+ { T_EQ, { .pat = "==" }, 2 },
+ { T_NE, { .pat = "!=" }, 2 },
+ { T_LE, { .pat = "<=" }, 2 },
+ { T_GE, { .pat = ">=" }, 2 },
+ { T_LSHIFT, { .pat = "<<" }, 2 },
+ { T_RSHIFT, { .pat = ">>" }, 2 },
+ { 0, { .pat = "//" }, 2, parse_comment },
+ { 0, { .pat = "/*" }, 2, parse_comment },
+ { T_OR, { .pat = "||" }, 2 },
+ { T_LEXP, { .pat = "{{" }, 2 },
+ { T_REXP, { .pat = "}}" }, 2 },
+ { T_LSTM, { .pat = "{%" }, 2 },
+ { T_RSTM, { .pat = "%}" }, 2 },
+ { T_ARROW, { .pat = "=>" }, 2 },
+ { T_ADD, { .pat = "+" }, 1 },
+ { T_ASSIGN, { .pat = "=" }, 1 },
+ { T_BAND, { .pat = "&" }, 1 },
+ { T_BOR, { .pat = "|" }, 1 },
+ { T_LBRACK, { .pat = "[" }, 1 },
+ { T_RBRACK, { .pat = "]" }, 1 },
+ { T_BXOR, { .pat = "^" }, 1 },
+ { T_LBRACE, { .pat = "{" }, 1 },
+ { T_RBRACE, { .pat = "}" }, 1 },
+ { T_COLON, { .pat = ":" }, 1 },
+ { T_COMMA, { .pat = "," }, 1 },
+ { T_COMPL, { .pat = "~" }, 1 },
+ //{ T_DIV, { .pat = "/" }, 1 },
+ { T_GT, { .pat = ">" }, 1 },
+ { T_NOT, { .pat = "!" }, 1 },
+ { T_LT, { .pat = "<" }, 1 },
+ { T_MOD, { .pat = "%" }, 1 },
+ { T_MUL, { .pat = "*" }, 1 },
+ { T_LPAREN, { .pat = "(" }, 1 },
+ { T_RPAREN, { .pat = ")" }, 1 },
+ { T_QMARK, { .pat = "?" }, 1 },
+ { T_SCOL, { .pat = ";" }, 1 },
+ { T_SUB, { .pat = "-" }, 1 },
+ { T_DOT, { .pat = "." }, 1 },
+ { T_STRING, { .pat = "'" }, 1, parse_string },
+ { T_STRING, { .pat = "\"" }, 1, parse_string },
+ { T_REGEXP, { .pat = "/" }, 1, parse_regexp },
+ { T_LABEL, { .pat = "_" }, 1, parse_label },
+ { T_LABEL, { .pat = "az" }, 0, parse_label },
+ { T_LABEL, { .pat = "AZ" }, 0, parse_label },
+ { T_NUMBER, { .pat = "09" }, 0, parse_number },
};
-static const struct token reserved_words[] = {
+static const struct keyword reserved_words[] = {
{ T_ENDFUNC, "endfunction", 11 },
{ T_DOUBLE, "Infinity", 8, { .d = INFINITY } },
{ T_CONTINUE, "continue", 8 },
@@ -719,7 +725,7 @@ static uint32_t
parse_label(struct ut_state *s)
{
const struct token *tok = s->lex.tok;
- const struct token *word;
+ const struct keyword *word;
uint32_t rv;
char *ptr;
size_t i;
@@ -831,6 +837,8 @@ parse_number(struct ut_state *s)
static uint32_t
lex_step(struct ut_state *s, FILE *fp)
{
+ uint32_t masks[] = { 0, le32toh(0x000000ff), le32toh(0x0000ffff), le32toh(0x00ffffff), le32toh(0xffffffff) };
+ union { uint32_t n; char str[4]; } search;
const struct token *tok;
size_t rlen, rem;
char *ptr, c;
@@ -1008,6 +1016,18 @@ lex_step(struct ut_state *s, FILE *fp)
case UT_LEX_IDENTIFY_TOKEN:
+ /* skip leading whitespace */
+ for (i = 0; i < buf_remaining(s) && isspace(s->lex.bufstart[i]); i++)
+ ;
+
+ buf_consume(s, i);
+
+ if (i > 0 && buf_remaining(s) < UT_LEX_MAX_TOKEN_LEN)
+ return 0;
+
+ for (i = 0; i < sizeof(search.str); i++)
+ search.str[i] = (i < buf_remaining(s)) ? s->lex.bufstart[i] : 0;
+
for (i = 0, tok = tokens; i < ARRAY_SIZE(tokens); tok = &tokens[++i]) {
/* remaining buffer data is shorter than token, skip */
if (tok->plen > buf_remaining(s))
@@ -1015,7 +1035,7 @@ lex_step(struct ut_state *s, FILE *fp)
c = s->lex.bufstart[0];
- if (tok->plen ? !strncmp(s->lex.bufstart, tok->pat, tok->plen)
+ if (tok->plen ? ((search.n & masks[tok->plen]) == tok->patn)
: (c >= tok->pat[0] && c <= tok->pat[1])) {
buf_consume(s, tok->plen);
diff --git a/lib.h b/lib.h
index d391a8d..aaaff55 100644
--- a/lib.h
+++ b/lib.h
@@ -20,10 +20,6 @@
#include "ast.h"
#include "lexer.h"
-#ifndef ARRAY_SIZE
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
-#endif
-
typedef struct json_object *(ut_c_fn)(struct ut_state *, uint32_t, struct json_object *);
void ut_lib_init(struct ut_state *state, struct json_object *scope);