--[[--------------------------------------------------------------------

  llex.lua: Lua 5.1 lexical analyzer in Lua
  This file is part of LuaSrcDiet, based on Yueliang material.

  Copyright (c) 2008 Kein-Hong Man <khman@users.sf.net>
  The COPYRIGHT file describes the conditions
  under which this software may be distributed.

  See the ChangeLog for more information.

----------------------------------------------------------------------]]

--[[--------------------------------------------------------------------
-- NOTES:
-- * This is a version of the native 5.1.x lexer from Yueliang 0.4.0,
--   with significant modifications to handle LuaSrcDiet's needs:
--   (1) llex.error is an optional error function handler
--   (2) seminfo for strings include their delimiters and no
--       translation operations are performed on them
-- * ADDED shbang handling to support executable scripts
-- * NO localized decimal point replacement magic
-- * NO limit to number of lines
-- * NO support for compatible long strings (LUA_COMPAT_LSTR)
-- * Please read technotes.txt for more technical details.
----------------------------------------------------------------------]]

local base = _G
local string = require "string"
module "llex"

local find = string.find
local match = string.match
local sub = string.sub

----------------------------------------------------------------------
-- initialize keyword list, variables
----------------------------------------------------------------------

-- set of reserved words; kw[word] is true for every Lua 5.1 keyword
local kw = {}
for v in string.gmatch([[
and break do else elseif end false for function if in
local nil not or repeat return then true until while]], "%S+") do
  kw[v] = true
end

-- NOTE: see init() for module variables (externally visible):
--       tok, seminfo, tokln

local z,                -- source stream
      sourceid,         -- name of source
      I,                -- position of lexer
      buff,             -- start position (in z) of the token being read
      ln                -- line number

----------------------------------------------------------------------
-- add information to token listing
-- * token: token class name, e.g. "TK_NAME"
-- * info: the token's text as it appears in the source
-- * the current line number is recorded alongside in tokln
----------------------------------------------------------------------

local function addtoken(token, info)
  local i = #tok + 1
  tok[i] = token
  seminfo[i] = info
  tokln[i] = ln
end

----------------------------------------------------------------------
-- handles line number incrementation and end-of-line characters
-- * i: position of a '\n' or '\r' in the source
-- * is_tok: if true, a TK_EOL token holding the line ending is added
-- * a two-character ending made of different characters ('\r\n' or
--   '\n\r') is consumed as a single line ending
-- * updates I and returns the position just past the line ending
----------------------------------------------------------------------

local function inclinenumber(i, is_tok)
  local sub = sub
  local old = sub(z, i, i)
  i = i + 1  -- skip '\n' or '\r'
  local c = sub(z, i, i)
  if (c == "\n" or c == "\r") and (c ~= old) then
    i = i + 1  -- skip '\n\r' or '\r\n'
    old = old..c
  end
  if is_tok then addtoken("TK_EOL", old) end
  ln = ln + 1
  I = i
  return i
end

----------------------------------------------------------------------
-- initialize lexer for given source _z and source name _sourceid
-- * resets the token lists, lexer position and line number
-- * a leading '#' line (shbang) is emitted as a TK_COMMENT token,
--   followed by a TK_EOL token if the line is terminated
----------------------------------------------------------------------

function init(_z, _sourceid)
  z = _z                        -- source
  sourceid = _sourceid          -- name of source
  I = 1                         -- lexer's position in source
  ln = 1                        -- line number
  tok = {}                      -- lexed token list*
  seminfo = {}                  -- lexed semantic information list*
  tokln = {}                    -- line numbers for messages*
                                -- (*) externally visible thru' module
  --------------------------------------------------------------------
  -- initial processing (shbang handling)
  --------------------------------------------------------------------
  local p, _, q, r = find(z, "^(#[^\r\n]*)(\r?\n?)")
  if p then                             -- skip first line
    I = I + #q
    addtoken("TK_COMMENT", q)
    -- r is only used to detect a line ending; inclinenumber re-reads
    -- the actual end-of-line characters from the source
    if #r > 0 then inclinenumber(I, true) end
  end
end

----------------------------------------------------------------------
-- returns a chunk name or id, no truncation for long names
-- * a source name starting with '=' or '@' is returned without that
--   first character; anything else yields "[string]"
----------------------------------------------------------------------

function chunkid()
  if sourceid and match(sourceid, "^[=@]") then
    return sub(sourceid, 2)  -- remove first char
  end
  return "[string]"
end

----------------------------------------------------------------------
-- formats error message and throws error
-- * a simplified version, does not report what token was responsible
-- * s: message text; line: line number (defaults to the current line)
-- * 'error' is looked up in the module table first, so a handler
--   assigned to llex.error takes precedence over the standard error
----------------------------------------------------------------------

function errorline(s, line)
  local e = error or base.error
  e(string.format("%s:%d: %s", chunkid(), line or ln, s))
end
local errorline = errorline

------------------------------------------------------------------------
-- count separators ("=") in a long string delimiter
-- * i: position of the opening '[' or closing ']' of the delimiter
-- * returns the number of '=' if a matching second bracket follows,
--   otherwise (-count) - 1, so -1 means a lone bracket
-- * leaves I at the position following the run of '='
------------------------------------------------------------------------

local function skip_sep(i)
  local sub = sub
  local s = sub(z, i, i)
  i = i + 1
  local count = #match(z, "=*", i)  -- note, take the length
  i = i + count
  I = i
  return (sub(z, i, i) == s) and count or (-count) - 1
end

----------------------------------------------------------------------
-- reads a long string or long comment
-- * is_str: true for a long string, false for a long comment (only
--   selects the error message)
-- * sep: number of '=' in the opening delimiter
-- * expects I at the 2nd '[' of the opening delimiter and buff at the
--   start of the token; returns the token text, delimiters included
-- * buff is kept numeric throughout; the start offset is never turned
--   into a string, so no implicit string-to-number coercion is needed
--   when slicing the source
----------------------------------------------------------------------

local function read_long_string(is_str, sep)
  local i = I + 1  -- skip 2nd '['
  local sub = sub
  local c = sub(z, i, i)
  if c == "\r" or c == "\n" then  -- string starts with a newline?
    i = inclinenumber(i)  -- skip it
  end
  while true do
    local p, _, r = find(z, "([\r\n%]])", i) -- (long range)
    if not p then
      errorline(is_str and "unfinished long string" or
                "unfinished long comment")
    end
    i = p
    if r == "]" then                    -- delimiter test
      if skip_sep(i) == sep then
        local str = sub(z, buff, I)     -- up to and including 2nd ']'
        I = I + 1  -- skip 2nd ']'
        return str
      end
      i = I                             -- not the closer, keep scanning
    else                                -- newline
      i = inclinenumber(i)
    end
  end--while
end

----------------------------------------------------------------------
-- reads a string
-- * del: the delimiter character that opened the string
-- * expects I just past the opening delimiter and buff at the opening
--   delimiter; returns the string text, delimiters included
-- * escape sequences are validated but left untranslated
----------------------------------------------------------------------

local function read_string(del)
  local i = I
  local find = find
  local sub = sub
  while true do
    local p, _, r = find(z, "([\n\r\\\"\'])", i) -- (long range)
    if p then
      if r == "\n" or r == "\r" then
        errorline("unfinished string")
      end
      i = p
      if r == "\\" then                         -- handle escapes
        i = i + 1
        r = sub(z, i, i)
        if r == "" then break end -- (EOZ error)
        p = find("abfnrtv\n\r", r, 1, true)
        ------------------------------------------------------
        if p then                               -- special escapes
          if p > 7 then                         -- escaped line ending
            i = inclinenumber(i)
          else
            i = i + 1
          end
        ------------------------------------------------------
        elseif find(r, "%D") then               -- other non-digits
          i = i + 1
        ------------------------------------------------------
        else                                    -- \xxx sequence
          local _, q, s = find(z, "^(%d%d?%d?)", i)
          i = q + 1
          if base.tonumber(s) > 255 then -- UCHAR_MAX
            errorline("escape sequence too large")
          end
        ------------------------------------------------------
        end--if p
      else
        i = i + 1
        if r == del then                        -- ending delimiter
          I = i
          return sub(z, buff, i - 1)            -- return string
        end
      end--if r
    else
      break -- (error)
    end--if p
  end--while
  errorline("unfinished string")
end

------------------------------------------------------------------------
-- main lexer function
-- * lexes the source given to init() from the current position to the
--   end, appending to tok, seminfo and tokln
-- * returns after the final TK_EOS token has been added
------------------------------------------------------------------------

function llex()
  local find = find
  local match = match
  while true do--outer
    local i = I
    -- inner loop allows break to be used to nicely section tests
    while true do--inner
      ----------------------------------------------------------------
      local p, _, r = find(z, "^([_%a][_%w]*)", i)
      if p then
        I = i + #r
        if kw[r] then
          addtoken("TK_KEYWORD", r)             -- reserved word (keyword)
        else
          addtoken("TK_NAME", r)                -- identifier
        end
        break -- (continue)
      end
      ----------------------------------------------------------------
      local p, _, r = find(z, "^(%.?)%d", i)
      if p then                                 -- numeral
        if r == "." then i = i + 1 end
        local _, q, r = find(z, "^%d*[%.%d]*([eE]?)", i)
        i = q + 1
        if #r == 1 then                         -- optional exponent
          if match(z, "^[%+%-]", i) then        -- optional sign
            i = i + 1
          end
        end
        local _, q = find(z, "^[_%w]*", i)
        I = q + 1
        local v = sub(z, p, q)                  -- string equivalent
        if not base.tonumber(v) then            -- handles hex test also
          errorline("malformed number")
        end
        addtoken("TK_NUMBER", v)
        break -- (continue)
      end
      ----------------------------------------------------------------
      local p, q, r, t = find(z, "^((%s)[ \t\v\f]*)", i)
      if p then
        if t == "\n" or t == "\r" then          -- newline
          inclinenumber(i, true)
        else
          I = q + 1                             -- whitespace
          addtoken("TK_SPACE", r)
        end
        break -- (continue)
      end
      ----------------------------------------------------------------
      local r = match(z, "^%p", i)
      if r then
        buff = i                                -- remember token start
        local p = find("-[\"\'.=<>~", r, 1, true)
        if p then
          -- two-level if block for punctuation/symbols
          --------------------------------------------------------
          if p <= 2 then
            if p == 1 then                      -- minus
              local c = match(z, "^%-%-(%[?)", i)
              if c then
                i = i + 2
                local sep = -1
                if c == "[" then
                  sep = skip_sep(i)
                end
                if sep >= 0 then                -- long comment
                  addtoken("TK_LCOMMENT", read_long_string(false, sep))
                else                            -- short comment
                  I = find(z, "[\n\r]", i) or (#z + 1)
                  addtoken("TK_COMMENT", sub(z, buff, I - 1))
                end
                break -- (continue)
              end
              -- (fall through for "-")
            else                                -- [ or long string
              local sep = skip_sep(i)
              if sep >= 0 then
                addtoken("TK_LSTRING", read_long_string(true, sep))
              elseif sep == -1 then
                addtoken("TK_OP", "[")
              else
                errorline("invalid long string delimiter")
              end
              break -- (continue)
            end
          --------------------------------------------------------
          elseif p <= 5 then
            if p < 5 then                       -- strings
              I = i + 1
              addtoken("TK_STRING", read_string(r))
              break -- (continue)
            end
            r = match(z, "^%.%.?%.?", i)        -- .|..|... dots
            -- (fall through)
          --------------------------------------------------------
          else                                  -- relational
            r = match(z, "^%p=?", i)
            -- (fall through)
          end
        end
        I = i + #r
        addtoken("TK_OP", r)  -- for other symbols, fall through
        break -- (continue)
      end
      ----------------------------------------------------------------
      local r = sub(z, i, i)
      if r ~= "" then
        I = i + 1
        addtoken("TK_OP", r)  -- other single-char tokens
        break
      end
      addtoken("TK_EOS", "")  -- end of stream,
      return                  -- exit here
      ----------------------------------------------------------------
    end--while inner
  end--while outer
end

return base.getfenv()