diff options
Diffstat (limited to 'contrib/luasrcdiet/lua/optlex.lua')
-rw-r--r-- | contrib/luasrcdiet/lua/optlex.lua | 832 |
1 files changed, 832 insertions, 0 deletions
diff --git a/contrib/luasrcdiet/lua/optlex.lua b/contrib/luasrcdiet/lua/optlex.lua new file mode 100644 index 0000000000..4c46b918bf --- /dev/null +++ b/contrib/luasrcdiet/lua/optlex.lua @@ -0,0 +1,832 @@ +--[[-------------------------------------------------------------------- + + optlex.lua: does lexer-based optimizations + This file is part of LuaSrcDiet. + + Copyright (c) 2008 Kein-Hong Man <khman@users.sf.net> + The COPYRIGHT file describes the conditions + under which this software may be distributed. + + See the ChangeLog for more information. + +----------------------------------------------------------------------]] + +--[[-------------------------------------------------------------------- +-- NOTES: +-- * For more lexer-based optimization ideas, see the TODO items or +-- look at technotes.txt. +-- * TODO: general string delimiter conversion optimizer +-- * TODO: (numbers) warn if overly significant digit +----------------------------------------------------------------------]] + +local base = _G +local string = require "string" +module "optlex" +local match = string.match +local sub = string.sub +local find = string.find +local rep = string.rep +local print + +------------------------------------------------------------------------ +-- variables and data structures +------------------------------------------------------------------------ + +-- error function, can override by setting own function into module +error = base.error + +warn = {} -- table for warning flags + +local stoks, sinfos, stoklns -- source lists + +local is_realtoken = { -- significant (grammar) tokens + TK_KEYWORD = true, + TK_NAME = true, + TK_NUMBER = true, + TK_STRING = true, + TK_LSTRING = true, + TK_OP = true, + TK_EOS = true, +} +local is_faketoken = { -- whitespace (non-grammar) tokens + TK_COMMENT = true, + TK_LCOMMENT = true, + TK_EOL = true, + TK_SPACE = true, +} + +local opt_details -- for extra information + 
------------------------------------------------------------------------
-- reports whether token i is the first live token of its line
-- * deleted tokens (type "") in front of it are stepped over
-- * position 1 (or lower) always counts as a line start
------------------------------------------------------------------------

local function atlinestart(i)
  local pos = i
  while true do
    if pos <= 1 then return true end
    local prevtok = stoks[pos - 1]
    if prevtok == "TK_EOL" then return true end
    if prevtok ~= "" then return false end
    pos = pos - 1                       -- deleted token, look further back
  end
end

------------------------------------------------------------------------
-- reports whether token i is the last live token of its line
-- * deleted tokens (type "") after it are stepped over
-- * the last position of the list always counts as a line end
------------------------------------------------------------------------

local function atlineend(i)
  local last = #stoks
  local pos = i
  while pos < last do
    local nexttok = stoks[pos + 1]
    if nexttok == "TK_EOL" or nexttok == "TK_EOS" then return true end
    if nexttok ~= "" then return false end
    pos = pos + 1                       -- deleted token, look further on
  end
  return true
end

------------------------------------------------------------------------
-- counts the line breaks embedded in a long comment
-- * the count is used by the caller to re-insert EOLs so that line
--   numbering is kept
-- * CRLF and LFCR pairs count as one line break, CRCR/LFLF as two
------------------------------------------------------------------------

local function commenteols(lcomment)
  local openlen = #match(lcomment, "^%-%-%[=*%[")
  -- the closing delimiter is two bytes shorter than the "--[=*[" opener
  local body = sub(lcomment, openlen + 1, -(openlen - 1))
  local count, pos = 0, 1
  local at, _, first, second = find(body, "([\r\n])([\r\n]?)", pos)
  while at do
    count = count + 1
    if #second > 0 and first ~= second then
      pos = at + 2                      -- mixed pair: one EOL, two bytes
    else
      pos = at + 1
    end
    at, _, first, second = find(body, "([\r\n])([\r\n]?)", pos)
  end
  return count
end

------------------------------------------------------------------------
-- compares two tokens (i, j) and returns the whitespace required
-- * important!
--   see technotes.txt for more information
-- * only two grammar/real tokens are being considered
-- * if "", no separation is needed
-- * if " ", then at least one whitespace (or EOL) is required
-- * "-" next to "-" must stay apart (together they start a comment)
-- * "[" next to a long string must stay apart (together they would be
--   lexed as the opening of a different long string)
------------------------------------------------------------------------

local function checkpair(i, j)
  local match = match
  local t1, t2 = stoks[i], stoks[j]
  --------------------------------------------------------------------
  if t1 == "TK_STRING" or t1 == "TK_LSTRING" or
     t2 == "TK_STRING" or t2 == "TK_LSTRING" then
    -- strings are self-delimiting, except that "[" followed by "[[" or
    -- "[=[" would be read as one long-string opener, e.g. t[ [[k]] ]
    if t1 == "TK_OP" and sinfos[i] == "[" and t2 == "TK_LSTRING" then
      return " "
    end
    return ""
  --------------------------------------------------------------------
  elseif t1 == "TK_OP" or t2 == "TK_OP" then
    if (t1 == "TK_OP" and (t2 == "TK_KEYWORD" or t2 == "TK_NAME")) or
       (t2 == "TK_OP" and (t1 == "TK_KEYWORD" or t1 == "TK_NAME")) then
      return ""
    end
    if t1 == "TK_OP" and t2 == "TK_OP" then
      -- for TK_OP/TK_OP pairs, see notes in technotes.txt
      local op, op2 = sinfos[i], sinfos[j]
      if (match(op, "^%.%.?$") and match(op2, "^%.")) or
         (match(op, "^[~=<>]$") and op2 == "=") or
         (op == "[" and (op2 == "[" or op2 == "=")) or
         (op == "-" and op2 == "-") then  -- "a - -b" must not become "a--b"
        return " "
      end
      return ""
    end
    -- "TK_OP" + "TK_NUMBER" case: a dot-like operator would merge with
    -- the digits of the number
    local op = sinfos[i]
    if t2 == "TK_OP" then op = sinfos[j] end
    if match(op, "^%.%.?%.?$") then
      return " "
    end
    return ""
  --------------------------------------------------------------------
  else-- "TK_KEYWORD" | "TK_NAME" | "TK_NUMBER" then
    return " "
  --------------------------------------------------------------------
  end
end

------------------------------------------------------------------------
-- repack tokens, removing deletions caused by optimization process
-- * tokens whose type is "" are dropped; the three source lists are
--   replaced by freshly built, hole-free lists
------------------------------------------------------------------------

local function repack_tokens()
  local dtoks, dinfos, dtoklns = {}, {}, {}
  local j = 1
  for i = 1, #stoks do
    local tok = stoks[i]
    if tok ~= "" then
      dtoks[j], dinfos[j], dtoklns[j] = tok, sinfos[i], stoklns[i]
      j = j + 1
    end
  end
  stoks, sinfos, stoklns = dtoks, dinfos, dtoklns
end

------------------------------------------------------------------------
-- number optimization
-- * optimization using string formatting functions is one way of doing
--   this, but here, we consider all cases and handle them separately
--   (possibly an idiotic approach...)
-- * scientific notation being generated is not in canonical form, this
--   may or may not be a bad thing, feedback welcome
-- * note: intermediate portions need to fit into a normal number range
-- * optimizations can be divided based on number patterns:
-- * hexadecimal:
--   (1) no need to remove leading zeros, just skip to (2)
--   (2) convert to integer if size equal or smaller
--       * change if equal size -> lose the 'x' to reduce entropy
--   (3) number is then processed as an integer
--   (4) note: does not make 0[xX] consistent
-- * integer:
--   (1) note: includes anything with trailing ".", ".0", ...
--   (2) remove useless fractional part, if present, e.g. 123.000
--   (3) remove leading zeros, e.g. 000123
--   (4) switch to scientific if shorter, e.g. 123000 -> 123e3
-- * with fraction:
--   (1) split into digits dot digits
--   (2) if no integer portion, take as zero (can omit later)
--   (3) handle degenerate .000 case, after which the fractional part
--       must be non-zero (if zero, it's matched as an integer)
--   (4) remove trailing zeros for fractional portion
--   (5) p.q where p > 0 and q > 0 cannot be shortened any more
--   (6) otherwise p == 0 and the form is .q, e.g. .000123
--   (7) if scientific shorter, convert, e.g.
--       .000123 -> 123e-6
-- * scientific:
--   (1) split into (digits dot digits) [eE] ([+-] digits)
--   (2) if significand has ".", shift it out so it becomes an integer
--   (3) if significand is zero, just use zero
--   (4) remove leading zeros for significand
--   (5) shift out trailing zeros for significand
--   (6) examine exponent and determine which format is best:
--       integer, with fraction, scientific
-- * the token text in sinfos[i] is rewritten in place when a different
--   representation is chosen; nothing is returned
------------------------------------------------------------------------

local function do_number(i)
  local before = sinfos[i]      -- 'before'
  local z = before              -- working representation
  local y                       -- 'after', if better
  --------------------------------------------------------------------
  if match(z, "^0[xX]") then            -- hexadecimal number
    local n = base.tonumber(z)
    local v = base.tostring(n)
    -- tostring() only keeps a limited number of significant digits, so
    -- the decimal text is accepted only if it is a plain number that
    -- converts back to exactly the same value (rejects e.g. "inf" and
    -- rounded renderings of very large literals)
    if #v <= #z and match(v, "^%d") and base.tonumber(v) == n then
      z = v                     -- change to integer, AND continue
    else
      return                    -- no change; stick to hex
    end
  end
  --------------------------------------------------------------------
  if match(z, "^%d+%.?0*$") then        -- integer or has useless frac
    z = match(z, "^(%d+)%.?0*$")        -- int portion only
    if z + 0 > 0 then
      z = match(z, "^0*([1-9]%d*)$")    -- remove leading zeros
      local v = #match(z, "0*$")        -- # trailing zeros
      local nv = base.tostring(v)
      if v > #nv + 1 then               -- scientific is shorter
        z = sub(z, 1, #z - v).."e"..nv
      end
      y = z
    else
      y = "0"                   -- basic zero
    end
  --------------------------------------------------------------------
  elseif not match(z, "[eE]") then      -- number with fraction part
    local p, q = match(z, "^(%d*)%.(%d+)$")     -- split
    if p == "" then p = 0 end           -- int part zero
    if q + 0 == 0 and p == 0 then
      y = "0"                   -- degenerate .000 case
    else
      -- now, q > 0 holds and p is a number
      local v = #match(q, "0*$")        -- remove trailing zeros
      if v > 0 then
        q = sub(q, 1, #q - v)
      end
      -- if p > 0, nothing else we can do to simplify p.q case
      if p + 0 > 0 then
        y = p.."."..q
      else
        y = "."..q              -- tentative, e.g. .000123
        local v = #match(q, "^0*")      -- # leading zeros
        local w = #q - v                -- # significant digits
        local nv = base.tostring(#q)
        -- e.g. compare 123e-6 versus .000123
        if w + 2 + #nv < 1 + #q then
          y = sub(q, -w).."e-"..nv
        end
      end
    end
  --------------------------------------------------------------------
  else                                  -- scientific number
    local sig, ex = match(z, "^([^eE]+)[eE]([%+%-]?%d+)$")
    ex = base.tonumber(ex)
    -- if got ".", shift out fractional portion of significand
    local p, q = match(sig, "^(%d*)%.(%d*)$")
    if p then
      ex = ex - #q
      sig = p..q
    end
    if sig + 0 == 0 then
      y = "0"                   -- basic zero
    else
      local v = #match(sig, "^0*")      -- remove leading zeros
      sig = sub(sig, v + 1)
      v = #match(sig, "0*$")            -- shift out trailing zeros
      if v > 0 then
        sig = sub(sig, 1, #sig - v)
        ex = ex + v
      end
      -- examine exponent and determine which format is best
      local nex = base.tostring(ex)
      if ex == 0 then                   -- it's just an integer
        y = sig
      elseif ex > 0 and (ex <= 1 + #nex) then   -- a number
        y = sig..rep("0", ex)
      elseif ex < 0 and (ex >= -#sig) then      -- fraction, e.g. .123
        v = #sig + ex
        y = sub(sig, 1, v).."."..sub(sig, v + 1)
      elseif ex < 0 and (#nex >= -ex - #sig) then
        -- e.g. compare 1234e-5 versus .01234
        -- gives: #sig + 1 + #nex >= 1 + (-ex - #sig) + #sig
        --     -> #nex >= -ex - #sig
        v = -ex - #sig
        y = "."..rep("0", v)..sig
      else                              -- non-canonical scientific representation
        y = sig.."e"..ex
      end
    end--if sig
  end
  --------------------------------------------------------------------
  if y and y ~= sinfos[i] then
    if opt_details then
      print("<number> (line "..stoklns[i]..") "..sinfos[i].." -> "..y)
      opt_details = opt_details + 1
    end
    sinfos[i] = y
  end
end

------------------------------------------------------------------------
-- string optimization
-- * note: works on well-formed strings only!
-- * optimizations on characters can be summarized as follows:
--   \a\b\f\n\r\t\v -- no change
--   \\ -- no change
--   \"\' -- depends on delim, other can remove \
--   \[\] -- remove \
--   \<char> -- general escape, remove \
--   \<eol> -- normalize the EOL only
--   \ddd -- if \a\b\f\n\r\t\v, change to latter
--     if other < ascii 32, keep ddd but zap leading zeros, unless a
--       digit follows (it would be absorbed into the shortened escape)
--     if >= ascii 32, translate it into the literal, then also
--       do escapes for \\,\",\' cases
--   <other> -- no change
-- * switch delimiters if string becomes shorter
-- * the token text in sinfos[I] is rewritten in place
------------------------------------------------------------------------

local function do_string(I)
  local info = sinfos[I]
  local delim = sub(info, 1, 1)                 -- delimiter used
  local ndelim = (delim == "'") and '"' or "'"  -- opposite " <-> '
  local z = sub(info, 2, -2)                    -- actual string
  local i = 1
  local c_delim, c_ndelim = 0, 0                -- "/' counts
  --------------------------------------------------------------------
  while i <= #z do
    local c = sub(z, i, i)
    ----------------------------------------------------------------
    if c == "\\" then                   -- escaped stuff
      local j = i + 1
      local d = sub(z, j, j)
      -- position in this list selects the kind of escape
      local p = find("abfnrtv\\\n\r\"\'0123456789", d, 1, true)
      ------------------------------------------------------------
      if not p then                     -- \<char> -- remove \
        z = sub(z, 1, i - 1)..sub(z, j)
        i = i + 1
      ------------------------------------------------------------
      elseif p <= 8 then                -- \a\b\f\n\r\t\v\\
        i = i + 2                       -- no change
      ------------------------------------------------------------
      elseif p <= 10 then               -- \<eol> -- normalize EOL
        local eol = sub(z, j, j + 1)
        if eol == "\r\n" or eol == "\n\r" then
          z = sub(z, 1, i).."\n"..sub(z, j + 2)
        elseif p == 10 then             -- \r case
          z = sub(z, 1, i).."\n"..sub(z, j + 1)
        end
        i = i + 2
      ------------------------------------------------------------
      elseif p <= 12 then               -- \"\' -- remove \ for ndelim
        if d == delim then
          c_delim = c_delim + 1
          i = i + 2
        else
          c_ndelim = c_ndelim + 1
          z = sub(z, 1, i - 1)..sub(z, j)
          i = i + 1
        end
      ------------------------------------------------------------
      else                              -- \ddd -- various steps
        local s = match(z, "^(%d%d?%d?)", j)
        j = i + 1 + #s                  -- skip to location
        local cv = s + 0
        local cc = string.char(cv)
        local p = find("\a\b\f\n\r\t\v", cc, 1, true)
        if p then                       -- special escapes
          s = "\\"..sub("abfnrtv", p, p)
        elseif cv < 32 then             -- normalized \ddd
          if match(sub(z, j, j), "^%d$") then
            -- a digit follows, so the escape already has three digits
            -- and must keep them: "\0012" is not the same as "\12"
            s = "\\"..s
          else
            s = "\\"..cv
          end
        elseif cc == delim then         -- \<delim>
          s = "\\"..cc
          c_delim = c_delim + 1
        elseif cc == "\\" then          -- \\
          s = "\\\\"
        else                            -- literal character
          s = cc
          if cc == ndelim then
            c_ndelim = c_ndelim + 1
          end
        end
        z = sub(z, 1, i - 1)..s..sub(z, j)
        i = i + #s
      ------------------------------------------------------------
      end--if p
    ----------------------------------------------------------------
    else-- c ~= "\\"                    -- <other> -- no change
      i = i + 1
      if c == ndelim then       -- count ndelim, for switching delimiters
        c_ndelim = c_ndelim + 1
      end
    ----------------------------------------------------------------
    end--if c
  end--while
  --------------------------------------------------------------------
  -- switching delimiters, a long-winded derivation:
  -- (1) delim takes 2+2*c_delim bytes, ndelim takes c_ndelim bytes
  -- (2) delim becomes c_delim bytes, ndelim becomes 2+2*c_ndelim bytes
  -- simplifying the condition (1)>(2) --> c_delim > c_ndelim
  if c_delim > c_ndelim then
    i = 1
    while i <= #z do
      local p, _, r = find(z, "([\'\"])", i)
      if not p then break end
      if r == delim then                -- \<delim> -> <delim>
        z = sub(z, 1, p - 2)..sub(z, p)
        i = p
      else-- r == ndelim                -- <ndelim> -> \<ndelim>
        z = sub(z, 1, p - 1).."\\"..sub(z, p)
        i = p + 2
      end
    end--while
    delim = ndelim              -- actually change delimiters
  end
  --------------------------------------------------------------------
  z = delim..z..delim
  if z ~= sinfos[I] then
    if opt_details then
      print("<string> (line "..stoklns[I]..") "..sinfos[I].." -> "..z)
      opt_details = opt_details + 1
    end
    sinfos[I] = z
  end
end

------------------------------------------------------------------------
-- finds the shortest usable long-bracket level for a body (private)
-- * y is the body text, sep the length of the current closing
--   delimiter, e.g. 3 for "]=]"
-- * returns the length of the shortest acceptable closing delimiter
--   that is shorter than sep, or nil if none can be used
-- * a candidate is acceptable only if it first occurs in body..closer
--   at the very end; this also rejects bodies ending in "]" or "]="
--   that would combine with the closer and terminate early
-- * level 0 is refused when the body contains "[[", since stock
--   Lua 5.1 reports that as a nested long bracket
------------------------------------------------------------------------

local function reduced_sep(y, sep)
  local okay
  local chk = sep - 1
  while chk >= 2 do
    local eqs = rep("=", chk - 2)
    local closer = "]"..eqs.."]"
    local first = find(y..closer, closer, 1, true)
    if first == #y + 1 and
       (chk > 2 or not find(y, "[[", 1, true)) then
      okay = chk                -- shortest acceptable so far
    end
    chk = chk - 1
  end
  return okay
end

------------------------------------------------------------------------
-- long string optimization
-- * note: warning flagged if trailing whitespace found, not trimmed
-- * the first newline is kept as is: dropping it would change the
--   string whenever another newline follows it
-- * normalize embedded newlines
-- * reduce '=' separators in delimiters if possible
------------------------------------------------------------------------

local function do_lstring(I)
  local info = sinfos[I]
  local delim1 = match(info, "^%[=*%[")  -- cut out delimiters
  local sep = #delim1
  local delim2 = sub(info, -sep, -1)
  local z = sub(info, sep + 1, -(sep + 1))  -- lstring without delims
  local y = ""
  local i = 1
  --------------------------------------------------------------------
  while true do
    local p, _, r, s = find(z, "([\r\n])([\r\n]?)", i)
    -- deal with a single line
    local ln
    if not p then
      ln = sub(z, i)
    elseif p >= i then
      ln = sub(z, i, p - 1)
    end
    if ln ~= "" then
      -- flag a warning if there are trailing spaces, won't optimize!
      if match(ln, "%s+$") then
        warn.lstring = "trailing whitespace in long string near line "..stoklns[I]
      end
      y = y..ln
    end
    if not p then  -- done if no more EOLs
      break
    end
    -- deal with line endings, normalize them
    i = p + 1
    if #s > 0 and r ~= s then  -- skip CRLF or LFCR
      i = i + 1
    end
    y = y.."\n"
  end--while
  --------------------------------------------------------------------
  -- handle possible deletion of one or more '=' separators
  if sep >= 3 then
    local okay = reduced_sep(y, sep)
    if okay then  -- change delimiters
      local eqs = rep("=", okay - 2)
      delim1, delim2 = "["..eqs.."[", "]"..eqs.."]"
    end
  end
  --------------------------------------------------------------------
  sinfos[I] = delim1..y..delim2
end

------------------------------------------------------------------------
-- long comment optimization
-- * note: does not remove first optional newline
-- * trim trailing whitespace
-- * normalize embedded newlines
-- * reduce '=' separators in delimiters if possible
------------------------------------------------------------------------

local function do_lcomment(I)
  local info = sinfos[I]
  local delim1 = match(info, "^%-%-%[=*%[")  -- cut out delimiters
  local sep = #delim1
  -- the closer "]=*]" is two bytes shorter than the "--[=*[" opener
  local delim2 = sub(info, -(sep - 2), -1)
  local z = sub(info, sep + 1, -(sep - 1))  -- comment without delims
  local y = ""
  local i = 1
  --------------------------------------------------------------------
  while true do
    local p, _, r, s = find(z, "([\r\n])([\r\n]?)", i)
    -- deal with a single line, extract and check trailing whitespace
    local ln
    if not p then
      ln = sub(z, i)
    elseif p >= i then
      ln = sub(z, i, p - 1)
    end
    if ln ~= "" then
      -- trim trailing whitespace if non-empty line
      local ws = match(ln, "%s*$")
      if #ws > 0 then ln = sub(ln, 1, -(#ws + 1)) end
      y = y..ln
    end
    if not p then  -- done if no more EOLs
      break
    end
    -- deal with line endings, normalize them
    i = p + 1
    if #s > 0 and r ~= s then  -- skip CRLF or LFCR
      i = i + 1
    end
    y = y.."\n"
  end--while
  --------------------------------------------------------------------
  -- handle possible deletion of one or more '=' separators
  sep = sep - 2                 -- now the length of the closer
  if sep >= 3 then
    local okay = reduced_sep(y, sep)
    if okay then  -- change delimiters
      local eqs = rep("=", okay - 2)
      delim1, delim2 = "--["..eqs.."[", "]"..eqs.."]"
    end
  end
  --------------------------------------------------------------------
  sinfos[I] = delim1..y..delim2
end

------------------------------------------------------------------------
-- short comment optimization
-- * trim trailing whitespace
------------------------------------------------------------------------

local function do_comment(i)
  local info = sinfos[i]
  local ws = match(info, "%s*$")        -- just look from end of string
  if #ws > 0 then
    info = sub(info, 1, -(#ws + 1))     -- trim trailing whitespace
  end
  sinfos[i] = info
end

------------------------------------------------------------------------
-- returns true if string found in long comment
-- * this is a feature to keep copyright or license texts
-- * opt_keep is searched as plain text (no pattern matching) inside
--   the comment body; false is returned when opt_keep is not set
------------------------------------------------------------------------

local function keep_lcomment(opt_keep, info)
  if not opt_keep then return false end  -- option not set
  local sep = #match(info, "^%-%-%[=*%[")  -- length of "--[=*["
  local z = sub(info, sep + 1, -(sep - 1))  -- comment without delims
  return find(z, opt_keep, 1, true) ~= nil  -- try to match
end

------------------------------------------------------------------------
-- main entry point
-- * currently, lexer processing has 2 passes
-- * processing is done on a line-oriented basis, which is easier to
--   grok due to the next point...
-- * since there are various options that can be enabled or disabled,
--   processing is a little messy or convoluted
-- * option: table of flags ("opt-comments", "opt-whitespace",
--   "opt-emptylines", "opt-eols", "opt-strings", "opt-numbers", KEEP,
--   DETAILS)
-- * toklist, semlist, toklnlist: parallel lists of token types, token
--   texts and line numbers; the lists passed in are modified in place
--   during pass 1
-- * returns the repacked token, text and line number lists
------------------------------------------------------------------------

function optimize(option, toklist, semlist, toklnlist)
  --------------------------------------------------------------------
  -- set option flags
  --------------------------------------------------------------------
  local opt_comments = option["opt-comments"]
  local opt_whitespace = option["opt-whitespace"]
  local opt_emptylines = option["opt-emptylines"]
  local opt_eols = option["opt-eols"]
  local opt_strings = option["opt-strings"]
  local opt_numbers = option["opt-numbers"]
  local opt_keep = option.KEEP
  opt_details = option.DETAILS and 0  -- upvalues for details display
  print = print or base.print
  if opt_eols then  -- forced settings, otherwise won't work properly
    opt_comments = true
    opt_whitespace = true
    opt_emptylines = true
  end
  --------------------------------------------------------------------
  -- variable initialization
  --------------------------------------------------------------------
  stoks, sinfos, stoklns                -- set source lists
    = toklist, semlist, toklnlist
  local i = 1                           -- token position
  local tok, info                       -- current token
  local prev    -- position of last grammar token
                -- on same line (for TK_SPACE stuff)
  --------------------------------------------------------------------
  -- changes a token, info pair
  -- * with no arguments, marks the current token as deleted
  --------------------------------------------------------------------
  local function settoken(tok, info, I)
    I = I or i
    stoks[I] = tok or ""
    sinfos[I] = info or ""
  end
  --------------------------------------------------------------------
  -- processing loop (PASS 1)
  --------------------------------------------------------------------
  while true do
    tok, info = stoks[i], sinfos[i]
    ----------------------------------------------------------------
    local atstart = atlinestart(i)      -- set line begin flag
    if atstart then prev = nil end
    ----------------------------------------------------------------
    if tok == "TK_EOS" then             -- end of stream/pass
      break
    ----------------------------------------------------------------
    elseif tok == "TK_KEYWORD" or       -- keywords, identifiers,
           tok == "TK_NAME" or          -- operators
           tok == "TK_OP" then
      -- TK_KEYWORD and TK_OP can't be optimized without a big
      -- optimization framework; it would be more of an optimizing
      -- compiler, not a source code compressor
      -- TK_NAME that are locals needs parser to analyze/optimize
      prev = i
    ----------------------------------------------------------------
    elseif tok == "TK_NUMBER" then      -- numbers
      if opt_numbers then
        do_number(i)  -- optimize
      end
      prev = i
    ----------------------------------------------------------------
    elseif tok == "TK_STRING" or        -- strings, long strings
           tok == "TK_LSTRING" then
      if opt_strings then
        if tok == "TK_STRING" then
          do_string(i)  -- optimize
        else
          do_lstring(i)  -- optimize
        end
      end
      prev = i
    ----------------------------------------------------------------
    elseif tok == "TK_COMMENT" then     -- short comments
      if opt_comments then
        if i == 1 and sub(info, 1, 1) == "#" then
          -- keep shbang comment, trim whitespace
          do_comment(i)
        else
          -- safe to delete, as a TK_EOL (or TK_EOS) always follows
          settoken()  -- remove entirely
        end
      elseif opt_whitespace then        -- trim whitespace only
        do_comment(i)
      end
    ----------------------------------------------------------------
    elseif tok == "TK_LCOMMENT" then    -- long comments
      if keep_lcomment(opt_keep, info) then
        ------------------------------------------------------------
        -- if --keep, we keep a long comment if <msg> is found;
        -- this is a feature to keep copyright or license texts
        if opt_whitespace then          -- trim whitespace only
          do_lcomment(i)
        end
        prev = i
      elseif opt_comments then
        local eols = commenteols(info)
        ------------------------------------------------------------
        -- prepare opt_emptylines case first, if a disposable token
        -- follows, current one is safe to dump, else keep a space;
        -- it is implied that the operation is safe for '-', because
        -- current is a TK_LCOMMENT, and must be separate from a '-'
        if is_faketoken[stoks[i + 1]] then
          settoken()  -- remove entirely
          tok = ""
        else
          settoken("TK_SPACE", " ")
        end
        ------------------------------------------------------------
        -- if there are embedded EOLs to keep and opt_emptylines is
        -- disabled, then switch the token into one or more EOLs
        if not opt_emptylines and eols > 0 then
          settoken("TK_EOL", rep("\n", eols))
        end
        ------------------------------------------------------------
        -- if optimizing whitespaces, force reinterpretation of the
        -- token to give a chance for the space to be optimized away
        if opt_whitespace and tok ~= "" then
          i = i - 1  -- to reinterpret
        end
        ------------------------------------------------------------
      else                              -- disabled case
        if opt_whitespace then          -- trim whitespace only
          do_lcomment(i)
        end
        prev = i
      end
    ----------------------------------------------------------------
    elseif tok == "TK_EOL" then         -- line endings
      if atstart and opt_emptylines then
        settoken()  -- remove entirely
      elseif info == "\r\n" or info == "\n\r" then
        -- normalize the rest of the EOLs for CRLF/LFCR only
        -- (note that TK_LCOMMENT can change into several EOLs)
        settoken("TK_EOL", "\n")
      end
    ----------------------------------------------------------------
    elseif tok == "TK_SPACE" then       -- whitespace
      if opt_whitespace then
        if atstart or atlineend(i) then
          -- delete leading and trailing whitespace
          settoken()  -- remove entirely
        else
          ------------------------------------------------------------
          -- at this point, since leading whitespace have been removed,
          -- there should be a either a real token or a TK_LCOMMENT
          -- prior to hitting this whitespace; the TK_LCOMMENT case
          -- only happens if opt_comments is disabled; so prev ~= nil
          local ptok = stoks[prev]
          if ptok == "TK_LCOMMENT" then
            -- previous TK_LCOMMENT can abut with anything
            settoken()  -- remove entirely
          else
            -- prev must be a grammar token; consecutive TK_SPACE
            -- tokens is impossible when optimizing whitespace
            local ntok = stoks[i + 1]
            if is_faketoken[ntok] then
              -- handle special case where a '-' cannot abut with
              -- either a short comment or a long comment
              if (ntok == "TK_COMMENT" or ntok == "TK_LCOMMENT") and
                 ptok == "TK_OP" and sinfos[prev] == "-" then
                -- keep token
              else
                settoken()  -- remove entirely
              end
            else--is_realtoken
              -- check a pair of grammar tokens, if can abut, then
              -- delete space token entirely, otherwise keep one space
              local s = checkpair(prev, i + 1)
              if s == "" then
                settoken()  -- remove entirely
              else
                settoken("TK_SPACE", " ")
              end
            end
          end
          ------------------------------------------------------------
        end
      end
    ----------------------------------------------------------------
    else
      error("unidentified token encountered")
    end
    ----------------------------------------------------------------
    i = i + 1
  end--while
  repack_tokens()
  --------------------------------------------------------------------
  -- processing loop (PASS 2)
  --------------------------------------------------------------------
  if opt_eols then
    i = 1
    -- aggressive EOL removal only works with most non-grammar tokens
    -- optimized away because it is a rather simple scheme -- basically
    -- it just checks 'real' token pairs around EOLs
    if stoks[1] == "TK_COMMENT" then
      -- first comment still existing must be shbang, skip whole line
      i = 3
    end
    while true do
      tok, info = stoks[i], sinfos[i]
      --------------------------------------------------------------
      -- a nil token means the shbang skip above jumped past TK_EOS
      -- (shbang line without a trailing EOL); stop instead of
      -- scanning forever
      if tok == "TK_EOS" or tok == nil then  -- end of stream/pass
        break
      --------------------------------------------------------------
      elseif tok == "TK_EOL" then       -- consider each TK_EOL
        local t1, t2 = stoks[i - 1], stoks[i + 1]
        if is_realtoken[t1] and is_realtoken[t2] then  -- sanity check
          local s = checkpair(i - 1, i + 1)
          if s == "" then
            settoken()  -- remove entirely
          end
        end
      end--if tok
      --------------------------------------------------------------
      i = i + 1
    end--while
    repack_tokens()
  end
  --------------------------------------------------------------------
  if opt_details and opt_details > 0 then print() end -- spacing
  return stoks, sinfos, stoklns
end