summaryrefslogtreecommitdiffhomepage
path: root/contrib/luasrcdiet/lua/optparser.lua
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/luasrcdiet/lua/optparser.lua')
-rw-r--r--contrib/luasrcdiet/lua/optparser.lua564
1 files changed, 564 insertions, 0 deletions
diff --git a/contrib/luasrcdiet/lua/optparser.lua b/contrib/luasrcdiet/lua/optparser.lua
new file mode 100644
index 000000000..cfe6cc101
--- /dev/null
+++ b/contrib/luasrcdiet/lua/optparser.lua
@@ -0,0 +1,564 @@
+--[[--------------------------------------------------------------------
+
+ optparser.lua: does parser-based optimizations
+ This file is part of LuaSrcDiet.
+
+ Copyright (c) 2008 Kein-Hong Man <khman@users.sf.net>
+ The COPYRIGHT file describes the conditions
+ under which this software may be distributed.
+
+ See the ChangeLog for more information.
+
+----------------------------------------------------------------------]]
+
+--[[--------------------------------------------------------------------
+-- NOTES:
+-- * For more parser-based optimization ideas, see the TODO items or
+-- look at technotes.txt.
+-- * The processing load is quite significant, but since this is an
+-- off-line text processor, I believe we can wait a few seconds.
+-- * TODO: might process "local a,a,a" wrongly... need tests!
+-- * TODO: remove position handling if overlapped locals (rem < 0)
+-- needs more study, to check behaviour
+-- * TODO: there are probably better ways to do allocation, e.g. by
+-- choosing better methods to sort and pick locals...
+-- * TODO: we don't need 53*63 two-letter identifiers; we can make
+-- do with significantly less depending on how many that are really
+-- needed and improve entropy; e.g. 13 needed -> choose 4*4 instead
+----------------------------------------------------------------------]]
+
+local base = _G
+local string = require "string"
+local table = require "table"
+module "optparser"
+
+----------------------------------------------------------------------
+-- Letter frequencies for reducing symbol entropy (fixed version)
+-- * Might help a wee bit when the output file is compressed
+-- * See Wikipedia: http://en.wikipedia.org/wiki/Letter_frequencies
+-- * We use letter frequencies according to a Linotype keyboard, plus
+-- the underscore, and both lower case and upper case letters.
+-- * The arrangement below (LC, underscore, %d, UC) is arbitrary.
+-- * This is certainly not optimal, but is quick-and-dirty and the
+-- process has no significant overhead
+----------------------------------------------------------------------
+
+local LETTERS = "etaoinshrdlucmfwypvbgkqjxz_ETAOINSHRDLUCMFWYPVBGKQJXZ"
+local ALPHANUM = "etaoinshrdlucmfwypvbgkqjxz_0123456789ETAOINSHRDLUCMFWYPVBGKQJXZ"
+
+-- names or identifiers that must be skipped
+-- * the first two lines are for keywords
+local SKIP_NAME = {}
+for v in string.gmatch([[
+and break do else elseif end false for function if in
+local nil not or repeat return then true until while
+self]], "%S+") do
+ SKIP_NAME[v] = true
+end
+
+------------------------------------------------------------------------
+-- variables and data structures
+------------------------------------------------------------------------
+
+local toklist, seminfolist, -- token lists
+ globalinfo, localinfo, -- variable information tables
+ globaluniq, localuniq, -- unique name tables
+ var_new, -- index of new variable names
+ varlist -- list of output variables
+
+----------------------------------------------------------------------
+-- preprocess information table to get lists of unique names
+----------------------------------------------------------------------
+
+local function preprocess(infotable)
+ local uniqtable = {}
+ for i = 1, #infotable do -- enumerate info table
+ local obj = infotable[i]
+ local name = obj.name
+ --------------------------------------------------------------------
+ if not uniqtable[name] then -- not found, start an entry
+ uniqtable[name] = {
+ decl = 0, token = 0, size = 0,
+ }
+ end
+ --------------------------------------------------------------------
+ local uniq = uniqtable[name] -- count declarations, tokens, size
+ uniq.decl = uniq.decl + 1
+ local xref = obj.xref
+ local xcount = #xref
+ uniq.token = uniq.token + xcount
+ uniq.size = uniq.size + xcount * #name
+ --------------------------------------------------------------------
+ if obj.decl then -- if local table, create first,last pairs
+ obj.id = i
+ obj.xcount = xcount
+ if xcount > 1 then -- if ==1, means local never accessed
+ obj.first = xref[2]
+ obj.last = xref[xcount]
+ end
+ --------------------------------------------------------------------
+ else -- if global table, add a back ref
+ uniq.id = i
+ end
+ --------------------------------------------------------------------
+ end--for
+ return uniqtable
+end
+
+----------------------------------------------------------------------
+-- calculate actual symbol frequencies, in order to reduce entropy
+-- * this may help further reduce the size of compressed sources
+-- * note that since parsing optimizations is put before lexing
+-- optimizations, the frequency table is not exact!
+-- * yes, this will miss --keep block comments too...
+----------------------------------------------------------------------
+
+local function recalc_for_entropy(option)
+ local byte = string.byte
+ local char = string.char
+ -- table of token classes to accept in calculating symbol frequency
+ local ACCEPT = {
+ TK_KEYWORD = true, TK_NAME = true, TK_NUMBER = true,
+ TK_STRING = true, TK_LSTRING = true,
+ }
+ if not option["opt-comments"] then
+ ACCEPT.TK_COMMENT = true
+ ACCEPT.TK_LCOMMENT = true
+ end
+ --------------------------------------------------------------------
+ -- create a new table and remove any original locals by filtering
+ --------------------------------------------------------------------
+ local filtered = {}
+ for i = 1, #toklist do
+ filtered[i] = seminfolist[i]
+ end
+ for i = 1, #localinfo do -- enumerate local info table
+ local obj = localinfo[i]
+ local xref = obj.xref
+ for j = 1, obj.xcount do
+ local p = xref[j]
+ filtered[p] = "" -- remove locals
+ end
+ end
+ --------------------------------------------------------------------
+ local freq = {} -- reset symbol frequency table
+ for i = 0, 255 do freq[i] = 0 end
+ for i = 1, #toklist do -- gather symbol frequency
+ local tok, info = toklist[i], filtered[i]
+ if ACCEPT[tok] then
+ for j = 1, #info do
+ local c = byte(info, j)
+ freq[c] = freq[c] + 1
+ end
+ end--if
+ end--for
+ --------------------------------------------------------------------
+ -- function to re-sort symbols according to actual frequencies
+ --------------------------------------------------------------------
+ local function resort(symbols)
+ local symlist = {}
+ for i = 1, #symbols do -- prepare table to sort
+ local c = byte(symbols, i)
+ symlist[i] = { c = c, freq = freq[c], }
+ end
+ table.sort(symlist, -- sort selected symbols
+ function(v1, v2)
+ return v1.freq > v2.freq
+ end
+ )
+ local charlist = {} -- reconstitute the string
+ for i = 1, #symlist do
+ charlist[i] = char(symlist[i].c)
+ end
+ return table.concat(charlist)
+ end
+ --------------------------------------------------------------------
+ LETTERS = resort(LETTERS) -- change letter arrangement
+ ALPHANUM = resort(ALPHANUM)
+end
+
+----------------------------------------------------------------------
+-- returns a string containing a new local variable name to use, and
+-- a flag indicating whether it collides with a global variable
+-- * trapping keywords and other names like 'self' is done elsewhere
+----------------------------------------------------------------------
+
+local function new_var_name()
+ local var
+ local cletters, calphanum = #LETTERS, #ALPHANUM
+ local v = var_new
+ if v < cletters then -- single char
+ v = v + 1
+ var = string.sub(LETTERS, v, v)
+ else -- longer names
+ local range, sz = cletters, 1 -- calculate # chars fit
+ repeat
+ v = v - range
+ range = range * calphanum
+ sz = sz + 1
+ until range > v
+ local n = v % cletters -- left side cycles faster
+ v = (v - n) / cletters -- do first char first
+ n = n + 1
+ var = string.sub(LETTERS, n, n)
+ while sz > 1 do
+ local m = v % calphanum
+ v = (v - m) / calphanum
+ m = m + 1
+ var = var..string.sub(ALPHANUM, m, m)
+ sz = sz - 1
+ end
+ end
+ var_new = var_new + 1
+ return var, globaluniq[var] ~= nil
+end
+
+----------------------------------------------------------------------
+-- calculate and print some statistics
+-- * probably better in main source, put here for now
+----------------------------------------------------------------------
+
+local function stats_summary(globaluniq, localuniq, afteruniq, option)
+ local print = print or base.print
+ local fmt = string.format
+ local opt_details = option.DETAILS
+ local uniq_g , uniq_li, uniq_lo, uniq_ti, uniq_to, -- stats needed
+ decl_g, decl_li, decl_lo, decl_ti, decl_to,
+ token_g, token_li, token_lo, token_ti, token_to,
+ size_g, size_li, size_lo, size_ti, size_to
+ = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ local function avg(c, l) -- safe average function
+ if c == 0 then return 0 end
+ return l / c
+ end
+ --------------------------------------------------------------------
+ -- collect statistics (note: globals do not have declarations!)
+ --------------------------------------------------------------------
+ for name, uniq in base.pairs(globaluniq) do
+ uniq_g = uniq_g + 1
+ token_g = token_g + uniq.token
+ size_g = size_g + uniq.size
+ end
+ for name, uniq in base.pairs(localuniq) do
+ uniq_li = uniq_li + 1
+ decl_li = decl_li + uniq.decl
+ token_li = token_li + uniq.token
+ size_li = size_li + uniq.size
+ end
+ for name, uniq in base.pairs(afteruniq) do
+ uniq_lo = uniq_lo + 1
+ decl_lo = decl_lo + uniq.decl
+ token_lo = token_lo + uniq.token
+ size_lo = size_lo + uniq.size
+ end
+ uniq_ti = uniq_g + uniq_li
+ decl_ti = decl_g + decl_li
+ token_ti = token_g + token_li
+ size_ti = size_g + size_li
+ uniq_to = uniq_g + uniq_lo
+ decl_to = decl_g + decl_lo
+ token_to = token_g + token_lo
+ size_to = size_g + size_lo
+ --------------------------------------------------------------------
+ -- detailed stats: global list
+ --------------------------------------------------------------------
+ if opt_details then
+ local sorted = {} -- sort table of unique global names by size
+ for name, uniq in base.pairs(globaluniq) do
+ uniq.name = name
+ sorted[#sorted + 1] = uniq
+ end
+ table.sort(sorted,
+ function(v1, v2)
+ return v1.size > v2.size
+ end
+ )
+ local tabf1, tabf2 = "%8s%8s%10s %s", "%8d%8d%10.2f %s"
+ local hl = string.rep("-", 44)
+ print("*** global variable list (sorted by size) ***\n"..hl)
+ print(fmt(tabf1, "Token", "Input", "Input", "Global"))
+ print(fmt(tabf1, "Count", "Bytes", "Average", "Name"))
+ print(hl)
+ for i = 1, #sorted do
+ local uniq = sorted[i]
+ print(fmt(tabf2, uniq.token, uniq.size, avg(uniq.token, uniq.size), uniq.name))
+ end
+ print(hl)
+ print(fmt(tabf2, token_g, size_g, avg(token_g, size_g), "TOTAL"))
+ print(hl.."\n")
+ --------------------------------------------------------------------
+ -- detailed stats: local list
+ --------------------------------------------------------------------
+ local tabf1, tabf2 = "%8s%8s%8s%10s%8s%10s %s", "%8d%8d%8d%10.2f%8d%10.2f %s"
+ local hl = string.rep("-", 70)
+ print("*** local variable list (sorted by allocation order) ***\n"..hl)
+ print(fmt(tabf1, "Decl.", "Token", "Input", "Input", "Output", "Output", "Global"))
+ print(fmt(tabf1, "Count", "Count", "Bytes", "Average", "Bytes", "Average", "Name"))
+ print(hl)
+ for i = 1, #varlist do -- iterate according to order assigned
+ local name = varlist[i]
+ local uniq = afteruniq[name]
+ local old_t, old_s = 0, 0
+ for j = 1, #localinfo do -- find corresponding old names and calculate
+ local obj = localinfo[j]
+ if obj.name == name then
+ old_t = old_t + obj.xcount
+ old_s = old_s + obj.xcount * #obj.oldname
+ end
+ end
+ print(fmt(tabf2, uniq.decl, uniq.token, old_s, avg(old_t, old_s),
+ uniq.size, avg(uniq.token, uniq.size), name))
+ end
+ print(hl)
+ print(fmt(tabf2, decl_lo, token_lo, size_li, avg(token_li, size_li),
+ size_lo, avg(token_lo, size_lo), "TOTAL"))
+ print(hl.."\n")
+ end--if opt_details
+ --------------------------------------------------------------------
+ -- display output
+ --------------------------------------------------------------------
+ local tabf1, tabf2 = "%-16s%8s%8s%8s%8s%10s", "%-16s%8d%8d%8d%8d%10.2f"
+ local hl = string.rep("-", 58)
+ print("*** local variable optimization summary ***\n"..hl)
+ print(fmt(tabf1, "Variable", "Unique", "Decl.", "Token", "Size", "Average"))
+ print(fmt(tabf1, "Types", "Names", "Count", "Count", "Bytes", "Bytes"))
+ print(hl)
+ print(fmt(tabf2, "Global", uniq_g, decl_g, token_g, size_g, avg(token_g, size_g)))
+ print(hl)
+ print(fmt(tabf2, "Local (in)", uniq_li, decl_li, token_li, size_li, avg(token_li, size_li)))
+ print(fmt(tabf2, "TOTAL (in)", uniq_ti, decl_ti, token_ti, size_ti, avg(token_ti, size_ti)))
+ print(hl)
+ print(fmt(tabf2, "Local (out)", uniq_lo, decl_lo, token_lo, size_lo, avg(token_lo, size_lo)))
+ print(fmt(tabf2, "TOTAL (out)", uniq_to, decl_to, token_to, size_to, avg(token_to, size_to)))
+ print(hl.."\n")
+end
+
+----------------------------------------------------------------------
+-- main entry point
+-- * does only local variable optimization for now
+----------------------------------------------------------------------
+
+function optimize(option, _toklist, _seminfolist, _globalinfo, _localinfo)
+ -- set tables
+ toklist, seminfolist, globalinfo, localinfo
+ = _toklist, _seminfolist, _globalinfo, _localinfo
+ var_new = 0 -- reset variable name allocator
+ varlist = {}
+ ------------------------------------------------------------------
+ -- preprocess global/local tables, handle entropy reduction
+ ------------------------------------------------------------------
+ globaluniq = preprocess(globalinfo)
+ localuniq = preprocess(localinfo)
+ if option["opt-entropy"] then -- for entropy improvement
+ recalc_for_entropy(option)
+ end
+ ------------------------------------------------------------------
+ -- build initial declared object table, then sort according to
+ -- token count, this might help assign more tokens to more common
+ -- variable names such as 'e' thus possibly reducing entropy
+ -- * an object knows its localinfo index via its 'id' field
+ -- * special handling for "self" special local (parameter) here
+ ------------------------------------------------------------------
+ local object = {}
+ for i = 1, #localinfo do
+ object[i] = localinfo[i]
+ end
+ table.sort(object, -- sort largest first
+ function(v1, v2)
+ return v1.xcount > v2.xcount
+ end
+ )
+ ------------------------------------------------------------------
+ -- the special "self" function parameters must be preserved
+ -- * the allocator below will never use "self", so it is safe to
+ -- keep those implicit declarations as-is
+ ------------------------------------------------------------------
+ local temp, j, gotself = {}, 1, false
+ for i = 1, #object do
+ local obj = object[i]
+ if not obj.isself then
+ temp[j] = obj
+ j = j + 1
+ else
+ gotself = true
+ end
+ end
+ object = temp
+ ------------------------------------------------------------------
+ -- a simple first-come first-served heuristic name allocator,
+ -- note that this is in no way optimal...
+ -- * each object is a local variable declaration plus existence
+ -- * the aim is to assign short names to as many tokens as possible,
+ -- so the following tries to maximize name reuse
+ -- * note that we preserve sort order
+ ------------------------------------------------------------------
+ local nobject = #object
+ while nobject > 0 do
+ local varname, gcollide
+ repeat
+ varname, gcollide = new_var_name() -- collect a variable name
+ until not SKIP_NAME[varname] -- skip all special names
+ varlist[#varlist + 1] = varname -- keep a list
+ local oleft = nobject
+ ------------------------------------------------------------------
+ -- if variable name collides with an existing global, the name
+ -- cannot be used by a local when the name is accessed as a global
+ -- during which the local is alive (between 'act' to 'rem'), so
+ -- we drop objects that collides with the corresponding global
+ ------------------------------------------------------------------
+ if gcollide then
+ -- find the xref table of the global
+ local gref = globalinfo[globaluniq[varname].id].xref
+ local ngref = #gref
+ -- enumerate for all current objects; all are valid at this point
+ for i = 1, nobject do
+ local obj = object[i]
+ local act, rem = obj.act, obj.rem -- 'live' range of local
+ -- if rem < 0, it is a -id to a local that had the same name
+ -- so follow rem to extend it; does this make sense?
+ while rem < 0 do
+ rem = localinfo[-rem].rem
+ end
+ local drop
+ for j = 1, ngref do
+ local p = gref[j]
+ if p >= act and p <= rem then drop = true end -- in range?
+ end
+ if drop then
+ obj.skip = true
+ oleft = oleft - 1
+ end
+ end--for
+ end--if gcollide
+ ------------------------------------------------------------------
+ -- now the first unassigned local (since it's sorted) will be the
+ -- one with the most tokens to rename, so we set this one and then
+ -- eliminate all others that collides, then any locals that left
+ -- can then reuse the same variable name; this is repeated until
+ -- all local declaration that can use this name is assigned
+ -- * the criteria for local-local reuse/collision is:
+ -- A is the local with a name already assigned
+ -- B is the unassigned local under consideration
+ -- => anytime A is accessed, it cannot be when B is 'live'
+ -- => to speed up things, we have first/last accesses noted
+ ------------------------------------------------------------------
+ while oleft > 0 do
+ local i = 1
+ while object[i].skip do -- scan for first object
+ i = i + 1
+ end
+ ------------------------------------------------------------------
+ -- first object is free for assignment of the variable name
+ -- [first,last] gives the access range for collision checking
+ ------------------------------------------------------------------
+ oleft = oleft - 1
+ local obja = object[i]
+ i = i + 1
+ obja.newname = varname
+ obja.skip = true
+ obja.done = true
+ local first, last = obja.first, obja.last
+ local xref = obja.xref
+ ------------------------------------------------------------------
+ -- then, scan all the rest and drop those colliding
+ -- if A was never accessed then it'll never collide with anything
+ -- otherwise trivial skip if:
+ -- * B was activated after A's last access (last < act)
+ -- * B was removed before A's first access (first > rem)
+ -- if not, see detailed skip below...
+ ------------------------------------------------------------------
+ if first and oleft > 0 then -- must have at least 1 access
+ local scanleft = oleft
+ while scanleft > 0 do
+ while object[i].skip do -- next valid object
+ i = i + 1
+ end
+ scanleft = scanleft - 1
+ local objb = object[i]
+ i = i + 1
+ local act, rem = objb.act, objb.rem -- live range of B
+ -- if rem < 0, extend range of rem thru' following local
+ while rem < 0 do
+ rem = localinfo[-rem].rem
+ end
+ --------------------------------------------------------
+ if not(last < act or first > rem) then -- possible collision
+ --------------------------------------------------------
+ -- B is activated later than A or at the same statement,
+ -- this means for no collision, A cannot be accessed when B
+ -- is alive, since B overrides A (or is a peer)
+ --------------------------------------------------------
+ if act >= obja.act then
+ for j = 1, obja.xcount do -- ... then check every access
+ local p = xref[j]
+ if p >= act and p <= rem then -- A accessed when B live!
+ oleft = oleft - 1
+ objb.skip = true
+ break
+ end
+ end--for
+ --------------------------------------------------------
+ -- A is activated later than B, this means for no collision,
+ -- A's access is okay since it overrides B, but B's last
+ -- access need to be earlier than A's activation time
+ --------------------------------------------------------
+ else
+ if objb.last and objb.last >= obja.act then
+ oleft = oleft - 1
+ objb.skip = true
+ end
+ end
+ end
+ --------------------------------------------------------
+ if oleft == 0 then break end
+ end
+ end--if first
+ ------------------------------------------------------------------
+ end--while
+ ------------------------------------------------------------------
+ -- after assigning all possible locals to one variable name, the
+ -- unassigned locals/objects have the skip field reset and the table
+ -- is compacted, to hopefully reduce iteration time
+ ------------------------------------------------------------------
+ local temp, j = {}, 1
+ for i = 1, nobject do
+ local obj = object[i]
+ if not obj.done then
+ obj.skip = false
+ temp[j] = obj
+ j = j + 1
+ end
+ end
+ object = temp -- new compacted object table
+ nobject = #object -- objects left to process
+ ------------------------------------------------------------------
+ end--while
+ ------------------------------------------------------------------
+ -- after assigning all locals with new variable names, we can
+ -- patch in the new names, and reprocess to get 'after' stats
+ ------------------------------------------------------------------
+ for i = 1, #localinfo do -- enumerate all locals
+ local obj = localinfo[i]
+ local xref = obj.xref
+ if obj.newname then -- if got new name, patch it in
+ for j = 1, obj.xcount do
+ local p = xref[j] -- xrefs indexes the token list
+ seminfolist[p] = obj.newname
+ end
+ obj.name, obj.oldname -- adjust names
+ = obj.newname, obj.name
+ else
+ obj.oldname = obj.name -- for cases like 'self'
+ end
+ end
+ ------------------------------------------------------------------
+ -- deal with statistics output
+ ------------------------------------------------------------------
+ if gotself then -- add 'self' to end of list
+ varlist[#varlist + 1] = "self"
+ end
+ local afteruniq = preprocess(localinfo)
+ stats_summary(globaluniq, localuniq, afteruniq, option)
+ ------------------------------------------------------------------
+end