--[[--------------------------------------------------------------------
llex.lua: Lua 5.1 lexical analyzer in Lua
This file is part of LuaSrcDiet, based on Yueliang material.
Copyright (c) 2008 Kein-Hong Man <khman@users.sf.net>
The COPYRIGHT file describes the conditions
under which this software may be distributed.
See the ChangeLog for more information.
----------------------------------------------------------------------]]
--[[--------------------------------------------------------------------
-- NOTES:
-- * This is a version of the native 5.1.x lexer from Yueliang 0.4.0,
--   with significant modifications to handle LuaSrcDiet's needs:
--   (1) llex.error is an optional error function handler
--   (2) seminfo for strings includes their delimiters, and no
--       translation operations are performed on them
-- * shbang handling has been ADDED to support executable scripts
-- * NO localized decimal point replacement magic
-- * NO limit to number of lines
-- * NO support for compatible long strings (LUA_COMPAT_LSTR)
-- * Please read technotes.txt for more technical details.
----------------------------------------------------------------------]]
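--[[--------------------------------------------------------------------
-- USAGE SKETCH: a minimal illustration of driving this module, kept
-- here as a comment. It assumes the file is reachable as "llex" on
-- package.path; tok, seminfo and tokln are the module fields
-- documented in init() below.
--
--   local llex = require "llex"
--   llex.init("local x = 1", "@example.lua")
--   llex.llex()
--   for i = 1, #llex.tok do
--     print(llex.tokln[i], llex.tok[i], llex.seminfo[i])
--   end
----------------------------------------------------------------------]]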
local base = _G
local string = require "string"
module "llex"
local find = string.find
local match = string.match
local sub = string.sub
----------------------------------------------------------------------
-- initialize keyword list, variables
----------------------------------------------------------------------
local kw = {}
for v in string.gmatch([[
and break do else elseif end false for function if in
local nil not or repeat return then true until while]], "%S+") do
  kw[v] = true
end
-- NOTE: see init() for module variables (externally visible):
-- tok, seminfo, tokln
local z,        -- source stream
      sourceid, -- name of source
      I,        -- position of lexer
      buff,     -- start position of the current token's text
      ln        -- line number
----------------------------------------------------------------------
-- add information to token listing
----------------------------------------------------------------------
local function addtoken(token, info)
  local i = #tok + 1
  tok[i] = token
  seminfo[i] = info
  tokln[i] = ln
end
----------------------------------------------------------------------
-- increments the line number and handles end-of-line characters
----------------------------------------------------------------------
local function inclinenumber(i, is_tok)
  local sub = sub
  local old = sub(z, i, i)
  i = i + 1  -- skip '\n' or '\r'
  local c = sub(z, i, i)
  if (c == "\n" or c == "\r") and (c ~= old) then
    i = i + 1  -- skip '\n\r' or '\r\n'
    old = old..c
  end
  if is_tok then addtoken("TK_EOL", old) end
  ln = ln + 1
  I = i
  return i
end
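----------------------------------------------------------------------
-- illustration (a sketch of the behavior above, not executed): with
-- z = "A\r\nB", calling inclinenumber(2, true) consumes "\r\n" as a
-- single end-of-line, adds one TK_EOL token with seminfo "\r\n",
-- increments ln by one, and returns 4 (the position of "B")
----------------------------------------------------------------------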
----------------------------------------------------------------------
-- initialize lexer for given source _z and source name _sourceid
----------------------------------------------------------------------
function init(_z, _sourceid)
  z = _z                    -- source
  sourceid = _sourceid      -- name of source
  I = 1                     -- lexer's position in source
  ln = 1                    -- line number
  tok = {}                  -- lexed token list*
  seminfo = {}              -- lexed semantic information list*
  tokln = {}                -- line numbers for messages*
                            -- (*) externally visible thru' module
  --------------------------------------------------------------------
  -- initial processing (shbang handling)
  --------------------------------------------------------------------
  local p, _, q, r = find(z, "^(#[^\r\n]*)(\r?\n?)")
  if p then  -- skip first line
    I = I + #q
    addtoken("TK_COMMENT", q)
    if #r > 0 then inclinenumber(I, true) end
  end
end
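----------------------------------------------------------------------
-- shbang illustration (assumed behavior of the code above): a source
-- beginning "#!/usr/bin/env lua\n..." yields TK_COMMENT with seminfo
-- "#!/usr/bin/env lua", then TK_EOL for the "\n", so llex() resumes
-- cleanly at the second line
----------------------------------------------------------------------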
----------------------------------------------------------------------
-- returns a chunk name or id, no truncation for long names
----------------------------------------------------------------------
function chunkid()
  if sourceid and match(sourceid, "^[=@]") then
    return sub(sourceid, 2)  -- remove first char
  end
  return "[string]"
end
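----------------------------------------------------------------------
-- examples: chunkid() maps "@foo.lua" to "foo.lua" and "=stdin" to
-- "stdin"; any other (or missing) sourceid yields the generic "[string]"
----------------------------------------------------------------------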
----------------------------------------------------------------------
-- formats error message and throws error
-- * a simplified version, does not report what token was responsible
----------------------------------------------------------------------
function errorline(s, line)
  local e = error or base.error  -- llex.error is an optional handler (see NOTES)
  e(string.format("%s:%d: %s", chunkid(), line or ln, s))
end
local errorline = errorline
------------------------------------------------------------------------
-- count separators ("=") in a long string delimiter
------------------------------------------------------------------------
local function skip_sep(i)
  local sub = sub
  local s = sub(z, i, i)
  i = i + 1
  local count = #match(z, "=*", i) -- note, take the length
  i = i + count
  I = i
  return (sub(z, i, i) == s) and count or (-count) - 1
end
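----------------------------------------------------------------------
-- return value sketch (derived from the code above), with i at the
-- first bracket of the candidate delimiter:
--   "[=["  -->  1   (level-1 long bracket)
--   "[["   -->  0   (level-0 long bracket)
--   "[=x"  --> -2   (broken delimiter: -(count + 1))
--   "[x"   --> -1   (plain bracket, no '=')
-- callers treat >= 0 as a valid long-bracket level
----------------------------------------------------------------------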
----------------------------------------------------------------------
-- reads a long string or long comment
----------------------------------------------------------------------
local function read_long_string(is_str, sep)
  local i = I + 1  -- skip 2nd '['
  local sub = sub
  local c = sub(z, i, i)
  if c == "\r" or c == "\n" then  -- string starts with a newline?
    i = inclinenumber(i)          -- skip it
  end
  while true do
    local p, q, r = find(z, "([\r\n%]])", i) -- (long range)
    if not p then
      errorline(is_str and "unfinished long string" or
                "unfinished long comment")
    end
    i = p
    if r == "]" then                    -- delimiter test
      if skip_sep(i) == sep then
        buff = sub(z, buff, I)  -- buff held the start position; now the text
        I = I + 1               -- skip 2nd ']'
        return buff
      end
      i = I
    else                                -- newline
      i = inclinenumber(i)
    end
  end--while
end
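----------------------------------------------------------------------
-- illustration (per the NOTES at the top): delimiters are kept, so
-- the source text [==[hi]==] lexes to one TK_LSTRING whose seminfo is
-- the literal text "[==[hi]==]", with no translation applied
----------------------------------------------------------------------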
----------------------------------------------------------------------
-- reads a string
----------------------------------------------------------------------
local function read_string(del)
  local i = I
  local find = find
  local sub = sub
  while true do
    local p, q, r = find(z, "([\n\r\\\"\'])", i) -- (long range)
    if p then
      if r == "\n" or r == "\r" then
        errorline("unfinished string")
      end
      i = p
      if r == "\\" then                 -- handle escapes
        i = i + 1
        r = sub(z, i, i)
        if r == "" then break end -- (EOZ error)
        p = find("abfnrtv\n\r", r, 1, true)
        ------------------------------------------------------
        if p then                       -- special escapes
          if p > 7 then  -- positions 8-9 hold the EOL characters
            i = inclinenumber(i)
          else
            i = i + 1
          end
        ------------------------------------------------------
        elseif find(r, "%D") then       -- other non-digits
          i = i + 1
        ------------------------------------------------------
        else                            -- \xxx sequence
          local p, q, s = find(z, "^(%d%d?%d?)", i)
          i = q + 1
          if s + 1 > 256 then -- UCHAR_MAX
            errorline("escape sequence too large")
          end
        ------------------------------------------------------
        end--if p
      else
        i = i + 1
        if r == del then                -- ending delimiter
          I = i
          return sub(z, buff, i - 1)    -- return string, delimiters included
        end
      end--if r
    else
      break -- (error)
    end--if p
  end--while
  errorline("unfinished string")
end
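----------------------------------------------------------------------
-- illustration (assumed behavior): escapes are validated but left
-- untranslated, and both quotes are kept, so the source text "a\tb"
-- lexes to a TK_STRING whose seminfo is the raw six characters,
-- backslash and quotes included
----------------------------------------------------------------------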
------------------------------------------------------------------------
-- main lexer function
------------------------------------------------------------------------
function llex()
  local find = find
  local match = match
  while true do--outer
    local i = I
    -- inner loop allows break to be used to nicely section tests
    while true do--inner
      ----------------------------------------------------------------
      local p, _, r = find(z, "^([_%a][_%w]*)", i)
      if p then
        I = i + #r
        if kw[r] then
          addtoken("TK_KEYWORD", r)     -- reserved word (keyword)
        else
          addtoken("TK_NAME", r)        -- identifier
        end
        break -- (continue)
      end
      ----------------------------------------------------------------
      local p, _, r = find(z, "^(%.?)%d", i)
      if p then                                 -- numeral
        if r == "." then i = i + 1 end
        local _, q, r = find(z, "^%d*[%.%d]*([eE]?)", i)
        i = q + 1
        if #r == 1 then                         -- optional exponent
          if match(z, "^[%+%-]", i) then        -- optional sign
            i = i + 1
          end
        end
        local _, q = find(z, "^[_%w]*", i)
        I = q + 1
        local v = sub(z, p, q)                  -- string equivalent
        if not base.tonumber(v) then            -- handles hex test also
          errorline("malformed number")
        end
        addtoken("TK_NUMBER", v)
        break -- (continue)
      end
      ----------------------------------------------------------------
      local p, q, r, t = find(z, "^((%s)[ \t\v\f]*)", i)
      if p then
        if t == "\n" or t == "\r" then          -- newline
          inclinenumber(i, true)
        else
          I = q + 1                             -- whitespace
          addtoken("TK_SPACE", r)
        end
        break -- (continue)
      end
      ----------------------------------------------------------------
      local r = match(z, "^%p", i)
      if r then
        buff = i
        local p = find("-[\"\'.=<>~", r, 1, true)
        -- p indexes the dispatch string above: 1 '-', 2 '[',
        -- 3..4 quotes, 5 '.', 6..9 chars that may precede '='
        if p then
          -- two-level if block for punctuation/symbols
          --------------------------------------------------------
          if p <= 2 then
            if p == 1 then                      -- minus
              local c = match(z, "^%-%-(%[?)", i)
              if c then
                i = i + 2
                local sep = -1
                if c == "[" then
                  sep = skip_sep(i)
                end
                if sep >= 0 then                -- long comment
                  addtoken("TK_LCOMMENT", read_long_string(false, sep))
                else                            -- short comment
                  I = find(z, "[\n\r]", i) or (#z + 1)
                  addtoken("TK_COMMENT", sub(z, buff, I - 1))
                end
                break -- (continue)
              end
              -- (fall through for "-")
            else                                -- [ or long string
              local sep = skip_sep(i)
              if sep >= 0 then
                addtoken("TK_LSTRING", read_long_string(true, sep))
              elseif sep == -1 then
                addtoken("TK_OP", "[")
              else
                errorline("invalid long string delimiter")
              end
              break -- (continue)
            end
          --------------------------------------------------------
          elseif p <= 5 then
            if p < 5 then                       -- strings
              I = i + 1
              addtoken("TK_STRING", read_string(r))
              break -- (continue)
            end
            r = match(z, "^%.%.?%.?", i)        -- .|..|... dots
            -- (fall through)
          --------------------------------------------------------
          else                                  -- relational
            r = match(z, "^%p=?", i)
            -- (fall through)
          end
        end
        I = i + #r
        addtoken("TK_OP", r)  -- for other symbols, fall through
        break -- (continue)
      end
      ----------------------------------------------------------------
      local r = sub(z, i, i)
      if r ~= "" then
        I = i + 1
        addtoken("TK_OP", r)                    -- other single-char tokens
        break
      end
      addtoken("TK_EOS", "")                    -- end of stream,
      return                                    -- exit here
      ----------------------------------------------------------------
    end--while inner
  end--while outer
end
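----------------------------------------------------------------------
-- token stream sketch (illustrative, following the rules above):
-- lexing the source   local s = [[hi]] --x
-- yields, in order:
--   TK_KEYWORD "local", TK_SPACE " ", TK_NAME "s", TK_SPACE " ",
--   TK_OP "=", TK_SPACE " ", TK_LSTRING "[[hi]]", TK_SPACE " ",
--   TK_COMMENT "--x", TK_EOS ""
----------------------------------------------------------------------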
return base.getfenv()