Compare commits

...

24 Commits

Author SHA1 Message Date
df8e9f7e83 refactoring: switched to match/case
wow! python actually added switch cases! too bad this is just syntax sugar...
2023-12-09 12:01:04 -06:00
a22aa808e0 lp: added support for OP_TEST 2022-08-26 01:18:24 -05:00
935844f274 more minor refactoring 2022-08-22 00:59:21 -05:00
c37e9a21d8 ld: minor refactoring 2022-08-22 00:54:59 -05:00
34b1ec7285 ld: added LuaDump()
- chunks can now be serialized back into lua bytecode dumps :D
2022-08-22 00:50:08 -05:00
f9f1d4af00 ld: minor refactoring 2022-08-19 15:46:47 -05:00
3be45f156a lp: support OP_CLOSURE, boilerplate function/proto support 2022-08-17 22:14:45 -05:00
b28edcba1d lp: fix isValidLocal() not respecting capitals 2022-08-17 22:14:13 -05:00
bc4e762e26 lp: bug fix (forgot to transfer registers) 2022-08-16 00:26:50 -05:00
19bed999ee lp: added __parseNewTable(), better table pseudo-code 2022-08-16 00:12:26 -05:00
a248cc4807 lp: added NEWTABLE && SETLIST
- tables can now be (mostly) decompiled
- changed 'decompiled source' to 'pseudo-code' since the output doesn't typically match the compiled script source.
- misc. refactoring
2022-08-15 23:30:32 -05:00
9da0d0ffbd lp: support FORPREP && FORLOOP 2022-08-14 01:36:05 -05:00
5d91dbbc64 lundump.py: better instruction annotations 2022-08-12 17:31:15 -05:00
0d947f4f3d Added 'repeat .. until' support
- lines are now tracked by start & end PC
- new config option: annotatedLines. if true line annotations show start & end PC will be emitted
- lundump.py now shows locals for each proto
2022-08-12 17:08:31 -05:00
368ff62538 'not' not '!' 2022-08-12 00:10:47 -05:00
a6623c8953 lparser.py: if local name is invalid, resort to fallback 2022-08-12 00:04:11 -05:00
0f72e71a59 lparser.py: proper support for locals 2022-08-11 23:58:21 -05:00
b8bf02f7d0 lparser.py: minor refactoring, cleanup 2022-08-11 23:10:36 -05:00
95ca3bb26b lparser.py: added support for while loops 2022-08-11 17:26:48 -05:00
78e137d033 updated README 2022-08-11 16:56:42 -05:00
875e91636b lundump.py: SETGLOBAL uses K[] 2022-08-11 16:49:48 -05:00
055af56e27 lparser.py: don't read RK values here, use lundump.py to read them 2022-08-11 16:45:05 -05:00
eb1d3ffe87 lundump.py: minor refactoring, added instruction annotations 2022-08-11 16:43:34 -05:00
2258888956 lundump.py: Instruction.__readRK() should be a static method 2022-08-11 16:19:21 -05:00
5 changed files with 701 additions and 259 deletions

1
.gitignore vendored
View File

@@ -1,2 +1,3 @@
example.*
__pycache__
NOTES.md

107
README.md
View File

@@ -2,48 +2,105 @@
An experimental Lua 5.1 dump decompiler (typically dumped using `luac -o <out.luac> <script.lua>`).
You will quickly find that only **extremely** simple scripts are decompiled successfully right now. This is an experimental project and not all opcodes are properly handled for now. If you need a real decompiler I would recommend any of the handful of ones that exist already.
## Why?
Lua has a relatively small instruction set (only 38 different opcodes!), which makes it a feasible target for a weekend decompiler project. Real decompilers are extremely complex pieces of software, so writing a simpler one helps show the theory without *much* of the headache.
## Example usage
```sh
> cat example.lua && luac5.1 -o example.luac example.lua
pp = "pri" .. "nt"
local printMsg = function(append)
local tbl = {"He", "llo", " ", "Wo"}
local str = ""
if 2 + 2 == 4 then
_G[pp]("Hello world")
for i = 1, #tbl do
str = str .. tbl[i]
end
print(str .. append)
end
printMsg("rld!")
> python main.py example.luac
example.luac
==== [[example.lua's constants]] ====
0: [STRING] pp
1: [STRING] pri
2: [STRING] nt
3: [NUMBER] 4.0
4: [STRING] _G
5: [STRING] Hello world
0: [STRING] rld!
==== [[example.lua's locals]] ====
R[0]: printMsg
==== [[example.lua's dissassembly]] ====
[ 0] LOADK : R[0] K[1]
[ 1] LOADK : R[1] K[2]
[ 2] CONCAT : R[0] R[0] R[1]
[ 3] SETGLOBAL : R[0] R[0]
[ 4] EQ : R[0] K[3] K[3]
[ 5] JMP : R[0] R[5]
[ 6] GETGLOBAL : R[0] K[4]
[ 7] GETGLOBAL : R[1] K[0]
[ 8] GETTABLE : R[0] R[0] R[1]
[ 9] LOADK : R[1] K[5]
[ 10] CALL : R[0] R[2] R[1]
[ 11] RETURN : R[0] R[1] R[0]
[ 0] CLOSURE : R[0] 0 ;
[ 1] MOVE : 1 0 0 ; move R[0] into R[1]
[ 2] LOADK : R[2] K[0] ; load "rld!" into R[2]
[ 3] CALL : 1 2 1 ;
[ 4] RETURN : 0 1 0 ;
==== [[example.lua's decompiled source]] ====
==== [[example.lua's protos]] ====
pp = "pri" .. "nt"
if 4.0 == 4.0 then
_G[pp]("Hello world")
==== [['s constants]] ====
0: [STRING] He
1: [STRING] llo
2: [STRING]
3: [STRING] Wo
4: [STRING]
5: [NUMBER] 1.0
6: [STRING] print
==== [['s locals]] ====
R[0]: append
R[1]: tbl
R[2]: str
R[3]: (for index)
R[4]: (for limit)
R[5]: (for step)
R[6]: i
==== [['s dissassembly]] ====
[ 0] NEWTABLE : 1 4 0 ;
[ 1] LOADK : R[2] K[0] ; load "He" into R[2]
[ 2] LOADK : R[3] K[1] ; load "llo" into R[3]
[ 3] LOADK : R[4] K[2] ; load " " into R[4]
[ 4] LOADK : R[5] K[3] ; load "Wo" into R[5]
[ 5] SETLIST : 1 4 1 ;
[ 6] LOADK : R[2] K[4] ; load "" into R[2]
[ 7] LOADK : R[3] K[5] ; load 1 into R[3]
[ 8] LEN : 4 1 0 ;
[ 9] LOADK : R[5] K[5] ; load 1 into R[5]
[ 10] FORPREP : R[3] 3 ;
[ 11] MOVE : 7 2 0 ; move R[2] into R[7]
[ 12] GETTABLE : R[8] 1 R[6] ;
[ 13] CONCAT : 2 7 8 ; concat 2 values from R[7] to R[8], store into R[2]
[ 14] FORLOOP : R[3] -4 ;
[ 15] GETGLOBAL : R[3] K[6] ; move _G["print"] into R[3]
[ 16] MOVE : 4 2 0 ; move R[2] into R[4]
[ 17] MOVE : 5 0 0 ; move R[0] into R[5]
[ 18] CONCAT : 4 4 5 ; concat 2 values from R[4] to R[5], store into R[4]
[ 19] CALL : 3 2 1 ;
[ 20] RETURN : 0 1 0 ;
==== [[example.lua's pseudo-code]] ====
local printMsg = function(append)
local tbl = {"He", "llo", " ", "Wo", }
local str = ""
for i = 1, #tbl, 1 do
str = str .. tbl[i]
end
print(str .. append)
end
printMsg("rld!")
```

View File

@@ -1,90 +1,215 @@
'''
lparser.py
Depends on ldump.py for lua dump deserialization.
Depends on lundump.py for lua dump deserialization.
An experimental bytecode decompiler.
'''
from operator import concat
from subprocess import call
from lundump import Chunk, LuaUndump, Constant, Instruction, InstructionType, Opcodes
from lundump import Chunk, Constant, Instruction, Opcodes, whichRK, readRKasK
class _Scope:
def __init__(self, startPC: int, endPC: int):
self.startPC = startPC
self.endPC = endPC
class _Traceback:
def __init__(self):
self.sets = []
self.uses = []
self.isConst = False
class _Line:
def __init__(self, startPC: int, endPC: int, src: str, scope: int):
self.startPC = startPC
self.endPC = endPC
self.src = src
self.scope = scope
def isValidLocal(ident: str) -> bool:
    """Return True if `ident` can be used verbatim as a Lua local variable name.

    A valid name is non-empty, starts with an ASCII letter or underscore, continues
    with ASCII letters, digits or underscores, and is not a Lua 5.1 reserved word.

    ident: candidate name (e.g. a local name read from a chunk's debug info).
    """
    identStart = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_")
    identRest = identStart | frozenset("1234567890")
    reservedWords = frozenset((
        "and", "break", "do", "else", "elseif", "end", "false", "for", "function",
        "if", "in", "local", "nil", "not", "or", "repeat", "return", "then", "true",
        "until", "while",
    ))

    # an empty name would make ident[0] raise IndexError; it is simply not a valid identifier
    if not ident:
        return False

    # has to start with an alpha or _
    if ident[0] not in identStart:
        return False

    # then it can be alphanum or _
    if any(c not in identRest for c in ident[1:]):
        return False

    # reserved words match the identifier grammar but cannot name a variable
    return ident not in reservedWords
class LuaDecomp:
def __init__(self, chunk: Chunk):
def __init__(self, chunk: Chunk, headChunk: bool = True, scopeOffset: int = 0):
self.chunk = chunk
self.pc = 0
self.scope = []
self.scope: list[_Scope] = []
self.lines: list[_Line] = []
self.top = {}
self.locals = {}
self.traceback = {}
self.unknownLocalCount = 0
self.headChunk = headChunk
self.scopeOffset = scopeOffset # number of scopes this chunk/proto is in
self.src: str = ""
# configurations!
self.aggressiveLocals = False # should *EVERY* accessed register be considered a local?
self.aggressiveLocals = False # should *EVERY* set register be considered a local?
self.annotateLines = False
self.indexWidth = 4 # how many spaces for indentions?
self.__loadLocals()
if not self.headChunk:
functionProto = "function("
# define params
for i in range(self.chunk.numParams):
# add param to function prototype (also make a local in the register if it doesn't exist)
functionProto += ("%s, " if i+1 < self.chunk.numParams else "%s") % self.__makeLocalIdentifier(i)
# mark local as defined
self.__addSetTraceback(i)
functionProto += ")"
self.__startScope(functionProto, 0, len(self.chunk.instructions))
# parse instructions
while self.pc < len(self.chunk.instructions):
self.parseExpr()
self.parseInstr()
self.pc += 1
# end the scope (if we're supposed too)
self.__checkScope()
print("\n==== [[" + str(self.chunk.name) + "'s decompiled source]] ====\n")
print(self.src)
if not self.headChunk:
self.__endScope()
def getPseudoCode(self) -> str:
    """Render every recorded line into one newline-terminated pseudo-code string.

    Each line is indented by indexWidth spaces per level, where the level is the
    line's own scope plus this decompiler's scopeOffset. When annotateLines is set,
    a '-- PC: a to PC: b' comment line precedes each line.
    """
    pieces = []
    for entry in self.lines:
        if self.annotateLines:
            pieces.append("-- PC: %d to PC: %d\n" % (entry.startPC, entry.endPC))
        indent = (' ' * self.indexWidth) * (entry.scope + self.scopeOffset)
        pieces.append(indent + entry.src + "\n")
    return "".join(pieces)
# =======================================[[ Helpers ]]=========================================
def __getInstrAtPC(self, pc: int) -> Instruction:
if pc < len(self.chunk.instructions):
return self.chunk.instructions[pc]
raise Exception("Decompilation failed!")
def __getNextInstr(self) -> Instruction:
    """Return the instruction one past the current PC (bounds are checked by __getInstrAtPC)."""
    nextPC = self.pc + 1
    return self.__getInstrAtPC(nextPC)
def __getCurrInstr(self) -> Instruction:
    """Return the instruction at the current PC (bounds are checked by __getInstrAtPC)."""
    currentPC = self.pc
    return self.__getInstrAtPC(currentPC)
def __makeTracIfNotExist(self) -> None:
if not self.pc in self.traceback:
self.traceback[self.pc] = _Traceback()
# call this whenever a register is read
def __addUseTraceback(self, reg: int) -> None:
    """Record, in the traceback entry for the current PC, that register `reg` was read."""
    self.__makeTracIfNotExist()
    record = self.traceback[self.pc]
    record.uses.append(reg)
# call this whenever a register is written to
def __addSetTraceback(self, reg: int) -> None:
    """Record, in the traceback entry for the current PC, that register `reg` was written."""
    self.__makeTracIfNotExist()
    record = self.traceback[self.pc]
    record.sets.append(reg)
def __addExpr(self, code: str) -> None:
self.src += code
def __endStatement(self):
startPC = self.lines[len(self.lines) - 1].endPC + 1 if len(self.lines) > 0 else 0
endPC = self.pc
# make sure we don't write an empty line
if not self.src == "":
self.lines.append(_Line(startPC, endPC, self.src, len(self.scope)))
self.src = ""
def __insertStatement(self, pc: int) -> None:
# insert current statement into lines at pc location
for i in range(len(self.lines)):
if self.lines[i].startPC <= pc and self.lines[i].endPC >= pc:
self.lines.insert(i, _Line(pc, pc, self.src, self.lines[i-1].scope if i > 0 else 0))
self.src = ""
return i
self.src = ""
# walks traceback, if local wasn't set before, the local needs to be defined
def __needsDefined(self, reg) -> bool:
for _, trace in self.traceback.items():
if reg in trace.sets:
return False
# wasn't set in traceback! needs defined!
return True
def __loadLocals(self):
    """Seed self.locals from the chunk's local list, keyed by position in that list.

    Names accepted by isValidLocal() are used as-is; other names get a generated
    identifier, except names containing "(for ", which are left unnamed.
    """
    for reg, local in enumerate(self.chunk.locals):
        name = local.name
        if isValidLocal(name):
            self.locals[reg] = name
            continue

        # if it's a for loop register, ignore
        if "(for " not in name:
            self.__makeLocalIdentifier(reg)
# when you *know* the register *has* to be a local (for loops, etc.)
def __getLocal(self, indx: int) -> str:
return self.locals[indx] if indx in self.locals else self.__makeLocalIdentifier(indx)
def __getReg(self, indx: int) -> str:
    """Return the source text for register `indx` and record the read in the traceback.

    A register bound to a local yields the local's name; otherwise the expression
    last stored in self.top for that register is returned.
    """
    self.__addUseTraceback(indx)

    # if the top indx is a local, get it
    if indx in self.locals:
        return self.locals[indx]
    return self.top[indx]
# Store the expression `code` as the new value of register `indx`.
#
# indx:       register being written
# code:       source text of the value
# forceLocal: treat the register as a local even when aggressiveLocals is off
#
# When the register is already bound to a local, either a 'local x = ...' definition
# (first write, per __needsDefined) or a plain 'x = ...' assignment is emitted.
# NOTE(review): indentation is not visible in this view; the final two statements
# (recording the set in the traceback and caching `code` in self.top) presumably run
# unconditionally at function level — confirm against the real file.
def __setReg(self, indx: int, code: str, forceLocal: bool = False) -> None:
# if the top indx is a local, set it
if indx in self.locals:
if self.__needsDefined(indx):
self.__newLocal(indx, code)
else:
self.__addExpr(self.locals[indx] + " = " + code)
self.__endStatement()
elif self.aggressiveLocals or forceLocal: # 'every register is a local!!'
self.__newLocal(indx, code)
self.__addSetTraceback(indx)
self.top[indx] = code
# ========================================[[ Locals ]]=========================================
def __makeLocalIdentifier(self, indx: int) -> str:
# first, check if we have a local name already determined
if indx in self.locals:
return self.locals[indx]
# otherwise, generate a local
self.locals[indx] = "__unknLocal%d" % self.unknownLocalCount
self.unknownLocalCount += 1
return self.locals[indx]
def __newLocal(self, indx: int, expr: str) -> None:
# TODO: grab identifier from chunk(?)
self.__makeLocalIdentifier(indx)
self.__startStatement()
self.__addExpr("local " + self.locals[indx] + " = " + expr)
self.__endStatement()
def __getNextInstr(self) -> Instruction:
if self.pc + 1 < len(self.chunk.instructions):
return self.chunk.instructions[self.pc + 1]
# ========================================[[ Scopes ]]=========================================
return None
def __getCurrInstr(self) -> Instruction:
return self.chunk.instructions[self.pc]
def __addExpr(self, code: str) -> None:
self.src += code
def __startStatement(self):
self.src += '\n' + (' ' * self.indexWidth * len(self.scope))
def __getReg(self, indx: int) -> str:
# if the top indx is a local, get it
return self.locals[indx] if indx in self.locals else self.top[indx]
def __setReg(self, indx: int, code: str) -> None:
# if the top indx is a local, set it
if indx in self.locals:
self.__startStatement()
self.__addExpr(self.locals[indx] + " = " + code)
elif self.aggressiveLocals: # 'every register is a local!!'
self.__newLocal(indx, code)
self.top[indx] = code
def __startScope(self, scopeType: str, size: int) -> None:
def __startScope(self, scopeType: str, start: int, size: int) -> None:
self.__addExpr(scopeType)
self.scope.append(_Scope(self.pc, self.pc + size))
self.__endStatement()
self.scope.append(_Scope(start, start + size))
# checks if we need to end a scope
def __checkScope(self) -> None:
@@ -95,119 +220,227 @@ class LuaDecomp:
self.__endScope()
def __endScope(self) -> None:
self.scope.pop()
self.__startStatement()
self.__endStatement()
self.__addExpr("end")
self.scope.pop()
self.__endStatement()
# =====================================[[ Instructions ]]======================================
def __emitOperand(self, a: int, b: str, c: str, op: str) -> None:
    """Store the parenthesised binary expression '(b op c)' into register `a`.

    b, c: source text of the two operands; op: operator text, including any padding.
    """
    expr = "".join(("(", b, op, c, ")"))
    self.__setReg(a, expr)
# handles conditional jumps
#
# Emits the header of a conditional construct for the current comparison/test
# instruction and consumes the jump instruction that follows it.
#
# op:   operator text placed between the operands (or before the single operand)
# rkBC: True  -> compare RK(B) with RK(C)
#       False -> only RK(B) is emitted, prefixed by `op`
#
# The construct defaults to 'if ... then'. It becomes 'while ... do' when the jump
# target is itself a JMP that lands at or before the instruction after this one, and
# 'until' (with a 'repeat' inserted at the jump target) when the jump offset is negative.
# NOTE(review): indentation is not visible in this view, so whether 'elif jmp < 0' pairs
# with the opcode check or with the inner loop check cannot be told from here — confirm.
def __condJmp(self, op: str, rkBC: bool = True):
instr = self.__getCurrInstr()
jmpType = "if"
scopeStart = "then"
# we need to check if the jmp location has a jump back (if so, it's a while loop)
# jmp is the offset carried by the following instruction's B operand, plus one
jmp = self.__getNextInstr().B + 1
jmpToInstr = self.__getInstrAtPC(self.pc + jmp)
if jmpToInstr.opcode == Opcodes.JMP:
# if this jump jumps back to this compJmp, it's a loop!
if self.pc + jmp + jmpToInstr.B <= self.pc + 1:
jmpType = "while"
scopeStart = "do"
elif jmp < 0:
# 'repeat until' loop (probably)
jmpType = "until"
scopeStart = None
# a non-zero A operand emits the negated form of the condition
if instr.A > 0:
self.__addExpr("%s not " % jmpType)
else:
self.__addExpr("%s " % jmpType)
# write actual comparison
if rkBC:
self.__addExpr(self.__readRK(instr.B) + op + self.__readRK(instr.C) + " ")
else: # just testing rkB
self.__addExpr(op + self.__readRK(instr.B))
self.pc += 1 # skip next instr
if scopeStart:
# the scope starts at the comparison itself (pc was already advanced past the jump)
self.__startScope("%s " % scopeStart, self.pc - 1, jmp)
# we end the statement *after* scopeStart
self.__endStatement()
else:
# end the statement prior to repeat
self.__endStatement()
# it's a repeat until loop, insert 'repeat' at the jumpTo location
self.__addExpr("repeat")
# NOTE(review): __insertStatement falls through without returning an index when no
# line covers the target pc; insertedLine+1 below would then raise — confirm handling
insertedLine = self.__insertStatement(self.pc + jmp)
# add scope to every line in-between
for i in range(insertedLine+1, len(self.lines)-1):
self.lines[i].scope += 1
# an 'RK' operand can be either a register or a konstant; a bitflag in the value determines which
def __readRK(self, rk: int) -> str:
if (rk & (1 << 8)) > 0:
return self.chunk.constants[(rk & ~(1 << 8))].toCode()
if (whichRK(rk)) > 0:
return self.chunk.getConstant(readRKasK(rk)).toCode()
else:
return self.__getReg(rk)
def parseExpr(self):
# walk & peek ahead of NEWTABLE
#
# Starting with the instruction after the current PC, consumes consecutive LOADK and
# SETLIST instructions and folds them into one table-constructor expression, which is
# then stored into register `indx`.
#
# indx: register that receives the table
#
# LOADK values are cached per register; each SETLIST moves B of them (registers A+1
# onwards) into the constructor text. self.pc is advanced past every consumed instruction.
def __parseNewTable(self, indx: int):
# TODO: parse SETTABLE too?
tblOps = [Opcodes.LOADK, Opcodes.SETLIST]
instr = self.__getNextInstr()
cachedRegs = {}
tbl = "{"
while instr.opcode in tblOps:
if instr.opcode == Opcodes.LOADK: # operate on registers
cachedRegs[instr.A] = self.chunk.getConstant(instr.B).toCode()
elif instr.opcode == Opcodes.SETLIST:
numElems = instr.B
for i in range(numElems):
# NOTE(review): raises KeyError if a listed register was not filled by a LOADK — confirm
tbl += "%s, " % cachedRegs[instr.A + i + 1]
del cachedRegs[instr.A + i + 1]
self.pc += 1
instr = self.__getNextInstr()
tbl += "}"
# i use forceLocal here even though i don't know *for sure* that the register is a local.
# this does help later though if the table is reused (which is 99% of the time). the other 1%
# only affects syntax and may look a little weird but is fine and equivalent nonetheless
self.__setReg(indx, tbl, forceLocal=True)
self.__endStatement()
# if we have leftovers... oops, set those
for i, v in cachedRegs.items():
self.__setReg(i, v)
def parseInstr(self):
instr = self.__getCurrInstr()
# python, add switch statements *please*
if instr.opcode == Opcodes.MOVE: # move is a fake ABC instr, C is ignored
# move registers
self.__setReg(instr.A, self.__getReg(instr.B))
elif instr.opcode == Opcodes.LOADK:
self.__setReg(instr.A, self.chunk.constants[instr.B].toCode())
elif instr.opcode == Opcodes.LOADBOOL:
if instr.B == 0:
self.__setReg(instr.A, "false")
else:
self.__setReg(instr.A, "true")
elif instr.opcode == Opcodes.GETGLOBAL:
self.__setReg(instr.A, self.chunk.constants[instr.B].data)
elif instr.opcode == Opcodes.GETTABLE:
self.__setReg(instr.A, self.__getReg(instr.B) + "[" + self.__readRK(instr.C) + "]")
elif instr.opcode == Opcodes.SETGLOBAL:
self.__startStatement()
self.__addExpr(self.chunk.constants[instr.B].data + " = " + self.__getReg(instr.A))
elif instr.opcode == Opcodes.SETTABLE:
self.__startStatement()
self.__addExpr(self.__getReg(instr.A) + "[" + self.__readRK(instr.B) + "] = " + self.__readRK(instr.C))
elif instr.opcode == Opcodes.ADD:
self.__emitOperand(instr.A, self.__readRK(instr.B), self.__readRK(instr.C), " + ")
elif instr.opcode == Opcodes.SUB:
self.__emitOperand(instr.A, self.__readRK(instr.B), self.__readRK(instr.C), " - ")
elif instr.opcode == Opcodes.MUL:
self.__emitOperand(instr.A, self.__readRK(instr.B), self.__readRK(instr.C), " * ")
elif instr.opcode == Opcodes.DIV:
self.__emitOperand(instr.A, self.__readRK(instr.B), self.__readRK(instr.C), " / ")
elif instr.opcode == Opcodes.MOD:
self.__emitOperand(instr.A, self.__readRK(instr.B), self.__readRK(instr.C), " % ")
elif instr.opcode == Opcodes.POW:
self.__emitOperand(instr.A, self.__readRK(instr.B), self.__readRK(instr.C), " ^ ")
elif instr.opcode == Opcodes.UNM:
self.__setReg(instr.A, "-" + self.__getReg(instr.B))
elif instr.opcode == Opcodes.NOT:
self.__setReg(instr.A, "!" + self.__getReg(instr.B))
elif instr.opcode == Opcodes.LEN:
self.__setReg(instr.A, "#" + self.__getCurrInstr(instr.B))
elif instr.opcode == Opcodes.CONCAT:
count = instr.C-instr.B+1
concatStr = ""
match instr.opcode:
case Opcodes.MOVE: # move is a fake ABC instr, C is ignored
# move registers
self.__setReg(instr.A, self.__getReg(instr.B))
case Opcodes.LOADK:
self.__setReg(instr.A, self.chunk.getConstant(instr.B).toCode())
case Opcodes.LOADBOOL:
if instr.B == 0:
self.__setReg(instr.A, "false")
else:
self.__setReg(instr.A, "true")
case Opcodes.GETGLOBAL:
self.__setReg(instr.A, self.chunk.getConstant(instr.B).data)
case Opcodes.GETTABLE:
self.__setReg(instr.A, self.__getReg(instr.B) + "[" + self.__readRK(instr.C) + "]")
case Opcodes.SETGLOBAL:
self.__addExpr(self.chunk.getConstant(instr.B).data + " = " + self.__getReg(instr.A))
self.__endStatement()
case Opcodes.SETTABLE:
self.__addExpr(self.__getReg(instr.A) + "[" + self.__readRK(instr.B) + "] = " + self.__readRK(instr.C))
self.__endStatement()
case Opcodes.NEWTABLE:
self.__parseNewTable(instr.A)
case Opcodes.ADD:
self.__emitOperand(instr.A, self.__readRK(instr.B), self.__readRK(instr.C), " + ")
case Opcodes.SUB:
self.__emitOperand(instr.A, self.__readRK(instr.B), self.__readRK(instr.C), " - ")
case Opcodes.MUL:
self.__emitOperand(instr.A, self.__readRK(instr.B), self.__readRK(instr.C), " * ")
case Opcodes.DIV:
self.__emitOperand(instr.A, self.__readRK(instr.B), self.__readRK(instr.C), " / ")
case Opcodes.MOD:
self.__emitOperand(instr.A, self.__readRK(instr.B), self.__readRK(instr.C), " % ")
case Opcodes.POW:
self.__emitOperand(instr.A, self.__readRK(instr.B), self.__readRK(instr.C), " ^ ")
case Opcodes.UNM:
self.__setReg(instr.A, "-" + self.__getReg(instr.B))
case Opcodes.NOT:
self.__setReg(instr.A, "not " + self.__getReg(instr.B))
case Opcodes.LEN:
self.__setReg(instr.A, "#" + self.__getReg(instr.B))
case Opcodes.CONCAT:
count = instr.C-instr.B+1
concatStr = ""
# concat all items on stack from RC to RB
for i in range(count):
concatStr += self.__getReg(instr.B + i) + (" .. " if not i == count - 1 else "")
# concat all items on stack from RC to RB
for i in range(count):
concatStr += self.__getReg(instr.B + i) + (" .. " if not i == count - 1 else "")
self.__setReg(instr.A, concatStr)
elif instr.opcode == Opcodes.JMP:
pass
elif instr.opcode == Opcodes.EQ:
self.__startStatement()
if instr.A > 0:
self.__addExpr("if not ")
else:
self.__addExpr("if ")
self.__addExpr(self.__readRK(instr.B) + " == " + self.__readRK(instr.C) + " ")
self.__startScope("then ", self.__getNextInstr().B + 1)
self.__setReg(instr.A, concatStr)
case Opcodes.JMP:
pass
case Opcodes.EQ:
self.__condJmp(" == ")
case Opcodes.LT:
self.__condJmp(" < ")
case Opcodes.LE:
self.__condJmp(" <= ")
case Opcodes.TEST:
if instr.C == 0:
self.__condJmp("", False)
else:
self.__condJmp("not ", False)
case Opcodes.CALL:
preStr = ""
callStr = ""
ident = ""
self.pc += 1 # skip next instr
elif instr.opcode == Opcodes.LT:
self.__emitOperand(instr.A, self.__readRK(instr.B), self.__readRK(instr.C), " < ")
elif instr.opcode == Opcodes.LE:
self.__emitOperand(instr.A, instr.B, instr.C, " <= ")
elif instr.opcode == Opcodes.CALL:
preStr = ""
callStr = ""
ident = ""
# parse arguments
callStr += self.__getReg(instr.A) + "("
for i in range(instr.A + 1, instr.A + instr.B):
callStr += self.__getReg(i) + (", " if not i + 1 == instr.A + instr.B else "")
callStr += ")"
# parse arguments
callStr += self.__getReg(instr.A) + "("
for i in range(instr.A + 1, instr.A + instr.B):
callStr += self.__getReg(i) + (", " if not i + 1 == instr.A + instr.B else "")
callStr += ")"
# parse return values
if instr.C > 1:
preStr = "local "
for indx in range(instr.A, instr.A + instr.C - 1):
if indx in self.locals:
ident = self.locals[indx]
else:
ident = self.__makeLocalIdentifier(indx)
preStr += ident
# parse return values
if instr.C > 1:
preStr = "local "
for indx in range(instr.A, instr.A + instr.C - 1):
if indx in self.locals:
ident = self.locals[indx]
else:
ident = self.__makeLocalIdentifier(indx)
preStr += ident
# normally setReg() does this
self.top[indx] = ident
# normally setReg() does this
self.top[indx] = ident
# just so we don't have a trailing ', '
preStr += ", " if not indx == instr.A + instr.C - 2 else ""
preStr += " = "
# just so we don't have a trailing ', '
preStr += ", " if not indx == instr.A + instr.C - 2 else ""
preStr += " = "
self.__addExpr(preStr + callStr)
self.__endStatement()
case Opcodes.RETURN:
self.__endStatement()
pass # no-op for now
case Opcodes.FORLOOP:
pass # no-op for now
case Opcodes.FORPREP:
self.__addExpr("for %s = %s, %s, %s " % (self.__getLocal(instr.A+3), self.__getReg(instr.A), self.__getReg(instr.A + 1), self.__getReg(instr.A + 2)))
self.__startScope("do", self.pc, instr.B)
case Opcodes.SETLIST:
# LFIELDS_PER_FLUSH (50) is the number of elements that *should* have been set in the list in the *last* SETLIST
# eg.
# [ 49] LOADK : R[49] K[1] ; load 0.0 into R[49]
# [ 50] LOADK : R[50] K[1] ; load 0.0 into R[50]
# [ 51] SETLIST : 0 50 1 ; sets list[1..50]
# [ 52] LOADK : R[1] K[1] ; load 0.0 into R[1]
# [ 53] SETLIST : 0 1 2 ; sets list[51..51]
numElems = instr.B
startAt = ((instr.C - 1) * 50)
ident = self.__getLocal(instr.A)
self.__startStatement()
self.__addExpr(preStr + callStr)
elif instr.opcode == Opcodes.RETURN:
self.__startStatement()
pass # no-op for now
else:
raise Exception("unsupported instruction: %s" % instr.toString())
# set each index (TODO: make tables less verbose)
for i in range(numElems):
self.__addExpr("%s[%d] = %s" % (ident, (startAt + i + 1), self.__getReg(instr.A + i + 1)))
self.__endStatement()
case Opcodes.CLOSURE:
proto = LuaDecomp(self.chunk.protos[instr.B], headChunk=False, scopeOffset=len(self.scope))
self.__setReg(instr.A, proto.getPseudoCode())
case _:
raise Exception("unsupported instruction: %s" % instr.toString())

View File

@@ -1,7 +1,7 @@
'''
l(un)dump.py
A Lua5.1 cross-platform bytecode deserializer. This module pulls int and size_t sizes from the
A Lua5.1 cross-platform bytecode deserializer && serializer. This module pulls int and size_t sizes from the
chunk header, meaning it should be able to deserialize lua bytecode dumps from most platforms,
regardless of the host machine.
@@ -9,11 +9,9 @@
as well as read the lundump.c source file from the Lua5.1 source.
'''
from multiprocessing.spawn import get_executable
import struct
import array
from enum import IntEnum, Enum, auto
from typing_extensions import Self
class InstructionType(Enum):
ABC = auto(),
@@ -68,7 +66,17 @@ class ConstType(IntEnum):
_RKBCInstr = [Opcodes.SETTABLE, Opcodes.ADD, Opcodes.SUB, Opcodes.MUL, Opcodes.DIV, Opcodes.MOD, Opcodes.POW, Opcodes.EQ, Opcodes.LT]
_RKCInstr = [Opcodes.GETTABLE, Opcodes.SELF]
_KBx = [Opcodes.LOADK, Opcodes.GETGLOBAL]
_KBx = [Opcodes.LOADK, Opcodes.GETGLOBAL, Opcodes.SETGLOBAL]
_LUAMAGIC = b'\x1bLua'
# is an 'RK' value a K? (result is true for K, false for R)
def whichRK(rk: int):
    """Return True when bit 8 of `rk` is set (the operand is a K), False otherwise (an R)."""
    kBit = 1 << 8
    return (rk & kBit) != 0
# read an RK as a K
def readRKasK(rk: int):
    """Return `rk` with bit 8 cleared; every other bit is left untouched."""
    kBit = 1 << 8
    return rk & ~kBit
class Instruction:
def __init__(self, type: InstructionType, name: str) -> None:
@@ -80,9 +88,9 @@ class Instruction:
self.C: int = None
# an 'RK' operand can be either a register or a konstant; a bitflag in the value determines which
def __readRK(self, rk: int) -> str:
if (rk & (1 << 8)) > 0:
return "K[" + str((rk & ~(1 << 8))) + "]"
def __formatRK(self, rk: int) -> str:
if whichRK(rk):
return "K[" + str(readRKasK(rk)) + "]"
else:
return "R[" + str(rk) + "]"
@@ -92,36 +100,59 @@ class Instruction:
if self.type == InstructionType.ABC:
# by default, treat them as registers
A = "R[%d]" % self.A
B = "R[%d]" % self.B
C = "R[%d]" % self.C
A = "%d" % self.A
B = "%d" % self.B
C = "%d" % self.C
# these opcodes have RKs for B & C
if self.opcode in _RKBCInstr:
B = self.__readRK(self.B)
C = self.__readRK(self.C)
A = "R[%d]" % self.A
B = self.__formatRK(self.B)
C = self.__formatRK(self.C)
elif self.opcode in _RKCInstr: # just for C
C = self.__readRK(self.C)
A = "R[%d]" % self.A
C = self.__formatRK(self.C)
regs = "%s %s %s" % (A, B, C)
regs = "%6s %6s %6s" % (A, B, C)
elif self.type == InstructionType.ABx or self.type == InstructionType.AsBx:
A = "R[%d]" % self.A
B = "R[%d]" % self.B
B = "%d" % self.B
if self.opcode in _KBx:
B = "K[%d]" % self.B
regs = "%s %s" % (A, B)
regs = "%6s %6s" % (A, B)
return "%s : %s" % (instr, regs)
def getAnnotation(self, chunk):
    """Return a short human-readable description of this instruction, or "" if none exists.

    chunk: the chunk owning this instruction; its getConstant() is used to resolve
           the constant referenced by LOADK and GETGLOBAL.
    """
    op = self.opcode

    if op == Opcodes.MOVE:
        return "move R[%d] into R[%d]" % (self.B, self.A)
    if op == Opcodes.LOADK:
        return "load %s into R[%d]" % (chunk.getConstant(self.B).toCode(), self.A)
    if op == Opcodes.GETGLOBAL:
        return 'move _G[%s] into R[%d]' % (chunk.getConstant(self.B).toCode(), self.A)

    # arithmetic ops share one shape: (C operand, B operand, destination register)
    arithTemplates = (
        (Opcodes.ADD, 'add %s to %s, place into R[%d]'),
        (Opcodes.SUB, 'sub %s from %s, place into R[%d]'),
        (Opcodes.MUL, 'mul %s to %s, place into R[%d]'),
        (Opcodes.DIV, 'div %s from %s, place into R[%d]'),
    )
    for opcode, template in arithTemplates:
        if op == opcode:
            return template % (self.__formatRK(self.C), self.__formatRK(self.B), self.A)

    if op == Opcodes.CONCAT:
        count = self.C - self.B + 1
        return "concat %d values from R[%d] to R[%d], store into R[%d]" % (count, self.B, self.C, self.A)

    return ""
class Constant:
def __init__(self, type: ConstType, data) -> None:
self.type = type
self.data = data
def toString(self):
return "[" + self.type.name + "] " + str(self.data)
return "[%s] %s" % (self.type.name, str(self.data))
# format the constant so that it is parsable by lua
def toCode(self):
@@ -133,7 +164,7 @@ class Constant:
else:
return "false"
elif self.type == ConstType.NUMBER:
return str(self.data)
return "%g" % self.data
else:
return "nil"
@@ -158,6 +189,7 @@ class Chunk:
self.maxStack: int = 0
self.upvalues: list[str] = []
self.lineNums: list[int] = []
self.locals: list[Local] = []
def appendInstruction(self, instr: Instruction):
@@ -169,9 +201,15 @@ class Chunk:
def appendProto(self, proto):
    """Add `proto` to the end of this chunk's protos list."""
    protos = self.protos
    protos.append(proto)
def appendLine(self, line: int):
    """Add `line` to the end of this chunk's lineNums list."""
    lineNums = self.lineNums
    lineNums.append(line)
def appendLocal(self, local: Local):
    """Add `local` to the end of this chunk's locals list."""
    knownLocals = self.locals
    knownLocals.append(local)
def appendUpval(self, upval: str):
    """Add the upvalue name `upval` to the end of this chunk's upvalues list."""
    upvalues = self.upvalues
    upvalues.append(upval)
def findLocal(self, pc: int) -> Local:
for l in self.locals:
if l.start <= pc and l.end >= pc:
@@ -180,15 +218,21 @@ class Chunk:
# there's no local information (may have been stripped)
return None
def getConstant(self, indx: int) -> Constant:
    """Return the constant stored at position `indx` of this chunk's constants list."""
    constants = self.constants
    return constants[indx]
def print(self):
print("\n==== [[" + str(self.name) + "'s constants]] ====\n")
for z in range(len(self.constants)):
i = self.constants[z]
print(str(z) + ": " + i.toString())
for i in range(len(self.constants)):
print("%d: %s" % (i, self.constants[i].toString()))
print("\n==== [[" + str(self.name) + "'s locals]] ====\n")
for i in range(len(self.locals)):
print("R[%d]: %s" % (i, self.locals[i].name))
print("\n==== [[" + str(self.name) + "'s dissassembly]] ====\n")
for i in range(len(self.instructions)):
print("[%3d] %s" % (i, self.instructions[i].toString()))
print("[%3d] %-40s ; %s" % (i, self.instructions[i].toString(), self.instructions[i].getAnnotation(self)))
if len(self.protos) > 0:
print("\n==== [[" + str(self.name) + "'s protos]] ====\n")
@@ -261,11 +305,7 @@ class LuaUndump:
self.rootChunk: Chunk = None
self.index = 0
@staticmethod
def dis_chunk(chunk: Chunk):
chunk.print()
def loadBlock(self, sz) -> bytearray:
def _loadBlock(self, sz) -> bytearray:
if self.index + sz > len(self.bytecode):
raise Exception("Malformed bytecode!")
@@ -273,82 +313,71 @@ class LuaUndump:
self.index = self.index + sz
return temp
def get_byte(self) -> int:
return self.loadBlock(1)[0]
def _get_byte(self) -> int:
return self._loadBlock(1)[0]
def get_int32(self) -> int:
if (self.big_endian):
return int.from_bytes(self.loadBlock(4), byteorder='big', signed=False)
else:
return int.from_bytes(self.loadBlock(4), byteorder='little', signed=False)
def _get_uint32(self) -> int:
order = 'big' if self.big_endian else 'little'
return int.from_bytes(self._loadBlock(4), byteorder=order, signed=False)
def get_int(self) -> int:
if (self.big_endian):
return int.from_bytes(self.loadBlock(self.int_size), byteorder='big', signed=False)
else:
return int.from_bytes(self.loadBlock(self.int_size), byteorder='little', signed=False)
def _get_uint(self) -> int:
order = 'big' if self.big_endian else 'little'
return int.from_bytes(self._loadBlock(self.int_size), byteorder=order, signed=False)
def get_size_t(self) -> int:
if (self.big_endian):
return int.from_bytes(self.loadBlock(self.size_t), byteorder='big', signed=False)
else:
return int.from_bytes(self.loadBlock(self.size_t), byteorder='little', signed=False)
def _get_size_t(self) -> int:
order = 'big' if self.big_endian else 'little'
return int.from_bytes(self._loadBlock(self.size_t), byteorder=order, signed=False)
def get_double(self) -> int:
if self.big_endian:
return struct.unpack('>d', self.loadBlock(8))[0]
else:
return struct.unpack('<d', self.loadBlock(8))[0]
def _get_double(self) -> int:
order = '>d' if self.big_endian else '<d'
return struct.unpack(order, self._loadBlock(self.l_number_size))[0]
def get_string(self, size) -> str:
if (size == None):
size = self.get_size_t()
if (size == 0):
return ""
def _get_string(self) -> str:
size = self._get_size_t()
if (size == 0):
return ""
return "".join(chr(x) for x in self.loadBlock(size))
# [:-1] to remove the NULL terminator
return ("".join(chr(x) for x in self._loadBlock(size)))[:-1]
# Parse one function prototype from the stream into a Chunk and return it.
# Nested prototypes are parsed by calling decode_chunk recursively.
# NOTE(review): this span interleaves lines that call the un-prefixed get_* helpers
# with lines that call the _get_* helpers; each pair reads the same field.
# `num` is reused as the element count of every section that follows.
def decode_chunk(self) -> Chunk:
chunk = Chunk()
# header fields, read through the get_* helpers
chunk.name = self.get_string(None)
chunk.frst_line = self.get_int()
chunk.last_line = self.get_int()
chunk.numUpvals = self.get_byte()
chunk.numParams = self.get_byte()
chunk.isVarg = (self.get_byte() != 0)
chunk.maxStack = self.get_byte()
# get_* variant only: a non-empty name loses its first and last character
if (not chunk.name == ""):
chunk.name = chunk.name[1:-1]
# chunk meta info
chunk.name = self._get_string()
chunk.frst_line = self._get_uint()
chunk.last_line = self._get_uint()
chunk.numUpvals = self._get_byte()
chunk.numParams = self._get_byte()
# any non-zero flag byte is treated as "is vararg"
chunk.isVarg = (self._get_byte() != 0)
chunk.maxStack = self._get_byte()
# parse instructions
num = self.get_int()
num = self._get_uint()
for i in range(num):
# each instruction is read as a 32-bit word and decoded by _decode_instr
chunk.appendInstruction(_decode_instr(self.get_int32()))
chunk.appendInstruction(_decode_instr(self._get_uint32()))
# get constants
num = self.get_int()
num = self._get_uint()
for i in range(num):
constant: Constant = None
# one tag byte selects the constant's type (note: shadows the builtin `type`)
type = self.get_byte()
type = self._get_byte()
if type == 0: #nil
if type == 0: # nil
constant = Constant(ConstType.NIL, None)
elif type == 1: # bool
constant = Constant(ConstType.BOOL, (self.get_byte() != 0))
constant = Constant(ConstType.BOOL, (self._get_byte() != 0))
elif type == 3: # number
constant = Constant(ConstType.NUMBER, self.get_double())
constant = Constant(ConstType.NUMBER, self._get_double())
elif type == 4: # string
# the get_string variant strips the last character here; _get_string strips it internally
constant = Constant(ConstType.STRING, self.get_string(None)[:-1])
constant = Constant(ConstType.STRING, self._get_string())
else:
# any other tag value is rejected
raise Exception("Unknown Datatype! [%d]" % type)
chunk.appendConstant(constant)
# parse protos
num = self.get_int()
num = self._get_uint()
for i in range(num):
chunk.appendProto(self.decode_chunk())
@@ -356,47 +385,47 @@ class LuaUndump:
# eh, for now just consume the bytes.
# line numbers
num = self.get_int()
num = self._get_uint()
for i in range(num):
# the values read here are discarded
self.get_int()
self._get_uint()
# locals
num = self.get_int()
num = self._get_uint()
for i in range(num):
name = self.get_string(None) # local name
start = self.get_int() # local start PC
end = self.get_int() # local end PC
name = self._get_string() # local name
start = self._get_uint() # local start PC
end = self._get_uint() # local end PC
chunk.appendLocal(Local(name, start, end))
# upvalues
num = self.get_int()
num = self._get_uint()
for i in range(num):
# the get_string line discards the name; the _get_string line stores it on the chunk
self.get_string(None) # upvalue name
chunk.appendUpval(self._get_string()) # upvalue name
return chunk
# Validate the 4-byte signature of a raw bytecode buffer, then decode it.
# Raises Exception("Lua Bytecode expected!") when the signature does not match.
# Returns whatever decode_bytecode returns.
def decode_rawbytecode(self, rawbytecode):
# bytecode sanity checks
# the two checks below are the literal-signature and _LUAMAGIC-constant forms of the same test
if not rawbytecode[0:4] == b'\x1bLua':
if not rawbytecode[0:4] == _LUAMAGIC:
raise Exception("Lua Bytecode expected!")
# NOTE(review): typecode 'b' is a signed char in the array module, so bytes >= 0x80
# would surface as negative ints in the byte readers and break chr() — confirm whether 'B' was intended.
bytecode = array.array('b', rawbytecode)
return self.decode_bytecode(bytecode)
# Store the bytecode buffer, read the header fields that follow the 4-byte
# signature, then parse and return the root chunk (also kept in self.rootChunk).
# NOTE(review): the header reads appear twice, once through get_byte and once
# through _get_byte; both groups read the same fields in the same order.
def decode_bytecode(self, bytecode):
self.bytecode = bytecode
self.bytecode = bytecode
# aligns index, skips header
# (index 4 is the first byte after the 4-byte signature checked in decode_rawbytecode)
self.index = 4
self.vm_version = self.get_byte()
self.bytecode_format = self.get_byte()
# an endianness byte of 0 means big-endian
self.big_endian = (self.get_byte() == 0)
self.int_size = self.get_byte()
self.size_t = self.get_byte()
self.instr_size = self.get_byte() # gets size of instructions
self.l_number_size = self.get_byte() # size of lua_Number
self.integral_flag = self.get_byte()
self.vm_version = self._get_byte()
self.bytecode_format = self._get_byte()
# an endianness byte of 0 means big-endian
self.big_endian = (self._get_byte() == 0)
self.int_size = self._get_byte()
self.size_t = self._get_byte()
self.instr_size = self._get_byte() # gets size of instructions
self.l_number_size = self._get_byte() # size of lua_Number
self.integral_flag = self._get_byte() # is lua_Number defined as an int? false = float/double, true = int/long/short/etc.
# the root prototype follows the header directly
self.rootChunk = self.decode_chunk()
return self.rootChunk
@@ -407,5 +436,122 @@ class LuaUndump:
return self.decode_rawbytecode(bytecode)
# Print the disassembly of the root chunk held by this instance.
# NOTE(review): two alternative bodies are present below.
def print_dissassembly(self):
# variant 1: delegate to the class-level dis_chunk helper
LuaUndump.dis_chunk(self.rootChunk)
# variant 2: ask the chunk to print itself
self.rootChunk.print()
class LuaDump:
    """Serialize a Chunk tree back into a Lua bytecode dump.

    The layout written by dump() is described by the attributes set in
    __init__: VM version 0x51, format 0, little endian, 4-byte int,
    8-byte size_t, 4-byte instructions and an 8-byte floating-point
    lua_Number. Callers may change those attributes before calling dump().
    """

    def __init__(self, rootChunk: Chunk):
        """Remember the chunk to serialize and set the default header values."""
        self.rootChunk = rootChunk
        self.bytecode = bytearray()

        # header info
        self.vm_version = 0x51
        self.bytecode_format = 0x00
        self.big_endian = False

        # data sizes
        self.int_size = 4
        self.size_t = 8
        self.instr_size = 4
        self.l_number_size = 8
        self.integral_flag = False # lua_Number is a double

    def _writeBlock(self, data: bytes):
        """Append raw bytes to the output buffer."""
        self.bytecode += bytearray(data)

    def _set_byte(self, b: int):
        """Append a single byte (0-255) to the output buffer."""
        self.bytecode.append(b)

    def _set_uint32(self, i: int):
        """Append a 4-byte unsigned integer in the configured byte order."""
        order = 'big' if self.big_endian else 'little'
        self._writeBlock(i.to_bytes(4, order, signed=False))

    def _set_uint(self, i: int):
        """Append an unsigned integer of self.int_size bytes."""
        order = 'big' if self.big_endian else 'little'
        self._writeBlock(i.to_bytes(self.int_size, order, signed=False))

    def _set_size_t(self, i: int):
        """Append an unsigned integer of self.size_t bytes."""
        order = 'big' if self.big_endian else 'little'
        self._writeBlock(i.to_bytes(self.size_t, order, signed=False))

    def _set_double(self, f: float):
        """Append an 8-byte IEEE-754 double in the configured byte order."""
        order = '>d' if self.big_endian else '<d'
        self._writeBlock(struct.pack(order, f))

    def _set_string(self, string: str):
        """Append a length-prefixed, NUL-terminated string.

        The length prefix counts the encoded bytes plus the terminator.
        The reader in this file builds strings with one chr() per byte, so
        latin-1 is the exact inverse and keeps a load/dump round trip
        byte-identical. Text containing code points above 0xFF cannot be
        written that way and falls back to UTF-8.
        """
        try:
            data = string.encode('latin-1')
        except UnicodeEncodeError:
            data = string.encode('utf-8')

        # size must describe the bytes actually written, not the character count
        self._set_size_t(len(data) + 1)
        self._writeBlock(data)
        self._set_byte(0x00) # write null terminator

    def _dumpChunk(self, chunk: Chunk):
        """Append one function prototype, recursing into its child protos.

        Raises Exception when a constant has a type other than NIL, BOOL,
        NUMBER or STRING.
        """
        # write meta info
        self._set_string(chunk.name)
        self._set_uint(chunk.frst_line)
        self._set_uint(chunk.last_line)
        self._set_byte(chunk.numUpvals)
        self._set_byte(chunk.numParams)
        # the reader treats any non-zero byte as vararg, so non-vararg must be 0
        # NOTE(review): Lua 5.1 stores a bitmask here; only a boolean survives
        # loading, so the original flag bits cannot be reproduced — confirm 1 is acceptable.
        self._set_byte(1 if chunk.isVarg else 0)
        self._set_byte(chunk.maxStack)

        # write instructions
        self._set_uint(len(chunk.instructions))
        for instr in chunk.instructions:
            self._set_uint32(_encode_instr(instr))

        # write constants
        self._set_uint(len(chunk.constants))
        for constant in chunk.constants:
            # write constant data: one tag byte followed by the payload
            if constant.type == ConstType.NIL:
                self._set_byte(0)
            elif constant.type == ConstType.BOOL:
                self._set_byte(1)
                self._set_byte(1 if constant.data else 0)
            elif constant.type == ConstType.NUMBER: # number
                self._set_byte(3)
                self._set_double(constant.data)
            elif constant.type == ConstType.STRING: # string
                self._set_byte(4)
                self._set_string(constant.data)
            else:
                raise Exception("Unknown Datatype! [%s]" % str(constant.type))

        # write child protos
        self._set_uint(len(chunk.protos))
        for proto in chunk.protos:
            self._dumpChunk(proto)

        # write line numbers
        self._set_uint(len(chunk.lineNums))
        for line in chunk.lineNums:
            self._set_uint(line)

        # write locals
        self._set_uint(len(chunk.locals))
        for local in chunk.locals:
            self._set_string(local.name)
            self._set_uint(local.start)
            self._set_uint(local.end)

        # write upvals
        self._set_uint(len(chunk.upvalues))
        for upval in chunk.upvalues:
            self._set_string(upval)

    def _dumpHeader(self):
        """Append the signature followed by the eight header bytes."""
        self._writeBlock(_LUAMAGIC)

        # write header info
        self._set_byte(self.vm_version)
        self._set_byte(self.bytecode_format)
        # endianness byte: 0 = big endian, 1 = little endian
        self._set_byte(0 if self.big_endian else 1)
        self._set_byte(self.int_size)
        self._set_byte(self.size_t)
        self._set_byte(self.instr_size)
        self._set_byte(self.l_number_size)
        self._set_byte(self.integral_flag)

    def dump(self) -> bytearray:
        """Serialize the root chunk and return the complete bytecode buffer.

        The buffer is rebuilt from scratch on every call, so calling dump()
        again does not append a second copy to the earlier output.
        """
        self.bytecode = bytearray()
        self._dumpHeader()
        self._dumpChunk(self.rootChunk)
        return self.bytecode

7
main.py Normal file → Executable file
View File

@@ -1,3 +1,4 @@
#!/usr/bin/env python3
import sys
import lundump
import lparser
@@ -7,4 +8,8 @@ print(sys.argv[1])
# Load the bytecode file named by the first command-line argument and print its disassembly.
# NOTE(review): `lc` is created above this hunk — presumably a lundump.LuaUndump instance; confirm.
chunk = lc.loadFile(sys.argv[1])
lc.print_dissassembly()
# Build a decompiler for the loaded chunk (the line appears twice with identical text).
lp = lparser.LuaDecomp(chunk)
lp = lparser.LuaDecomp(chunk)
# Print a banner naming the chunk, then the generated pseudo-code.
print("\n==== [[" + str(chunk.name) + "'s pseudo-code]] ====\n")
print(lp.getPseudoCode())