Initial commit

- Extremely basic decompiler implemented in lparser.py
- lundump.py ported from [this repository](https://github.com/CPunch/LuaPytecode)
This commit is contained in:
CPunch 2022-08-11 15:38:31 -05:00
commit 0dbdfc49c6
4 changed files with 624 additions and 0 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
example.*
__pycache__

213
lparser.py Normal file
View File

@ -0,0 +1,213 @@
'''
lparser.py
Depends on lundump.py for lua dump deserialization.
An experimental bytecode decompiler.
'''
from operator import concat
from subprocess import call
from lundump import Chunk, LuaUndump, Constant, Instruction, InstructionType, Opcodes
class _Scope:
    '''
    Span of instruction indices covered by one open block of generated source.

    startPC is the pc at which the block was opened, endPC the pc after which
    it should be closed.
    '''

    def __init__(self, startPC: int, endPC: int):
        self.startPC, self.endPC = startPC, endPC
class LuaDecomp:
    '''
    Experimental decompiler that turns one chunk's instructions into Lua-like source.

    All of the work happens in the constructor: every instruction of `chunk` is
    parsed in order, the generated text is accumulated in `self.src`, and the
    result is printed to stdout.
    '''

    def __init__(self, chunk: Chunk):
        self.chunk = chunk
        self.pc = 0 # index of the instruction currently being parsed
        self.scope = [] # stack of open _Scope blocks, innermost last
        self.top = {} # register index -> expression text last written to it
        self.locals = {} # register index -> identifier of the local bound to it
        self.unknownLocalCount = 0 # counter used to name generated locals
        self.src: str = ""

        # configurations!
        self.aggressiveLocals = False # should *EVERY* accessed register be considered a local?
        self.indexWidth = 4 # how many spaces for indentions?

        # parse instructions
        while self.pc < len(self.chunk.instructions):
            self.parseExpr()
            self.pc += 1

            # end the scope (if we're supposed too)
            self.__checkScope()

        print("\n==== [[" + str(self.chunk.name) + "'s decompiled source]] ====\n")
        print(self.src)

    def __makeLocalIdentifier(self, indx: int) -> str:
        '''Bind a freshly generated `__unknLocalN` name to register `indx` and return it.'''
        self.locals[indx] = "__unknLocal%d" % self.unknownLocalCount
        self.unknownLocalCount += 1
        return self.locals[indx]

    def __newLocal(self, indx: int, expr: str) -> None:
        '''Emit a `local <name> = <expr>` statement for register `indx`.'''
        # TODO: grab identifier from chunk(?)
        self.__makeLocalIdentifier(indx)

        self.__startStatement()
        self.__addExpr("local " + self.locals[indx] + " = " + expr)

    def __getNextInstr(self) -> Instruction:
        '''Return the instruction after the current one, or None at the end of the chunk.'''
        if self.pc + 1 < len(self.chunk.instructions):
            return self.chunk.instructions[self.pc + 1]

        return None

    def __getCurrInstr(self) -> Instruction:
        '''Return the instruction at the current pc.'''
        return self.chunk.instructions[self.pc]

    def __addExpr(self, code: str) -> None:
        '''Append raw text to the generated source.'''
        self.src += code

    def __startStatement(self):
        '''Start a new output line, indented by the number of open scopes.'''
        self.src += '\n' + (' ' * self.indexWidth * len(self.scope))

    def __getReg(self, indx: int) -> str:
        '''Return the local's name if register `indx` is a local, else its stored expression.'''
        # if the top indx is a local, get it
        return self.locals[indx] if indx in self.locals else self.top[indx]

    def __setReg(self, indx: int, code: str) -> None:
        '''Record `code` as the content of register `indx`, emitting an assignment for locals.'''
        # if the top indx is a local, set it
        if indx in self.locals:
            self.__startStatement()
            self.__addExpr(self.locals[indx] + " = " + code)
        elif self.aggressiveLocals: # 'every register is a local!!'
            self.__newLocal(indx, code)

        self.top[indx] = code

    def __startScope(self, scopeType: str, size: int) -> None:
        '''Emit the scope opener text and push a scope spanning `size` instructions from pc.'''
        self.__addExpr(scopeType)
        self.scope.append(_Scope(self.pc, self.pc + size))

    # checks if we need to end a scope
    def __checkScope(self) -> None:
        if len(self.scope) == 0:
            return

        if self.pc > self.scope[len(self.scope) - 1].endPC:
            self.__endScope()

    def __endScope(self) -> None:
        '''Pop the innermost scope and emit its closing `end`.'''
        self.scope.pop()
        self.__startStatement()
        self.__addExpr("end")

    def __emitOperand(self, a: int, b: str, c: str, op: str) -> None:
        '''Store the parenthesized binary expression `(b op c)` into register `a`.'''
        self.__setReg(a, "(" + b + op + c + ")")

    # 'RK's are special in because can be a register or a konstant. a bitflag is read to determine which
    def __readRK(self, rk: int) -> str:
        if (rk & (1 << 8)) > 0:
            return self.chunk.constants[(rk & ~(1 << 8))].toCode()
        else:
            return self.__getReg(rk)

    def parseExpr(self):
        '''
        Translate the instruction at the current pc into source text / register state.

        Raises Exception for opcodes that are not handled yet.
        '''
        instr = self.__getCurrInstr()

        # python, add switch statements *please*
        if instr.opcode == Opcodes.MOVE: # move is a fake ABC instr, C is ignored
            # move registers
            self.__setReg(instr.A, self.__getReg(instr.B))
        elif instr.opcode == Opcodes.LOADK:
            self.__setReg(instr.A, self.chunk.constants[instr.B].toCode())
        elif instr.opcode == Opcodes.LOADBOOL:
            if instr.B == 0:
                self.__setReg(instr.A, "false")
            else:
                self.__setReg(instr.A, "true")
        elif instr.opcode == Opcodes.GETGLOBAL:
            self.__setReg(instr.A, self.chunk.constants[instr.B].data)
        elif instr.opcode == Opcodes.GETTABLE:
            self.__setReg(instr.A, self.__getReg(instr.B) + "[" + self.__readRK(instr.C) + "]")
        elif instr.opcode == Opcodes.SETGLOBAL:
            self.__startStatement()
            self.__addExpr(self.chunk.constants[instr.B].data + " = " + self.__getReg(instr.A))
        elif instr.opcode == Opcodes.SETTABLE:
            self.__startStatement()
            self.__addExpr(self.__getReg(instr.A) + "[" + self.__readRK(instr.B) + "] = " + self.__readRK(instr.C))
        elif instr.opcode == Opcodes.ADD:
            self.__emitOperand(instr.A, self.__readRK(instr.B), self.__readRK(instr.C), " + ")
        elif instr.opcode == Opcodes.SUB:
            self.__emitOperand(instr.A, self.__readRK(instr.B), self.__readRK(instr.C), " - ")
        elif instr.opcode == Opcodes.MUL:
            self.__emitOperand(instr.A, self.__readRK(instr.B), self.__readRK(instr.C), " * ")
        elif instr.opcode == Opcodes.DIV:
            self.__emitOperand(instr.A, self.__readRK(instr.B), self.__readRK(instr.C), " / ")
        elif instr.opcode == Opcodes.MOD:
            self.__emitOperand(instr.A, self.__readRK(instr.B), self.__readRK(instr.C), " % ")
        elif instr.opcode == Opcodes.POW:
            self.__emitOperand(instr.A, self.__readRK(instr.B), self.__readRK(instr.C), " ^ ")
        elif instr.opcode == Opcodes.UNM:
            self.__setReg(instr.A, "-" + self.__getReg(instr.B))
        elif instr.opcode == Opcodes.NOT:
            # Lua spells logical negation `not`; `!` is not a Lua operator
            self.__setReg(instr.A, "not " + self.__getReg(instr.B))
        elif instr.opcode == Opcodes.LEN:
            # operand is register B (previously called __getCurrInstr with an argument -> TypeError)
            self.__setReg(instr.A, "#" + self.__getReg(instr.B))
        elif instr.opcode == Opcodes.CONCAT:
            count = instr.C-instr.B+1
            # concat all items on stack from RB to RC
            concatStr = " .. ".join(self.__getReg(instr.B + i) for i in range(count))
            self.__setReg(instr.A, concatStr)
        elif instr.opcode == Opcodes.JMP:
            pass
        elif instr.opcode == Opcodes.EQ:
            comparison = self.__readRK(instr.B) + " == " + self.__readRK(instr.C)
            self.__startStatement()
            if instr.A > 0:
                # `not` binds tighter than `==` in Lua, so the comparison must be parenthesized
                self.__addExpr("if not (" + comparison + ") ")
            else:
                self.__addExpr("if " + comparison + " ")
            self.__startScope("then ", self.__getNextInstr().B + 1)

            self.pc += 1 # skip next instr
        elif instr.opcode == Opcodes.LT:
            self.__emitOperand(instr.A, self.__readRK(instr.B), self.__readRK(instr.C), " < ")
        elif instr.opcode == Opcodes.LE:
            # handled the same way as LT; operands are RKs and must be resolved to text first
            self.__emitOperand(instr.A, self.__readRK(instr.B), self.__readRK(instr.C), " <= ")
        elif instr.opcode == Opcodes.CALL:
            preStr = ""
            callStr = ""
            ident = ""

            # parse arguments
            callStr += self.__getReg(instr.A) + "("
            for i in range(instr.A + 1, instr.A + instr.B):
                callStr += self.__getReg(i) + (", " if not i + 1 == instr.A + instr.B else "")
            callStr += ")"

            # parse return values
            if instr.C > 1:
                preStr = "local "
                for indx in range(instr.A, instr.A + instr.C - 1):
                    if indx in self.locals:
                        ident = self.locals[indx]
                    else:
                        ident = self.__makeLocalIdentifier(indx)
                    preStr += ident

                    # normally setReg() does this
                    self.top[indx] = ident

                    # just so we don't have a trailing ', '
                    preStr += ", " if not indx == instr.A + instr.C - 2 else ""
                preStr += " = "

            self.__startStatement()
            self.__addExpr(preStr + callStr)
        elif instr.opcode == Opcodes.RETURN:
            self.__startStatement()
            pass # no-op for now
        else:
            raise Exception("unsupported instruction: %s" % instr.toString())

399
lundump.py Normal file
View File

@ -0,0 +1,399 @@
'''
l(un)dump.py
A Lua5.1 cross-platform bytecode deserializer. This module pulls int and size_t sizes from the
chunk header, meaning it should be able to deserialize lua bytecode dumps from most platforms,
regardless of the host machine.
For details on the Lua5.1 bytecode format, I read [this PDF](https://archive.org/download/a-no-frills-intro-to-lua-5.1-vm-instructions/a-no-frills-intro-to-lua-5.1-vm-instructions_archive.torrent)
as well as read the lundump.c source file from the Lua5.1 source.
'''
from multiprocessing.spawn import get_executable
import struct
import array
from enum import IntEnum, Enum, auto
from typing_extensions import Self
class InstructionType(Enum):
    '''Operand layout of an instruction: A/B/C, A/Bx, or A/sBx.'''
    # no trailing commas: a trailing comma turns the value into a one-element tuple,
    # which made the members' values inconsistent (two tuples and one int)
    ABC = auto()
    ABx = auto()
    AsBx = auto()
class Opcodes(IntEnum):
    '''Numeric opcode of each instruction, usable as an index into instr_lookup_tbl.'''
    MOVE = 0
    LOADK = 1
    LOADBOOL = 2
    LOADNIL = 3
    GETUPVAL = 4

    GETGLOBAL = 5
    GETTABLE = 6

    SETGLOBAL = 7
    SETUPVAL = 8
    SETTABLE = 9

    NEWTABLE = 10

    SELF = 11

    ADD = 12
    SUB = 13
    MUL = 14
    DIV = 15
    MOD = 16
    POW = 17
    UNM = 18
    NOT = 19
    LEN = 20

    CONCAT = 21

    JMP = 22

    EQ = 23
    LT = 24
    LE = 25

    TEST = 26
    TESTSET = 27

    CALL = 28
    TAILCALL = 29
    RETURN = 30

    FORLOOP = 31
    FORPREP = 32
    TFORLOOP = 33

    SETLIST = 34

    CLOSE = 35
    CLOSURE = 36

    VARARG = 37
class ConstType(IntEnum):
    '''Type tag stored in front of each constant in a dumped chunk.'''
    NIL = 0
    BOOL = 1
    # 2 is intentionally unused by this enum
    NUMBER = 3
    STRING = 4
class Instruction:
    '''
    One VM instruction: its operand layout, mnemonic, opcode and operands A/B/C.

    opcode and the operands start out as None and are filled in by the decoder;
    C is only set for ABC-type instructions.
    '''

    def __init__(self, type: InstructionType, name: str) -> None:
        self.type = type
        self.name = name
        self.opcode: int = None
        self.A: int = None
        self.B: int = None
        self.C: int = None

    # 'RK's are special in because can be a register or a konstant. a bitflag is read to determine which
    def __readRK(self, rk: int) -> str:
        if (rk & (1 << 8)) > 0:
            return "K[" + str((rk & ~(1 << 8))) + "]"
        else:
            return "R[" + str(rk) + "]"

    def toString(self):
        '''Return a one-line disassembly of this instruction: padded mnemonic, then operands.'''
        instr = "%10s" % self.name
        regs = ""

        if self.type == InstructionType.ABC:
            A = "%d" % self.A
            B = "%d" % self.B
            C = "%d" % self.C

            # these opcodes have RKs for B & C (LE and the arithmetic ops were missing,
            # so their constant operands were printed as raw numbers >= 256)
            if self.opcode in (
                Opcodes.SETTABLE,
                Opcodes.ADD, Opcodes.SUB, Opcodes.MUL, Opcodes.DIV, Opcodes.MOD, Opcodes.POW,
                Opcodes.EQ, Opcodes.LT, Opcodes.LE,
            ):
                B = self.__readRK(self.B)
                C = self.__readRK(self.C)
            elif self.opcode in (Opcodes.GETTABLE, Opcodes.SELF): # just for C
                C = self.__readRK(self.C)

            regs = "%s %s %s" % (A, B, C)
        elif self.type == InstructionType.ABx or self.type == InstructionType.AsBx:
            regs = "%d %d" % (self.A, self.B)

        return "%s : %s" % (instr, regs)
class Constant:
    '''A constant from a chunk's constant table: a ConstType tag plus its Python value.'''

    def __init__(self, type: ConstType, data) -> None:
        self.type = type
        self.data = data

    def toString(self):
        '''Return a debug representation: `[TYPENAME] value`.'''
        return "[" + self.type.name + "] " + str(self.data)

    @staticmethod
    def _escapeString(text: str) -> str:
        '''Escape `text` so it can sit between double quotes in Lua source.'''
        named = {"\\": "\\\\", "\"": "\\\"", "\n": "\\n", "\r": "\\r", "\t": "\\t"}
        pieces = []
        for ch in text:
            if ch in named:
                pieces.append(named[ch])
            elif ord(ch) < 32 or ord(ch) == 127:
                # always three digits so a following digit is not absorbed into the escape
                pieces.append("\\%03d" % ord(ch))
            else:
                pieces.append(ch)
        return "".join(pieces)

    # format the constant so that it is parsable by lua
    def toCode(self):
        if self.type == ConstType.STRING:
            # quotes, backslashes and control characters would otherwise break the literal
            return "\"" + self._escapeString(self.data) + "\""
        elif self.type == ConstType.BOOL:
            if self.data:
                return "true"
            else:
                return "false"
        elif self.type == ConstType.NUMBER:
            return str(self.data)
        else:
            return "nil"
class Local:
    '''Debug record for one local variable: its name and the start/end values read with it.'''

    def __init__(self, name: str, start: int, end: int):
        self.name, self.start, self.end = name, start, end
class Chunk:
    '''
    One function prototype from a bytecode dump: its header fields, instructions,
    constants, nested prototypes and debug information.
    '''

    def __init__(self) -> None:
        # code and data
        self.constants: list[Constant] = []
        self.instructions: list[Instruction] = []
        self.protos: list[Chunk] = []

        # header fields
        self.name: str = "Unnamed proto"
        self.frst_line: int = 0
        self.last_line: int = 0
        self.numUpvals: int = 0
        self.numParams: int = 0
        self.isVarg: bool = False
        self.maxStack: int = 0

        # debug information
        self.upvalues: list[str] = []
        self.locals: list[Local] = []

    def appendInstruction(self, instr: Instruction):
        '''Add `instr` to the end of the instruction list.'''
        self.instructions.append(instr)

    def appendConstant(self, const: Constant):
        '''Add `const` to the end of the constant table.'''
        self.constants.append(const)

    def appendProto(self, proto):
        '''Add a nested prototype.'''
        self.protos.append(proto)

    def appendLocal(self, local: Local):
        '''Add a local-variable debug record.'''
        self.locals.append(local)

    def findLocal(self, pc: int) -> Local:
        '''Return the first local whose [start, end] range contains `pc`, or None.'''
        # None means there's no local information (may have been stripped)
        return next(
            (candidate for candidate in self.locals if candidate.start <= pc <= candidate.end),
            None,
        )

    def print(self):
        '''Print the constants and disassembly of this chunk, then of every nested proto.'''
        label = str(self.name)

        print("\n==== [[" + label + "'s constants]] ====\n")
        for idx, const in enumerate(self.constants):
            print(str(idx) + ": " + const.toString())

        print("\n==== [[" + label + "'s dissassembly]] ====\n")
        for idx, instr in enumerate(self.instructions):
            print("[%3d] %s" % (idx, instr.toString()))

        print("\n==== [[" + label + "'s protos]] ====\n")
        for proto in self.protos:
            proto.print()
# template instructions indexed by opcode number: each entry carries the operand
# layout and the mnemonic for that opcode
instr_lookup_tbl = [
    Instruction(layout, mnemonic)
    for layout, mnemonic in (
        (InstructionType.ABC, "MOVE"),
        (InstructionType.ABx, "LOADK"),
        (InstructionType.ABC, "LOADBOOL"),
        (InstructionType.ABC, "LOADNIL"),
        (InstructionType.ABC, "GETUPVAL"),
        (InstructionType.ABx, "GETGLOBAL"),
        (InstructionType.ABC, "GETTABLE"),
        (InstructionType.ABx, "SETGLOBAL"),
        (InstructionType.ABC, "SETUPVAL"),
        (InstructionType.ABC, "SETTABLE"),
        (InstructionType.ABC, "NEWTABLE"),
        (InstructionType.ABC, "SELF"),
        (InstructionType.ABC, "ADD"),
        (InstructionType.ABC, "SUB"),
        (InstructionType.ABC, "MUL"),
        (InstructionType.ABC, "DIV"),
        (InstructionType.ABC, "MOD"),
        (InstructionType.ABC, "POW"),
        (InstructionType.ABC, "UNM"),
        (InstructionType.ABC, "NOT"),
        (InstructionType.ABC, "LEN"),
        (InstructionType.ABC, "CONCAT"),
        (InstructionType.AsBx, "JMP"),
        (InstructionType.ABC, "EQ"),
        (InstructionType.ABC, "LT"),
        (InstructionType.ABC, "LE"),
        (InstructionType.ABC, "TEST"),
        (InstructionType.ABC, "TESTSET"),
        (InstructionType.ABC, "CALL"),
        (InstructionType.ABC, "TAILCALL"),
        (InstructionType.ABC, "RETURN"),
        (InstructionType.AsBx, "FORLOOP"),
        (InstructionType.AsBx, "FORPREP"),
        (InstructionType.ABC, "TFORLOOP"),
        (InstructionType.ABC, "SETLIST"),
        (InstructionType.ABC, "CLOSE"),
        (InstructionType.ABx, "CLOSURE"),
        (InstructionType.ABC, "VARARG"),
    )
]
# extract a bit field from num: [s] bits wide, starting at bit [p]osition
def get_bits(num: int, p: int, s: int):
    field_mask = (1 << s) - 1
    shifted = num >> p
    return shifted & field_mask
# overwrite the [s]-bit wide field of num at bit [p]osition with data (data is truncated to s bits)
def set_bits(num, data, p, s) -> int:
    field_mask = ((1 << s) - 1) << p
    cleared = num & ~field_mask
    placed = (data << p) & field_mask
    return cleared | placed
def _decode_instr(data: int) -> Instruction:
    '''Decode one instruction word into a new Instruction built from the opcode's template.'''
    op = get_bits(data, 0, 6)
    proto = instr_lookup_tbl[op]
    decoded = Instruction(proto.type, proto.name)
    layout = decoded.type

    # i read the lopcodes.h file to get these bit position and sizes.
    decoded.opcode = op
    decoded.A = get_bits(data, 6, 8) # starts after POS_OP + SIZE_OP (6), with a size of 8

    if layout == InstructionType.ABC:
        decoded.B = get_bits(data, 23, 9) # starts after POS_C + SIZE_C (23), with a size of 9
        decoded.C = get_bits(data, 14, 9) # starts after POS_A + SIZE_A (14), with a size of 9
    elif layout == InstructionType.ABx:
        decoded.B = get_bits(data, 14, 18) # starts after POS_A + SIZE_A (14), with a size of 18
    elif layout == InstructionType.AsBx:
        # Bx is now signed, so just sub half of the MAX_UINT for 18 bits
        raw_bx = get_bits(data, 14, 18)
        decoded.B = raw_bx - 131071

    return decoded
# returns a u32 instruction
def _encode_instr(instr: Instruction) -> int:
    layout = instr.type

    # encode instruction (basically, do the inverse of _decode_instr)
    word = set_bits(0, instr.opcode, 0, 6)
    word = set_bits(word, instr.A, 6, 8)

    if layout == InstructionType.ABC:
        word = set_bits(word, instr.B, 23, 9)
        word = set_bits(word, instr.C, 14, 9)
    elif layout == InstructionType.ABx:
        word = set_bits(word, instr.B, 14, 18)
    elif layout == InstructionType.AsBx:
        # undo the bias that _decode_instr subtracts
        biased = instr.B + 131071
        word = set_bits(word, biased, 14, 18)

    return word
class LuaUndump:
    '''
    Deserializer for compiled Lua bytecode dumps.

    Integer and size_t widths and the byte order are taken from the dump's header,
    then the chunk tree is decoded recursively into Chunk objects. The decoded root
    is kept in `self.rootChunk`.
    '''

    def __init__(self):
        self.rootChunk: Chunk = None
        self.index = 0 # read cursor into self.bytecode

    @staticmethod
    def dis_chunk(chunk: Chunk):
        '''Print the disassembly of `chunk`.'''
        chunk.print()

    def loadBlock(self, sz) -> bytearray:
        '''
        Consume and return the next `sz` bytes of the bytecode.

        Raises Exception if fewer than `sz` bytes remain.
        '''
        if self.index + sz > len(self.bytecode):
            raise Exception("Malformed bytecode!")

        temp = bytearray(self.bytecode[self.index:self.index+sz])
        self.index = self.index + sz
        return temp

    def _read_uint(self, size: int) -> int:
        '''Consume `size` bytes and decode them as an unsigned integer in the dump's byte order.'''
        order = 'big' if self.big_endian else 'little'
        return int.from_bytes(self.loadBlock(size), byteorder=order, signed=False)

    def get_byte(self) -> int:
        '''Consume one byte and return it as an int.'''
        return self.loadBlock(1)[0]

    def get_int32(self) -> int:
        '''Consume a 4-byte unsigned integer.'''
        return self._read_uint(4)

    def get_int(self) -> int:
        '''Consume an unsigned integer of the header-declared int size.'''
        return self._read_uint(self.int_size)

    def get_size_t(self) -> int:
        '''Consume an unsigned integer of the header-declared size_t size.'''
        return self._read_uint(self.size_t)

    def get_double(self) -> float:
        '''Consume 8 bytes and decode them as a double in the dump's byte order.'''
        if self.big_endian:
            return struct.unpack('>d', self.loadBlock(8))[0]
        else:
            return struct.unpack('<d', self.loadBlock(8))[0]

    def get_string(self, size) -> str:
        '''
        Consume a string of `size` bytes; when `size` is None the length is read
        first as a size_t. Each byte becomes one character. A zero length yields "".
        '''
        if size is None:
            size = self.get_size_t()
            if (size == 0):
                return ""

        return "".join(chr(x) for x in self.loadBlock(size))

    def decode_chunk(self) -> Chunk:
        '''Decode one chunk (and, recursively, its nested protos) at the read cursor.'''
        chunk = Chunk()

        chunk.name = self.get_string(None)
        chunk.frst_line = self.get_int()
        chunk.last_line = self.get_int()

        chunk.numUpvals = self.get_byte()
        chunk.numParams = self.get_byte()
        chunk.isVarg = (self.get_byte() != 0)
        chunk.maxStack = self.get_byte()

        if (not chunk.name == ""):
            # drop the first and the last character of the stored name
            chunk.name = chunk.name[1:-1]

        # parse instructions
        num = self.get_int()
        for i in range(num):
            chunk.appendInstruction(_decode_instr(self.get_int32()))

        # get constants
        num = self.get_int()
        for i in range(num):
            constant: Constant = None
            type = self.get_byte()

            if type == 0: #nil
                constant = Constant(ConstType.NIL, None)
            elif type == 1: # bool
                constant = Constant(ConstType.BOOL, (self.get_byte() != 0))
            elif type == 3: # number
                constant = Constant(ConstType.NUMBER, self.get_double())
            elif type == 4: # string
                constant = Constant(ConstType.STRING, self.get_string(None)[:-1])
            else:
                raise Exception("Unknown Datatype! [%d]" % type)

            chunk.appendConstant(constant)

        # parse protos
        num = self.get_int()
        for i in range(num):
            chunk.appendProto(self.decode_chunk())

        # debug stuff, maybe i'll add this to chunks to have better disassembly annotation in the future?
        # eh, for now just consume the bytes.

        # line numbers
        num = self.get_int()
        for i in range(num):
            self.get_int()

        # locals
        num = self.get_int()
        for i in range(num):
            # drop the trailing terminator byte, the same way string constants are handled
            name = self.get_string(None)[:-1] # local name
            start = self.get_int() # local start PC
            end = self.get_int() # local end PC
            chunk.appendLocal(Local(name, start, end))

        # upvalues
        num = self.get_int()
        for i in range(num):
            # keep the names instead of discarding them; Chunk.upvalues exists for this
            chunk.upvalues.append(self.get_string(None)[:-1]) # upvalue name

        return chunk

    def decode_rawbytecode(self, rawbytecode):
        '''
        Check the signature of `rawbytecode` and decode it into the root Chunk.

        Raises Exception if the data does not start with the Lua bytecode signature.
        '''
        # bytecode sanity checks
        if not rawbytecode[0:4] == b'\x1bLua':
            raise Exception("Lua Bytecode expected!")

        bytecode = array.array('b', rawbytecode)
        return self.decode_bytecode(bytecode)

    def decode_bytecode(self, bytecode):
        '''Read the header fields that follow the signature, then decode and return the root chunk.'''
        self.bytecode = bytecode

        # aligns index, skips header
        self.index = 4

        self.vm_version = self.get_byte()
        self.bytecode_format = self.get_byte()
        self.big_endian = (self.get_byte() == 0)
        self.int_size = self.get_byte()
        self.size_t = self.get_byte()
        self.instr_size = self.get_byte() # gets size of instructions
        self.l_number_size = self.get_byte() # size of lua_Number
        self.integral_flag = self.get_byte()

        self.rootChunk = self.decode_chunk()
        return self.rootChunk

    def loadFile(self, luaCFile):
        '''Read the file at `luaCFile` in binary mode and decode it into the root Chunk.'''
        with open(luaCFile, 'rb') as luac_file:
            bytecode = luac_file.read()
            return self.decode_rawbytecode(bytecode)

    def print_dissassembly(self):
        '''Print the disassembly of the most recently decoded root chunk.'''
        LuaUndump.dis_chunk(self.rootChunk)

10
main.py Normal file
View File

@ -0,0 +1,10 @@
import sys
import lundump
import lparser
# exit with a usage hint instead of an IndexError traceback when no path is given
if len(sys.argv) < 2:
    sys.exit("usage: " + sys.argv[0] + " <compiled lua bytecode file>")

# deserialize the dump, show its disassembly, then run the decompiler on it
lc = lundump.LuaUndump()
print(sys.argv[1])
chunk = lc.loadFile(sys.argv[1])
lc.print_dissassembly()
lp = lparser.LuaDecomp(chunk)