From 0dbdfc49c66712193f56b868d91544700d7cd767 Mon Sep 17 00:00:00 2001
From: CPunch
Date: Thu, 11 Aug 2022 15:38:31 -0500
Subject: [PATCH] Initial commit

- Extremely basic decompiler implemented in lparser.py
- lundump.py ported from [this repository](https://github.com/CPunch/LuaPytecode)
---
 .gitignore |   2 +
 lparser.py | 230 ++++++++++++++++++++++++++++++
 lundump.py | 401 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 main.py    |  13 ++
 4 files changed, 646 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 lparser.py
 create mode 100644 lundump.py
 create mode 100644 main.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..6ae2705
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+example.*
+__pycache__
diff --git a/lparser.py b/lparser.py
new file mode 100644
index 0000000..a016b7f
--- /dev/null
+++ b/lparser.py
@@ -0,0 +1,230 @@
+'''
+    lparser.py
+
+    Depends on lundump.py for lua dump deserialization.
+
+    An experimental bytecode decompiler.
+'''
+
+from lundump import Chunk, LuaUndump, Constant, Instruction, InstructionType, Opcodes
+
+class _Scope:
+    # a block of source (e.g. an if-body) that stays open until pc moves past endPC
+    def __init__(self, startPC: int, endPC: int):
+        self.startPC = startPC
+        self.endPC = endPC
+
+class LuaDecomp:
+    '''
+        Walks the instructions of a chunk and rebuilds (approximate) lua source from them.
+        The chunk is decompiled on construction; the result is kept in self.src and printed.
+    '''
+    def __init__(self, chunk: Chunk):
+        self.chunk = chunk
+        self.pc = 0
+        self.scope = [] # stack of open _Scope blocks, innermost last
+        self.top = {} # register index -> expression last written to it
+        self.locals = {} # register index -> local identifier
+        self.unknownLocalCount = 0
+        self.src: str = ""
+
+        # configurations!
+        self.aggressiveLocals = False # should *EVERY* accessed register be considered a local?
+        self.indexWidth = 4 # how many spaces for indentions?
+
+        # parse instructions
+        while self.pc < len(self.chunk.instructions):
+            self.parseExpr()
+            self.pc += 1
+
+            # end the scope (if we're supposed to). checked after every instruction so
+            # that 'end' lands right after the last instruction of the block
+            self.__checkScope()
+
+        print("\n==== [[" + str(self.chunk.name) + "'s decompiled source]] ====\n")
+        print(self.src)
+
+    def __makeLocalIdentifier(self, indx: int) -> str:
+        # invent a placeholder name for register indx and remember it
+        self.locals[indx] = "__unknLocal%d" % self.unknownLocalCount
+        self.unknownLocalCount += 1
+
+        return self.locals[indx]
+
+    def __newLocal(self, indx: int, expr: str) -> None:
+        # TODO: grab identifier from chunk(?)
+        self.__makeLocalIdentifier(indx)
+
+        self.__startStatement()
+        self.__addExpr("local " + self.locals[indx] + " = " + expr)
+
+    # returns None when the current instruction is the last one
+    def __getNextInstr(self) -> Instruction:
+        if self.pc + 1 < len(self.chunk.instructions):
+            return self.chunk.instructions[self.pc + 1]
+
+        return None
+
+    def __getCurrInstr(self) -> Instruction:
+        return self.chunk.instructions[self.pc]
+
+    def __addExpr(self, code: str) -> None:
+        self.src += code
+
+    # begins a new source line, indented to the current scope depth
+    def __startStatement(self):
+        self.src += '\n' + (' ' * self.indexWidth * len(self.scope))
+
+    def __getReg(self, indx: int) -> str:
+        # if the top indx is a local, get it
+        return self.locals[indx] if indx in self.locals else self.top[indx]
+
+    def __setReg(self, indx: int, code: str) -> None:
+        # if the top indx is a local, set it
+        if indx in self.locals:
+            self.__startStatement()
+            self.__addExpr(self.locals[indx] + " = " + code)
+        elif self.aggressiveLocals: # 'every register is a local!!'
+            self.__newLocal(indx, code)
+
+        self.top[indx] = code
+
+    # emits scopeType (eg. 'then ') and opens a scope covering the next [size] instructions
+    def __startScope(self, scopeType: str, size: int) -> None:
+        self.__addExpr(scopeType)
+        self.scope.append(_Scope(self.pc, self.pc + size))
+
+    # ends every scope the pc has moved past (nested scopes can end on the same instruction)
+    def __checkScope(self) -> None:
+        while len(self.scope) > 0 and self.pc > self.scope[-1].endPC:
+            self.__endScope()
+
+    def __endScope(self) -> None:
+        self.scope.pop()
+        self.__startStatement()
+        self.__addExpr("end")
+
+    def __emitOperand(self, a: int, b: str, c: str, op: str) -> None:
+        self.__setReg(a, "(" + b + op + c + ")")
+
+    # 'RK's are special because they can be a register or a konstant. a bitflag is read to determine which
+    def __readRK(self, rk: int) -> str:
+        if (rk & (1 << 8)) > 0:
+            return self.chunk.constants[(rk & ~(1 << 8))].toCode()
+        else:
+            return self.__getReg(rk)
+
+    # EQ, LT & LE are conditional jumps: 'if ((RK(B) op RK(C)) ~= A) then pc++', always followed by
+    # the JMP that skips the guarded block. A == 0 reads as 'if b op c', anything else as its negation
+    def __emitCondJump(self, instr: Instruction, op: str) -> None:
+        jmp = self.__getNextInstr()
+        if jmp is None or not jmp.opcode == Opcodes.JMP:
+            raise Exception("expected a JMP after: %s" % instr.toString())
+
+        cond = self.__readRK(instr.B) + op + self.__readRK(instr.C)
+
+        self.__startStatement()
+        if instr.A > 0:
+            # parenthesized, 'not' binds tighter than the comparison operators
+            self.__addExpr("if not (" + cond + ") ")
+        else:
+            self.__addExpr("if " + cond + " ")
+        self.__startScope("then ", jmp.B + 1)
+
+        self.pc += 1 # skip next instr
+
+    # decompiles the instruction at self.pc; may advance self.pc past instructions it consumes
+    def parseExpr(self):
+        instr = self.__getCurrInstr()
+
+        # python, add switch statements *please*
+        if instr.opcode == Opcodes.MOVE: # move is a fake ABC instr, C is ignored
+            # move registers
+            self.__setReg(instr.A, self.__getReg(instr.B))
+        elif instr.opcode == Opcodes.LOADK:
+            self.__setReg(instr.A, self.chunk.constants[instr.B].toCode())
+        elif instr.opcode == Opcodes.LOADBOOL:
+            if instr.B == 0:
+                self.__setReg(instr.A, "false")
+            else:
+                self.__setReg(instr.A, "true")
+        elif instr.opcode == Opcodes.GETGLOBAL:
+            self.__setReg(instr.A, self.chunk.constants[instr.B].data)
+        elif instr.opcode == Opcodes.GETTABLE:
+            self.__setReg(instr.A, self.__getReg(instr.B) + "[" + self.__readRK(instr.C) + "]")
+        elif instr.opcode == Opcodes.SETGLOBAL:
+            self.__startStatement()
+            self.__addExpr(self.chunk.constants[instr.B].data + " = " + self.__getReg(instr.A))
+        elif instr.opcode == Opcodes.SETTABLE:
+            self.__startStatement()
+            self.__addExpr(self.__getReg(instr.A) + "[" + self.__readRK(instr.B) + "] = " + self.__readRK(instr.C))
+        elif instr.opcode == Opcodes.ADD:
+            self.__emitOperand(instr.A, self.__readRK(instr.B), self.__readRK(instr.C), " + ")
+        elif instr.opcode == Opcodes.SUB:
+            self.__emitOperand(instr.A, self.__readRK(instr.B), self.__readRK(instr.C), " - ")
+        elif instr.opcode == Opcodes.MUL:
+            self.__emitOperand(instr.A, self.__readRK(instr.B), self.__readRK(instr.C), " * ")
+        elif instr.opcode == Opcodes.DIV:
+            self.__emitOperand(instr.A, self.__readRK(instr.B), self.__readRK(instr.C), " / ")
+        elif instr.opcode == Opcodes.MOD:
+            self.__emitOperand(instr.A, self.__readRK(instr.B), self.__readRK(instr.C), " % ")
+        elif instr.opcode == Opcodes.POW:
+            self.__emitOperand(instr.A, self.__readRK(instr.B), self.__readRK(instr.C), " ^ ")
+        elif instr.opcode == Opcodes.UNM:
+            self.__setReg(instr.A, "-" + self.__getReg(instr.B))
+        elif instr.opcode == Opcodes.NOT:
+            self.__setReg(instr.A, "not " + self.__getReg(instr.B)) # lua has no '!' operator
+        elif instr.opcode == Opcodes.LEN:
+            self.__setReg(instr.A, "#" + self.__getReg(instr.B))
+        elif instr.opcode == Opcodes.CONCAT:
+            count = instr.C-instr.B+1
+            concatStr = ""
+
+            # concat all items on stack from RC to RB
+            for i in range(count):
+                concatStr += self.__getReg(instr.B + i) + (" .. " if not i == count - 1 else "")
+
+            self.__setReg(instr.A, concatStr)
+        elif instr.opcode == Opcodes.JMP:
+            pass
+        elif instr.opcode == Opcodes.EQ:
+            self.__emitCondJump(instr, " == ")
+        elif instr.opcode == Opcodes.LT:
+            self.__emitCondJump(instr, " < ")
+        elif instr.opcode == Opcodes.LE:
+            self.__emitCondJump(instr, " <= ")
+        elif instr.opcode == Opcodes.CALL:
+            preStr = ""
+            callStr = ""
+            ident = ""
+
+            # parse arguments
+            callStr += self.__getReg(instr.A) + "("
+            for i in range(instr.A + 1, instr.A + instr.B):
+                callStr += self.__getReg(i) + (", " if not i + 1 == instr.A + instr.B else "")
+            callStr += ")"
+
+            # parse return values
+            if instr.C > 1:
+                preStr = "local "
+                for indx in range(instr.A, instr.A + instr.C - 1):
+                    if indx in self.locals:
+                        ident = self.locals[indx]
+                    else:
+                        ident = self.__makeLocalIdentifier(indx)
+                    preStr += ident
+
+                    # normally setReg() does this
+                    self.top[indx] = ident
+
+                    # just so we don't have a trailing ', '
+                    preStr += ", " if not indx == instr.A + instr.C - 2 else ""
+                preStr += " = "
+
+            self.__startStatement()
+            self.__addExpr(preStr + callStr)
+        elif instr.opcode == Opcodes.RETURN:
+            self.__startStatement()
+            pass # no-op for now
+        else:
+            raise Exception("unsupported instruction: %s" % instr.toString())
\ No newline at end of file
diff --git a/lundump.py b/lundump.py
new file mode 100644
index 0000000..f0d400d
--- /dev/null
+++ b/lundump.py
@@ -0,0 +1,401 @@
+'''
+    l(un)dump.py
+
+    A Lua5.1 cross-platform bytecode deserializer. This module pulls int and size_t sizes from the
+    chunk header, meaning it should be able to deserialize lua bytecode dumps from most platforms,
+    regardless of the host machine.
+
+    For details on the Lua5.1 bytecode format, I read [this PDF](https://archive.org/download/a-no-frills-intro-to-lua-5.1-vm-instructions/a-no-frills-intro-to-lua-5.1-vm-instructions_archive.torrent)
+    as well as read the lundump.c source file from the Lua5.1 source.
+'''
+
+import struct
+import array
+from enum import IntEnum, Enum, auto
+
+class InstructionType(Enum):
+    ABC = auto()
+    ABx = auto()
+    AsBx = auto()
+
+class Opcodes(IntEnum):
+    MOVE = 0
+    LOADK = 1
+    LOADBOOL = 2
+    LOADNIL = 3
+    GETUPVAL = 4
+    GETGLOBAL = 5
+    GETTABLE = 6
+    SETGLOBAL = 7
+    SETUPVAL = 8
+    SETTABLE = 9
+    NEWTABLE = 10
+    SELF = 11
+    ADD = 12
+    SUB = 13
+    MUL = 14
+    DIV = 15
+    MOD = 16
+    POW = 17
+    UNM = 18
+    NOT = 19
+    LEN = 20
+    CONCAT = 21
+    JMP = 22
+    EQ = 23
+    LT = 24
+    LE = 25
+    TEST = 26
+    TESTSET = 27
+    CALL = 28
+    TAILCALL = 29
+    RETURN = 30
+    FORLOOP = 31
+    FORPREP = 32
+    TFORLOOP = 33
+    SETLIST = 34
+    CLOSE = 35
+    CLOSURE = 36
+    VARARG = 37
+
+class ConstType(IntEnum):
+    NIL = 0
+    BOOL = 1
+    NUMBER = 3
+    STRING = 4
+
+class Instruction:
+    # a single decoded VM instruction. B holds Bx/sBx for the ABx/AsBx formats, C is only set for ABC
+    def __init__(self, type: InstructionType, name: str) -> None:
+        self.type = type
+        self.name = name
+        self.opcode: int = None
+        self.A: int = None
+        self.B: int = None
+        self.C: int = None
+
+    # 'RK's are special because they can be a register or a konstant. a bitflag is read to determine which
+    def __readRK(self, rk: int) -> str:
+        if (rk & (1 << 8)) > 0:
+            return "K[" + str((rk & ~(1 << 8))) + "]"
+        else:
+            return "R[" + str(rk) + "]"
+
+    def toString(self):
+        instr = "%10s" % self.name
+        regs = ""
+
+        if self.type == InstructionType.ABC:
+            A = "%d" % self.A
+            B = "%d" % self.B
+            C = "%d" % self.C
+
+            # these opcodes have RKs for B & C
+            if self.opcode in (Opcodes.SETTABLE, Opcodes.EQ, Opcodes.LT, Opcodes.LE,
+                    Opcodes.ADD, Opcodes.SUB, Opcodes.MUL, Opcodes.DIV, Opcodes.MOD, Opcodes.POW):
+                B = self.__readRK(self.B)
+                C = self.__readRK(self.C)
+            elif self.opcode in (Opcodes.GETTABLE, Opcodes.SELF): # just for C
+                C = self.__readRK(self.C)
+
+            regs = "%s %s %s" % (A, B, C)
+        elif self.type == InstructionType.ABx or self.type == InstructionType.AsBx:
+            regs = "%d %d" % (self.A, self.B)
+
+        return "%s : %s" % (instr, regs)
+
+class Constant:
+    def __init__(self, type: ConstType, data) -> None:
+        self.type = type
+        self.data = data
+
+    def toString(self):
+        return "[" + self.type.name + "] " + str(self.data)
+
+    # format the constant so that it is parsable by lua
+    def toCode(self):
+        if self.type == ConstType.STRING:
+            # escape what would otherwise end or corrupt the string literal
+            escaped = self.data.replace("\\", "\\\\").replace("\"", "\\\"").replace("\n", "\\n")
+            return "\"" + escaped + "\""
+        elif self.type == ConstType.BOOL:
+            if self.data:
+                return "true"
+            else:
+                return "false"
+        elif self.type == ConstType.NUMBER:
+            return str(self.data)
+        else:
+            return "nil"
+
+class Local:
+    def __init__(self, name: str, start: int, end: int):
+        self.name = name
+        self.start = start
+        self.end = end
+
+class Chunk:
+    def __init__(self) -> None:
+        self.constants: list[Constant] = []
+        self.instructions: list[Instruction] = []
+        self.protos: list[Chunk] = []
+
+        self.name: str = "Unnamed proto"
+        self.frst_line: int = 0
+        self.last_line: int = 0
+        self.numUpvals: int = 0
+        self.numParams: int = 0
+        self.isVarg: bool = False
+        self.maxStack: int = 0
+
+        self.upvalues: list[str] = []
+        self.locals: list[Local] = []
+
+    def appendInstruction(self, instr: Instruction):
+        self.instructions.append(instr)
+
+    def appendConstant(self, const: Constant):
+        self.constants.append(const)
+
+    def appendProto(self, proto):
+        self.protos.append(proto)
+
+    def appendLocal(self, local: Local):
+        self.locals.append(local)
+
+    def findLocal(self, pc: int) -> Local:
+        for l in self.locals:
+            if l.start <= pc and l.end >= pc:
+                return l
+
+        # there's no local information (may have been stripped)
+        return None
+
+    def print(self):
+        print("\n==== [[" + str(self.name) + "'s constants]] ====\n")
+        for z in range(len(self.constants)):
+            i = self.constants[z]
+            print(str(z) + ": " + i.toString())
+
+        print("\n==== [[" + str(self.name) + "'s dissassembly]] ====\n")
+        for i in range(len(self.instructions)):
+            print("[%3d] %s" % (i, self.instructions[i].toString()))
+
+        print("\n==== [[" + str(self.name) + "'s protos]] ====\n")
+        for z in self.protos:
+            z.print()
+
+instr_lookup_tbl = [
+    Instruction(InstructionType.ABC, "MOVE"), Instruction(InstructionType.ABx, "LOADK"), Instruction(InstructionType.ABC, "LOADBOOL"),
+    Instruction(InstructionType.ABC, "LOADNIL"), Instruction(InstructionType.ABC, "GETUPVAL"), Instruction(InstructionType.ABx, "GETGLOBAL"),
+    Instruction(InstructionType.ABC, "GETTABLE"), Instruction(InstructionType.ABx, "SETGLOBAL"), Instruction(InstructionType.ABC, "SETUPVAL"),
+    Instruction(InstructionType.ABC, "SETTABLE"), Instruction(InstructionType.ABC, "NEWTABLE"), Instruction(InstructionType.ABC, "SELF"),
+    Instruction(InstructionType.ABC, "ADD"), Instruction(InstructionType.ABC, "SUB"), Instruction(InstructionType.ABC, "MUL"),
+    Instruction(InstructionType.ABC, "DIV"), Instruction(InstructionType.ABC, "MOD"), Instruction(InstructionType.ABC, "POW"),
+    Instruction(InstructionType.ABC, "UNM"), Instruction(InstructionType.ABC, "NOT"), Instruction(InstructionType.ABC, "LEN"),
+    Instruction(InstructionType.ABC, "CONCAT"), Instruction(InstructionType.AsBx, "JMP"), Instruction(InstructionType.ABC, "EQ"),
+    Instruction(InstructionType.ABC, "LT"), Instruction(InstructionType.ABC, "LE"), Instruction(InstructionType.ABC, "TEST"),
+    Instruction(InstructionType.ABC, "TESTSET"), Instruction(InstructionType.ABC, "CALL"), Instruction(InstructionType.ABC, "TAILCALL"),
+    Instruction(InstructionType.ABC, "RETURN"), Instruction(InstructionType.AsBx, "FORLOOP"), Instruction(InstructionType.AsBx, "FORPREP"),
+    Instruction(InstructionType.ABC, "TFORLOOP"), Instruction(InstructionType.ABC, "SETLIST"), Instruction(InstructionType.ABC, "CLOSE"),
+    Instruction(InstructionType.ABx, "CLOSURE"), Instruction(InstructionType.ABC, "VARARG")
+]
+
+# at [p]osition, with [s]ize of bits
+def get_bits(num: int, p: int, s: int):
+    return (num>>p) & (~((~0)<<s))
+
+# returns num with the [s]ize bits at [p]osition replaced by data
+def set_bits(num, data, p, s) -> int:
+    return (num & (~((~((~0)<<s))<<p))) | ((data << p) & ((~((~0)<<s))<<p))
+
+def _decode_instr(data: int) -> Instruction:
+    opcode = get_bits(data, 0, 6)
+    template = instr_lookup_tbl[opcode]
+    instr = Instruction(template.type, template.name)
+
+    # i read the lopcodes.h file to get these bit position and sizes.
+    instr.opcode = opcode
+    instr.A = get_bits(data, 6, 8) # starts after POS_OP + SIZE_OP (6), with a size of 8
+
+    if instr.type == InstructionType.ABC:
+        instr.B = get_bits(data, 23, 9) # starts after POS_C + SIZE_C (23), with a size of 9
+        instr.C = get_bits(data, 14, 9) # starts after POS_A + SIZE_A (14), with a size of 9
+    elif instr.type == InstructionType.ABx:
+        instr.B = get_bits(data, 14, 18) # starts after POS_A + SIZE_A (14), with a size of 18
+    elif instr.type == InstructionType.AsBx:
+        instr.B = get_bits(data, 14, 18) - 131071 # Bx is now signed, so just sub half of the MAX_UINT for 18 bits
+
+    return instr
+
+# returns a u32 instruction
+def _encode_instr(instr: Instruction) -> int:
+    data = 0
+
+    # encode instruction (basically, do the inverse of _decode_instr)
+    data = set_bits(data, instr.opcode, 0, 6)
+    data = set_bits(data, instr.A, 6, 8)
+
+    if instr.type == InstructionType.ABC:
+        data = set_bits(data, instr.B, 23, 9)
+        data = set_bits(data, instr.C, 14, 9)
+    elif instr.type == InstructionType.ABx:
+        data = set_bits(data, instr.B, 14, 18)
+    elif instr.type == InstructionType.AsBx:
+        data = set_bits(data, instr.B + 131071, 14, 18)
+
+    return data
+
+class LuaUndump:
+    def __init__(self):
+        self.rootChunk: Chunk = None
+        self.index = 0
+
+    @staticmethod
+    def dis_chunk(chunk: Chunk):
+        chunk.print()
+
+    def loadBlock(self, sz) -> bytearray:
+        if self.index + sz > len(self.bytecode):
+            raise Exception("Malformed bytecode!")
+
+        temp = bytearray(self.bytecode[self.index:self.index+sz])
+        self.index = self.index + sz
+        return temp
+
+    def get_byte(self) -> int:
+        return self.loadBlock(1)[0]
+
+    def get_int32(self) -> int:
+        if (self.big_endian):
+            return int.from_bytes(self.loadBlock(4), byteorder='big', signed=False)
+        else:
+            return int.from_bytes(self.loadBlock(4), byteorder='little', signed=False)
+
+    def get_int(self) -> int:
+        if (self.big_endian):
+            return int.from_bytes(self.loadBlock(self.int_size), byteorder='big', signed=False)
+        else:
+            return int.from_bytes(self.loadBlock(self.int_size), byteorder='little', signed=False)
+
+    def get_size_t(self) -> int:
+        if (self.big_endian):
+            return int.from_bytes(self.loadBlock(self.size_t), byteorder='big', signed=False)
+        else:
+            return int.from_bytes(self.loadBlock(self.size_t), byteorder='little', signed=False)
+
+    def get_double(self) -> float:
+        if self.big_endian:
+            return struct.unpack('>d', self.loadBlock(8))[0]
+        else:
+            return struct.unpack('<d', self.loadBlock(8))[0]
+
+    def get_string(self, size) -> str:
+        if size is None:
+            size = self.get_size_t()
+            if (size == 0):
+                return ""
+
+        return "".join(chr(x) for x in self.loadBlock(size))
+
+    def decode_chunk(self) -> Chunk:
+        chunk = Chunk()
+
+        chunk.name = self.get_string(None)
+        chunk.frst_line = self.get_int()
+        chunk.last_line = self.get_int()
+
+        chunk.numUpvals = self.get_byte()
+        chunk.numParams = self.get_byte()
+        chunk.isVarg = (self.get_byte() != 0)
+        chunk.maxStack = self.get_byte()
+
+        if (not chunk.name == ""):
+            chunk.name = chunk.name[1:-1]
+
+        # parse instructions
+        num = self.get_int()
+        for i in range(num):
+            chunk.appendInstruction(_decode_instr(self.get_int32()))
+
+        # get constants
+        num = self.get_int()
+        for i in range(num):
+            constant: Constant = None
+            type = self.get_byte()
+
+            if type == 0: #nil
+                constant = Constant(ConstType.NIL, None)
+            elif type == 1: # bool
+                constant = Constant(ConstType.BOOL, (self.get_byte() != 0))
+            elif type == 3: # number
+                constant = Constant(ConstType.NUMBER, self.get_double())
+            elif type == 4: # string
+                constant = Constant(ConstType.STRING, self.get_string(None)[:-1])
+            else:
+                raise Exception("Unknown Datatype! [%d]" % type)
+
+            chunk.appendConstant(constant)
+
+        # parse protos
+        num = self.get_int()
+        for i in range(num):
+            chunk.appendProto(self.decode_chunk())
+
+        # debug stuff, maybe i'll add this to chunks to have better disassembly annotation in the future?
+        # eh, for now just consume the bytes.
+
+        # line numbers
+        num = self.get_int()
+        for i in range(num):
+            self.get_int()
+
+        # locals
+        num = self.get_int()
+        for i in range(num):
+            name = self.get_string(None) # local name
+            start = self.get_int() # local start PC
+            end = self.get_int() # local end PC
+            chunk.appendLocal(Local(name, start, end))
+
+        # upvalues
+        num = self.get_int()
+        for i in range(num):
+            self.get_string(None) # upvalue name
+
+        return chunk
+
+    def decode_rawbytecode(self, rawbytecode):
+        # bytecode sanity checks
+        if not rawbytecode[0:4] == b'\x1bLua':
+            raise Exception("Lua Bytecode expected!")
+
+        bytecode = array.array('b', rawbytecode)
+        return self.decode_bytecode(bytecode)
+
+    def decode_bytecode(self, bytecode):
+        self.bytecode = bytecode
+
+        # aligns index, skips header
+        self.index = 4
+
+        self.vm_version = self.get_byte()
+        self.bytecode_format = self.get_byte()
+        self.big_endian = (self.get_byte() == 0)
+        self.int_size = self.get_byte()
+        self.size_t = self.get_byte()
+        self.instr_size = self.get_byte() # gets size of instructions
+        self.l_number_size = self.get_byte() # size of lua_Number
+        self.integral_flag = self.get_byte()
+
+        self.rootChunk = self.decode_chunk()
+        return self.rootChunk
+
+    def loadFile(self, luaCFile):
+        with open(luaCFile, 'rb') as luac_file:
+            bytecode = luac_file.read()
+            return self.decode_rawbytecode(bytecode)
+
+    def print_dissassembly(self):
+        LuaUndump.dis_chunk(self.rootChunk)
+
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..b1044b1
--- /dev/null
+++ b/main.py
@@ -0,0 +1,13 @@
+import sys
+import lundump
+import lparser
+
+if len(sys.argv) < 2:
+    sys.exit("usage: %s <luac file>" % sys.argv[0])
+
+lc = lundump.LuaUndump()
+print(sys.argv[1])
+chunk = lc.loadFile(sys.argv[1])
+
+lc.print_dissassembly()
+lp = lparser.LuaDecomp(chunk)
\ No newline at end of file