| """ |
| Specification, compiler, disassembler, and interpreter |
| for LLDB dataformatter bytecode. |
| |
| See https://lldb.llvm.org/resources/formatterbytecode.html for more details. |
| """ |
| |
| from __future__ import annotations |
| |
| # Types |
| type_String = 1 |
| type_Int = 2 |
| type_UInt = 3 |
| type_Object = 4 |
| type_Type = 5 |
| |
| # Opcodes |
| opcode = dict() |
| |
| |
| def define_opcode(n, mnemonic, name): |
| globals()["op_" + name] = n |
| if mnemonic: |
| opcode[mnemonic] = n |
| opcode[n] = mnemonic |
| |
| |
| define_opcode(1, "dup", "dup") |
| define_opcode(2, "drop", "drop") |
| define_opcode(3, "pick", "pick") |
| define_opcode(4, "over", "over") |
| define_opcode(5, "swap", "swap") |
| define_opcode(6, "rot", "rot") |
| |
| define_opcode(0x10, "{", "begin") |
| define_opcode(0x11, "if", "if") |
| define_opcode(0x12, "ifelse", "ifelse") |
| define_opcode(0x13, "return", "return") |
| |
| define_opcode(0x20, None, "lit_uint") |
| define_opcode(0x21, None, "lit_int") |
| define_opcode(0x22, None, "lit_string") |
| define_opcode(0x23, None, "lit_selector") |
| |
| define_opcode(0x2A, "as_int", "as_int") |
| define_opcode(0x2B, "as_uint", "as_uint") |
| define_opcode(0x2C, "is_null", "is_null") |
| |
| define_opcode(0x30, "+", "plus") |
| define_opcode(0x31, "-", "minus") |
| define_opcode(0x32, "*", "mul") |
| define_opcode(0x33, "/", "div") |
| define_opcode(0x34, "%", "mod") |
| define_opcode(0x35, "<<", "shl") |
| define_opcode(0x36, ">>", "shr") |
| |
| define_opcode(0x40, "&", "and") |
| define_opcode(0x41, "|", "or") |
| define_opcode(0x42, "^", "xor") |
| define_opcode(0x43, "~", "not") |
| |
| define_opcode(0x50, "=", "eq") |
| define_opcode(0x51, "!=", "neq") |
| define_opcode(0x52, "<", "lt") |
| define_opcode(0x53, ">", "gt") |
| define_opcode(0x54, "=<", "le") |
| define_opcode(0x55, ">=", "ge") |
| |
| define_opcode(0x60, "call", "call") |
| |
| # Function signatures |
| sig_summary = 0 |
| sig_init = 1 |
| sig_get_num_children = 2 |
| sig_get_child_index = 3 |
| sig_get_child_at_index = 4 |
| |
| # Selectors |
| selector = dict() |
| |
| |
| def define_selector(n, name): |
| globals()["sel_" + name] = n |
| selector["@" + name] = n |
| selector[n] = "@" + name |
| |
| |
| define_selector(0, "summary") |
| define_selector(1, "type_summary") |
| |
| define_selector(0x10, "get_num_children") |
| define_selector(0x11, "get_child_at_index") |
| define_selector(0x12, "get_child_with_name") |
| define_selector(0x13, "get_child_index") |
| define_selector(0x15, "get_type") |
| define_selector(0x16, "get_template_argument_type") |
| define_selector(0x17, "cast") |
| define_selector(0x20, "get_value") |
| define_selector(0x21, "get_value_as_unsigned") |
| define_selector(0x22, "get_value_as_signed") |
| define_selector(0x23, "get_value_as_address") |
| |
| define_selector(0x40, "read_memory_byte") |
| define_selector(0x41, "read_memory_uint32") |
| define_selector(0x42, "read_memory_int32") |
| define_selector(0x43, "read_memory_unsigned") |
| define_selector(0x44, "read_memory_signed") |
| define_selector(0x45, "read_memory_address") |
| define_selector(0x46, "read_memory") |
| |
| define_selector(0x50, "fmt") |
| define_selector(0x51, "sprintf") |
| define_selector(0x52, "strlen") |
| |
| |
| ################################################################################ |
| # Compiler. |
| ################################################################################ |
| |
| |
| def compile(assembler: str) -> bytearray: |
| """Compile assembler into bytecode""" |
| # This is a stack of all in-flight/unterminated blocks. |
| bytecode = [bytearray()] |
| |
| def emit(byte): |
| bytecode[-1].append(byte) |
| |
| tokens = list(assembler.split(" ")) |
| tokens.reverse() |
| while tokens: |
| tok = tokens.pop() |
| if tok == "": |
| pass |
| elif tok == "{": |
| bytecode.append(bytearray()) |
| elif tok == "}": |
| block = bytecode.pop() |
| emit(op_begin) |
| emit(len(block)) # FIXME: uleb |
| bytecode[-1].extend(block) |
| elif tok[0].isdigit(): |
| if tok[-1] == "u": |
| emit(op_lit_uint) |
| emit(int(tok[:-1])) # FIXME |
| else: |
| emit(op_lit_int) |
| emit(int(tok)) # FIXME |
| elif tok[0] == "@": |
| emit(op_lit_selector) |
| emit(selector[tok]) |
| elif tok[0] == '"': |
| s = bytearray() |
| done = False |
| chrs = tok[1:] |
| while not done: |
| quoted = False |
| for c in chrs: |
| if quoted: |
| s.append(ord(c)) # FIXME |
| quoted = False |
| elif c == "\\": |
| quoted = True |
| elif c == '"': |
| done = True |
| break |
| # FIXME assert this is last in token |
| else: |
| s.append(ord(c)) |
| if not done: |
| s.append(ord(" ")) |
| chrs = tokens.pop() |
| |
| emit(op_lit_string) |
| emit(len(s)) |
| bytecode[-1].extend(s) |
| else: |
| emit(opcode[tok]) |
| assert len(bytecode) == 1 # unterminated { |
| return bytecode[0] |
| |
| |
| ################################################################################ |
| # Disassembler. |
| ################################################################################ |
| |
| |
| def disassemble(bytecode: bytearray) -> (str, int): |
| """Disassemble bytecode into (assembler, token starts)""" |
| asm = "" |
| all_bytes = list(bytecode) |
| all_bytes.reverse() |
| blocks = [] |
| tokens = [0] |
| |
| def next_byte(): |
| """Fetch the next byte in the bytecode and keep track of all |
| in-flight blocks""" |
| for i in range(len(blocks)): |
| blocks[i] -= 1 |
| tokens.append(len(asm)) |
| return all_bytes.pop() |
| |
| while all_bytes: |
| b = next_byte() |
| if b == op_begin: |
| asm += "{" |
| length = next_byte() |
| blocks.append(length) |
| elif b == op_lit_uint: |
| b = next_byte() |
| asm += str(b) # FIXME uleb |
| asm += "u" |
| elif b == op_lit_int: |
| b = next_byte() |
| asm += str(b) |
| elif b == op_lit_selector: |
| b = next_byte() |
| asm += selector[b] |
| elif b == op_lit_string: |
| length = next_byte() |
| s = "'" |
| while length: |
| s += chr(next_byte()) |
| length -= 1 |
| asm += '"' + repr(s)[2:] |
| else: |
| asm += opcode[b] |
| |
| while blocks and blocks[-1] == 0: |
| asm += " }" |
| blocks.pop() |
| |
| if all_bytes: |
| asm += " " |
| |
| if blocks: |
| asm += "ERROR" |
| return asm, tokens |
| |
| |
| ################################################################################ |
| # Interpreter. |
| ################################################################################ |
| |
| |
| def count_fmt_params(fmt: str) -> int: |
| """Count the number of parameters in a format string""" |
| from string import Formatter |
| |
| f = Formatter() |
| n = 0 |
| for _, name, _, _ in f.parse(fmt): |
| if name > n: |
| n = name |
| return n |
| |
| |
| def interpret(bytecode: bytearray, control: list, data: list, tracing: bool = False): |
| """Interpret bytecode""" |
| frame = [] |
| frame.append((0, len(bytecode))) |
| |
| def trace(): |
| """print a trace of the execution for debugging purposes""" |
| |
| def fmt(d): |
| if isinstance(d, int): |
| return str(d) |
| if isinstance(d, str): |
| return d |
| return repr(type(d)) |
| |
| pc, end = frame[-1] |
| asm, tokens = disassemble(bytecode) |
| print( |
| "=== frame = {1}, data = {2}, opcode = {0}".format( |
| opcode[b], frame, [fmt(d) for d in data] |
| ) |
| ) |
| print(asm) |
| print(" " * (tokens[pc]) + "^") |
| |
| def next_byte(): |
| """Fetch the next byte and update the PC""" |
| pc, end = frame[-1] |
| assert pc < len(bytecode) |
| b = bytecode[pc] |
| frame[-1] = pc + 1, end |
| # At the end of a block? |
| while pc >= end: |
| frame.pop() |
| if not frame: |
| return None |
| pc, end = frame[-1] |
| if pc >= end: |
| return None |
| b = bytecode[pc] |
| frame[-1] = pc + 1, end |
| return b |
| |
| while frame[-1][0] < len(bytecode): |
| b = next_byte() |
| if b == None: |
| break |
| if tracing: |
| trace() |
| # Data stack manipulation. |
| if b == op_dup: |
| data.append(data[-1]) |
| elif b == op_drop: |
| data.pop() |
| elif b == op_pick: |
| data.append(data[data.pop()]) |
| elif b == op_over: |
| data.append(data[-2]) |
| elif b == op_swap: |
| x = data.pop() |
| y = data.pop() |
| data.append(x) |
| data.append(y) |
| elif b == op_rot: |
| z = data.pop() |
| y = data.pop() |
| x = data.pop() |
| data.append(z) |
| data.append(x) |
| data.append(y) |
| |
| # Control stack manipulation. |
| elif b == op_begin: |
| length = next_byte() |
| pc, end = frame[-1] |
| control.append((pc, pc + length)) |
| frame[-1] = pc + length, end |
| elif b == op_if: |
| if data.pop(): |
| frame.append(control.pop()) |
| elif b == op_ifelse: |
| if data.pop(): |
| control.pop() |
| frame.append(control.pop()) |
| else: |
| frame.append(control.pop()) |
| control.pop() |
| elif b == op_return: |
| control.clear() |
| return data[-1] |
| |
| # Literals. |
| elif b == op_lit_uint: |
| b = next_byte() # FIXME uleb |
| data.append(int(b)) |
| elif b == op_lit_int: |
| b = next_byte() # FIXME uleb |
| data.append(int(b)) |
| elif b == op_lit_selector: |
| b = next_byte() |
| data.append(b) |
| elif b == op_lit_string: |
| length = next_byte() |
| s = "" |
| while length: |
| s += chr(next_byte()) |
| length -= 1 |
| data.append(s) |
| |
| elif b == op_as_uint: |
| pass |
| elif b == op_as_int: |
| pass |
| elif b == op_is_null: |
| data.append(1 if data.pop() == None else 0) |
| |
| # Arithmetic, logic, etc. |
| elif b == op_plus: |
| data.append(data.pop() + data.pop()) |
| elif b == op_minus: |
| data.append(-data.pop() + data.pop()) |
| elif b == op_mul: |
| data.append(data.pop() * data.pop()) |
| elif b == op_div: |
| y = data.pop() |
| data.append(data.pop() / y) |
| elif b == op_mod: |
| y = data.pop() |
| data.append(data.pop() % y) |
| elif b == op_shl: |
| y = data.pop() |
| data.append(data.pop() << y) |
| elif b == op_shr: |
| y = data.pop() |
| data.append(data.pop() >> y) |
| elif b == op_and: |
| data.append(data.pop() & data.pop()) |
| elif b == op_or: |
| data.append(data.pop() | data.pop()) |
| elif b == op_xor: |
| data.append(data.pop() ^ data.pop()) |
| elif b == op_not: |
| data.append(not data.pop()) |
| elif b == op_eq: |
| data.append(data.pop() == data.pop()) |
| elif b == op_neq: |
| data.append(data.pop() != data.pop()) |
| elif b == op_lt: |
| data.append(data.pop() > data.pop()) |
| elif b == op_gt: |
| data.append(data.pop() < data.pop()) |
| elif b == op_le: |
| data.append(data.pop() >= data.pop()) |
| elif b == op_ge: |
| data.append(data.pop() <= data.pop()) |
| |
| # Function calls. |
| elif b == op_call: |
| sel = data.pop() |
| if sel == sel_summary: |
| data.append(data.pop().GetSummary()) |
| elif sel == sel_get_num_children: |
| data.append(data.pop().GetNumChildren()) |
| elif sel == sel_get_child_at_index: |
| index = data.pop() |
| valobj = data.pop() |
| data.append(valobj.GetChildAtIndex(index)) |
| elif sel == sel_get_child_with_name: |
| name = data.pop() |
| valobj = data.pop() |
| data.append(valobj.GetChildMemberWithName(name)) |
| elif sel == sel_get_child_index: |
| name = data.pop() |
| valobj = data.pop() |
| data.append(valobj.GetIndexOfChildWithName(name)) |
| elif sel == sel_get_type: |
| data.append(data.pop().GetType()) |
| elif sel == sel_get_template_argument_type: |
| n = data.pop() |
| valobj = data.pop() |
| data.append(valobj.GetTemplateArgumentType(n)) |
| elif sel == sel_get_value: |
| data.append(data.pop().GetValue()) |
| elif sel == sel_get_value_as_unsigned: |
| data.append(data.pop().GetValueAsUnsigned()) |
| elif sel == sel_get_value_as_signed: |
| data.append(data.pop().GetValueAsSigned()) |
| elif sel == sel_get_value_as_address: |
| data.append(data.pop().GetValueAsAddress()) |
| elif sel == sel_cast: |
| sbtype = data.pop() |
| valobj = data.pop() |
| data.append(valobj.Cast(sbtype)) |
| elif sel == sel_strlen: |
| s = data.pop() |
| data.append(len(s) if s else 0) |
| elif sel == sel_fmt: |
| fmt = data.pop() |
| n = count_fmt_params(fmt) |
| args = [] |
| for i in range(n): |
| args.append(data.pop()) |
| data.append(fmt.format(*args)) |
| else: |
| print("not implemented: " + selector[sel]) |
| assert False |
| pass |
| return data[-1] |
| |
| |
| if __name__ == "__main__": |
| # Work around the fact that one of the local files is called |
| # types.py, which breaks some versions of python. |
| import os, sys |
| |
| path = os.path.abspath(os.path.dirname(__file__)) |
| sys.path.remove(path) |
| import argparse |
| |
| parser = argparse.ArgumentParser( |
| description=""" |
| Compiler, disassembler, and interpreter for LLDB dataformatter bytecode. |
| See https://lldb.llvm.org/resources/formatterbytecode.html for more details. |
| """ |
| ) |
| parser.add_argument( |
| "-c", "--compile", type=str, help="compile assembler into bytecode" |
| ) |
| parser.add_argument("-d", "--disassemble", type=str, help="disassemble bytecode") |
| parser.add_argument("-t", "--test", action="store_true", help="run unit tests") |
| args = parser.parse_args() |
| if args.compile: |
| print(compile(str(args.compile)).hex()) |
| |
| if args.disassemble: |
| print(disassemble(bytearray.fromhex(str(args.disassemble)))) |
| |
| ############################################################################ |
| # Tests. |
| ############################################################################ |
| if args.test: |
| import unittest |
| |
| class TestCompiler(unittest.TestCase): |
| def test(self): |
| self.assertEqual(compile("1u dup").hex(), "200101") |
| self.assertEqual(compile('"1u dup"').hex(), "2206317520647570") |
| self.assertEqual(compile("16 < { dup } if").hex(), "21105210010111") |
| self.assertEqual(compile('{ { " } " } }').hex(), "100710052203207d20") |
| |
| def roundtrip(asm): |
| self.assertEqual(disassemble(compile(asm))[0], asm) |
| |
| roundtrip("1u dup") |
| roundtrip('1u dup "1u dup"') |
| roundtrip("16 < { dup } if") |
| roundtrip('{ { " } " } }') |
| |
| self.assertEqual(interpret(compile("1 1 +"), [], []), 2) |
| self.assertEqual(interpret(compile("2 1 1 + *"), [], []), 4) |
| self.assertEqual( |
| interpret(compile('2 1 > { "yes" } { "no" } ifelse'), [], []), "yes" |
| ) |
| |
| import sys |
| |
| sys.argv.pop() |
| path = os.path.dirname(__file__) |
| sys.path.remove |
| unittest.main() |