# lldb/examples/python/formatter_bytecode.py - llvm-project - Git at Google

 """
 Specification, assembler, disassembler, and interpreter
 for LLDB dataformatter bytecode.

 See https://lldb.llvm.org/resources/formatterbytecode.html for more details.
 """

 from __future__ import annotations

 # Work around the fact that one of the local files is called
 # types.py, which breaks some versions of python.
 import os, sys

 path = os.path.abspath(os.path.dirname(__file__))
 if path in sys.path:
     sys.path.remove(path)

 import re
 import io
 import ast
 import enum
 import shlex
 import textwrap
 from copy import copy
 from dataclasses import dataclass
 from typing import Any, BinaryIO, Optional, Sequence, TextIO, Tuple, Union, cast

 # Version byte written at the start of every serialized formatter record.
 # The disassembler rejects records whose first byte differs from this value.
 BINARY_VERSION = 1

 # Types
 # Numeric tags, one per kind of value named in the bytecode specification.
 type_String = 1
 type_Int = 2
 type_UInt = 3
 type_Object = 4
 type_Type = 5

 # Opcodes
 # Two-way opcode table: mnemonic -> opcode number and opcode number -> mnemonic.
 opcode = {}


 def define_opcode(n, mnemonic, name):
     """Register opcode number `n` under `name` and, if given, `mnemonic`.

     Defines the module-level constant `op_<name>`. When `mnemonic` is
     truthy it becomes an assembler keyword mapping to `n`. The reverse
     entry `opcode[n]` is always recorded and is None for opcodes that have
     no mnemonic.
     """
     if mnemonic:
         opcode[mnemonic] = n
     opcode[n] = mnemonic
     globals()[f"op_{name}"] = n


 # Data stack manipulation.
 define_opcode(1, "dup", "dup")
 define_opcode(2, "drop", "drop")
 define_opcode(3, "pick", "pick")
 define_opcode(4, "over", "over")
 define_opcode(5, "swap", "swap")
 define_opcode(6, "rot", "rot")

 # Control flow. op_begin introduces a block and is followed by the block's
 # length; the assembler emits it when it sees the closing "}".
 define_opcode(0x10, "{", "begin")
 define_opcode(0x11, "if", "if")
 define_opcode(0x12, "ifelse", "ifelse")
 define_opcode(0x13, "return", "return")

 # Literals. These have no mnemonic: the assembler picks them from the shape
 # of the token (leading digit, leading "@", or leading double quote).
 define_opcode(0x20, None, "lit_uint")
 define_opcode(0x21, None, "lit_int")
 define_opcode(0x22, None, "lit_string")
 define_opcode(0x23, None, "lit_selector")

 # Conversions and null test.
 define_opcode(0x2A, "as_int", "as_int")
 define_opcode(0x2B, "as_uint", "as_uint")
 define_opcode(0x2C, "is_null", "is_null")

 # Arithmetic and shifts.
 define_opcode(0x30, "+", "plus")
 define_opcode(0x31, "-", "minus")
 define_opcode(0x32, "*", "mul")
 define_opcode(0x33, "/", "div")
 define_opcode(0x34, "%", "mod")
 define_opcode(0x35, "<<", "shl")
 define_opcode(0x36, ">>", "shr")

 # Logic operators.
 define_opcode(0x40, "&", "and")
 define_opcode(0x41, "|", "or")
 define_opcode(0x42, "^", "xor")
 define_opcode(0x43, "~", "not")

 # Comparisons. Note the asymmetric spelling: "=<" for less-or-equal but
 # ">=" for greater-or-equal.
 define_opcode(0x50, "=", "eq")
 define_opcode(0x51, "!=", "neq")
 define_opcode(0x52, "<", "lt")
 define_opcode(0x53, ">", "gt")
 define_opcode(0x54, "=<", "le")
 define_opcode(0x55, ">=", "ge")

 # Calls the selector found on top of the data stack.
 define_opcode(0x60, "call", "call")

 # Function signatures
 # Numeric ids of the function signatures, numbered consecutively from 0.
 (
     sig_summary,
     sig_init,
     sig_get_num_children,
     sig_get_child_index,
     sig_get_child_at_index,
     sig_get_value,
     sig_update,
 ) = range(7)

 # Signature name -> id, in id order.
 SIGNATURES = dict(
     summary=sig_summary,
     init=sig_init,
     get_num_children=sig_get_num_children,
     get_child_index=sig_get_child_index,
     get_child_at_index=sig_get_child_at_index,
     get_value=sig_get_value,
     update=sig_update,
 )

 # Reverse mapping (id -> signature name).
 SIGNATURE_IDS = {sig_id: sig_name for sig_name, sig_id in SIGNATURES.items()}
 # Regex alternation of all signature names.
 SIGNATURE_NAMES = "|".join(SIGNATURES)

 # Selectors
 # Two-way selector table: "@name" -> selector number and number -> "@name".
 selector = {}


 def define_selector(n, name):
     """Register selector number `n` under `name`.

     Defines the module-level constant `sel_<name>` and records both
     directions of the "@<name>" <-> `n` mapping in `selector`.
     """
     label = "@" + name
     globals()[f"sel_{name}"] = n
     selector[label] = n
     selector[n] = label


 # Summaries.
 define_selector(0, "summary")
 define_selector(1, "type_summary")

 # Child, type and value accessors. The Python interpreter in this file maps
 # most of these onto the SB API method of the corresponding name.
 define_selector(0x10, "get_num_children")
 define_selector(0x11, "get_child_at_index")
 define_selector(0x12, "get_child_with_name")
 define_selector(0x13, "get_child_index")
 define_selector(0x15, "get_type")
 define_selector(0x16, "get_template_argument_type")
 define_selector(0x17, "cast")
 define_selector(0x18, "get_synthetic_value")
 define_selector(0x19, "get_non_synthetic_value")
 define_selector(0x20, "get_value")
 define_selector(0x21, "get_value_as_unsigned")
 define_selector(0x22, "get_value_as_signed")
 define_selector(0x23, "get_value_as_address")

 # Memory reads. NOTE(review): interpret() below has no branch for these and
 # reports them as "not implemented".
 define_selector(0x40, "read_memory_byte")
 define_selector(0x41, "read_memory_uint32")
 define_selector(0x42, "read_memory_int32")
 define_selector(0x43, "read_memory_unsigned")
 define_selector(0x44, "read_memory_signed")
 define_selector(0x45, "read_memory_address")
 define_selector(0x46, "read_memory")

 # String helpers.
 define_selector(0x50, "fmt")
 define_selector(0x51, "sprintf")
 define_selector(0x52, "strlen")


 ################################################################################
 # Assembler.
 ################################################################################

 # Matches a whole token of the form "@<signature name>:", e.g. "@summary:".
 # Used with .match(), so the pattern is anchored at the token start as well.
 _SIGNATURE_LABEL = re.compile(f"@(?:{SIGNATURE_NAMES}):$")


 def _tokenize(assembler: str) -> list[str]:
     """Convert string of assembly into tokens."""
     # With one exception, tokens are sequences of non-space characters.
     # The one exception is string literals, which may have spaces.

     # To parse strings, which can contain escaped contents, use a "Friedl
     # unrolled loop". The high level of such a regex is:
     #     open normal* ( special normal* )* close
     # which for string literals is:
     string_literal = r'" [^"\\]* (?: \\. [^"\\]* )* "'

     return re.findall(rf"{string_literal} | \S+", assembler, re.VERBOSE)


 def _segment_by_signature(input: list[str]) -> list[Tuple[str, list[str]]]:
     """Segment the input tokens along signature labels."""
     segments = []

     # Loop state
     signature = None
     tokens = []

     for token in input:
         if _SIGNATURE_LABEL.match(token):
             if signature:
                 segments.append((signature, tokens))
             signature = token[1:-1]  # strip leading @, trailing :
             tokens = []
         else:
             tokens.append(token)

     if signature:
         segments.append((signature, tokens))

     return segments


 @dataclass
 class BytecodeSection:
     """Abstraction of the data serialized to __lldbformatters sections.

     A serialized record is a version byte, the size of the remaining payload,
     and the payload itself: the size-prefixed UTF-8 type name, a flags byte,
     and for every signature its id, its program size and its program.
     """

     # Name of the type the record is emitted for.
     type_name: str
     # Value of the flags byte stored after the type name.
     flags: int
     # (signature name, assembled bytecode) pairs; names are SIGNATURES keys.
     signatures: list[Tuple[str, bytes]]

     def validate(self):
         """Raise ValueError if the same signature name occurs twice."""
         seen = set()
         for sig, _ in self.signatures:
             if sig in seen:
                 raise ValueError(f"duplicate signature: {sig}")
             seen.add(sig)

     def _to_binary(self) -> bytes:
         """Serialize the payload, i.e. everything after the record size."""
         # The size prefix counts encoded bytes, not characters, so that
         # non-ASCII type names are framed correctly.
         encoded_name = self.type_name.encode("utf-8")

         payload = bytearray()
         payload.extend(_to_uleb(len(encoded_name)))
         payload.extend(encoded_name)
         payload.extend(_to_byte(self.flags))
         for sig, bc in self.signatures:
             payload.extend(_to_byte(SIGNATURES[sig]))
             payload.extend(_to_uleb(len(bc)))
             payload.extend(bc)

         return bytes(payload)

     def write_binary(self, output: BinaryIO) -> None:
         """Validate the section and write the complete binary record."""
         self.validate()

         payload = self._to_binary()
         output.write(_to_byte(BINARY_VERSION))
         output.write(_to_uleb(len(payload)))
         output.write(payload)

     def write_source(self, output: TextIO, language: str) -> None:
         """Write the record as source code in `language` ("c" or "swift").

         Any other language value writes nothing.
         """
         if language == "c":
             self.write_c(output)
         elif language == "swift":
             self.write_swift(output)

     class _CBuilder:
         """Helper class for emitting binary data as a C-string literal."""

         # (quoted C string literal, comment) pairs in emission order.
         entries: list[Tuple[str, str]]

         def __init__(self) -> None:
             self.entries = []

         def emit_byte(self, x: int, comment: str) -> None:
             self.emit_bytes(_to_byte(x), comment)

         def emit_uleb(self, x: int, comment: str) -> None:
             self.emit_bytes(_to_uleb(x), comment)

         def emit_bytes(self, x: bytes, comment: str) -> None:
             # Construct zero padded hex escapes with length two. Each entry
             # is its own literal, so an escape never absorbs hex digits from
             # the following entry.
             literal = "".join(f"\\x{b:02x}" for b in x)
             self.entries.append((f'"{literal}"', comment))

         def emit_string(self, string: str, comment: str) -> None:
             # Escape the two characters that would otherwise terminate or
             # corrupt the C string literal.
             escaped = string.replace("\\", "\\\\").replace('"', '\\"')
             self.entries.append((f'"{escaped}"', comment))

     class _SwiftBuilder:
         """Helper class for emitting binary data as a Swift tuple literal."""

         # (raw bytes, comment) pairs in emission order.
         entries: list[Tuple[bytes, str]]

         def __init__(self) -> None:
             self.entries = []

         def emit_byte(self, x: int, comment: str) -> None:
             self.emit_bytes(_to_byte(x), comment)

         def emit_uleb(self, x: int, comment: str) -> None:
             self.emit_bytes(_to_uleb(x), comment)

         def emit_bytes(self, x: bytes, comment: str) -> None:
             self.entries.append((x, comment))

         def emit_string(self, string: str, comment: str) -> None:
             self.emit_bytes(string.encode(), comment)

         @property
         def type_decl(self):
             """Swift tuple type with one UInt8 per emitted byte."""
             total_bytes = sum(len(bs) for bs, _ in self.entries)
             element_list = ", ".join(["UInt8"] * total_bytes)
             return f"({element_list})"

     def _build(self, builder) -> None:
         """Feed the record, field by field, to a _CBuilder or _SwiftBuilder."""
         size = len(self._to_binary())
         builder.emit_byte(BINARY_VERSION, "version")
         builder.emit_uleb(size, "remaining record size")
         # Size in encoded bytes, matching what _to_binary() serializes.
         builder.emit_uleb(len(self.type_name.encode("utf-8")), "type name size")
         builder.emit_string(self.type_name, "type name")
         builder.emit_byte(self.flags, "flags")
         for sig, bc in self.signatures:
             builder.emit_byte(SIGNATURES[sig], f"sig_{sig}")
             builder.emit_uleb(len(bc), "program size")
             builder.emit_bytes(bc, "program")

     @property
     def _var_name(self):
         """Identifier for the emitted variable, derived from the type name."""
         var_name = re.sub(r"\W", "_", self.type_name)
         return f"_{var_name}_formatter"

     def write_c(self, output: TextIO) -> None:
         """Validate the section and write it as a C array definition."""
         self.validate()

         builder = self._CBuilder()
         self._build(builder)

         print(
             textwrap.dedent(
                 """
                 #ifdef __APPLE__
                 #define FORMATTER_SECTION "__DATA_CONST,__lldbformatters"
                 #else
                 #define FORMATTER_SECTION ".lldbformatters"
                 #endif
                 """
             ),
             file=output,
         )
         print(
             "__attribute__((used, section(FORMATTER_SECTION)))",
             file=output,
         )
         print(f"unsigned char {self._var_name}[] =", file=output)
         indent = "    "
         for string, comment in builder.entries:
             print(f"{indent}// {comment}", file=output)
             print(f"{indent}{string}", file=output)
         print(";", file=output)

     def write_swift(self, output: TextIO) -> None:
         """Validate the section and write it as a Swift tuple constant."""
         self.validate()

         builder = self._SwiftBuilder()
         self._build(builder)

         print(
             textwrap.dedent(
                 """\
                 #if swift(>=6.3)
                 #if os(macOS) || os(iOS) || os(watchOS) || os(tvOS) || os(visionOS)
                 @section("__DATA_CONST,__lldbformatters")
                 #else
                 @section(".lldbformatters")
                 #endif
                 @used"""
             ),
             file=output,
         )
         print(
             f"let {self._var_name}: {builder.type_decl} = (",
             file=output,
         )
         indent = "    "
         for bs, comment in builder.entries:
             print(f"{indent}// {comment}", file=output)
             byte_list = ", ".join(f"0x{b:02x}" for b in bs)
             print(f"{indent}{byte_list},", file=output)
         print(")", file=output)
         print("#endif", file=output)  # swift(>=6.3)


 def assemble_file(type_name: str, input: TextIO) -> BytecodeSection:
     """Assemble a whole assembly file into a BytecodeSection for `type_name`.

     The text read from `input` is split at signature labels and each
     non-empty segment is assembled separately. Signatures without any tokens
     are left out. The resulting section has its flags set to 0.
     """
     segments = _segment_by_signature(_tokenize(input.read()))
     signatures = [
         (sig, assemble_tokens(tokens)) for sig, tokens in segments if tokens
     ]
     return BytecodeSection(type_name, flags=0, signatures=signatures)


 def assemble(assembly: str) -> bytes:
     """Assemble a string of assembly into bytecode."""
     tokens = _tokenize(assembly)
     return assemble_tokens(tokens)


 def assemble_tokens(tokens: list[str]) -> bytes:
     """Assemble a list of tokens into bytecode.

     `tokens` is only read; the caller's list is left untouched.

     Token forms: "{" / "}" delimit a block, a leading digit starts an
     integer literal (unsigned with a trailing "u"), a leading "@" is a
     selector, a leading double quote is a string literal, and anything else
     must be an opcode mnemonic.

     Raises KeyError for an unknown mnemonic or selector and AssertionError
     if a "{" is left unterminated.
     """
     # This is a stack of all in-flight/unterminated blocks.
     bytecode = [bytearray()]

     def emit(byte):
         bytecode[-1].append(byte)

     for tok in tokens:
         if tok == "":
             continue
         if tok == "{":
             bytecode.append(bytearray())
         elif tok == "}":
             block = bytecode.pop()
             emit(op_begin)
             emit(len(block))  # FIXME: uleb
             bytecode[-1].extend(block)
         elif tok[0].isdigit():
             if tok[-1] == "u":
                 emit(op_lit_uint)
                 emit(int(tok[:-1]))  # FIXME
             else:
                 emit(op_lit_int)
                 emit(int(tok))  # FIXME
         elif tok[0] == "@":
             emit(op_lit_selector)
             emit(selector[tok])
         elif tok[0] == '"':
             # Remove backslash escaping '"' and '\'.
             s = re.sub(r'\\(["\\])', r"\1", tok[1:-1]).encode()
             emit(op_lit_string)
             emit(len(s))
             bytecode[-1].extend(s)
         else:
             emit(opcode[tok])
     assert len(bytecode) == 1, "unterminated {"
     return bytes(bytecode[0])


 ################################################################################
 # Disassembler.
 ################################################################################


 def disassemble_file(input: BinaryIO, output: TextIO) -> None:
     """Disassemble one binary formatter record from `input` into `output`.

     Prints one "@<signature>: <assembly>" line per signature in the record.
     The type name and flags are read past but not printed. Raises ValueError
     if the record's version byte is not BINARY_VERSION.
     """
     stream = io.BytesIO(input.read())

     version = stream.read(1)[0]
     if version != BINARY_VERSION:
         raise ValueError(f"unknown binary version: {version}")

     # Cut the stream off at the end of this record so the signature loop
     # below stops there.
     record_size = _from_uleb(stream)
     record_end = stream.tell() + record_size
     stream.truncate(record_end)

     # Header fields; consumed only to advance the stream.
     name_size = _from_uleb(stream)
     _type_name = stream.read(name_size).decode()
     _flags = stream.read(1)[0]

     while sig_byte := stream.read(1):
         sig_name = SIGNATURE_IDS[sig_byte[0]]
         body_size = _from_uleb(stream)
         asm, _ = disassemble(stream.read(body_size))
         print(f"@{sig_name}: {asm}", file=output)


 def disassemble(bytecode: bytes) -> Tuple[str, list[int]]:
     """Disassemble bytecode into (assembly, token starts).

     The second element starts with 0 and then holds, for every byte
     consumed, the length the assembly text had at the moment that byte was
     fetched. If a block is still open when the input ends, "ERROR" is
     appended to the assembly.
     """
     asm = ""
     data = list(bytecode)
     cursor = 0  # index of the next unread byte
     open_blocks = []  # bytes left in each in-flight block, innermost last
     tokens = [0]

     def next_byte():
         """Fetch the next byte in the bytecode and keep track of all
         in-flight blocks"""
         nonlocal cursor
         open_blocks[:] = [remaining - 1 for remaining in open_blocks]
         tokens.append(len(asm))
         value = data[cursor]
         cursor += 1
         return value

     while cursor < len(data):
         op = next_byte()
         if op == op_begin:
             asm += "{"
             open_blocks.append(next_byte())
         elif op == op_lit_uint:
             operand = next_byte()  # FIXME uleb
             asm += f"{operand}u"
         elif op == op_lit_int:
             operand = next_byte()
             asm += str(operand)
         elif op == op_lit_selector:
             operand = next_byte()
             asm += selector[operand]
         elif op == op_lit_string:
             length = next_byte()
             pieces = ['"']
             for _ in range(length):
                 char = chr(next_byte())
                 # Re-escape the two characters the assembler unescapes.
                 if char == '"' or char == "\\":
                     pieces.append("\\")
                 pieces.append(char)
             pieces.append('"')
             asm += "".join(pieces)
         else:
             asm += opcode[op]

         # Close every block whose last byte has just been consumed.
         while open_blocks and open_blocks[-1] == 0:
             open_blocks.pop()
             asm += " }"

         if cursor < len(data):
             asm += " "

     if open_blocks:
         asm += "ERROR"
     return asm, tokens


 ################################################################################
 # Interpreter.
 ################################################################################


 def count_fmt_params(fmt: str) -> int:
     """Count the positional arguments a format string needs.

     Returns the number of arguments that must be passed to
     ``fmt.format(*args)``: one more than the highest explicit index
     (``"{1}"`` needs 2), or the number of auto-numbered ``{}`` fields.
     Literal text and named fields do not contribute.
     """
     from string import Formatter

     count = 0
     auto_numbered = 0
     for _, field_name, _, _ in Formatter().parse(fmt):
         if field_name is None:
             # Literal text without a replacement field.
             continue
         if field_name == "":
             auto_numbered += 1
             count = max(count, auto_numbered)
             continue
         # The field name may carry attribute or index lookups ("0.real",
         # "2[0]"); only the leading argument index matters here.
         index = re.match(r"[0-9]+", field_name)
         if index:
             count = max(count, int(index.group()) + 1)
     return count


 def interpret(bytecode: bytes, control: list, data: list, tracing: bool = False):
     """Interpret bytecode.

     `control` is the control stack (blocks are pushed onto it as
     (start, end) byte ranges) and `data` is the data stack; both are
     modified in place. With `tracing` set, the state is printed before each
     instruction executes.

     Returns the value on top of the data stack when execution ends or an
     explicit `return` is reached.
     """
     # Stack of (pc, end) pairs; the last entry is the block being executed.
     frame = []
     frame.append((0, len(bytecode)))

     def trace():
         """print a trace of the execution for debugging purposes"""

         def fmt(d):
             if isinstance(d, int):
                 return str(d)
             if isinstance(d, str):
                 return d
             return repr(type(d))

         pc, _ = frame[-1]
         asm, tokens = disassemble(bytecode)
         print(
             "=== frame = {1}, data = {2}, opcode = {0}".format(
                 opcode[b], frame, [fmt(d) for d in data]
             )
         )
         print(asm)
         print(" " * (tokens[pc]) + "^")

     def next_byte():
         """Fetch the next byte and update the PC"""
         pc, end = frame[-1]
         assert pc < len(bytecode)
         b = bytecode[pc]
         frame[-1] = pc + 1, end
         # At the end of a block? Then discard the byte just read, return to
         # the enclosing frame(s) and fetch from there instead.
         while pc >= end:
             frame.pop()
             if not frame:
                 return None
             pc, end = frame[-1]
             if pc >= end:
                 return None
             b = bytecode[pc]
             frame[-1] = pc + 1, end
         return b

     while frame[-1][0] < len(bytecode):
         b = next_byte()
         if b is None:
             break
         if tracing:
             trace()
         # Data stack manipulation.
         if b == op_dup:
             data.append(data[-1])
         elif b == op_drop:
             data.pop()
         elif b == op_pick:
             data.append(data[data.pop()])
         elif b == op_over:
             data.append(data[-2])
         elif b == op_swap:
             x = data.pop()
             y = data.pop()
             data.append(x)
             data.append(y)
         elif b == op_rot:
             z = data.pop()
             y = data.pop()
             x = data.pop()
             data.append(z)
             data.append(x)
             data.append(y)

         # Control stack manipulation.
         elif b == op_begin:
             # Record the block's byte range and skip over its body.
             length = next_byte()
             pc, end = frame[-1]
             control.append((pc, pc + length))
             frame[-1] = pc + length, end
         elif b == op_if:
             if data.pop():
                 frame.append(control.pop())
         elif b == op_ifelse:
             # The else block is on top of the control stack, the then block
             # right below it.
             if data.pop():
                 control.pop()
                 frame.append(control.pop())
             else:
                 frame.append(control.pop())
                 control.pop()
         elif b == op_return:
             control.clear()
             return data[-1]

         # Literals.
         elif b == op_lit_uint:
             b = next_byte()  # FIXME uleb
             data.append(int(b))
         elif b == op_lit_int:
             b = next_byte()  # FIXME uleb
             data.append(int(b))
         elif b == op_lit_selector:
             b = next_byte()
             data.append(b)
         elif b == op_lit_string:
             length = next_byte()
             s = ""
             while length:
                 s += chr(next_byte())
                 length -= 1
             data.append(s)

         elif b == op_as_uint:
             pass
         elif b == op_as_int:
             pass
         elif b == op_is_null:
             data.append(1 if data.pop() is None else 0)

         # Arithmetic, logic, etc.
         # For binary operators the right-hand operand is on top of the stack.
         elif b == op_plus:
             data.append(data.pop() + data.pop())
         elif b == op_minus:
             data.append(-data.pop() + data.pop())
         elif b == op_mul:
             data.append(data.pop() * data.pop())
         elif b == op_div:
             y = data.pop()
             # The bytecode only has integer values, so divide as integers
             # rather than producing a Python float.
             data.append(data.pop() // y)
         elif b == op_mod:
             y = data.pop()
             data.append(data.pop() % y)
         elif b == op_shl:
             y = data.pop()
             data.append(data.pop() << y)
         elif b == op_shr:
             y = data.pop()
             data.append(data.pop() >> y)
         elif b == op_and:
             data.append(data.pop() & data.pop())
         elif b == op_or:
             data.append(data.pop() | data.pop())
         elif b == op_xor:
             data.append(data.pop() ^ data.pop())
         elif b == op_not:
             # NOTE(review): this is a logical not although the mnemonic is
             # "~"; confirm against the specification.
             data.append(not data.pop())
         elif b == op_eq:
             data.append(data.pop() == data.pop())
         elif b == op_neq:
             data.append(data.pop() != data.pop())
         # The first pop() yields the right-hand operand, hence the mirrored
         # Python operators below.
         elif b == op_lt:
             data.append(data.pop() > data.pop())
         elif b == op_gt:
             data.append(data.pop() < data.pop())
         elif b == op_le:
             data.append(data.pop() >= data.pop())
         elif b == op_ge:
             data.append(data.pop() <= data.pop())

         # Function calls.
         elif b == op_call:
             sel = data.pop()
             if sel == sel_summary:
                 data.append(data.pop().GetSummary())
             elif sel == sel_get_num_children:
                 data.append(data.pop().GetNumChildren())
             elif sel == sel_get_child_at_index:
                 index = data.pop()
                 valobj = data.pop()
                 data.append(valobj.GetChildAtIndex(index))
             elif sel == sel_get_child_with_name:
                 name = data.pop()
                 valobj = data.pop()
                 data.append(valobj.GetChildMemberWithName(name))
             elif sel == sel_get_child_index:
                 name = data.pop()
                 valobj = data.pop()
                 data.append(valobj.GetIndexOfChildWithName(name))
             elif sel == sel_get_type:
                 data.append(data.pop().GetType())
             elif sel == sel_get_template_argument_type:
                 n = data.pop()
                 valobj = data.pop()
                 data.append(valobj.GetTemplateArgumentType(n))
             elif sel == sel_get_synthetic_value:
                 data.append(data.pop().GetSyntheticValue())
             elif sel == sel_get_non_synthetic_value:
                 data.append(data.pop().GetNonSyntheticValue())
             elif sel == sel_get_value:
                 data.append(data.pop().GetValue())
             elif sel == sel_get_value_as_unsigned:
                 data.append(data.pop().GetValueAsUnsigned())
             elif sel == sel_get_value_as_signed:
                 data.append(data.pop().GetValueAsSigned())
             elif sel == sel_get_value_as_address:
                 data.append(data.pop().GetValueAsAddress())
             elif sel == sel_cast:
                 sbtype = data.pop()
                 valobj = data.pop()
                 data.append(valobj.Cast(sbtype))
             elif sel == sel_strlen:
                 s = data.pop()
                 data.append(len(s) if s else 0)
             elif sel == sel_fmt:
                 fmt = data.pop()
                 n = count_fmt_params(fmt)
                 # NOTE(review): arguments are popped top-first, so {0} is
                 # the value nearest the top of the stack; confirm this is
                 # the intended order.
                 args = []
                 for i in range(n):
                     args.append(data.pop())
                 data.append(fmt.format(*args))
             else:
                 print("not implemented: " + selector[sel])
                 assert False
     return data[-1]


 ################################################################################
 # Python -> Bytecode Compiler
 ################################################################################

 # Maps the name of a method called in Python formatter source to the selector
 # the compiler emits for it. Calls to methods not listed here are rejected.
 _BUILTINS = {
     "Cast": "@cast",
     "GetChildAtIndex": "@get_child_at_index",
     "GetChildMemberWithName": "@get_child_with_name",
     "GetSummary": "@summary",
     "GetSyntheticValue": "@get_synthetic_value",
     "GetTemplateArgumentType": "@get_template_argument_type",
     "GetType": "@get_type",
     "GetValueAsUnsigned": "@get_value_as_unsigned",
 }

 _COMPS = {
     ast.Eq: "=",
     ast.NotEq: "!=",
     ast.Lt: "<",
     ast.LtE: "=<",
     ast.Gt: ">",
     ast.GtE: "=>",
 }

 # Maps Python method names in a formatter class to their bytecode signatures.
 # Methods of a formatter class that are not listed here are rejected by the
 # compiler as unsupported.
 _METHOD_SIGS = {
     "__init__": "@init",
     "update": "@update",
     "num_children": "@get_num_children",
     "get_child_index": "@get_child_index",
     "get_child_at_index": "@get_child_at_index",
     "get_value": "@get_value",
 }


 class CompilerError(Exception):
     """Compilation failure tied to a line of the Python source.

     `lineno` is copied from the AST node that triggered the error.
     """

     lineno: int

     def __init__(self, message, node: Union[ast.expr, ast.stmt]) -> None:
         Exception.__init__(self, message)
         self.lineno = node.lineno


 class Compiler(ast.NodeVisitor):
     """
     Compile Python LLDB data formatters to LLDB formatter bytecode.

     This compiler supports a limited subset of Python.

     # Supported Features

     * Top level functions implementing LLDB summary formatters
     * Top level classes implementing LLDB synthetic formatters
     * Partial support for the following, see below for more details:
       - Object attributes (properties)
       - Local variables
       - Function calls
     * Python language support
     [x] If statements (including else, elif and nested if)
     [x] Return statements
     [x] String, integer, float, boolean, and None literals
     [x] Binary comparisons
     [ ] Boolean operators
     [ ] Math operations

     # Unsupported Features

         Note that this list is not exhaustive; refer to the list of supported
         features above.

     * For and while loops
     * Exceptions
     * User defined general purpose functions and classes
     * Lists, dicts, sets, and other container data types
     * Iterators, comprehensions, yield, etc
     * With statements
     * Imports of any modules

     # Variables

     The compiler supports two kinds of variables, local variables and attribute
     variables (properties), but there are limitations on both.

     In __init__ and update, local variables are currently *not* supported, but
     attributes can be assigned to. This matches the common case for these
     functions.

     In all other function bodies, local variables _are_ supported, but
     attributes can only be read from, *not* assigned to. This also matches the
     common case for these functions.

     Variables (local and attributes) are tracked, allowing the compiler to know
     their position in the stack. Variable reads can then be lowered to `pick`
     instructions. See the compiler's `locals` and `attrs` attributes.

     # Functions

     Known functions are supported, a design that customizes the scope of what
     formatters can and can't do. The functions known to the compiler are called
     "selectors". The selectors are primarily SBValue API, although there are
     also general purpose selectors. Formatters can only call selectors, not user
     defined functions, and not SB methods that have not been defined as a
     selector.
     """

     # Names of locals in bottom-to-top stack order. locals[0] is the
     # oldest/deepest; locals[-1] is the most recently pushed.
     locals: list[str]

     # Names of visible attrs in bottom-to-top stack order. Always holds the
     # full combined frame for the method being compiled: grows incrementally
     # during __init__/update, and is set to the combined list before getter
     # methods are compiled.
     attrs: list[str]

     # Bytecode signature of the method being compiled, or None for top-level
     # functions.
     current_sig: Optional[str]

     buffer: io.StringIO

     def __init__(self) -> None:
         self.locals = []
         self.attrs = []
         self.current_sig = None
         self.buffer = io.StringIO()

     def compile(self, source_file: str) -> str:
         with open(source_file) as f:
             root = ast.parse(f.read())
         self.visit(root)
         return self.buffer.getvalue()

     def visit_ClassDef(self, node: ast.ClassDef) -> None:
         # Compile methods in a fixed order so that attrs is fully populated
         # before getter methods are compiled.
         methods = {}
         for item in node.body:
             if isinstance(item, ast.FunctionDef):
                 if item.name not in _METHOD_SIGS:
                     raise CompilerError(f"unsupported method: {item.name}", item)
                 methods[item.name] = item

         self.attrs = []
         if method := methods.get("__init__"):
             self._compile_method(method)
         # self.attrs now holds init's attrs. update's attrs are appended above
         # them, so after update self.attrs is the combined init+update list.
         if method := methods.get("update"):
             self._compile_method(method)

         for method_name, method in methods.items():
             if method_name not in ("__init__", "update"):
                 self._compile_method(method)

     def _compile_method(self, node: ast.FunctionDef) -> None:
         """Compile one formatter method and emit it under its signature.

         The output is the signature label followed by the compiled body.
         Locals and the current signature are reset when the method is done.
         """
         self.current_sig = _METHOD_SIGS[node.name]

         # The initial locals are the parameters without 'self' and, for
         # __init__, without the trailing 'internal_dict'.
         params = copy(node.args.args)
         del params[0]  # drop 'self'
         if node.name == "__init__":
             del params[-1]  # drop trailing 'internal_dict'
         self.locals = [param.arg for param in params]

         # The body is compiled into a scratch buffer so that the signature
         # line can be written to the real buffer ahead of it.
         body_buffer = io.StringIO()
         outer_buffer, self.buffer = self.buffer, body_buffer

         self._visit_each(node.body)

         self.buffer = outer_buffer
         self._output(f"{self.current_sig}:")
         self._output(body_buffer.getvalue())

         self.current_sig = None
         self.locals.clear()

     def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
         """Compile a top-level function (one that is not inside a class).

         All of the function's parameters become the initial locals; there is
         no signature and there are no attributes.
         """
         parameter_names = [parameter.arg for parameter in node.args.args]

         self.current_sig = None
         self.attrs = []
         self.locals = parameter_names

         self._visit_each(node.body)

         self.locals.clear()

     def visit_Compare(self, node: ast.Compare) -> None:
         """Compile a binary comparison: left operand, right operand, operator.

         Raises CompilerError for chained comparisons such as `0 < x < 10`
         (which would otherwise be silently truncated to the first
         comparison) and for operators that have no bytecode equivalent.
         """
         if len(node.ops) != 1:
             raise CompilerError("chained comparisons are not supported", node)

         op_type = type(node.ops[0])
         mnemonic = _COMPS.get(op_type)
         if mnemonic is None:
             raise CompilerError(
                 f"unsupported comparison operator: {op_type.__name__}", node
             )

         self.visit(node.left)
         self.visit(node.comparators[0])
         self._output(mnemonic)

     def visit_If(self, node: ast.If) -> None:
         """Compile an if statement.

         Emits the condition, then the body as a block followed by `if`, or,
         when there is an else branch, both branches as blocks followed by
         `ifelse`.
         """
         self.visit(node.test)

         self._output("{")
         self._visit_each(node.body)

         if not node.orelse:
             self._output("} if")
             return

         self._output("} {")
         self._visit_each(node.orelse)
         self._output("} ifelse")

     def visit_Return(self, node: ast.Return) -> None:
         """Compile a return statement: the value, if any, then `return`."""
         returned = node.value
         if returned is not None:
             self.visit(returned)
         self._output("return")

     def visit_Constant(self, node: ast.Constant) -> None:
         if isinstance(node.value, str):
             self._output(f'"{node.value}"')
         elif isinstance(node.value, bool):
             self._output(int(node.value))
         else:
             self._output(node.value)

     def visit_Call(self, node: ast.Call) -> None:
         """Compile a call to a known method into a selector call.

         Only `receiver.Method(args...)` calls whose method name is listed in
         _BUILTINS are supported: the receiver and the positional arguments
         are emitted, followed by the selector and `call`. Everything else
         raises CompilerError.
         """
         func = node.func

         if isinstance(func, ast.Name):
             raise CompilerError(f"unsupported function: {func.id}", node)
         if not isinstance(func, ast.Attribute):
             raise CompilerError("unsupported function call expression", node)

         receiver, method = func.value, func.attr

         # self is not a valid call receiver.
         if isinstance(receiver, ast.Name) and receiver.id == "self":
             raise CompilerError(
                 "self is not a valid call receiver; use self.attr to read an attribute",
                 node,
             )

         selector_name = _BUILTINS.get(method)
         if not selector_name:
             raise CompilerError(f"unsupported method: {method}", node)

         self.visit(receiver)
         self._visit_each(node.args)
         self._output(f"{selector_name} call")

     def visit_Assign(self, node: ast.Assign) -> None:
         """Compile an assignment to `self.attr`, a local, or a tuple of locals.

         Only the first assignment target is considered. Attribute assignment
         is allowed only in __init__ and update, and each attribute only once;
         local variable assignment is allowed only outside of them. Violations
         and unsupported targets raise CompilerError.
         """
         target = node.targets[0]
         in_setup_method = self.current_sig in ("@init", "@update")

         is_self_attribute = (
             isinstance(target, ast.Attribute)
             and isinstance(target.value, ast.Name)
             and target.value.id == "self"
         )

         # Attribute assignment: self.attr = expr
         if is_self_attribute:
             if not in_setup_method:
                 raise CompilerError(
                     "attribute assignment is only allowed in __init__ and update",
                     node,
                 )

             attr = target.attr
             if attr in self.attrs:
                 raise CompilerError(f"attribute '{attr}' is already assigned", node)

             # An argument (the only kind of local permitted in __init__) is
             # already on the stack in place, so it is not evaluated again.
             # Any other RHS is evaluated, leaving its value on the stack.
             rhs = node.value
             already_on_stack = (
                 isinstance(rhs, ast.Name) and self._local_index(rhs) is not None
             )
             if not already_on_stack:
                 self.visit(rhs)

             self.attrs.append(attr)
             return

         # Local variable assignment.
         if in_setup_method:
             raise CompilerError(
                 "local variable assignment is not allowed in __init__ or update; "
                 "use attribute assignment (self.attr = ...) instead",
                 node,
             )

         if isinstance(target, ast.Tuple):
             names = cast(list[ast.Name], target.elts)
         elif isinstance(target, ast.Name):
             names = [target]
         else:
             raise CompilerError("unsupported assignment target", node)

         # Evaluate the RHS, leaving its value on the stack.
         self.visit(node.value)

         # A rebound name loses its previous slot: the old value stays on the
         # stack, orphaned, and its slot is blanked so indices stay valid.
         for name in names:
             previous = self._local_index(name)
             if previous is not None:
                 self.locals[previous] = ""

         self.locals.extend(name.id for name in names)

     def visit_Attribute(self, node: ast.Attribute) -> None:
         """Compile a read of ``self.<attr>`` into a ``pick`` instruction.

         Raises:
             CompilerError: if the attribute is read from anything other than
                 the bare name ``self``.
         """
         owner = node.value
         reads_self = isinstance(owner, ast.Name) and owner.id == "self"
         if not reads_self:
             raise CompilerError(
                 "unsupported attribute access (only self.attr is supported)", node
             )
         # The attr's position in the attr frame is used directly as the
         # pick operand.
         slot = self._attr_index(node.attr, node)
         self._output(f"{slot}u pick")

     def visit_Name(self, node: ast.Name) -> None:
         """Compile a read of a local variable into a ``pick`` instruction.

         Raises:
             CompilerError: if the name is not a known local.
         """
         slot = self._local_index(node)
         if slot is not None:
             self._output(f"{slot}u pick")
             return
         raise CompilerError(f"unknown local variable: {node.id}", node)

     def _visit_each(self, nodes: Sequence[ast.AST]) -> None:
         for child in nodes:
             self.visit(child)

     def _attr_index(self, name: str, node: ast.expr) -> int:
         # self.attrs is always the full visible attr frame, so the index is
         # the direct pick offset with no further adjustment.
         try:
             return self.attrs.index(name)
         except ValueError:
             raise CompilerError(f"unknown attribute: {name}", node)

     def _local_index(self, name: ast.Name) -> Optional[int]:
         try:
             idx = self.locals.index(name.id)
             # Offset past all attrs.
             return len(self.attrs) + idx
         except ValueError:
             return None

     def _output(self, x: Any) -> None:
         print(x, file=self.buffer)


 ################################################################################
 # Helper functions.
 ################################################################################


 def _to_uleb(value: int) -> bytes:
     """Encode an integer to ULEB128 bytes."""
     if value < 0:
         raise ValueError(f"negative number cannot be encoded to ULEB128: {value}")

     result = bytearray()
     while True:
         byte = value & 0x7F
         value >>= 7
         if value != 0:
             byte |= 0x80
         result.append(byte)
         if value == 0:
             break

     return bytes(result)


 def _from_uleb(stream: BinaryIO) -> int:
     """Decode a ULEB128 integer by reading bytes from the stream."""
     result = 0
     shift = 0
     while True:
         byte = stream.read(1)[0]
         result |= (byte & 0x7F) << shift
         shift += 7
         if not (byte & 0x80):
             break

     return result


 def _to_byte(n: int) -> bytes:
     return n.to_bytes(1, "big")


 def _main():
     import argparse

     parser = argparse.ArgumentParser(
         description="""
     Assembler, disassembler, and interpreter for LLDB dataformatter bytecode.
     See https://lldb.llvm.org/resources/formatterbytecode.html for more details.
     """
     )
     parser.add_argument("input", help="input file")
     mode = parser.add_mutually_exclusive_group()
     mode.add_argument(
         "-c",
         "--compile",
         action="store_true",
         help="compile a Python LLDB data formatter into LLDB formatter bytecode",
     )
     mode.add_argument(
         "-a",
         "--assemble",
         action="store_true",
         help="assemble assembly into bytecode",
     )
     mode.add_argument(
         "-d",
         "--disassemble",
         action="store_true",
         help="disassemble bytecode",
     )
     parser.add_argument("-n", "--type-name", help="source type of formatter")
     parser.add_argument(
         "--skip-invocation-comment",
         action="store_true",
         help="do not print invocation comment in compiled output",
     )
     parser.add_argument(
         "-o",
         "--output",
         help="output file (required for --assemble)",
     )
     parser.add_argument(
         "-f",
         "--format",
         choices=("binary", "c", "swift"),
         default="binary",
         help="output file format",
     )
     parser.add_argument("-t", "--test", action="store_true", help="run unit tests")

     args = parser.parse_args()
     if args.compile:
         if not args.type_name:
             parser.error("--type-name is required with --compile")
         if not args.output:
             parser.error("--output is required with --compile")
         compiler = Compiler()
         try:
             assembly = compiler.compile(args.input)
         except CompilerError as e:
             print(f"{args.input}:{e.lineno}: {e}", file=sys.stderr)
             return

         section = assemble_file(args.type_name, io.StringIO(assembly))
         if args.format == "binary":
             with open(args.output, "wb") as output:
                 section.write_binary(output)
         else:
             with open(args.output, "w") as output:
                 if not args.skip_invocation_comment:
                     print("// Generated with:", file=output)
                     print("//  ", shlex.join(sys.argv), file=output)
                 section.write_source(output, language=args.format)
     elif args.assemble:
         if not args.type_name:
             parser.error("--type-name is required with --assemble")
         if not args.output:
             parser.error("--output is required with --assemble")
         with open(args.input) as input:
             section = assemble_file(args.type_name, input)
         if args.format == "binary":
             with open(args.output, "wb") as output:
                 section.write_binary(output)
         else:
             with open(args.output, "w") as output:
                 section.write_source(output, language=args.format)
     elif args.disassemble:
         if args.output:
             with (
                 open(args.input, "rb") as input,
                 open(args.output, "w") as output,
             ):
                 disassemble_file(input, output)
         else:
             with open(args.input, "rb") as input:
                 disassemble_file(input, sys.stdout)


 if __name__ == "__main__":
     # Unless -t/--test was passed, behave as the command-line tool. The
     # sys.exit() call keeps the test definitions below from running.
     if not ("-t" in sys.argv or "--test" in sys.argv):
         _main()
         sys.exit()

     ############################################################################
     # Tests.
     ############################################################################
     import unittest

     # Tests for assemble/disassemble/interpret, assemble_file/disassemble_file,
     # and BytecodeSection.write_source.
     class TestAssembler(unittest.TestCase):

         def test_assemble(self):
             # Exact encodings: compare the assembled bytes, as hex, against
             # known values.
             self.assertEqual(assemble("1u dup").hex(), "200101")
             self.assertEqual(assemble('"1u dup"').hex(), "2206317520647570")
             self.assertEqual(assemble("16 < { dup } if").hex(), "21105210010111")
             self.assertEqual(assemble('{ { " } " } }').hex(), "100710052203207d20")

             # The first element of disassemble()'s result must reproduce the
             # assembly text that was assembled.
             def roundtrip(asm):
                 self.assertEqual(disassemble(assemble(asm))[0], asm)

             roundtrip("1u dup")
             roundtrip("16 < { dup } if")
             roundtrip('{ { " } " } }')

             # String specific checks.
             roundtrip('1u "2u 3u"')
             roundtrip('"a  b"')
             roundtrip('"a \\" b"')

             # Interpret assembled programs (with two empty lists as the
             # remaining arguments) and check the returned value.
             self.assertEqual(interpret(assemble("1 1 +"), [], []), 2)
             self.assertEqual(interpret(assemble("2 1 1 + *"), [], []), 4)
             self.assertEqual(
                 interpret(assemble('2 1 > { "yes" } { "no" } ifelse'), [], []), "yes"
             )

         def test_assemble_file(self):
             # Assemble text into an in-memory binary, rewound for reading.
             def run_assemble(type_name, asm):
                 out = io.BytesIO()
                 section = assemble_file(type_name, io.StringIO(asm))
                 section.write_binary(out)
                 out.seek(0)
                 return out

             # Disassemble a binary stream into in-memory text, rewound for
             # reading.
             def run_disassemble(binary):
                 out = io.StringIO()
                 disassemble_file(binary, out)
                 out.seek(0)
                 return out

             # assemble -> disassemble -> assemble round-trip: binary is identical.
             asm = "@summary: dup @get_value_as_unsigned call return\n@get_num_children: drop 5u return"
             binary1 = run_assemble("MyType", asm)
             dis = run_disassemble(binary1)
             binary2 = run_assemble("MyType", dis.read())
             self.assertEqual(binary1.getvalue(), binary2.getvalue())

             # disassemble -> assemble -> disassemble round-trip: text is identical.
             dis2 = run_disassemble(binary2)
             self.assertEqual(dis.getvalue(), dis2.getvalue())

             # disassemble output contains expected signatures.
             self.assertIn("@summary:", dis.getvalue())
             self.assertIn("@get_num_children:", dis.getvalue())

             # Duplicate signature is an error.
             with self.assertRaises(ValueError):
                 run_assemble("MyType", "@summary: 1u return\n@summary: 2u return")

         def test_write_source(self):
             # Use the Account example from main.cpp as a reference, whose
             # exact byte values are known.
             section = BytecodeSection(
                 type_name="Account",
                 flags=0,
                 signatures=[
                     ("get_num_children", bytes([0x20, 0x01])),
                     ("get_child_at_index", bytes([0x02, 0x20, 0x00, 0x23, 0x11, 0x60])),
                 ],
             )
             out = io.StringIO()
             section.write_source(out, language="c")
             src = out.getvalue()

             # Each assertion looks for one expected fragment of the generated
             # C source.
             self.assertIn("__attribute__((used, section(FORMATTER_SECTION)))", src)
             self.assertIn("unsigned char _Account_formatter[] =", src)
             self.assertIn('"\\x01"', src)  # version
             self.assertIn('"\\x15"', src)  # record size (21)
             self.assertIn('"\\x07"', src)  # type name size (7)
             self.assertIn('"Account"', src)  # type name
             self.assertIn('"\\x00"', src)  # flags
             self.assertIn('"\\x02"', src)  # sig_get_num_children
             self.assertIn('"\\x20\\x01"', src)  # program
             self.assertIn('"\\x04"', src)  # sig_get_child_at_index
             self.assertIn('"\\x06"', src)  # program size
             self.assertIn('"\\x02\\x20\\x00\\x23\\x11\\x60"', src)  # program
             self.assertIn("// version", src)
             self.assertIn("// type name", src)
             self.assertIn("// program", src)
             # Semicolon terminates the array initializer.
             self.assertEqual(src.count(";"), 1)

             # Non-identifier characters in the type name are replaced with '_'.
             out2 = io.StringIO()
             BytecodeSection("std::vector<int>", 0, []).write_source(out2, language="c")
             self.assertIn("_std__vector_int__formatter[] =", out2.getvalue())

     # Give unittest only the program name as argv, so the script's own
     # command-line arguments (such as -t) are not handed to unittest.
     unittest.main(argv=[__file__])