[lldb] Correctly parse Wasm segments (#154727)
My original implementation for parsing Wasm segments was wrong in two
related ways. I had a bug in calculating the file vm address and I
didn't fully understand the difference between active and passive
segments and how that impacted their file vm address.
With this PR, we now support parsing init expressions for active
segments, rather than just skipping over them. This is necessary to
determine where they get loaded.
Similar to llvm-objdump, we currently only support simple opcodes (i.e.
constants). We also currently do not support active segments that use a
non-zero memory index. However, this covers all segments for a
non-trivial Swift binary compiled to Wasm.
diff --git a/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.cpp b/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.cpp
index 777b20e..492b441 100644
--- a/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.cpp
+++ b/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.cpp
@@ -71,6 +71,47 @@
return std::string(toStringRef(llvm::ArrayRef(str_storage)));
}
+/// An "init expr" refers to a constant expression used to determine the initial
+/// value of certain elements within a module during instantiation. These
+/// expressions are restricted to operations that can be evaluated at module
+/// instantiation time. Currently we only support simple constant opcodes.
+///
+/// \param[in] data
+///     Extractor over the bytes that contain the init expression.
+/// \param[in,out] offset
+///     Position of the first opcode on entry. On return it has been advanced
+///     past the END opcode that terminates the expression, or to the point
+///     where the data ran out.
+///
+/// \return
+///     The operand read for i32.const, i64.const or global.get when the
+///     expression consists of exactly that one instruction followed by END;
+///     LLDB_INVALID_OFFSET for every other expression.
+static lldb::offset_t GetWasmOffsetFromInitExpr(DataExtractor &data,
+                                                lldb::offset_t &offset) {
+  lldb::offset_t init_expr_offset = LLDB_INVALID_OFFSET;
+
+  uint8_t opcode = data.GetU8(&offset);
+  switch (opcode) {
+  case llvm::wasm::WASM_OPCODE_END:
+    // Empty expression: the terminator is already consumed, so do not read
+    // any further or we would eat into the bytes that follow.
+    return LLDB_INVALID_OFFSET;
+  case llvm::wasm::WASM_OPCODE_I32_CONST:
+  case llvm::wasm::WASM_OPCODE_I64_CONST:
+    init_expr_offset = data.GetSLEB128(&offset);
+    break;
+  case llvm::wasm::WASM_OPCODE_GLOBAL_GET:
+    init_expr_offset = data.GetULEB128(&offset);
+    break;
+  case llvm::wasm::WASM_OPCODE_F32_CONST:
+    // Not a meaningful offset, but the 4-byte immediate must be consumed.
+    data.GetFloat(&offset);
+    break;
+  case llvm::wasm::WASM_OPCODE_F64_CONST:
+    // Not a meaningful offset. f64.const carries an 8-byte immediate, so it
+    // has to be read as a double to keep the parse position in sync.
+    data.GetDouble(&offset);
+    break;
+  case llvm::wasm::WASM_OPCODE_REF_NULL:
+    // Not a meaningful offset.
+    data.GetULEB128(&offset);
+    break;
+  }
+
+  // Make sure the opcodes we read aren't part of an extended init expr.
+  opcode = data.GetU8(&offset);
+  if (opcode == llvm::wasm::WASM_OPCODE_END)
+    return init_expr_offset;
+
+  // Extended init expressions are not supported, but we still have to parse
+  // them to skip over them and read the next segment. The scan is bounded by
+  // the size of the data so a truncated section cannot make it spin forever.
+  while (data.ValidOffset(offset)) {
+    if (data.GetU8(&offset) == llvm::wasm::WASM_OPCODE_END)
+      break;
+  }
+  return LLDB_INVALID_OFFSET;
+}
+
/// Checks whether the data buffer starts with a valid Wasm module header.
static bool ValidateModuleHeader(const DataBufferSP &data_sp) {
if (!data_sp || data_sp->GetByteSize() < kWasmHeaderSize)
@@ -261,17 +302,20 @@
return true;
}
-static llvm::Expected<std::vector<AddressRange>>
-ParseFunctions(SectionSP code_section_sp) {
- DataExtractor data;
- code_section_sp->GetSectionData(data);
+/// Location of one function body, as recorded by ParseFunctions.
+struct WasmFunction {
+  /// Offset into the code section data at which the function starts. The
+  /// ULEB that encodes the function size is not part of this range.
+  lldb::offset_t section_offset{LLDB_INVALID_OFFSET};
+  /// Size of the function in bytes.
+  uint32_t size{0};
+};
+
+static llvm::Expected<std::vector<WasmFunction>>
+ParseFunctions(DataExtractor &data) {
lldb::offset_t offset = 0;
llvm::Expected<uint32_t> function_count = GetULEB32(data, offset);
if (!function_count)
return function_count.takeError();
- std::vector<AddressRange> functions;
+ std::vector<WasmFunction> functions;
functions.reserve(*function_count);
for (uint32_t i = 0; i < *function_count; ++i) {
@@ -281,7 +325,7 @@
// llvm-objdump considers the ULEB with the function size to be part of the
// function. We can't do that here because that would break symbolic
// breakpoints, as that address is never executed.
- functions.emplace_back(code_section_sp, offset, *function_size);
+ functions.push_back({offset, *function_size});
std::optional<lldb::offset_t> next_offset =
llvm::checkedAddUnsigned<lldb::offset_t>(offset, *function_size);
@@ -294,17 +338,22 @@
}
struct WasmSegment {
- WasmSegment(SectionSP section_sp, lldb::offset_t offset, uint32_t size)
- : address_range(section_sp, offset, size) {};
+ enum SegmentType {
+ Active,
+ Passive,
+ };
+
std::string name;
- AddressRange address_range;
+ SegmentType type = Passive;
+ lldb::offset_t section_offset = LLDB_INVALID_OFFSET;
+ uint32_t size = 0;
+ uint32_t memory_index = 0;
+ lldb::offset_t init_expr_offset = 0;
+
+ lldb::offset_t GetFileOffset() const { return section_offset & 0xffffffff; }
};
-static llvm::Expected<std::vector<WasmSegment>>
-ParseData(SectionSP data_section_sp) {
- DataExtractor data;
- data_section_sp->GetSectionData(data);
-
+static llvm::Expected<std::vector<WasmSegment>> ParseData(DataExtractor &data) {
lldb::offset_t offset = 0;
llvm::Expected<uint32_t> segment_count = GetULEB32(data, offset);
@@ -319,27 +368,34 @@
if (!flags)
return flags.takeError();
+ WasmSegment segment;
+
// Data segments have a mode that identifies them as either passive or
// active. An active data segment copies its contents into a memory during
// instantiation, as specified by a memory index and a constant expression
// defining an offset into that memory.
+ segment.type = (*flags & llvm::wasm::WASM_DATA_SEGMENT_IS_PASSIVE)
+ ? WasmSegment::Passive
+ : WasmSegment::Active;
+
if (*flags & llvm::wasm::WASM_DATA_SEGMENT_HAS_MEMINDEX) {
+ assert(segment.type == WasmSegment::Active);
llvm::Expected<uint32_t> memidx = GetULEB32(data, offset);
if (!memidx)
return memidx.takeError();
+ segment.memory_index = *memidx;
}
- if ((*flags & llvm::wasm::WASM_DATA_SEGMENT_IS_PASSIVE) == 0) {
- // Skip over the constant expression.
- for (uint8_t b = 0; b != llvm::wasm::WASM_OPCODE_END;)
- b = data.GetU8(&offset);
- }
+ if (segment.type == WasmSegment::Active)
+ segment.init_expr_offset = GetWasmOffsetFromInitExpr(data, offset);
llvm::Expected<uint32_t> segment_size = GetULEB32(data, offset);
if (!segment_size)
return segment_size.takeError();
- segments.emplace_back(data_section_sp, offset, *segment_size);
+ segment.section_offset = offset;
+ segment.size = *segment_size;
+ segments.push_back(segment);
std::optional<lldb::offset_t> next_offset =
llvm::checkedAddUnsigned<lldb::offset_t>(offset, *segment_size);
@@ -352,13 +408,11 @@
}
static llvm::Expected<std::vector<Symbol>>
-ParseNames(SectionSP name_section_sp,
- const std::vector<AddressRange> &function_ranges,
+ParseNames(SectionSP code_section_sp, DataExtractor &name_data,
+ const std::vector<WasmFunction> &functions,
std::vector<WasmSegment> &segments) {
- DataExtractor name_section_data;
- name_section_sp->GetSectionData(name_section_data);
- llvm::DataExtractor data = name_section_data.GetAsLLVM();
+ llvm::DataExtractor data = name_data.GetAsLLVM();
llvm::DataExtractor::Cursor c(0);
std::vector<Symbol> symbols;
while (c && c.tell() < data.size()) {
@@ -380,12 +434,13 @@
llvm::Expected<std::string> name = GetWasmString(data, c);
if (!name)
return name.takeError();
- if (*idx >= function_ranges.size())
+ if (*idx >= functions.size())
continue;
symbols.emplace_back(
- symbols.size(), Mangled(*name), lldb::eSymbolTypeCode,
+ symbols.size(), *name, lldb::eSymbolTypeCode,
/*external=*/false, /*is_debug=*/false, /*is_trampoline=*/false,
- /*is_artificial=*/false, function_ranges[*idx],
+ /*is_artificial=*/false, code_section_sp,
+ functions[i].section_offset, functions[i].size,
/*size_is_valid=*/true, /*contains_linker_annotations=*/false,
/*flags=*/0);
}
@@ -405,12 +460,6 @@
continue;
// Update the segment name.
segments[i].name = *name;
- symbols.emplace_back(
- symbols.size(), Mangled(*name), lldb::eSymbolTypeData,
- /*external=*/false, /*is_debug=*/false, /*is_trampoline=*/false,
- /*is_artificial=*/false, segments[i].address_range,
- /*size_is_valid=*/true, /*contains_linker_annotations=*/false,
- /*flags=*/0);
}
} break;
@@ -432,80 +481,11 @@
}
void ObjectFileWasm::ParseSymtab(Symtab &symtab) {
- assert(m_sections_up && "sections must be parsed");
- Log *log = GetLog(LLDBLog::Object);
-
- // The name section contains names and indexes. First parse the data from the
- // relevant sections so we can access it by its index.
- std::vector<AddressRange> functions;
- std::vector<WasmSegment> segments;
-
- // Parse the code section.
- if (SectionSP code_section_sp =
- m_sections_up->FindSectionByType(lldb::eSectionTypeCode, false)) {
- llvm::Expected<std::vector<AddressRange>> maybe_functions =
- ParseFunctions(code_section_sp);
- if (!maybe_functions) {
- LLDB_LOG_ERROR(log, maybe_functions.takeError(),
- "Failed to parse Wasm code section: {0}");
- return;
- }
- functions = *maybe_functions;
- }
-
- // Parse the data section.
- SectionSP data_section_sp =
- m_sections_up->FindSectionByType(lldb::eSectionTypeData, false);
- if (data_section_sp) {
- llvm::Expected<std::vector<WasmSegment>> maybe_segments =
- ParseData(data_section_sp);
- if (!maybe_segments) {
- LLDB_LOG_ERROR(log, maybe_segments.takeError(),
- "Failed to parse Wasm data section: {0}");
- return;
- }
- segments = *maybe_segments;
- }
-
- // Parse the name section.
- SectionSP name_section_sp =
- m_sections_up->FindSectionByType(lldb::eSectionTypeWasmName, false);
- if (!name_section_sp) {
- LLDB_LOG(log, "Failed to parse Wasm symbol table: no names section");
- return;
- }
-
- llvm::Expected<std::vector<Symbol>> symbols =
- ParseNames(name_section_sp, functions, segments);
- if (!symbols) {
- LLDB_LOG_ERROR(log, symbols.takeError(), "Failed to parse Wasm names: {0}");
- return;
- }
-
- for (const Symbol &symbol : *symbols)
+ for (const Symbol &symbol : m_symbols)
symtab.AddSymbol(symbol);
- lldb::user_id_t segment_id = 0;
- for (const WasmSegment &segment : segments) {
- const lldb::addr_t segment_addr =
- segment.address_range.GetBaseAddress().GetFileAddress();
- const size_t segment_size = segment.address_range.GetByteSize();
- SectionSP segment_sp = std::make_shared<Section>(
- /*parent_section_sp=*/data_section_sp, GetModule(),
- /*obj_file=*/this,
- ++segment_id << 8, // 1-based segment index, shifted by 8 bits to avoid
- // collision with section IDs.
- ConstString(segment.name), eSectionTypeData,
- /*file_vm_addr=*/segment_addr,
- /*vm_size=*/segment_size,
- /*file_offset=*/segment_addr,
- /*file_size=*/segment_size,
- /*log2align=*/0, /*flags=*/0);
- m_sections_up->AddSection(segment_sp);
- GetModule()->GetSectionList()->AddSection(segment_sp);
- }
-
symtab.Finalize();
+ m_symbols.clear();
}
static SectionType GetSectionTypeFromName(llvm::StringRef Name) {
@@ -516,7 +496,27 @@
return eSectionTypeOther;
}
+/// Find the first entry of m_sect_infos whose id equals \p section_id.
+///
+/// \return
+///     A copy of the matching section_info, or std::nullopt if no entry has
+///     that id.
+std::optional<ObjectFileWasm::section_info>
+ObjectFileWasm::GetSectionInfo(uint32_t section_id) {
+  for (const section_info &sect_info : m_sect_infos) {
+    if (sect_info.id == section_id)
+      return sect_info;
+  }
+  return std::nullopt;
+}
+
+/// Find the first entry of m_sect_infos whose name equals \p section_name.
+///
+/// \return
+///     A copy of the matching section_info, or std::nullopt if no entry has
+///     that name.
+std::optional<ObjectFileWasm::section_info>
+ObjectFileWasm::GetSectionInfo(llvm::StringRef section_name) {
+  for (const section_info &sect_info : m_sect_infos) {
+    if (sect_info.name == section_name)
+      return sect_info;
+  }
+  return std::nullopt;
+}
+
void ObjectFileWasm::CreateSections(SectionList &unified_section_list) {
+ Log *log = GetLog(LLDBLog::Object);
+
if (m_sections_up)
return;
@@ -530,7 +530,7 @@
SectionType section_type = eSectionTypeOther;
ConstString section_name;
offset_t file_offset = sect_info.offset & 0xffffffff;
- addr_t vm_addr = file_offset;
+ addr_t vm_addr = sect_info.offset;
size_t vm_size = sect_info.size;
if (llvm::wasm::WASM_SEC_CODE == sect_info.id) {
@@ -542,9 +542,6 @@
// For this reason Section::GetFileAddress() must return zero for the
// Code section.
vm_addr = 0;
- } else if (llvm::wasm::WASM_SEC_DATA == sect_info.id) {
- section_type = eSectionTypeData;
- section_name = ConstString("data");
} else {
section_type = GetSectionTypeFromName(sect_info.name.GetStringRef());
if (section_type == eSectionTypeOther)
@@ -556,23 +553,107 @@
}
}
- SectionSP section_sp(
- new Section(GetModule(), // Module to which this section belongs.
- this, // ObjectFile to which this section belongs and
- // should read section data from.
- section_type, // Section ID.
- section_name, // Section name.
- section_type, // Section type.
- vm_addr, // VM address.
- vm_size, // VM size in bytes of this section.
- file_offset, // Offset of this section in the file.
- sect_info.size, // Size of the section as found in the file.
- 0, // Alignment of the section
- 0, // Flags for this section.
- 1)); // Number of host bytes per target byte
+ SectionSP section_sp = std::make_shared<Section>(
+ GetModule(), // Module to which this section belongs.
+ this, // ObjectFile to which this section belongs and
+ // should read section data from.
+ section_type, // Section ID.
+ section_name, // Section name.
+ section_type, // Section type.
+ vm_addr, // VM address.
+ vm_size, // VM size in bytes of this section.
+ file_offset, // Offset of this section in the file.
+ sect_info.size, // Size of the section as found in the file.
+ 0, // Alignment of the section
+ 0, // Flags for this section.
+ 1); // Number of host bytes per target byte
m_sections_up->AddSection(section_sp);
unified_section_list.AddSection(section_sp);
}
+
+ // The name section contains names and indexes. First parse the data from the
+ // relevant sections so we can access it by its index.
+ std::vector<WasmFunction> functions;
+ std::vector<WasmSegment> segments;
+
+ // Parse the code section.
+ if (std::optional<section_info> info =
+ GetSectionInfo(llvm::wasm::WASM_SEC_CODE)) {
+ DataExtractor code_data = ReadImageData(info->offset, info->size);
+ llvm::Expected<std::vector<WasmFunction>> maybe_functions =
+ ParseFunctions(code_data);
+ if (!maybe_functions) {
+ LLDB_LOG_ERROR(log, maybe_functions.takeError(),
+ "Failed to parse Wasm code section: {0}");
+ } else {
+ functions = *maybe_functions;
+ }
+ }
+
+ // Parse the data section.
+ std::optional<section_info> data_info =
+ GetSectionInfo(llvm::wasm::WASM_SEC_DATA);
+ if (data_info) {
+ DataExtractor data_data = ReadImageData(data_info->offset, data_info->size);
+ llvm::Expected<std::vector<WasmSegment>> maybe_segments =
+ ParseData(data_data);
+ if (!maybe_segments) {
+ LLDB_LOG_ERROR(log, maybe_segments.takeError(),
+ "Failed to parse Wasm data section: {0}");
+ } else {
+ segments = *maybe_segments;
+ }
+ }
+
+ if (std::optional<section_info> info = GetSectionInfo("name")) {
+ DataExtractor names_data = ReadImageData(info->offset, info->size);
+ llvm::Expected<std::vector<Symbol>> symbols = ParseNames(
+ m_sections_up->FindSectionByType(lldb::eSectionTypeCode, false),
+ names_data, functions, segments);
+ if (!symbols) {
+ LLDB_LOG_ERROR(log, symbols.takeError(),
+ "Failed to parse Wasm names: {0}");
+ } else {
+ m_symbols = *symbols;
+ }
+ }
+
+  // Create one Section per data segment. Active segments that cannot be
+  // placed (non-zero memory index, unsupported init expression) are skipped
+  // and do not consume a segment id.
+  lldb::user_id_t segment_id = 0;
+  for (const WasmSegment &segment : segments) {
+    if (segment.type == WasmSegment::Active) {
+      // FIXME: Support segments with a memory index.
+      if (segment.memory_index != 0) {
+        LLDB_LOG(log,
+                 "Skipping segment {0}: non-zero memory index is "
+                 "currently unsupported",
+                 segment.name);
+        continue;
+      }
+
+      if (segment.init_expr_offset == LLDB_INVALID_OFFSET) {
+        LLDB_LOG(log, "Skipping segment {0}: unsupported init expression",
+                 segment.name);
+        continue;
+      }
+    }
+
+    // An active segment lives at the address computed by its init
+    // expression; a passive one is addressed by where it sits in the data
+    // section. data_info always has a value here because segments is only
+    // filled in when the data section was found.
+    const lldb::addr_t file_vm_addr =
+        segment.type == WasmSegment::Active
+            ? segment.init_expr_offset
+            : data_info->offset + segment.section_offset;
+    const lldb::offset_t file_offset =
+        data_info->GetFileOffset() + segment.GetFileOffset();
+    SectionSP segment_sp = std::make_shared<Section>(
+        GetModule(),
+        /*obj_file=*/this,
+        ++segment_id << 8, // 1-based segment index, shifted by 8 bits to avoid
+                           // collision with section IDs.
+        ConstString(segment.name), eSectionTypeData,
+        /*file_vm_addr=*/file_vm_addr,
+        /*vm_size=*/segment.size,
+        /*file_offset=*/file_offset,
+        /*file_size=*/segment.size,
+        /*log2align=*/0, /*flags=*/0);
+    m_sections_up->AddSection(segment_sp);
+    GetModule()->GetSectionList()->AddSection(segment_sp);
+  }
}
bool ObjectFileWasm::SetLoadAddress(Target &target, lldb::addr_t load_address,
@@ -697,7 +778,7 @@
}
void ObjectFileWasm::DumpSectionHeader(llvm::raw_ostream &ostream,
- const section_info_t &sh) {
+ const section_info &sh) {
ostream << llvm::left_justify(sh.name.GetStringRef(), 16) << " "
<< llvm::format_hex(sh.offset, 10) << " "
<< llvm::format_hex(sh.size, 10) << " " << llvm::format_hex(sh.id, 6)
diff --git a/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.h b/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.h
index 531b5f0..86ecbf2 100644
--- a/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.h
+++ b/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.h
@@ -128,20 +128,25 @@
/// Read a range of bytes from the Wasm module.
DataExtractor ReadImageData(lldb::offset_t offset, uint32_t size);
- typedef struct section_info {
+ struct section_info {
lldb::offset_t offset;
uint32_t size;
uint32_t id;
ConstString name;
- } section_info_t;
+ lldb::offset_t GetFileOffset() const { return offset & 0xffffffff; }
+ };
+
+ std::optional<section_info> GetSectionInfo(uint32_t section_id);
+ std::optional<section_info> GetSectionInfo(llvm::StringRef section_name);
/// Wasm section header dump routines.
/// \{
- void DumpSectionHeader(llvm::raw_ostream &ostream, const section_info_t &sh);
+ void DumpSectionHeader(llvm::raw_ostream &ostream, const section_info &sh);
void DumpSectionHeaders(llvm::raw_ostream &ostream);
/// \}
- std::vector<section_info_t> m_sect_infos;
+ std::vector<section_info> m_sect_infos;
+ std::vector<Symbol> m_symbols;
ArchSpec m_arch;
UUID m_uuid;
};
diff --git a/lldb/test/Shell/Symtab/symtab-wasm.test b/lldb/test/Shell/Symtab/symtab-wasm.test
index 4170d9a..524691b 100644
--- a/lldb/test/Shell/Symtab/symtab-wasm.test
+++ b/lldb/test/Shell/Symtab/symtab-wasm.test
@@ -1,15 +1,16 @@
# RUN: yaml2obj %S/Inputs/simple.wasm.yaml -o %t.wasm
-# RUN: %lldb %t.wasm -o 'image dump symtab' -o 'image dump sections' | FileCheck %s
-CHECK: Code 0x0000000000000002 0x0000000000000002 0x00000000 __wasm_call_ctors
-CHECK: Code 0x0000000000000005 0x0000000000000029 0x00000000 add
-CHECK: Code 0x000000000000002f 0x000000000000004c 0x00000000 __original_main
-CHECK: Code 0x000000000000007c 0x0000000000000009 0x00000000 main
-CHECK: Data 0x0000000000000233 0x0000000000000009 0x00000000 .rodata
-CHECK: Data 0x0000000000000242 0x0000000000000004 0x00000000 .data
+# RUN: %lldb %t.wasm -o 'image dump symtab' | FileCheck %s --check-prefix SYMTAB
+SYMTAB: Code 0x0000000000000002 0x0000000000000002 0x00000000 __wasm_call_ctors
+SYMTAB: Code 0x0000000000000005 0x0000000000000029 0x00000000 add
+SYMTAB: Code 0x000000000000002f 0x000000000000004c 0x00000000 __original_main
+SYMTAB: Code 0x000000000000007c 0x0000000000000009 0x00000000 main
-CHECK: 0x0000000000000001 code {{.*}} 0x000001a1 0x00000085 0x00000000 symtab-wasm.test.tmp.wasm.code
-CHECK: 0x0000000000000003 data {{.*}} 0x0000022c 0x0000001a 0x00000000 symtab-wasm.test.tmp.wasm.data
-CHECK: 0x0000000000000040 wasm-name {{.*}} 0x00000251 0x00000059 0x00000000 symtab-wasm.test.tmp.wasm.name
-CHECK: 0x0000000000000100 data {{.*}} 0x00000233 0x00000009 0x00000000 symtab-wasm.test.tmp.wasm.data..rodata
-CHECK: 0x0000000000000200 data {{.*}} 0x00000242 0x00000004 0x00000000 symtab-wasm.test.tmp.wasm.data..data
+# RUN: %lldb %t.wasm -o 'image dump sections' | FileCheck %s --check-prefix SECTIONS
+SECTIONS: 0x0000000000000001 code [0x0000000000000000-0x0000000000000085) --- 0x000001a1 0x00000085 0x00000000 symtab-wasm.test.tmp.wasm.code
+SECTIONS: 0x0000000000000040 wasm-name --- 0x00000251 0x00000059 0x00000000 symtab-wasm.test.tmp.wasm.name
+SECTIONS: 0x0000000000000100 data [0x0000000000000400-0x0000000000000409) --- 0x00000233 0x00000009 0x00000000 symtab-wasm.test.tmp.wasm..rodata
+SECTIONS: 0x0000000000000200 data [0x000000000000040c-0x0000000000000410) --- 0x00000242 0x00000004 0x00000000 symtab-wasm.test.tmp.wasm..data
+
+# RUN: %lldb %t.wasm -o 'x/s 0x0000000000000400' | FileCheck %s --check-prefix STR
+STR: "data str"