| # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| # See https://llvm.org/LICENSE.txt for license information. |
| # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| """Extract IR for training. |
| |
| Extract IR for training, either from a compile_commands.json file produced by |
| cmake, or a linker parameter list file. |
| |
| Only run with |
| 'python compiler_opt/tools/extract_ir.py ...' |
| |
| The compilation is assumed to have been performed with clang, using |
| -fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all) |
| |
| In a distributed ThinLTO case, the compilation is assumed to have been performed |
| specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt. |
| |
| In a local ThinLTO case, the compilation is assumed to have been performed |
| specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files |
| |
| To change the logging verbosity, set the --verbosity flag to the desired level. |
| Setting it to a specific level will enable all messages at that level and |
| higher. Exact values can be found by invoking the script with --help. |
| """ |
| |
| import argparse |
| import json |
| import logging |
| |
| from mlgo.corpus import extract_ir_lib |
| from mlgo.corpus import flags |
| |
| |
def parse_args_and_run(argv=None):
    """Parse the command line and hand the result to ``main``.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``, which is what the
            script entry point relies on.

    Note:
        Most options are declared with ``nargs="?"``, so a flag may be given
        without a value; argparse then stores ``None`` for it rather than the
        declared default.
    """
    parser = argparse.ArgumentParser(
        description="A tool for making a corpus from build artifacts"
    )
    parser.add_argument(
        "--input",
        type=str,
        help="Input file or directory - either compile_commands.json, a linker "
        "parameter list, or a path to a directory containing object files.",
    )
    parser.add_argument(
        "--input_type",
        type=str,
        help="Input file type - JSON, LLD params, directory, or bazel aquery.",
        choices=["json", "params", "directory", "bazel_aquery"],
        default="json",
        nargs="?",
    )
    parser.add_argument("--output_dir", type=str, help="Output directory")
    parser.add_argument(
        "--num_workers",
        type=int,
        help="Number of parallel workers for objcopy. `None` for maximum "
        "available.",
        default=None,
        nargs="?",
    )
    parser.add_argument(
        "--llvm_objcopy_path",
        type=str,
        help="Path to llvm-objcopy",
        default="llvm-objcopy",
        nargs="?",
    )
    parser.add_argument(
        "--obj_base_dir",
        type=str,
        help="Base directory for object files. Defaults to current working dir.",
        default="",
        nargs="?",
    )
    parser.add_argument(
        "--cmd_filter",
        type=str,
        help="Include only those modules with a command line matching this regular "
        "expression. Set it to None to not perform any filtering. Note that the "
        "regular expression is applied independently for each separate command line "
        "option. For example, ^-Oz$ will match Oz built binaries. This does not work "
        "with thinlto_build=local.",
        default=None,
        nargs="?",
    )
    parser.add_argument(
        "--thinlto_build",
        type=str,
        help="Set if the build was performed with either 'distributed' or 'local' "
        "ThinLTO. This ensures the thinlto.bc files are also copied. The build is "
        "assumed to have had -mllvm -lto-embed-bitcode=post-merge-pre-opt passed in "
        "the distributed case or -Wl,--save-temps=import and "
        "-Wl,--thinlto-emit-index-files passed in the local case",
        choices=["distributed", "local"],
        default=None,
        nargs="?",
    )
    parser.add_argument(
        "--cmd_section_name",
        type=str,
        help="The section name passed to llvm-objcopy. For ELF object files, the "
        "default .llvmcmd is correct. For Mach-O object files, one should use "
        "something like __LLVM,__cmdline",
        default=".llvmcmd",
        nargs="?",
    )
    parser.add_argument(
        "--bitcode_section_name",
        type=str,
        help="The section name passed to llvm-objcopy. For ELF object files, the "
        "default .llvmbc is correct. For Mach-O object files, one should use "
        "__LLVM,__bitcode",
        default=".llvmbc",
        nargs="?",
    )
    # The shared flags helper registers the verbosity option(s); main() reads
    # the result back as args.verbosity.
    flags.add_verbosity_arguments(parser)
    args = parser.parse_args(argv)
    main(args)
| |
| |
def main(args):
    """Collect the object files described by ``args`` and extract a corpus.

    The object list is loaded through the ``extract_ir_lib`` loader matching
    the input kind, passed to ``extract_ir_lib.run_extraction``, and the
    resulting relative paths are handed to
    ``extract_ir_lib.write_corpus_manifest``.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``verbosity``, ``input``, ``input_type``, ``output_dir``,
            ``obj_base_dir``, ``num_workers``, ``llvm_objcopy_path``,
            ``cmd_filter``, ``thinlto_build``, ``cmd_section_name`` and
            ``bitcode_section_name``.

    Raises:
        ValueError: If ``--input`` is combined with ``--thinlto_build=local``,
            if neither of them is supplied, or if ``input_type`` is not one of
            the supported kinds.
    """
    logging.basicConfig(level=args.verbosity)

    # Validate the flag combination up front, before any file is touched.
    if args.input is not None and args.thinlto_build == "local":
        raise ValueError("--thinlto_build=local cannot be run with --input")

    if args.input is None:
        if args.thinlto_build != "local":
            raise ValueError("--input or --thinlto_build=local must be provided")
        objs = extract_ir_lib.load_for_lld_thinlto(args.obj_base_dir, args.output_dir)
    elif args.input_type == "json":
        with open(args.input, encoding="utf-8") as f:
            objs = extract_ir_lib.load_from_compile_commands(
                json.load(f), args.output_dir
            )
    elif args.input_type == "params":
        if not args.obj_base_dir:
            logging.info(
                "--obj_base_dir is unspecified, assuming current directory. "
                "If no objects are found, use this option to specify the root "
                "directory for the object file paths in the input file."
            )
        with open(args.input, encoding="utf-8") as f:
            objs = extract_ir_lib.load_from_lld_params(
                [line.strip() for line in f], args.obj_base_dir, args.output_dir
            )
    elif args.input_type == "directory":
        logging.warning(
            "Using the directory input is only recommended if the build system "
            "your project uses does not support any structured output that "
            "ml-compiler-opt understands. If your build system provides a "
            "structured compilation database, use that instead"
        )
        objs = extract_ir_lib.load_from_directory(args.input, args.output_dir)
    elif args.input_type == "bazel_aquery":
        with open(args.input, encoding="utf-8") as aquery_json_handle:
            objs = extract_ir_lib.load_bazel_aquery(
                json.load(aquery_json_handle), args.obj_base_dir, args.output_dir
            )
    else:
        # Fail loudly instead of extracting from an empty object list and
        # writing an empty manifest.
        raise ValueError(f"Unknown input type: {args.input_type}")

    relative_output_paths = extract_ir_lib.run_extraction(
        objs,
        args.num_workers,
        args.llvm_objcopy_path,
        args.cmd_filter,
        args.thinlto_build,
        args.cmd_section_name,
        args.bitcode_section_name,
    )

    extract_ir_lib.write_corpus_manifest(
        args.thinlto_build, relative_output_paths, args.output_dir
    )

    # None entries in the result are counted as objects that were not
    # converted.
    logging.info(
        "Converted %d files out of %d",
        len(objs) - relative_output_paths.count(None),
        len(objs),
    )
| |
| |
if __name__ == "__main__":
    # Run the command-line tool only when this file is executed as a script,
    # not when it is imported as a module.
    parse_args_and_run()