|  | # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | 
|  | # See https://llvm.org/LICENSE.txt for license information. | 
|  | # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | 
|  | """Extract IR for training. | 
|  |  | 
|  | Extract IR for training, either from a compile_commands.json file produced by | 
|  | cmake, or a linker parameter list file. | 
|  |  | 
|  | Only run with | 
|  | 'python compiler_opt/tools/extract_ir.py ...' | 
|  |  | 
|  | The compilation is assumed to have been performed with clang, using | 
|  | -fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all) | 
|  |  | 
|  | In a distributed ThinLTO case, the compilation is assumed to have been performed | 
|  | specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt. | 
|  |  | 
|  | In a local ThinLTO case, the compilation is assumed to have been performed | 
|  | specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files | 
|  |  | 
|  | To change the logging verbosity, set the --verbosity flag to the desired level. | 
|  | Setting it to a specific level will enable all messages at that level and | 
|  | higher. Exact values can be found by invoking the script with --help. | 
|  | """ | 
|  |  | 
|  | import argparse | 
|  | import json | 
|  | import logging | 
|  |  | 
|  | from mlgo.corpus import extract_ir_lib | 
|  | from mlgo.corpus import flags | 
|  |  | 
|  |  | 
def parse_args_and_run():
    """Parse the command-line flags and pass the result to `main`.

    Registers the corpus-extraction options on an `argparse` parser, calls
    `flags.add_verbosity_arguments` on that parser (presumably the source of
    the `verbosity` attribute that `main` reads), parses the process
    arguments and invokes `main` with the resulting namespace.
    """
    parser = argparse.ArgumentParser(
        description="A tool for making a corpus from build artifacts"
    )
    parser.add_argument(
        "--input",
        type=str,
        help="Input file or directory - either compile_commands.json, a linker "
        "parameter list, or a path to a directory containing object files.",
    )
    # NOTE: nargs="?" on the options below means a flag may be given without a
    # value; argparse then stores `const` (None here) instead of the default.
    parser.add_argument(
        "--input_type",
        type=str,
        help="Input file type - JSON, LLD params, directory, or bazel aquery.",
        choices=["json", "params", "directory", "bazel_aquery"],
        default="json",
        nargs="?",
    )
    parser.add_argument("--output_dir", type=str, help="Output directory")
    parser.add_argument(
        "--num_workers",
        type=int,
        help="Number of parallel workers for objcopy. `None` for maximum "
        "available.",
        default=None,
        nargs="?",
    )
    parser.add_argument(
        "--llvm_objcopy_path",
        type=str,
        help="Path to llvm-objcopy",
        default="llvm-objcopy",
        nargs="?",
    )
    parser.add_argument(
        "--obj_base_dir",
        type=str,
        help="Base directory for object files. Defaults to current working dir.",
        default="",
        nargs="?",
    )
    parser.add_argument(
        "--cmd_filter",
        type=str,
        help="Include only those modules with a command line matching this regular "
        "expression. Set it to None to not perform any filtering. Note that the "
        "regular expression is applied independently for each separate command line "
        "option. For example, ^-Oz$ will match Oz built binaries. This does not work "
        "with thinlto_build=local.",
        default=None,
        nargs="?",
    )
    parser.add_argument(
        "--thinlto_build",
        type=str,
        help="Set if the build was performed with either 'distributed' or 'local' "
        "ThinLTO. This ensures the thinlto.bc files are also copied. The build is "
        "assumed to have had -mllvm -lto-embed-bitcode=post-merge-pre-opt passed in "
        "the distributed case or -Wl,--save-temps=import and "
        "-Wl,--thinlto-emit-index-files passed in the local case",
        choices=["distributed", "local"],
        default=None,
        nargs="?",
    )
    parser.add_argument(
        "--cmd_section_name",
        type=str,
        help="The section name passed to llvm-objcopy. For ELF object files, the "
        "default .llvmcmd is correct. For Mach-O object files, one should use "
        "something like __LLVM,__cmdline",
        default=".llvmcmd",
        nargs="?",
    )
    parser.add_argument(
        "--bitcode_section_name",
        type=str,
        help="The section name passed to llvm-objcopy. For ELF object files, the "
        "default .llvmbc is correct. For Mach-O object files, one should use "
        "__LLVM,__bitcode",
        default=".llvmbc",
        nargs="?",
    )
    flags.add_verbosity_arguments(parser)
    args = parser.parse_args()
    main(args)
|  |  | 
|  |  | 
def main(args):
    """Extract IR from the requested input and write a corpus manifest.

    Args:
        args: Parsed flag namespace. Reads `verbosity`, `input`, `input_type`,
            `output_dir`, `num_workers`, `llvm_objcopy_path`, `obj_base_dir`,
            `cmd_filter`, `thinlto_build`, `cmd_section_name` and
            `bitcode_section_name`.

    Raises:
        ValueError: If `--input` is combined with `--thinlto_build=local`, if
            neither of them is provided, or if `input_type` is not one of the
            supported values.
    """
    logging.basicConfig(level=args.verbosity)

    objs = _load_objs(args)

    relative_output_paths = extract_ir_lib.run_extraction(
        objs,
        args.num_workers,
        args.llvm_objcopy_path,
        args.cmd_filter,
        args.thinlto_build,
        args.cmd_section_name,
        args.bitcode_section_name,
    )

    extract_ir_lib.write_corpus_manifest(
        args.thinlto_build, relative_output_paths, args.output_dir
    )

    # `None` entries in the extraction result are counted as files that were
    # not converted.
    logging.info(
        "Converted %d files out of %d",
        len(objs) - relative_output_paths.count(None),
        len(objs),
    )


def _load_objs(args):
    """Return the objects to extract from, chosen by `input`/`input_type`."""
    if args.input is not None and args.thinlto_build == "local":
        raise ValueError("--thinlto_build=local cannot be run with --input")

    if args.input is None:
        if args.thinlto_build != "local":
            raise ValueError("--input or --thinlto_build=local must be provided")
        return extract_ir_lib.load_for_lld_thinlto(args.obj_base_dir, args.output_dir)

    if args.input_type == "json":
        with open(args.input, encoding="utf-8") as f:
            return extract_ir_lib.load_from_compile_commands(
                json.load(f), args.output_dir
            )

    if args.input_type == "params":
        if not args.obj_base_dir:
            logging.info(
                "--obj_base_dir is unspecified, assuming current directory. "
                "If no objects are found, use this option to specify the root "
                "directory for the object file paths in the input file."
            )
        with open(args.input, encoding="utf-8") as f:
            return extract_ir_lib.load_from_lld_params(
                [line.strip() for line in f], args.obj_base_dir, args.output_dir
            )

    if args.input_type == "directory":
        logging.warning(
            "Using the directory input is only recommended if the build system "
            "your project uses does not support any structured output that "
            "ml-compiler-opt understands. If your build system provides a "
            "structured compilation database, use that instead"
        )
        return extract_ir_lib.load_from_directory(args.input, args.output_dir)

    if args.input_type == "bazel_aquery":
        with open(args.input, encoding="utf-8") as aquery_json_handle:
            return extract_ir_lib.load_bazel_aquery(
                json.load(aquery_json_handle), args.obj_base_dir, args.output_dir
            )

    # Fail loudly rather than running extraction on an empty list and writing
    # an empty manifest. Reachable when main() is called directly, or when
    # --input_type is passed with no value (argparse then stores None).
    raise ValueError(f"Unknown input type: {args.input_type}")
|  |  | 
|  |  | 
# Run the command-line entry point only when executed as a script, so that
# importing this module has no side effects.
if __name__ == "__main__":
    parse_args_and_run()