"""Code generator for Code Completion Model Inference.

Tool runs on the Decision Forest model defined in {model} directory.
It generates two files: {output_dir}/{filename}.h and {output_dir}/{filename}.cpp
The generated files define the Example class named {cpp_class} having all the features as class members.
The generated runtime provides an `Evaluate` function which can be used to score a code completion candidate.
"""
| |
| import argparse |
| import json |
| import struct |
| |
| |
class CppClass:
    """Splits a (possibly namespace-qualified) C++ class name into its parts.

    Attributes:
      ns: list of the non-empty enclosing namespace names, outermost first.
      name: the unqualified class name.
    """

    def __init__(self, cpp_class):
        """Parses `cpp_class`, e.g. "a::b::C" gives ns=["a", "b"], name="C".

        Empty namespace components (such as the one produced by a leading
        "::") are dropped. Raises ValueError when the last component, i.e.
        the class name itself, is empty.
        """
        *enclosing, class_name = cpp_class.split("::")
        if not class_name:
            raise ValueError("Empty class name.")
        self.ns = [part for part in enclosing if part]
        self.name = class_name

    def ns_begin(self):
        """Returns the lines opening every namespace, outermost first."""
        return "\n".join("namespace %s {" % ns for ns in self.ns)

    def ns_end(self):
        """Returns the lines closing every namespace, innermost first."""
        return "\n".join("} // namespace %s" % ns for ns in self.ns[::-1])
| |
| |
def header_guard(filename):
    """Returns the header guard macro name for the generated header.

    The file name is upper-cased and every character that is not an ASCII
    letter or digit is replaced by '_', so that names such as "my-model.v2"
    still produce a single valid preprocessor identifier. File names made
    only of letters, digits and underscores yield the same guard as before.
    """
    sanitized = "".join(
        ch if ("A" <= ch <= "Z" or "0" <= ch <= "9") else "_"
        for ch in filename.upper())
    return "GENERATED_DECISION_FOREST_MODEL_%s_H" % sanitized
| |
| |
def boost_node(n, label, next_label):
    """Returns code snippet for a leaf/boost node.

    The snippet is the node's label followed by a `return` of the node's
    score as a C++ float literal. `next_label` is accepted only so that all
    node emitters share one signature; it is not used.
    """
    # JSON may carry an integral score (e.g. `1`). Emitting it verbatim would
    # give `1f`, which is not a valid C++ literal, so format it as a float.
    score = float(n['score'])
    return "%s: return %sf;" % (label, score)
| |
| |
def if_greater_node(n, label, next_label):
    """Returns code snippet for an if_greater node.

    The emitted statement jumps to `next_label` when the Example feature
    (NUMBER) compares `>=` against the node's threshold; otherwise control
    falls through to the statement that follows.
    Comparing integers is much faster than comparing floats. Assuming floating
    points are represented as IEEE 754, the threshold is order-encoded to an
    integer here; the raw threshold is kept in a C++ comment for readability.
    """
    raw_threshold = n["threshold"]
    feature_name = n['feature']
    encoded_threshold = order_encode(raw_threshold)
    return "%s: if (E.get%s() >= %s /*%s*/) goto %s;" % (
        label, feature_name, encoded_threshold, raw_threshold, next_label)
| |
| |
def if_member_node(n, label, next_label):
    """Returns code snippet for an if_member node.

    The emitted statement jumps to `next_label` if the Example feature (ENUM)
    has a bit in common with the mask built from the enum values listed in
    the node's "set". Control falls through if the condition is false.
    """
    enum_values = n["set"]
    mask_terms = []
    for member in enum_values:
        # One BIT(...) term per enum value; the terms are OR-ed into a mask.
        mask_terms.append("BIT(%s_type::%s)" % (n['feature'], member))
    mask = '|'.join(mask_terms)
    return "%s: if (E.get%s() & (%s)) goto %s;" % (
        label, n['feature'], mask, next_label)
| |
| |
def node(n, label, next_label):
    """Returns code snippet for the node.

    Dispatches on n['operation'] to the matching emitter. An operation that
    is not one of 'boost', 'if_greater' or 'if_member' raises KeyError.
    """
    emitters = {
        'boost': boost_node,
        'if_greater': if_greater_node,
        'if_member': if_member_node,
    }
    emit = emitters[n['operation']]
    return emit(n, label, next_label)
| |
| |
def tree(t, tree_num, node_num):
    """Returns code for inferencing a Decision Tree.
    Also returns the size of the decision tree.

    The result is a pair: the list of generated statements (one per node) and
    the number of nodes in the subtree rooted at `t`.

    A node of the tree starts with label `t{tree#}_n{node#}`.

    The tree contains two types of node: Conditional node and Leaf node.
    - Conditional node evaluates a condition. If true, it jumps to the true
      node/child. Code is generated using pre-order traversal of the tree
      considering the false node ('else') as the first child. Therefore the
      false node is always the immediately next label, and the true node
      ('then') is numbered after the whole false subtree.
    - Leaf node ('boost') produces a single statement; it is handed
      `t{tree#+1}` as its next label.
    """
    own_label = "t%d_n%d" % (tree_num, node_num)

    if t["operation"] != "boost":
        # The false branch takes the very next node number ...
        else_lines, else_count = tree(
            t['else'], tree_num=tree_num, node_num=node_num + 1)

        # ... and the true branch starts right after the false subtree.
        then_node_num = node_num + else_count + 1
        then_label = "t%d_n%d" % (tree_num, then_node_num)
        then_lines, then_count = tree(
            t['then'], tree_num=tree_num, node_num=then_node_num)

        branch_line = node(t, label=own_label, next_label=then_label)
        subtree_lines = [branch_line] + else_lines + then_lines
        return subtree_lines, 1 + else_count + then_count

    leaf_line = node(t, label=own_label, next_label="t%d" % (tree_num + 1))
    return [leaf_line], 1
| |
| |
def gen_header_code(features_json, cpp_class, filename):
    """Returns code for header declaring the inference runtime.

    Declares the Example class named {cpp_class} inside relevant namespaces.
    The Example class contains all the features as class members. This
    class can be used to represent a code completion candidate.
    Provides `float Evaluate()` function which can be used to score the Example.

    Each feature gets a setter, a getter and a data member. NUMBER features
    are stored order-encoded in a uint32_t; ENUM features are stored as a
    one-bit mask in a uint64_t. Any other feature kind raises ValueError.
    """

    def storage_bits(feature):
        # ENUM features hold a bit mask and need 64 bits; the rest use 32.
        return 64 if feature["kind"] == "ENUM" else 32

    setters = []
    for f in features_json:
        name = f["name"]
        kind = f["kind"]
        if kind == "NUMBER":
            # Floats are order-encoded to integers for faster comparison.
            setter = "void set%s(float V) { %s = OrderEncode(V); }" % (
                name, name)
        elif kind == "ENUM":
            setter = "void set%s(unsigned V) { %s = 1LL << V; }" % (name, name)
        else:
            raise ValueError("Unhandled feature type.", kind)
        setters.append(setter)

    # Class members represent all the features of the Example.
    class_members = []
    getters = []
    for f in features_json:
        bits = storage_bits(f)
        class_members.append("uint%d_t %s = 0;" % (bits, f['name']))
        getters.append(
            "LLVM_ATTRIBUTE_ALWAYS_INLINE uint%d_t get%s() const { return %s; }"
            % (bits, f['name'], f['name']))

    separator = "\n "
    guard = header_guard(filename)
    template = """#ifndef %s
#define %s
#include <cstdint>
#include "llvm/Support/Compiler.h"

%s
class %s {
public:
// Setters.
%s

// Getters.
%s

private:
%s

// Produces an integer that sorts in the same order as F.
// That is: a < b <==> orderEncode(a) < orderEncode(b).
static uint32_t OrderEncode(float F);
};

float Evaluate(const %s&);
%s
#endif // %s
"""
    return template % (
        guard, guard, cpp_class.ns_begin(), cpp_class.name,
        separator.join(setters),
        separator.join(getters),
        separator.join(class_members),
        cpp_class.name, cpp_class.ns_end(), guard)
| |
| |
def order_encode(v):
    """Returns an integer that sorts in the same order as the float `v`.

    `v` is packed as a 32-bit float and its bit pattern is remapped so that
    plain unsigned integer comparison of two encoded values agrees with
    comparing the floats themselves.
    """
    (bits,) = struct.unpack('<I', struct.pack('<f', v))
    sign_mask = 1 << 31
    # IEEE 754 floats compare like sign-magnitude integers.
    if (bits & sign_mask) == 0:
        # Non-negative float: top half of integers.
        return sign_mask + bits
    # Negative float: low half of integers, order reversed.
    return (1 << 32) - bits
| |
| |
def evaluate_func(forest_json, cpp_class):
    """Generates evaluation functions for each tree and combines them in
    `float Evaluate(const {Example}&)` function. This function can be
    used to score an Example.

    One `EvaluateTree{N}` function per tree is emitted inside an anonymous
    namespace, followed by `Evaluate`, which sums their results.
    """
    class_name = cpp_class.name
    pieces = []

    # Generate evaluation function of each tree.
    pieces.append("namespace {\n")
    for tree_num, tree_json in enumerate(forest_json):
        pieces.append(
            "LLVM_ATTRIBUTE_NOINLINE float EvaluateTree%d(const %s& E) {\n"
            % (tree_num, class_name))
        statements, _ = tree(tree_json, tree_num=tree_num, node_num=0)
        pieces.append(" " + "\n ".join(statements) + "\n")
        pieces.append("}\n\n")
    pieces.append("} // namespace\n\n")

    # Combine the scores of all trees in the final function.
    # MSAN will timeout if these functions are inlined.
    pieces.append("float Evaluate(const %s& E) {\n" % class_name)
    pieces.append(" float Score = 0;\n")
    pieces.extend(
        " Score += EvaluateTree%d(E);\n" % tree_num
        for tree_num in range(len(forest_json)))
    pieces.append(" return Score;\n")
    pieces.append("}\n")

    return "".join(pieces)
| |
| |
def gen_cpp_code(forest_json, features_json, filename, cpp_class):
    """Generates code for the .cpp file.

    The output consists of the includes (the generated header, llvm/ADT/bit.h
    and the header of every ENUM feature, sorted and de-duplicated), the BIT
    macro, `using` aliases for the ENUM feature types, the definition of
    {cpp_class}::OrderEncode and the evaluation functions, all placed inside
    the namespaces of `cpp_class`.
    """
    # Headers
    # Required by OrderEncode(float F).
    system_includes = []
    for header in ["cstring", "limits"]:
        system_includes.append('#include <%s>' % header)

    # Include generated header.
    quoted_headers = {filename + '.h', 'llvm/ADT/bit.h'}
    # Headers required by ENUM features used by the model.
    for f in features_json:
        if f["kind"] == "ENUM":
            quoted_headers.add(f["header"])
    project_includes = []
    for header in sorted(quoted_headers):
        project_includes.append('#include "%s"' % header)

    # using-decl for ENUM features.
    alias_lines = []
    for feature in features_json:
        if feature["kind"] == "ENUM":
            alias_lines.append(
                "using %s_type = %s;" % (feature['name'], feature['type']))
    using_decls = "\n".join(alias_lines)

    # NOTE(review): the emitted OrderEncode both bit_casts F into U and then
    # memcpy's F over U again; the second step looks redundant. Left as is.
    template = """%s

%s

#define BIT(X) (1LL << X)

%s

%s

uint32_t %s::OrderEncode(float F) {
static_assert(std::numeric_limits<float>::is_iec559, "");
constexpr uint32_t TopBit = ~(~uint32_t{0} >> 1);

// Get the bits of the float. Endianness is the same as for integers.
uint32_t U = llvm::bit_cast<uint32_t>(F);
std::memcpy(&U, &F, sizeof(U));
// IEEE 754 floats compare like sign-magnitude integers.
if (U & TopBit) // Negative float.
return 0 - U; // Map onto the low half of integers, order reversed.
return U + TopBit; // Positive floats map onto the high half of integers.
}

%s
%s
"""
    return template % (
        "\n".join(system_includes),
        "\n".join(project_includes),
        cpp_class.ns_begin(),
        using_decls,
        cpp_class.name,
        evaluate_func(forest_json, cpp_class),
        cpp_class.ns_end())
| |
| |
def main():
    """Command-line entry point.

    Reads {model}/features.json and {model}/forest.json, generates the header
    and the .cpp source, and writes them to {output_dir}/{filename}.h and
    {output_dir}/{filename}.cpp.

    All four options are mandatory; argparse reports a usage error and exits
    when one is missing.
    """
    parser = argparse.ArgumentParser('DecisionForestCodegen')
    # Every option is needed to build the paths below, so mark them required
    # rather than failing later on a None value.
    parser.add_argument('--filename', required=True, help='output file name.')
    parser.add_argument('--output_dir', required=True,
                        help='output directory.')
    parser.add_argument('--model', required=True,
                        help='path to model directory.')
    parser.add_argument(
        '--cpp_class',
        required=True,
        help='The name of the class (which may be a namespace-qualified) created in generated header.'
    )
    ns = parser.parse_args()

    output_dir = ns.output_dir
    filename = ns.filename
    header_file = "%s/%s.h" % (output_dir, filename)
    cpp_file = "%s/%s.cpp" % (output_dir, filename)
    cpp_class = CppClass(cpp_class=ns.cpp_class)

    model_file = "%s/forest.json" % ns.model
    features_file = "%s/features.json" % ns.model

    with open(features_file) as f:
        features_json = json.load(f)

    with open(model_file) as m:
        forest_json = json.load(m)

    # Generate both sources before opening any output file, so that a
    # malformed model does not leave truncated or empty outputs behind.
    cpp_code = gen_cpp_code(forest_json=forest_json,
                            features_json=features_json,
                            filename=filename,
                            cpp_class=cpp_class)
    header_code = gen_header_code(features_json=features_json,
                                  cpp_class=cpp_class,
                                  filename=filename)

    with open(cpp_file, 'w+t') as output_cc:
        output_cc.write(cpp_code)

    with open(header_file, 'w+t') as output_h:
        output_h.write(header_code)


if __name__ == '__main__':
    main()