| # This script generates all variants of wmma builtins, verifies that clang calls
| # the correct LLVM intrinsics, and checks that the availability of specific
| # builtins is constrained by the correct PTX version and the target GPU variant.
| |
| # Dummy test run to avoid lit warnings. |
| # RUN: echo "This is not a real test. It's a generator for builtins-nvptx-mma.cu" >/dev/null
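| #
| # Example invocation (the --ptx/--gpu-arch values below are illustrative; the
| # header of the generated file records the values actually used):
| #   python builtins-nvptx-mma.py --ptx=71 --gpu-arch=80 > builtins-nvptx-mma.cu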
| |
| from __future__ import print_function |
| |
| import argparse |
| from collections import defaultdict |
| from itertools import product |
| from string import Template |
| |
| |
| class MMAFrag: |
| def __init__(self, geom, frag, ptx_elt_type): |
| self.geom = geom |
| self.frag = frag |
| self.ptx_type = ptx_elt_type |
| |
| def __repr__(self): |
| return "%s:%s:%s" % (self.geom, self.frag, self.ptx_type) |
| |
| |
| class MMAOp: |
| def __init__(self, a, b, c, d, b1op=""): |
| self.a = a |
| self.b = b |
| self.c = c |
| self.d = d |
| self.b1op = b1op |
| |
| def __repr__(self): |
| return "{A:%s, B:%s, C:%s, D:%s}" % (self.a, self.b, self.c, self.d) |
| |
| |
| def make_mma_ops(geoms, types_a, types_b, types_c, types_d, b1ops=None): |
| ops = [] |
| if b1ops is None: |
| b1ops = [""] |
| for geom, type_a, type_c in product(geoms, types_a, types_c): |
| for type_b, type_d in product( |
| types_b if types_b else [type_a], types_d if types_d else [type_c] |
| ): |
| ops += [ |
| MMAOp( |
| MMAFrag(geom, "a", type_a), |
| MMAFrag(geom, "b", type_b), |
| MMAFrag(geom, "c", type_c), |
| MMAFrag(geom, "d", type_d), |
| b1op, |
| ) |
| for b1op in b1ops |
| ] |
| return ops |
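|
| # E.g. make_mma_ops(["m8n8k4"], ["f64"], [], ["f64"], []) yields one op,
| # {A:m8n8k4:a:f64, B:m8n8k4:b:f64, C:m8n8k4:c:f64, D:m8n8k4:d:f64}, since an
| # empty types_b/types_d falls back to types_a/types_c.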
| |
| |
| def make_ldst_ops(geoms, frags, types): |
| return [ |
| MMAFrag(geom, frag, ptx_type) |
| for (geom, frag, ptx_type) in product(geoms, frags, types) |
| ] |
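|
| # E.g. make_ldst_ops(["m8n8k4"], ["a", "b"], ["f64"]) yields the fragments
| # [m8n8k4:a:f64, m8n8k4:b:f64].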
| |
| |
| def get_mma_ops(): |
| return ( |
| make_mma_ops(["m16n16k8"], ["tf32"], [], ["f32"], []) |
| + make_mma_ops(["m16n16k16", "m32n8k16", "m8n32k16"], ["bf16"], [], ["f32"], []) |
| + make_mma_ops(["m8n8k4"], ["f64"], [], ["f64"], []) |
| + make_mma_ops( |
| ["m16n16k16", "m32n8k16", "m8n32k16"], |
| ["f16"], |
| [], |
| ["f16", "f32"], |
| ["f16", "f32"], |
| ) |
| + make_mma_ops( |
| ["m16n16k16", "m32n8k16", "m8n32k16"], ["s8", "u8"], [], ["s32"], [] |
| ) |
| + make_mma_ops(["m8n8k32"], ["s4", "u4"], [], ["s32"], []) |
| + make_mma_ops( |
| ["m8n8k128"], ["b1"], [], ["s32"], [], [".xor.popc", ".and.popc"] |
| ) |
| ) |
| |
| |
| def get_ldst_ops(): |
|     # NOTE: fragments are from the point of view of PTX.
|     # Fragment `d` is only used for store ops; the others are used for both
|     # loads and stores.
| return ( |
| make_ldst_ops( |
| ["m16n16k16", "m32n8k16", "m8n32k16"], |
| ["a", "b"], |
| ["f16", "u8", "s8", "bf16"], |
| ) |
| + make_ldst_ops( |
| ["m16n16k16", "m32n8k16", "m8n32k16"], ["c", "d"], ["f16", "f32", "s32"] |
| ) |
| + make_ldst_ops(["m8n8k32"], ["a", "b"], ["s4", "u4"]) |
| + make_ldst_ops(["m8n8k128"], ["a", "b"], ["b1"]) |
| + make_ldst_ops(["m8n8k32", "m8n8k128"], ["c", "d"], ["s32"]) |
| + make_ldst_ops(["m8n8k4"], ["a", "b", "c", "d"], ["f64"]) |
| + |
| # TF32 m16n16k8 is odd. |
| # For fragment 'C' it uses __mma_*tf32*_m16n16k8_ld_c |
| # but 'D' calls __mma_m16n16k8_st_c_*f32*. |
| make_ldst_ops(["m16n16k8"], ["a", "b", "c"], ["tf32"]) |
| + make_ldst_ops(["m16n16k8"], ["d"], ["f32"]) |
| ) |
| |
| |
| def is_geom_supported(geom): |
| # geometries for FP and ints. |
| if geom in ["m8n32k16", "m32n8k16"]: |
| return ptx_version >= 61 |
| # geometries for sub-ints. |
| if geom in ["m8n8k32", "m8n8k128"]: |
| return ptx_version >= 63 and gpu_arch >= 75 |
| if geom == "m16n16k16": |
| return ptx_version >= 60 |
| if geom in ["m16n16k8", "m8n8k4"]: |
| return ptx_version >= 70 and gpu_arch >= 80 |
| assert False # Unexpected geometry. |
| |
| |
| def is_type_supported(ptx_type): |
| if ptx_type in ["s8", "u8", "s32"]: |
| return ptx_version >= 63 and gpu_arch >= 72 |
| if ptx_type in ["s4", "u4", "b1"]: |
| return ptx_version >= 63 and gpu_arch >= 75 |
| if ptx_type in ["bf16", "tf32", "f64"]: |
| return ptx_version >= 70 and gpu_arch >= 80 |
| return ptx_version >= 60 and gpu_arch >= 70 |
| |
| |
| def is_rnd_supported(op): |
| # rnd is only supported for FP64 WMMA |
| return op.a.ptx_type == "f64" |
| |
| |
| def is_mma_variant_supported(op, layout_a, layout_b, satf): |
| if not (is_type_supported(op.a.ptx_type) and is_geom_supported(op.a.geom)): |
| return False |
| |
|     if satf and op.a.ptx_type not in ["f16", "s8", "u8", "s4", "u4"]:
| return False |
| |
|     # sub-integer types require 'a' in row-major and 'b' in col-major layout.
| if op.a.ptx_type in ["s4", "u4", "b1"]: |
| return layout_a == "row" and layout_b == "col" |
| return True |
| |
| |
| def is_ldst_variant_supported(frag, layout): |
| if not (is_type_supported(frag.ptx_type) and is_geom_supported(frag.geom)): |
| return False |
| if frag.ptx_type in ["s4", "u4", "b1"]: |
|         # sub-integer types require sm_75 and ptx63; fragment 'a' must be
|         # row-major and 'b' col-major.
| return ( |
| (frag.frag == "a" and layout == "row") |
| or (frag.frag == "b" and layout == "col") |
| or frag.frag in ["c", "d"] |
| ) |
| return True |
| |
| |
| def get_builtin_prefix(frag): |
| prefix = None |
| if frag.geom in ["m16n16k16", "m32n8k16", "m8n32k16"]: |
| if frag.ptx_type in ["f16", "f32"]: |
| prefix = "__hmma" |
| elif frag.ptx_type == "bf16": |
| prefix = "__mma_bf16" |
| else: |
| prefix = "__imma" |
| elif frag.geom == "m8n8k32": |
| prefix = "__imma" # sub-integers |
| elif frag.geom == "m8n8k128": |
| prefix = "__bmma" |
| elif frag.geom == "m8n8k4": |
| prefix = "__dmma" |
| elif frag.geom == "m16n16k8": |
| if frag.ptx_type == "f32": |
| prefix = "__mma" |
| else: |
| prefix = "__mma_tf32" |
| assert prefix |
| return prefix |
| |
| |
| def get_ldst_builtin_name(frag): |
| prefix = get_builtin_prefix(frag) |
| |
| if prefix == "__hmma": |
| suffix = "" if frag.frag in ["a", "b"] else frag.ptx_type |
| elif prefix in ["__dmma", "__mma_bf16", "__mma_tf32"]: |
| suffix = "" if frag.frag in ["a", "b", "c"] else frag.ptx_type |
| else: |
| suffix = "" if frag.frag == "c" else frag.ptx_type |
| if suffix == "s32": |
| suffix = "i32" |
| |
| if frag.frag == "d": |
| ifrag = "c" |
| op = "st" |
| else: |
| ifrag = frag.frag |
| op = "ld" |
| |
| name = "%s_%s_%s_%s%s" % ( |
| prefix, |
| frag.geom, |
| op, |
| ifrag, |
| "_" + suffix if suffix else "", |
| ) |
| return name |
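|
| # E.g. the f16 'a' fragment load for m16n16k16 maps to __hmma_m16n16k16_ld_a,
| # while the s32 'd' fragment store for m8n8k32 maps to __imma_m8n8k32_st_c_i32
| # (stores reuse the 'c' fragment name and spell s32 as i32).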
| |
| |
| def get_mma_builtin_name(op): |
| prefix = get_builtin_prefix(op.a) |
| |
| if prefix == "__hmma": |
| suffix = op.d.ptx_type + op.c.ptx_type |
| elif prefix in ["__mma_bf16", "__mma_tf32"]: |
| suffix = op.d.ptx_type |
| else: |
| suffix = op.a.ptx_type |
| |
| name = "{prefix}_{geom}_mma{b1op}_{suffix}".format( |
| prefix=prefix, geom=op.a.geom, b1op=op.b1op.replace(".", "_"), suffix=suffix |
| ) |
| return name |
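|
| # E.g. an f16 MMA with f32 result and f32 accumulator on m16n16k16 maps to
| # __hmma_m16n16k16_mma_f32f32, and the b1 xor.popc variant on m8n8k128 maps to
| # __bmma_m8n8k128_mma_xor_popc_b1.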
| |
| |
| def get_required_sm(frag, b1op=""): |
| if frag.ptx_type in ["f64", "bf16", "tf32"]: |
| return 80 |
| if frag.ptx_type in ["u4", "s4", "b1"]: |
| if b1op == ".and.popc": |
| return 80 |
| return 75 |
| if frag.ptx_type in ["s8", "u8"]: |
| return 72 |
| if frag.ptx_type == "s32": |
| if frag.geom in ["m8n8k32", "m8n8k128"]: # s4/u4/b1 |
| return 75 |
| else: # s8/u8 |
| return 72 |
| if frag.ptx_type in ["f16", "f32"]: |
| if frag.geom == "m16n16k8": |
| return 80 |
| else: |
| return 70 |
| assert False |
| |
| |
| def get_required_ptx(frag, b1op=""): |
| if frag.ptx_type == "b1" and b1op == ".and.popc": |
| return 71 |
| if frag.ptx_type in ["f64", "bf16", "tf32"]: |
| return 70 |
| if frag.ptx_type in ["f16", "f32"]: |
| if frag.geom == "m16n16k16": |
| return 60 |
| if frag.geom == "m16n16k8": |
| return 70 |
| return 61 |
| return 63 |
| |
| |
| def get_src_dst_prefix(frag): |
| if frag.ptx_type == "f32": |
| return "f" |
| if frag.ptx_type == "f64": |
| return "d" |
| if frag.ptx_type == "tf32" and frag.frag in ["c", "d"]: |
| return "f" |
| return "" |
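|
| # The returned prefix selects among the pointer arguments of the generated test
| # function below: "" -> int *src/*dst, "f" -> float *fsrc/*fdst,
| # "d" -> double *dsrc/*ddst.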
| |
| |
| def gen_wmma_ldst_tests(results): |
| load_template = """ |
| // CHECK${check_suffix}: call {{.*}} @${intrinsic} |
| // expected-error-re@+1 {{'${builtin}' needs target feature (sm_${min_sm}{{.*}},(ptx${min_ptx}{{.*}}}} |
| ${builtin}(${dst}, ${src}, ldm, ${blayout}); |
| """.rstrip() |
| intrinsic_template = ( |
| "llvm.nvvm.wmma.${geom}.${op}.${frag}.${ilayout}.stride.${itype}" |
| ) |
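|     # E.g. an f16 'a' fragment row-major load for m16n16k16 expands to
|     # llvm.nvvm.wmma.m16n16k16.load.a.row.stride.f16.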
| |
| for frag, layout in sorted(product(get_ldst_ops(), ["row", "col"]), key=str): |
| |
| if not is_ldst_variant_supported(frag, layout): |
| continue |
| |
| src_dst_prefix = get_src_dst_prefix(frag) |
| |
| min_sm = get_required_sm(frag) |
| min_ptx = get_required_ptx(frag) |
| # TF32 uses f32 for accumulator loads. |
| if frag.geom == "m16n16k8" and frag.frag == "c": |
| assert frag.ptx_type == "tf32" |
| itype = "f32" |
| else: |
| itype = frag.ptx_type |
| |
| params = { |
| "check_suffix": "_PTX%d_SM%d" % (min_ptx, min_sm), |
| "builtin": get_ldst_builtin_name(frag), |
| "min_ptx": min_ptx, |
| "min_sm": min_sm, |
| "dst": src_dst_prefix + "dst", |
| "src": src_dst_prefix + "src", |
| "blayout": 0 if layout == "row" else 1, |
| "intrinsic": Template(intrinsic_template).substitute( |
| { |
| "frag": frag.frag, |
| "geom": frag.geom, |
| "ilayout": layout, |
| "itype": itype, |
| "op": "store" if frag.frag == "d" else "load", |
| } |
| ), |
| } |
| results[(min_ptx, min_sm)] += Template(load_template).substitute(params) |
| |
| return results |
| |
| |
| def mma_signature(op): |
| if op.a.ptx_type == "f16": |
| # FP16 ops identified by accumulator & result type. |
| return "%s.%s" % (op.d.ptx_type, op.c.ptx_type) |
| else: |
| # other ops are identified by input type. |
| return op.a.ptx_type |
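|
| # E.g. an f16 op with f32 result and f16 accumulator has the signature
| # "f32.f16"; an s8 op is simply "s8".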
| |
| |
| # Get the numeric value for the rowcol parameter of the builtin.
| # AFAICT it uses the encoding accepted by NVVM intrinsics:
| # https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#nvvm-intrin-warp-level-matrix-mma |
| def get_ilayout(a, b): |
| return {"row.row": 0, "row.col": 1, "col.row": 2, "col.col": 3}[a + "." + b] |
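|
| # E.g. get_ilayout("row", "col") == 1.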
| |
| |
| def gen_wmma_mma_tests(results): |
| mma_template = """ |
| // CHECK${check_suffix}: call {{.*}} @${intrinsic} |
| // expected-error-re@+1 {{'${builtin}' needs target feature (sm_${min_sm}{{.*}},(ptx${min_ptx}{{.*}}}} |
| ${builtin}(${dst}, ${asrc}, ${asrc}, ${csrc}, ${ilayout}${maybe_satf}); |
| """.rstrip() |
| intrinsic_template = "llvm.nvvm.wmma.${geom}.mma${b1op}.${alayout}.${blayout}.${intrinsic_signature}${satf}" |
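|     # E.g. an f16 row/col MMA with f32 accumulator and result on m16n16k16
|     # expands to llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f32.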
| |
| for op, alayout, blayout, satf in sorted( |
| product(get_mma_ops(), ["row", "col"], ["row", "col"], [".satfinite", ""]), |
| key=str, |
| ): |
| |
| if not is_mma_variant_supported(op, alayout, blayout, satf): |
| continue |
| |
| asrc_prefix = get_src_dst_prefix(op.a) |
| csrc_prefix = get_src_dst_prefix(op.c) |
| ddst_prefix = get_src_dst_prefix(op.d) |
| if op.a.ptx_type == "b1": # .b1 MMA has no satf argument. |
| isatf_arg = "" |
| else: |
| isatf_arg = ", 1" if satf else ", 0" |
| min_sm = get_required_sm(op.a, op.b1op) |
| min_ptx = get_required_ptx(op.a, op.b1op) |
| params = { |
| "check_suffix": "_PTX%d_SM%d" % (min_ptx, min_sm), |
| "builtin": get_mma_builtin_name(op), |
| "min_ptx": min_ptx, |
| "min_sm": min_sm, |
| "dst": ddst_prefix + "dst", |
| "asrc": asrc_prefix + "src", |
| "csrc": csrc_prefix + "src", |
| "ilayout": get_ilayout(alayout, blayout), |
| "maybe_satf": isatf_arg, |
| "intrinsic": Template(intrinsic_template).substitute( |
| { |
| "geom": op.a.geom, |
| "alayout": alayout, |
| "blayout": blayout, |
| "intrinsic_signature": mma_signature(op), |
| "satf": satf, |
| "b1op": op.b1op, |
| } |
| ), |
| } |
| results[(min_ptx, min_sm)] += Template(mma_template).substitute(params) |
| |
| return results |
| |
| |
| def gen_tests(): |
| results = gen_wmma_ldst_tests(defaultdict(str)) |
| results = gen_wmma_mma_tests(results) |
| |
| run_template = r""" |
| // |
| // *** DO NOT EDIT *** |
| // |
| // This test has been automatically generated by |
| // builtins-nvptx-mma.py --ptx=${ptx} --gpu-arch=${sm}
| // |
| // Make sure we can handle all builtins available on sm_${sm} with PTX${ptx} |
| // ${run}: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_${sm} \ |
| // ${run}: -fcuda-is-device -target-feature +ptx${ptx} \ |
| // ${run}: -DPTX=${ptx} -DSM=${sm} \ |
| // ${run}: -S -emit-llvm -o - -x cuda %s \ |
| // ${run}: | FileCheck -check-prefixes=${check_labels} %s |
| // Verify that all builtins have correct constraints. |
| // ${run}: %clang_cc1 -triple nvptx-unknown-unknown \ |
| // ${run}: -target-cpu sm_60 -target-feature +ptx42 \ |
| // ${run}: -DPTX=${ptx} -DSM=${sm} -fcuda-is-device -S -o /dev/null -x cuda \ |
| // ${run}: -verify %s |
| """ |
| |
| def supported_variants(ptx, sm, results): |
| return [(ptx_, sm_) for ptx_, sm_ in results if ptx_ <= ptx and sm_ <= sm] |
| |
| print( |
| Template(run_template).substitute( |
| { |
| "run": "RUN", # To avoid lit misinterpreting the template |
| "ptx": ptx_version, |
| "sm": gpu_arch, |
| "check_labels": ",".join( |
| [ |
| "CHECK_PTX%d_SM%d" % (ptx_, sm_) |
| for ptx_, sm_ in supported_variants( |
| ptx_version, gpu_arch, results |
| ) |
| ] |
| ), |
| } |
| ) |
| ) |
| |
| print( |
| """ |
| #if !defined(CUDA_VERSION) |
| #define __device__ __attribute__((device)) |
| #define __global__ __attribute__((global)) |
| #define __shared__ __attribute__((shared)) |
| #define __constant__ __attribute__((constant)) |
| |
| typedef unsigned long long uint64_t; |
| #endif |
| |
| // CHECK-LABEL: test_wmma_builtins
| __device__ void test_wmma_builtins(int *src, int *dst,
| float *fsrc, float *fdst, |
| double *dsrc, double *ddst, int ldm) { |
| """ |
| ) |
| |
| for (ptx, sm), tests in sorted(results.items()): |
| print() |
| print("#if (PTX >= %d) && (SM >= %d)" % (ptx, sm)) |
| print(tests) |
| print("#endif // (PTX >= %d) && (SM >= %d)" % (ptx, sm)) |
| |
| print("}") |
| |
| |
| parser = argparse.ArgumentParser() |
| parser.add_argument("--ptx", type=int, default=60) |
| parser.add_argument("--gpu-arch", type=int, default=70) |
| args = parser.parse_args() |
| ptx_version = args.ptx |
| gpu_arch = args.gpu_arch |
| |
| gen_tests() |