blob: 238fe45bf45a0ed19bd450663d7e65e8946160a4 [file] [edit]
# For manual usage, not as a part of lit tests. Used for generating the following tests:
# atomicrmw-sm60.ll, atomicrmw-sm70.ll, atomicrmw-sm90.ll
from string import Template
from itertools import product
# (sm, ptx) pairs. One output file is generated per pair; ``sm`` fills
# ``-mcpu=sm_<sm>`` and ``ptx`` fills ``-mattr=+ptx<ptx>`` in the RUN lines.
TEST_SM_ARCH_PAIRS = [
    (60, 50),
    (70, 63),
    (90, 87),
]

# LLVM syncscope name -> PTX scope name (the PTX name is used in the generated
# function names).
SCOPE_LLVM_TO_PTX = {
    "": "sys",
    "block": "cta",
    "cluster": "cluster",
    "device": "gpu",
}

# Memory orderings that can be attached to a generated atomicrmw.
ORDERINGS = [
    "monotonic",
    "acquire",
    "release",
    "acq_rel",
    "seq_cst",
]

# atomicrmw operations generated for integer datatypes.
INTEGER_OPERATIONS = [
    "xchg", "add", "sub",
    "and", "nand", "or", "xor",
    "max", "min", "umax", "umin",
    "uinc_wrap", "udec_wrap",
    "usub_cond", "usub_sat",
]

# atomicrmw operations generated for floating-point datatypes.
FLOATING_POINT_OPERATIONS = [
    "fadd", "fsub",
    "fmin", "fmax",
    "fminimum", "fmaximum",
]

# Address space number -> name (the name is used in the generated function
# names).
ADDRSPACE_NUM_TO_ADDRSPACE = {
    0: "generic",
    1: "global",
    3: "shared",
}
# IR template for one test function wrapping a single ``atomicrmw``.
# Placeholders:
#   datatype       - LLVM type of the value operand and of the returned value
#   operation      - atomicrmw operation keyword
#   ordering       - memory ordering keyword
#   addrspace      - address space name; appears only in the function name
#   ptx_scope      - PTX scope name; appears only in the function name
#   llvm_scope     - name placed inside syncscope("...")
#   addrspace_cast - "" or " addrspace(N)". The value carries its own leading
#                    space, so it is appended directly to ``ptr`` in both the
#                    signature and the instruction (a separate space before it
#                    would emit "ptr  addrspace(N)" / "ptr  %addr").
atomicrmw_func = Template(
    """define ${datatype} @${operation}_${ordering}_${datatype}_${addrspace}_${ptx_scope}(ptr${addrspace_cast} %addr, ${datatype} %val) {
%retval = atomicrmw ${operation} ptr${addrspace_cast} %addr, ${datatype} %val syncscope("${llvm_scope}") ${ordering}
ret ${datatype} %retval
}
"""
)
# RUN lines written at the top of every generated file. The first pipes llc's
# output into FileCheck with the prefix SM<sm>; the second, guarded by a
# ``%if`` on the ptxas-sm_<sm> and ptxas-isa-<ptxfp> features, pipes the same
# llc invocation into %ptxas-verify.
# Placeholders: sm, ptx, ptxfp.
run_statement = Template(
    "; RUN: llc < %s -march=nvptx64 -mcpu=sm_${sm} -mattr=+ptx${ptx}"
    " | FileCheck %s --check-prefix=SM${sm}\n"
    "; RUN: %if ptxas-sm_${sm} && ptxas-isa-${ptxfp}"
    " %{ llc < %s -march=nvptx64 -mcpu=sm_${sm} -mattr=+ptx${ptx}"
    " | %ptxas-verify -arch=sm_${sm} %}\n"
)
def get_addrspace_cast(addrspace):
    """Return the text to append to ``ptr`` for address space *addrspace*.

    Address space 0 yields an empty string; any other value yields
    `` addrspace(<addrspace>)``, including the leading space.
    """
    return "" if addrspace == 0 else " addrspace(" + str(addrspace) + ")"
def _emit_atomicrmw(fp, operation, ordering, datatype, addrspace, llvm_scope):
    """Write one generated ``atomicrmw`` test function to *fp*.

    Args:
        fp: Writable text file object receiving the IR.
        operation: atomicrmw operation keyword (e.g. ``"add"``).
        ordering: Memory ordering keyword (e.g. ``"acq_rel"``).
        datatype: LLVM type of the operand (e.g. ``"i32"``).
        addrspace: Address space number; must be a key of
            ``ADDRSPACE_NUM_TO_ADDRSPACE``.
        llvm_scope: LLVM syncscope name; must be a key of
            ``SCOPE_LLVM_TO_PTX``.

    Raises:
        KeyError: If *addrspace* or *llvm_scope* is not a known key.
    """
    # print() appends a newline after the template's own trailing newline, so
    # consecutive functions are separated by a blank line.
    print(
        atomicrmw_func.substitute(
            operation=operation,
            ordering=ordering,
            datatype=datatype,
            addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace],
            ptx_scope=SCOPE_LLVM_TO_PTX[llvm_scope],
            llvm_scope=llvm_scope,
            addrspace_cast=get_addrspace_cast(addrspace),
        ),
        file=fp,
    )


if __name__ == "__main__":
    # One output file per (sm, ptx) pair, written to the current directory.
    for sm, ptx in TEST_SM_ARCH_PAIRS:
        with open("atomicrmw-sm{}.ll".format(sm), "w") as fp:
            # RUN lines come first. ptxfp is the ptx number divided by ten
            # (e.g. 63 -> 6.3), used in the ptxas-isa-<ptxfp> feature name.
            print(run_statement.substitute(sm=sm, ptx=ptx, ptxfp=ptx / 10.0), file=fp)

            # Slice 1: Keep addrspace, llvm_scope, ordering fixed, generate all
            # possible operations and sizes
            addrspace, llvm_scope, ordering = 1, "block", "acq_rel"

            # Integer operations
            for operation, datatype in product(
                INTEGER_OPERATIONS, ["i8", "i16", "i32", "i64"]
            ):
                _emit_atomicrmw(
                    fp, operation, ordering, datatype, addrspace, llvm_scope
                )

            # Floating point operations (datatype-major order)
            for datatype, operation in product(
                ["float", "double", "half", "bfloat"], FLOATING_POINT_OPERATIONS
            ):
                _emit_atomicrmw(
                    fp, operation, ordering, datatype, addrspace, llvm_scope
                )

            # Slice 2: Keep addrspace, llvm_scope fixed, and generate all
            # possible orderings for operations add and nand.
            # add is natively supported for larger bitwidths, while nand is
            # emulated always
            addrspace, llvm_scope = 1, "block"
            for operation, datatype, ordering in product(
                ["add", "nand"], ["i8", "i32"], ORDERINGS
            ):
                # addrspace and llvm_scope match Slice 1 here, so the acq_rel
                # variants were already emitted there; emitting them again
                # would define the same function name twice.
                if ordering == "acq_rel":
                    continue
                _emit_atomicrmw(
                    fp, operation, ordering, datatype, addrspace, llvm_scope
                )