# For manual usage, not as a part of lit tests. Used for generating the following tests:
# atomicrmw-sm60.ll, atomicrmw-sm70.ll, atomicrmw-sm90.ll
| |
| from string import Template |
| from itertools import product |
| |
# (SM architecture, PTX ISA version) pairs. One output file is generated per
# pair; the values are substituted into -mcpu=sm_<sm> and -mattr=+ptx<ptx>.
TEST_SM_ARCH_PAIRS = [
    (60, 50),
    (70, 63),
    (90, 87),
]

# LLVM syncscope name -> PTX scope name (the PTX name is only used to build
# the generated function's name).
SCOPE_LLVM_TO_PTX = {
    "": "sys",
    "block": "cta",
    "cluster": "cluster",
    "device": "gpu",
}

# Memory orderings iterated over when generating the ordering slice.
ORDERINGS = [
    "monotonic",
    "acquire",
    "release",
    "acq_rel",
    "seq_cst",
]

# atomicrmw operations generated for the integer types.
INTEGER_OPERATIONS = [
    "xchg",
    "add", "sub",
    "and", "nand", "or", "xor",
    "max", "min", "umax", "umin",
    "uinc_wrap", "udec_wrap",
    "usub_cond", "usub_sat",
]

# atomicrmw operations generated for the floating-point types.
FLOATING_POINT_OPERATIONS = [
    "fadd",
    "fsub",
    "fmin",
    "fmax",
    "fminimum",
    "fmaximum",
]

# Numeric address space -> name used in the generated function's name.
ADDRSPACE_NUM_TO_ADDRSPACE = {
    0: "generic",
    1: "global",
    3: "shared",
}
| |
# LLVM IR for one generated test function: it performs a single `atomicrmw`
# on %addr with %val and returns the instruction's result.
# Placeholders:
#   operation, ordering, datatype - atomicrmw operation, memory ordering, value type
#   addrspace, ptx_scope          - used only to build a unique function name
#   addrspace_cast                - "" or " addrspace(N)"; the leading space is part
#                                   of the value, so it is appended directly to `ptr`
#                                   (writing `ptr ${addrspace_cast}` would emit a
#                                   double space)
#   llvm_scope                    - name placed inside syncscope("...")
atomicrmw_func = Template(
    """define ${datatype} @${operation}_${ordering}_${datatype}_${addrspace}_${ptx_scope}(ptr${addrspace_cast} %addr, ${datatype} %val) {
  %retval = atomicrmw ${operation} ptr${addrspace_cast} %addr, ${datatype} %val syncscope("${llvm_scope}") ${ordering}
  ret ${datatype} %retval
}
"""
)
| |
# lit RUN lines placed at the top of each generated file. The first pipes llc
# output for the given SM/PTX pair into FileCheck with prefix SM<sm>; the
# second is wrapped in a lit %if on the ptxas-sm_<sm> and ptxas-isa-<ptxfp>
# features and pipes the same llc output into %ptxas-verify.
# Placeholders: sm, ptx (e.g. 63) and ptxfp (e.g. 6.3).
run_statement = Template(
    "; RUN: llc < %s -march=nvptx64 -mcpu=sm_${sm} -mattr=+ptx${ptx} | FileCheck %s --check-prefix=SM${sm}\n"
    "; RUN: %if ptxas-sm_${sm} && ptxas-isa-${ptxfp} %{ llc < %s -march=nvptx64 -mcpu=sm_${sm} -mattr=+ptx${ptx} | %ptxas-verify -arch=sm_${sm} %}\n"
)
| |
| |
def get_addrspace_cast(addrspace):
    """Return the text to append to ``ptr`` for the given address space.

    Address space 0 needs no qualifier, so the result is an empty string.
    Any other value yields `` addrspace(N)``; the leading space is part of
    the returned string.
    """
    if addrspace == 0:
        return ""
    return " addrspace(" + str(addrspace) + ")"
| |
| |
def _emit_atomicrmw(fp, operation, datatype, ordering, addrspace, llvm_scope):
    """Write one generated ``atomicrmw`` test function to the open file *fp*.

    *addrspace* is the numeric address space and *llvm_scope* the LLVM
    syncscope name; both are also translated through the module-level lookup
    tables to build the function's name.
    """
    print(
        atomicrmw_func.substitute(
            operation=operation,
            ordering=ordering,
            datatype=datatype,
            addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace],
            ptx_scope=SCOPE_LLVM_TO_PTX[llvm_scope],
            llvm_scope=llvm_scope,
            addrspace_cast=get_addrspace_cast(addrspace),
        ),
        file=fp,
    )


if __name__ == "__main__":
    # One output file per (SM, PTX) pair, written to the current directory.
    for sm, ptx in TEST_SM_ARCH_PAIRS:
        with open("atomicrmw-sm{}.ll".format(sm), "w") as fp:
            # ptxfp is the dotted form of the PTX version, e.g. 63 -> 6.3.
            print(run_statement.substitute(sm=sm, ptx=ptx, ptxfp=ptx / 10.0), file=fp)

            # Slice 1: Keep addrspace, llvm_scope, ordering fixed, generate all
            # possible operations and sizes.
            slice1_fixed = (1, "block", "acq_rel")
            addrspace, llvm_scope, ordering = slice1_fixed

            # Integer operations (operation-major order).
            for operation, datatype in product(
                INTEGER_OPERATIONS, ["i8", "i16", "i32", "i64"]
            ):
                _emit_atomicrmw(fp, operation, datatype, ordering, addrspace, llvm_scope)

            # Floating point operations (datatype-major order).
            for datatype, operation in product(
                ["float", "double", "half", "bfloat"], FLOATING_POINT_OPERATIONS
            ):
                _emit_atomicrmw(fp, operation, datatype, ordering, addrspace, llvm_scope)

            # Slice 2: Keep addrspace, llvm_scope fixed, and generate all possible
            # orderings for operations add and nand.
            # add is natively supported for larger bitwidths, while nand is
            # emulated always.
            addrspace, llvm_scope = 1, "block"
            for operation, datatype, ordering in product(
                ["add", "nand"], ["i8", "i32"], ORDERINGS
            ):
                if (addrspace, llvm_scope, ordering) == slice1_fixed:
                    # These are a part of Slice 1; emitting them again would
                    # define the same function name twice.
                    continue
                _emit_atomicrmw(fp, operation, datatype, ordering, addrspace, llvm_scope)