mlir/test/Examples/NVGPU/Ch0.py - llvm-project - Git at Google

 # RUN: env SUPPORT_LIB=%mlir_cuda_runtime \
 # RUN: sh -c 'if [[ "%mlir_run_cuda_sm90_tests" == "1" ]]; \
 # RUN: then %PYTHON %s | FileCheck %s; \
 # RUN: else export MLIR_NVDSL_PRINT_IR=1; \
 # RUN: %PYTHON %s | FileCheck %s --check-prefix=DUMPIR; fi'


 # ===----------------------------------------------------------------------===//
 #  Chapter 0 : Hello World
 # ===----------------------------------------------------------------------===//
 #
 # This program demonstrates Hello World:
 #   1. Build MLIR function with arguments
 #   2. Build MLIR GPU kernel
 #   3. Print from a GPU thread
 #   4. Pass arguments, JIT compile and run the MLIR function
 #
 # ===----------------------------------------------------------------------===//


 from mlir.dialects import gpu
 from tools.nvdsl import *


 # 1. The decorator generates an MLIR func.func.
 # Everything inside the Python function becomes the body of the func.
 # The decorator also translates `alpha` to an `index` type, so the emitted
 # signature is `func.func @main(%arg0: index)` (see the DUMPIR checks below).
 @NVDSL.mlir_func
 def main(alpha):
     # 2. The decorator generates an MLIR gpu.launch.
     # Everything inside the Python function becomes the body of the gpu.launch.
     # This allows for late outlining of the GPU kernel, enabling optimizations
     # like constant folding from host to device.
     # The launch uses a 1x1x1 grid of blocks with 4x1x1 threads per block,
     # i.e. four GPU threads in total.
     @NVDSL.mlir_gpu_launch(grid=(1, 1, 1), block=(4, 1, 1))
     def kernel():
         # Thread index along the x dimension (emitted as `gpu.thread_id x`).
         tidx = gpu.thread_id(gpu.Dimension.x)
         # + operator generates arith.addi
         # `alpha` is captured from the enclosing function; the operand order
         # (alpha first, then tidx) is what the DUMPIR check expects.
         myValue = alpha + tidx
         # Print from a GPU thread
         # Each of the four threads prints its own id and `alpha + id`.
         gpu.printf("GPU thread %llu has %llu\n", tidx, myValue)

     # 3. Call the GPU kernel
     # Calling the decorated function is what places the gpu.launch inside the
     # body of `main`.
     kernel()


 # Host-side value passed to `main`; every GPU thread prints its thread id
 # plus this value (100..103 for threads 0..3, see the CHECK lines below).
 alpha = 100
 # 4. The `mlir_func` decorator JIT compiles the IR and executes the MLIR function.
 # NOTE(review): when MLIR_NVDSL_PRINT_IR=1 is set (see the RUN lines) the test
 # checks the printed IR via the DUMPIR prefix instead — presumably the IR is
 # dumped rather than executed; confirm against tools/nvdsl.
 main(alpha)

 # CHECK: GPU thread 0 has 100
 # CHECK: GPU thread 1 has 101
 # CHECK: GPU thread 2 has 102
 # CHECK: GPU thread 3 has 103

 # DUMPIR:   func.func @main(%arg0: index) attributes {llvm.emit_c_interface} {
 # DUMPIR:     %[[C0_I32:.*]] = arith.constant 0 : i32
 # DUMPIR:     %[[C1:.*]] = arith.constant 1 : index
 # DUMPIR:     %[[C1_0:.*]] = arith.constant 1 : index
 # DUMPIR:     %[[C1_1:.*]] = arith.constant 1 : index
 # DUMPIR:     %[[C4:.*]] = arith.constant 4 : index
 # DUMPIR:     %[[C1_2:.*]] = arith.constant 1 : index
 # DUMPIR:     %[[C1_3:.*]] = arith.constant 1 : index
 # DUMPIR:     gpu.launch blocks(%arg1, %arg2, %arg3) in (%arg7 = %[[C1]], %arg8 = %[[C1_0]], %arg9 = %[[C1_1]]) threads(%arg4, %arg5, %arg6) in (%arg10 = %[[C4]], %arg11 = %[[C1_2]], %arg12 = %[[C1_3]]) dynamic_shared_memory_size %[[C0_I32]] {
 # DUMPIR:       %[[TIDX:.*]] = gpu.thread_id  x
 # DUMPIR:       %[[MYVAL:.*]] = arith.addi %arg0, %[[TIDX]] : index
 # DUMPIR:       gpu.printf "GPU thread %llu has %llu\0A", %[[TIDX]], %[[MYVAL]] : index, index
 # DUMPIR:       gpu.terminator
 # DUMPIR:     }
 # DUMPIR:     return
 # DUMPIR:   }
	# RUN: env SUPPORT_LIB=%mlir_cuda_runtime \
	# RUN: sh -c 'if [[ "%mlir_run_cuda_sm90_tests" == "1" ]]; \
	# RUN: then %PYTHON %s | FileCheck %s; \
	# RUN: else export MLIR_NVDSL_PRINT_IR=1; \
	# RUN: %PYTHON %s | FileCheck %s --check-prefix=DUMPIR; fi'


	# ===----------------------------------------------------------------------===//
	# Chapter 0 : Hello World
	# ===----------------------------------------------------------------------===//
	#
	# This program demonstrates Hello World:
	# 1. Build MLIR function with arguments
	# 2. Build MLIR GPU kernel
	# 3. Print from a GPU thread
	# 4. Pass arguments, JIT compile and run the MLIR function
	#
	# ===----------------------------------------------------------------------===//


	from mlir.dialects import gpu
	from tools.nvdsl import *


	# 1. The decorator generates a MLIR func.func.
	# Everything inside the Python function becomes the body of the func.
	# The decorator also translates `alpha` to an `index` type.
	@NVDSL.mlir_func
	def main(alpha):
	    # 2. The decorator generates an MLIR gpu.launch.
	    # Everything inside the Python function becomes the body of the gpu.launch.
	    # This allows for late outlining of the GPU kernel, enabling optimizations
	    # like constant folding from host to device.
	    # The launch uses a 1x1x1 grid of blocks with 4x1x1 threads per block,
	    # i.e. four GPU threads in total.
	    @NVDSL.mlir_gpu_launch(grid=(1, 1, 1), block=(4, 1, 1))
	    def kernel():
	        # Thread index along the x dimension (emitted as `gpu.thread_id x`).
	        tidx = gpu.thread_id(gpu.Dimension.x)
	        # + operator generates arith.addi
	        # `alpha` is captured from the enclosing function; keep the operand
	        # order (alpha first, then tidx) because the DUMPIR check pins it.
	        myValue = alpha + tidx
	        # Print from a GPU thread
	        gpu.printf("GPU thread %llu has %llu\n", tidx, myValue)

	    # 3. Call the GPU kernel
	    # Calling the decorated function is what places the gpu.launch inside
	    # the body of `main`.
	    kernel()


	# Host-side value passed to `main`; every GPU thread prints its thread id
	# plus this value (100..103 for threads 0..3, see the CHECK lines below).
	alpha = 100
	# 4. The `mlir_func` decorator JIT compiles the IR and executes the MLIR function.
	main(alpha)

	# CHECK: GPU thread 0 has 100
	# CHECK: GPU thread 1 has 101
	# CHECK: GPU thread 2 has 102
	# CHECK: GPU thread 3 has 103

	# DUMPIR: func.func @main(%arg0: index) attributes {llvm.emit_c_interface} {
	# DUMPIR: %[[C0_I32:.*]] = arith.constant 0 : i32
	# DUMPIR: %[[C1:.*]] = arith.constant 1 : index
	# DUMPIR: %[[C1_0:.*]] = arith.constant 1 : index
	# DUMPIR: %[[C1_1:.*]] = arith.constant 1 : index
	# DUMPIR: %[[C4:.*]] = arith.constant 4 : index
	# DUMPIR: %[[C1_2:.*]] = arith.constant 1 : index
	# DUMPIR: %[[C1_3:.*]] = arith.constant 1 : index
	# DUMPIR: gpu.launch blocks(%arg1, %arg2, %arg3) in (%arg7 = %[[C1]], %arg8 = %[[C1_0]], %arg9 = %[[C1_1]]) threads(%arg4, %arg5, %arg6) in (%arg10 = %[[C4]], %arg11 = %[[C1_2]], %arg12 = %[[C1_3]]) dynamic_shared_memory_size %[[C0_I32]] {
	# DUMPIR: %[[TIDX:.*]] = gpu.thread_id x
	# DUMPIR: %[[MYVAL:.*]] = arith.addi %arg0, %[[TIDX]] : index
	# DUMPIR: gpu.printf "GPU thread %llu has %llu\0A", %[[TIDX]], %[[MYVAL]] : index, index
	# DUMPIR: gpu.terminator
	# DUMPIR: }
	# DUMPIR: return
	# DUMPIR: }