mlir/lib/Dialect/GPU/Transforms/SubgroupIdRewriter.cpp - llvm-project - Git at Google

 //===- SubgroupIdRewriter.cpp - Implementation of SubgroupId rewriting ----===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 // This file implements in-dialect rewriting of the gpu.subgroup_id op for archs
 // where:
 // subgroup_id = (tid.x + dim.x * (tid.y + dim.y * tid.z)) / subgroup_size
 //
 //===----------------------------------------------------------------------===//

 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/GPU/Transforms/Passes.h"
 #include "mlir/Dialect/Index/IR/IndexOps.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/PatternMatch.h"

 using namespace mlir;

 namespace {
 struct GpuSubgroupIdRewriter final : OpRewritePattern<gpu::SubgroupIdOp> {
   using OpRewritePattern<gpu::SubgroupIdOp>::OpRewritePattern;

   LogicalResult matchAndRewrite(gpu::SubgroupIdOp op,
                                 PatternRewriter &rewriter) const override {
     // Calculation of the thread's subgroup identifier.
     //
     // The process involves mapping the thread's 3D identifier within its
     // block (b_id.x, b_id.y, b_id.z) to a 1D linear index.
     // This linearization assumes a layout where the x-dimension (w_dim.x)
     // varies most rapidly (i.e., it is the innermost dimension).
     //
     // The formula for the linearized thread index is:
     // L = tid.x + dim.x * (tid.y + (dim.y * tid.z))
     //
     // Subsequently, the range of linearized indices [0, N_threads-1] is
     // divided into consecutive, non-overlapping segments, each representing
     // a subgroup of size 'subgroup_size'.
     //
     // Example Partitioning (N = subgroup_size):
     // | Subgroup 0      | Subgroup 1      | Subgroup 2      | ... |
     // | Indices 0..N-1  | Indices N..2N-1 | Indices 2N..3N-1| ... |
     //
     // The subgroup identifier is obtained via integer division of the
     // linearized thread index by the predefined 'subgroup_size'.
     //
     // subgroup_id = floor( L / subgroup_size )
     //             = (tid.x + dim.x * (tid.y + dim.y * tid.z)) /
     //             subgroup_size

     Location loc = op->getLoc();
     Type indexType = rewriter.getIndexType();

     Value dimX = gpu::BlockDimOp::create(rewriter, loc, gpu::Dimension::x);
     Value dimY = gpu::BlockDimOp::create(rewriter, loc, gpu::Dimension::y);
     Value tidX = gpu::ThreadIdOp::create(rewriter, loc, gpu::Dimension::x);
     Value tidY = gpu::ThreadIdOp::create(rewriter, loc, gpu::Dimension::y);
     Value tidZ = gpu::ThreadIdOp::create(rewriter, loc, gpu::Dimension::z);

     Value dimYxIdZ =
         arith::MulIOp::create(rewriter, loc, indexType, dimY, tidZ);
     Value dimYxIdZPlusIdY =
         arith::AddIOp::create(rewriter, loc, indexType, dimYxIdZ, tidY);
     Value dimYxIdZPlusIdYTimesDimX =
         arith::MulIOp::create(rewriter, loc, indexType, dimX, dimYxIdZPlusIdY);
     Value IdXPlusDimYxIdZPlusIdYTimesDimX = arith::AddIOp::create(
         rewriter, loc, indexType, tidX, dimYxIdZPlusIdYTimesDimX);
     Value subgroupSize = gpu::SubgroupSizeOp::create(
         rewriter, loc, rewriter.getIndexType(), /*upper_bound = */ nullptr);
     Value subgroupIdOp =
         arith::DivUIOp::create(rewriter, loc, indexType,
                                IdXPlusDimYxIdZPlusIdYTimesDimX, subgroupSize);
     rewriter.replaceOp(op, {subgroupIdOp});
     return success();
   }
 };

 } // namespace

 void mlir::populateGpuSubgroupIdPatterns(RewritePatternSet &patterns) {
   patterns.add<GpuSubgroupIdRewriter>(patterns.getContext());
 }
	//===- SubgroupIdRewriter.cpp - Implementation of SubgroupId rewriting ----===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements in-dialect rewriting of the gpu.subgroup_id op for archs
	// where:
	// subgroup_id = (tid.x + dim.x * (tid.y + dim.y * tid.z)) / subgroup_size
	//
	//===----------------------------------------------------------------------===//

	#include "mlir/Dialect/GPU/IR/GPUDialect.h"
	#include "mlir/Dialect/GPU/Transforms/Passes.h"
	#include "mlir/Dialect/Index/IR/IndexOps.h"
	#include "mlir/IR/Builders.h"
	#include "mlir/IR/PatternMatch.h"

	using namespace mlir;

	namespace {
	struct GpuSubgroupIdRewriter final : OpRewritePattern<gpu::SubgroupIdOp> {
	using OpRewritePattern<gpu::SubgroupIdOp>::OpRewritePattern;

	LogicalResult matchAndRewrite(gpu::SubgroupIdOp op,
	PatternRewriter &rewriter) const override {
	// Calculation of the thread's subgroup identifier.
	//
	// The process involves mapping the thread's 3D identifier within its
	// block (b_id.x, b_id.y, b_id.z) to a 1D linear index.
	// This linearization assumes a layout where the x-dimension (w_dim.x)
	// varies most rapidly (i.e., it is the innermost dimension).
	//
	// The formula for the linearized thread index is:
	// L = tid.x + dim.x * (tid.y + (dim.y * tid.z))
	//
	// Subsequently, the range of linearized indices [0, N_threads-1] is
	// divided into consecutive, non-overlapping segments, each representing
	// a subgroup of size 'subgroup_size'.
	//
	// Example Partitioning (N = subgroup_size):
	// \| Subgroup 0 \| Subgroup 1 \| Subgroup 2 \| ... \|
	// \| Indices 0..N-1 \| Indices N..2N-1 \| Indices 2N..3N-1\| ... \|
	//
	// The subgroup identifier is obtained via integer division of the
	// linearized thread index by the predefined 'subgroup_size'.
	//
	// subgroup_id = floor( L / subgroup_size )
	// = (tid.x + dim.x * (tid.y + dim.y * tid.z)) /
	// subgroup_size

	Location loc = op->getLoc();
	Type indexType = rewriter.getIndexType();

	Value dimX = gpu::BlockDimOp::create(rewriter, loc, gpu::Dimension::x);
	Value dimY = gpu::BlockDimOp::create(rewriter, loc, gpu::Dimension::y);
	Value tidX = gpu::ThreadIdOp::create(rewriter, loc, gpu::Dimension::x);
	Value tidY = gpu::ThreadIdOp::create(rewriter, loc, gpu::Dimension::y);
	Value tidZ = gpu::ThreadIdOp::create(rewriter, loc, gpu::Dimension::z);

	Value dimYxIdZ =
	arith::MulIOp::create(rewriter, loc, indexType, dimY, tidZ);
	Value dimYxIdZPlusIdY =
	arith::AddIOp::create(rewriter, loc, indexType, dimYxIdZ, tidY);
	Value dimYxIdZPlusIdYTimesDimX =
	arith::MulIOp::create(rewriter, loc, indexType, dimX, dimYxIdZPlusIdY);
	Value IdXPlusDimYxIdZPlusIdYTimesDimX = arith::AddIOp::create(
	rewriter, loc, indexType, tidX, dimYxIdZPlusIdYTimesDimX);
	Value subgroupSize = gpu::SubgroupSizeOp::create(
	rewriter, loc, rewriter.getIndexType(), /upper_bound = / nullptr);
	Value subgroupIdOp =
	arith::DivUIOp::create(rewriter, loc, indexType,
	IdXPlusDimYxIdZPlusIdYTimesDimX, subgroupSize);
	rewriter.replaceOp(op, {subgroupIdOp});
	return success();
	}
	};

	} // namespace

	void mlir::populateGpuSubgroupIdPatterns(RewritePatternSet &patterns) {
	patterns.add<GpuSubgroupIdRewriter>(patterns.getContext());
	}