//===- XeGPUSubgroupDistribute.cpp - XeGPU Subgroup Distribute Pass -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Utils/DistributionUtils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/Vector/Transforms/VectorDistribution.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h"
#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Operation.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/TypeRange.h"
#include "mlir/IR/Value.h"
#include "mlir/IR/Visitors.h"
#include "mlir/Interfaces/FunctionInterfaces.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/InliningUtils.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
namespace mlir {
namespace xegpu {
#define GEN_PASS_DEF_XEGPUSUBGROUPDISTRIBUTE
#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
} // namespace xegpu
} // namespace mlir
#define DEBUG_TYPE "xegpu-subgroup-distribute"
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
using namespace mlir;
static const char *const resolveSIMTTypeMismatch =
"resolve_simt_type_mismatch"; // Attribute name for identifying
                                  // UnrealizedConversionCastOp added to
                                  // resolve SIMT type mismatches.
namespace {
//===----------------------------------------------------------------------===//
// SIMT Distribution Patterns
//===----------------------------------------------------------------------===//
/// In certain cases, we may need to favor XeGPU specific distribution patterns
/// over generic vector distribution patterns. In such cases, we can assign
/// priorities to patterns.
static constexpr unsigned regularPatternBenefit = 1;
static constexpr unsigned highPatternBenefit = 2;
/// Helper function to get distributed vector type for a source vector type
/// according to the lane_layout. We simply divide each dimension of the
/// original vector shape by the corresponding lane_layout dimension. If
/// array_length > 1, it is appended to the front of the distributed shape.
/// NOTE: This is the vector type that will be returned by the
/// gpu.warp_execute_on_lane0 op.
///
/// Examples:
/// | original vector shape | lane_layout | distributed vector shape |
/// |-----------------------|-------------|--------------------------|
/// | 32x16 | [1, 16] | 32x1 |
/// | 32x16 | [2, 8] | 16x2 |
/// | 2x32x16 | [1, 16] | 2x32x1 |
static FailureOr<VectorType>
getDistVecTypeBasedOnLaneLayout(xegpu::DistributeLayoutAttr layout,
VectorType originalType) {
if (!layout)
return failure();
assert((isa<xegpu::LayoutAttr>(layout) || isa<xegpu::SliceAttr>(layout)) &&
"Expecting a valid layout.");
SmallVector<int64_t> effectiveLaneLayout =
layout.getEffectiveLaneLayoutAsInt();
assert(static_cast<size_t>(originalType.getRank()) >=
effectiveLaneLayout.size() &&
"Rank of the original vector type should be greater or equal to the "
"size of the lane layout to distribute the vector type.");
SmallVector<int64_t> distributedShape(originalType.getShape());
// Only distribute the last `laneLayout.size()` dimensions. The remaining
// dimensions are not distributed.
unsigned distributionStart =
originalType.getRank() - effectiveLaneLayout.size();
for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
if (i < distributionStart)
continue;
// Check if the dimension can be distributed evenly.
if (dim % effectiveLaneLayout[i - distributionStart] != 0)
return failure();
distributedShape[i] = dim / effectiveLaneLayout[i - distributionStart];
}
return VectorType::get(distributedShape, originalType.getElementType());
}
/// Helper function to resolve types if the distributed type out of
/// gpu.warp_execute_on_lane0 is different from the expected xegpu SIMT type.
/// Example 1:
/// distributed type: vector<8x1xf32>
/// expected type: vector<8xf32>
/// resolved using,
/// %0 = vector.shape_cast %1 : vector<8x1xf32> to vector<8xf32>
/// Example 2:
/// distributed type: xegpu.tensor_desc<8x16xf32, #xegpu.layout<...>>
/// expected type: xegpu.tensor_desc<8x16xf32>
/// resolved using,
/// %0 = unrealized_conversion_cast %1 :
/// xegpu.tensor_desc<8x16xf32, #xegpu.layout<..>> ->
/// xegpu.tensor_desc<8x16xf32>
template <typename T>
static Value resolveDistributedTy(Value orig, T expected,
PatternRewriter &rewriter) {
// If orig and expected types are the same, return orig.
if (orig.getType() == expected)
return orig;
// If orig is a vector type, create a shape cast op to reconcile the types.
if (isa<VectorType>(orig.getType())) {
auto castOp =
vector::ShapeCastOp::create(rewriter, orig.getLoc(), expected, orig);
return castOp.getResult();
}
// If orig is a tensor descriptor type, create an unrealized conversion cast
// op to reconcile the types.
if (isa<xegpu::TensorDescType>(orig.getType())) {
auto castOp = UnrealizedConversionCastOp::create(rewriter, orig.getLoc(),
expected, orig);
castOp->setAttr(resolveSIMTTypeMismatch, rewriter.getUnitAttr());
return castOp.getResult(0);
}
llvm_unreachable("Unsupported type for reconciliation");
return orig;
}
/// Helper function to check if the layout is packed. Layout is packed if it is
/// 2D and lane_data[0] != 1 (data packed from col dimension).
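/// For example, a layout with lane_data = [2, 1] is packed, while a layout
/// with lane_data = [1, 1] is not.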
static bool hasPackedLayout(xegpu::LayoutAttr layout) {
if (layout == xegpu::LayoutAttr())
return false;
DenseI32ArrayAttr laneData = layout.getLaneData();
if (!laneData || laneData.size() != 2)
return false;
return laneData.asArrayRef()[0] != 1;
}
/// Given a GPUFuncOp, this pattern creates a new GPUFuncOp and moves the body
/// of the original GPUFuncOp to the new GPUFuncOp such that the entire body is
/// contained within a WarpExecuteOnLane0Op.
/// Example:
///
/// ```
/// gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> {
/// ...
/// ...
/// gpu.return %result: vector<8x16xf32>
/// }
/// ```
/// To
/// ```
/// gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> {
/// %laneid = gpu.lane_id : index
/// %0 = gpu.warp_execute_on_lane_0(%laneid) -> vector<8x16xf32> {
/// ...
/// ...
/// gpu.yield %result: vector<8x16xf32>
/// }
/// gpu.return %0
/// }
/// ```
struct MoveFuncBodyToWarpExecuteOnLane0
: public OpRewritePattern<gpu::GPUFuncOp> {
using OpRewritePattern<gpu::GPUFuncOp>::OpRewritePattern;
LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp,
PatternRewriter &rewriter) const override {
// If the function only contains a single void return, skip.
if (llvm::all_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
return isa<gpu::ReturnOp>(op) && !op.getNumOperands();
}))
return failure();
// If the function already moved inside a warp_execute_on_lane0, skip.
if (llvm::any_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
return isa<gpu::WarpExecuteOnLane0Op>(op);
}))
return failure();
// Create a new function with the same signature and same attributes.
SmallVector<Type> workgroupAttributionsTypes =
llvm::map_to_vector(gpuFuncOp.getWorkgroupAttributions(),
[](BlockArgument arg) { return arg.getType(); });
SmallVector<Type> privateAttributionsTypes =
llvm::map_to_vector(gpuFuncOp.getPrivateAttributions(),
[](BlockArgument arg) { return arg.getType(); });
auto newGpuFunc = gpu::GPUFuncOp::create(
rewriter, gpuFuncOp.getLoc(), gpuFuncOp.getName(),
gpuFuncOp.getFunctionType(), workgroupAttributionsTypes,
privateAttributionsTypes);
newGpuFunc->setAttrs(gpuFuncOp->getAttrs());
// Create a WarpExecuteOnLane0Op with same arguments and results as the
// original gpuFuncOp.
rewriter.setInsertionPointToEnd(&newGpuFunc.getFunctionBody().front());
auto laneId = gpu::LaneIdOp::create(
rewriter, newGpuFunc.getLoc(), rewriter.getIndexType(),
        /*upperBound=*/mlir::IntegerAttr());
ArrayRef<Type> gpuFuncResultType = gpuFuncOp.getFunctionType().getResults();
auto warpOp = gpu::WarpExecuteOnLane0Op::create(
rewriter, laneId.getLoc(), gpuFuncResultType, laneId,
xegpu::targetinfo::subgroupSize, newGpuFunc.getArguments(),
newGpuFunc.getArgumentTypes());
Block &warpBodyBlock = warpOp.getBodyRegion().front();
// Replace the ReturnOp of the original gpu function with a YieldOp.
    auto origReturnOp =
        cast<gpu::ReturnOp>(gpuFuncOp.getBlocks().back().getTerminator());
    rewriter.setInsertionPointAfter(origReturnOp);
    gpu::YieldOp::create(rewriter, origReturnOp.getLoc(),
                         origReturnOp.getOperands());
    rewriter.eraseOp(origReturnOp);
// Move the original function body to the WarpExecuteOnLane0Op body.
rewriter.inlineRegionBefore(gpuFuncOp.getBody(), warpOp.getBodyRegion(),
warpOp.getBodyRegion().begin());
rewriter.eraseBlock(&warpBodyBlock);
// Insert a new ReturnOp after the WarpExecuteOnLane0Op.
rewriter.setInsertionPointAfter(warpOp);
gpu::ReturnOp::create(rewriter, newGpuFunc.getLoc(), warpOp.getResults());
rewriter.replaceOp(gpuFuncOp, newGpuFunc);
return success();
}
};
/// Distribute a create_nd_tdesc feeding into the gpu.yield op of the enclosing
/// `gpu.warp_execute_on_lane_0` region. After the sinking, the warp op will
/// still contain the original op that will not be used by the yield op (and
/// should be cleaned up later). The yield op will bypass the create_nd_tdesc's
/// arguments. Tensor descriptor shape is not distributed because it is a
/// uniform value across all work items within the subgroup. However, the
/// layout information is dropped in the new tensor descriptor type.
///
/// Example:
///
/// ```
/// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
/// (!xegpu.tensor_desc<4x8xf32, #layout0>) {
/// ...
/// %td = xegpu.create_nd_tdesc %arg0[0, 0]
/// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #layout0>
/// gpu.yield %td
/// }
/// ```
/// To
/// ```
/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (...) {
/// ...
/// %dead = xegpu.create_nd_tdesc %arg0[0, 0]
/// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #layout0>
/// gpu.yield %arg0, %dead
/// }
/// %td = xegpu.create_nd_tdesc %r#0[0, 0]: memref<4x8xf32>
/// -> !xegpu.tensor_desc<4x8xf32>
///
/// ```
struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
PatternRewriter &rewriter) const override {
OpOperand *operand =
getWarpResult(warpOp, llvm::IsaPred<xegpu::CreateNdDescOp>);
if (!operand)
return rewriter.notifyMatchFailure(
warpOp, "warp result is not a xegpu::CreateNdDesc op");
auto descOp = operand->get().getDefiningOp<xegpu::CreateNdDescOp>();
unsigned operandIdx = operand->getOperandNumber();
xegpu::LayoutAttr layout = descOp.getType().getLayoutAttr();
if (!layout)
return rewriter.notifyMatchFailure(
descOp, "the tensor descriptor lacks layout attribute");
SmallVector<size_t> newRetIndices;
rewriter.setInsertionPoint(warpOp);
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, /* new yielded values = */ descOp->getOperands(),
/* new yielded types = */ descOp.getOperandTypes(), newRetIndices);
SmallVector<Value> newDescOperands = llvm::map_to_vector(
newRetIndices, [&](size_t i) { return newWarpOp.getResult(i); });
rewriter.setInsertionPointAfter(newWarpOp);
xegpu::TensorDescType distributedTensorDescTy =
descOp.getType().dropLayouts(); // Distributed tensor descriptor type
// does not contain layout info.
Value newDescOp = xegpu::CreateNdDescOp::create(
rewriter, newWarpOp.getLoc(), distributedTensorDescTy, newDescOperands,
descOp->getAttrs());
Value distributedVal = newWarpOp.getResult(operandIdx);
// Resolve the distributed type to the expected type.
newDescOp =
resolveDistributedTy(newDescOp, distributedVal.getType(), rewriter);
rewriter.replaceAllUsesWith(distributedVal, newDescOp);
return success();
}
};
/// Distribute a store_nd op at the end of enclosing
/// `gpu.warp_execute_on_lane_0`. In case arguments for the store are passed
/// through the warp op interface, they are propagated as returned values.
/// The source vector is distributed based on the lane layout. Appropriate
/// cast ops are inserted if the distributed types do not match the expected
/// xegpu SIMT types.
///
/// Example:
///
/// ```
/// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
/// gpu.warp_execute_on_lane_0(%laneid) -> () {
/// ...
/// xegpu.store_nd %arg0, %arg1: vector<4x8xf32>,
/// !xegpu.tensor_desc<4x8xf32, #layout0>
/// }
/// ```
/// To
/// ```
/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>,
/// !xegpu.tensor_desc<4x8xf32, #layout0>) {
/// gpu.yield %arg0, %arg1: vector<4x8xf32>, !xegpu.tensor_desc<4x8xf32,
/// #layout0>
/// }
/// %0 = vector.shape_cast %r#0: vector<4x1xf32> to vector<4xf32>
/// %1 = unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32,
/// #layout0>
/// -> !xegpu.tensor_desc<4x8xf32>
/// xegpu.store_nd %0, %1: vector<4xf32>,
/// !xegpu.tensor_desc<4x8xf32>
///
/// ```
struct StoreNdDistribution final : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
PatternRewriter &rewriter) const override {
gpu::YieldOp yield = warpOp.getTerminator();
Operation *lastNode = yield->getPrevNode();
auto storeOp = dyn_cast_or_null<xegpu::StoreNdOp>(lastNode);
if (!storeOp)
return failure();
int64_t offsetSize = static_cast<int64_t>(storeOp.getOffsets().size());
if ((offsetSize != 0) || storeOp.getConstOffsetsAttr())
return failure();
xegpu::TensorDescType tensorDescTy = storeOp.getTensorDescType();
xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
if (!layout)
return rewriter.notifyMatchFailure(
storeOp, "the source tensor descriptor lacks layout attribute");
FailureOr<VectorType> distributedTypeByWarpOpOrFailure =
getDistVecTypeBasedOnLaneLayout(layout, storeOp.getValueType());
if (failed(distributedTypeByWarpOpOrFailure))
return rewriter.notifyMatchFailure(storeOp,
"Failed to distribute the type");
VectorType distributedTypeByWarpOp =
distributedTypeByWarpOpOrFailure.value();
SmallVector<size_t> newRetIndices;
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
rewriter, warpOp,
/* new yielded values = */
ValueRange{storeOp.getValue(), storeOp.getTensorDesc()},
/* new yielded types = */
TypeRange{distributedTypeByWarpOp, storeOp.getTensorDescType()},
newRetIndices);
// Create a new store op outside the warp op with the distributed vector
// type. Tensor descriptor is not distributed.
rewriter.setInsertionPointAfter(newWarpOp);
SmallVector<Value> newStoreOperands;
// For the value operand, there can be a mismatch between the vector type
// distributed by the warp op and (xegpu-specific) distributed type
// supported by the store op. Type mismatch must be resolved using
// appropriate cast op.
FailureOr<VectorType> storeNdDistributedValueTyOrFailure =
xegpu::getDistributedVectorType(storeOp.getTensorDescType());
if (failed(storeNdDistributedValueTyOrFailure))
return rewriter.notifyMatchFailure(
storeOp, "Failed to get distributed vector type for the store op");
newStoreOperands.push_back(resolveDistributedTy(
newWarpOp.getResult(newRetIndices[0]),
storeNdDistributedValueTyOrFailure.value(), rewriter));
// For the tensor descriptor operand, the layout attribute is dropped after
    // distribution. Types need to be resolved in this case as well.
xegpu::TensorDescType distributedTensorDescTy =
storeOp.getTensorDescType().dropLayouts();
newStoreOperands.push_back(
resolveDistributedTy(newWarpOp.getResult(newRetIndices[1]),
distributedTensorDescTy, rewriter));
auto newStoreOp =
xegpu::StoreNdOp::create(rewriter, newWarpOp.getLoc(), TypeRange{},
newStoreOperands, storeOp->getAttrs());
xegpu::removeLayoutAttrs(newStoreOp);
rewriter.eraseOp(storeOp);
return success();
}
};
/// Distribute a load_nd op feeding into the gpu.yield op of the enclosing
/// `gpu.warp_execute_on_lane_0` and put it after the warp op.
/// The warp op will still contain the original op that will not be used by
/// the yield op (and should be cleaned up later). The yield op will
/// bypass the load's arguments. Only the loaded vector is distributed
/// according to the lane layout; the tensor descriptor type is not
/// distributed. Appropriate cast ops are inserted if the distributed types
/// do not match the expected xegpu SIMT types.
///
/// Example:
///
/// ```
/// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
/// (vector<4x1xf32>) {
/// ...
/// %ld = xegpu.load_nd %arg0: !xegpu.tensor_desc<4x8xf32, #layout0>
/// -> vector<4x8xf32>
/// gpu.yield %ld
/// }
/// ```
/// To
/// ```
/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>,
/// !xegpu.tensor_desc<4x8xf32, #layout0>) {
/// ...
/// %dead = xegpu.load_nd %arg0: !xegpu.tensor_desc<4x8xf32, #layout0>
/// -> vector<4x8xf32>
/// gpu.yield %dead, %arg0
/// }
/// %0 = unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32,
/// #layout0> -> !xegpu.tensor_desc<4x8xf32>
/// %1 = xegpu.load_nd %0: !xegpu.tensor_desc<4x8xf32> -> vector<4xf32>
/// %2 = vector.shape_cast %1: vector<4xf32> to vector<4x1xf32>
///
/// ```
struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
PatternRewriter &rewriter) const override {
OpOperand *operand = getWarpResult(warpOp, [&](Operation *op) {
if (!isa<xegpu::LoadNdOp>(op))
return false;
// Make sure the same load op is the last operation in the warp op body.
      // This ensures that the load op is not sunk earlier, violating any
      // barrier synchronization.
gpu::YieldOp yield = warpOp.getTerminator();
return yield->getPrevNode() == op;
});
if (!operand)
return rewriter.notifyMatchFailure(
warpOp, "warp result is not a xegpu::LoadNd op");
auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>();
int64_t offsetSize = static_cast<int64_t>(loadOp.getOffsets().size());
if ((offsetSize != 0) || loadOp.getConstOffsetsAttr())
return failure();
xegpu::TensorDescType tensorDescTy = loadOp.getTensorDescType();
xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
if (!layout)
return rewriter.notifyMatchFailure(
loadOp, "the source tensor descriptor lacks layout attribute");
unsigned operandIdx = operand->getOperandNumber();
VectorType distributedTypeByWarpOp =
cast<VectorType>(warpOp.getResult(operandIdx).getType());
SmallVector<size_t> newRetIndices;
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
rewriter, warpOp,
/* new yielded values = */ loadOp.getTensorDesc(),
/* new yielded types = */ tensorDescTy, newRetIndices);
// Create a new load op outside the warp op with the distributed vector
// type.
rewriter.setInsertionPointAfter(newWarpOp);
FailureOr<VectorType> loadNdDistValueTyOrFailure =
xegpu::getDistributedVectorType(loadOp.getTensorDescType());
if (failed(loadNdDistValueTyOrFailure))
return rewriter.notifyMatchFailure(
loadOp, "Failed to get distributed vector type for the load op");
xegpu::TensorDescType distributedTensorDescTy =
loadOp.getTensorDescType().dropLayouts(); // Distributed tensor
// descriptor type does not
// contain layout info.
auto newLoadOp = xegpu::LoadNdOp::create(
rewriter, newWarpOp.getLoc(), loadNdDistValueTyOrFailure.value(),
resolveDistributedTy(newWarpOp->getResult(newRetIndices[0]),
distributedTensorDescTy, rewriter),
loadOp->getAttrs());
xegpu::removeLayoutAttrs(newLoadOp);
// Set the packed attribute if the layout requires it.
newLoadOp.setPacked(hasPackedLayout(layout));
Value distributedVal = newWarpOp.getResult(operandIdx);
// There can be a conflict between the vector type distributed by the
// warp op and (xegpu-specific) distributed type supported by the load
// op. Resolve these mismatches by inserting a cast.
Value tyResolvedVal = resolveDistributedTy(
newLoadOp.getResult(), distributedTypeByWarpOp, rewriter);
rewriter.replaceAllUsesWith(distributedVal, tyResolvedVal);
return success();
}
};
/// Distribute a dpas op feeding into the gpu.yield op of the enclosing
/// `gpu.warp_execute_on_lane_0` and put it after the warp op.
/// The warp op will still contain the original op that will not be used by
/// the yield op (and should be cleaned up later). The yield op will
/// bypass the dpas's arguments. Appropriate cast ops are inserted if the
/// distributed types do not match the expected xegpu SIMT types.
/// Example:
/// ```
/// #lo_a = #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>
/// #lo_b = #xegpu.layout<wi_layout = [1, 16], wi_data = [2, 1]>
/// #lo_c = #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>
/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
/// (vector<8x1xf32>) {
/// ...
/// %dpas = xegpu.dpas %arg0, %arg1: vector<8x16xf16>, vector<16x16xf16> ->
/// vector<8x16xf32>
/// gpu.yield %dpas
/// }
/// ```
/// To
/// ```
/// %r:3 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<8x1xf32>,
/// vector<8x1xf16>, vector<16x1xf16>) {
/// ...
/// %dead = xegpu.dpas %arg0, %arg1: vector<8x16xf16>, vector<16x16xf16>
/// -> vector<8x16xf32>
/// gpu.yield %dead, %arg0, %arg1
/// }
/// %0 = vector.shape_cast %r#1: vector<8x1xf16> to vector<8xf16>
/// %1 = vector.shape_cast %r#2: vector<16x1xf16> to vector<16xf16>
/// %2 = xegpu.dpas %0, %1: vector<8xf16>, vector<16xf16> ->
/// vector<8xf32>
/// %dpas = vector.shape_cast %2: vector<8xf32> to vector<8x1xf32>
/// ```
struct DpasDistribution final : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
PatternRewriter &rewriter) const override {
OpOperand *operand = getWarpResult(warpOp, llvm::IsaPred<xegpu::DpasOp>);
if (!operand)
return rewriter.notifyMatchFailure(warpOp,
"warp result is not a xegpu::Dpas op");
auto dpasOp = operand->get().getDefiningOp<xegpu::DpasOp>();
unsigned operandIdx = operand->getOperandNumber();
std::string layoutAName = xegpu::getLayoutName(dpasOp->getOpOperand(0));
std::string layoutBName = xegpu::getLayoutName(dpasOp->getOpOperand(1));
std::string layoutCName = xegpu::getLayoutName(dpasOp->getOpResult(0));
xegpu::LayoutAttr layoutA =
dpasOp->getAttrOfType<xegpu::LayoutAttr>(layoutAName);
xegpu::LayoutAttr layoutB =
dpasOp->getAttrOfType<xegpu::LayoutAttr>(layoutBName);
xegpu::LayoutAttr layoutOut =
dpasOp->getAttrOfType<xegpu::LayoutAttr>(layoutCName);
if (!layoutA || !layoutB || !layoutOut)
return rewriter.notifyMatchFailure(
dpasOp,
"the xegpu::Dpas op lacks layout attribute for A, B or output");
FailureOr<VectorType> distLhsTypeByWarpOpOrFailure =
getDistVecTypeBasedOnLaneLayout(layoutA, dpasOp.getLhsType());
FailureOr<VectorType> distRhsTypeByWarpOpOrFailure =
getDistVecTypeBasedOnLaneLayout(layoutB, dpasOp.getRhsType());
FailureOr<VectorType> distResultTypeByWarpOpOrFailure =
getDistVecTypeBasedOnLaneLayout(layoutOut, dpasOp.getResultType());
if (failed(distLhsTypeByWarpOpOrFailure) ||
failed(distRhsTypeByWarpOpOrFailure) ||
failed(distResultTypeByWarpOpOrFailure))
return rewriter.notifyMatchFailure(
dpasOp,
"Failed to distribute the A, B or output types in xegpu::Dpas op");
llvm::SmallVector<Value, 3> newYieldValues{dpasOp.getLhs(),
dpasOp.getRhs()};
llvm::SmallVector<Type, 3> newYieldTypes{
distLhsTypeByWarpOpOrFailure.value(),
distRhsTypeByWarpOpOrFailure.value()};
// Dpas acc operand is optional.
if (dpasOp.getAcc()) {
newYieldValues.push_back(dpasOp.getAcc());
newYieldTypes.push_back(distResultTypeByWarpOpOrFailure.value());
}
// Create a new warp op without the dpas.
SmallVector<size_t> newRetIndices;
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices);
FailureOr<VectorType> expectedDistLhsTyOrFailure =
xegpu::getDistributedVectorType(dpasOp.getLhsType(), layoutA);
FailureOr<VectorType> expectedDistRhsTyOrFailure =
xegpu::getDistributedVectorType(dpasOp.getRhsType(), layoutB);
FailureOr<VectorType> expectedDistResultTyOrFailure =
xegpu::getDistributedVectorType(dpasOp.getResultType(), layoutOut);
if (failed(expectedDistLhsTyOrFailure) ||
failed(expectedDistRhsTyOrFailure) ||
failed(expectedDistResultTyOrFailure))
return rewriter.notifyMatchFailure(
dpasOp,
"Failed to get distributed vector type for the dpas operands.");
// Create a new dpas op outside the warp op.
rewriter.setInsertionPointAfter(newWarpOp);
SmallVector<Value> newDpasOperands;
SmallVector<VectorType> newDpasOperandExpectedTypes;
// Resolve the distributed types with the original types.
newDpasOperandExpectedTypes.push_back(expectedDistLhsTyOrFailure.value());
newDpasOperandExpectedTypes.push_back(expectedDistRhsTyOrFailure.value());
VectorType distributedResultTy = expectedDistResultTyOrFailure.value();
if (dpasOp.getAcc())
newDpasOperandExpectedTypes.push_back(distributedResultTy);
for (unsigned i = 0; i < newRetIndices.size(); i++) {
newDpasOperands.push_back(
resolveDistributedTy(newWarpOp.getResult(newRetIndices[i]),
newDpasOperandExpectedTypes[i], rewriter));
}
auto newDpasOp = xegpu::DpasOp::create(rewriter, newWarpOp->getLoc(),
distributedResultTy, newDpasOperands,
dpasOp->getAttrs());
xegpu::removeLayoutAttrs(newDpasOp);
Value distributedVal = newWarpOp.getResult(operandIdx);
// Resolve the output type.
Value typeResolved =
resolveDistributedTy(newDpasOp.getResult(),
distResultTypeByWarpOpOrFailure.value(), rewriter);
rewriter.replaceAllUsesWith(distributedVal, typeResolved);
return success();
}
};
/// Sink an update_nd_offset op feeding into yield op of an enclosing
/// `gpu.warp_execute_on_lane_0` region. The warp op will still contain the
/// original op that will not be used by the yield op (and should be cleaned
/// up later). The yield op will bypass the updateOp's arguments. The tensor
/// descriptor type is not distributed. Appropriate cast ops are inserted if
/// the distributed types do not match the expected xegpu SIMT types.
/// Example:
/// ```
/// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
/// (!xegpu.tensor_desc<4x8xf32, #layout0>) {
/// ...
/// %update = xegpu.update_nd_offset %arg0, [%c32, %c16]:
/// !xegpu.tensor_desc<4x8xf32, #layout0>
/// gpu.yield %update
/// }
/// ...
/// ```
/// To
/// ```
/// %r:4 = gpu.warp_execute_on_lane_0(%laneid) -> (
/// !xegpu.tensor_desc<4x8xf32, #layout0>,
/// !xegpu.tensor_desc<4x8xf32, #layout0>, index, index) {
/// ...
/// %dead = xegpu.update_nd_offset %arg0, [%c32, %c16]:
/// !xegpu.tensor_desc<4x8xf32, #layout0>
/// gpu.yield %dead, %arg0, %c32, %c16
/// }
/// %0 = unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32,
/// #layout0> -> !xegpu.tensor_desc<4x8xf32>
/// %1 = xegpu.update_nd_offset %0, [%r#2, %r#3]:
/// !xegpu.tensor_desc<4x8xf32>
/// ...
/// ```
struct UpdateNdOffsetDistribution final : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
PatternRewriter &rewriter) const override {
OpOperand *operand =
getWarpResult(warpOp, llvm::IsaPred<xegpu::UpdateNdOffsetOp>);
if (!operand)
return rewriter.notifyMatchFailure(
warpOp, "warp result is not a xegpu::UpdateNdOffset op");
auto updateOp = operand->get().getDefiningOp<xegpu::UpdateNdOffsetOp>();
unsigned operandIdx = operand->getOperandNumber();
SmallVector<size_t> newRetIndices;
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
rewriter, warpOp, updateOp->getOperands(), updateOp.getOperandTypes(),
newRetIndices);
rewriter.setInsertionPointAfter(newWarpOp);
    // The new update op does not have a layout attribute.
xegpu::TensorDescType distributedTensorDescTy =
updateOp.getTensorDescType().dropLayouts();
SmallVector<Value> newUpdateOperands =
llvm::map_to_vector(newRetIndices, [&](size_t i) {
// For the tensor descriptor operand, the layout attribute is
          // dropped after distribution. Types need to be resolved in this
// case.
if (isa<xegpu::TensorDescType>(newWarpOp.getResult(i).getType())) {
return resolveDistributedTy(newWarpOp.getResult(i),
distributedTensorDescTy, rewriter);
}
return newWarpOp.getResult(i);
});
// Create a new update op outside the warp op.
auto newUpdateOp = xegpu::UpdateNdOffsetOp::create(
rewriter, newWarpOp.getLoc(), distributedTensorDescTy,
newUpdateOperands, updateOp->getAttrs());
xegpu::removeLayoutAttrs(newUpdateOp);
Value distributedVal = newWarpOp.getResult(operandIdx);
// Resolve the distributed type with the original type.
Value typeResolved = resolveDistributedTy(
newUpdateOp.getResult(), distributedVal.getType(), rewriter);
rewriter.replaceAllUsesWith(distributedVal, typeResolved);
return success();
}
};
/// Distribute a prefetch_nd op at the end of enclosing
/// `gpu.warp_execute_on_lane_0`. In case arguments for the prefetch are
/// passed through the warp op interface, they are propagated as returned
/// values. Tensor descriptor shape is not distributed because it is a
/// uniform value across all work items within the subgroup. Appropriate
/// cast ops are inserted if the distributed types do not match the expected
/// xegpu SIMT types.
///
/// Example:
///
/// ```
/// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
/// gpu.warp_execute_on_lane_0(%laneid) -> () {
/// ...
/// xegpu.prefetch_nd %arg0 : !xegpu.tensor_desc<4x8xf32, #layout0>
/// }
/// ```
/// To
/// ```
/// %r:1 = gpu.warp_execute_on_lane_0(%laneid) -> (
/// !xegpu.tensor_desc<4x8xf32, #layout0>) {
/// gpu.yield %arg0: !xegpu.tensor_desc<4x8xf32, #layout0>
/// }
/// %1 = unrealized_conversion_cast %r#0: !xegpu.tensor_desc<4x8xf32,
/// #layout0> -> !xegpu.tensor_desc<4x8xf32>
/// xegpu.prefetch_nd %1 : !xegpu.tensor_desc<4x8xf32>
///
/// ```
struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
PatternRewriter &rewriter) const override {
gpu::YieldOp yield = warpOp.getTerminator();
Operation *lastNode = yield->getPrevNode();
auto prefetchOp = dyn_cast_or_null<xegpu::PrefetchNdOp>(lastNode);
if (!prefetchOp)
return failure();
int64_t offsetSize = static_cast<int64_t>(prefetchOp.getOffsets().size());
if ((offsetSize != 0) || prefetchOp.getConstOffsetsAttr())
return failure();
xegpu::LayoutAttr layout = prefetchOp.getTensorDescType().getLayoutAttr();
if (!layout)
return rewriter.notifyMatchFailure(
prefetchOp, "the source tensor descriptor lacks layout attribute");
SmallVector<Value, 1> newYieldValues = {prefetchOp.getTensorDesc()};
SmallVector<Type, 1> newYieldTypes = {prefetchOp.getTensorDescType()};
SmallVector<size_t> newRetIndices;
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices);
// Create a new prefetch op outside the warp op with updated tensor
    // descriptor type. The source tensor descriptor requires type resolution.
xegpu::TensorDescType newTensorDescTy =
prefetchOp.getTensorDescType().dropLayouts();
rewriter.setInsertionPointAfter(newWarpOp);
SmallVector<Value> newPrefetchOperands = {resolveDistributedTy(
newWarpOp.getResult(newRetIndices[0]), newTensorDescTy, rewriter)};
xegpu::PrefetchNdOp::create(rewriter, newWarpOp.getLoc(), TypeRange{},
newPrefetchOperands, prefetchOp->getAttrs());
xegpu::removeLayoutAttrs(prefetchOp);
rewriter.eraseOp(prefetchOp);
return success();
}
};
/// Sink a gpu::BarrierOp at the end of enclosing `gpu.warp_execute_on_lane_0`
/// region. This will simply move the barrier op outside of the warp op.
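/// Example:
/// ```
/// gpu.warp_execute_on_lane_0(%laneid) -> () {
/// ...
/// gpu.barrier
/// }
/// ```
/// To
/// ```
/// gpu.warp_execute_on_lane_0(%laneid) -> () {
/// ...
/// }
/// gpu.barrier
/// ```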
struct GpuBarrierDistribution final : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
PatternRewriter &rewriter) const override {
gpu::YieldOp yield = warpOp.getTerminator();
Operation *lastNode = yield->getPrevNode();
// The last node must be a gpu::BarrierOp.
auto barrierOp = dyn_cast_or_null<gpu::BarrierOp>(lastNode);
if (!barrierOp)
return failure();
// Move the barrier op outside of the warp op.
rewriter.setInsertionPointAfter(warpOp);
gpu::BarrierOp::create(rewriter, barrierOp.getLoc(),
barrierOp->getResultTypes(),
barrierOp->getOperands(), barrierOp->getAttrs());
rewriter.eraseOp(barrierOp);
return success();
}
};
/// Distribute a scattered store op. The offsets argument is required.
/// Both offset and mask vectors must be 1D and have #subgroup_size elements.
/// The layouts are fixed and implicit: one offset/mask per lane.
/// The pass changes the offset/mask vector shapes to a
/// single-element vector; **it is assumed that their producer will also be
/// distributed**. The payload vector also has a fixed distribution:
/// no chunk size -> vector of one element.
/// chunk size -> vector of the innermost dimension of the SG-payload.
/// Example 1 (no chunk size):
/// %mask = producer_op : vector<16xi1>
/// %offset = producer_op : vector<16xindex>
/// xegpu.store %payload, %src[%offset], %mask : vector<16xf16>,
/// memref<256xf16>, vector<16xindex>, vector<16xi1>
/// To
/// %mask = producer_op : vector<1xi1>
/// %offset = producer_op : vector<1xindex>
/// xegpu.store %payload, %src[%offset], %mask : vector<1xf16>,
/// memref<256xf16>, vector<1xindex>, vector<1xi1>
/// Example 2 (chunk size, same mask and offsets):
/// xegpu.store %payload, %src[%offset], %mask <{chunk_size=8}> :
/// vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
/// To
/// xegpu.store %payload, %src[%offset], %mask <{chunk_size=8}> :
/// vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
struct StoreDistribution final : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
PatternRewriter &rewriter) const override {
Operation *lastNode = warpOp.getTerminator()->getPrevNode();
auto storeScatterOp = dyn_cast_or_null<xegpu::StoreScatterOp>(lastNode);
if (!storeScatterOp)
return failure();
auto offsets = storeScatterOp.getOffsets();
if (!offsets || !isa<VectorType>(offsets.getType()))
return rewriter.notifyMatchFailure(
storeScatterOp, "Store op must have a vector of offsets argument");
VectorType offsetsTy = cast<VectorType>(offsets.getType());
VectorType maskTy = cast<VectorType>(storeScatterOp.getMask().getType());
if (offsetsTy.getRank() != 1 || maskTy.getRank() != 1)
return rewriter.notifyMatchFailure(storeScatterOp,
"Expected 1D offsets and mask vector");
VectorType storeVecTy = cast<VectorType>(storeScatterOp.getValueType());
if (storeVecTy.getRank() > 2)
return rewriter.notifyMatchFailure(
storeScatterOp, "Expected at most 2D result at SG level");
std::string layoutPayloadName =
xegpu::getLayoutName(storeScatterOp->getOpOperand(0));
std::string layoutOffsetsName =
xegpu::getLayoutName(storeScatterOp->getOpOperand(2));
std::string layoutMaskName =
xegpu::getLayoutName(storeScatterOp->getOpOperand(3));
xegpu::LayoutAttr layoutPayload =
storeScatterOp->getAttrOfType<xegpu::LayoutAttr>(layoutPayloadName);
xegpu::LayoutAttr layoutOffsets =
storeScatterOp->getAttrOfType<xegpu::LayoutAttr>(layoutOffsetsName);
xegpu::LayoutAttr layoutMask =
storeScatterOp->getAttrOfType<xegpu::LayoutAttr>(layoutMaskName);
FailureOr<VectorType> distStoreVecByWarpOpOrFailure =
getDistVecTypeBasedOnLaneLayout(layoutPayload, storeVecTy);
FailureOr<VectorType> distOffsetsByWarpOpOrFailure =
getDistVecTypeBasedOnLaneLayout(layoutOffsets, offsetsTy);
FailureOr<VectorType> distMaskByWarpOpOrFailure =
getDistVecTypeBasedOnLaneLayout(layoutMask, maskTy);
if (failed(distStoreVecByWarpOpOrFailure) ||
failed(distOffsetsByWarpOpOrFailure) ||
failed(distMaskByWarpOpOrFailure)) {
return rewriter.notifyMatchFailure(
storeScatterOp,
"Some vector operands have no layouts, using defaults instead.");
}
VectorType distPayloadTy = distStoreVecByWarpOpOrFailure.value();
VectorType expectedPayloadTy = VectorType::get(
{distPayloadTy.getNumElements()}, distPayloadTy.getElementType());
SmallVector<size_t> newRetIndices;
SmallVector<Value> operands = storeScatterOp->getOperands();
SmallVector<Type> operandTypesToYield = {
expectedPayloadTy, operands[1].getType(),
distOffsetsByWarpOpOrFailure.value(),
distMaskByWarpOpOrFailure.value()};
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
rewriter, warpOp, operands, operandTypesToYield, newRetIndices);
SmallVector<Value> newStoreScatterOpOperands = llvm::map_to_vector(
newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });
rewriter.setInsertionPointAfter(newWarpOp);
xegpu::StoreScatterOp newOp = xegpu::StoreScatterOp::create(
rewriter, newWarpOp.getLoc(), TypeRange{}, newStoreScatterOpOperands,
storeScatterOp->getAttrs());
xegpu::removeLayoutAttrs(newOp);
rewriter.eraseOp(storeScatterOp);
return success();
}
};
/// Distribute a scattered load op. The logic and requirements are the same as
/// for the scattered store distribution. The warpOp's payload vector is
/// expected to be distributed by the load's result consumer.
/// Example 1 (no chunk size):
/// %mask = producer_op : vector<16xi1>
/// %offset = producer_op : vector<16xindex>
/// %0 = xegpu.load %src[%offset], %mask : memref<256xf16>,
/// vector<16xindex>, vector<16xi1> -> vector<16xf16>
/// To
/// %mask = producer_op : vector<1xi1>
/// %offset = producer_op : vector<1xindex>
/// %0 = xegpu.load %src[%offset], %mask : memref<256xf16>,
/// vector<1xindex>, vector<1xi1> -> vector<1xf16>
/// Example 2 (chunk size, same mask and offsets):
/// %0 = xegpu.load %src[%offset], %mask <{chunk_size=8}> :
/// memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
/// To
/// %0 = xegpu.load %src[%offset], %mask <{chunk_size=8}> :
/// memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
struct LoadDistribution final : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
PatternRewriter &rewriter) const override {
OpOperand *producedByLastLoad = getWarpResult(warpOp, [&](Operation *op) {
      // Check whether the yield operand was produced by the *last* scattered
      // load op, to avoid sinking it before barriers (maintain memory order).
return isa<xegpu::LoadGatherOp>(op) &&
warpOp.getTerminator()->getPrevNode() == op;
});
if (!producedByLastLoad)
return rewriter.notifyMatchFailure(
warpOp, "The last op is not xegpu::LoadGatherOp");
auto loadGatherOp =
producedByLastLoad->get().getDefiningOp<xegpu::LoadGatherOp>();
auto offsets = loadGatherOp.getOffsets();
if (!offsets || !isa<VectorType>(offsets.getType()) ||
!isa<VectorType>(loadGatherOp.getMask().getType()))
return rewriter.notifyMatchFailure(
loadGatherOp,
"Load op must have a vector arguments for offsets and mask");
VectorType offsetsTy = cast<VectorType>(offsets.getType());
VectorType maskTy = cast<VectorType>(loadGatherOp.getMask().getType());
if (offsetsTy.getRank() != 1 || maskTy.getRank() != 1)
return rewriter.notifyMatchFailure(loadGatherOp,
"Expected 1D offsets and mask vector");
// Assume offset and mask producers will be distributed as well.
std::string layoutOffsetsName =
xegpu::getLayoutName(loadGatherOp->getOpOperand(1));
std::string layoutMaskName =
xegpu::getLayoutName(loadGatherOp->getOpOperand(2));
xegpu::LayoutAttr layoutOffsets =
loadGatherOp->getAttrOfType<xegpu::LayoutAttr>(layoutOffsetsName);
xegpu::LayoutAttr layoutMask =
loadGatherOp->getAttrOfType<xegpu::LayoutAttr>(layoutMaskName);
FailureOr<VectorType> distOffsetsByWarpOpOrFailure =
getDistVecTypeBasedOnLaneLayout(layoutOffsets, offsetsTy);
FailureOr<VectorType> distMaskByWarpOpOrFailure =
getDistVecTypeBasedOnLaneLayout(layoutMask, maskTy);
if (failed(distOffsetsByWarpOpOrFailure) ||
failed(distMaskByWarpOpOrFailure)) {
return rewriter.notifyMatchFailure(
loadGatherOp,
"Some vector operands have no layouts, using defaults instead.");
}
SmallVector<size_t> newRetIndices;
SmallVector<Value> operands = loadGatherOp->getOperands();
SmallVector<Type> operandTypesToYield = {
operands[0].getType(), distOffsetsByWarpOpOrFailure.value(),
distMaskByWarpOpOrFailure.value()};
const unsigned operandIdx = producedByLastLoad->getOperandNumber();
VectorType loadVecTy =
cast<VectorType>(warpOp.getResult(operandIdx).getType());
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
rewriter, warpOp, operands, operandTypesToYield, newRetIndices);
SmallVector<Value> newLoadGatherOperands = llvm::map_to_vector(
newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });
rewriter.setInsertionPointAfter(newWarpOp);
xegpu::LoadGatherOp newOp = xegpu::LoadGatherOp::create(
rewriter, newWarpOp.getLoc(), loadVecTy, newLoadGatherOperands,
loadGatherOp->getAttrs());
xegpu::removeLayoutAttrs(newOp);
Value distributedVal = newWarpOp.getResult(operandIdx);
rewriter.replaceAllUsesWith(distributedVal, newOp->getResult(0));
return success();
}
};
/// Helper to rewrite a 2D VectorMultiReductionOp into a sequence of 1D
/// VectorReductionOps.
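/// For example, reducing a vector<16x32xf32> source along dimension 1 with a
/// vector<16xf32> accumulator produces 16 slices of shape 1x32. Each slice is
/// shape-cast to vector<32xf32>, reduced to a scalar with vector.reduction
/// (seeded by the matching accumulator element), and inserted back into a
/// vector<16xf32> result.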
static Value lowerToVectorReductions(TypedValue<VectorType> src,
TypedValue<VectorType> acc,
vector::CombiningKind kind,
int64_t reductionDim, Location loc,
PatternRewriter &rewriter) {
// Expecting a 2D source vector.
assert(src.getType().getRank() == 2 && "expected a 2D source vector");
VectorType sourceType = src.getType();
int64_t sourceH = sourceType.getShape()[0];
int64_t sourceW = sourceType.getShape()[1];
int nSlices = (reductionDim == 0) ? sourceW : sourceH;
// Create a constant vector to hold the result of the reduction.
TypedAttr zeroAttr = rewriter.getZeroAttr(sourceType.getElementType());
Value reductionResult = arith::ConstantOp::create(
rewriter, loc, acc.getType(),
DenseElementsAttr::get(acc.getType(), zeroAttr));
// For each slice of the source, extract the slice vector, do a reduction
  // and insert the reduced value back into the result vector.
for (int i = 0; i < nSlices; ++i) {
SmallVector<int64_t, 2> sliceOffsets, sliceSizes;
if (reductionDim == 1) {
sliceOffsets = {i, 0};
sliceSizes = {1, sourceW};
} else {
sliceOffsets = {0, i};
sliceSizes = {sourceH, 1};
}
vector::ExtractStridedSliceOp extractOp =
vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets,
sliceSizes, {1, 1});
int64_t nSliceElements = extractOp.getResult().getType().getNumElements();
Value slice = vector::ShapeCastOp::create(
rewriter, loc,
VectorType::get({nSliceElements}, sourceType.getElementType()),
extractOp.getResult());
Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, i);
Value reduction =
vector::ReductionOp::create(rewriter, loc, kind, slice, accExtract);
reductionResult =
vector::InsertOp::create(rewriter, loc, reduction, reductionResult, i);
}
return reductionResult;
}
/// This pattern distributes the `vector.multi_reduction` operation across
/// lanes in a warp. Currently only 2D to 1D reductions are supported. Given
/// layouts for the source and accumulator vectors,
/// * If the reduction dimension is distributed across lanes, the reduction is
/// non-lane-local and the reduction is done using warp shuffles. Here we
/// simply rewrite the MultiDimReductionOp to a sequence of ReductionOps in
/// the warp op body.
/// * If the reduction dimension is not distributed across lanes, the reduction
/// is lane-local. In this case, we yield the source and accumulator vectors
/// from the warp op and perform the lane-local reduction outside the warp op
/// using a sequence of ReductionOps.
/// Example 1 (Reduction is lane-local):
/// ```
/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
/// %0 = "some_def"() : () -> (vector<16x32xf32>)
/// %acc = "some_def"() : () -> (vector<32xf32>)
/// %1 = vector.multi_reduction <add>, %0, %acc [0] : vector<16x32xf32> to
/// vector<32xf32>
/// gpu.yield %1 : vector<32xf32>
/// }
/// ```
/// is lowered to:
/// ```
/// %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<16x1xf32>,
/// vector<1xf32>) {
/// %0 = "some_def"() : () -> (vector<16x32xf32>)
/// %acc = "some_def"() : () -> (vector<32xf32>)
/// gpu.yield %0, %acc : vector<16x32xf32>, vector<32xf32>
/// }
/// %c = arith.constant dense<0.0> : vector<1xf32>
/// %1 = vector.shape_cast %r#0 : vector<16x1xf32> to vector<16xf32>
/// %2 = vector.reduction <add>, %1, %r#1 : vector<16xf32> to f32
/// %3 = vector.insert %2, %c[0] : f32 into vector<1xf32>
/// ```
/// Example 2 (Reduction is non-lane-local):
/// ```
/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) {
/// %0 = "some_def"() : () -> (vector<2x32xf32>)
/// %acc = "some_def"() : () -> (vector<2xf32>)
/// %1 = vector.multi_reduction <add>, %0, %acc [1] : vector<2x32xf32> to
/// vector<2xf32>
/// gpu.yield %1 : vector<2xf32>
/// }
/// ```
/// is lowered to:
/// ```
/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) {
/// %0 = "some_def"() : () -> (vector<2x32xf32>)
/// %acc = "some_def"() : () -> (vector<2xf32>)
/// %1 = arith.constant dense<0.0> : vector<2xf32>
/// %2 = vector.extract %0[0] : vector<32xf32> from vector<2x32xf32>
/// %3 = ("warp.reduction %2") : f32
/// %4 = vector.insert %3, %1[0] : f32 into vector<2xf32>
/// ... repeat for row 1
/// gpu.yield %1 : vector<2xf32>
/// }
/// ```
struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
PatternRewriter &rewriter) const override {
OpOperand *yieldOperand =
getWarpResult(warpOp, llvm::IsaPred<vector::MultiDimReductionOp>);
if (!yieldOperand)
return failure();
auto reductionOp =
cast<vector::MultiDimReductionOp>(yieldOperand->get().getDefiningOp());
unsigned operandNumber = yieldOperand->getOperandNumber();
VectorType sourceType = reductionOp.getSourceVectorType();
// Only 2D vectors are supported.
if (sourceType.getRank() != 2)
return rewriter.notifyMatchFailure(warpOp,
"Only 2D reductions are supported.");
ArrayRef<int64_t> reductionDims = reductionOp.getReductionDims();
// Only 1 reduction dimension supported. This also ensures that the result
// is vector type.
if (reductionDims.size() != 1)
return rewriter.notifyMatchFailure(
warpOp, "Only 1 reduction dimension is supported.");
int64_t reductionDim = reductionDims[0];
VectorType distributedResultType =
cast<VectorType>(warpOp.getResult(operandNumber).getType());
VectorType resultType = cast<VectorType>(reductionOp.getType());
xegpu::DistributeLayoutAttr sourceLayout =
xegpu::getDistributeLayoutAttr(reductionOp.getSource());
FailureOr<VectorType> sourceDistTypeOrFailure =
getDistVecTypeBasedOnLaneLayout(sourceLayout, sourceType);
if (failed(sourceDistTypeOrFailure))
return rewriter.notifyMatchFailure(
warpOp, "Failed to distribute the source vector type.");
VectorType sourceDistType = sourceDistTypeOrFailure.value();
// Only single dimension distribution is supported.
bool dim0Distributed =
sourceDistType.getShape()[0] != sourceType.getShape()[0];
bool dim1Distributed =
sourceDistType.getShape()[1] != sourceType.getShape()[1];
if (dim0Distributed && dim1Distributed)
return rewriter.notifyMatchFailure(
warpOp, "Expecting source to be distributed in a single dimension.");
int64_t sourceDistDim = dim0Distributed ? 0 : (dim1Distributed ? 1 : -1);
if (sourceDistDim == -1)
return rewriter.notifyMatchFailure(
warpOp, "Expecting a distributed source vector.");
bool resultDistributed =
distributedResultType.getNumElements() < resultType.getNumElements();
// If the lane owns all the data required for reduction (i.e. reduction is
    // fully parallel across lanes), then each lane owns part of the result
    // (i.e. result is distributed). If the reduction requires cross-lane
    // shuffling, then the result is shared among all lanes (broadcasted).
    // Therefore we expect the following cases:
//
// | Source vector | Reduction dim | Result vector |
// |----------------------|----------------|----------------|
// | dim-0 distributed | 0 | broadcasted |
// | dim-0 distributed | 1 | distributed |
// | dim-1 distributed | 0 | distributed |
// | dim-1 distributed | 1 | broadcasted |
bool isReductionLaneLocal = (sourceDistDim == 0 && reductionDim == 1) ||
(sourceDistDim == 1 && reductionDim == 0);
if (isReductionLaneLocal && !resultDistributed)
return rewriter.notifyMatchFailure(
warpOp, "Expecting a distributed result for lane-local reduction.");
if (!isReductionLaneLocal && resultDistributed)
return rewriter.notifyMatchFailure(
warpOp,
"Expecting a broadcasted result for non-lane-local reduction.");
// Handle lane-local reduction case. In this case we fully distribute the
// reduction result.
if (isReductionLaneLocal) {
// Yield the source and acc vectors from the WarpOp.
SmallVector<size_t> newRetIndices;
auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
rewriter, warpOp, {reductionOp.getSource(), reductionOp.getAcc()},
{sourceDistType, distributedResultType}, newRetIndices);
rewriter.setInsertionPointAfter(newWarpOp);
Value result = lowerToVectorReductions(
cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[0])),
cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[1])),
reductionOp.getKind(), reductionDim, reductionOp.getLoc(), rewriter);
// Replace the warp op result with the final result.
rewriter.replaceAllUsesWith(reductionOp.getResult(), result);
return success();
}
// For non-lane-local case, we simply rewrite the MultiReductionOp in terms
// of multiple ReductionOps. Actual distribution is done by the
// WarpOpReduction pattern.
rewriter.setInsertionPointAfter(reductionOp);
Value result = lowerToVectorReductions(
cast<TypedValue<VectorType>>(reductionOp.getSource()),
cast<TypedValue<VectorType>>(reductionOp.getAcc()),
reductionOp.getKind(), reductionDim, reductionOp.getLoc(), rewriter);
// Replace the warp op result with the final result.
rewriter.replaceAllUsesWith(reductionOp.getResult(), result);
return success();
}
};
/// Distribute a `vector.shape_cast` op feeding into yield op of an enclosing
/// `gpu.warp_execute_on_lane_0` region.
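/// Both the source and the result of the shape_cast must carry distribution
/// layouts. For rank-changing casts, the lower-rank layout must be a slice of
/// the higher-rank layout. The source vector is yielded from the warp op with
/// its distributed type, and a new shape_cast to the distributed result type
/// is created after the warp op.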
struct VectorShapeCastDistribution : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
PatternRewriter &rewriter) const override {
OpOperand *yieldOperand =
getWarpResult(warpOp, llvm::IsaPred<vector::ShapeCastOp>);
if (!yieldOperand)
return failure();
auto shapeCastOp =
cast<vector::ShapeCastOp>(yieldOperand->get().getDefiningOp());
unsigned operandNumber = yieldOperand->getOperandNumber();
auto resultDistTy =
cast<VectorType>(warpOp.getResult(operandNumber).getType());
xegpu::DistributeLayoutAttr sourceLayout =
xegpu::getDistributeLayoutAttr(shapeCastOp.getSource());
xegpu::DistributeLayoutAttr resultLayout =
xegpu::getDistributeLayoutAttr(shapeCastOp.getResult());
if (!sourceLayout || !resultLayout)
return rewriter.notifyMatchFailure(
warpOp,
"the source or result of shape_cast op lacks distribution layout");
// For rank reducing or increasing shape_cast ops, the lower rank layout
/// must be a slice of the higher rank layout.
int64_t sourceRank = shapeCastOp.getSourceVectorType().getRank();
int64_t resultRank = shapeCastOp.getResultVectorType().getRank();
if (sourceRank < resultRank && !sourceLayout.isSliceOf(resultLayout))
return rewriter.notifyMatchFailure(
warpOp, "shape_cast is rank reducing but source layout is not a "
"slice of result layout");
if (sourceRank > resultRank && !resultLayout.isSliceOf(sourceLayout))
return rewriter.notifyMatchFailure(
warpOp, "shape_cast is rank increasing but result layout is not a "
"slice of source layout");
FailureOr<VectorType> sourceDistTypeOrFailure =
getDistVecTypeBasedOnLaneLayout(sourceLayout,
shapeCastOp.getSourceVectorType());
if (failed(sourceDistTypeOrFailure))
return rewriter.notifyMatchFailure(
warpOp, "failed to get distributed vector type for source");
VectorType sourceDistType = sourceDistTypeOrFailure.value();
// Create a new warp op that yields the source of the shape_cast op.
SmallVector<size_t> newRetIndices;
auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
rewriter, warpOp, {shapeCastOp.getSource()}, {sourceDistType},
newRetIndices);
rewriter.setInsertionPointAfter(newWarpOp);
Value source = newWarpOp.getResult(newRetIndices[0]);
// Create a new shape_cast op outside the warp op.
Value newShapeCast = vector::ShapeCastOp::create(
rewriter, shapeCastOp.getLoc(), resultDistTy, source);
rewriter.replaceAllUsesWith(newWarpOp.getResult(operandNumber),
newShapeCast);
return success();
}
};
} // namespace
namespace {
struct XeGPUSubgroupDistributePass final
: public xegpu::impl::XeGPUSubgroupDistributeBase<
XeGPUSubgroupDistributePass> {
XeGPUSubgroupDistributePass() = default;
XeGPUSubgroupDistributePass(const XeGPUSubgroupDistributePass &other) =
default;
XeGPUSubgroupDistributePass(xegpu::XeGPUSubgroupDistributeOptions options)
: XeGPUSubgroupDistributeBase(options) {}
void runOnOperation() override;
};
} // namespace
void xegpu::populateXeGPUSubgroupDistributePatterns(
RewritePatternSet &patterns) {
patterns
.add<CreateNdDescDistribution, StoreNdDistribution, LoadNdDistribution,
DpasDistribution, PrefetchNdDistribution, UpdateNdOffsetDistribution,
GpuBarrierDistribution, VectorMultiReductionDistribution,
LoadDistribution, StoreDistribution>(
patterns.getContext(),
/*pattern benefit=*/regularPatternBenefit);
patterns.add<VectorShapeCastDistribution>(
patterns.getContext(),
/*pattern benefit=*/highPatternBenefit);
}
void XeGPUSubgroupDistributePass::runOnOperation() {
// Step 1: Attach layouts to op operands.
// TODO: Following assumptions are made:
// 1) It is assumed that there are no layout conflicts.
// 2) Any existing layout attributes attached to the operands are ignored.
Operation *op = getOperation();
op->walk([&](Operation *op) {
for (OpOperand &operand : op->getOpOperands()) {
// Layouts are needed for vector type only.
if (!isa<VectorType>(operand.get().getType()))
continue;
auto layout = xegpu::getDistributeLayoutAttr(operand.get());
if (!layout) {
op->emitError("Could not find layout attribute for operand ")
<< operand.getOperandNumber() << " of operation " << op->getName();
signalPassFailure();
return;
}
xegpu::setDistributeLayoutAttr(operand, layout);
}
});
// Step 2: Move all operations of a GPU function inside
// gpu.warp_execute_on_lane_0 operation.
{
RewritePatternSet patterns(&getContext());
patterns.add<MoveFuncBodyToWarpExecuteOnLane0>(&getContext());
if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
signalPassFailure();
return;
}
// At this point, we have moved the entire function body inside the
// warpOp. Now move any scalar uniform code outside of the warpOp (like
// GPU index ops, scalar constants, etc.). This will simplify the
// later lowering and avoid custom patterns for these ops.
getOperation()->walk([&](Operation *op) {
if (auto warpOp = dyn_cast<gpu::WarpExecuteOnLane0Op>(op))
vector::moveScalarUniformCode(warpOp);
});
}
// Step 3: Apply subgroup to workitem distribution patterns.
RewritePatternSet patterns(&getContext());
xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
// distributionFn is used by vector distribution patterns to determine the
// distributed vector type for a given vector value. In XeGPU subgroup
// distribution context, we compute this based on lane layout.
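  // For example, a vector<8x16xf32> value with lane_layout [1, 16] is only
  // distributed along dimension 1, producing the map (d0, d1) -> (d1).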
auto distributionFn = [](Value val) {
VectorType vecType = dyn_cast<VectorType>(val.getType());
int64_t vecRank = vecType ? vecType.getRank() : 0;
if (vecRank == 0)
return AffineMap::get(val.getContext());
// Get the layout of the vector type.
xegpu::DistributeLayoutAttr layout = xegpu::getDistributeLayoutAttr(val);
    // If no layout is specified, assume the innermost dimension is
    // distributed for now.
if (!layout)
return AffineMap::getMultiDimMapWithTargets(
vecRank, {static_cast<unsigned int>(vecRank - 1)}, val.getContext());
SmallVector<unsigned int> distributedDims;
for (auto [i, v] : llvm::enumerate(layout.getEffectiveLaneLayoutAsInt())) {
if (v > 1)
distributedDims.push_back(i);
}
return AffineMap::getMultiDimMapWithTargets(vecRank, distributedDims,
val.getContext());
};
// TODO: shuffleFn is not used.
auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value srcIdx,
int64_t warpSz) { return Value(); };
auto warpReduction = [](Location loc, OpBuilder &builder, Value input,
vector::CombiningKind kind, uint32_t size) {
// First reduce on a single thread to get per lane reduction value.
Value laneVal = builder.create<vector::ReductionOp>(loc, kind, input);
// Parallel reduction using butterfly shuffles.
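    // For example, with size = 16 the loop exchanges values at XOR offsets
    // 1, 2, 4 and 8, after which every lane holds the full reduction.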
for (uint64_t i = 1; i < size; i <<= 1) {
Value shuffled =
builder
.create<gpu::ShuffleOp>(loc, laneVal, i,
/*width=*/size,
/*mode=*/gpu::ShuffleMode::XOR)
.getShuffleResult();
laneVal = makeArithReduction(builder, loc, kind, laneVal, shuffled);
}
return laneVal;
};
if (enableSGReductions)
vector::populateDistributeReduction(
patterns, warpReduction,
/*pattern benefit=*/regularPatternBenefit);
vector::populatePropagateWarpVectorDistributionPatterns(
patterns, distributionFn, shuffleFn,
/*pattern benefit=*/regularPatternBenefit);
if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
signalPassFailure();
return;
}
  // Step 4: Finally, clean up UnrealizedConversionCastOps that were inserted
  // due to tensor desc type mismatches created by using upstream distribution
  // patterns (scf.for).
getOperation()->walk([&](mlir::UnrealizedConversionCastOp op) {
    // We are only interested in UnrealizedConversionCastOps that were added
// for resolving SIMT type mismatches.
if (!op->getAttr(resolveSIMTTypeMismatch))
return WalkResult::skip();
Value input = op.getOperand(0);
Value output = op.getResult(0);
// Both input and output must have tensor descriptor types.
xegpu::TensorDescType inputDescType =
mlir::dyn_cast<xegpu::TensorDescType>(input.getType());
xegpu::TensorDescType outputDescType =
mlir::dyn_cast<xegpu::TensorDescType>(output.getType());
assert(inputDescType && outputDescType &&
"Unrealized conversion cast must have tensor descriptor types");
    // tensor_desc<shape, layout> -> tensor_desc<shape> type of conversion.
    // This occurs inside the scf.for body to resolve the block argument type
    // to the SIMT type.
if (inputDescType.getLayout()) {
auto argument = mlir::dyn_cast<mlir::BlockArgument>(input);
if (argument) {
argument.setType(output.getType());
output.replaceAllUsesWith(argument);
if (auto loopOp = mlir::dyn_cast<mlir::LoopLikeOpInterface>(
argument.getOwner()->getParentOp())) {
auto result = loopOp.getTiedLoopResult(argument);
result.setType(output.getType());
}
}
}
    // tensor_desc<shape> -> tensor_desc<shape, layout> type of conversion.
    // This occurs at the yield op of the scf.for body to go back from the
    // SIMT type to the original type.
if (outputDescType.getLayout())
output.replaceAllUsesWith(input);
if (op->use_empty())
op->erase();
return WalkResult::advance();
});
}