| //===- XeGPUSubgroupDistribute.cpp - XeGPU Subgroup Distribute Pass -------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| #include "mlir/Dialect/Affine/Utils.h" |
| #include "mlir/Dialect/GPU/IR/GPUDialect.h" |
| #include "mlir/Dialect/GPU/Utils/DistributionUtils.h" |
| #include "mlir/Dialect/Index/IR/IndexDialect.h" |
| #include "mlir/Dialect/MemRef/IR/MemRef.h" |
| #include "mlir/Dialect/Vector/IR/VectorOps.h" |
| #include "mlir/Dialect/Vector/Transforms/VectorDistribution.h" |
| #include "mlir/Dialect/XeGPU/IR/XeGPU.h" |
| #include "mlir/Dialect/XeGPU/Transforms/Passes.h" |
| #include "mlir/Dialect/XeGPU/Transforms/Transforms.h" |
| #include "mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h" |
| #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" |
| #include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h" |
| #include "mlir/IR/AffineMap.h" |
| #include "mlir/IR/Attributes.h" |
| #include "mlir/IR/Builders.h" |
| #include "mlir/IR/BuiltinAttributes.h" |
| #include "mlir/IR/BuiltinOps.h" |
| #include "mlir/IR/BuiltinTypes.h" |
| #include "mlir/IR/Operation.h" |
| #include "mlir/IR/PatternMatch.h" |
| #include "mlir/IR/TypeRange.h" |
| #include "mlir/IR/Value.h" |
| #include "mlir/IR/Visitors.h" |
| #include "mlir/Interfaces/FunctionInterfaces.h" |
| #include "mlir/Support/LLVM.h" |
| #include "mlir/Transforms/DialectConversion.h" |
| #include "mlir/Transforms/GreedyPatternRewriteDriver.h" |
| #include "mlir/Transforms/InliningUtils.h" |
| #include "llvm/ADT/ArrayRef.h" |
| #include "llvm/ADT/STLExtras.h" |
| #include "llvm/ADT/SmallVector.h" |
| #include "llvm/ADT/SmallVectorExtras.h" |
| |
| namespace mlir { |
| namespace xegpu { |
| #define GEN_PASS_DEF_XEGPUSUBGROUPDISTRIBUTE |
| #include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc" |
| } // namespace xegpu |
| } // namespace mlir |
| |
| #define DEBUG_TYPE "xegpu-subgroup-distribute" |
| #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") |
| |
| using namespace mlir; |
| |
static const char *const resolveSIMTTypeMismatch =
    "resolve_simt_type_mismatch"; // Attribute name for identifying
                                  // UnrealizedConversionCastOp added to
                                  // resolve SIMT type mismatches.
| |
| namespace { |
| |
| //===----------------------------------------------------------------------===// |
| // SIMT Distribution Patterns |
| //===----------------------------------------------------------------------===// |
| |
| /// In certain cases, we may need to favor XeGPU specific distribution patterns |
| /// over generic vector distribution patterns. In such cases, we can assign |
| /// priorities to patterns. |
| enum PatternHierarchy : unsigned { Regular = 1, AboveRegular = 2 }; |
| |
| /// Helper function to resolve types if the distributed type out of |
| /// gpu.warp_execute_on_lane0 is different from the expected xegpu SIMT type. |
| /// Example 1: |
| /// distributed type: vector<8x1xf32> |
| /// expected type: vector<8xf32> |
| /// resolved using, |
| /// %0 = vector.shape_cast %1 : vector<8x1xf32> to vector<8xf32> |
| /// Example 2: |
| /// distributed type: xegpu.tensor_desc<8x16xf32, #xegpu.layout<...>> |
| /// expected type: xegpu.tensor_desc<8x16xf32> |
| /// resolved using, |
| /// %0 = unrealized_conversion_cast %1 : |
| /// xegpu.tensor_desc<8x16xf32, #xegpu.layout<..>> -> |
| /// xegpu.tensor_desc<8x16xf32> |
| template <typename T> |
| static Value resolveDistributedTy(Value orig, T expected, |
| PatternRewriter &rewriter) { |
| // If orig and expected types are the same, return orig. |
| if (orig.getType() == expected) |
| return orig; |
| // If orig is a vector type, create a shape cast op to reconcile the types. |
| if (isa<VectorType>(orig.getType())) { |
| auto castOp = |
| vector::ShapeCastOp::create(rewriter, orig.getLoc(), expected, orig); |
| return castOp.getResult(); |
| } |
| // If orig is a tensor descriptor type, create an unrealized conversion cast |
| // op to reconcile the types. |
| if (isa<xegpu::TensorDescType>(orig.getType())) { |
| auto castOp = UnrealizedConversionCastOp::create(rewriter, orig.getLoc(), |
| expected, orig); |
| castOp->setAttr(resolveSIMTTypeMismatch, rewriter.getUnitAttr()); |
| return castOp.getResult(0); |
| } |
| llvm_unreachable("Unsupported type for reconciliation"); |
| return orig; |
| } |
| |
| /// Given a vector type and its distributed vector type, return the list of |
| /// dimensions that are distributed. |
| static SmallVector<int64_t> getDistributedDims(VectorType originalType, |
| VectorType distributedType) { |
| assert(originalType.getRank() == distributedType.getRank() && |
| "sequential and distributed vector types must have the same rank"); |
| SmallVector<int64_t> distributedDims; |
| for (int64_t i = 0; i < originalType.getRank(); ++i) { |
| if (distributedType.getDimSize(i) != originalType.getDimSize(i)) { |
| distributedDims.push_back(i); |
| } |
| } |
| return distributedDims; |
| } |
| |
| /// Given a GPUFuncOp, this pattern creates a new GPUFuncOp and moves the body |
| /// of the original GPUFuncOp to the new GPUFuncOp such that entire body is |
| /// contained within a WarpExecuteOnLane0Op. |
| /// Example: |
| /// |
| /// ``` |
| /// gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> { |
| /// ... |
| /// ... |
| /// gpu.return %result: vector<8x16xf32> |
| /// } |
| /// ``` |
| /// To |
| /// ``` |
| /// gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> { |
| /// %laneid = gpu.lane_id : index |
| /// %0 = gpu.warp_execute_on_lane_0(%laneid) -> vector<8x16xf32> { |
| /// ... |
| /// ... |
| /// gpu.yield %result: vector<8x16xf32> |
| /// } |
| /// return %0 |
| /// } |
struct MoveFuncBodyToWarpOp : public OpRewritePattern<gpu::GPUFuncOp> {
  using OpRewritePattern<gpu::GPUFuncOp>::OpRewritePattern;
  LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp,
                                PatternRewriter &rewriter) const override {
    // The subgroup (warp) size is taken from the target uArch attached to the
    // function; without it the WarpExecuteOnLane0Op cannot be sized.
    auto uArch = getUArch(xegpu::getChipStr(gpuFuncOp).value_or(""));
    if (!uArch)
      return rewriter.notifyMatchFailure(
          gpuFuncOp, "Subgroup distribution requires target attribute attached "
                     "to set the warp size");
    // If the function only contains a single void return, skip.
    if (llvm::all_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
          return isa<gpu::ReturnOp>(op) && !op.getNumOperands();
        }))
      return failure();
    // If the function already moved inside a warp_execute_on_lane0, skip.
    // This check keeps the pattern from firing repeatedly on its own output.
    if (llvm::any_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
          return isa<gpu::WarpExecuteOnLane0Op>(op);
        }))
      return failure();
    // Create a new function with the same signature and same attributes.
    // Workgroup/private attributions must be recreated explicitly since they
    // are not part of the function type.
    SmallVector<Type> workgroupAttributionsTypes =
        llvm::map_to_vector(gpuFuncOp.getWorkgroupAttributions(),
                            [](BlockArgument arg) { return arg.getType(); });
    SmallVector<Type> privateAttributionsTypes =
        llvm::map_to_vector(gpuFuncOp.getPrivateAttributions(),
                            [](BlockArgument arg) { return arg.getType(); });
    auto newGpuFunc = gpu::GPUFuncOp::create(
        rewriter, gpuFuncOp.getLoc(), gpuFuncOp.getName(),
        gpuFuncOp.getFunctionType(), workgroupAttributionsTypes,
        privateAttributionsTypes);
    newGpuFunc->setAttrs(gpuFuncOp->getAttrs());
    // Create a WarpExecuteOnLane0Op with same arguments and results as the
    // original gpuFuncOp.
    rewriter.setInsertionPointToEnd(&newGpuFunc.getFunctionBody().front());
    auto laneId = gpu::LaneIdOp::create(
        rewriter, newGpuFunc.getLoc(), rewriter.getIndexType(),
        /** upperBound = **/ mlir::IntegerAttr());
    ArrayRef<Type> gpuFuncResultType = gpuFuncOp.getFunctionType().getResults();
    auto warpOp = gpu::WarpExecuteOnLane0Op::create(
        rewriter, laneId.getLoc(), gpuFuncResultType, laneId,
        uArch->getSubgroupSize(), newGpuFunc.getArguments(),
        newGpuFunc.getArgumentTypes());
    Block &warpBodyBlock = warpOp.getBodyRegion().front();
    // Replace the ReturnOp of the original gpu function with a YieldOp so the
    // original return values are forwarded out of the warp region.
    auto origRetunOp =
        cast<gpu::ReturnOp>(gpuFuncOp.getBlocks().back().getTerminator());
    rewriter.setInsertionPointAfter(origRetunOp);
    gpu::YieldOp::create(rewriter, origRetunOp.getLoc(),
                         origRetunOp.getOperands());
    rewriter.eraseOp(origRetunOp);
    // Move the original function body to the WarpExecuteOnLane0Op body. The
    // block that was created together with the warp op becomes redundant and
    // is erased afterwards.
    rewriter.inlineRegionBefore(gpuFuncOp.getBody(), warpOp.getBodyRegion(),
                                warpOp.getBodyRegion().begin());
    rewriter.eraseBlock(&warpBodyBlock);
    // Insert a new ReturnOp after the WarpExecuteOnLane0Op.
    rewriter.setInsertionPointAfter(warpOp);
    gpu::ReturnOp::create(rewriter, newGpuFunc.getLoc(), warpOp.getResults());
    rewriter.replaceOp(gpuFuncOp, newGpuFunc);
    return success();
  }
};
| |
| /// Distribute a create_nd_tdesc feeding into vector.yield op of the enclosing |
| /// `gpu.warp_execute_on_lane_0` region. After the sinking, the warp op will |
| /// still contain the original op that will not be used by the yield op (and |
| /// should be cleaned up later). The yield op will bypass the create_nd_tdesc's |
| /// arguments. Tensor descriptor shape is not distributed because it is a |
| /// uniform value across all work items within the subgroup. However, the |
| /// layout information is dropped in the new tensor descriptor type. |
| /// |
| /// Example: |
| /// |
| /// ``` |
| /// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]> |
| /// %r = gpu.warp_execute_on_lane_0(%laneid) -> |
| /// (!xegpu.tensor_desc<4x8xf32, #layout0>) { |
| /// ... |
| /// %td = xegpu.create_nd_tdesc %arg0 |
| /// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #layout0> |
| /// vector.yield %td |
| /// } |
| /// ``` |
| /// To |
| /// ``` |
| /// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (...) { |
| /// ... |
| /// %dead = xegpu.create_nd_tdesc %arg0 |
| /// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #layout0> |
| /// vector.yield %arg0, %dead |
| /// } |
| /// %td = xegpu.create_nd_tdesc %r#0: memref<4x8xf32> |
| /// -> !xegpu.tensor_desc<4x8xf32> |
| /// |
| /// ``` |
struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *operand =
        getWarpResult(warpOp, llvm::IsaPred<xegpu::CreateNdDescOp>);
    if (!operand)
      return rewriter.notifyMatchFailure(
          warpOp, "warp result is not a xegpu::CreateNdDesc op");
    auto descOp = operand->get().getDefiningOp<xegpu::CreateNdDescOp>();
    unsigned operandIdx = operand->getOperandNumber();

    xegpu::LayoutAttr layout = descOp.getType().getLayoutAttr();
    if (!layout)
      return rewriter.notifyMatchFailure(
          descOp, "the tensor descriptor lacks layout attribute");
    // CreateNdOp must not have offsets.
    if (descOp.getMixedOffsets().size())
      return rewriter.notifyMatchFailure(
          descOp, "xegpu::CreateNdDescOp must not have offsets");

    // Yield the create_nd_tdesc operands from the warp op so they are
    // available outside the region; newRetIndices maps each yielded value to
    // the corresponding result of the new warp op.
    SmallVector<size_t> newRetIndices;
    rewriter.setInsertionPoint(warpOp);
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, /* new yieled values = */ descOp->getOperands(),
        /* new yielded types = */ descOp.getOperandTypes(), newRetIndices);

    // Rebuild the descriptor outside the warp op from the escaped operands.
    SmallVector<Value> newDescOperands = llvm::map_to_vector(
        newRetIndices, [&](size_t i) { return newWarpOp.getResult(i); });
    rewriter.setInsertionPointAfter(newWarpOp);
    xegpu::TensorDescType distributedTensorDescTy =
        descOp.getType().dropLayouts(); // Distributed tensor descriptor type
                                        // does not contain layout info.
    Value newDescOp = xegpu::CreateNdDescOp::create(
        rewriter, newWarpOp.getLoc(), distributedTensorDescTy, newDescOperands,
        descOp->getAttrs());

    Value distributedVal = newWarpOp.getResult(operandIdx);
    // Resolve the distributed type to the expected type (inserts a cast if
    // the warp-distributed type differs from the xegpu SIMT type).
    newDescOp =
        resolveDistributedTy(newDescOp, distributedVal.getType(), rewriter);
    rewriter.replaceAllUsesWith(distributedVal, newDescOp);
    return success();
  }
};
| |
| /// Distribute a store_nd op at the end of enclosing |
| /// `gpu.warp_execute_on_lane_0`. In case arguments for the store are passed |
| /// through the warp op interface they would be propagated as returned values. |
| /// Source vector is distributed based on lane layout. Appropriate cast ops are |
| /// inserted if the distributed types does not match expected xegpu SIMT types. |
| /// |
| /// Example: |
| /// |
| /// ``` |
| /// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]> |
| /// gpu.warp_execute_on_lane_0(%laneid) -> () { |
| /// ... |
| /// xegpu.store_nd %arg0, %arg1 [%x, %y]: vector<4x8xf32>, |
| /// !xegpu.tensor_desc<4x8xf32, #layout0> |
| /// } |
| /// ``` |
| /// To |
| /// ``` |
| /// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>, |
| /// !xegpu.tensor_desc<4x8xf32, #layout0>, index, index) { |
| /// ... |
| /// gpu.yield %arg0, %arg1, %x, %y: vector<4x8xf32>, |
| /// !xegpu.tensor_desc<4x8xf32, #layout0>, index, index |
| /// } |
| /// %0 = vector.shape_cast %r#0: vector<4x1xf32> to vector<4xf32> |
| /// %1 = unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32, |
| /// #layout0> |
| /// -> !xegpu.tensor_desc<4x8xf32> |
| /// xegpu.store_nd %0, %1 [%r#2, %r#3]: vector<4xf32>, |
| /// !xegpu.tensor_desc<4x8xf32> |
| /// |
| /// ``` |
struct StoreNdDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    // Only match when the store is the last op before the terminator, so
    // sinking it cannot move it across barriers or other side effects.
    gpu::YieldOp yield = warpOp.getTerminator();
    Operation *lastNode = yield->getPrevNode();
    auto storeOp = dyn_cast_or_null<xegpu::StoreNdOp>(lastNode);
    if (!storeOp)
      return failure();

    SmallVector<OpFoldResult> offsets = storeOp.getMixedOffsets();
    // Expecting offsets to be present.
    if (offsets.empty())
      return rewriter.notifyMatchFailure(storeOp,
                                         "the store op must have offsets");
    SmallVector<Value> offsetsAsValues =
        vector::getAsValues(rewriter, storeOp.getLoc(), offsets);
    SmallVector<Type> offsetTypes = llvm::map_to_vector(
        offsetsAsValues, [](Value v) { return v.getType(); });
    xegpu::TensorDescType tensorDescTy = storeOp.getTensorDescType();
    xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
    if (!layout)
      return rewriter.notifyMatchFailure(
          storeOp, "the source tensor descriptor lacks layout attribute");

    // Compute the vector type the warp op will hand out for the stored value.
    FailureOr<VectorType> distributedTypeByWarpOpOrFailure =
        xegpu::getDistVecTypeBasedOnLaneLayout(layout, storeOp.getValueType());
    if (failed(distributedTypeByWarpOpOrFailure))
      return rewriter.notifyMatchFailure(storeOp,
                                         "Failed to distribute the type");
    VectorType distributedTypeByWarpOp =
        distributedTypeByWarpOpOrFailure.value();

    // Yield order is fixed: [0] value, [1] tensor desc, [2..] offsets. The
    // operand collection below relies on this layout of newRetIndices.
    SmallVector<size_t> newRetIndices;
    SmallVector<Value> newYieldedValues = {storeOp.getValue(),
                                           storeOp.getTensorDesc()};
    SmallVector<Type> newYieldedTypes = {distributedTypeByWarpOp, tensorDescTy};
    newYieldedValues.append(offsetsAsValues.begin(), offsetsAsValues.end());
    newYieldedTypes.append(offsetTypes.begin(), offsetTypes.end());
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, newYieldedValues, newYieldedTypes, newRetIndices);
    // Create a new store op outside the warp op with the distributed vector
    // type. Tensor descriptor is not distributed.
    rewriter.setInsertionPointAfter(newWarpOp);
    SmallVector<Value> newStoreOperands;

    // For the value operand, there can be a mismatch between the vector type
    // distributed by the warp op and (xegpu-specific) distributed type
    // supported by the store op. Type mismatch must be resolved using
    // appropriate cast op.
    FailureOr<VectorType> storeNdDistributedValueTyOrFailure =
        xegpu::getDistributedVectorType(storeOp.getTensorDescType());
    if (failed(storeNdDistributedValueTyOrFailure))
      return rewriter.notifyMatchFailure(
          storeOp, "Failed to get distributed vector type for the store op");
    newStoreOperands.push_back(resolveDistributedTy(
        newWarpOp.getResult(newRetIndices[0]),
        storeNdDistributedValueTyOrFailure.value(), rewriter));
    // For the tensor descriptor operand, the layout attribute is dropped after
    // distribution. Types needs to be resolved in this case also.
    xegpu::TensorDescType distributedTensorDescTy =
        storeOp.getTensorDescType().dropLayouts();
    newStoreOperands.push_back(
        resolveDistributedTy(newWarpOp.getResult(newRetIndices[1]),
                             distributedTensorDescTy, rewriter));
    // Collect offsets. Offsets are uniform and need no type resolution.
    for (size_t i = 2; i < newRetIndices.size(); ++i)
      newStoreOperands.push_back(newWarpOp.getResult(newRetIndices[i]));

    auto newStoreOp =
        xegpu::StoreNdOp::create(rewriter, newWarpOp.getLoc(), TypeRange{},
                                 newStoreOperands, storeOp->getAttrs());
    xegpu::removeLayoutAttrs(newStoreOp);
    rewriter.eraseOp(storeOp);
    return success();
  }
};
| |
| /// Distribute a load_nd op feeding into vector.yield op for the enclosing |
| /// `gpu.warp_execute_on_lane_0` and put it after the warp op. |
| /// The warp op will still contain the original op that will not be used by |
| /// the yield op (and should be cleaned up later). The yield op will |
| /// bypass the load's arguments. Only the loaded vector is distributed |
| /// according to lane layout and, tensor descriptor types is not |
| /// distributed. Appropriate cast ops are inserted if the distributed types does |
| /// not match expected xegpu SIMT types. |
| /// |
| /// Example: |
| /// |
| /// ``` |
| /// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]> |
| /// %r = gpu.warp_execute_on_lane_0(%laneid) -> |
| /// (vector<4x1xf32>) { |
| /// ... |
| /// %ld = xegpu.load_nd %arg0, %arg1: !xegpu.tensor_desc<4x8xf32, #layout0> |
| /// -> |
| /// vector<4x8xf32> |
| /// gpu.yield %ld |
| /// } |
| /// ``` |
| /// To |
| /// ``` |
| /// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>, |
| /// !xegpu.tensor_desc<4x8xf32, #layout0>) { |
| /// ... |
| /// %dead = xegpu.load_nd %arg0: !xegpu.tensor_desc<4x8xf32, #layout0> -> |
| /// vector<4x8xf32> gpu.yield %dead, %arg0 |
| /// } |
| /// %0 = unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32, |
| /// #layout0> -> !xegpu.tensor_desc<4x8xf32> |
| /// %1 = xegpu.load_nd %0: !xegpu.tensor_desc<4x8xf32> -> vector<4xf32> |
| /// %2 = vector.shape_cast %r#0: vector<4xf32> to vector<4x1xf32> |
| /// |
| /// ``` |
struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *operand = getWarpResult(warpOp, [&](Operation *op) {
      if (!isa<xegpu::LoadNdOp>(op))
        return false;
      // Make sure the same load op is the last operation in the warp op body.
      // This ensure that load op is not sinked earlier violating any barrier
      // synchronizations.
      gpu::YieldOp yield = warpOp.getTerminator();
      return yield->getPrevNode() == op;
    });

    if (!operand)
      return rewriter.notifyMatchFailure(
          warpOp, "warp result is not a xegpu::LoadNd op");

    auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>();
    auto uArch = getUArch(xegpu::getChipStr(loadOp).value_or(""));
    if (!uArch)
      return rewriter.notifyMatchFailure(
          loadOp, "xegpu::LoadNdOp require target attribute attached to "
                  "determine transpose "
                  "requirement");
    // Chip information is required to decide if the layout requires transpose
    // effect.
    // Expecting offsets to be present.
    SmallVector<OpFoldResult> offsets = loadOp.getMixedOffsets();
    if (offsets.empty())
      return rewriter.notifyMatchFailure(loadOp,
                                         "the load op must have offsets");
    SmallVector<Value> offsetsAsValues =
        vector::getAsValues(rewriter, loadOp.getLoc(), offsets);
    SmallVector<Type> offsetTypes = llvm::map_to_vector(
        offsetsAsValues, [](Value v) { return v.getType(); });

    xegpu::TensorDescType tensorDescTy = loadOp.getTensorDescType();
    xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
    if (!layout)
      return rewriter.notifyMatchFailure(
          loadOp, "the source tensor descriptor lacks layout attribute");

    // The type the warp op already assigned to the load's result.
    unsigned operandIdx = operand->getOperandNumber();
    VectorType distributedTypeByWarpOp =
        cast<VectorType>(warpOp.getResult(operandIdx).getType());

    // Yield order is fixed: [0] tensor desc, [1..] offsets. The operand
    // collection below relies on this layout of newRetIndices.
    SmallVector<size_t> newRetIndices;
    SmallVector<Value> newYieldedValues = {loadOp.getTensorDesc()};
    SmallVector<Type> newYieldedTypes = {tensorDescTy};
    newYieldedValues.append(offsetsAsValues.begin(), offsetsAsValues.end());
    newYieldedTypes.append(offsetTypes.begin(), offsetTypes.end());
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, newYieldedValues, newYieldedTypes, newRetIndices);

    // Create a new load op outside the warp op with the distributed vector
    // type.
    rewriter.setInsertionPointAfter(newWarpOp);
    FailureOr<VectorType> loadNdDistValueTyOrFailure =
        xegpu::getDistributedVectorType(loadOp.getTensorDescType());
    if (failed(loadNdDistValueTyOrFailure))
      return rewriter.notifyMatchFailure(
          loadOp, "Failed to get distributed vector type for the load op");
    xegpu::TensorDescType distributedTensorDescTy =
        loadOp.getTensorDescType().dropLayouts(); // Distributed tensor
                                                  // descriptor type does not
                                                  // contain layout info.
    SmallVector<Value> newLoadOperands{
        resolveDistributedTy(newWarpOp.getResult(newRetIndices[0]),
                             distributedTensorDescTy, rewriter)};
    // Collect offsets. Offsets are uniform and need no type resolution.
    for (size_t i = 1; i < newRetIndices.size(); ++i)
      newLoadOperands.push_back(newWarpOp.getResult(newRetIndices[i]));
    auto newLoadOp = xegpu::LoadNdOp::create(
        rewriter, newWarpOp.getLoc(), loadNdDistValueTyOrFailure.value(),
        newLoadOperands, loadOp->getAttrs());
    xegpu::removeLayoutAttrs(newLoadOp);
    // Set the packed attribute if the layout requires it.
    newLoadOp.setPacked(xegpu::requirePacked(layout));
    // Set the transpose attribute if the layout requires it.
    if (xegpu::requireTranspose(layout, uArch))
      newLoadOp.setTranspose(
          DenseI64ArrayAttr::get(rewriter.getContext(), {1, 0}));
    Value distributedVal = newWarpOp.getResult(operandIdx);
    // There can be a conflict between the vector type distributed by the
    // warp op and (xegpu-specific) distributed type supported by the load
    // op. Resolve these mismatches by inserting a cast.
    Value tyResolvedVal = resolveDistributedTy(
        newLoadOp.getResult(), distributedTypeByWarpOp, rewriter);
    rewriter.replaceAllUsesWith(distributedVal, tyResolvedVal);
    return success();
  }
};
| |
| /// Distribute a dpas op feeding into vector.yield op for the enclosing |
| /// `gpu.warp_execute_on_lane_0` and put it after the warp op. |
| /// The warp op will still contain the original op that will not be used by |
| /// the yield op (and should be cleaned up later). The yield op will |
| /// bypass the dpas's arguments. Appropriate cast ops are inserted if the |
| /// distributed types does not match expected xegpu SIMT types. |
| /// Example: |
| /// ``` |
| /// #lo_a = #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]> |
| /// #lo_b = #xegpu.layout<wi_layout = [1, 16], wi_data = [2, 1]> |
| /// #lo_c = #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]> |
| /// %r = gpu.warp_execute_on_lane_0(%laneid) -> |
| /// (vector<8x1xf32>) { |
| /// ... |
| /// %dpas = xegpu.dpas %arg0, %arg1: vector<8x16xf16>, vector<16x16xf16> -> |
| /// vector<8x16xf32> |
| /// gpu.yield %dpas |
| /// } |
| /// ``` |
| /// To |
| /// ``` |
| /// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<8x1xf32>, |
| /// vector<8x1xf16>, vector<16x1xf16>) { |
| /// ... |
| /// %dead = xegpu.dpas %arg0, %arg1: vector<8x16xf16>, vector<16x16xf16> |
| /// -> vector<8x16xf32> |
| /// gpu.yield %dead, %arg0, %arg1 |
| /// } |
| /// %0 = vector.shape_cast %r#1: vector<8x1xf16> to vector<8xf16> |
| /// %1 = vector.shape_cast %r#2: vector<16x1xf16> to vector<16xf16> |
| /// %2 = xegpu.dpas %0, %1: vector<8xf16>, vector<16xf16> -> |
| /// vector<8xf32> |
| /// %dpas = vector.shape_cast %2: vector<8xf32> to vector<8x1xf32> |
| /// ``` |
struct DpasDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *operand = getWarpResult(warpOp, llvm::IsaPred<xegpu::DpasOp>);
    if (!operand)
      return rewriter.notifyMatchFailure(warpOp,
                                         "warp result is not a xegpu::Dpas op");

    auto dpasOp = operand->get().getDefiningOp<xegpu::DpasOp>();
    unsigned operandIdx = operand->getOperandNumber();

    // All three layouts (A, B, output/acc) are needed to distribute each
    // operand independently.
    xegpu::LayoutAttr layoutA =
        dyn_cast<xegpu::LayoutAttr>(dpasOp.getLayoutAAttr());
    xegpu::LayoutAttr layoutB =
        dyn_cast<xegpu::LayoutAttr>(dpasOp.getLayoutBAttr());
    xegpu::LayoutAttr layoutOut =
        dyn_cast<xegpu::LayoutAttr>(dpasOp.getLayoutCdAttr());

    if (!layoutA || !layoutB || !layoutOut)
      return rewriter.notifyMatchFailure(
          dpasOp,
          "the xegpu::Dpas op lacks layout attribute for A, B or output");

    // Types the warp op will distribute each operand to (per lane layout).
    FailureOr<VectorType> distLhsTypeByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layoutA, dpasOp.getLhsType());
    FailureOr<VectorType> distRhsTypeByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layoutB, dpasOp.getRhsType());
    FailureOr<VectorType> distResultTypeByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layoutOut, dpasOp.getResultType());

    if (failed(distLhsTypeByWarpOpOrFailure) ||
        failed(distRhsTypeByWarpOpOrFailure) ||
        failed(distResultTypeByWarpOpOrFailure))
      return rewriter.notifyMatchFailure(
          dpasOp,
          "Failed to distribute the A, B or output types in xegpu::Dpas op");

    // Yield order is fixed: [0] lhs, [1] rhs, [2] acc (only if present). The
    // expected-type vector built below mirrors this order exactly.
    llvm::SmallVector<Value, 3> newYieldValues{dpasOp.getLhs(),
                                               dpasOp.getRhs()};
    llvm::SmallVector<Type, 3> newYieldTypes{
        distLhsTypeByWarpOpOrFailure.value(),
        distRhsTypeByWarpOpOrFailure.value()};
    // Dpas acc operand is optional.
    if (dpasOp.getAcc()) {
      newYieldValues.push_back(dpasOp.getAcc());
      newYieldTypes.push_back(distResultTypeByWarpOpOrFailure.value());
    }
    // Create a new warp op without the dpas.
    SmallVector<size_t> newRetIndices;
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices);

    // Types the dpas op itself expects in SIMT form (may differ from the
    // warp-distributed types above; mismatches are resolved with casts).
    FailureOr<VectorType> expectedDistLhsTyOrFailure =
        xegpu::getDistributedVectorType(dpasOp.getLhsType(), layoutA);
    FailureOr<VectorType> expectedDistRhsTyOrFailure =
        xegpu::getDistributedVectorType(dpasOp.getRhsType(), layoutB);
    FailureOr<VectorType> expectedDistResultTyOrFailure =
        xegpu::getDistributedVectorType(dpasOp.getResultType(), layoutOut);

    if (failed(expectedDistLhsTyOrFailure) ||
        failed(expectedDistRhsTyOrFailure) ||
        failed(expectedDistResultTyOrFailure))
      return rewriter.notifyMatchFailure(
          dpasOp,
          "Failed to get distributed vector type for the dpas operands.");
    // Create a new dpas op outside the warp op.
    rewriter.setInsertionPointAfter(newWarpOp);
    SmallVector<Value> newDpasOperands;
    SmallVector<VectorType> newDpasOperandExpectedTypes;

    // Resolve the distributed types with the original types.
    newDpasOperandExpectedTypes.push_back(expectedDistLhsTyOrFailure.value());
    newDpasOperandExpectedTypes.push_back(expectedDistRhsTyOrFailure.value());
    VectorType distributedResultTy = expectedDistResultTyOrFailure.value();
    if (dpasOp.getAcc())
      newDpasOperandExpectedTypes.push_back(distributedResultTy);

    // newRetIndices and newDpasOperandExpectedTypes are index-aligned by
    // construction (both follow the lhs/rhs/[acc] order).
    for (unsigned i = 0; i < newRetIndices.size(); i++) {
      newDpasOperands.push_back(
          resolveDistributedTy(newWarpOp.getResult(newRetIndices[i]),
                               newDpasOperandExpectedTypes[i], rewriter));
    }
    auto newDpasOp = xegpu::DpasOp::create(rewriter, newWarpOp->getLoc(),
                                           distributedResultTy, newDpasOperands,
                                           dpasOp->getAttrs());
    xegpu::removeLayoutAttrs(newDpasOp);
    Value distributedVal = newWarpOp.getResult(operandIdx);
    // Resolve the output type.
    Value typeResolved =
        resolveDistributedTy(newDpasOp.getResult(),
                             distResultTypeByWarpOpOrFailure.value(), rewriter);
    rewriter.replaceAllUsesWith(distributedVal, typeResolved);
    return success();
  }
};
| |
| /// Distribute a prefetch_nd op at the end of enclosing |
| /// `gpu.warp_execute_on_lane_0`. In case arguments for the prefetch are passed |
| /// through the warp op interface they would be propagated as returned values. |
| /// Tensor descriptor shape is not distributed because it is a uniform value |
| /// across all work items within the subgroup. Appropriate cast ops are inserted |
| /// if the distributed types does not match expected xegpu SIMT types. |
| /// |
| /// Example: |
| /// |
| /// ``` |
| /// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]> |
| /// gpu.warp_execute_on_lane_0(%laneid) -> () { |
| /// ... |
| /// xegpu.prefetch_nd %arg0 [%x, %y] : !xegpu.tensor_desc<4x8xf32, #layout0> |
| /// } |
| /// ``` |
| /// To |
| /// ``` |
| /// %r:1 = gpu.warp_execute_on_lane_0(%laneid) -> ( |
| /// !xegpu.tensor_desc<4x8xf32, #layout0>, index, index) { |
| /// gpu.yield %arg0, %x, %y: !xegpu.tensor_desc<4x8xf32, #layout0>, index, |
| /// index |
| /// } |
| /// %1 = unrealized_conversion_cast %r#0: !xegpu.tensor_desc<4x8xf32, |
| /// #layout0> -> !xegpu.tensor_desc<4x8xf32> |
| /// xegpu.prefetch_nd %1 [%r#1, %r#2] : !xegpu.tensor_desc<4x8xf32> |
| /// |
| /// ``` |
struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    // Only match when the prefetch is the last op before the terminator, so
    // sinking it cannot move it across barriers or other side effects.
    gpu::YieldOp yield = warpOp.getTerminator();
    Operation *lastNode = yield->getPrevNode();
    auto prefetchOp = dyn_cast_or_null<xegpu::PrefetchNdOp>(lastNode);
    if (!prefetchOp)
      return failure();

    SmallVector<OpFoldResult> offsets = prefetchOp.getMixedOffsets();
    // PrefetchNdOp must have offsets.
    if (offsets.empty())
      return rewriter.notifyMatchFailure(prefetchOp,
                                         "the prefetch op must have offsets");
    SmallVector<Value> offsetsAsValues =
        vector::getAsValues(rewriter, prefetchOp.getLoc(), offsets);
    SmallVector<Type> offsetTypes = llvm::map_to_vector(
        offsetsAsValues, [](Value v) { return v.getType(); });

    xegpu::LayoutAttr layout = prefetchOp.getTensorDescType().getLayoutAttr();
    if (!layout)
      return rewriter.notifyMatchFailure(
          prefetchOp, "the source tensor descriptor lacks layout attribute");

    // Yield order is fixed: [0] tensor desc, [1..] offsets. The operand
    // collection below relies on this layout of newRetIndices.
    SmallVector<Value> newYieldValues = {prefetchOp.getTensorDesc()};
    SmallVector<Type> newYieldTypes = {prefetchOp.getTensorDescType()};
    newYieldValues.append(offsetsAsValues.begin(), offsetsAsValues.end());
    newYieldTypes.append(offsetTypes.begin(), offsetTypes.end());
    SmallVector<size_t> newRetIndices;
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices);
    // Create a new prefetch op outside the warp op with updated tensor
    // descriptor type. Source tensor descriptor require type resolution.
    xegpu::TensorDescType newTensorDescTy =
        prefetchOp.getTensorDescType().dropLayouts();
    rewriter.setInsertionPointAfter(newWarpOp);
    SmallVector<Value> newPrefetchOperands = {resolveDistributedTy(
        newWarpOp.getResult(newRetIndices[0]), newTensorDescTy, rewriter)};
    // Collect offsets. Offsets are uniform and need no type resolution.
    for (size_t i = 1; i < newRetIndices.size(); ++i)
      newPrefetchOperands.push_back(newWarpOp.getResult(newRetIndices[i]));
    Operation *newPrefetchOp = xegpu::PrefetchNdOp::create(
        rewriter, newWarpOp.getLoc(), TypeRange{}, newPrefetchOperands,
        prefetchOp->getAttrs());
    xegpu::removeLayoutAttrs(newPrefetchOp);
    rewriter.eraseOp(prefetchOp);
    return success();
  }
};
| |
| /// Sink a gpu::BarrierOp at the end of enclosing `gpu.warp_execute_on_lane_0` |
| /// region. This will simply move the barrier op outside of the warp op. |
| struct GpuBarrierDistribution final : public gpu::WarpDistributionPattern { |
| using gpu::WarpDistributionPattern::WarpDistributionPattern; |
| LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, |
| PatternRewriter &rewriter) const override { |
| gpu::YieldOp yield = warpOp.getTerminator(); |
| Operation *lastNode = yield->getPrevNode(); |
| // The last node must be a gpu::BarrierOp. |
| auto barrierOp = dyn_cast_or_null<gpu::BarrierOp>(lastNode); |
| if (!barrierOp) |
| return failure(); |
| // Move the barrier op outside of the warp op. |
| rewriter.setInsertionPointAfter(warpOp); |
| gpu::BarrierOp::create(rewriter, barrierOp.getLoc(), |
| barrierOp->getResultTypes(), |
| barrierOp->getOperands(), barrierOp->getAttrs()); |
| rewriter.eraseOp(barrierOp); |
| return success(); |
| } |
| }; |
| |
| /// Distribute a scattered store op. The offsets argument is required. |
| /// Both offset and mask vectors must be 1D and have #subgroup_size elements. |
| /// The layouts are fixed and implicit: one offset/mask per lane. |
| /// The pass changes the offset/mask vector shapes to a |
| /// single-element vector, **it is assumed that their producer will also be |
| /// distributed**. The payload vector also has a fixed distribution: |
| /// no chunk size -> vector of one element. |
| /// chunk size -> vector of the innermost dimension of the SG-payload. |
| /// Example 1 (no chunk size): |
| /// %mask = producer_op : vector<16xi1> |
| /// %offset = producer_op : vector<16xindex> |
| /// xegpu.store %payload, %src[%offset], %mask : vector<16xf16>, |
| /// memref<256xf16>, vector<16xindex>, vector<16xi1> |
| /// To |
| /// %mask = producer_op : vector<1xi1> |
| /// %offset = producer_op : vector<1xindex> |
| /// xegpu.store %payload, %src[%offset], %mask : vector<1xf16>, |
| /// memref<256xf16>, vector<1xindex>, vector<1xi1> |
| /// Example 2 (chunk size, same mask and offsets): |
| /// xegpu.store %payload, %src[%offset], %mask <{chunk_size=8}> : |
| /// vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> |
| /// To |
| /// xegpu.store %payload, %src[%offset], %mask <{chunk_size=8}> : |
| /// vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> |
| /// |
| /// Note that the store distribution pattern also handles leading unit |
| /// dimensions in the payload, mask and offsets vectors. In this case the store |
| /// distribution will only change the dimensions corresponding to the SG |
| /// distribution and keep the leading unit dimensions unchanged. |
/// For example, a store with payload vector<1x16xf16> with lane layout [1, 16]
/// will be distributed as vector<1x1xf16>. Shape-cast ops are inserted for the
/// offset/mask/payload when necessary so that the distributed store works
/// on a 1D vector to match the HW capability.
struct StoreDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    // Only match a store that immediately precedes the terminator, so the
    // store is never sunk past other ops in the region.
    Operation *lastNode = warpOp.getTerminator()->getPrevNode();
    auto storeScatterOp = dyn_cast_or_null<xegpu::StoreScatterOp>(lastNode);
    if (!storeScatterOp)
      return failure();
    auto offsets = storeScatterOp.getOffsets();
    if (!offsets || !isa<VectorType>(offsets.getType()))
      return rewriter.notifyMatchFailure(
          storeScatterOp, "Store op must have a vector of offsets argument");
    VectorType offsetsTy = cast<VectorType>(offsets.getType());
    VectorType maskTy = cast<VectorType>(storeScatterOp.getMask().getType());
    VectorType storeVecTy = cast<VectorType>(storeScatterOp.getValueType());

    // Without a chunk size the meaningful payload shape is 1D (one element
    // per lane); with a chunk size it is 2D (lanes x chunk). Any extra
    // leading dimensions are only accepted if they are unit dimensions.
    int chunkSize = storeScatterOp.getChunkSize().value_or(1);
    int effectiveVecRank = (chunkSize == 1) ? 1 : 2;

    // Check that all leading dimensions are unit dimensions.
    for (int i = 0; i < storeVecTy.getRank() - effectiveVecRank; i++) {
      if (storeVecTy.getShape()[i] != 1) {
        return rewriter.notifyMatchFailure(
            storeScatterOp, "Only unit dimensions allowed for the leading "
                            "dimensions of the store vector!");
      }
    }

    // Per-operand layouts: payload is operand 0, offsets operand 2 and the
    // mask operand 3. These drive the per-lane distributed vector types.
    auto layoutPayload =
        xegpu::getTemporaryLayout(storeScatterOp->getOpOperand(0));
    auto layoutOffsets =
        xegpu::getTemporaryLayout(storeScatterOp->getOpOperand(2));
    auto layoutMask =
        xegpu::getTemporaryLayout(storeScatterOp->getOpOperand(3));

    FailureOr<VectorType> distStoreVecByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layoutPayload, storeVecTy);
    FailureOr<VectorType> distOffsetsByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layoutOffsets, offsetsTy);
    FailureOr<VectorType> distMaskByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layoutMask, maskTy);
    if (failed(distStoreVecByWarpOpOrFailure) ||
        failed(distOffsetsByWarpOpOrFailure) ||
        failed(distMaskByWarpOpOrFailure)) {
      return rewriter.notifyMatchFailure(
          storeScatterOp,
          "Some vector operands have no layouts, using defaults instead.");
    }

    VectorType distPayloadTy = distStoreVecByWarpOpOrFailure.value();
    VectorType distOffsetsTy = distOffsetsByWarpOpOrFailure.value();
    VectorType distMaskTy = distMaskByWarpOpOrFailure.value();

    // Yield all store operands (payload, source, offsets, mask) from the
    // warp op with the distributed vector types.
    SmallVector<size_t> newRetIndices;
    SmallVector<Value> operands = storeScatterOp->getOperands();
    SmallVector<Type> operandTypesToYield = {
        distPayloadTy, operands[1].getType(), distOffsetsTy, distMaskTy};

    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, operands, operandTypesToYield, newRetIndices);

    rewriter.setInsertionPointAfter(newWarpOp);

    // Distributed store payload type is always 1D without leading unit dims;
    // the same flattening is applied to the offsets and the mask.
    VectorType payloadTy1D = VectorType::get({distPayloadTy.getNumElements()},
                                             distPayloadTy.getElementType());

    VectorType distOffsetsTy1D = VectorType::get(
        {distOffsetsTy.getNumElements()}, distOffsetsTy.getElementType());
    VectorType distMaskTy1D = VectorType::get({distMaskTy.getNumElements()},
                                              distMaskTy.getElementType());

    // Resolve distributed types to 1D for SIMT execution.
    Value distPayloadVal = resolveDistributedTy(
        newWarpOp.getResult(newRetIndices[0]), payloadTy1D, rewriter);
    Value distOffsetVal = resolveDistributedTy(
        newWarpOp.getResult(newRetIndices[2]), distOffsetsTy1D, rewriter);
    Value distMaskVal = resolveDistributedTy(
        newWarpOp.getResult(newRetIndices[3]), distMaskTy1D, rewriter);

    SmallVector<Value> newStoreScatterOpOperands = {
        distPayloadVal, newWarpOp.getResult(newRetIndices[1]), distOffsetVal,
        distMaskVal};

    // Recreate the store outside the warp op with the distributed operands
    // and strip layout attributes that no longer apply after distribution.
    xegpu::StoreScatterOp newOp = xegpu::StoreScatterOp::create(
        rewriter, newWarpOp.getLoc(), TypeRange{}, newStoreScatterOpOperands,
        storeScatterOp->getAttrs());
    xegpu::removeLayoutAttrs(newOp);
    rewriter.eraseOp(storeScatterOp);
    return success();
  }
};
| |
| static SmallVector<Value> computeDistributedCoordinatesForMatrixOp( |
| PatternRewriter &rewriter, Location loc, xegpu::DistributeLayoutAttr layout, |
| Value laneId, ArrayRef<int64_t> payloadShape, ValueRange origOffsets) { |
| SmallVector<Value> newCoods; |
| auto maybeCoords = |
| layout.computeDistributedCoords(rewriter, loc, laneId, payloadShape); |
| if (failed(maybeCoords)) |
| return {}; |
| assert(maybeCoords.value().size() == 1 && |
| "Expected one set of distributed offsets"); |
| SmallVector<OpFoldResult> ofrVec = xegpu::addWithRightAligned( |
| rewriter, loc, getAsOpFoldResult(maybeCoords.value()[0]), |
| getAsOpFoldResult(origOffsets)); |
| newCoods = llvm::map_to_vector(ofrVec, llvm::CastTo<Value>); |
| return newCoods; |
| } |
| |
| /// Pattern for distributing xegpu::LoadMatrixOp. |
struct LoadMatrixDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    // Only match a load_matrix that immediately precedes the terminator.
    gpu::YieldOp yield = warpOp.getTerminator();
    Operation *lastNode = yield->getPrevNode();
    auto matrixOp = dyn_cast_or_null<xegpu::LoadMatrixOp>(lastNode);
    if (!matrixOp)
      return failure();

    // The load's result must also be yielded by the warp op, and the yielded
    // value must come from this exact op.
    OpOperand *producedByLastLoad = getWarpResult(warpOp, [&](Operation *op) {
      return isa<xegpu::LoadMatrixOp>(op) && matrixOp == op;
    });
    if (!producedByLastLoad)
      return rewriter.notifyMatchFailure(
          warpOp, "The last op is not xegpu::LoadMatrixOp");
    const int operandIdx = producedByLastLoad->getOperandNumber();

    VectorType sgPayloadTy =
        dyn_cast<VectorType>(matrixOp.getResult().getType());
    VectorType warpResultTy =
        cast<VectorType>(warpOp.getResult(operandIdx).getType());
    if (!sgPayloadTy)
      return rewriter.notifyMatchFailure(
          matrixOp, "the matrix op payload must be a vector type");

    auto loc = matrixOp.getLoc();
    auto offsets = matrixOp.getMixedOffsets();
    if (offsets.empty())
      return rewriter.notifyMatchFailure(matrixOp,
                                         "the load op must have offsets");
    SmallVector<Value> offsetsAsValues =
        vector::getAsValues(rewriter, matrixOp.getLoc(), offsets);

    auto layout = matrixOp.getLayoutAttr();
    if (!layout)
      return rewriter.notifyMatchFailure(
          matrixOp, "the matrix operation lacks layout attribute");

    FailureOr<VectorType> distPayloadByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layout, sgPayloadTy);
    if (failed(distPayloadByWarpOpOrFailure))
      return rewriter.notifyMatchFailure(
          matrixOp, "Failed to distribute matrix op payload based on layout.");

    // Yield the memory descriptor followed by all offset values from the
    // warp op so the load can be recreated outside of it.
    SmallVector<Value> operands = {matrixOp.getMemDesc()};
    const unsigned offsetsStartIdx = operands.size();
    operands.append(offsetsAsValues);

    SmallVector<Type> operandTypes =
        llvm::map_to_vector(operands, [](Value v) { return v.getType(); });

    SmallVector<size_t> newRetIndices;
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, operands, operandTypes, newRetIndices);
    SmallVector<Value> newOperands = llvm::map_to_vector(
        newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });

    // All offsets are now carried as SSA values, so every constant offset
    // slot is marked dynamic.
    SmallVector<int64_t> newConstOffsets(matrixOp.getConstOffsets().size(),
                                         ShapedType::kDynamic);
    DenseI64ArrayAttr newConstOffsetsAttr =
        rewriter.getDenseI64ArrayAttr(newConstOffsets);
    ValueRange currentOffsets =
        ValueRange(newOperands).drop_front(offsetsStartIdx);

    SmallVector<Value> newCoords = currentOffsets;
    rewriter.setInsertionPointAfter(newWarpOp);

    // Without the subgroup_block_io attribute, each lane addresses its own
    // slice, so lane-local coordinates are added to the subgroup offsets.
    // NOTE(review): computeDistributedCoordinatesForMatrixOp returns an empty
    // vector on failure and that case is not checked here - confirm it cannot
    // fail once this point is reached (the IR was already mutated above).
    if (!matrixOp.getSubgroupBlockIoAttr()) {
      newCoords = computeDistributedCoordinatesForMatrixOp(
          rewriter, loc, layout, newWarpOp.getLaneid(), sgPayloadTy.getShape(),
          currentOffsets);
    }
    // Recreate the load outside the warp op with the distributed payload
    // type and no layout attribute.
    xegpu::LoadMatrixOp newOp = xegpu::LoadMatrixOp::create(
        rewriter, newWarpOp.getLoc(), *distPayloadByWarpOpOrFailure,
        newOperands[0], ValueRange(newCoords), newConstOffsetsAttr,
        matrixOp.getSubgroupBlockIoAttr(), xegpu::DistributeLayoutAttr{});
    // Resolve the output type and replace all uses.
    rewriter.replaceAllUsesWith(
        newWarpOp.getResult(operandIdx),
        resolveDistributedTy(newOp.getResult(), warpResultTy, rewriter));
    return success();
  }
};
| |
| /// Pattern for distributing xegpu::StoreMatrixOp. |
struct StoreMatrixDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    // Only match a store_matrix that immediately precedes the terminator.
    gpu::YieldOp yield = warpOp.getTerminator();
    Operation *lastNode = yield->getPrevNode();
    auto matrixOp = dyn_cast_or_null<xegpu::StoreMatrixOp>(lastNode);
    if (!matrixOp)
      return failure();

    VectorType sgPayloadTy = dyn_cast<VectorType>(matrixOp.getData().getType());
    if (!sgPayloadTy)
      return rewriter.notifyMatchFailure(
          matrixOp, "the matrix op payload must be a vector type");

    auto loc = matrixOp.getLoc();
    auto offsets = matrixOp.getMixedOffsets();
    if (offsets.empty())
      return rewriter.notifyMatchFailure(matrixOp,
                                         "the store op must have offsets");
    SmallVector<Value> offsetsAsValues =
        vector::getAsValues(rewriter, matrixOp.getLoc(), offsets);

    auto layout = matrixOp.getLayoutAttr();
    if (!layout)
      return rewriter.notifyMatchFailure(
          matrixOp, "the matrix operation lacks layout attribute");

    FailureOr<VectorType> distPayloadByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layout, sgPayloadTy);
    if (failed(distPayloadByWarpOpOrFailure))
      return rewriter.notifyMatchFailure(
          matrixOp, "Failed to distribute matrix op payload based on layout.");

    // Yield the payload, the memory descriptor and all offset values; the
    // payload is yielded with its distributed type.
    SmallVector<Value> operands = {matrixOp.getData(), matrixOp.getMemDesc()};
    const unsigned offsetsStartIdx = operands.size();
    operands.append(offsetsAsValues);

    SmallVector<Type> operandTypes =
        llvm::map_to_vector(operands, [](Value v) { return v.getType(); });
    operandTypes[0] = *distPayloadByWarpOpOrFailure;

    SmallVector<size_t> newRetIndices;
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, operands, operandTypes, newRetIndices);
    SmallVector<Value> newOperands = llvm::map_to_vector(
        newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });

    // All offsets are now carried as SSA values, so every constant offset
    // slot is marked dynamic.
    SmallVector<int64_t> newConstOffsets(matrixOp.getConstOffsets().size(),
                                         ShapedType::kDynamic);
    DenseI64ArrayAttr newConstOffsetsAttr =
        rewriter.getDenseI64ArrayAttr(newConstOffsets);
    ValueRange currentOffsets =
        ValueRange(newOperands).drop_front(offsetsStartIdx);

    SmallVector<Value> newCoords = currentOffsets;
    rewriter.setInsertionPointAfter(newWarpOp);

    // Without the subgroup_block_io attribute, each lane addresses its own
    // slice, so lane-local coordinates are added to the subgroup offsets.
    // NOTE(review): computeDistributedCoordinatesForMatrixOp returns an empty
    // vector on failure and that case is not checked here - confirm it cannot
    // fail once this point is reached (the IR was already mutated above).
    if (!matrixOp.getSubgroupBlockIoAttr()) {
      newCoords = computeDistributedCoordinatesForMatrixOp(
          rewriter, loc, layout, newWarpOp.getLaneid(), sgPayloadTy.getShape(),
          currentOffsets);
    }

    // Recreate the store outside the warp op with no layout attribute and
    // remove the original.
    xegpu::StoreMatrixOp::create(
        rewriter, loc, TypeRange{}, newOperands[0], newOperands[1],
        ValueRange(newCoords), newConstOffsetsAttr,
        matrixOp.getSubgroupBlockIoAttr(), xegpu::DistributeLayoutAttr{});
    rewriter.eraseOp(matrixOp);
    return success();
  }
};
| |
| /// Distribute a scattered load op. The logic and requirements are the same as |
| /// for the scattered store distribution. The warpOp's payload vector is |
| /// expected to be distributed by the load's result consumer. |
| /// Example 1 (no chunk size): |
| /// %mask = producer_op : vector<16xi1> |
| /// %offset = producer_op : vector<16xindex> |
| /// %0 = xegpu.load %payload, %src[%offset], %mask : memref<256xf16>, |
| /// vector<16xindex>, vector<16xi1> -> vector<16xf16> |
| /// To |
| /// %mask = producer_op : vector<1xi1> |
| /// %offset = producer_op : vector<1xindex> |
| /// %0 = xegpu.load %payload, %src[%offset], %mask : memref<256xf16>, |
| /// vector<1xindex>, vector<1xi1> -> vector<1xf16> |
| /// Example 2 (chunk size, same mask and offsets): |
| /// %0 = xegpu.load %payload, %src[%offset], %mask <{chunk_size=8}> : |
| /// memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> |
| /// To |
| /// %0 = xegpu.load %payload, %src[%offset], %mask <{chunk_size=8}> : |
| /// memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> |
| /// |
/// Note that the load distribution pattern also handles leading unit dimensions
/// in the payload, mask, and offsets vectors. The load distribution will only
/// change the dimensions corresponding to the SG distribution and keep the
/// leading unit dimensions unchanged. For example, a load with result type
/// vector<1x16xf16> with lane layout [1, 16] will be distributed
/// as result type vector<1x1xf16>. Shape-cast ops are inserted for the
/// offset/mask/payload when necessary so that the distributed load works
/// on a 1D vector to match the HW capability.
struct LoadDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *producedByLastLoad = getWarpResult(warpOp, [&](Operation *op) {
      // Check if the yield operand that was produced by the *last* scattered
      // load op to avoid sinking it before barriers (maintain memory order).
      return isa<xegpu::LoadGatherOp>(op) &&
             warpOp.getTerminator()->getPrevNode() == op;
    });
    if (!producedByLastLoad)
      return rewriter.notifyMatchFailure(
          warpOp, "The last op is not xegpu::LoadGatherOp");

    auto loadGatherOp =
        producedByLastLoad->get().getDefiningOp<xegpu::LoadGatherOp>();
    auto offsets = loadGatherOp.getOffsets();
    if (!offsets || !isa<VectorType>(offsets.getType()) ||
        !isa<VectorType>(loadGatherOp.getMask().getType()))
      return rewriter.notifyMatchFailure(
          loadGatherOp,
          "Load op must have a vector arguments for offsets and mask");
    VectorType offsetsTy = cast<VectorType>(offsets.getType());
    VectorType maskTy = cast<VectorType>(loadGatherOp.getMask().getType());
    VectorType resultVecTy =
        cast<VectorType>(loadGatherOp.getResult().getType());
    // Without a chunk size the meaningful result shape is 1D (one element
    // per lane); with a chunk size it is 2D (lanes x chunk). Any extra
    // leading dimensions are only accepted if they are unit dimensions.
    int chunkSize = loadGatherOp.getChunkSize().value_or(1);
    int effectiveVecRank = (chunkSize == 1) ? 1 : 2;
    for (int i = 0; i < resultVecTy.getRank() - effectiveVecRank; i++) {
      if (resultVecTy.getShape()[i] != 1) {
        return rewriter.notifyMatchFailure(
            loadGatherOp, "Only unit dimensions allowed for the leading "
                          "dimensions of the load vector!");
      }
    }

    // Per-operand layouts: offsets is operand 1, the mask operand 2.
    auto layoutOffsets =
        xegpu::getTemporaryLayout(loadGatherOp->getOpOperand(1));
    auto layoutMask = xegpu::getTemporaryLayout(loadGatherOp->getOpOperand(2));

    FailureOr<VectorType> distOffsetsByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layoutOffsets, offsetsTy);
    FailureOr<VectorType> distMaskByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layoutMask, maskTy);
    if (failed(distOffsetsByWarpOpOrFailure) ||
        failed(distMaskByWarpOpOrFailure)) {
      return rewriter.notifyMatchFailure(
          loadGatherOp,
          "Some vector operands have no layouts, using defaults instead.");
    }

    // Yield all load operands (source, offsets, mask) from the warp op with
    // the distributed vector types for offsets and mask.
    SmallVector<size_t> newRetIndices;
    SmallVector<Value> operands = loadGatherOp->getOperands();

    // The result type is taken from the warp op result (distributed by the
    // load's consumer, see the pattern documentation above).
    const unsigned operandIdx = producedByLastLoad->getOperandNumber();
    VectorType distResultTy =
        cast<VectorType>(warpOp.getResult(operandIdx).getType());
    VectorType distOffsetsTy = distOffsetsByWarpOpOrFailure.value();
    VectorType distMaskTy = distMaskByWarpOpOrFailure.value();

    SmallVector<Type> operandTypesToYield = {operands[0].getType(),
                                             distOffsetsTy, distMaskTy};

    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, operands, operandTypesToYield, newRetIndices);

    rewriter.setInsertionPointAfter(newWarpOp);

    // Distributed load op will always be 1D.
    VectorType loadVecTy1D = VectorType::get({distResultTy.getNumElements()},
                                             distResultTy.getElementType());

    // Flatten the distributed offsets and mask types to 1D as well.
    VectorType distOffsetsTy1D =
        VectorType::get({distOffsetsByWarpOpOrFailure.value().getNumElements()},
                        distOffsetsByWarpOpOrFailure.value().getElementType());
    VectorType distMaskTy1D =
        VectorType::get({distMaskByWarpOpOrFailure.value().getNumElements()},
                        distMaskByWarpOpOrFailure.value().getElementType());

    // Resolve distributed types to 1D for SIMT execution.
    Value distOffsetVal = resolveDistributedTy(
        newWarpOp.getResult(newRetIndices[1]), distOffsetsTy1D, rewriter);
    Value distmaskVal = resolveDistributedTy(
        newWarpOp.getResult(newRetIndices[2]), distMaskTy1D, rewriter);

    SmallVector<Value> newLoadGatherOperands = {
        newWarpOp.getResult(newRetIndices[0]), distOffsetVal, distmaskVal};

    // Recreate the load outside the warp op and strip layout attributes
    // that no longer apply after distribution.
    xegpu::LoadGatherOp newOp = xegpu::LoadGatherOp::create(
        rewriter, newWarpOp.getLoc(), loadVecTy1D, newLoadGatherOperands,
        loadGatherOp->getAttrs());
    xegpu::removeLayoutAttrs(newOp);
    Value distributedVal = newWarpOp.getResult(operandIdx);
    // Resolve the output type and replace all uses.
    rewriter.replaceAllUsesWith(
        distributedVal,
        resolveDistributedTy(newOp.getResult(), distResultTy, rewriter));
    return success();
  }
};
| |
| // Sink SG-uniform ops. An op is uniform if none |
| // of its operands/results has a distribution layout attribute. |
| // Non-uniform vectors are handled by dedicated patterns. |
| // This pattern must have a higher priority than vector dialect distribution |
| // patterns, because a distributable shape may be logically intended as |
| // uniform (i.e., no layout), so we want to omit its distribution. |
struct SinkUniformOps final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    // Take the last op before the terminator as the sinking candidate.
    Operation *warpRegionPreYieldOp = warpOp.getTerminator()->getPrevNode();
    // Any ops with nested regions must be handled carefully in dedicated
    // patterns.
    if (!warpRegionPreYieldOp || warpRegionPreYieldOp->getNumRegions())
      return failure();
    int operandIdx = -1;
    if (warpRegionPreYieldOp->getNumResults()) {
      // If the op produces results, one of them must be yielded by the warp
      // op so uses outside the region can be rewired below.
      OpOperand *operand = getWarpResult(
          warpOp, [&](Operation *op) { return warpRegionPreYieldOp == op; });
      if (!operand)
        return failure();
      operandIdx = operand->getOperandNumber();
      // A uniform result keeps the same type across the warp boundary; a
      // type change indicates the value is distributed.
      if (warpRegionPreYieldOp->getResult(0).getType() !=
          warpOp.getResult(operandIdx).getType())
        return rewriter.notifyMatchFailure(warpOp,
                                           "The op result is not uniform.");
    }

    // The op must have no layout-based operands or results.
    bool uniformValuesOnly =
        llvm::all_of(warpRegionPreYieldOp->getResults(), [](Value v) {
          return !xegpu::getDistributeLayoutAttr(v);
        });
    uniformValuesOnly &=
        llvm::all_of(warpRegionPreYieldOp->getOpOperands(), [](OpOperand &opr) {
          return !xegpu::getDistributeLayoutAttr(opr);
        });
    if (!uniformValuesOnly)
      return rewriter.notifyMatchFailure(warpOp,
                                         "Some values are not uniform.");
    // Yield all of the op's operands from the warp op so the cloned op can
    // use them outside the region.
    SmallVector<size_t> newRetIndices;
    SmallVector<Value> operands =
        llvm::to_vector_of<Value>(warpRegionPreYieldOp->getOperands());
    SmallVector<Type> operandTypes =
        llvm::to_vector_of<Type>(warpRegionPreYieldOp->getOperandTypes());
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, operands, operandTypes, newRetIndices);

    rewriter.setInsertionPointAfter(newWarpOp);
    // Map each original operand to the corresponding warp op result and
    // clone the op after the warp op using that mapping.
    IRMapping operandMapper;
    for (auto [oldOperandIdx, newOperandIdx] : llvm::enumerate(newRetIndices))
      operandMapper.map(warpRegionPreYieldOp->getOperand(oldOperandIdx),
                        newWarpOp->getResult(newOperandIdx));
    Operation *clonedOp = rewriter.clone(*warpRegionPreYieldOp, operandMapper);
    if (!clonedOp->getNumResults())
      // No results: the original op inside the region is simply removed.
      rewriter.eraseOp(warpRegionPreYieldOp);
    else {
      // Redirect outside uses of the warp op result to the cloned op.
      assert(operandIdx != -1 && "Expected a warp result for the operation");
      rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx),
                                  clonedOp->getResult(0));
    }
    return success();
  }
};
| |
| /// This patterns distribute the `vector.multi_reduction` operation across |
| /// lanes in a warp. Currently only 2D to 1D reductions are supported. Given |
| /// layouts for the source and accumulator vectors, |
| /// * If the reduction dimension is distributed across lanes, the reduction is |
| /// non-lane-local and the reduction is done using warp shuffles. Here we |
| /// simply rewrite the MultiDimReductionOp to a sequence of ReductionOps in |
| /// the warp op body. |
| /// * If the reduction dimension is not distributed across lanes, the reduction |
| /// is lane-local. In this case, we yield the source and accumulator vectors |
| /// from the warp op and perform the lane-local reduction outside the warp op |
| /// using a sequence of ReductionOps. |
| /// Example 1 (Reduction is lane-local): |
| /// ``` |
| /// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) { |
| /// %0 = "some_def"() : () -> (vector<16x32xf32>) |
| /// %acc = "some_def"() : () -> (vector<32xf32>) |
| /// %1 = vector.multi_reduction <add>, %0, %acc [0] : vector<16x32xf32> to |
| /// vector<32xf32> gpu.yield %1 : vector<32xf32> |
| /// } |
| /// ``` |
| /// is lowered to: |
| /// ``` |
| /// %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<16x1xf32>, |
| /// vector<1xf32>) { |
| /// %0 = "some_def"() : () -> (vector<16x32xf32>) |
| /// %acc = "some_def"() : () -> (vector<32xf32>) |
| /// gpu.yield %0, %acc : vector<16x32xf32>, vector<32xf32> |
| /// } |
| /// %c = arith.constant dense<0.0> : vector<1xf32> |
| /// %1 = vector.shape_cast %r#0 : vector<16x1xf32> to vector<16xf32> |
| /// %2 = vector.reduction <add>, %1, %r#1 : vector<16xf32> to f32 |
| /// %3 = vector.insert %2, %c[0] : f32 into vector<1xf32> |
| /// ``` |
| /// Example 2 (Reduction is non-lane-local): |
| /// ``` |
| /// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) { |
| /// %0 = "some_def"() : () -> (vector<2x32xf32>) |
| /// %acc = "some_def"() : () -> (vector<2xf32>) |
| /// %1 = vector.multi_reduction <add>, %0, %acc [1] : vector<2x32xf32> to |
| /// vector<2xf32> |
| /// gpu.yield %1 : vector<2xf32> |
| /// } |
| /// ``` |
| /// is lowered to: |
| /// ``` |
| /// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) { |
| /// %0 = "some_def"() : () -> (vector<2x32xf32>) |
| /// %acc = "some_def"() : () -> (vector<2xf32>) |
| /// %1 = arith.constant dense<0.0> : vector<2xf32> |
| /// %2 = vector.extract %0[0] : vector<32xf32> from <vector<2x32xf32>> |
| /// %3 = ("warp.reduction %2") : f32 |
| /// %4 = vector.insert %3, %1[0] : f32 into vector<2xf32> |
| /// ... repeat for row 1 |
| /// gpu.yield %1 : vector<2xf32> |
| /// } |
struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    // Find a yielded value produced by a vector.multi_reduction.
    OpOperand *yieldOperand =
        getWarpResult(warpOp, llvm::IsaPred<vector::MultiDimReductionOp>);
    if (!yieldOperand)
      return failure();
    auto reductionOp =
        cast<vector::MultiDimReductionOp>(yieldOperand->get().getDefiningOp());
    unsigned operandIdx = yieldOperand->getOperandNumber();
    VectorType sourceType = reductionOp.getSourceVectorType();
    // Only 2D vectors are supported.
    if (sourceType.getRank() != 2)
      return rewriter.notifyMatchFailure(warpOp,
                                         "Only 2D reductions are supported.");
    ArrayRef<int64_t> reductionDims = reductionOp.getReductionDims();
    // Only 1 reduction dimension supported. This also ensures that the result
    // is vector type.
    if (reductionDims.size() != 1)
      return rewriter.notifyMatchFailure(
          warpOp, "Only 1 reduction dimension is supported.");
    int64_t reductionDim = reductionDims[0];
    VectorType distributedResultType =
        cast<VectorType>(warpOp.getResult(operandIdx).getType());
    VectorType resultType = cast<VectorType>(reductionOp.getType());
    xegpu::DistributeLayoutAttr sourceLayout =
        xegpu::getTemporaryLayout(reductionOp->getOpOperand(0));

    FailureOr<VectorType> sourceDistTypeOrFailure =
        getDistVecTypeBasedOnLaneLayout(sourceLayout, sourceType);
    if (failed(sourceDistTypeOrFailure))
      return rewriter.notifyMatchFailure(
          warpOp, "Failed to distribute the source vector type.");
    VectorType sourceDistType = sourceDistTypeOrFailure.value();
    // Only single dimension distribution is supported. A dimension counts as
    // distributed when its distributed extent differs from the SG extent.
    bool dim0Distributed =
        sourceDistType.getShape()[0] != sourceType.getShape()[0];
    bool dim1Distributed =
        sourceDistType.getShape()[1] != sourceType.getShape()[1];
    if (dim0Distributed && dim1Distributed)
      return rewriter.notifyMatchFailure(
          warpOp, "Expecting source to be distributed in a single dimension.");
    int64_t sourceDistDim = dim0Distributed ? 0 : (dim1Distributed ? 1 : -1);
    if (sourceDistDim == -1)
      return rewriter.notifyMatchFailure(
          warpOp, "Expecting a distributed source vector.");
    bool resultDistributed =
        distributedResultType.getNumElements() < resultType.getNumElements();
    // If the lane owns all the data required for reduction (i.e. reduction is
    // fully parallel across lanes), then each lane owns part of the result
    // (i.e. result is distributed). If the reduction requires cross-lane
    // shuffling, then the result is shared among all lanes (broadcasted).
    // Therefore we expect following cases:
    //
    // | Source vector        | Reduction dim  | Result vector  |
    // |----------------------|----------------|----------------|
    // | dim-0 distributed    | 0              | broadcasted    |
    // | dim-0 distributed    | 1              | distributed    |
    // | dim-1 distributed    | 0              | distributed    |
    // | dim-1 distributed    | 1              | broadcasted    |

    bool isReductionLaneLocal = (sourceDistDim == 0 && reductionDim == 1) ||
                                (sourceDistDim == 1 && reductionDim == 0);
    if (isReductionLaneLocal && !resultDistributed)
      return rewriter.notifyMatchFailure(
          warpOp, "Expecting a distributed result for lane-local reduction.");

    if (!isReductionLaneLocal && resultDistributed)
      return rewriter.notifyMatchFailure(
          warpOp,
          "Expecting a broadcasted result for non-lane-local reduction.");

    // Handle lane-local reduction case. In this case we fully distribute the
    // reduction result.
    if (isReductionLaneLocal) {
      // Yield the source and acc vectors from the WarpOp.
      SmallVector<size_t> newRetIndices;
      auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
          rewriter, warpOp, {reductionOp.getSource(), reductionOp.getAcc()},
          {sourceDistType, distributedResultType}, newRetIndices);
      rewriter.setInsertionPointAfter(newWarpOp);
      // Lower to per-row vector.reduction ops outside the warp op.
      Value result = xegpu::lowerToVectorReductions(
          cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[0])),
          cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[1])),
          reductionOp.getKind(), reductionDim, reductionOp.getLoc(), rewriter);
      // Replace the warp op result with the final result.
      rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), result);
      return success();
    }
    // For non-lane-local case, we simply rewrite the MultiReductionOp in terms
    // of multiple ReductionOps. Actual distribution is done by the
    // WarpOpReduction pattern.
    rewriter.setInsertionPointAfter(reductionOp);
    Value result = xegpu::lowerToVectorReductions(
        cast<TypedValue<VectorType>>(reductionOp.getSource()),
        cast<TypedValue<VectorType>>(reductionOp.getAcc()),
        reductionOp.getKind(), reductionDim, reductionOp.getLoc(), rewriter);
    // Replace the warp op result with the final result.
    rewriter.replaceAllUsesWith(reductionOp.getResult(), result);
    return success();
  }
};
| |
| /// This pattern distributes the `vector.broadcast` operation across lanes in a |
| /// warp. The pattern supports three use cases: |
| /// |
/// 1) Broadcast a low-rank vector to a high-rank vector: The low-rank input
///    vector must have a layout that is a slice of the result layout. If the
///    distributed source and target vector types are identical, this lowers to
///    a no-op; otherwise, it remains a broadcast but operates on distributed
///    vectors.
| /// |
| /// 2) Broadcast a same-rank vector with identical layouts for source and |
| /// target: |
| /// The source vector must have unit dimensions, and lane_data must be unit |
| /// size for those unit dims. This always lowers to a no-op. |
| /// |
| /// 3) Broadcast a scalar with no layout: This always lowers to a broadcast from |
| /// scalar to distributed result type. |
| /// |
| /// Example 1 (lowering to a broadcast with distributed types): |
| /// ``` |
| /// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<8x1xf32>) { |
| /// %0 = "some_def"() {layout_result_0 = |
| /// #xegpu.slice<#xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>, |
| /// dims = [0]> } : () -> (vector<32xf32>) |
| /// %2 = vector.broadcast %0 {layout_result_0 = |
| /// #xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>} |
| /// : vector<32xf32> to vector<8x32xf32> |
///   gpu.yield %2 : vector<8x32xf32>
| /// } |
| /// ``` |
| /// is lowered to: |
| /// ``` |
| /// %r:1 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) { |
| /// %0 = "some_def"() {layout_result_0 = |
| /// #xegpu.slice<#xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>, |
| /// dims = [0]> } : () -> (vector<32xf32>) |
| /// gpu.yield %0 : vector<32xf32> |
| /// } |
/// %2 = vector.broadcast %r#0 : vector<1xf32> to vector<8x1xf32>
/// ```
///
| /// Example 2 (no-op): |
| /// ``` |
| /// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<8x32xf32>) { |
| /// %0 = "some_def"() {layout_result_0 = |
| /// #xegpu.slice<#xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>, |
| /// dims = [1]> } : () -> (vector<8xf32>) |
| /// %1 = vector.shape_cast %0 |
| /// {layout_result_0 = #xegpu.layout<lane_layout = [1, 32], lane_data = [1, |
| /// 1]>}: vector<8xf32> to vector<8x1xf32> |
| /// %2 = vector.broadcast %1 |
| /// {layout_result_0 = #xegpu.layout<lane_layout = [1, 32], lane_data = [1, |
| /// 1]>}: vector<8x1xf32> to vector<8x32xf32> |
///   gpu.yield %2 : vector<8x32xf32>
| /// } |
| /// ``` |
| /// is lowered to: |
| /// ``` |
| /// %r:1 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<8x1xf32>) { |
| /// %0 = "some_def"() {layout_result_0 = |
| /// #xegpu.slice<#xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>, |
| /// dims = [1]> } : () -> (vector<8xf32>) |
| /// %1 = vector.shape_cast %0 |
| /// {layout_result_0 = #xegpu.layout<lane_layout = [1, 32], lane_data = [1, |
| /// 1]>}: vector<8xf32> to vector<8x1xf32> |
| /// gpu.yield %1 : vector<8x1xf32> |
| /// } |
| /// // The broadcast is implicit through layout transformation (no-op) |
| /// "some_use"(%r#0) |
| /// ``` |
| struct VectorBroadcastDistribution : public gpu::WarpDistributionPattern { |
| using gpu::WarpDistributionPattern::WarpDistributionPattern; |
| LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, |
| PatternRewriter &rewriter) const override { |
| OpOperand *yieldOperand = |
| getWarpResult(warpOp, llvm::IsaPred<vector::BroadcastOp>); |
| if (!yieldOperand) |
| return failure(); |
| auto broadcastOp = |
| cast<vector::BroadcastOp>(yieldOperand->get().getDefiningOp()); |
| unsigned operandIdx = yieldOperand->getOperandNumber(); |
| |
| VectorType sourceType = dyn_cast<VectorType>(broadcastOp.getSourceType()); |
| VectorType destType = |
| dyn_cast<VectorType>(broadcastOp.getResult().getType()); |
| |
| xegpu::DistributeLayoutAttr sourceLayout = |
| xegpu::getTemporaryLayout(broadcastOp->getOpOperand(0)); |
| xegpu::DistributeLayoutAttr resultLayout = |
| xegpu::getTemporaryLayout(dyn_cast<OpResult>(broadcastOp.getResult())); |
| |
| FailureOr<VectorType> sourceDistType; |
| Type sourceElemOrDistType; |
| if (sourceType) { |
| |
| // Case 1 and 2: source is a vector type. |
| int64_t rankDiff = destType.getRank() - sourceType.getRank(); |
| if (rankDiff > 0) { |
| // Case 1: source is lower-rank than result. |
| bool isSliceOf = sourceLayout.isSliceOf(resultLayout); |
| if (!isSliceOf) |
| broadcastOp.emitWarning() |
| << "Broadcast input layout must be a slice of result layout."; |
| } |
| // case 2: source and result have same rank |
| if (rankDiff == 0) { |
| auto broadcastUnitDimsSet = broadcastOp.computeBroadcastedUnitDims(); |
| SmallVector<int64_t> broadcastUnitDims(broadcastUnitDimsSet.begin(), |
| broadcastUnitDimsSet.end()); |
| bool isEqualTo = sourceLayout.isEqualTo(resultLayout); |
| if (!isEqualTo) |
| return rewriter.notifyMatchFailure( |
| warpOp, "For same-rank broadcast, source must be identical to " |
| "adjusted result layouts with unit dims."); |
| resultLayout = resultLayout.setUnitDimData(broadcastUnitDims); |
| sourceLayout = sourceLayout.setUnitDimLayout(broadcastUnitDims); |
| } |
| |
| sourceDistType = |
| getDistVecTypeBasedOnLaneLayout(sourceLayout, sourceType); |
| if (failed(sourceDistType)) { |
| return rewriter.notifyMatchFailure( |
| warpOp, "Failed to distribute the source vector type."); |
| } |
| sourceElemOrDistType = sourceDistType.value(); |
| |
| } else { |
| // Case 3: source is a scalar type. |
| if (sourceLayout) { |
| return rewriter.notifyMatchFailure( |
| warpOp, "Broadcast from scalar must not have a layout attribute."); |
| } |
| sourceElemOrDistType = broadcastOp.getSourceType(); |
| } |
| FailureOr<VectorType> destDistType = |
| getDistVecTypeBasedOnLaneLayout(resultLayout, destType); |
| if (failed(destDistType)) { |
| return rewriter.notifyMatchFailure( |
| warpOp, "Failed to distribute the dest vector type."); |
| } |
| |
| SmallVector<size_t> newRetIndices; |
| auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns( |
| rewriter, warpOp, {broadcastOp.getSource()}, sourceElemOrDistType, |
| newRetIndices); |
| |
| Value distributedSource = newWarpOp.getResult(newRetIndices[0]); |
| |
| Value newBroadcast = distributedSource; |
| |
| if (sourceElemOrDistType != destDistType.value()) { |
| rewriter.setInsertionPointAfter(newWarpOp); |
| newBroadcast = |
| vector::BroadcastOp::create(rewriter, newWarpOp.getLoc(), |
| destDistType.value(), distributedSource); |
| } |
| |
| rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), newBroadcast); |
| return success(); |
| } |
| }; |
| |
| /// Distribute a `vector.shape_cast` op feeding into yield op of an enclosing |
| /// `gpu.warp_execute_on_lane_0` region. |
| struct VectorShapeCastDistribution : public gpu::WarpDistributionPattern { |
| using gpu::WarpDistributionPattern::WarpDistributionPattern; |
| LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, |
| PatternRewriter &rewriter) const override { |
| OpOperand *yieldOperand = |
| getWarpResult(warpOp, llvm::IsaPred<vector::ShapeCastOp>); |
| if (!yieldOperand) |
| return failure(); |
| auto shapeCastOp = |
| cast<vector::ShapeCastOp>(yieldOperand->get().getDefiningOp()); |
| unsigned operandNumber = yieldOperand->getOperandNumber(); |
| auto resultDistTy = |
| cast<VectorType>(warpOp.getResult(operandNumber).getType()); |
| xegpu::DistributeLayoutAttr sourceLayout = |
| xegpu::getTemporaryLayout(shapeCastOp->getOpOperand(0)); |
| xegpu::DistributeLayoutAttr resultLayout = |
| xegpu::getTemporaryLayout(dyn_cast<OpResult>(shapeCastOp.getResult())); |
| if (!sourceLayout || !resultLayout) |
| return rewriter.notifyMatchFailure( |
| warpOp, |
| "the source or result of shape_cast op lacks distribution layout"); |
| |
| FailureOr<VectorType> sourceDistTypeOrFailure = |
| getDistVecTypeBasedOnLaneLayout(sourceLayout, |
| shapeCastOp.getSourceVectorType()); |
| if (failed(sourceDistTypeOrFailure)) |
| return rewriter.notifyMatchFailure( |
| warpOp, "failed to get distributed vector type for source"); |
| VectorType sourceDistType = sourceDistTypeOrFailure.value(); |
| // Create a new warp op that yields the source of the shape_cast op. |
| SmallVector<size_t> newRetIndices; |
| auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns( |
| rewriter, warpOp, {shapeCastOp.getSource()}, {sourceDistType}, |
| newRetIndices); |
| rewriter.setInsertionPointAfter(newWarpOp); |
| Value source = newWarpOp.getResult(newRetIndices[0]); |
| // Create a new shape_cast op outside the warp op. |
| Value newShapeCast = vector::ShapeCastOp::create( |
| rewriter, shapeCastOp.getLoc(), resultDistTy, source); |
| rewriter.replaceAllUsesWith(newWarpOp.getResult(operandNumber), |
| newShapeCast); |
| return success(); |
| } |
| }; |
| |
| // Distribute a `vector.extract_strided_slice` op feeding into yield op of an |
| // enclosing `gpu.warp_execute_on_lane_0` region. This pattern covers |
| // advanced cases where the distributed dimension is partially extracted and |
| // currently not supported by the generic vector distribution patterns. |
struct VectorExtractStridedSliceDistribution
    : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  /// Sink a `vector.extract_strided_slice` feeding the warp op yield out of
  /// the warp region. If the extracted dimension is distributed across lanes,
  /// the sizes/offsets are rescaled to lane-local coordinates first.
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *operand =
        getWarpResult(warpOp, llvm::IsaPred<vector::ExtractStridedSliceOp>);
    if (!operand)
      return failure();
    auto extractOp =
        cast<vector::ExtractStridedSliceOp>(operand->get().getDefiningOp());
    unsigned operandIdx = operand->getOperandNumber();
    auto distributedType =
        cast<VectorType>(warpOp.getResult(operandIdx).getType());
    // Find the distributed dimensions by comparing the SIMD-level result type
    // with the (already decided) distributed warp op result type.
    auto extractResultType = cast<VectorType>(operand->get().getType());
    auto distributedDims =
        getDistributedDims(extractResultType, distributedType);
    // Collect updated source type, sizes and offsets. They may be adjusted
    // later if the data is distributed to lanes (as opposed to being owned by
    // all lanes uniformly).
    VectorType updatedSourceType = extractOp.getSourceVectorType();
    SmallVector<Attribute> updatedSizes = llvm::map_to_vector(
        extractOp.getSizes(), [](Attribute attr) { return attr; });
    SmallVector<Attribute> updatedOffsets = llvm::map_to_vector(
        extractOp.getOffsets(), [](Attribute attr) { return attr; });
    SmallVector<Attribute> updatedStrides = llvm::map_to_vector(
        extractOp.getStrides(), [](Attribute attr) { return attr; });
    // If the provided sizes, offsets, strides are less than the rank, pad them
    // with full sizes, zero offsets, and unit strides. This makes it easier to
    // adjust them later.
    int64_t sourceRank = extractOp.getSourceVectorType().getRank();
    for (int64_t i = extractOp.getSizes().size(); i < sourceRank; ++i) {
      updatedSizes.push_back(rewriter.getI64IntegerAttr(
          extractOp.getSourceVectorType().getDimSize(i)));
      updatedOffsets.push_back(rewriter.getI64IntegerAttr(0));
      updatedStrides.push_back(
          rewriter.getI64IntegerAttr(1)); // stride is always 1.
    }
    // If the result is distributed, it must be distributed in exactly one
    // dimension. In this case, we adjust the sourceDistType, distributedSizes
    // and distributedOffsets accordingly.
    if (distributedDims.size() > 0) {
      if (distributedDims.size() != 1)
        return rewriter.notifyMatchFailure(
            warpOp, "Source can not be distributed in multiple dimensions.");
      int64_t distributedDim = distributedDims[0];
      int sourceDistrDimSize =
          extractOp.getSourceVectorType().getShape()[distributedDim];
      // The source layout drives how the slice coordinates are rescaled.
      auto sourceLayout = xegpu::getTemporaryLayout(extractOp->getOpOperand(0));
      if (!sourceLayout || sourceLayout.getEffectiveLaneLayoutAsInt().empty())
        return rewriter.notifyMatchFailure(
            warpOp, "the source of extract_strided_slice op lacks distribution "
                    "layout");
      auto sourceLaneLayout = sourceLayout.getEffectiveLaneLayoutAsInt();
      // Because only single dimension distribution is supported, lane layout
      // size at the distributed dim must be the subgroup size.
      int subgroupSize = sourceLaneLayout[distributedDim];
      // Check if the source size in the distributed dimension is a multiple of
      // subgroup size.
      if (sourceDistrDimSize % subgroupSize != 0)
        return rewriter.notifyMatchFailure(
            warpOp,
            "Source size along distributed dimension is not a multiple of "
            "subgroup size.");
      auto sourceLaneData = sourceLayout.getEffectiveLaneDataAsInt();
      // We expect lane data to be all ones in this case. This guarantees a
      // uniform round-robin distribution of elements to lanes.
      if (!llvm::all_of(sourceLaneData, [](int64_t v) { return v == 1; }))
        return rewriter.notifyMatchFailure(
            warpOp, "Expecting unit lane data in source layout");
      // The offsets in the distributed dimension must be a multiple of
      // subgroup size; otherwise the slice start is not lane-aligned.
      int64_t distrDimOffset =
          cast<IntegerAttr>(updatedOffsets[distributedDim]).getInt();
      if (distrDimOffset % subgroupSize != 0)
        return rewriter.notifyMatchFailure(
            warpOp, "Offset along distributed dimension "
                    "is not a multiple of subgroup size.");
      // Source type after distribution (checked valid above, hence .value()).
      updatedSourceType = getDistVecTypeBasedOnLaneLayout(
                              sourceLayout, extractOp.getSourceVectorType())
                              .value();
      // Update the distributed sizes to match the distributed type.
      updatedSizes[distributedDim] = rewriter.getI64IntegerAttr(
          distributedType.getDimSize(distributedDim));
      // Update the distributed offsets to match round robin distribution (i.e.
      // each lane owns data at `subgroupSize` stride given unit lane data).
      updatedOffsets[distributedDim] =
          rewriter.getI64IntegerAttr(distrDimOffset / subgroupSize);
    }
    // Do the distribution by yielding the source of the extract op from
    // the warp op and creating a new extract op outside the warp op.
    SmallVector<size_t> newRetIndices;
    auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, {extractOp.getSource()}, {updatedSourceType},
        newRetIndices);
    rewriter.setInsertionPointAfter(newWarpOp);
    Value source = newWarpOp.getResult(newRetIndices[0]);
    // Create a new extract op outside the warp op, using the (possibly
    // rescaled) offsets/sizes/strides on the distributed source.
    Value newExtractOp = vector::ExtractStridedSliceOp::create(
        rewriter, extractOp.getLoc(), distributedType, source,
        ArrayAttr::get(rewriter.getContext(), updatedOffsets),
        ArrayAttr::get(rewriter.getContext(), updatedSizes),
        ArrayAttr::get(rewriter.getContext(), updatedStrides));
    rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), newExtractOp);
    return success();
  }
};
| |
| /// Distribute a `vector.insert_strided_slice` op feeding into yield op of an |
| /// enclosing `gpu.warp_execute_on_lane_0` region. This pattern covers |
| /// advanced cases where the distributed dimension is partially inserted and |
| /// currently not supported by the generic vector distribution patterns. |
struct VectorInsertStridedSliceDistribution
    : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  /// Sink a `vector.insert_strided_slice` feeding the warp op yield out of the
  /// warp region. If the inserted dimension is distributed across lanes, both
  /// source and dest types plus the insertion offsets are rescaled to
  /// lane-local coordinates first.
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *operand = getWarpResult(warpOp, [&](Operation *op) {
      // Check if the InsertStridedSliceOp is the last op before yield op
      return llvm::IsaPred<vector::InsertStridedSliceOp>(op) &&
             warpOp.getTerminator()->getPrevNode() == op;
    });
    if (!operand)
      return failure();
    unsigned int operandNumber = operand->getOperandNumber();
    auto insertOp =
        operand->get().getDefiningOp<vector::InsertStridedSliceOp>();
    auto distributedType =
        cast<VectorType>(warpOp.getResult(operandNumber).getType());
    // Find the distributed dimensions of the dest vector.
    auto insertResultType = cast<VectorType>(operand->get().getType());
    auto destDistributedDims =
        getDistributedDims(insertResultType, distributedType);
    // Collect updated offsets, source type and dest type. They may be adjusted
    // later if the data is distributed to lanes (as opposed to being owned by
    // all lanes uniformly).
    SmallVector<Attribute> updatedOffsets = llvm::map_to_vector(
        insertOp.getOffsets(), [](Attribute attr) { return attr; });
    VectorType updatedSourceType = insertOp.getSourceVectorType();
    VectorType updatedDestType = insertOp.getDestVectorType();
    if (destDistributedDims.size() > 0) {
      // Only single dimension distribution is supported.
      if (destDistributedDims.size() != 1)
        return rewriter.notifyMatchFailure(
            warpOp,
            "Expecting source to be distributed in a single dimension.");
      int64_t destDistributedDim = destDistributedDims[0];

      VectorType srcType = insertOp.getSourceVectorType();
      VectorType destType = insertOp.getDestVectorType();
      // Currently we require that both source (kD) and dest (nD) vectors are
      // distributed. This requires that distributedDim (d) is contained in the
      // last k dims of the dest vector (d >= n - k).
      int64_t sourceDistributedDim =
          destDistributedDim - (destType.getRank() - srcType.getRank());
      if (sourceDistributedDim < 0)
        return rewriter.notifyMatchFailure(
            insertOp,
            "distributed dimension must be in the last k (i.e. source "
            "rank) dims of dest vector");
      int64_t srcDistrDimSize = srcType.getDimSize(sourceDistributedDim);
      // Obtain the source and dest layouts. Operand 0 is the value to store,
      // operand 1 is the destination vector.
      auto destLayout = xegpu::getTemporaryLayout(insertOp->getOpOperand(1));
      auto sourceLayout = xegpu::getTemporaryLayout(insertOp->getOpOperand(0));
      if (!destLayout || !sourceLayout ||
          destLayout.getEffectiveLaneLayoutAsInt().empty() ||
          sourceLayout.getEffectiveLaneLayoutAsInt().empty())
        return rewriter.notifyMatchFailure(
            warpOp, "the source or dest of insert_strided_slice op lacks "
                    "distribution layout");
      // Because only single dimension distribution is supported, lane layout
      // size at the distributed dim must be the subgroup size.
      int subgroupSize =
          destLayout.getEffectiveLaneLayoutAsInt()[destDistributedDim];
      // We require that source and dest lane data are all ones to ensure
      // uniform round robin distribution.
      auto destLaneData = destLayout.getEffectiveLaneDataAsInt();
      auto sourceLaneData = sourceLayout.getEffectiveLaneDataAsInt();
      if (!llvm::all_of(destLaneData, [](int64_t v) { return v == 1; }) ||
          !llvm::all_of(sourceLaneData, [](int64_t v) { return v == 1; }))
        return rewriter.notifyMatchFailure(
            warpOp, "Expecting unit lane data in source and dest layouts");
      // Source distributed dim size must be multiples of subgroup size.
      if (srcDistrDimSize % subgroupSize != 0)
        return rewriter.notifyMatchFailure(
            warpOp, "Distributed dimension size in source is not a multiple of "
                    "subgroup size.");
      // Offsets in the distributed dimension must be multiples of subgroup
      // size; otherwise the insertion start is not lane-aligned.
      int64_t destDistrDimOffset =
          cast<IntegerAttr>(insertOp.getOffsets()[destDistributedDim]).getInt();
      if (destDistrDimOffset % subgroupSize != 0)
        return rewriter.notifyMatchFailure(
            warpOp,
            "Offset along distributed dimension in dest is not a multiple of "
            "subgroup size.");
      // Update the source and dest types based on their layouts (validity
      // checked above, hence .value()).
      updatedSourceType = getDistVecTypeBasedOnLaneLayout(
                              sourceLayout, insertOp.getSourceVectorType())
                              .value();
      updatedDestType = getDistVecTypeBasedOnLaneLayout(
                            destLayout, insertOp.getDestVectorType())
                            .value();
      // Update the distributed offsets to match round robin distribution (i.e.
      // each lane owns data at `subgroupSize` stride given unit lane data).
      updatedOffsets[destDistributedDim] =
          rewriter.getI64IntegerAttr(destDistrDimOffset / subgroupSize);
    }
    // Do the distribution by yielding the source and dest of the insert op
    // from the warp op and creating a new insert op outside the warp op.
    SmallVector<size_t> newRetIndices;
    auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, {insertOp.getValueToStore(), insertOp.getDest()},
        {updatedSourceType, updatedDestType}, newRetIndices);
    rewriter.setInsertionPointAfter(newWarpOp);

    Value valueToStore = newWarpOp.getResult(newRetIndices[0]);
    Value dest = newWarpOp.getResult(newRetIndices[1]);
    // Create a new insert op outside the warp op. Strides are unchanged.
    Value newInsertOp = vector::InsertStridedSliceOp::create(
        rewriter, insertOp.getLoc(), updatedDestType, valueToStore, dest,
        ArrayAttr::get(rewriter.getContext(), updatedOffsets),
        insertOp.getStrides());
    rewriter.replaceAllUsesWith(newWarpOp.getResult(operandNumber),
                                newInsertOp);
    return success();
  }
};
| |
| /// Sink a memref::ExtractAlignedPointerAsIndex op feeding into yield op of an |
| /// enclosing `gpu.warp_execute_on_lane_0` region. This will simply move the op |
| /// outside of the warp op. |
| struct MemrefExtractAlignedPointerAsIndexDistribution final |
| : public gpu::WarpDistributionPattern { |
| using gpu::WarpDistributionPattern::WarpDistributionPattern; |
| LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, |
| PatternRewriter &rewriter) const override { |
| OpOperand *operand = getWarpResult( |
| warpOp, llvm::IsaPred<memref::ExtractAlignedPointerAsIndexOp>); |
| if (!operand) |
| return rewriter.notifyMatchFailure( |
| warpOp, |
| "warp result is not a memref::MemrefExtractAlignedPointerAsIndex op"); |
| auto extractOp = |
| operand->get().getDefiningOp<memref::ExtractAlignedPointerAsIndexOp>(); |
| unsigned operandIdx = operand->getOperandNumber(); |
| SmallVector<size_t> newRetIndices; |
| gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( |
| rewriter, warpOp, extractOp.getSource(), |
| TypeRange{extractOp.getSource().getType()}, newRetIndices); |
| rewriter.setInsertionPointAfter(newWarpOp); |
| auto newExtractOp = memref::ExtractAlignedPointerAsIndexOp::create( |
| rewriter, newWarpOp.getLoc(), extractOp.getType(), |
| newWarpOp.getResult(newRetIndices[0])); |
| Value resultVal = newWarpOp.getResult(operandIdx); |
| rewriter.replaceAllUsesWith(resultVal, newExtractOp.getResult()); |
| return success(); |
| } |
| }; |
| |
| /// Distribute a vector::BitCastOp feeding into yield op of an enclosing |
| /// `gpu.warp_execute_on_lane_0` region. Bitcast only impacts the innermost |
/// dimension of the source/result vectors. An equivalent vector::BitCastOp is
| /// created outside of the warp op with distributed source vector type (computed |
| /// using assigned layout). |
| struct VectorBitcastDistribution final : public gpu::WarpDistributionPattern { |
| using gpu::WarpDistributionPattern::WarpDistributionPattern; |
| LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, |
| PatternRewriter &rewriter) const override { |
| OpOperand *operand = |
| getWarpResult(warpOp, llvm::IsaPred<vector::BitCastOp>); |
| if (!operand) |
| return rewriter.notifyMatchFailure( |
| warpOp, "warp result is not a vector::BitCast op"); |
| auto bitcastOp = operand->get().getDefiningOp<vector::BitCastOp>(); |
| unsigned operandIdx = operand->getOperandNumber(); |
| VectorType distributedSourceType = |
| getDistVecTypeBasedOnLaneLayout( |
| xegpu::getTemporaryLayout(bitcastOp->getOpOperand(0)), |
| bitcastOp.getSourceVectorType()) |
| .value_or(VectorType()); |
| if (!distributedSourceType) |
| return rewriter.notifyMatchFailure( |
| bitcastOp, "Failed to distribute the source vector type in " |
| "vector::BitCast op"); |
| VectorType distributedResultType = |
| cast<VectorType>(warpOp.getResult(operandIdx).getType()); |
| SmallVector<size_t> newRetIndices; |
| gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( |
| rewriter, warpOp, bitcastOp.getSource(), |
| TypeRange{distributedSourceType}, newRetIndices); |
| rewriter.setInsertionPointAfter(newWarpOp); |
| auto newBitcastOp = vector::BitCastOp::create( |
| rewriter, newWarpOp.getLoc(), distributedResultType, |
| newWarpOp.getResult(newRetIndices[0])); |
| Value distributedVal = newWarpOp.getResult(operandIdx); |
| rewriter.replaceAllUsesWith(distributedVal, newBitcastOp.getResult()); |
| return success(); |
| } |
| }; |
| |
| /// Distribute a vector::TransposeOp feeding into yield op of an enclosing |
| /// `gpu.warp_execute_on_lane_0` region. Currently only 2D transposes are |
| /// supported. In most cases, transpose is a no op because it is entirely |
| /// handled using the layouts (e.g. 16x1 -> 1x16). However, if each lane owns |
| /// multiple slices of data after distribution (e.g. 16x2 -> 2x16), a lane-local |
| /// transpose (i.e. shuffle) is needed. Therefore, we create an equivalent |
| /// vector::TransposeOp outside of the warp op with distributed source vector |
| /// type (computed using assigned layout). |
| struct VectorTransposeDistribution final : public gpu::WarpDistributionPattern { |
| using gpu::WarpDistributionPattern::WarpDistributionPattern; |
| LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, |
| PatternRewriter &rewriter) const override { |
| OpOperand *operand = |
| getWarpResult(warpOp, llvm::IsaPred<vector::TransposeOp>); |
| if (!operand) |
| return rewriter.notifyMatchFailure( |
| warpOp, "warp result is not a vector::Transpose op"); |
| auto transposeOp = operand->get().getDefiningOp<vector::TransposeOp>(); |
| unsigned operandIdx = operand->getOperandNumber(); |
| xegpu::DistributeLayoutAttr sourceLayout = |
| xegpu::getTemporaryLayout(transposeOp->getOpOperand(0)); |
| xegpu::DistributeLayoutAttr resultLayout = |
| xegpu::getTemporaryLayout(transposeOp->getOpResult(0)); |
| if (!sourceLayout || !resultLayout) |
| return rewriter.notifyMatchFailure( |
| transposeOp, |
| "the source or result vector of the transpose op lacks layout " |
| "attribute"); |
| int64_t sourceRank = transposeOp.getSourceVectorType().getRank(); |
| int64_t resultRank = transposeOp.getResultVectorType().getRank(); |
| // Only 2D transposes are supported for now. |
| // TODO: Support nD transposes. |
| if (sourceRank != 2 || resultRank != 2) |
| return rewriter.notifyMatchFailure( |
| transposeOp, "the source or result vector of the transpose op " |
| "does not have 2D layout"); |
| ArrayRef<int64_t> perm = transposeOp.getPermutation(); |
| // Result layout must be a transpose of source layout. |
| if (!resultLayout.isTransposeOf(sourceLayout, perm, |
| xegpu::LayoutKind::Lane)) |
| return rewriter.notifyMatchFailure( |
| transposeOp, |
| "the source or result vector layouts must be 2D transposes of each " |
| "other"); |
| FailureOr<VectorType> distributedSourceTypeOrFailure = |
| getDistVecTypeBasedOnLaneLayout(sourceLayout, |
| transposeOp.getSourceVectorType()); |
| if (failed(distributedSourceTypeOrFailure)) |
| return rewriter.notifyMatchFailure( |
| transposeOp, "Failed to distribute the source vector type in " |
| "vector::Transpose op"); |
| SmallVector<size_t> newRetIndices; |
| gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( |
| rewriter, warpOp, transposeOp.getVector(), |
| TypeRange{distributedSourceTypeOrFailure.value()}, newRetIndices); |
| rewriter.setInsertionPointAfter(newWarpOp); |
| auto newTransposeOp = vector::TransposeOp::create( |
| rewriter, newWarpOp.getLoc(), newWarpOp.getResult(newRetIndices[0]), |
| perm); |
| Value distributedVal = newWarpOp.getResult(operandIdx); |
| rewriter.replaceAllUsesWith(distributedVal, newTransposeOp.getResult()); |
| return success(); |
| } |
| }; |
| |
| /// Distribute a vector::StepOp with the sliced result layout. |
| /// The sliced layout must have exactly 1 effective lane dimension. |
| /// We completely resolve the vector::StepOp by computing the lane_data-sized |
| /// subranges. |
struct VectorStepSliceDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  /// Fully resolve a `vector.step` whose result carries a slice layout with a
  /// single effective lane dimension: each lane's values are reconstructed
  /// from its lane id and assembled with `vector.from_elements`.
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *operand = getWarpResult(warpOp, llvm::IsaPred<vector::StepOp>);
    if (!operand)
      return rewriter.notifyMatchFailure(
          warpOp, "warp result is not a vector::StepOp op");
    auto stepOp = operand->get().getDefiningOp<vector::StepOp>();
    unsigned operandIdx = operand->getOperandNumber();
    xegpu::DistributeLayoutAttr resultLayout =
        xegpu::getTemporaryLayout(stepOp->getResult(0));
    if (!resultLayout)
      return rewriter.notifyMatchFailure(
          stepOp, "the result vector of the step op lacks layout "
                  "attribute");
    // This pattern only handles slice layouts (see pattern documentation).
    auto sliceLayout = dyn_cast<xegpu::SliceAttr>(resultLayout);
    if (!sliceLayout)
      return rewriter.notifyMatchFailure(
          stepOp, "the result layout must be a slice layout");
    if (sliceLayout.getEffectiveLaneLayoutAsInt().size() != 1)
      return rewriter.notifyMatchFailure(
          stepOp, "expecting 1 dim in the effective result layout");

    rewriter.setInsertionPointAfter(warpOp);
    auto loc = stepOp.getLoc();
    auto stepResultVecTy = stepOp.getResult().getType();
    Value distributedVal = warpOp.getResult(operandIdx);
    // Distributed (per-lane) result type, already fixed by the warp op.
    VectorType newVecTy = cast<VectorType>(distributedVal.getType());

    // Starting coordinates of each lane_data block owned by this lane within
    // the SG-level vector.
    auto laneDataBlockCoords = resultLayout.computeDistributedCoords(
        rewriter, loc, warpOp.getLaneid(), stepResultVecTy.getShape());
    if (failed(laneDataBlockCoords))
      return rewriter.notifyMatchFailure(
          stepOp, "failed to compute lane data block coordinates");

    auto laneDataBlockCoordsVec = laneDataBlockCoords.value();
    // Number of consecutive elements per lane_data block.
    auto laneDataBlockLength = resultLayout.getEffectiveLaneDataAsInt()[0];
    assert(static_cast<int64_t>(laneDataBlockCoordsVec.size()) ==
           newVecTy.getNumElements() / laneDataBlockLength);
    SmallVector<Value> stepVals;
    // For each lane_data block, reconstruct its sub-range
    // from the range of SG-level vector.step. Example: vector.step
    // {slice<layout<lane_layout=[2,4,2], lane_data=[1,2,1]>, dims=[0,2]>} :
    // vector<16xindex>
    // Each logical lane holds 4 elements as 2 blocks of 2 elements each.
    // The blocks are round-robin distributed, so logical lane id 0
    // holds values [0,1, 8,9].
    // Note: the loop variable intentionally shadows the FailureOr above.
    for (auto &laneDataBlockCoords : laneDataBlockCoordsVec) {
      // Block start value; subsequent elements are start + 1, start + 2, ...
      auto laneDataBlockStartCoord = laneDataBlockCoords[0];
      stepVals.push_back(laneDataBlockStartCoord);
      for (int i = 1; i < laneDataBlockLength; ++i) {
        auto offset = arith::ConstantIndexOp::create(rewriter, loc, i);
        stepVals.push_back(arith::AddIOp::create(
            rewriter, loc, laneDataBlockStartCoord, offset));
      }
    }
    assert(static_cast<int64_t>(stepVals.size()) == newVecTy.getNumElements() &&
           "Expecting the number of step values to match the number of "
           "elements in the vector");
    // Assemble the per-lane values into the distributed result vector.
    auto stepOpVal =
        vector::FromElementsOp::create(rewriter, loc, newVecTy, stepVals);
    rewriter.replaceAllUsesWith(distributedVal, stepOpVal);
    return success();
  }
};
| |
| struct ConvertLayoutDistribution |
| : public OpRewritePattern<xegpu::ConvertLayoutOp> { |
| using OpRewritePattern::OpRewritePattern; |
| |
| LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op, |
| PatternRewriter &rewriter) const override { |
| auto inputLayout = op.getInputLayoutAttr(); |
| auto targetLayout = op.getTargetLayoutAttr(); |
| |
| if (!inputLayout || !targetLayout) |
| return rewriter.notifyMatchFailure(op, "missing layout attributes"); |
| |
| if (!inputLayout.isCompatibleWith(targetLayout, xegpu::LayoutKind::Lane)) { |
| return rewriter.notifyMatchFailure( |
| op, "lowering incompatible convert_layout not yet supported"); |
| } |
| rewriter.replaceOp(op, op.getSource()); |
| return success(); |
| } |
| }; |
| |
| } // namespace |
| |
| namespace { |
/// Pass that distributes XeGPU subgroup-level operations to individual work
/// items (lanes). The full pipeline (layout recovery, moving function bodies
/// into gpu.warp_execute_on_lane_0, distribution patterns, and cast cleanup)
/// is implemented in runOnOperation below.
struct XeGPUSubgroupDistributePass final
    : public xegpu::impl::XeGPUSubgroupDistributeBase<
          XeGPUSubgroupDistributePass> {
  void runOnOperation() override;
};
| } // namespace |
| |
| void xegpu::populateXeGPUSubgroupDistributePatterns( |
| RewritePatternSet &patterns) { |
| patterns.add<CreateNdDescDistribution, StoreNdDistribution, |
| LoadNdDistribution, DpasDistribution, PrefetchNdDistribution, |
| GpuBarrierDistribution, VectorMultiReductionDistribution, |
| LoadDistribution, StoreDistribution, VectorTransposeDistribution, |
| VectorBitcastDistribution, LoadMatrixDistribution, |
| StoreMatrixDistribution, ConvertLayoutDistribution, |
| MemrefExtractAlignedPointerAsIndexDistribution>( |
| patterns.getContext(), |
| /*pattern benefit=*/PatternHierarchy::Regular); |
| // For following patterns, we need to override the regular vector distribution |
| // patterns. Therefore, assign higher benefit. |
| patterns |
| .add<VectorShapeCastDistribution, VectorExtractStridedSliceDistribution, |
| VectorInsertStridedSliceDistribution, VectorBroadcastDistribution, |
| VectorStepSliceDistribution, SinkUniformOps>( |
| patterns.getContext(), |
| /*pattern benefit=*/PatternHierarchy::AboveRegular); |
| } |
| |
| void xegpu::populateXeGPUMoveFuncBodyToWarpOpPatterns( |
| RewritePatternSet &patterns) { |
| patterns.add<MoveFuncBodyToWarpOp>(patterns.getContext()); |
| } |
| |
/// Drives the subgroup-to-workitem distribution in four steps:
///   1. Recover/attach lane layouts to op operands.
///   2. Move each GPU function body into a gpu.warp_execute_on_lane_0 op and
///      hoist scalar uniform code out of it.
///   3. Apply the XeGPU and upstream vector distribution patterns.
///   4. Clean up the UnrealizedConversionCastOps inserted to resolve SIMT
///      tensor-descriptor type mismatches (only if distribution fully
///      succeeded).
void XeGPUSubgroupDistributePass::runOnOperation() {
  // Step 1: Attach layouts to op operands.
  // TODO: Following assumptions are made:
  // 1) It is assumed that there are no layout conflicts.
  // 2) Any existing layout attributes attached to the operands are ignored.
  Operation *op = getOperation();
  if (!xegpu::recoverTemporaryLayouts(op)) {
    signalPassFailure();
    return;
  }

  // Step 2: Move all operations of a GPU function inside
  // gpu.warp_execute_on_lane_0 operation.
  {
    RewritePatternSet patterns(&getContext());
    xegpu::populateXeGPUMoveFuncBodyToWarpOpPatterns(patterns);

    if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
      signalPassFailure();
      return;
    }
    // At this point, we have moved the entire function body inside the
    // warpOp. Now move any scalar uniform code outside of the warpOp (like
    // GPU index ops, scalar constants, etc.). This will simplify the
    // later lowering and avoid custom patterns for these ops.
    getOperation()->walk([&](Operation *op) {
      if (auto warpOp = dyn_cast<gpu::WarpExecuteOnLane0Op>(op))
        vector::moveScalarUniformCode(warpOp);
    });
  }
  // Step 3: Apply subgroup to workitem distribution patterns.
  RewritePatternSet patterns(&getContext());
  xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
  // distributionFn is used by vector distribution patterns to determine the
  // distributed vector type for a given vector value. In XeGPU subgroup
  // distribution context, we compute this based on lane layout.
  auto distributionFn = [](Value val) {
    VectorType vecType = dyn_cast<VectorType>(val.getType());
    int64_t vecRank = vecType ? vecType.getRank() : 0;
    // Non-vector (rank-0) values get an empty map, i.e. no distribution.
    if (vecRank == 0)
      return AffineMap::get(val.getContext());
    // Get the layout of the vector type.
    xegpu::DistributeLayoutAttr layout = xegpu::getDistributeLayoutAttr(val);
    // If no layout is specified, assume uniform case (no distribution).
    if (!layout)
      return AffineMap::get(val.getContext());
    // Expecting vector and layout rank to match.
    assert(layout.getRank() == vecRank &&
           "Expecting vector and layout rank to match");
    // A dimension is distributed only if layout suggests there are
    // multiple lanes assigned for this dimension and the shape can be evenly
    // distributed to those lanes.
    SmallVector<unsigned int> distributedDims;
    for (auto [i, v] : llvm::enumerate(layout.getEffectiveLaneLayoutAsInt())) {
      if (v > 1 && vecType.getShape()[i] % v == 0)
        distributedDims.push_back(i);
    }
    return AffineMap::getMultiDimMapWithTargets(vecRank, distributedDims,
                                                val.getContext());
  };
  // TODO: shuffleFn is not used. It returns a null Value so that any pattern
  // that would rely on cross-lane shuffles fails to apply instead of
  // generating incorrect code.
  auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value srcIdx,
                      int64_t warpSz) { return Value(); };

  vector::populateDistributeReduction(
      patterns, xegpu::subgroupReduction,
      /*pattern benefit=*/PatternHierarchy::Regular);

  vector::populatePropagateWarpVectorDistributionPatterns(
      patterns, distributionFn, shuffleFn,
      /*pattern benefit=*/PatternHierarchy::Regular);
  if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
    signalPassFailure();
    return;
  }

  // Step 4: Finally, clean up UnrealizedConversionCastOps that were inserted
  // due to tensor desc type mismatches created by using upstream distribution
  // patterns (scf.for). This cleanup should only be done if all the ops are
  // distributed successfully; if some ops are still not distributed and remain
  // inside any WarpExecuteOnLane0Op we avoid this simplification step to avoid
  // breaking the IR.
  bool foundWarpOp = false;
  getOperation()->walk([&](gpu::WarpExecuteOnLane0Op warpOp) {
    // Look for WarpOps that are not trivially dead.
    if (isOpTriviallyDead(warpOp))
      return WalkResult::advance();
    foundWarpOp = true;
    return WalkResult::interrupt();
  });
  // A live WarpOp means distribution was incomplete; skip cleanup entirely.
  if (foundWarpOp)
    return;

  getOperation()->walk([&](mlir::UnrealizedConversionCastOp op) {
    // We are only interested in UnrealizedConversionCastOps that were added
    // for resolving SIMT type mismatches.
    if (!op->getAttr(resolveSIMTTypeMismatch))
      return WalkResult::skip();

    Value input = op.getOperand(0);
    Value output = op.getResult(0);

    // Both input and output must have tensor descriptor types.
    xegpu::TensorDescType inputDescType =
        mlir::dyn_cast<xegpu::TensorDescType>(input.getType());
    xegpu::TensorDescType outputDescType =
        mlir::dyn_cast<xegpu::TensorDescType>(output.getType());
    assert(inputDescType && outputDescType &&
           "Unrealized conversion cast must have tensor descriptor types");

    // tensor_desc<shape, layout> -> tensor_desc<shape> Type of conversions.
    // This occurs inside scf.for body to resolve the block argument type to
    // SIMT type. Retype the block argument (and the tied loop result) to the
    // layout-free SIMT type, then route uses of the cast result to it.
    if (inputDescType.getLayout()) {
      auto argument = mlir::dyn_cast<mlir::BlockArgument>(input);
      if (argument) {
        argument.setType(output.getType());
        output.replaceAllUsesWith(argument);
        if (auto loopOp = mlir::dyn_cast<mlir::LoopLikeOpInterface>(
                argument.getOwner()->getParentOp())) {
          auto result = loopOp.getTiedLoopResult(argument);
          result.setType(output.getType());
        }
      }
    }

    // tensor_desc<shape> -> tensor_desc<shape, layout> Type of
    // conversions. This occurs at the yield op of scf.for body to go back
    // from SIMT type to original type.
    if (outputDescType.getLayout())
      output.replaceAllUsesWith(input);

    // Erase the cast only once it is fully disconnected; any cast that still
    // has uses is left in place to keep the IR valid.
    if (op->use_empty())
      op->erase();
    return WalkResult::advance();
  });
}