Revert "[MLIR][XeGPU] Scattered ops sg-to-wi distribution" (#156761)
Reverts llvm/llvm-project#154949 due to suspected buildbot breakage
(https://lab.llvm.org/buildbot/#/builders/55/builds/16630/steps/11/logs/stdio).
This failure was previously reported in a comment on the original pull request:
https://github.com/llvm/llvm-project/pull/154949#issuecomment-3250709417
```
******************** TEST 'MLIR :: Dialect/XeGPU/subgroup-distribute.mlir' FAILED ********************
...
# | PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace.
# | Stack dump:
# | 0. Program arguments: /home/b/sanitizer-aarch64-linux-bootstrap-hwasan/build/llvm_build_hwasan/bin/mlir-opt -xegpu-subgroup-distribute -allow-unregistered-dialect -canonicalize -cse -split-input-file /home/b/sanitizer-aarch64-linux-bootstrap-hwasan/build/llvm-project/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
# | #0 0x0000c0af4b066df0 llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) /home/b/sanitizer-aarch64-linux-bootstrap-hwasan/build/llvm-project/llvm/lib/Support/Unix/Signals.inc:834:13
# | #1 0x0000c0af4b060e20 llvm::sys::RunSignalHandlers() /home/b/sanitizer-aarch64-linux-bootstrap-hwasan/build/llvm-project/llvm/lib/Support/Signals.cpp:105:18
# | #2 0x0000c0af4b0691b4 SignalHandler(int, siginfo_t*, void*) /home/b/sanitizer-aarch64-linux-bootstrap-hwasan/build/llvm-project/llvm/lib/Support/Unix/Signals.inc:426:38
# | #3 0x0000ee25a3dcb8f8 (linux-vdso.so.1+0x8f8)
# | #4 0x0000ee25a36c7608 (/lib/aarch64-linux-gnu/libc.so.6+0x87608)
# | #5 0x0000ee25a367cb3c raise (/lib/aarch64-linux-gnu/libc.so.6+0x3cb3c)
# | #6 0x0000ee25a3667e00 abort (/lib/aarch64-linux-gnu/libc.so.6+0x27e00)
# | #7 0x0000c0af4ae7e4b0 __sanitizer::Atexit(void (*)()) /home/b/sanitizer-aarch64-linux-bootstrap-hwasan/build/llvm-project/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp:168:10
# | #8 0x0000c0af4ae7c354 __sanitizer::Die() /home/b/sanitizer-aarch64-linux-bootstrap-hwasan/build/llvm-project/compiler-rt/lib/sanitizer_common/sanitizer_termination.cpp:52:5
# | #9 0x0000c0af4ae66a30 Unlock /home/b/sanitizer-aarch64-linux-bootstrap-hwasan/build/llvm-project/compiler-rt/lib/hwasan/../sanitizer_common/sanitizer_mutex.h:250:16
# | #10 0x0000c0af4ae66a30 ~GenericScopedLock /home/b/sanitizer-aarch64-linux-bootstrap-hwasan/build/llvm-project/compiler-rt/lib/hwasan/../sanitizer_common/sanitizer_mutex.h:386:51
# | #11 0x0000c0af4ae66a30 __hwasan::ScopedReport::~ScopedReport() /home/b/sanitizer-aarch64-linux-bootstrap-hwasan/build/llvm-project/compiler-rt/lib/hwasan/hwasan_report.cpp:54:5
# | #12 0x0000c0af4ae661b8 __hwasan::(anonymous namespace)::BaseReport::~BaseReport() /home/b/sanitizer-aarch64-linux-bootstrap-hwasan/build/llvm-project/compiler-rt/lib/hwasan/hwasan_report.cpp:477:7
# | #13 0x0000c0af4ae63f5c __hwasan::ReportTagMismatch(__sanitizer::StackTrace*, unsigned long, unsigned long, bool, bool, unsigned long*) /home/b/sanitizer-aarch64-linux-bootstrap-hwasan/build/llvm-project/compiler-rt/lib/hwasan/hwasan_report.cpp:1094:1
# | #14 0x0000c0af4ae4f8e0 Destroy /home/b/sanitizer-aarch64-linux-bootstrap-hwasan/build/llvm-project/compiler-rt/lib/hwasan/../sanitizer_common/sanitizer_common.h:532:31
# | #15 0x0000c0af4ae4f8e0 ~InternalMmapVector /home/b/sanitizer-aarch64-linux-bootstrap-hwasan/build/llvm-project/compiler-rt/lib/hwasan/../sanitizer_common/sanitizer_common.h:642:56
# | #16 0x0000c0af4ae4f8e0 __hwasan::HandleTagMismatch(__hwasan::AccessInfo, unsigned long, unsigned long, void*, unsigned long*) /home/b/sanitizer-aarch64-linux-bootstrap-hwasan/build/llvm-project/compiler-rt/lib/hwasan/hwasan.cpp:245:1
# | #17 0x0000c0af4ae51e8c __hwasan_tag_mismatch4 /home/b/sanitizer-aarch64-linux-bootstrap-hwasan/build/llvm-project/compiler-rt/lib/hwasan/hwasan.cpp:764:1
# | #18 0x0000c0af4ae67b30 __interception::InterceptFunction(char const*, unsigned long*, unsigned long, unsigned long) /home/b/sanitizer-aarch64-linux-bootstrap-hwasan/build/llvm-project/compiler-rt/lib/interception/interception_linux.cpp:60:0
# | #19 0x0000c0af5641cd24 getNumResults /home/b/sanitizer-aarch64-linux-bootstrap-hwasan/build/llvm-project/mlir/include/mlir/IR/Operation.h:404:37
# | #20 0x0000c0af5641cd24 getOpResultImpl /home/b/sanitizer-aarch64-linux-bootstrap-hwasan/build/llvm-project/mlir/include/mlir/IR/Operation.h:1010:5
# | #21 0x0000c0af5641cd24 getResult /home/b/sanitizer-aarch64-linux-bootstrap-hwasan/build/llvm-project/mlir/include/mlir/IR/Operation.h:407:54
# | #22 0x0000c0af5641cd24 mlir::OpTrait::detail::MultiResultTraitBase<mlir::gpu::WarpExecuteOnLane0Op, mlir::OpTrait::VariadicResults>::getResult(unsigned int) /home/b/sanitizer-aarch64-linux-bootstrap-hwasan/build/llvm-project/mlir/include/mlir/IR/OpDefinition.h:638:62
# | #23 0x0000c0af56426b60 getType /home/b/sanitizer-aarch64-linux-bootstrap-hwasan/build/llvm-project/mlir/include/mlir/IR/Value.h:63:33
# | #24 0x0000c0af56426b60 getType /home/b/sanitizer-aarch64-linux-bootstrap-hwasan/build/llvm-project/mlir/include/mlir/IR/Value.h:105:39
# | #25 0x0000c0af56426b60 (anonymous namespace)::LoadDistribution::matchAndRewrite(mlir::gpu::WarpExecuteOnLane0Op, mlir::PatternRewriter&) const /home/b/sanitizer-aarch64-linux-bootstrap-hwasan/build/llvm-project/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp:991:55
...
```
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index c0c4394..5cb47b2 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -194,8 +194,7 @@
}
/// Helper to get the default layout for a vector type.
-static LayoutInfo getDefaultSIMTLayoutInfo(VectorType vectorTy,
- bool isScattered = false) {
+static LayoutInfo getDefaultSIMTLayoutInfo(VectorType vectorTy) {
// Expecting a 1D or 2D vector.
assert((vectorTy.getRank() == 1 || vectorTy.getRank() == 2) &&
"Expected 1D or 2D vector.");
@@ -208,14 +207,6 @@
// Packing factor is determined by the element type bitwidth.
int packingFactor = 1;
unsigned bitwidth = vectorTy.getElementType().getIntOrFloatBitWidth();
- if (isScattered) {
- packingFactor =
- bitwidth < xegpu::targetinfo::packedSizeInBitsForGatherScatter
- ? xegpu::targetinfo::packedSizeInBitsForGatherScatter / bitwidth
- : 1;
- return LayoutInfo(LaneLayout({xegpu::targetinfo::subgroupSize, 1}),
- LaneData({1, packingFactor}));
- }
if (bitwidth < xegpu::targetinfo::packedSizeInBitsForDefault)
packingFactor = xegpu::targetinfo::packedSizeInBitsForDefault / bitwidth;
return LayoutInfo(LaneLayout({1, xegpu::targetinfo::subgroupSize}),
@@ -223,8 +214,7 @@
}
/// Helper to get the default layout for a vector type.
-static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy,
- bool isScattered = false) {
+static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy) {
// Expecting a 1D or 2D vector.
assert((tdescTy.getRank() == 1 || tdescTy.getRank() == 2) &&
"Expected 1D or 2D TensorDesc.");
@@ -237,7 +227,7 @@
// Packing factor is determined by the element type bitwidth.
unsigned bitwidth = tdescTy.getElementType().getIntOrFloatBitWidth();
- if (isScattered) {
+ if (tdescTy.isScattered()) {
int packingFactor =
bitwidth < xegpu::targetinfo::packedSizeInBitsForGatherScatter
? xegpu::targetinfo::packedSizeInBitsForGatherScatter / bitwidth
@@ -551,29 +541,21 @@
propagateIfChanged(operands[0], operands[0]->meet(resultLayout));
}
-/// Propagate the layout of the result to the tensor descriptor, mask and offset
+/// Propagate the layout of the result to the tensor descriptor and mask
/// operands in LoadGatherOp.
void LayoutInfoPropagation::visitLoadGatherOp(
xegpu::LoadGatherOp load, ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results) {
- // The layout is strictly determined by the payload type.
- auto payloadTy = dyn_cast<VectorType>(load.getValueType());
- if (!payloadTy) {
- load.emitWarning("Not propagating, non-vector payload supplied.");
- return;
- }
- LayoutInfo layout = getDefaultSIMTLayoutInfo(payloadTy, /*scattered*/ true);
+ // The layout is strictly determined by the tensor descriptor type.
+ LayoutInfo layout = getDefaultSIMTLayoutInfo(load.getTensorDescType());
// Mask operand should have 1D default layout.
LayoutInfo maskLayout = getDefaultSIMTLayoutInfo(1);
// Propagate the new layout to the tensor descriptor operand.
- if (isa<xegpu::TensorDescType>(load.getSourceType()))
- propagateIfChanged(operands[0], operands[0]->meet(layout));
- // Propagate the new layout to the mask and optional offset operand.
+ propagateIfChanged(operands[0], operands[0]->meet(layout));
+ // Propagate the new layout to the mask operand.
propagateIfChanged(operands[1], operands[1]->meet(maskLayout));
- if (load.getOffsets())
- propagateIfChanged(operands[2], operands[2]->meet(maskLayout));
}
/// Propagate the layout of the descriptor to the vector offset operand in
@@ -590,39 +572,31 @@
propagateIfChanged(operands[1], operands[1]->meet(layout));
}
-/// Set the layout for the value, tensor descriptor, offset and mask operands in
-/// the StoreScatterOp.
+/// Set the layout for the value, tensor descriptor, and mask operands in the
+/// StoreScatterOp.
void LayoutInfoPropagation::visitStoreScatterOp(
xegpu::StoreScatterOp storeScatter, ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results) {
// Currently, for 2D StoreScatterOp we expect that the height dimension of
// the tensor descriptor is equal to the subgroup size. This is ensured by
// the op verifier.
- auto payloadTy = dyn_cast<VectorType>(storeScatter.getValueType());
- if (!payloadTy) {
- storeScatter.emitWarning("Not propagating, non-vector payload supplied.");
- return;
- }
- auto payloadShape = payloadTy.getShape();
- if (payloadShape.size() > 1)
+ ArrayRef<int64_t> tdescShape = storeScatter.getTensorDescType().getShape();
+ if (tdescShape.size() > 1)
assert(
- payloadShape[0] == xegpu::targetinfo::subgroupSize &&
+ tdescShape[0] == xegpu::targetinfo::subgroupSize &&
"Expected the first dimension of 2D tensor descriptor to be equal to "
"subgroup size.");
- LayoutInfo payloadLayout =
- getDefaultSIMTLayoutInfo(payloadTy, /*scattered=*/true);
+ LayoutInfo layout =
+ getDefaultSIMTLayoutInfo(storeScatter.getTensorDescType());
+ // Propagate the value layout.
+ propagateIfChanged(operands[0], operands[0]->meet(layout));
+ // Propagate the tensor descriptor layout.
+ propagateIfChanged(operands[1], operands[1]->meet(layout));
+ // Use default 1D layout for mask operand.
LayoutInfo maskLayout = getDefaultSIMTLayoutInfo(1);
- // Propagate the payload operand layout
- propagateIfChanged(operands[0], operands[0]->meet(payloadLayout));
- // Propagate the destination (if tdesc) operand layout
- if (isa<xegpu::TensorDescType>(storeScatter.getDestType()))
- propagateIfChanged(operands[1], operands[1]->meet(payloadLayout));
- // Propagate the new layout to the mask and optional offset operand.
propagateIfChanged(operands[2], operands[2]->meet(maskLayout));
- if (storeScatter.getOffsets())
- propagateIfChanged(operands[3], operands[3]->meet(maskLayout));
}
namespace {
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index b491993..dddb5ea 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -807,200 +807,6 @@
}
};
-/// Distribute a scattered store op. The offsets argument is required.
-/// Both offset and mask vectors must be 1D and have #subgroup_size elements.
-/// The layouts are fixed and implicit: one offset/mask per lane.
-/// The pass changes the offset/mask vector shapes to a
-/// single-element vector, **it is assumed that their producer will also be
-/// distributed**. The payload vector also has a fixed distribution:
-/// no chunk size -> vector of one element.
-/// chunk size -> vector of the innermost dimension of the SG-payload.
-/// Example 1 (no chunk size):
-/// %mask = producer_op : vector<16xi1>
-/// %offset = producer_op : vector<16xindex>
-/// xegpu.store %payload, %src[%offset], %mask : vector<16xf16>,
-/// memref<256xf16>, vector<16xindex>, vector<16xi1>
-/// To
-/// %mask = producer_op : vector<1xi1>
-/// %offset = producer_op : vector<1xindex>
-/// xegpu.store %payload, %src[%offset], %mask : vector<1xf16>,
-/// memref<256xf16>, vector<1xindex>, vector<1xi1>
-/// Example 2 (chunk size, same mask and offsets):
-/// xegpu.store %payload, %src[%offset], %mask <{chunk_size=8}> :
-/// vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
-/// To
-/// xegpu.store %payload, %src[%offset], %mask <{chunk_size=8}> :
-/// vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
-struct StoreDistribution final : public gpu::WarpDistributionPattern {
- using gpu::WarpDistributionPattern::WarpDistributionPattern;
- LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
- PatternRewriter &rewriter) const override {
- Operation *lastNode = warpOp.getTerminator()->getPrevNode();
- auto storeScatterOp = dyn_cast_or_null<xegpu::StoreScatterOp>(lastNode);
- if (!storeScatterOp)
- return failure();
- auto offsets = storeScatterOp.getOffsets();
- if (!offsets || !isa<VectorType>(offsets.getType()))
- return rewriter.notifyMatchFailure(
- storeScatterOp, "Store op must have a vector of offsets argument");
- VectorType offsetsTy = cast<VectorType>(offsets.getType());
- VectorType maskTy = cast<VectorType>(storeScatterOp.getMask().getType());
- if (offsetsTy.getRank() != 1 || maskTy.getRank() != 1)
- return rewriter.notifyMatchFailure(storeScatterOp,
- "Expected 1D offsets and mask vector");
- VectorType storeVecTy = cast<VectorType>(storeScatterOp.getValueType());
- if (storeVecTy.getRank() > 2)
- return rewriter.notifyMatchFailure(
- storeScatterOp, "Expected at most 2D result at SG level");
-
- std::string layoutPayloadName =
- xegpu::getLayoutName(storeScatterOp->getOpOperand(0));
- std::string layoutOffsetsName =
- xegpu::getLayoutName(storeScatterOp->getOpOperand(2));
- std::string layoutMaskName =
- xegpu::getLayoutName(storeScatterOp->getOpOperand(3));
-
- xegpu::LayoutAttr layoutPayload =
- storeScatterOp->getAttrOfType<xegpu::LayoutAttr>(layoutPayloadName);
- xegpu::LayoutAttr layoutOffsets =
- storeScatterOp->getAttrOfType<xegpu::LayoutAttr>(layoutOffsetsName);
- xegpu::LayoutAttr layoutMask =
- storeScatterOp->getAttrOfType<xegpu::LayoutAttr>(layoutMaskName);
-
- FailureOr<VectorType> distStoreVecByWarpOpOrFailure =
- getDistVecTypeBasedOnLaneLayout(layoutPayload, storeVecTy);
- FailureOr<VectorType> distOffsetsByWarpOpOrFailure =
- getDistVecTypeBasedOnLaneLayout(layoutOffsets, offsetsTy);
- FailureOr<VectorType> distMaskByWarpOpOrFailure =
- getDistVecTypeBasedOnLaneLayout(layoutMask, maskTy);
- if (failed(distStoreVecByWarpOpOrFailure) ||
- failed(distOffsetsByWarpOpOrFailure) ||
- failed(distMaskByWarpOpOrFailure)) {
- return rewriter.notifyMatchFailure(
- storeScatterOp,
- "Some vector operands have no layouts, using defaults instead.");
- }
- VectorType distPayloadTy = distStoreVecByWarpOpOrFailure.value();
- VectorType expectedPayloadTy = VectorType::get(
- {distPayloadTy.getNumElements()}, distPayloadTy.getElementType());
-
- SmallVector<size_t> newRetIndices;
- SmallVector<Value> operands = storeScatterOp->getOperands();
- SmallVector<Type> operandTypesToYield = {
- expectedPayloadTy, operands[1].getType(),
- distOffsetsByWarpOpOrFailure.value(),
- distMaskByWarpOpOrFailure.value()};
-
- gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
- rewriter, warpOp, operands, operandTypesToYield, newRetIndices);
- SmallVector<Value> newStoreScatterOpOperands = llvm::map_to_vector(
- newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });
-
- rewriter.setInsertionPointAfter(newWarpOp);
- xegpu::StoreScatterOp newOp = xegpu::StoreScatterOp::create(
- rewriter, newWarpOp.getLoc(), TypeRange{}, newStoreScatterOpOperands,
- storeScatterOp->getAttrs());
- xegpu::removeLayoutAttrs(newOp);
- rewriter.eraseOp(storeScatterOp);
- return success();
- }
-};
-
-/// Distribute a scattered load op. The logic and requirements are the same as
-/// for the scattered store distribution. The warpOp's payload vector is
-/// expected to be distributed by the load's result consumer.
-/// Example 1 (no chunk size):
-/// %mask = producer_op : vector<16xi1>
-/// %offset = producer_op : vector<16xindex>
-/// %0 = xegpu.load %payload, %src[%offset], %mask : memref<256xf16>,
-/// vector<16xindex>, vector<16xi1> -> vector<16xf16>
-/// To
-/// %mask = producer_op : vector<1xi1>
-/// %offset = producer_op : vector<1xindex>
-/// %0 = xegpu.load %payload, %src[%offset], %mask : memref<256xf16>,
-/// vector<1xindex>, vector<1xi1> -> vector<1xf16>
-/// Example 2 (chunk size, same mask and offsets):
-/// %0 = xegpu.load %payload, %src[%offset], %mask <{chunk_size=8}> :
-/// memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
-/// To
-/// %0 = xegpu.load %payload, %src[%offset], %mask <{chunk_size=8}> :
-/// memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
-struct LoadDistribution final : public gpu::WarpDistributionPattern {
- using gpu::WarpDistributionPattern::WarpDistributionPattern;
- LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
- PatternRewriter &rewriter) const override {
- OpOperand *producedByLastLoad = getWarpResult(warpOp, [&](Operation *op) {
- // Check if the yield operand that was produced by the *last* scattered
- // load op to avoid sinking it before barriers (maintain memory order).
- return isa<xegpu::LoadGatherOp>(op) &&
- warpOp.getTerminator()->getPrevNode() == op;
- });
- if (!producedByLastLoad)
- return rewriter.notifyMatchFailure(
- warpOp, "The last op is not xegpu::LoadGatherOp");
-
- auto loadGatherOp =
- producedByLastLoad->get().getDefiningOp<xegpu::LoadGatherOp>();
- auto offsets = loadGatherOp.getOffsets();
- if (!offsets || !isa<VectorType>(offsets.getType()) ||
- !isa<VectorType>(loadGatherOp.getMask().getType()))
- return rewriter.notifyMatchFailure(
- loadGatherOp,
- "Load op must have a vector arguments for offsets and mask");
- VectorType offsetsTy = cast<VectorType>(offsets.getType());
- VectorType maskTy = cast<VectorType>(loadGatherOp.getMask().getType());
- if (offsetsTy.getRank() != 1 || maskTy.getRank() != 1)
- return rewriter.notifyMatchFailure(loadGatherOp,
- "Expected 1D offsets and mask vector");
- // Assume offset and mask producers will be distributed as well.
- std::string layoutOffsetsName =
- xegpu::getLayoutName(loadGatherOp->getOpOperand(1));
- std::string layoutMaskName =
- xegpu::getLayoutName(loadGatherOp->getOpOperand(2));
-
- xegpu::LayoutAttr layoutOffsets =
- loadGatherOp->getAttrOfType<xegpu::LayoutAttr>(layoutOffsetsName);
- xegpu::LayoutAttr layoutMask =
- loadGatherOp->getAttrOfType<xegpu::LayoutAttr>(layoutMaskName);
-
- FailureOr<VectorType> distOffsetsByWarpOpOrFailure =
- getDistVecTypeBasedOnLaneLayout(layoutOffsets, offsetsTy);
- FailureOr<VectorType> distMaskByWarpOpOrFailure =
- getDistVecTypeBasedOnLaneLayout(layoutMask, maskTy);
- if (failed(distOffsetsByWarpOpOrFailure) ||
- failed(distMaskByWarpOpOrFailure)) {
- return rewriter.notifyMatchFailure(
- loadGatherOp,
- "Some vector operands have no layouts, using defaults instead.");
- }
-
- SmallVector<size_t> newRetIndices;
- SmallVector<Value> operands = loadGatherOp->getOperands();
- SmallVector<Type> operandTypesToYield = {
- operands[0].getType(), distOffsetsByWarpOpOrFailure.value(),
- distMaskByWarpOpOrFailure.value()};
-
- gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
- rewriter, warpOp, operands, operandTypesToYield, newRetIndices);
-
- SmallVector<Value> newLoadGatherOperands = llvm::map_to_vector(
- newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });
-
- const unsigned operandIdx = producedByLastLoad->getOperandNumber();
- VectorType loadVecTy =
- cast<VectorType>(warpOp.getResult(operandIdx).getType());
-
- rewriter.setInsertionPointAfter(newWarpOp);
- xegpu::LoadGatherOp newOp = rewriter.create<xegpu::LoadGatherOp>(
- newWarpOp.getLoc(), loadVecTy, newLoadGatherOperands,
- loadGatherOp->getAttrs());
- xegpu::removeLayoutAttrs(newOp);
- Value distributedVal = newWarpOp.getResult(operandIdx);
- rewriter.replaceAllUsesWith(distributedVal, newOp->getResult(0));
- return success();
- }
-};
-
} // namespace
namespace {
@@ -1013,11 +819,10 @@
void xegpu::populateXeGPUSubgroupDistributePatterns(
RewritePatternSet &patterns) {
- patterns
- .add<CreateNdDescDistribution, StoreNdDistribution, LoadNdDistribution,
- DpasDistribution, PrefetchNdDistribution, UpdateNdOffsetDistribution,
- GpuBarrierDistribution, LoadDistribution, StoreDistribution>(
- patterns.getContext());
+ patterns.add<CreateNdDescDistribution, StoreNdDistribution,
+ LoadNdDistribution, DpasDistribution, PrefetchNdDistribution,
+ UpdateNdOffsetDistribution, GpuBarrierDistribution>(
+ patterns.getContext());
}
void XeGPUSubgroupDistributePass::runOnOperation() {
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
index cba3f0b..0214d84 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
@@ -163,40 +163,6 @@
}
// -----
-// CHECK-LABEL: func.func @scatter_ops_chunksize(
-// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
-// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
-// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
-// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{chunk_size = 8 : i64}>
-// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
-// CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
-func.func @scatter_ops_chunksize(%src: memref<256xf16>) {
- %1 = arith.constant dense<1>: vector<16xi1>
- %offset = arith.constant dense<12> : vector<16xindex>
- %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}>
- : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
- xegpu.store %3, %src[%offset], %1 <{chunk_size=8}>
- : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
- return
-}
-
-// -----
-// CHECK-LABEL: func.func @scatter_ops(
-// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
-// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
-// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
-// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]]
-// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
-// CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
-func.func @scatter_ops(%src: memref<256xf16>) {
- %1 = arith.constant dense<1>: vector<16xi1>
- %offset = arith.constant dense<12> : vector<16xindex>
- %3 = xegpu.load %src[%offset], %1 : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
- xegpu.store %3, %src[%offset], %1 : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
- return
-}
-
-// -----
// CHECK-LABEL: func.func @vector_bitcast_i16_to_f16(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xi16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xi16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
// CHECK: %{{.*}} = vector.bitcast %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xi16> to vector<8x16xf16>
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
index a39aa90..54ef56e 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
@@ -319,39 +319,3 @@
gpu.return
}
}
-
-// -----
-// CHECK-LABEL: gpu.func @scatter_ops_chunksize({{.*}}) {
-// CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<1xi1>
-// CHECK-NEXT: %[[LANE_OFFSET:.*]] = arith.constant dense<12> : vector<1xindex>
-// CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[LANE_OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
-// CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[LANE_OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
-gpu.module @test {
- gpu.func @scatter_ops_chunksize(%src: memref<256xf16>) {
- %1 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1>: vector<16xi1>
- %offset = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
- %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> {
- layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
- } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
- xegpu.store %3, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
- gpu.return
- }
-}
-
-// -----
-// CHECK-LABEL: gpu.func @scatter_ops({{.*}}) {
-// CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<1xi1>
-// CHECK-NEXT: %[[LANE_OFFSET:.*]] = arith.constant dense<12> : vector<1xindex>
-// CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[LANE_OFFSET]]], %[[MASK]] : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16>
-// CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[LANE_OFFSET]]], %[[MASK]] : vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
-gpu.module @test {
- gpu.func @scatter_ops(%src: memref<256xf16>) {
- %1 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1>: vector<16xi1>
- %offset = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
- %3 = xegpu.load %src[%offset], %1 {
- layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
- } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
- xegpu.store %3, %src[%offset], %1 : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
- gpu.return
- }
-}