| // RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' -test-xegpu-sg-distribute \ |
| // RUN: -allow-unregistered-dialect -canonicalize -cse %s | FileCheck %s |
| gpu.module @xevm_module{ |
| // 1-D store_nd distribution: the vector<16xf32> payload is split across 16 lanes into |
| // vector<1xf32>; the layout attr on the tdesc is stripped through an |
| // unrealized_conversion_cast tagged {resolve_simt_type_mismatch}, and the offset (%c0) |
| // is yielded out of the warp region as an extra index result. |
| // CHECK-LABEL: gpu.func @store_nd_1d |
| // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) { |
| // CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] |
| // CHECK-SAME: -> (vector<1xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>, index) { |
| // CHECK: gpu.yield %{{.*}} : vector<16xf32>, |
| // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>, index |
| // CHECK-NEXT: } |
| // CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16xf32, |
| // CHECK-SAME: #xegpu.layout<lane_layout = [16], lane_data = [1]>> to !xegpu.tensor_desc<16xf32> {resolve_simt_type_mismatch} |
| // CHECK-NEXT: xegpu.store_nd %[[W]]#0, %[[T1]][%[[W]]#2] : vector<1xf32>, !xegpu.tensor_desc<16xf32> |
| gpu.func @store_nd_1d(%laneid: index) { |
| %c0 = arith.constant 0 : index |
| gpu.warp_execute_on_lane_0(%laneid)[16] { |
| %0 = "some_op"() : () -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> |
| %cst = "some_op"() : () -> vector<16xf32> |
| xegpu.store_nd %cst, %0 [%c0] {layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>} |
| : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> |
| } |
| gpu.return |
| } |
| |
| // 2-D store_nd distribution: the vector<16x16xf16> payload distributes along the lane |
| // dimension to vector<16x1xf16> and is then shape_cast to the 1-D SIMT form |
| // vector<16xf16> before the sunk store; both offsets travel out as index results. |
| // CHECK-LABEL: gpu.func @store_nd_2d |
| // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) { |
| // CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] |
| // CHECK-SAME: -> (vector<16x1xf16>, !xegpu.tensor_desc<16x16xf16, |
| // CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index) { |
| // CHECK: gpu.yield %{{.*}} : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, |
| // CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index |
| // CHECK-NEXT: } |
| // CHECK-NEXT: %[[CAST:.*]] = vector.shape_cast %[[W]]#0 : vector<16x1xf16> to vector<16xf16> |
| // CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16x16xf16, |
| // CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> to !xegpu.tensor_desc<16x16xf16> {resolve_simt_type_mismatch} |
| // CHECK-NEXT: xegpu.store_nd %[[CAST]], %[[T1]][%[[W]]#2, %[[W]]#3] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> |
| gpu.func @store_nd_2d(%laneid : index) { |
| %c0 = arith.constant 0 : index |
| gpu.warp_execute_on_lane_0(%laneid)[16] { |
| %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> |
| %cst = "some_op"() : () -> vector<16x16xf16> |
| xegpu.store_nd %cst, %0 [%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} |
| : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> |
| } |
| gpu.return |
| } |
| |
| |
| // 1-D load_nd distribution: the load is sunk out of the warp region and retyped to the |
| // lane-local !xegpu.tensor_desc<16xf32> -> vector<1xf32> form; the layout-carrying tdesc |
| // is resolved with an unrealized_conversion_cast. |
| // CHECK-LABEL: gpu.func @load_nd_1d |
| // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) { |
| // CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<1xf32>, |
| // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>, index) { |
| // CHECK: gpu.yield %{{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32, |
| // CHECK-SAME: #xegpu.layout<lane_layout = [16], lane_data = [1]>>, index |
| // CHECK-NEXT: } |
| // CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16xf32, |
| // CHECK-SAME: #xegpu.layout<lane_layout = [16], lane_data = [1]>> to !xegpu.tensor_desc<16xf32> {resolve_simt_type_mismatch} |
| // CHECK-NEXT: xegpu.load_nd %[[T1]][%[[W]]#2] : !xegpu.tensor_desc<16xf32> -> vector<1xf32> |
| gpu.func @load_nd_1d(%laneid: index) { |
| %c0 = arith.constant 0 : index |
| %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1xf32>) { |
| %0 = "some_op"() : () -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> |
| %1 = xegpu.load_nd %0 [%c0] {layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : |
| !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<16xf32> |
| gpu.yield %1 : vector<16xf32> |
| } |
| "some_user_op"(%r) : (vector<1xf32>) -> () |
| gpu.return |
| } |
| |
| |
| // 2-D load_nd distribution: the sunk load produces the flat SIMT vector<16xf16>, which is |
| // shape_cast back to the distributed 2-D shape vector<16x1xf16> expected by users of the |
| // warp-region result. |
| // CHECK-LABEL: gpu.func @load_nd_2d |
| // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) { |
| // CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<16x1xf16>, !xegpu.tensor_desc<16x16xf16, |
| // CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index) { |
| // CHECK: gpu.yield %{{.*}} : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, |
| // CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index |
| // CHECK-NEXT: } |
| // CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16x16xf16, |
| // CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> to !xegpu.tensor_desc<16x16xf16> {resolve_simt_type_mismatch} |
| // CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[T1]][%[[W]]#2, %[[W]]#3] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> |
| // CHECK: vector.shape_cast %[[T2]] : vector<16xf16> to vector<16x1xf16> |
| gpu.func @load_nd_2d(%laneid: index) { |
| %c0 = arith.constant 0 : index |
| %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<16x1xf16>) { |
| %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> |
| %1 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} |
| : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16> |
| gpu.yield %1 : vector<16x16xf16> |
| } |
| "some_user_op"(%r) : (vector<16x1xf16>) -> () |
| gpu.return |
| } |
| |
| |
| // load_nd with array_length = 2: each lane loads a flat vector<32xf16> (2 blocks x 16 rows |
| // x 1 lane element) which is shape_cast to vector<2x16x1xf16>; the block_tdesc_attr is |
| // kept on the tdesc while the layout attr is removed by the cast. |
| // CHECK-LABEL: gpu.func @load_nd_array_length |
| // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) { |
| // CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<2x16x1xf16>, |
| // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, |
| // CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index) { |
| // CHECK: gpu.yield %{{.*}} : vector<2x16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr< |
| // CHECK-SAME: array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index |
| // CHECK-NEXT: } |
| // CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16x16xf16, |
| // CHECK-SAME: #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], |
| // CHECK-SAME: lane_data = [1, 1]>> to !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> |
| // CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[T1]][%[[W]]#2, %[[W]]#3] : !xegpu.tensor_desc<16x16xf16, |
| // CHECK-SAME: #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<32xf16> |
| // CHECK-NEXT: vector.shape_cast %[[T2]] : vector<32xf16> to vector<2x16x1xf16> |
| gpu.func @load_nd_array_length(%laneid: index) { |
| %c0 = arith.constant 0 : index |
| %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2x16x1xf16>) { |
| %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, |
| #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> |
| %1 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} |
| : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, |
| #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<2x16x16xf16> |
| gpu.yield %1 : vector<2x16x16xf16> |
| } |
| "some_user_op"(%r) : (vector<2x16x1xf16>) -> () |
| gpu.return |
| } |
| |
| |
| // dpas distribution: A/B/C operands come out of the warp region as per-lane Nx1 columns, |
| // are shape_cast to the flat 1-D SIMT forms, fed to the sunk xegpu.dpas, and the result |
| // is cast back to vector<8x1xf32>. |
| // CHECK-LABEL: gpu.func @dpas |
| // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) { |
| // CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> |
| // CHECK-SAME: (vector<8x1xf32>, vector<8x1xf16>, vector<16x1xf16>, vector<8x1xf32>) { |
| // CHECK: gpu.yield %{{.*}} : vector<8x16xf32>, vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> |
| // CHECK-NEXT: } |
| // CHECK-DAG: %[[T1:.*]] = vector.shape_cast %[[W]]#1 : vector<8x1xf16> to vector<8xf16> |
| // CHECK-DAG: %[[T2:.*]] = vector.shape_cast %[[W]]#2 : vector<16x1xf16> to vector<16xf16> |
| // CHECK-DAG: %[[T3:.*]] = vector.shape_cast %[[W]]#3 : vector<8x1xf32> to vector<8xf32> |
| // CHECK-NEXT: %[[T4:.*]] = xegpu.dpas %[[T1]], %[[T2]], %[[T3]] : vector<8xf16>, vector<16xf16>, vector<8xf32> -> vector<8xf32> |
| // CHECK-NEXT: vector.shape_cast %[[T4]] : vector<8xf32> to vector<8x1xf32> |
| gpu.func @dpas(%laneid: index) { |
| %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<8x1xf32>) { |
| %0 = "some_op"() : () -> vector<8x16xf16> |
| %1 = "some_op"() : () -> vector<16x16xf16> |
| %2 = "some_op"() : () -> vector<8x16xf32> |
| %3 = xegpu.dpas %0, %1, %2 |
| { |
| layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, |
| layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, |
| layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> |
| } |
| : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> |
| gpu.yield %3 : vector<8x16xf32> |
| } |
| "some_user_op"(%r) : (vector<8x1xf32>) -> () |
| gpu.return |
| } |
| |
| |
| |
| // create_nd_tdesc from a raw ui64 base pointer (explicit shape/strides, no memref): the |
| // tdesc creation is sunk out of the warp region without the layout attr, and the layout |
| // is re-attached for region users via an unrealized_conversion_cast. |
| // CHECK-LABEL: gpu.func @create_nd_tdesc_non_memref |
| // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: ui64, %[[ARG1:[0-9a-zA-Z]+]]: index) { |
| // CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%[[ARG1]])[16] -> (!xegpu.tensor_desc<16x16xf16, |
| // CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, ui64) { |
| // CHECK: gpu.yield %{{.*}} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, ui64 |
| // CHECK-NEXT: } |
| // CHECK-NEXT: %[[T1:.*]] = xegpu.create_nd_tdesc %[[W]]#1, shape : [64, 128], strides : [128, 1] : ui64 -> !xegpu.tensor_desc<16x16xf16> |
| // CHECK-NEXT: builtin.unrealized_conversion_cast %[[T1]] : !xegpu.tensor_desc<16x16xf16> to !xegpu.tensor_desc<16x16xf16, |
| // CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> {resolve_simt_type_mismatch} |
| gpu.func @create_nd_tdesc_non_memref(%arg0: ui64, %laneid: index) { |
| %c0 = arith.constant 0 : index |
| %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (!xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) { |
| %0 = xegpu.create_nd_tdesc %arg0, shape:[64, 128], strides:[128, 1] : ui64 -> |
| !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> |
| gpu.yield %0 : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> |
| } |
| "some_user_op"(%r) |
| : (!xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) -> () |
| gpu.return |
| } |
| |
| |
| // 2-D prefetch_nd distribution: the prefetch is sunk out of the warp region; the layout |
| // attr is dropped (via the cast) while the l1/l2 cache-hint attributes are preserved as |
| // inherent op attrs in <{...}> form. |
| // CHECK-LABEL: gpu.func @prefetch_2d |
| // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) { |
| // CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (!xegpu.tensor_desc<16x16xf16, |
| // CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index) { |
| // CHECK: gpu.yield %{{.*}} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> |
| // CHECK-SAME: , index, index |
| // CHECK-NEXT: } |
| // CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#0 : !xegpu.tensor_desc<16x16xf16, |
| // CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> to !xegpu.tensor_desc<16x16xf16> {resolve_simt_type_mismatch} |
| // CHECK-NEXT: xegpu.prefetch_nd %[[T1]][%[[W]]#1, %[[W]]#2] |
| // CHECK-SAME: <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16> |
| gpu.func @prefetch_2d(%laneid: index) { |
| %c0 = arith.constant 0 : index |
| gpu.warp_execute_on_lane_0(%laneid)[16] { |
| %0 = "some_op"() : () |
| -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> |
| xegpu.prefetch_nd %0[%c0, %c0] |
| {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>} |
| : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> |
| } |
| gpu.return |
| } |
| |
| |
| // 1-D variant of the prefetch_nd test above: single offset, same cache-hint preservation |
| // and layout-stripping cast. |
| // CHECK-LABEL: gpu.func @prefetch_1d |
| // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) { |
| // CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (!xegpu.tensor_desc<16xf16, |
| // CHECK-SAME: #xegpu.layout<lane_layout = [16], lane_data = [1]>>, index) { |
| // CHECK: gpu.yield %{{.*}} : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>, index |
| // CHECK-NEXT: } |
| // CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#0 : !xegpu.tensor_desc<16xf16, |
| // CHECK-SAME: #xegpu.layout<lane_layout = [16], lane_data = [1]>> to !xegpu.tensor_desc<16xf16> {resolve_simt_type_mismatch} |
| // CHECK-NEXT: xegpu.prefetch_nd %[[T1]][%[[W]]#1] <{l1_hint = #xegpu.cache_hint<cached>, |
| // CHECK-SAME: l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16> |
| gpu.func @prefetch_1d(%laneid: index) { |
| %c0 = arith.constant 0 : index |
| gpu.warp_execute_on_lane_0(%laneid)[16] { |
| %0 = "some_op"() : () |
| -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> |
| xegpu.prefetch_nd %0[%c0] |
| {layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>} |
| : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> |
| } |
| gpu.return |
| } |
| |
| |
| // CHECK-LABEL: gpu.func @gpu_barrier({{.*}}) { |
| // CHECK: gpu.warp_execute_on_lane_0(%{{.*}})[16] -> ({{.*}}) { |
| // CHECK: gpu.yield %{{.*}} |
| // CHECK: } |
| // CHECK: %{{.*}} = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16xf16> -> vector<1xf16> |
| // CHECK: gpu.barrier |
| gpu.func @gpu_barrier(%laneid: index) { |
| %c0 = arith.constant 0 : index |
| %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1xf16>) { |
| %0 = "some_op"() : () -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> |
| %1 = xegpu.load_nd %0[%c0] |
| {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} |
| : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<16xf16> |
| gpu.barrier |
| gpu.yield %1 : vector<16xf16> |
| } |
| "some_user_op"(%r) : (vector<1xf16>) -> () |
| gpu.return |
| } |
| |
| |
| // multi_reduction over dim 0 with the result distributed along dim 1 (lane_layout |
| // [1, 16]): each lane holds 2 columns (32/16) of the 16x32 source. The reduction becomes |
| // lane-local: per column, extract_strided_slice + shape_cast to vector<16xf32>, a |
| // vector.reduction seeded with the matching accumulator element, then from_elements to |
| // rebuild the vector<2xf32> result. |
| // CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction |
| // CHECK: %[[ACC:.*]] = arith.constant {{.*}} dense<0.000000e+00> : vector<32xf32> |
| // CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] |
| // CHECK-SAME: -> (vector<2xf32>, vector<16x2xf32>, vector<2xf32>) { |
| // CHECK: %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<16x32xf32> |
| // CHECK: gpu.yield %{{.*}}, %[[SRC]], %[[ACC]] : vector<32xf32>, vector<16x32xf32>, vector<32xf32> |
| // CHECK-NEXT: } |
| // CHECK: %[[T1:.*]] = vector.extract_strided_slice %[[W]]#1 |
| // CHECK-SAME: {offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> |
| // CHECK: %[[T2:.*]] = vector.shape_cast %[[T1]] : vector<16x1xf32> to vector<16xf32> |
| // CHECK: %[[T3:.*]] = vector.extract %[[W]]#2[0] : f32 from vector<2xf32> |
| // CHECK: %[[T4:.*]] = vector.reduction <add>, %[[T2]], %[[T3]] : vector<16xf32> into f32 |
| // CHECK: %[[T5:.*]] = vector.extract_strided_slice %[[W]]#1 |
| // CHECK-SAME: {offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> |
| // CHECK: %[[T6:.*]] = vector.shape_cast %[[T5]] : vector<16x1xf32> to vector<16xf32> |
| // CHECK: %[[T7:.*]] = vector.extract %[[W]]#2[1] : f32 from vector<2xf32> |
| // CHECK: %[[T8:.*]] = vector.reduction <add>, %[[T6]], %[[T7]] : vector<16xf32> into f32 |
| // CHECK: %[[T9:.*]] = vector.from_elements %[[T4]], %[[T8]] : vector<2xf32> |
| gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction(%laneid: index) { |
| %c0 = arith.constant 0 : index |
| %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) { |
| %src = "some_def"() |
| {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} |
| : () -> (vector<16x32xf32>) |
| %acc = arith.constant |
| {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} |
| dense<0.0> : vector<32xf32> |
| %1 = vector.multi_reduction <add>, %src, %acc |
| { |
| layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, |
| layout_operand_1 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>, |
| layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]> |
| } [0] |
| : vector<16x32xf32> to vector<32xf32> |
| gpu.yield %1 : vector<32xf32> |
| } |
| "some_user_op"(%r) : (vector<2xf32>) -> () |
| gpu.return |
| } |
| |
| |
| // multi_reduction over dim 1 (the distributed dim, lane_layout [1, 16]): the reduction |
| // crosses lanes, so the checked output keeps the per-row vector.reduction ops INSIDE the |
| // warp region (note the ops appear between "some_def" and the region end), one reduction |
| // per row of the 2x16 source. |
| // CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction |
| // CHECK: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> ({{.*}}) { |
| // CHECK-NEXT: %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<2x16xf32> |
| // CHECK-NEXT: %[[T2:.*]] = vector.extract %[[SRC]][0] : vector<16xf32> from vector<2x16xf32> |
| // CHECK-NEXT: %[[T3:.*]] = vector.reduction <add>, %[[T2]], %{{.*}} : vector<16xf32> into f32 |
| // CHECK-NEXT: %[[T5:.*]] = vector.extract %[[SRC]][1] : vector<16xf32> from vector<2x16xf32> |
| // CHECK-NEXT: %[[T6:.*]] = vector.reduction <add>, %[[T5]], %{{.*}} : vector<16xf32> into f32 |
| gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index) { |
| %c0 = arith.constant 0 : index |
| %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) { |
| %src = "some_def"() |
| {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} |
| : () -> (vector<2x16xf32>) |
| %acc = arith.constant |
| {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>} |
| dense<0.0> : vector<2xf32> |
| %1 = vector.multi_reduction <add>, %src, %acc |
| { |
| layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, |
| layout_operand_1 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>, |
| layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]> |
| } |
| [1] : vector<2x16xf32> to vector<2xf32> |
| gpu.yield %1 : vector<2xf32> |
| } |
| "some_user_op"(%r) : (vector<2xf32>) -> () |
| gpu.return |
| } |
| |
| |
| |
| // multi_reduction over dim 1 with the source distributed along dim 0 (lane_layout |
| // [16, 1]): each lane owns 2 full rows (32/16) of the 32x16 source, so the reduction is |
| // lane-local. The checked output extracts each row outside the region, reduces it with |
| // the matching accumulator element, and rebuilds vector<2xf32> with from_elements. |
| // CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction |
| // CHECK: %[[ACC:.*]] = arith.constant {{.*}} dense<0.000000e+00> : vector<32xf32> |
| // CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>, vector<2x16xf32>, vector<2xf32>) { |
| // CHECK: %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<32x16xf32> |
| // CHECK: gpu.yield %{{.*}}, %[[SRC]], %[[ACC]] : vector<32xf32>, vector<32x16xf32>, vector<32xf32> |
| // CHECK: } |
| // CHECK: %[[T1:.*]] = vector.extract %[[W]]#1[0] : vector<16xf32> from vector<2x16xf32> |
| // CHECK: %[[T2:.*]] = vector.extract %[[W]]#2[0] : f32 from vector<2xf32> |
| // CHECK: %[[T3:.*]] = vector.reduction <add>, %[[T1]], %[[T2]] : vector<16xf32> into f32 |
| // CHECK: %[[T4:.*]] = vector.extract %[[W]]#1[1] : vector<16xf32> from vector<2x16xf32> |
| // CHECK: %[[T5:.*]] = vector.extract %[[W]]#2[1] : f32 from vector<2xf32> |
| // CHECK: %[[T6:.*]] = vector.reduction <add>, %[[T4]], %[[T5]] : vector<16xf32> into f32 |
| // CHECK: %[[T7:.*]] = vector.from_elements %[[T3]], %[[T6]] : vector<2xf32> |
| gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction(%laneid: index) { |
| %c0 = arith.constant 0 : index |
| %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) { |
| %src = "some_def"() |
| {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} |
| : () -> (vector<32x16xf32>) |
| %acc = arith.constant |
| {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [1]>} |
| dense<0.0> : vector<32xf32> |
| %1 = vector.multi_reduction <add>, %src, %acc |
| { |
| layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, |
| layout_operand_1 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [1]>, |
| layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [1]> |
| } |
| [1] : vector<32x16xf32> to vector<32xf32> |
| gpu.yield %1 : vector<32xf32> |
| } |
| "some_user_op"(%r) : (vector<2xf32>) -> () |
| gpu.return |
| } |
| |
| |
| // multi_reduction over dim 0, which is also the distributed dim (lane_layout [16, 1]): |
| // the reduction crosses lanes, so the column extracts, shape_casts and vector.reduction |
| // ops are checked WITH their layout_* attributes still attached, i.e. they remain inside |
| // the warp region operating on the full 16x2 SIMD value. |
| // CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction |
| // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 |
| // CHECK: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>) { |
| // CHECK: %[[SRC:.*]] = "some_def"() |
| // CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} |
| // CHECK-SAME: : () -> vector<16x2xf32> |
| // CHECK: %[[T1:.*]] = vector.extract_strided_slice %[[SRC]] |
| // CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, |
| // CHECK-SAME: offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> |
| // CHECK: %[[T2:.*]] = vector.shape_cast %[[T1]] |
| // CHECK-SAME: {layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, |
| // CHECK-SAME: layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>} |
| // CHECK-SAME: : vector<16x1xf32> to vector<16xf32> |
| // CHECK: %[[T3:.*]] = vector.reduction <add>, %[[T2]], %[[CST]] : vector<16xf32> into f32 |
| // CHECK: %[[T4:.*]] = vector.extract_strided_slice %[[SRC]] |
| // CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, |
| // CHECK-SAME: offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> |
| // CHECK: %[[T5:.*]] = vector.shape_cast %[[T4]] |
| // CHECK-SAME: {layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, |
| // CHECK-SAME: layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>} |
| // CHECK-SAME: : vector<16x1xf32> to vector<16xf32> |
| // CHECK: %[[T6:.*]] = vector.reduction <add>, %[[T5]], %[[CST]] : vector<16xf32> into f32 |
| gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index) { |
| %c0 = arith.constant 0 : index |
| %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) { |
| %src = "some_def"() |
| {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} |
| : () -> (vector<16x2xf32>) |
| %acc = arith.constant |
| {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>} |
| dense<0.0> : vector<2xf32> |
| %1 = vector.multi_reduction <add>, %src, %acc |
| { |
| layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, |
| layout_operand_1 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>, |
| layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]> |
| } |
| [0] : vector<16x2xf32> to vector<2xf32> |
| gpu.yield %1 : vector<2xf32> |
| } |
| "some_user_op"(%r) : (vector<2xf32>) -> () |
| gpu.return |
| } |
| |
| |
| // 3-D multi_reduction with a leading unit dim (1x16x32, lane_layout [1, 1, 16]): same |
| // lane-local strategy as the 2-D dim-0-reduction case, but slices/extracts carry the |
| // extra leading index and the result keeps shape vector<1x2xf32>. |
| // CHECK-LABEL: gpu.func @vector_multi_reduction_3d_leading_unit_dim |
| // CHECK: %[[ACC:.*]] = arith.constant {{.*}} dense<0.000000e+00> : vector<1x32xf32> |
| // CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] |
| // CHECK-SAME: -> (vector<1x2xf32>, vector<1x16x2xf32>, vector<1x2xf32>) { |
| // CHECK: %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<1x16x32xf32> |
| // CHECK: gpu.yield %{{.*}}, %[[SRC]], %[[ACC]] : vector<1x32xf32>, vector<1x16x32xf32>, vector<1x32xf32> |
| // CHECK-NEXT: } |
| // CHECK: %[[T1:.*]] = vector.extract_strided_slice %[[W]]#1 |
| // CHECK-SAME: {offsets = [0, 0, 0], sizes = [1, 16, 1], strides = [1, 1, 1]} : vector<1x16x2xf32> to vector<1x16x1xf32> |
| // CHECK: %[[T2:.*]] = vector.shape_cast %[[T1]] : vector<1x16x1xf32> to vector<16xf32> |
| // CHECK: %[[T3:.*]] = vector.extract %[[W]]#2[0, 0] : f32 from vector<1x2xf32> |
| // CHECK: %[[T4:.*]] = vector.reduction <add>, %[[T2]], %[[T3]] : vector<16xf32> into f32 |
| // CHECK: %[[T5:.*]] = vector.extract_strided_slice %[[W]]#1 |
| // CHECK-SAME: {offsets = [0, 0, 1], sizes = [1, 16, 1], strides = [1, 1, 1]} : vector<1x16x2xf32> to vector<1x16x1xf32> |
| // CHECK: %[[T6:.*]] = vector.shape_cast %[[T5]] : vector<1x16x1xf32> to vector<16xf32> |
| // CHECK: %[[T7:.*]] = vector.extract %[[W]]#2[0, 1] : f32 from vector<1x2xf32> |
| // CHECK: %[[T8:.*]] = vector.reduction <add>, %[[T6]], %[[T7]] : vector<16xf32> into f32 |
| // CHECK: %[[T9:.*]] = vector.from_elements %[[T4]], %[[T8]] : vector<1x2xf32> |
| gpu.func @vector_multi_reduction_3d_leading_unit_dim(%laneid: index) { |
| %c0 = arith.constant 0 : index |
| %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1x2xf32>) { |
| %src = "some_def"() |
| {layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>} |
| : () -> (vector<1x16x32xf32>) |
| %acc = arith.constant |
| {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>, dims = [1]>} |
| dense<0.0> : vector<1x32xf32> |
| %1 = vector.multi_reduction <add>, %src, %acc |
| { |
| layout_operand_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>, |
| layout_operand_1 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>, dims = [1]>, |
| layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>, dims = [1]> |
| } |
| [1] : vector<1x16x32xf32> to vector<1x32xf32> |
| gpu.yield %1 : vector<1x32xf32> |
| } |
| "some_user_op"(%r) : (vector<1x2xf32>) -> () |
| gpu.return |
| } |
| |
| |
| // Degenerate 3-D case: after distribution each lane holds a single source element |
| // (vector<1x1x1xf32>), so the reduction collapses to a scalar extract + arith.addf with |
| // the accumulator element, broadcast back to the vector<1x1xf32> result shape. |
| // CHECK-LABEL: gpu.func @vector_multi_reduction_3d_trivial_reduction |
| // CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] |
| // CHECK-SAME: -> (vector<1x1xf32>, vector<1x1x1xf32>, vector<1x1xf32>) { |
| // CHECK: %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<1x1x16xf32> |
| // CHECK: gpu.yield %{{.*}}, %[[SRC]], %{{.*}} : vector<1x16xf32>, vector<1x1x16xf32>, vector<1x16xf32> |
| // CHECK-NEXT: } |
| // CHECK: %[[A:.*]] = vector.extract %[[W]]#2[0, 0] : f32 from vector<1x1xf32> |
| // CHECK: %[[S:.*]] = vector.extract %[[W]]#1[0, 0, 0] : f32 from vector<1x1x1xf32> |
| // CHECK: %[[ADD:.*]] = arith.addf %[[S]], %[[A]] : f32 |
| // CHECK: %[[BC:.*]] = vector.broadcast %[[ADD]] : f32 to vector<1x1xf32> |
| gpu.func @vector_multi_reduction_3d_trivial_reduction(%laneid: index) { |
| %c0 = arith.constant 0 : index |
| %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1x1xf32>) { |
| %src = "some_def"() |
| {layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>} |
| : () -> (vector<1x1x16xf32>) |
| %acc = arith.constant |
| {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>, dims = [1]>} |
| dense<0.0> : vector<1x16xf32> |
| %1 = vector.multi_reduction <add>, %src, %acc |
| { |
| layout_operand_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>, |
| layout_operand_1 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>, dims = [1]>, |
| layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>, dims = [1]> |
| } |
| [1] : vector<1x1x16xf32> to vector<1x16xf32> |
| gpu.yield %1 : vector<1x16xf32> |
| } |
| "some_user_op"(%r) : (vector<1x1xf32>) -> () |
| gpu.return |
| } |
| |
| // Scatter load/store with chunk_size = 8: offsets and masks distribute from 16 elements |
| // to 1 per lane, the payload from 16x8 to a flat vector<8xf16> per lane, and chunk_size |
| // is preserved as an inherent attr on the sunk ops. |
| // CHECK-LABEL: gpu.func @scatter_ops_chunksize({{.*}}) { |
| // CHECK: %[[OFFSETS:.*]] = arith.constant dense<12> : vector<16xindex> |
| // CHECK: %[[MASKS:.*]] = arith.constant dense<true> : vector<16xi1> |
| // CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%{{.*}})[16] |
| // CHECK-SAME: -> (vector<1x8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>) { |
| // CHECK: gpu.yield %{{.*}}, %{{.*}}, %[[OFFSETS]], %[[MASKS]] : |
| // CHECK-SAME: vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> |
| // CHECK-NEXT: } |
| // CHECK-NEXT: %[[T1:.*]] = xegpu.load %[[W]]#1[%[[W]]#2], %[[W]]#3 <{chunk_size = 8 : i64}> |
| // CHECK-SAME: : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> |
| // CHECK-NEXT: xegpu.store %[[T1]], %[[W]]#1[%[[W]]#2], %[[W]]#3 <{chunk_size = 8 : i64}> |
| // CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> |
| gpu.func @scatter_ops_chunksize(%laneid: index, %src: memref<256xf16>) { |
| gpu.warp_execute_on_lane_0(%laneid)[16] { |
| %1 = arith.constant dense<1>: vector<16xi1> |
| %offset = arith.constant dense<12> : vector<16xindex> |
| %3 = xegpu.load %src[%offset], %1 <{chunk_size=8, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}> |
| : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> |
| xegpu.store %3, %src[%offset], %1 <{chunk_size=8, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}> |
| : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> |
| } |
| gpu.return |
| } |
| |
| // Plain scatter load/store (no chunk_size): per-lane vectors shrink 16 -> 1 and the |
| // layout discardable attrs are dropped on the sunk ops. |
| // CHECK-LABEL: gpu.func @scatter_ops({{.*}}) { |
| // CHECK: %[[OFFSETS:.*]] = arith.constant dense<12> : vector<16xindex> |
| // CHECK: %[[MASKS:.*]] = arith.constant dense<true> : vector<16xi1> |
| // CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%{{.*}})[16] |
| // CHECK-SAME: -> (vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>) { |
| // CHECK: gpu.yield %{{.*}}, %{{.*}}, %[[OFFSETS]], %[[MASKS]] |
| // CHECK-SAME: : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> |
| // CHECK-NEXT: } |
| // CHECK-NEXT: %[[T1:.*]] = xegpu.load %[[W]]#1[%[[W]]#2], %[[W]]#3 |
| // CHECK-SAME: : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16> |
| // CHECK-NEXT: xegpu.store %[[T1]], %[[W]]#1[%[[W]]#2], %[[W]]#3 |
| // CHECK-SAME: : vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> |
| gpu.func @scatter_ops(%src: memref<256xf16>, %laneid: index) { |
| gpu.warp_execute_on_lane_0(%laneid)[16] { |
| %1 = arith.constant dense<1> : vector<16xi1> |
| %offset = arith.constant dense<12> : vector<16xindex> |
| %3 = xegpu.load %src[%offset], %1 |
| { |
| layout = #xegpu.layout<lane_layout = [16], lane_data = [1]> |
| } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> |
| xegpu.store %3, %src[%offset], %1 |
| { |
| layout = #xegpu.layout<lane_layout = [16], lane_data = [1]> |
| } |
| : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> |
| } |
| gpu.return |
| } |
| |
| // CHECK-LABEL: gpu.func @scatter_ops_with_leading_dims({{.*}}) { |
| // CHECK: %[[OFFSETS:.*]] = arith.constant dense<12> : vector<1x1x16xindex> |
| // CHECK: %[[MASKS:.*]] = arith.constant dense<true> : vector<1x1x16xi1> |
| // CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%{{.*}})[16] |
| // CHECK-SAME: -> (vector<1x1x1xf16>, memref<256xf16>, vector<1x1x1xindex>, vector<1x1x1xi1>) { |
| // CHECK: gpu.yield %{{.*}}, %{{.*}}, %[[OFFSETS]], %[[MASKS]] |
| // CHECK-SAME: : vector<1x1x16xf16>, memref<256xf16>, vector<1x1x16xindex>, vector<1x1x16xi1> |
| // CHECK-NEXT: } |
| // CHECK-NEXT: %[[V1:.*]] = vector.shape_cast %[[W]]#2 : vector<1x1x1xindex> to vector<1xindex> |
| // CHECK-NEXT: %[[V2:.*]] = vector.shape_cast %[[W]]#3 : vector<1x1x1xi1> to vector<1xi1> |
| // CHECK-NEXT: %[[T1:.*]] = xegpu.load %[[W]]#1[%[[V1]]], %[[V2]] |
| // CHECK-SAME: : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16> |
| // CHECK-NEXT: xegpu.store %[[T1]], %[[W]]#1[%[[V1]]], %[[V2]] |
| // CHECK-SAME: : vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> |
| // Same as @scatter_ops but with leading unit dims (1x1x16 operands): only the |
| // innermost dim is lane-distributed, and the CHECK block above expects |
| // shape_casts to drop the unit dims before the distributed load/store. |
| gpu.func @scatter_ops_with_leading_dims(%src: memref<256xf16>, %laneid: index) { |
| gpu.warp_execute_on_lane_0(%laneid)[16] { |
| %1 = arith.constant |
| dense<1> : vector<1x1x16xi1> |
| %offset = arith.constant |
| dense<12> : vector<1x1x16xindex> |
| %3 = xegpu.load %src[%offset], %1 {layout = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>} |
| : memref<256xf16>, vector<1x1x16xindex>, vector<1x1x16xi1> -> vector<1x1x16xf16> |
| xegpu.store %3, %src[%offset], %1 { layout = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>} |
| : vector<1x1x16xf16>, memref<256xf16>, vector<1x1x16xindex>, vector<1x1x16xi1> |
| } |
| gpu.return |
| } |
| |
| // CHECK-LABEL: gpu.func @memref_extract_aligned_pointer_as_index( |
| // CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (index, memref<256x256xf16>) { |
| // CHECK: gpu.yield %{{.*}}, %{{.*}} : index, memref<256x256xf16> |
| // CHECK-NEXT: } |
| // CHECK-NEXT: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[W]]#1 : memref<256x256xf16> -> index |
| // CHECK-NEXT: arith.index_cast %[[INTPTR]] : index to i64 |
| // The pointer extraction is expected to be moved out of the warp region: the |
| // region yields the memref instead, and extract_aligned_pointer_as_index runs |
| // on the warp-op result (see CHECK block above). |
| gpu.func @memref_extract_aligned_pointer_as_index(%arg0 : memref<256x256xf16>, %laneid: index) { |
| %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (index) { |
| %ptr = memref.extract_aligned_pointer_as_index %arg0 : memref<256x256xf16> -> index |
| gpu.yield %ptr : index |
| } |
| %ptr_i64 = arith.index_cast %r : index to i64 |
| "some_user_op"(%ptr_i64) : (i64) -> () |
| gpu.return |
| } |
| |
| // CHECK-LABEL: gpu.func @memref_alloca( |
| // CHECK-NEXT: %[[ALLOCA:.*]] = memref.alloca() : memref<2048xi8, 3> |
| // CHECK-NEXT: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[ALLOCA]] : memref<2048xi8, 3> -> index |
| // CHECK-NEXT: %[[CAST:.*]] = arith.index_cast %[[INTPTR]] : index to i64 |
| // An alloca yielded from the warp region: the CHECK lines above expect the |
| // alloca / pointer-extraction / cast sequence to appear directly at function |
| // level, i.e. the warp region folds away entirely. |
| gpu.func @memref_alloca(%laneid: index) { |
| %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (memref<2048xi8, 3>) { |
| %alloca = memref.alloca() : memref<2048xi8, 3> |
| gpu.yield %alloca : memref<2048xi8, 3> |
| } |
| %ptr = memref.extract_aligned_pointer_as_index %r : memref<2048xi8, 3> -> index |
| %ptr_i64 = arith.index_cast %ptr : index to i64 |
| "some_user_op"(%ptr_i64) : (i64) -> () |
| gpu.return |
| } |
| |
| // CHECK-LABEL: gpu.func @create_memdesc( |
| // CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (!xegpu.mem_desc<4x128xf32>, memref<2048xi8, 3>) { |
| // CHECK: gpu.yield %{{.*}}, %{{.*}} : !xegpu.mem_desc<4x128xf32>, memref<2048xi8, 3> |
| // CHECK-NEXT: } |
| // CHECK-NEXT: %[[MDesc:.*]] = xegpu.create_mem_desc %[[W]]#1 : memref<2048xi8, 3> -> !xegpu.mem_desc<4x128xf32> |
| // create_mem_desc is expected to be hoisted out of the warp region: the |
| // region yields the source memref, and the mem_desc is recreated outside |
| // (see CHECK block above). |
| gpu.func @create_memdesc(%laneid: index, %arg0 : memref<2048xi8, 3>) { |
| %c0 = arith.constant 0 : index |
| %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (!xegpu.mem_desc<4x128xf32>) { |
| %mdesc = xegpu.create_mem_desc %arg0 : memref<2048xi8, 3> -> !xegpu.mem_desc<4x128xf32> |
| gpu.yield %mdesc : !xegpu.mem_desc<4x128xf32> |
| } |
| %25 = xegpu.load_matrix %r[%c0, %c0]: !xegpu.mem_desc<4x128xf32>, index, index -> vector<1x16xf32> |
| "some_user_op"(%25) : (vector<1x16xf32>) -> () |
| gpu.return |
| } |
| |
| // CHECK-LABEL: gpu.func @vector_transpose( |
| // CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2x1xf32>, vector<1x2xf32>) { |
| // CHECK: %[[SRC:.*]] = "some_op"() {{.*}} : () -> vector<16x2xf32> |
| // CHECK: gpu.yield %{{.*}}, %[[SRC]] : vector<2x16xf32>, vector<16x2xf32> |
| // CHECK-NEXT: } |
| // CHECK-NEXT: %[[T1:.*]] = vector.transpose %[[W]]#1, [1, 0] : vector<1x2xf32> to vector<2x1xf32> |
| // Distributed transpose: the 16x2 source is lane-distributed on dim 0 |
| // (order = [0, 1]) and the 2x16 result on dim 1, so each lane transposes its |
| // own vector<1x2xf32> to vector<2x1xf32> outside the region (see CHECK above). |
| gpu.func @vector_transpose(%laneid: index) { |
| %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2x1xf32>) { |
| %cst = "some_op"() |
| {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>} |
| : () -> (vector<16x2xf32>) |
| %transpose = vector.transpose %cst, [1, 0] |
| { |
| layout_operand_0 = #xegpu.layout<lane_layout = [16 , 1], lane_data = [1, 1], order = [0, 1]>, |
| layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> |
| } |
| : vector<16x2xf32> to vector<2x16xf32> |
| gpu.yield %transpose : vector<2x16xf32> |
| } |
| "some_user_op"(%r) : (vector<2x1xf32>) -> () |
| gpu.return |
| } |
| |
| |
| // CHECK-LABEL: gpu.func @vector_bitcast( |
| // CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<4x1xi16>, vector<4x2xi8>) { |
| // CHECK: %[[SRC:.*]] = "some_op"() {{.*}} : () -> vector<4x32xi8> |
| // CHECK: gpu.yield %{{.*}}, %[[SRC]] : vector<4x16xi16>, vector<4x32xi8> |
| // CHECK: } |
| // CHECK: vector.bitcast %[[W]]#1 : vector<4x2xi8> to vector<4x1xi16> |
| // Distributed bitcast: lane_data = [1, 2] on the i8 operand gives each lane a |
| // vector<4x2xi8> fragment, which bitcasts to vector<4x1xi16> outside the |
| // region (see CHECK block above). |
| gpu.func @vector_bitcast(%laneid: index) { |
| %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<4x1xi16>) { |
| %cst = "some_op"() |
| {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>} |
| : () -> (vector<4x32xi8>) |
| %bitcast = vector.bitcast %cst |
| { |
| layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>, |
| layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> |
| } |
| : vector<4x32xi8> to vector<4x16xi16> |
| gpu.yield %bitcast : vector<4x16xi16> |
| } |
| "some_user_op"(%r) : (vector<4x1xi16>) -> () |
| gpu.return |
| } |
| |
| |
| // CHECK-LABEL: gpu.func @vector_shapecast_rank_increasing |
| // CHECK: %{{.*}}:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<1x1xf32>, vector<1xf32>) { |
| // CHECK: gpu.yield %{{.*}} : vector<1x16xf32>, vector<16xf32> |
| // CHECK: } |
| // CHECK: %{{.*}} = vector.shape_cast %{{.*}}#1 : vector<1xf32> to vector<1x1xf32> |
| // Rank-increasing shape_cast whose operand carries a slice layout: each lane |
| // casts its distributed vector<1xf32> fragment to vector<1x1xf32> outside the |
| // warp region (see CHECK block above). |
| gpu.func @vector_shapecast_rank_increasing(%laneid: index) { |
| %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1x1xf32>) { |
| %cst = "some_op"() |
| {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} |
| : () -> (vector<16xf32>) |
| %cast = vector.shape_cast %cst |
| { |
| layout_operand_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>, |
| layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> |
| } |
| : vector<16xf32> to vector<1x16xf32> |
| gpu.yield %cast : vector<1x16xf32> |
| } |
| "some_user_op"(%r) : (vector<1x1xf32>) -> () |
| gpu.return |
| } |
| |
| |
| // CHECK-LABEL: gpu.func @vector_shapecast_rank_reducing( |
| // CHECK: %{{.*}}:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<1xf32>, vector<1x1xf32>) { |
| // CHECK: gpu.yield %{{.*}} : vector<16xf32>, vector<1x16xf32> |
| // CHECK: } |
| // CHECK: %{{.*}} = vector.shape_cast %{{.*}}#1 : vector<1x1xf32> to vector<1xf32> |
| // Inverse of the previous test: rank-reducing shape_cast to a slice layout; |
| // each lane casts its vector<1x1xf32> fragment to vector<1xf32> outside the |
| // warp region (see CHECK block above). |
| gpu.func @vector_shapecast_rank_reducing(%laneid: index) { |
| %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1xf32>) { |
| %cst = "some_op"() |
| {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} |
| : () -> (vector<1x16xf32>) |
| %cast = vector.shape_cast %cst |
| { |
| layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, |
| layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]> |
| } |
| : vector<1x16xf32> to vector<16xf32> |
| gpu.yield %cast : vector<16xf32> |
| } |
| "some_user_op"(%r) : (vector<1xf32>) -> () |
| gpu.return |
| } |
| |
| |
| // CHECK-LABEL: gpu.func @vector_shapecast_rank_increasing_without_slicing_layout |
| // CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<1x1xf32>, vector<1xf32>) { |
| // CHECK: %[[T1:.*]] = vector.shape_cast %{{.*}} {layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>, layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16xf32> to vector<1x16xf32> |
| // CHECK: gpu.yield %[[T1]], %{{.*}} : vector<1x16xf32>, vector<16xf32> |
| // CHECK: } |
| // CHECK: %{{.*}} = vector.shape_cast %[[W]]#1 : vector<1xf32> to vector<1x1xf32> |
| // CHECK: gpu.return |
| // Note: the duplicate nested `gpu.module @xevm_module{ ... }` wrapper around |
| // this one test was removed — the file-level gpu.module (opened at the top of |
| // the file) is still open here, and gpu.module ops must sit directly under |
| // the top-level module, so the nested copy with the same symbol is invalid. |
| // The shape_cast keeps a plain (non-slice) layout on both sides, so it stays |
| // inside the warp region; only the cast of the distributed result is |
| // materialized outside (see CHECK block above). |
| gpu.func @vector_shapecast_rank_increasing_without_slicing_layout(%laneid: index) { |
| %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1x1xf32>) { |
| %cst = "some_op"() |
| {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]> } |
| : () -> (vector<16xf32>) |
| %cast = vector.shape_cast %cst |
| { |
| layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>, |
| layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> |
| } |
| : vector<16xf32> to vector<1x16xf32> |
| gpu.yield %cast : vector<1x16xf32> |
| } |
| "some_user_op"(%r) : (vector<1x1xf32>) -> () |
| gpu.return |
| } |
| |
| |
| // CHECK-LABEL: gpu.func @vector_extract_strided_slice_distributed_dim_fully_extracted |
| // CHECK-NEXT: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<8x1xf32>, vector<24x1xf32>) { |
| // CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<24x16xf32> |
| // CHECK: gpu.yield %{{.*}}, %[[S]] : vector<8x16xf32>, vector<24x16xf32> |
| // CHECK-NEXT: } |
| // CHECK-NEXT: %[[T1:.*]] = vector.extract_strided_slice %[[W]]#1 |
| // CHECK-SAME: {offsets = [8, 0], sizes = [8, 1], strides = [1, 1]} : vector<24x1xf32> to vector<8x1xf32> |
| // CHECK-NEXT: "some_use"(%[[T1]]) : (vector<8x1xf32>) -> () |
| // The lane-distributed dim (dim 1, 16 wide) is extracted in full, so only the |
| // non-distributed dim shrinks: per-lane extraction is [8, 0]/[8, 1] on the |
| // vector<24x1xf32> fragment (see CHECK block above). |
| gpu.func @vector_extract_strided_slice_distributed_dim_fully_extracted(%laneid: index) { |
| %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<8x1xf32>) { |
| %0 = "some_def"() : () -> (vector<24x16xf32>) |
| %1 = vector.extract_strided_slice %0 { offsets = [8, 0], sizes = [8, 16], strides = [1, 1], |
| layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, |
| layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> |
| } |
| : vector<24x16xf32> to vector<8x16xf32> |
| gpu.yield %1 : vector<8x16xf32> |
| } |
| "some_use"(%r) : (vector<8x1xf32>) -> () |
| gpu.return |
| } |
| |
| // CHECK-LABEL: gpu.func @vector_extract_strided_slice_non_distributed |
| // CHECK-NEXT: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<8x1xf32>, vector<24x1xf32>) { |
| // CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<24x1xf32> |
| // CHECK: gpu.yield %{{.*}}, %[[S]] : vector<8x1xf32>, vector<24x1xf32> |
| // CHECK-NEXT: } |
| // CHECK-NEXT: %[[T1:.*]] = vector.extract_strided_slice %[[W]]#1 |
| // CHECK-SAME: {offsets = [8, 0], sizes = [8, 1], strides = [1, 1]} : vector<24x1xf32> to vector<8x1xf32> |
| // CHECK-NEXT: "some_use"(%[[T1]]) : (vector<8x1xf32>) -> () |
| // The source already has a unit inner dim, so distribution leaves the shapes |
| // unchanged: the extraction is simply sunk out of the warp region with the |
| // same offsets/sizes (see CHECK block above). |
| gpu.func @vector_extract_strided_slice_non_distributed(%laneid: index) { |
| %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<8x1xf32>) { |
| %0 = "some_def"() : () -> (vector<24x1xf32>) |
| %1 = vector.extract_strided_slice %0 { offsets = [8, 0], sizes = [8, 1], strides = [1, 1], |
| layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, |
| layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> |
| } |
| : vector<24x1xf32> to vector<8x1xf32> |
| gpu.yield %1 : vector<8x1xf32> |
| } |
| "some_use"(%r) : (vector<8x1xf32>) -> () |
| gpu.return |
| } |
| |
| // CHECK-LABEL: gpu.func @vector_extract_strided_slice_inner_distributed |
| // CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<8x1xf32>, vector<24x4xf32>) { |
| // CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<24x64xf32> |
| // CHECK: gpu.yield %{{.*}}, %[[S]] : vector<8x16xf32>, vector<24x64xf32> |
| // CHECK-NEXT: } |
| // CHECK-NEXT: %[[T1:.*]] = vector.extract_strided_slice %[[W]]#1 |
| // CHECK-SAME: {offsets = [8, 3], sizes = [8, 1], strides = [1, 1]} : vector<24x4xf32> to vector<8x1xf32> |
| // CHECK-NEXT: "some_use"(%[[T1]]) : (vector<8x1xf32>) -> () |
| // Inner dim distributed over 16 lanes: offset 48 and size 16 on dim 1 scale |
| // down to per-lane offset 3 and size 1 on the vector<24x4xf32> fragment |
| // (see CHECK block above). |
| gpu.func @vector_extract_strided_slice_inner_distributed(%laneid: index) { |
| %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<8x1xf32>) { |
| %0 = "some_def"() : () -> (vector<24x64xf32>) |
| %1 = vector.extract_strided_slice %0 { offsets = [8, 48], sizes = [8, 16], strides = [1, 1], |
| layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, |
| layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> |
| } |
| : vector<24x64xf32> to vector<8x16xf32> |
| gpu.yield %1 : vector<8x16xf32> |
| } |
| "some_use"(%r) : (vector<8x1xf32>) -> () |
| gpu.return |
| } |
| |
| // CHECK-LABEL: gpu.func @vector_extract_strided_slice_outer_distributed |
| // CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<1x16xf32>, vector<2x16xf32>) { |
| // CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<32x16xf32> |
| // CHECK: gpu.yield %{{.*}}, %[[S]] : vector<16x16xf32>, vector<32x16xf32> |
| // CHECK: } |
| // CHECK-NEXT: %[[T1:.*]] = vector.extract %[[W]]#1[1] : vector<16xf32> from vector<2x16xf32> |
| // CHECK-NEXT: %[[T2:.*]] = vector.shape_cast %[[T1]] : vector<16xf32> to vector<1x16xf32> |
| // CHECK-NEXT: "some_use"(%[[T2]]) : (vector<1x16xf32>) -> () |
| // Outer dim distributed: offset 16/size 16 over 16 lanes leaves one row per |
| // lane, so the CHECK block above expects a vector.extract of row [1] plus a |
| // shape_cast back to vector<1x16xf32> instead of an extract_strided_slice. |
| gpu.func @vector_extract_strided_slice_outer_distributed(%laneid: index) { |
| %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1x16xf32>) { |
| %0 = "some_def"() : () -> (vector<32x16xf32>) |
| %1 = vector.extract_strided_slice %0 { offsets = [16], sizes = [16], strides = [1], |
| layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, |
| layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]> |
| } |
| : vector<32x16xf32> to vector<16x16xf32> |
| gpu.yield %1 : vector<16x16xf32> |
| } |
| "some_use"(%r) : (vector<1x16xf32>) -> () |
| gpu.return |
| } |
| |
| // CHECK-LABEL: gpu.func @vector_extract_strided_slice_1d |
| // CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>, vector<4xf32>) { |
| // CHECK: %[[S:.*]] = "some_def"() : () -> vector<64xf32> |
| // CHECK: gpu.yield %{{.*}}, %[[S]] : vector<32xf32>, vector<64xf32> |
| // CHECK-NEXT: } |
| // CHECK-NEXT: %[[T1:.*]] = vector.extract_strided_slice %[[W]]#1 |
| // CHECK-SAME: {offsets = [1], sizes = [2], strides = [1]} : vector<4xf32> to vector<2xf32> |
| // CHECK-NEXT: "some_use"(%[[T1]]) : (vector<2xf32>) -> () |
| // 1-D case: offset 16 and size 32 over 16 lanes scale down to per-lane |
| // offset 1 and size 2 on the vector<4xf32> fragment (see CHECK block above). |
| gpu.func @vector_extract_strided_slice_1d(%laneid: index) { |
| %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) { |
| %0 = "some_def"() : () -> (vector<64xf32>) |
| %1 = vector.extract_strided_slice %0 { offsets = [16], sizes = [32], strides = [1], |
| layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>, |
| layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]> |
| } |
| : vector<64xf32> to vector<32xf32> |
| gpu.yield %1 : vector<32xf32> |
| } |
| "some_use"(%r) : (vector<2xf32>) -> () |
| gpu.return |
| } |
| |
| // CHECK-LABEL: gpu.func @vector_extract_strided_slice_unsupported_offset |
| // CHECK: %{{.*}} = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>) { |
| // CHECK: } |
| // CHECK-NOT: %{{.*}} = vector.extract_strided_slice |
| // Offset 3 on the lane-distributed dim is not a multiple of the per-lane |
| // size, so distribution must bail out and the op stays inside the warp |
| // region. (Typo "unsopported" fixed to "unsupported" in name and label.) |
| gpu.func @vector_extract_strided_slice_unsupported_offset(%laneid: index) { |
| %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) { |
| %0 = "some_def"() : () -> (vector<64xf32>) |
| %1 = vector.extract_strided_slice %0 { offsets = [3], sizes = [32], strides = [1], |
| layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>, |
| layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]> |
| } |
| : vector<64xf32> to vector<32xf32> |
| gpu.yield %1 : vector<32xf32> |
| } |
| "some_use"(%r) : (vector<2xf32>) -> () |
| gpu.return |
| } |
| |
| // CHECK-LABEL: gpu.func @vector_extract_strided_slice_unsupported_source |
| // CHECK: %{{.*}} = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>) { |
| // CHECK: } |
| // CHECK-NOT: %{{.*}} = vector.extract_strided_slice |
| // The 54-element source does not divide evenly over 16 lanes, so the op |
| // cannot be distributed and stays inside the warp region. |
| // (Typo "unsopported" fixed to "unsupported" in name and label.) |
| gpu.func @vector_extract_strided_slice_unsupported_source(%laneid: index) { |
| %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) { |
| %0 = "some_def"() : () -> (vector<54xf32>) |
| %1 = vector.extract_strided_slice %0 { offsets = [0], sizes = [32], strides = [1], |
| layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>, |
| layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]> |
| } |
| : vector<54xf32> to vector<32xf32> |
| gpu.yield %1 : vector<32xf32> |
| } |
| "some_use"(%r) : (vector<2xf32>) -> () |
| gpu.return |
| } |
| |
| // CHECK-LABEL: gpu.func @vector_extract_strided_slice_partial_offsets |
| // CHECK-NEXT: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<8x1xf32>, vector<24x1xf32>) { |
| // CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<24x16xf32> |
| // CHECK: gpu.yield %{{.*}}, %[[S]] : vector<8x16xf32>, vector<24x16xf32> |
| // CHECK-NEXT: } |
| // CHECK-NEXT: %[[T1:.*]] = vector.extract_strided_slice %[[W]]#1 |
| // CHECK-SAME: {offsets = [8, 0], sizes = [8, 1], strides = [1, 1]} : vector<24x1xf32> to vector<8x1xf32> |
| // CHECK-NEXT: "some_use"(%[[T1]]) : (vector<8x1xf32>) -> () |
| // Offsets/sizes given only for the leading dim: the trailing (distributed) |
| // dim is implicitly taken in full, so the distributed extraction becomes |
| // [8, 0]/[8, 1] on the per-lane fragment (see CHECK block above). |
| gpu.func @vector_extract_strided_slice_partial_offsets(%laneid: index) { |
| %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<8x1xf32>) { |
| %0 = "some_def"() : () -> (vector<24x16xf32>) |
| %1 = vector.extract_strided_slice %0 { offsets = [8], sizes = [8], strides = [1], |
| layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, |
| layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> |
| } |
| : vector<24x16xf32> to vector<8x16xf32> |
| gpu.yield %1 : vector<8x16xf32> |
| } |
| "some_use"(%r) : (vector<8x1xf32>) -> () |
| gpu.return |
| } |
| |
| // CHECK-LABEL: gpu.func @vector_insert_strided_slice_distributed_dim_fully_inserted |
| // CHECK-NEXT: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<64x1xf32>, vector<16x1xf32>, vector<64x1xf32>) { |
| // CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<16x16xf32> |
| // CHECK-NEXT: %[[D:.*]] = "some_def"() : () -> vector<64x16xf32> |
| // CHECK: gpu.yield %{{.*}}, %[[S]], %[[D]] : vector<64x16xf32>, vector<16x16xf32>, vector<64x16xf32> |
| // CHECK-NEXT: } |
| // CHECK-NEXT: %[[T1:.*]] = vector.insert_strided_slice %[[W]]#1, %[[W]]#2 |
| // CHECK-SAME: {offsets = [24, 0], strides = [1, 1]} : vector<16x1xf32> into vector<64x1xf32> |
| // CHECK-NEXT: "some_use"(%[[T1]]) : (vector<64x1xf32>) -> () |
| // Insert where the lane-distributed dim is covered in full: both source and |
| // dest shrink to unit inner dim per lane and the insert is sunk outside the |
| // warp region with the same offsets (see CHECK block above). |
| gpu.func @vector_insert_strided_slice_distributed_dim_fully_inserted(%laneid: index) { |
| %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<64x1xf32>) { |
| %0 = "some_def"() : () -> (vector<16x16xf32>) |
| %1 = "some_def"() : () -> (vector<64x16xf32>) |
| %2 = vector.insert_strided_slice %0, %1 { offsets = [24, 0], strides = [1, 1], |
| layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, |
| layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, |
| layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> |
| } |
| : vector<16x16xf32> into vector<64x16xf32> |
| gpu.yield %2 : vector<64x16xf32> |
| } |
| "some_use"(%r) : (vector<64x1xf32>) -> () |
| gpu.return |
| } |
| |
| |
| // CHECK-LABEL: gpu.func @vector_insert_strided_slice_non_distributed |
| // CHECK-NEXT: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<64x1xf32>, vector<16x1xf32>, vector<64x1xf32>) { |
| // CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<16x1xf32> |
| // CHECK-NEXT: %[[D:.*]] = "some_def"() : () -> vector<64x1xf32> |
| // CHECK: gpu.yield %{{.*}}, %[[S]], %[[D]] : vector<64x1xf32>, vector<16x1xf32>, vector<64x1xf32> |
| // CHECK-NEXT: } |
| // CHECK-NEXT: %[[T1:.*]] = vector.insert_strided_slice %[[W]]#1, %[[W]]#2 |
| // CHECK-SAME: {offsets = [24, 0], strides = [1, 1]} : vector<16x1xf32> into vector<64x1xf32> |
| // CHECK-NEXT: "some_use"(%[[T1]]) : (vector<64x1xf32>) -> () |
| // Source and dest already have a unit inner dim, so distribution leaves the |
| // shapes unchanged and simply sinks the insert out of the warp region |
| // (see CHECK block above). |
| gpu.func @vector_insert_strided_slice_non_distributed(%laneid: index) { |
| %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<64x1xf32>) { |
| %0 = "some_def"() : () -> (vector<16x1xf32>) |
| %1 = "some_def"() : () -> (vector<64x1xf32>) |
| %2 = vector.insert_strided_slice %0, %1 { offsets = [24, 0], strides = [1, 1], |
| layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, |
| layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, |
| layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> |
| } |
| : vector<16x1xf32> into vector<64x1xf32> |
| gpu.yield %2 : vector<64x1xf32> |
| } |
| "some_use"(%r) : (vector<64x1xf32>) -> () |
| gpu.return |
| } |
| |
| // CHECK-LABEL: gpu.func @vector_insert_strided_slice_inner_distributed |
| // CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<64x2xf32>, vector<16x1xf32>, vector<64x2xf32>) { |
| // CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<16x16xf32> |
| // CHECK-NEXT: %[[D:.*]] = "some_def"() : () -> vector<64x32xf32> |
| // CHECK: gpu.yield %{{.*}}, %[[S]], %[[D]] : vector<64x32xf32>, vector<16x16xf32>, vector<64x32xf32> |
| // CHECK-NEXT: } |
| // CHECK-NEXT: %[[T1:.*]] = vector.insert_strided_slice %[[W]]#1, %[[W]]#2 |
| // CHECK-SAME: {offsets = [24, 1], strides = [1, 1]} : vector<16x1xf32> into vector<64x2xf32> |
| // CHECK-NEXT: "some_use"(%[[T1]]) : (vector<64x2xf32>) -> () |
| // Inner dim distributed over 16 lanes: offset 16 on dim 1 scales down to |
| // per-lane offset 1, inserting a vector<16x1xf32> fragment into the lane's |
| // vector<64x2xf32> (see CHECK block above). |
| gpu.func @vector_insert_strided_slice_inner_distributed(%laneid: index) { |
| %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<64x2xf32>) { |
| %0 = "some_def"() : () -> (vector<16x16xf32>) |
| %1 = "some_def"() : () -> (vector<64x32xf32>) |
| %2 = vector.insert_strided_slice %0, %1 { offsets = [24, 16], strides = [1, 1], |
| layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, |
| layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, |
| layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> |
| } |
| : vector<16x16xf32> into vector<64x32xf32> |
| gpu.yield %2 : vector<64x32xf32> |
| } |
| "some_use"(%r) : (vector<64x2xf32>) -> () |
| gpu.return |
| } |
| |
| // CHECK-LABEL: gpu.func @vector_insert_strided_slice_outer_distributed |
| // CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<3x32xf32>, vector<1x16xf32>, vector<3x32xf32>) { |
| // CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<16x16xf32> |
| // CHECK-NEXT: %[[D:.*]] = "some_def"() : () -> vector<48x32xf32> |
| // CHECK: gpu.yield %{{.*}}, %[[S]], %[[D]] : vector<48x32xf32>, vector<16x16xf32>, vector<48x32xf32> |
| // CHECK-NEXT: } |
| // CHECK-NEXT: %[[T1:.*]] = vector.insert_strided_slice %[[W]]#1, %[[W]]#2 |
| // CHECK-SAME: {offsets = [2, 4], strides = [1, 1]} : vector<1x16xf32> into vector<3x32xf32> |
| // CHECK-NEXT: "some_use"(%[[T1]]) : (vector<3x32xf32>) -> () |
| // Outer dim distributed over 16 lanes: offset 32 on dim 0 scales down to |
| // per-lane offset 2, inserting the lane's vector<1x16xf32> row into its |
| // vector<3x32xf32> fragment (see CHECK block above). |
| gpu.func @vector_insert_strided_slice_outer_distributed(%laneid: index) { |
| %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<3x32xf32>) { |
| %0 = "some_def"() : () -> (vector<16x16xf32>) |
| %1 = "some_def"() : () -> (vector<48x32xf32>) |
| %2 = vector.insert_strided_slice %0, %1 { offsets = [32, 4], strides = [1, 1], |
| layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, |
| layout_operand_1 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, |
| layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]> |
| } |
| : vector<16x16xf32> into vector<48x32xf32> |
| gpu.yield %2 : vector<48x32xf32> |
| } |
| "some_use"(%r) : (vector<3x32xf32>) -> () |
| gpu.return |
| } |
| |
| // CHECK-LABEL: gpu.func @vector_insert_strided_slice_1d |
| // CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<3xf32>, vector<1xf32>, vector<3xf32>) { |
| // CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<16xf32> |
| // CHECK-NEXT: %[[D:.*]] = "some_def"() : () -> vector<48xf32> |
| // CHECK: gpu.yield %{{.*}}, %[[S]], %[[D]] : vector<48xf32>, vector<16xf32>, vector<48xf32> |
| // CHECK-NEXT: } |
| // CHECK-NEXT: %[[T1:.*]] = vector.insert_strided_slice %[[W]]#1, %[[W]]#2 |
| // CHECK-SAME: {offsets = [1], strides = [1]} : vector<1xf32> into vector<3xf32> |
| // CHECK-NEXT: "some_use"(%[[T1]]) : (vector<3xf32>) -> () |
| // 1-D case: offset 16 over 16 lanes scales down to per-lane offset 1, |
| // inserting a vector<1xf32> into the lane's vector<3xf32> fragment |
| // (see CHECK block above). |
| gpu.func @vector_insert_strided_slice_1d(%laneid: index) { |
| %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<3xf32>) { |
| %0 = "some_def"() : () -> (vector<16xf32>) |
| %1 = "some_def"() : () -> (vector<48xf32>) |
| %2 = vector.insert_strided_slice %0, %1 { offsets = [16], strides = [1], |
| layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>, |
| layout_operand_1 = #xegpu.layout<lane_layout = [16], lane_data = [1]>, |
| layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]> |
| } |
| : vector<16xf32> into vector<48xf32> |
| gpu.yield %2 : vector<48xf32> |
| } |
| "some_use"(%r) : (vector<3xf32>) -> () |
| gpu.return |
| } |
| |
| // CHECK-LABEL: gpu.func @vector_insert_strided_slice_different_ranks |
| // CHECK-NEXT: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<64x1xf32>, vector<1xf32>, vector<64x1xf32>) { |
| // CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<16xf32> |
| // CHECK-NEXT: %[[D:.*]] = "some_def"() : () -> vector<64x16xf32> |
| // CHECK: gpu.yield %{{.*}}, %[[S]], %[[D]] : vector<64x16xf32>, vector<16xf32>, vector<64x16xf32> |
| // CHECK-NEXT: } |
| // CHECK-NEXT: %[[T1:.*]] = vector.insert_strided_slice %[[W]]#1, %[[W]]#2 |
| // CHECK-SAME: {offsets = [13, 0], strides = [1]} : vector<1xf32> into vector<64x1xf32> |
| // CHECK-NEXT: "some_use"(%[[T1]]) : (vector<64x1xf32>) -> () |
| // Source (1-D, 16 wide) and dest (2-D) have different ranks: the source |
| // distributes to vector<1xf32> per lane and is inserted into the lane's |
| // vector<64x1xf32> with the original offsets (see CHECK block above). |
| gpu.func @vector_insert_strided_slice_different_ranks(%laneid: index) { |
| %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<64x1xf32>) { |
| %0 = "some_def"() : () -> (vector<16xf32>) |
| %1 = "some_def"() : () -> (vector<64x16xf32>) |
| %2 = vector.insert_strided_slice %0, %1 { offsets = [13, 0], strides = [1], |
| layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>, |
| layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, |
| layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> |
| } |
| : vector<16xf32> into vector<64x16xf32> |
| gpu.yield %2 : vector<64x16xf32> |
| } |
| "some_use"(%r) : (vector<64x1xf32>) -> () |
| gpu.return |
| } |
| |
| // CHECK-LABEL: gpu.func @vector_insert_strided_slice_unsupported_source |
| // CHECK: %{{.*}} = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<3xf32>) { |
| // CHECK: } |
| // CHECK-NOT: %{{.*}} = vector.insert_strided_slice |
| // Negative test: the 8-element source does not divide evenly over 16 lanes, |
| // so distribution bails out and the insert stays inside the warp region |
| // (see the CHECK-NOT above). |
| gpu.func @vector_insert_strided_slice_unsupported_source(%laneid: index) { |
| %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<3xf32>) { |
| %0 = "some_def"() : () -> (vector<8xf32>) |
| %1 = "some_def"() : () -> (vector<48xf32>) |
| %2 = vector.insert_strided_slice %0, %1 { offsets = [16], strides = [1], |
| layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>, |
| layout_operand_1 = #xegpu.layout<lane_layout = [16], lane_data = [1]>, |
| layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]> |
| } |
| : vector<8xf32> into vector<48xf32> |
| gpu.yield %2 : vector<48xf32> |
| } |
| "some_use"(%r) : (vector<3xf32>) -> () |
| gpu.return |
| } |
| |
| // CHECK-LABEL: gpu.func @vector_insert_strided_slice_unsupported_offset |
| // CHECK: %{{.*}} = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<3xf32>) { |
| // CHECK: } |
| // CHECK-NOT: %{{.*}} = vector.insert_strided_slice |
| // Negative test: offset 3 on the lane-distributed dim is not a multiple of |
| // the per-lane size, so distribution bails out and the insert stays inside |
| // the warp region (see the CHECK-NOT above). |
| gpu.func @vector_insert_strided_slice_unsupported_offset(%laneid: index) { |
| %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<3xf32>) { |
| %0 = "some_def"() : () -> (vector<16xf32>) |
| %1 = "some_def"() : () -> (vector<48xf32>) |
| %2 = vector.insert_strided_slice %0, %1 { offsets = [3], strides = [1], |
| layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>, |
| layout_operand_1 = #xegpu.layout<lane_layout = [16], lane_data = [1]>, |
| layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]> |
| } |
| : vector<16xf32> into vector<48xf32> |
| gpu.yield %2 : vector<48xf32> |
| } |
| "some_use"(%r) : (vector<3xf32>) -> () |
| gpu.return |
| } |
| |
| // CHECK-LABEL: gpu.func @vector_broadcast_1d_to_2d_to_3d_broadcast_within_lane |
| // CHECK-SAME: (%[[ARG0:.*]]: index) { |
| // CHECK: %[[R:.*]]:4 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<16x1xf16>, vector<1x16x1xf16>, vector<1xf16>, vector<16x1xf16>) |
| // CHECK: %[[DEF0:.*]] = "some_def"() : () -> vector<16xf16> |
| // CHECK: %[[DEF1:.*]] = "some_def"() : () -> vector<16x16xf16> |
| // CHECK: %[[BCAST_INNER:.*]] = vector.broadcast %[[DEF0]] |
| // CHECK: %[[CAST_INNER:.*]] = vector.shape_cast %[[DEF1]] : vector<16x16xf16> to vector<1x16x16xf16> |
| // CHECK: gpu.yield %[[BCAST_INNER]], %[[CAST_INNER]], %[[DEF0]], %[[DEF1]] |
| // CHECK: %[[CAST:.*]] = vector.shape_cast %[[R]]#3 : vector<16x1xf16> to vector<1x16x1xf16> |
| // CHECK: %[[BCAST:.*]] = vector.broadcast %[[R]]#2 : vector<1xf16> to vector<16x1xf16> |
| // CHECK: "some_use"(%[[BCAST]]) : (vector<16x1xf16>) -> () |
| // CHECK: "some_use"(%[[CAST]]) : (vector<1x16x1xf16>) -> () |
| // Two broadcasts handled differently: the 1-D->2-D broadcast (slice-layout |
| // operand) is re-created outside the region on the lane's vector<1xf16>; the |
| // rank-only 2-D->3-D broadcast lowers to a shape_cast of the distributed |
| // fragment (see CHECK block above). |
| gpu.func @vector_broadcast_1d_to_2d_to_3d_broadcast_within_lane(%laneid: index) { |
| |
| %r:2 = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<16x1xf16>, vector<1x16x1xf16>) { |
| |
| %1 = "some_def"() : () -> vector<16xf16> |
| %3 = "some_def"() : () -> vector<16x16xf16> |
| |
| %2 = vector.broadcast %1 { |
| layout_operand_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>, |
| layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> |
| } : vector<16xf16> to vector<16x16xf16> |
| |
| %4 = vector.broadcast %3 { |
| layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, |
| layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]> |
| } : vector<16x16xf16> to vector<1x16x16xf16> |
| |
| gpu.yield %2, %4 : vector<16x16xf16>, vector<1x16x16xf16> |
| } |
| "some_use"(%r#0) : (vector<16x1xf16>) -> () |
| "some_use"(%r#1) : (vector<1x16x1xf16>) -> () |
| gpu.return |
| } |
| |
| // CHECK-LABEL: gpu.func @vector_broadcast_2d_to_2d_across_lane_lower_to_noop_case |
| // CHECK-SAME: (%[[ARG0:.*]]: index) |
| // CHECK: %[[R:.*]]:2 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<16x1xf16>, vector<16x1xf16>) |
| // CHECK: %[[DEF:.*]] = "some_def"() : () -> vector<16x1xf16> |
| // CHECK: %[[BCAST:.*]] = vector.broadcast %[[DEF]] |
| // CHECK-SAME: : vector<16x1xf16> to vector<16x16xf16> |
| // CHECK: gpu.yield %[[BCAST]], %[[DEF]] : vector<16x16xf16>, vector<16x1xf16> |
| // CHECK: "some_use"(%[[R]]#1) : (vector<16x1xf16>) -> () |
| // Broadcasting 16x1 to 16x16 along the lane-distributed dim: per-lane this is |
| // a no-op, so the CHECK block above expects the yielded source fragment to be |
| // used directly with no broadcast materialized outside the region. |
| gpu.func @vector_broadcast_2d_to_2d_across_lane_lower_to_noop_case(%arg0: index) { |
| %0 = gpu.warp_execute_on_lane_0(%arg0)[16] -> (vector<16x1xf16>) { |
| %1 = "some_def"() : () -> vector<16x1xf16> |
| %2 = vector.broadcast %1 { |
| layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, |
| layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> |
| } : vector<16x1xf16> to vector<16x16xf16> |
| gpu.yield %2: vector<16x16xf16> |
| } |
| "some_use"(%0) : (vector<16x1xf16>) -> () |
| gpu.return |
| } |
| |
| // CHECK-LABEL: gpu.func @vector_shape_cast_scalar_to_vector |
| // CHECK-SAME: (%[[ARG0:.*]]: index) |
| // CHECK: %[[R:.*]]:2 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<16x1xf16>, f16) |
| // CHECK: %[[DEF:.*]] = "some_def"() |
| // CHECK: %[[BCAST:.*]] = vector.broadcast %[[DEF]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : f16 to vector<16x16xf16> |
| // CHECK: gpu.yield %[[BCAST]], %[[DEF]] : vector<16x16xf16>, f16 |
| // CHECK: %[[RESULT:.*]] = vector.broadcast %[[R]]#1 : f16 to vector<16x1xf16> |
| // CHECK: "some_use"(%[[RESULT]]) |
| // NOTE(review): despite the name, the op under test is a scalar-source |
| // vector.broadcast (presumably standing in for a scalar shape_cast — |
| // confirm); the CHECK lines above expect the f16 scalar to be yielded and |
| // re-broadcast to the lane fragment outside the region. |
| gpu.func @vector_shape_cast_scalar_to_vector(%arg0: index) { |
| %res = gpu.warp_execute_on_lane_0(%arg0)[16] -> (vector<16x1xf16>) { |
| %scalar = "some_def"() : () -> f16 |
| %vec = vector.broadcast %scalar {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : f16 to vector<16x16xf16> |
| gpu.yield %vec : vector<16x16xf16> |
| } |
| "some_use"(%res) : (vector<16x1xf16>) -> () |
| gpu.return |
| } |
| |
// Same scalar-to-vector broadcast as above, but the result carries no
// xegpu layout (uniform value): the warp op yields the full
// vector<16x16xf16> and the broadcast is replayed per lane unchanged.
// NOTE(review): renamed from @vector_shape_cast_scalar_to_vector_uniform
// — the test exercises vector.broadcast, not vector.shape_cast; the old
// name was misleading and is referenced nowhere else in the file.
// CHECK-LABEL: gpu.func @vector_broadcast_scalar_to_vector_uniform
// CHECK-SAME: (%[[ARG0:.*]]: index)
// CHECK: %[[R:.*]]:2 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<16x16xf16>, f16)
// CHECK: %[[DEF:.*]] = "some_def"()
// CHECK: %[[BCAST:.*]] = vector.broadcast %[[DEF]] : f16 to vector<16x16xf16>
// CHECK: gpu.yield %[[BCAST]], %[[DEF]] : vector<16x16xf16>, f16
// CHECK: %[[RESULT:.*]] = vector.broadcast %[[R]]#1 : f16 to vector<16x16xf16>
// CHECK: "some_use"(%[[RESULT]])
gpu.func @vector_broadcast_scalar_to_vector_uniform(%arg0: index) {
%0 = gpu.warp_execute_on_lane_0(%arg0)[16] -> (vector<16x16xf16>) {
%1 = "some_def"() : () -> f16
%2 = vector.broadcast %1 : f16 to vector<16x16xf16>
gpu.yield %2 : vector<16x16xf16>
}
"some_use"(%0) : (vector<16x16xf16>) -> ()
gpu.return
}
| |
// vector.step whose result layout is a slice of a 4-D layout
// [1, 1, 1, 16] with dims [0, 1, 2] sliced away, leaving 16 lanes over
// the 16 step values (one element per lane). The CHECK lines require the
// per-lane value to be derived from the lane id (remui by 16) and
// broadcast into the distributed vector<1xindex> — no vector.step
// survives distribution.
// CHECK-LABEL: gpu.func @vector_step_slice
// CHECK: (%[[LANE_ID:[0-9a-zA-Z]+]]: index) {
// CHECK: %[[LANE_ID_IN_SLICED_DIM:.*]] = arith.remui %[[LANE_ID]], %c16 : index
// CHECK-NEXT: %[[LANE_ID_IN_SLICED_DIM1:.*]] = arith.remui %[[LANE_ID_IN_SLICED_DIM]], %c16 : index
// CHECK-NEXT: %[[LANE_ID_IN_SLICED_DIM_VEC:.*]] = vector.broadcast %[[LANE_ID_IN_SLICED_DIM1]] : index to vector<1xindex>
// CHECK-NEXT: "some_use"(%[[LANE_ID_IN_SLICED_DIM_VEC]]) : (vector<1xindex>) -> ()
gpu.func @vector_step_slice(%arg0: index) {
%0 = gpu.warp_execute_on_lane_0(%arg0)[16] -> (vector<1xindex>) {
%5 = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 1, 16], lane_data = [1, 1, 1, 1]>, dims = [0, 1, 2]>} : vector<16xindex>
gpu.yield %5 : vector<16xindex>
}
"some_use"(%0) : (vector<1xindex>) -> ()
gpu.return
}
| |
// Variant of @vector_step_slice where slicing dims [0, 1, 3] leaves a
// unit lane layout over the single step value: every lane sees step
// value 0, so the result folds to a constant dense<0> vector and the
// CHECK lines require no lane-id arithmetic at all.
// CHECK-LABEL: gpu.func @vector_step_slice_unit
// CHECK: (%[[LANE_ID:[0-9a-zA-Z]+]]: index) {
// CHECK-NEXT: %[[LANE_ID_IN_SLICED_DIM_VEC:.*]] = arith.constant dense<0> : vector<1xindex>
// CHECK-NEXT: "some_use"(%[[LANE_ID_IN_SLICED_DIM_VEC]]) : (vector<1xindex>) -> ()
gpu.func @vector_step_slice_unit(%arg0: index) {
%0 = gpu.warp_execute_on_lane_0(%arg0)[16] -> (vector<1xindex>) {
%5 = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 1, 16], lane_data = [1, 1, 1, 1]>, dims = [0, 1, 3]>} : vector<1xindex>
gpu.yield %5 : vector<1xindex>
}
"some_use"(%0) : (vector<1xindex>) -> ()
gpu.return
}
| |
// vector<16xindex> step distributed over 4 lanes. Slicing dims [0, 2]
// from lane_layout [2, 4, 2] / lane_data [1, 2, 1] leaves an effective
// lane_layout of [4] with lane_data [2]: a distribution unit spans
// 4 lanes * 2 elements = 8 values, so 16 values form 2 units and each
// lane owns 2 contiguous elements from each unit (4 per lane). The CHECK
// lines require the per-lane vector<4xindex> to be assembled with
// vector.from_elements from lane-id-derived subrange start offsets
// (one per distribution unit) and their +1 successors.
// CHECK-LABEL: gpu.func @vector_step_slice_multi_dist_unit
// CHECK: (%[[LANE_ID:[0-9a-zA-Z]+]]: index) {
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[DIST_UNIT_SIZE:.*]] = arith.constant 8 : index
// CHECK-DAG: %[[SG_LEVEL_VECSIZE:.*]] = arith.constant 16 : index
// CHECK-DAG: %[[LANE_LAYOUT:.*]] = arith.constant 4 : index
// CHECK-DAG: %[[LANE_DATA:.*]] = arith.constant 2 : index
// CHECK-DAG: %[[LANE_DIST_UNIT_START_IDX:.*]] = arith.divui %[[LANE_ID]], %[[LANE_DATA]] : index
// CHECK-DAG: %[[DIST_UNIT_0_IDX:.*]] = arith.remui %[[LANE_DIST_UNIT_START_IDX]], %[[LANE_LAYOUT]] : index
// CHECK-DAG: %[[DIST_UNIT_0_OFFSET:.*]] = arith.muli %[[DIST_UNIT_0_IDX]], %[[LANE_DATA]] : index
// CHECK-DAG: %[[DIST_UNIT_0_SUBRANGE_START:.*]] = arith.remui %[[DIST_UNIT_0_OFFSET]], %[[SG_LEVEL_VECSIZE]] : index
// CHECK-DAG: %[[DIST_UNIT_1_OFFSET:.*]] = arith.addi %[[DIST_UNIT_0_OFFSET]], %[[DIST_UNIT_SIZE]] : index
// CHECK-DAG: %[[DIST_UNIT_1_SUBRANGE_START:.*]] = arith.remui %[[DIST_UNIT_1_OFFSET]], %[[SG_LEVEL_VECSIZE]] : index
// CHECK-DAG: %[[V6:.*]] = arith.addi %[[DIST_UNIT_0_SUBRANGE_START]], %[[C1]] : index
// CHECK-DAG: %[[V7:.*]] = arith.addi %[[DIST_UNIT_1_SUBRANGE_START]], %[[C1]] : index
// CHECK-DAG: %[[VEC:.*]] = vector.from_elements
// CHECK-SAME: %[[DIST_UNIT_0_SUBRANGE_START]], %[[V6]],
// CHECK-SAME: %[[DIST_UNIT_1_SUBRANGE_START]], %[[V7]]
// CHECK-SAME: : vector<4xindex>
// CHECK-NEXT: "some_use"(%[[VEC]]) : (vector<4xindex>) -> ()
gpu.func @vector_step_slice_multi_dist_unit(%arg0: index) {
%0 = gpu.warp_execute_on_lane_0(%arg0)[4] -> (vector<4xindex>) {
%5 = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [2, 4, 2], lane_data = [1,2,1]>, dims = [0, 2]>} : vector<16xindex>
gpu.yield %5 : vector<16xindex>
}
"some_use"(%0) : (vector<4xindex>) -> ()
gpu.return
}
| |
// Each xegpu.convert_layout below maps between layouts that distribute
// identically — a 1-D [16] layout vs. slices of 2-D/3-D layouts whose
// non-sliced dim is 16, and a trivial [1] layout vs. its sliced
// equivalents. The CHECK-NOT requires all such compatible conversions to
// be removed, with the producing values yielded directly.
// CHECK-LABEL: gpu.func @convert_layout_removed_when_compatible(
// CHECK: %[[R:.*]] = gpu.warp_execute_on_lane_0
// CHECK-NOT: xegpu.convert_layout
// CHECK: gpu.yield %{{.*}} : vector<16xf32>
gpu.func @convert_layout_removed_when_compatible(%laneid: index){
%r:2 = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1xf32>, vector<1xf32>) {
%0 = "some_op"() : () -> vector<16xf32>
%2 = "some_op"() : () -> vector<1xf32>
%1 = xegpu.convert_layout %0
<{input_layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
target_layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>}>
: vector<16xf32>
%3 = xegpu.convert_layout %2
<{input_layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>,
target_layout = #xegpu.layout<lane_layout = [1], lane_data = [1]>}>
: vector<1xf32>
%4 = xegpu.convert_layout %3
<{input_layout = #xegpu.layout<lane_layout = [1], lane_data = [1]>,
target_layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>, dims = [0, 1]>}>
: vector<1xf32>
gpu.yield %1, %4 : vector<16xf32>, vector<1xf32>
}
"some_user_op"(%r#0, %r#1) : (vector<1xf32>, vector<1xf32>) -> ()
gpu.return
}
| |
// A convert_layout between identical slice layouts on a scalar (f32) is
// a no-op and must be removed, with the original value yielded directly.
// NOTE(review): added the missing CHECK-LABEL — without it the CHECK-NOT
// below was not anchored to this function and merely floated after the
// previous test's last match, so it could not reliably catch a stray
// xegpu.convert_layout in this function's output.
// CHECK-LABEL: gpu.func @convert_layout_scalar(
// CHECK-NOT: xegpu.convert_layout
// CHECK: gpu.yield %{{.*}} : f32
gpu.func @convert_layout_scalar(%laneid: index){
%r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (f32) {
%0 = "some_op"() : () -> f32
%1 = xegpu.convert_layout %0
<{input_layout = #xegpu.slice<#xegpu.layout<lane_layout = [16], lane_data = [1]>, dims = [0]>,
target_layout = #xegpu.slice<#xegpu.layout<lane_layout = [16], lane_data = [1]>, dims = [0]>}>
: f32
gpu.yield %1 : f32
}
"some_user_op"(%r) : (f32) -> ()
gpu.return
}
| } |