| // RUN: mlir-opt %s --xevm-attach-target='module=xevm_* O=3 chip=pvc' -convert-vector-to-xegpu -split-input-file | FileCheck %s --check-prefix=LOAD-ND |
| // RUN: mlir-opt %s -convert-vector-to-xegpu -split-input-file | FileCheck %s --check-prefix=LOAD-GATHER |
| |
| gpu.module @xevm_module { |
| gpu.func @load_1D_vector(%source: memref<8x16x32xf32>, %offset: index) -> vector<8xf32> { |
| %c0 = arith.constant 0.0 : f32 |
| %0 = vector.transfer_read %source[%offset, %offset, %offset], %c0 |
| {in_bounds = [true]} : memref<8x16x32xf32>, vector<8xf32> |
| gpu.return %0 : vector<8xf32> |
| } |
| |
| // LOAD-ND-LABEL: @load_1D_vector( |
| // LOAD-ND-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, |
| // LOAD-ND: %[[CST:.+]] = arith.constant dense<true> : vector<8xi1> |
| // LOAD-ND: %[[STEP:.+]] = vector.step : vector<8xindex> |
| // LOAD-ND-COUNT2: arith.muli {{.*}} : index |
| // LOAD-ND-COUNT2: arith.addi {{.*}} : index |
| // LOAD-ND: %[[SPLAT:.+]] = vector.broadcast {{.*}}: index to vector<8xindex> |
| // LOAD-ND: %[[IDX:.+]] = arith.addi %[[SPLAT]], %[[STEP]] : vector<8xindex> |
| // LOAD-ND: %[[COLLAPSE:.+]] = memref.extract_aligned_pointer_as_index %[[SRC]] : memref<8x16x32xf32> -> index |
| // LOAD-ND: %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64 |
| // LOAD-ND: %[[VEC:.+]] = xegpu.load %[[COLLAPSE_I]]{{\[}}%[[IDX]]{{\]}}, %[[CST]] : i64, vector<8xindex>, vector<8xi1> -> vector<8xf32> |
| |
| // LOAD-GATHER-LABEL: @load_1D_vector( |
| // LOAD-GATHER-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, |
| // LOAD-GATHER: %[[CST:.+]] = arith.constant dense<true> : vector<8xi1> |
| // LOAD-GATHER: %[[STEP:.+]] = vector.step : vector<8xindex> |
| // LOAD-GATHER-COUNT2: arith.muli {{.*}} : index |
| // LOAD-GATHER-COUNT2: arith.addi {{.*}} : index |
| // LOAD-GATHER: %[[SPLAT:.+]] = vector.broadcast {{.*}}: index to vector<8xindex> |
| // LOAD-GATHER: %[[IDX:.+]] = arith.addi %[[SPLAT]], %[[STEP]] : vector<8xindex> |
| // LOAD-GATHER: %[[COLLAPSE:.+]] = memref.extract_aligned_pointer_as_index %[[SRC]] : memref<8x16x32xf32> -> index |
| // LOAD-GATHER: %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64 |
| // LOAD-GATHER: %[[VEC:.+]] = xegpu.load %[[COLLAPSE_I]]{{\[}}%[[IDX]]{{\]}}, %[[CST]] : i64, vector<8xindex>, vector<8xi1> -> vector<8xf32> |
| |
| } |
| |
| // ----- |
| gpu.module @xevm_module { |
| gpu.func @load_2D_vector(%source: memref<8x16x32xf32>, |
| %offset: index) -> vector<8x16xf32> { |
| %c0 = arith.constant 0.0 : f32 |
| %0 = vector.transfer_read %source[%offset, %offset, %offset], %c0 |
| {in_bounds = [true, true]} : memref<8x16x32xf32>, vector<8x16xf32> |
| gpu.return %0 : vector<8x16xf32> |
| } |
| |
| // LOAD-ND-LABEL: @load_2D_vector( |
| // LOAD-ND-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, |
| // LOAD-ND-SAME: %[[OFFSET:.+]]: index |
| // LOAD-ND: %[[ELEM_BYTES:.+]] = arith.constant 4 : index |
| // LOAD-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], 0, 0] |
| // LOAD-ND: %[[BASE_BUFFER:.*]], %[[OFF1:.*]], %[[SIZES:.*]]:2, %[[STRIDES:.*]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] |
| // LOAD-ND: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[BASE_BUFFER]] |
| // LOAD-ND-SAME: : memref<f32> -> index |
| // LOAD-ND: %[[MUL:.*]] = arith.muli %[[OFF1]], %[[ELEM_BYTES]] : index |
| // LOAD-ND: %[[ADD:.*]] = arith.addi %[[INTPTR]], %[[MUL]] : index |
| // LOAD-ND: %[[I64PTR:.*]] = arith.index_cast %[[ADD]] : index to i64 |
| // LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[I64PTR]], shape : [16, 32], |
| // LOAD-ND-SAME: strides : [32, 1] : i64 -> !xegpu.tensor_desc<8x16xf32, |
| // LOAD-ND-SAME: boundary_check = false |
| // LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET]], %[[OFFSET]]]{{.*}}-> vector<8x16xf32> |
| // LOAD-ND: return %[[VEC]] |
| |
| // LOAD-GATHER-LABEL: @load_2D_vector( |
| // LOAD-GATHER-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, |
| // LOAD-GATHER: %[[CST:.+]] = arith.constant dense<true> : vector<8x16xi1> |
| // LOAD-GATHER-COUNT2: vector.step |
| // LOAD-GATHER-COUNT2: vector.shape_cast |
| // LOAD-GATHER-COUNT2: vector.broadcast |
| // LOAD-GATHER-COUNT2: arith.muli {{.*}} : index |
| // LOAD-GATHER-COUNT2: arith.addi {{.*}} : index |
| // LOAD-GATHER: %[[SPLAT:.+]] = vector.broadcast {{.*}}: index to vector<8x16xindex> |
| // LOAD-GATHER: %[[IDX:.+]] = arith.addi %[[SPLAT]], {{.*}}: vector<8x16xindex> |
| // LOAD-GATHER: %[[COLLAPSE:.+]] = memref.extract_aligned_pointer_as_index %[[SRC]] : memref<8x16x32xf32> -> index |
| // LOAD-GATHER: %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64 |
| // LOAD-GATHER: %[[VEC:.+]] = xegpu.load %[[COLLAPSE_I]]{{\[}}%[[IDX]]{{\]}}, %[[CST]] : i64, vector<8x16xindex>, vector<8x16xi1> -> vector<8x16xf32> |
| |
| } |
| |
| |
| // ----- |
| gpu.module @xevm_module { |
| gpu.func @load_zero_pad_out_of_bounds(%source: memref<32x64xf32>, |
| %offset: index) -> vector<8x16xf32> { |
| %c0 = arith.constant 0.0 : f32 |
| %0 = vector.transfer_read %source[%offset, %offset], %c0 |
| {in_bounds = [false, true]} : memref<32x64xf32>, vector<8x16xf32> |
| gpu.return %0 : vector<8x16xf32> |
| } |
| |
| // LOAD-ND-LABEL: @load_zero_pad_out_of_bounds( |
| // LOAD-ND-SAME: %[[SRC:.+]]: memref<32x64xf32>, |
| // LOAD-ND-SAME: %[[OFFSET:.+]]: index |
| // LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]] |
| // LOAD-ND-SAME: memref<32x64xf32> -> !xegpu.tensor_desc<8x16xf32> |
| // LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET]], %[[OFFSET]]]{{.*}}-> vector<8x16xf32> |
| // LOAD-ND: return %[[VEC]] |
| |
| // LOAD-GATHER-LABEL: @load_zero_pad_out_of_bounds( |
| // LOAD-GATHER: vector.transfer_read |
| |
| } |
| |
| |
| // ----- |
| gpu.module @xevm_module { |
| gpu.func @load_transposed(%source: memref<32x64xf32>, |
| %i: index, %j: index) -> vector<8x16xf32> { |
| %c0 = arith.constant 0.0 : f32 |
| %0 = vector.transfer_read %source[%i, %j], %c0 |
| {permutation_map = affine_map<(d0, d1) -> (d1, d0)>, |
| in_bounds = [true, true]} : memref<32x64xf32>, vector<8x16xf32> |
| gpu.return %0 : vector<8x16xf32> |
| } |
| |
| // LOAD-ND-LABEL: @load_transposed( |
| // LOAD-ND-SAME: %[[SRC:.+]]: memref<32x64xf32>, |
| // LOAD-ND-SAME: %[[OFFSET1:.+]]: index, |
| // LOAD-ND-SAME: %[[OFFSET2:.+]]: index |
| // LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]] |
| // LOAD-ND-SAME: memref<32x64xf32> -> !xegpu.tensor_desc<16x8xf32 |
| // LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET1]], %[[OFFSET2]]] |
| // LOAD-ND-SAME: -> vector<16x8xf32> |
| // LOAD-ND: %[[VEC_TRANSPOSED:.+]] = vector.transpose %[[VEC]], [1, 0] : vector<16x8xf32> to vector<8x16xf32> |
| // LOAD-ND: return %[[VEC_TRANSPOSED]] |
| |
| |
| // LOAD-GATHER-LABEL: @load_transposed( |
| // LOAD-GATHER-SAME: %[[SRC:.+]]: memref<32x64xf32>, |
| // LOAD-GATHER: %[[CST:.+]] = arith.constant dense<true> : vector<8x16xi1> |
| // LOAD-GATHER-COUNT2: vector.step |
| // LOAD-GATHER-COUNT2: vector.shape_cast |
| // LOAD-GATHER-COUNT2: vector.broadcast |
| // LOAD-GATHER-COUNT2: arith.muli {{.*}} : index |
| // LOAD-GATHER-COUNT2: arith.addi {{.*}} : index |
| // LOAD-GATHER: %[[BCAST2:.+]] = vector.broadcast {{.*}} : index to vector<8x16xindex> |
| // LOAD-GATHER: %[[IDX:.+]] = arith.addi %[[BCAST2]], {{.*}}: vector<8x16xindex> |
| // LOAD-GATHER: %[[COLLAPSE:.*]] = memref.extract_aligned_pointer_as_index %arg0 : memref<32x64xf32> -> index |
| // LOAD-GATHER: %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64 |
| // LOAD-GATHER: %[[LOAD:.*]] = xegpu.load %[[COLLAPSE_I]][%[[IDX]]], %[[CST]] : i64, vector<8x16xindex>, vector<8x16xi1> -> vector<8x16xf32> |
| |
| } |
| |
| // ----- |
| gpu.module @xevm_module { |
| gpu.func @load_dynamic_source(%source: memref<?x?x?xf32>, |
| %i: index, %j: index, %k: index) -> vector<8x16xf32> { |
| %c0 = arith.constant 0.0 : f32 |
| %0 = vector.transfer_read %source[%i, %j, %k], %c0 |
| {in_bounds = [true, true]} : memref<?x?x?xf32>, vector<8x16xf32> |
| gpu.return %0 : vector<8x16xf32> |
| } |
| // LOAD-ND-LABEL: @load_dynamic_source( |
| // LOAD-ND-SAME: %[[SRC:.+]]: memref<?x?x?xf32>, |
| // LOAD-ND-SAME: %[[OFF0:.+]]: index, %[[OFF1:.+]]: index, %[[OFF2:.+]]: index |
| // LOAD-ND: %[[ELEM_BYTES:.+]] = arith.constant 4 : index |
| // LOAD-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFF0]], 0, 0] |
| // LOAD-ND: %[[BASE_BUFFER:.*]], %[[OFFSET:.*]], %[[SIZES:.+]]:2, %[[STRIDES:.+]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] |
| // LOAD-ND: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[BASE_BUFFER]] : memref<f32> -> index |
| // LOAD-ND: %[[MUL:.*]] = arith.muli %[[OFFSET]], %[[ELEM_BYTES]] : index |
| // LOAD-ND: %[[ADD:.*]] = arith.addi %[[INTPTR]], %[[MUL]] : index |
| // LOAD-ND: %[[I64PTR:.*]] = arith.index_cast %[[ADD]] : index to i64 |
| // LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[I64PTR]], shape : [%[[SIZES]]#0, %[[SIZES]]#1], |
| // LOAD-ND-SAME: strides : [%[[STRIDES]]#0, 1] : i64 -> !xegpu.tensor_desc<8x16xf32, |
| // LOAD-ND-SAME: #xegpu.block_tdesc_attr<boundary_check = false>> |
| // LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFF1]], %[[OFF2]]]{{.*}}-> vector<8x16xf32> |
| // LOAD-ND: return %[[VEC]] |
| |
| |
| // LOAD-GATHER-LABEL: @load_dynamic_source( |
| // LOAD-GATHER-SAME: %[[ARG0:.+]]: memref<?x?x?xf32>, |
| // LOAD-GATHER: %[[CST:.+]] = arith.constant dense<true> : vector<8x16xi1> |
| // LOAD-GATHER: memref.extract_strided_metadata %[[ARG0]] |
| // LOAD-GATHER-COUNT2: vector.step |
| // LOAD-GATHER-COUNT2: vector.shape_cast |
| // LOAD-GATHER-COUNT2: vector.broadcast |
| // LOAD-GATHER-COUNT2: arith.muli {{.*}} : index |
| // LOAD-GATHER-COUNT2: arith.addi {{.*}} : index |
| // LOAD-GATHER: %[[BROADIDX:.+]] = vector.broadcast {{.*}} : index to vector<8x16xindex> |
| // LOAD-GATHER: %[[FINALIDX:.+]] = arith.addi %[[BROADIDX]], {{.*}} : vector<8x16xindex> |
| // LOAD-GATHER: %[[COLLAPSE:.+]] = memref.extract_aligned_pointer_as_index %[[ARG0]] : memref<?x?x?xf32> -> index |
| // LOAD-GATHER: %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64 |
| // LOAD-GATHER: %[[RES:.+]] = xegpu.load %[[COLLAPSE_I]]{{\[}}%[[FINALIDX]]{{\]}}, %[[CST]] : i64, vector<8x16xindex>, vector<8x16xi1> -> vector<8x16xf32> |
| // LOAD-GATHER: gpu.return %[[RES]] : vector<8x16xf32> |
| } |
| |
| // ----- |
| gpu.module @xevm_module { |
| gpu.func @load_dynamic_source2(%source: memref<?x8x16xf32>, |
| %i: index, %j: index, %k: index) -> vector<8x16xf32> { |
| %c0 = arith.constant 0.0 : f32 |
| %0 = vector.transfer_read %source[%i, %j, %k], %c0 |
| {in_bounds = [true, true]} : memref<?x8x16xf32>, vector<8x16xf32> |
| gpu.return %0 : vector<8x16xf32> |
| } |
| |
| // LOAD-ND-LABEL: @load_dynamic_source2( |
| // LOAD-ND-SAME: %[[SRC:.+]]: memref<?x8x16xf32>, |
| // LOAD-ND-SAME: %[[OFF0:.+]]: index, %[[OFF1:.+]]: index, %[[OFF2:.+]]: index |
| // LOAD-ND: %[[ELEM_BYTES:.+]] = arith.constant 4 : index |
| // LOAD-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFF0]], 0, 0] |
| // LOAD-ND: %[[BASE_BUFFER:.*]], %[[OFFSET:.*]], %[[SIZES:.*]]:2, %[[STRIDES:.*]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] |
| // LOAD-ND: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[BASE_BUFFER]] |
| // LOAD-ND: %[[MUL:.*]] = arith.muli %[[OFFSET]], %[[ELEM_BYTES]] : index |
| // LOAD-ND: %[[ADD:.*]] = arith.addi %[[INTPTR]], %[[MUL]] : index |
| // LOAD-ND: %[[I64PTR:.*]] = arith.index_cast %[[ADD]] : index to i64 |
| // LOAD-ND: %[[DESC:.*]] = xegpu.create_nd_tdesc %[[I64PTR]], shape : [8, 16], strides : [16, 1] : |
| // LOAD-ND-SAME: i64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<boundary_check = false>> |
| // LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%{{.*}}, %{{.*}}] : !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<boundary_check = false>> -> vector<8x16xf32> |
| // LOAD-ND: return %[[VEC]] : vector<8x16xf32> |
| |
| // LOAD-GATHER-LABEL: @load_dynamic_source2( |
| // LOAD-GATHER-DAG: %[[CST_0:.+]] = arith.constant dense<true> : vector<8x16xi1> |
| // LOAD-GATHER-COUNT2: vector.step |
| // LOAD-GATHER-COUNT2: vector.shape_cast |
| // LOAD-GATHER-COUNT2: vector.broadcast |
| // LOAD-GATHER-COUNT2: arith.muli {{.*}} : index |
| // LOAD-GATHER-COUNT2: arith.addi {{.*}} : index |
| // LOAD-GATHER-DAG: %[[BCASTIDX:.+]] = vector.broadcast {{.*}} : index to vector<8x16xindex> |
| // LOAD-GATHER-DAG: %[[OFFSETS:.+]] = arith.addi %[[BCASTIDX]], {{.*}} : vector<8x16xindex> |
| // LOAD-GATHER-DAG: %[[COLLAPSE:.+]] = memref.extract_aligned_pointer_as_index %arg0 : memref<?x8x16xf32> -> index |
| // LOAD-GATHER-DAG: %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64 |
| // LOAD-GATHER: %[[VEC:.+]] = xegpu.load %[[COLLAPSE_I]]{{\[}}%[[OFFSETS]]{{\]}}, %[[CST_0]] : i64, vector<8x16xindex>, vector<8x16xi1> -> vector<8x16xf32> |
| |
| } |
| |
| // ----- |
| gpu.module @xevm_module { |
| gpu.func @load_dynamic_source3(%source: memref<?x?x?x?x?xf32>, |
| %i: index, %j: index, %k: index, %l: index, %m: index) -> vector<2x4x8x16xf32> { |
| %c0 = arith.constant 0.0 : f32 |
| %0 = vector.transfer_read %source[%i, %j, %k, %l, %m], %c0 |
| {in_bounds = [true, true, true, true]} : memref<?x?x?x?x?xf32>, vector<2x4x8x16xf32> |
| gpu.return %0 : vector<2x4x8x16xf32> |
| } |
| |
| // LOAD-ND-LABEL: @load_dynamic_source3( |
| // LOAD-ND: vector.transfer_read |
| |
| // LOAD-GATHER-LABEL: @load_dynamic_source3( |
| // LOAD-GATHER-SAME: %[[SRC:.+]]: memref<?x?x?x?x?xf32> |
| // LOAD-GATHER: %[[CST:.+]] = arith.constant dense<true> : vector<2x4x8x16xi1> |
| // LOAD-GATHER: memref.extract_strided_metadata %[[SRC]] : memref<?x?x?x?x?xf32> -> memref<f32>, index, index, index, index, index, index, index, index, index, index, index |
| // LOAD-GATHER-COUNT4: vector.step |
| // LOAD-GATHER-COUNT3: vector.broadcast |
| // LOAD-GATHER-COUNT4: vector.shape_cast |
| // LOAD-GATHER-COUNT4: vector.broadcast {{.*}} : vector<2x4x8x16xindex> |
| // LOAD-GATHER-COUNT3: arith.addi {{.*}} : vector<2x4x8x16xindex> |
| // LOAD-GATHER: %[[SPLAT:.+]] = vector.broadcast {{.*}} : index to vector<2x4x8x16xindex> |
| // LOAD-GATHER: %[[IDX:.+]] = arith.addi %[[SPLAT]], {{.*}} : vector<2x4x8x16xindex> |
| // LOAD-GATHER: %[[COLLAPSE:.+]] = memref.extract_aligned_pointer_as_index %[[SRC]] : memref<?x?x?x?x?xf32> -> index |
| // LOAD-GATHER: %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64 |
| // LOAD-GATHER: %[[VEC:.+]] = xegpu.load %[[COLLAPSE_I]]{{\[}}%[[IDX]]{{\]}}, %[[CST]] : i64, vector<2x4x8x16xindex>, vector<2x4x8x16xi1> -> vector<2x4x8x16xf32> |
| // LOAD-GATHER: return %[[VEC]] |
| } |
| |
| // ----- |
| gpu.module @xevm_module { |
| gpu.func @load_high_dim_vector(%source: memref<16x32x64xf32>, |
| %offset: index, %arg2: index) -> vector<8x16x32xf32> { |
| %c0 = arith.constant 0.0 : f32 |
| %0 = vector.transfer_read %source[%offset, %arg2, %offset], %c0 |
| {in_bounds = [true, true, true]} : memref<16x32x64xf32>, vector<8x16x32xf32> |
| gpu.return %0 : vector<8x16x32xf32> |
| } |
| |
| // LOAD-ND-LABEL: @load_high_dim_vector( |
| // LOAD-ND: vector.transfer_read |
| |
| // LOAD-GATHER-LABEL: @load_high_dim_vector( |
| // LOAD-GATHER: %[[CST:.+]] = arith.constant dense<true> : vector<8x16x32xi1> |
| // LOAD-GATHER: %[[CST_0:.+]] = arith.constant dense<64> : vector<16xindex> |
| // LOAD-GATHER: %[[CST_1:.+]] = arith.constant dense<2048> : vector<8xindex> |
| // LOAD-GATHER: %[[C2048:.+]] = arith.constant 2048 : index |
| // LOAD-GATHER: %[[C64:.+]] = arith.constant 64 : index |
| // LOAD-GATHER-COUNT3: vector.step |
| // LOAD-GATHER-COUNT3: vector.shape_cast |
| // LOAD-GATHER-COUNT3: vector.broadcast {{.*}} : vector<8x16x32xindex> |
| // LOAD-GATHER-COUNT2: arith.addi {{.*}} : vector<8x16x32xindex> |
| // LOAD-GATHER: %[[BCASTOFF:.+]] = vector.broadcast {{.*}} : index to vector<8x16x32xindex> |
| // LOAD-GATHER: %[[IDX:.+]] = arith.addi %[[BCASTOFF]], {{.*}} : vector<8x16x32xindex> |
| // LOAD-GATHER: %[[COLLAPSE:.+]] = memref.extract_aligned_pointer_as_index %arg0 : memref<16x32x64xf32> -> index |
| // LOAD-GATHER: %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64 |
| // LOAD-GATHER: %[[VEC:.+]] = xegpu.load %[[COLLAPSE_I]][%[[IDX]]], %[[CST]] : i64, vector<8x16x32xindex>, vector<8x16x32xi1> -> vector<8x16x32xf32> |
| |
| } |
| |
| // ----- |
| gpu.module @xevm_module { |
| gpu.func @load_transpose_f16(%source: memref<32x64xf16>, |
| %offset: index) -> vector<8x16xf16> { |
| %c0 = arith.constant 0.0 : f16 |
| %0 = vector.transfer_read %source[%offset, %offset], %c0 |
| {permutation_map = affine_map<(d0, d1) -> (d1, d0)>, |
| in_bounds = [true, true]} : memref<32x64xf16>, vector<8x16xf16> |
| gpu.return %0 : vector<8x16xf16> |
| } |
| |
| // LOAD-ND-LABEL: @load_transpose_f16( |
| // LOAD-ND: %[[LOAD:.*]] = xegpu.load_nd |
| // LOAD-ND: vector.transpose %[[LOAD]], [1, 0] : vector<16x8xf16> to vector<8x16xf16> |
| |
| // LOAD-GATHER-LABEL: @load_transpose_f16( |
| // LOAD-GATHER-SAME: %[[SRC:.+]]: memref<32x64xf16>, |
| // LOAD-GATHER: %[[CST:.+]] = arith.constant dense<true> : vector<8x16xi1> |
| // LOAD-GATHER-COUNT2: vector.step |
| // LOAD-GATHER-COUNT2: vector.shape_cast |
| // LOAD-GATHER-COUNT2: vector.broadcast |
| // LOAD-GATHER-COUNT2: arith.muli {{.*}} : index |
| // LOAD-GATHER-COUNT2: arith.addi {{.*}} : index |
| // LOAD-GATHER: %[[BCAST2:.+]] = vector.broadcast {{.*}} : index to vector<8x16xindex> |
| // LOAD-GATHER: %[[IDX:.+]] = arith.addi %[[BCAST2]], {{.*}}: vector<8x16xindex> |
| // LOAD-GATHER: %[[COLLAPSE:.*]] = memref.extract_aligned_pointer_as_index %arg0 : memref<32x64xf16> -> index |
| // LOAD-GATHER: %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64 |
| // LOAD-GATHER: %[[LOAD:.*]] = xegpu.load %[[COLLAPSE_I]][%[[IDX]]], %[[CST]] : i64, vector<8x16xindex>, vector<8x16xi1> -> vector<8x16xf16> |
| } |
| |
| // ----- |
| gpu.module @xevm_module { |
| gpu.func @no_load_out_of_bounds_non_zero_pad(%source: memref<32x64xf32>, |
| %offset: index, %arg2: index, %pad: f32) -> (vector<8x16xf32>, vector<8x16xf32>) { |
| %c1 = arith.constant 1.0 : f32 |
| %0 = vector.transfer_read %source[%offset, %arg2], %c1 |
| {in_bounds = [true, false]} : memref<32x64xf32>, vector<8x16xf32> |
| %1 = vector.transfer_read %source[%arg2, %offset], %pad |
| {in_bounds = [false, true]} : memref<32x64xf32>, vector<8x16xf32> |
| gpu.return %0, %1 : vector<8x16xf32>, vector<8x16xf32> |
| } |
| |
| // LOAD-ND-LABEL: @no_load_out_of_bounds_non_zero_pad( |
| // LOAD-ND-COUNT-2: vector.transfer_read |
| |
| // LOAD-GATHER-LABEL: @no_load_out_of_bounds_non_zero_pad( |
| // LOAD-GATHER-COUNT-2: vector.transfer_read |
| } |
| |
| // ----- |
| gpu.module @xevm_module { |
| gpu.func @no_load_out_of_bounds_1D_vector(%source: memref<8x16x32xf32>, |
| %offset: index) -> vector<8xf32> { |
| %c0 = arith.constant 0.0 : f32 |
| %0 = vector.transfer_read %source[%offset, %offset, %offset], %c0 |
| {in_bounds = [false]} : memref<8x16x32xf32>, vector<8xf32> |
| gpu.return %0 : vector<8xf32> |
| } |
| |
| // LOAD-ND-LABEL: @no_load_out_of_bounds_1D_vector( |
| // LOAD-ND: vector.transfer_read |
| |
| // LOAD-GATHER-LABEL: @no_load_out_of_bounds_1D_vector( |
| // LOAD-GATHER: vector.transfer_read |
| } |
| |
| // ----- |
| gpu.module @xevm_module { |
| gpu.func @no_load_masked(%source : memref<4xf32>, |
| %offset : index) -> vector<4xf32> { |
| %c0 = arith.constant 0.0 : f32 |
| %mask = arith.constant dense<[0, 1, 0, 1]> : vector<4xi1> |
| %0 = vector.transfer_read %source[%offset], %c0, %mask |
| {in_bounds = [true]} : memref<4xf32>, vector<4xf32> |
| gpu.return %0 : vector<4xf32> |
| } |
| |
| // LOAD-ND-LABEL: @no_load_masked( |
| // LOAD-ND: vector.transfer_read |
| |
| // LOAD-GATHER-LABEL: @no_load_masked( |
| // LOAD-GATHER: vector.transfer_read |
| } |
| |
| // ----- |
| gpu.module @xevm_module { |
| gpu.func @no_load_tensor(%source: tensor<32x64xf32>, |
| %offset: index, %arg2: index) -> vector<8x16xf32> { |
| %c0 = arith.constant 0.0 : f32 |
| %0 = vector.transfer_read %source[%offset, %arg2], %c0 |
| {in_bounds = [true, true]} : tensor<32x64xf32>, vector<8x16xf32> |
| gpu.return %0 : vector<8x16xf32> |
| } |
| |
| // LOAD-ND-LABEL: @no_load_tensor( |
| // LOAD-ND: vector.transfer_read |
| |
| // LOAD-GATHER-LABEL: @no_load_tensor( |
| // LOAD-GATHER: vector.transfer_read |
| } |
| |
| |
| // ----- |
| gpu.module @xevm_module { |
| gpu.func @no_load_non_unit_inner_stride( |
| %source: memref<32xf32, strided<[?], offset: ?>>, |
| %offset: index) -> vector<8xf32> { |
| %c0 = arith.constant 0.0 : f32 |
| %0 = vector.transfer_read %source[%offset], %c0 {in_bounds = [true]} |
| : memref<32xf32, strided<[?], offset: ?>>, vector<8xf32> |
| gpu.return %0 : vector<8xf32> |
| } |
| |
| // LOAD-ND-LABEL: @no_load_non_unit_inner_stride( |
| // LOAD-ND: vector.transfer_read |
| |
| // LOAD-GATHER-LABEL: @no_load_non_unit_inner_stride( |
| // LOAD-GATHER: vector.transfer_read |
| } |
| |
| |
| // ----- |
| gpu.module @xevm_module { |
| gpu.func @no_load_unsupported_map(%source: memref<16x32x64xf32>, |
| %offset: index) -> vector<8x16xf32> { |
| %c0 = arith.constant 0.0 : f32 |
| %0 = vector.transfer_read %source[%offset, %offset, %offset], %c0 |
| {permutation_map = affine_map<(d0, d1, d2) -> (d0, d2)>, |
| in_bounds = [true, true]} : memref<16x32x64xf32>, vector<8x16xf32> |
| gpu.return %0 : vector<8x16xf32> |
| } |
| |
| // LOAD-ND-LABEL: @no_load_unsupported_map( |
| // LOAD-ND: vector.transfer_read |
| |
| // LOAD-GATHER-LABEL: @no_load_unsupported_map( |
| // LOAD-GATHER: vector.transfer_read |
| } |
| |
| // ----- |
| gpu.module @xevm_module { |
| gpu.func @load_from_subview_1D(%source: memref<4096x4096xf16>, %off1: index, %off2: index) -> vector<8xf16> { |
| %c0 = arith.constant 0.0 : f16 |
| %subview = memref.subview %source[%off1, %off2] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>> |
| %0 = vector.transfer_read %subview[%off2, %off2], %c0 |
| {in_bounds = [true]} : memref<256x256xf16, strided<[4096, 1], offset: ?>>, vector<8xf16> |
| gpu.return %0 : vector<8xf16> |
| } |
| |
| // LOAD-ND-LABEL: @load_from_subview_1D( |
| // LOAD-ND-SAME: %[[SRC:.+]]: memref<4096x4096xf16>, |
| // LOAD-ND-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index |
| // LOAD-ND: %[[CST:.+]] = arith.constant dense<true> : vector<8xi1> |
| // LOAD-ND: %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>> |
| // LOAD-ND: %[[BB:.+]], %[[OFFSET:.+]],{{.*}},{{.*}} = memref.extract_strided_metadata %[[SUBVIEW]] : memref<256x256xf16, strided<[4096, 1], offset: ?>> -> memref<f16>, index, index, index, index, index |
| // LOAD-ND: %[[STEP:.+]] = vector.step : vector<8xindex> |
| // LOAD-ND: arith.muli {{.*}} : index |
| // LOAD-ND: arith.addi %[[OFFSET]]{{.*}} : index |
| // LOAD-ND: arith.addi {{.*}} : index |
| // LOAD-ND: %[[SPLAT:.+]] = vector.broadcast {{.*}}: index to vector<8xindex> |
| // LOAD-ND: %[[IDX:.+]] = arith.addi %[[SPLAT]], %[[STEP]] : vector<8xindex> |
| // LOAD-ND: %[[COLLAPSE:.+]] = memref.extract_aligned_pointer_as_index %[[SUBVIEW]] : memref<256x256xf16, strided<[4096, 1], offset: ?>> -> index |
| // LOAD-ND: %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64 |
| // LOAD-ND: %[[VEC:.+]] = xegpu.load %[[COLLAPSE_I]]{{\[}}%[[IDX]]{{\]}}, %[[CST]] : i64, vector<8xindex>, vector<8xi1> -> vector<8xf16> |
| |
| // LOAD-GATHER-LABEL: @load_from_subview_1D( |
| // LOAD-GATHER-SAME: %[[SRC:.+]]: memref<4096x4096xf16>, |
| // LOAD-GATHER-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index |
| // LOAD-GATHER: %[[CST:.+]] = arith.constant dense<true> : vector<8xi1> |
| // LOAD-GATHER: %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>> |
| // LOAD-GATHER: %[[BB:.+]], %[[OFFSET:.+]],{{.*}},{{.*}} = memref.extract_strided_metadata %[[SUBVIEW]] : memref<256x256xf16, strided<[4096, 1], offset: ?>> -> memref<f16>, index, index, index, index, index |
| // LOAD-GATHER: %[[STEP:.+]] = vector.step : vector<8xindex> |
| // LOAD-GATHER: arith.muli {{.*}} : index |
| // LOAD-GATHER: arith.addi %[[OFFSET]]{{.*}} : index |
| // LOAD-GATHER: arith.addi {{.*}} : index |
| // LOAD-GATHER: %[[SPLAT:.+]] = vector.broadcast {{.*}}: index to vector<8xindex> |
| // LOAD-GATHER: %[[IDX:.+]] = arith.addi %[[SPLAT]], %[[STEP]] : vector<8xindex> |
| // LOAD-GATHER: %[[COLLAPSE:.+]] = memref.extract_aligned_pointer_as_index %[[SUBVIEW]] : memref<256x256xf16, strided<[4096, 1], offset: ?>> -> index |
| // LOAD-GATHER: %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64 |
| // LOAD-GATHER: %[[VEC:.+]] = xegpu.load %[[COLLAPSE_I]]{{\[}}%[[IDX]]{{\]}}, %[[CST]] : i64, vector<8xindex>, vector<8xi1> -> vector<8xf16> |
| } |
| |
| // ----- |
| gpu.module @xevm_module { |
| gpu.func @load_from_subview_2D(%source: memref<4096x4096xf16>, %off1: index, %off2: index) -> vector<8x16xf16> { |
| %c0 = arith.constant 0.0 : f16 |
| %subview = memref.subview %source[%off1, %off2] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>> |
| %0 = vector.transfer_read %subview[%off2, %off2], %c0 |
| {in_bounds = [true, true]} : memref<256x256xf16, strided<[4096, 1], offset: ?>>, vector<8x16xf16> |
| gpu.return %0 : vector<8x16xf16> |
| } |
| |
| // LOAD-ND-LABEL: @load_from_subview_2D( |
| // LOAD-ND-SAME: %[[SRC:.+]]: memref<4096x4096xf16>, |
| // LOAD-ND-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index |
| // LOAD-ND: %[[ELEM_BYTES:.+]] = arith.constant 2 : index |
| // LOAD-ND: %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>> |
| // LOAD-ND: %[[BASE_BUFFER:.*]], %[[OFFSET:.*]], %[[SIZES:.*]]:2, %[[STRIDES:.*]]:2 = memref.extract_strided_metadata %[[SUBVIEW]] |
| // LOAD-ND: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[BASE_BUFFER]] |
| // LOAD-ND: %[[MUL:.*]] = arith.muli %[[OFFSET]], %[[ELEM_BYTES]] : index |
| // LOAD-ND: %[[ADD:.*]] = arith.addi %[[INTPTR]], %[[MUL]] : index |
| // LOAD-ND: %[[I64PTR:.*]] = arith.index_cast %[[ADD]] : index to i64 |
| // LOAD-ND: %[[DESC:.*]] = xegpu.create_nd_tdesc %[[I64PTR]], shape : [256, 256], strides : [4096, 1] : |
| // LOAD-ND-SAME: i64 -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr<boundary_check = false>> |
| // LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFF2]], %[[OFF2]]]{{.*}}-> vector<8x16xf16> |
| // LOAD-ND: return %[[VEC]] |
| |
| // LOAD-GATHER-LABEL: @load_from_subview_2D( |
| // LOAD-GATHER-SAME: %[[SRC:.+]]: memref<4096x4096xf16>, |
| // LOAD-GATHER-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index |
| // LOAD-GATHER: %[[CST:.+]] = arith.constant dense<true> : vector<8x16xi1> |
| // LOAD-GATHER: %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>> |
| // LOAD-GATHER: %[[BB:.+]], %[[OFFSET:.+]],{{.*}},{{.*}} = memref.extract_strided_metadata %[[SUBVIEW]] : memref<256x256xf16, strided<[4096, 1], offset: ?>> -> memref<f16>, index, index, index, index, index |
| // LOAD-GATHER-COUNT2: vector.step |
| // LOAD-GATHER-COUNT2: vector.shape_cast |
| // LOAD-GATHER-COUNT2: vector.broadcast |
| // LOAD-GATHER-COUNT2: arith.muli {{.*}} : index |
| // LOAD-GATHER-COUNT2: arith.addi {{.*}} : index |
| // LOAD-GATHER: %[[SPLAT:.+]] = vector.broadcast {{.*}}: index to vector<8x16xindex> |
| // LOAD-GATHER: %[[IDX:.+]] = arith.addi %[[SPLAT]], {{.*}} : vector<8x16xindex> |
| // LOAD-GATHER: %[[COLLAPSE:.+]] = memref.extract_aligned_pointer_as_index %[[SUBVIEW]] : memref<256x256xf16, strided<[4096, 1], offset: ?>> -> index |
| // LOAD-GATHER: %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64 |
| // LOAD-GATHER: %[[VEC:.+]] = xegpu.load %[[COLLAPSE_I]]{{\[}}%[[IDX]]{{\]}}, %[[CST]] : i64, vector<8x16xindex>, vector<8x16xi1> -> vector<8x16xf16> |
| } |