; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s

; There is no dependence between the store and the two loads, so we can
; combine the loads and schedule them freely.
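; Offset arithmetic for the checks below: the <3 x float> load starts at byte
; 24 (dword 6) but only element 2 (dword 8) is used, and the scalar load reads
; dword 7, so the two reads fold into one ds_read2_b32 that may be scheduled
; before or after the write2 to dwords 26 and 27 (hence the GCN-DAG checks).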
; GCN-LABEL: {{^}}ds_combine_nodep

; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:8
; GCN: s_waitcnt lgkmcnt({{[0-9]+}})
define amdgpu_kernel void @ds_combine_nodep(float addrspace(1)* %out, float addrspace(3)* %inptr) {

  %base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
  %addr0 = getelementptr i8, i8 addrspace(3)* %base, i32 24
  %tmp0 = bitcast i8 addrspace(3)* %addr0 to float addrspace(3)*
  %vaddr0 = bitcast float addrspace(3)* %tmp0 to <3 x float> addrspace(3)*
  %load0 = load <3 x float>, <3 x float> addrspace(3)* %vaddr0, align 4
  %v0 = extractelement <3 x float> %load0, i32 2

  %tmp1 = insertelement <2 x float> undef, float 1.0, i32 0
  %data = insertelement <2 x float> %tmp1, float 2.0, i32 1

  %tmp2 = getelementptr float, float addrspace(3)* %inptr, i32 26
  %vaddrs = bitcast float addrspace(3)* %tmp2 to <2 x float> addrspace(3)*
  store <2 x float> %data, <2 x float> addrspace(3)* %vaddrs, align 4

  %vaddr1 = getelementptr float, float addrspace(3)* %inptr, i32 7
  %v1 = load float, float addrspace(3)* %vaddr1, align 4

  %sum = fadd float %v0, %v1
  store float %sum, float addrspace(1)* %out, align 4
  ret void
}


; The store depends on the first load, so we cannot move the first load down
; to combine it with the second load directly. However, we can move the store
; below the combined load.
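; Offset arithmetic for the checks below: the <3 x float> load starts at byte
; 100 (dword 25) and its used element 2 is dword 27, which the write2 to
; dwords 26 and 27 overwrites; the scalar load of dword 7 is independent. The
; reads combine into ds_read2_b32 offset0:7 offset1:27 and the write2 sinks
; below them.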
; GCN-LABEL: {{^}}ds_combine_WAR

; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:27
; GCN-NEXT: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
define amdgpu_kernel void @ds_combine_WAR(float addrspace(1)* %out, float addrspace(3)* %inptr) {

  %base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
  %addr0 = getelementptr i8, i8 addrspace(3)* %base, i32 100
  %tmp0 = bitcast i8 addrspace(3)* %addr0 to float addrspace(3)*
  %vaddr0 = bitcast float addrspace(3)* %tmp0 to <3 x float> addrspace(3)*
  %load0 = load <3 x float>, <3 x float> addrspace(3)* %vaddr0, align 4
  %v0 = extractelement <3 x float> %load0, i32 2

  %tmp1 = insertelement <2 x float> undef, float 1.0, i32 0
  %data = insertelement <2 x float> %tmp1, float 2.0, i32 1

  %tmp2 = getelementptr float, float addrspace(3)* %inptr, i32 26
  %vaddrs = bitcast float addrspace(3)* %tmp2 to <2 x float> addrspace(3)*
  store <2 x float> %data, <2 x float> addrspace(3)* %vaddrs, align 4

  %vaddr1 = getelementptr float, float addrspace(3)* %inptr, i32 7
  %v1 = load float, float addrspace(3)* %vaddr1, align 4

  %sum = fadd float %v0, %v1
  store float %sum, float addrspace(1)* %out, align 4
  ret void
}


; The second load depends on the store. We can still combine the two loads,
; placing the combined load at the original position of the second load.
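; Offset arithmetic for the checks below: element 2 of the <3 x float> load at
; byte 24 is dword 8, which the write2 to dwords 26 and 27 does not touch; the
; scalar load reads dword 26, which the write2 just wrote. The combined
; ds_read2_b32 offset0:8 offset1:26 therefore stays below the store.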
; GCN-LABEL: {{^}}ds_combine_RAW

; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
; GCN-NEXT: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:8 offset1:26
define amdgpu_kernel void @ds_combine_RAW(float addrspace(1)* %out, float addrspace(3)* %inptr) {

  %base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
  %addr0 = getelementptr i8, i8 addrspace(3)* %base, i32 24
  %tmp0 = bitcast i8 addrspace(3)* %addr0 to float addrspace(3)*
  %vaddr0 = bitcast float addrspace(3)* %tmp0 to <3 x float> addrspace(3)*
  %load0 = load <3 x float>, <3 x float> addrspace(3)* %vaddr0, align 4
  %v0 = extractelement <3 x float> %load0, i32 2

  %tmp1 = insertelement <2 x float> undef, float 1.0, i32 0
  %data = insertelement <2 x float> %tmp1, float 2.0, i32 1

  %tmp2 = getelementptr float, float addrspace(3)* %inptr, i32 26
  %vaddrs = bitcast float addrspace(3)* %tmp2 to <2 x float> addrspace(3)*
  store <2 x float> %data, <2 x float> addrspace(3)* %vaddrs, align 4

  %vaddr1 = getelementptr float, float addrspace(3)* %inptr, i32 26
  %v1 = load float, float addrspace(3)* %vaddr1, align 4

  %sum = fadd float %v0, %v1
  store float %sum, float addrspace(1)* %out, align 4
  ret void
}


; The store depends on the first load, and the second load depends on the
; store, so we cannot combine the two loads.
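; Offset arithmetic for the checks below: element 2 of the <3 x float> load at
; byte 100 is dword 27 (byte 108), which the write2 overwrites, while the
; scalar load reads dword 26 (byte 104), which the write2 writes. The first
; read must stay above the store and the second below it, so both remain
; single ds_read_b32 instructions.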
; GCN-LABEL: {{^}}ds_combine_WAR_RAW

; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:108
; GCN-NEXT: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
; GCN-NEXT: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:104
define amdgpu_kernel void @ds_combine_WAR_RAW(float addrspace(1)* %out, float addrspace(3)* %inptr) {

  %base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
  %addr0 = getelementptr i8, i8 addrspace(3)* %base, i32 100
  %tmp0 = bitcast i8 addrspace(3)* %addr0 to float addrspace(3)*
  %vaddr0 = bitcast float addrspace(3)* %tmp0 to <3 x float> addrspace(3)*
  %load0 = load <3 x float>, <3 x float> addrspace(3)* %vaddr0, align 4
  %v0 = extractelement <3 x float> %load0, i32 2

  %tmp1 = insertelement <2 x float> undef, float 1.0, i32 0
  %data = insertelement <2 x float> %tmp1, float 2.0, i32 1

  %tmp2 = getelementptr float, float addrspace(3)* %inptr, i32 26
  %vaddrs = bitcast float addrspace(3)* %tmp2 to <2 x float> addrspace(3)*
  store <2 x float> %data, <2 x float> addrspace(3)* %vaddrs, align 4

  %vaddr1 = getelementptr float, float addrspace(3)* %inptr, i32 26
  %v1 = load float, float addrspace(3)* %vaddr1, align 4

  %sum = fadd float %v0, %v1
  store float %sum, float addrspace(1)* %out, align 4
  ret void
}