|  | ; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck %s | 
|  | ; | 
|  | ; This test checks that the lds input queue will be empty at the end of | 
|  | ; the ALU clause. | 
|  |  | 
|  | ; CHECK-LABEL: {{^}}lds_input_queue: | 
|  | ; CHECK: LDS_READ_RET * OQAP | 
|  | ; CHECK-NOT: ALU clause | 
|  | ; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP | 
|  |  | 
|  | ; Two-element i32 array in address space 3 (the local memory space / LDS, per | 
|  | ; the comments in this file); both kernels in this file load from it. | 
|  | @local_mem = internal unnamed_addr addrspace(3) global [2 x i32] poison, align 4 | 
|  |  | 
|  | ; Kernel: loads @local_mem[%index] from LDS, executes a group barrier, then | 
|  | ; adds that value to a word loaded from %in and stores the sum to %out. | 
|  | define amdgpu_kernel void @lds_input_queue(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %index) { | 
|  | entry: | 
|  |   ; LDS read of the element selected by %index. | 
|  |   %lds.ptr = getelementptr inbounds [2 x i32], ptr addrspace(3) @local_mem, i32 0, i32 %index | 
|  |   %lds.val = load i32, ptr addrspace(3) %lds.ptr | 
|  |   call void @llvm.r600.group.barrier() | 
|  |  | 
|  |   ; The load from %in below is a vertex fetch, so it starts a new clause. | 
|  |   %in.val = load i32, ptr addrspace(1) %in | 
|  |   %sum = add i32 %lds.val, %in.val | 
|  |   store i32 %sum, ptr addrspace(1) %out | 
|  |   ret void | 
|  | } | 
|  |  | 
|  | ; Intrinsic called by @lds_input_queue between its LDS load and its global load. | 
|  | declare void @llvm.r600.group.barrier() nounwind convergent | 
|  |  | 
|  | ; The machine scheduler does not do proper alias analysis and assumes that | 
|  | ; loads from global values (Note that a global value is different from a | 
|  | ; value from global memory.  A global value is a value that is declared | 
|  | ; outside of a function, it can reside in any address space) alias with | 
|  | ; all other loads. | 
|  | ; | 
|  | ; This is a problem for scheduling the reads from the local data share (lds). | 
|  | ; These reads are implemented using two instructions.  The first copies the | 
|  | ; data from lds into the lds output queue, and the second moves the data from | 
|  | ; the output queue into a register.  These two instructions don't have to be | 
|  | ; scheduled one after the other, but they do need to be scheduled in the same | 
|  | ; clause.  The aliasing problem mentioned above causes problems when there is a | 
|  | ; load from global memory which immediately follows a load from a global value that | 
|  | ; has been declared in the local memory space: | 
|  | ; | 
|  | ;  %0 = getelementptr inbounds [2 x i32], ptr addrspace(3) @local_mem, i32 0, i32 %index | 
|  | ;  %1 = load i32, ptr addrspace(3) %0 | 
|  | ;  %2 = load i32, ptr addrspace(1) %in | 
|  | ; | 
|  | ; The instruction selection phase will generate ISA that looks like this: | 
|  | ; %oqap = LDS_READ_RET | 
|  | ; %0 = MOV %oqap | 
|  | ; %1 = VTX_READ_32 | 
|  | ; %2 = ADD_INT %1, %0 | 
|  | ; | 
|  | ; The bottom scheduler will schedule the two ALU instructions first: | 
|  | ; | 
|  | ; UNSCHEDULED: | 
|  | ; %oqap = LDS_READ_RET | 
|  | ; %1 = VTX_READ_32 | 
|  | ; | 
|  | ; SCHEDULED: | 
|  | ; | 
|  | ; %0 = MOV %oqap | 
|  | ; %2 = ADD_INT %1, %0 | 
|  | ; | 
|  | ; The lack of proper aliasing causes the local memory read (LDS_READ_RET) and | 
|  | ; the global memory read (VTX_READ_32) to be treated as having a chain | 
|  | ; dependency, so the global memory read will always be scheduled first.  This | 
|  | ; will give us a final program which looks like this: | 
|  | ; | 
|  | ; Alu clause: | 
|  | ; %oqap = LDS_READ_RET | 
|  | ; VTX clause: | 
|  | ; %1 = VTX_READ_32 | 
|  | ; Alu clause: | 
|  | ; %0 = MOV %oqap | 
|  | ; %2 = ADD_INT %1, %0 | 
|  | ; | 
|  | ; This is an illegal program because the oqap def and use now occur in | 
|  | ; different ALU clauses. | 
|  | ; | 
|  | ; This test checks this scenario and makes sure it doesn't result in an | 
|  | ; illegal program.  For now, we have fixed this issue by merging the | 
|  | ; LDS_READ_RET and MOV together during instruction selection and then | 
|  | ; expanding them after scheduling.  Once the scheduler has better alias | 
|  | ; analysis, we should be able to keep these instructions separate before | 
|  | ; scheduling. | 
|  | ; | 
|  | ; CHECK-LABEL: {{^}}local_global_alias: | 
|  | ; CHECK: LDS_READ_RET | 
|  | ; CHECK-NOT: ALU clause | 
|  | ; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP | 
|  | ; Kernel: loads the first word of @local_mem (LDS), immediately followed by a | 
|  | ; load from %in (global memory), then stores the sum of the two to %out. | 
|  | define amdgpu_kernel void @local_global_alias(ptr addrspace(1) %out, ptr addrspace(1) %in) { | 
|  | entry: | 
|  |   ; Load from a global value that lives in the local memory space. | 
|  |   %lds.val = load i32, ptr addrspace(3) @local_mem | 
|  |   ; Load from global memory directly after the LDS load. | 
|  |   %in.val = load i32, ptr addrspace(1) %in | 
|  |   %sum = add i32 %in.val, %lds.val | 
|  |   store i32 %sum, ptr addrspace(1) %out | 
|  |   ret void | 
|  | } |