blob: c349709ea0f050d3e2446187aa54dedce3cadb40 [file] [log] [blame] [edit]
; RUN: opt -passes=fix-irreducible,unify-loop-exits,structurizecfg -S %s | FileCheck %s
;
target triple = "amdgcn-amd-amdhsa"
;
; Reduced from rocPRIM block_sort_kernel compiled with code coverage.
; The sort comparator `operator<` for custom_type uses nested short-circuit:
; less<T>(x) || (equal_to<T>(x) && less<U>(y))
; With PGO instrumentation, each functor call gets a counter block,
; creating a deeply nested CFG that StructurizeCFG must linearize.
;
; The bug: hoistZeroCostElseBlockPhiValues() hoists shufflevector out of
; the "swap" block into a dominator, and simplifyHoistedPhis() then
; fills poison entries in Flow phis indiscriminately — causing the
; hoisted shufflevector result to reach the "no-swap" path, swapping
; two sort keys that should not be swapped.
;
; After structurizecfg, the shufflevector that performs the conditional
; swap must NOT appear before the first comparison block.
@counter_less = external addrspace(1) global [4 x i64]
@counter_eq = external addrspace(1) global [2 x i64]
@counter_swap = external addrspace(1) global [2 x i64]
; Two consecutive compare-and-swap iterations from a sorting network.
; Each iteration compares key_a vs key_b using:
; less(a.x, b.x) || (equal(a.x, b.x) && less(a.y, b.y))
; If true, swap both keys and their index vector.
;
; iter1 compares keys[0] vs keys[1]
; iter2 compares keys[1] vs keys[2], using the (possibly swapped) result of iter1
;
; The values_vec (<4 x i32>) tracks the value indices associated with the keys.
; A shufflevector <0,2,1,3> swaps the middle two elements (the values for the two keys being compared).
define amdgpu_kernel void @sort_two_iters(
i1 %do_iter1, i1 %do_iter2,
<2 x float> %key_a, <2 x float> %key_b, <2 x float> %key_c,
<4 x i32> %values_vec,
ptr addrspace(1) %out_values, ptr addrspace(1) %out_key
) {
entry:
%cnt_less0 = load i64, ptr addrspace(1) @counter_less, align 8
%cnt_eq0 = load i64, ptr addrspace(1) @counter_eq, align 8
%cnt_swap0 = load i64, ptr addrspace(1) @counter_swap, align 8
br i1 %do_iter1, label %iter1.cmp_x, label %iter1.done
; --- Iteration 1: compare key_a vs key_b ---
iter1.cmp_x:
; PGO: counter for less<float>(a.x, b.x)
%cnt_less1 = add i64 %cnt_less0, 1
store i64 %cnt_less1, ptr addrspace(1) @counter_less, align 8
%ax = extractelement <2 x float> %key_a, i64 0
%bx = extractelement <2 x float> %key_b, i64 0
%x_less = fcmp olt float %ax, %bx
br i1 %x_less, label %iter1.do_swap, label %iter1.check_eq
iter1.check_eq:
; PGO: counter for equal_to<float>(a.x, b.x)
%cnt_eq1 = add i64 %cnt_eq0, 1
store i64 %cnt_eq1, ptr addrspace(1) @counter_eq, align 8
%x_eq = fcmp oeq float %ax, %bx
br i1 %x_eq, label %iter1.cmp_y, label %iter1.done
iter1.cmp_y:
; PGO: counter for less<float>(a.y, b.y)
%cnt_less2 = add i64 %cnt_less0, 2
store i64 %cnt_less2, ptr addrspace(1) @counter_less, align 8
%ay = extractelement <2 x float> %key_a, i64 1
%by = extractelement <2 x float> %key_b, i64 1
%y_less = fcmp olt float %ay, %by
br i1 %y_less, label %iter1.do_swap, label %iter1.done
iter1.do_swap:
; PGO: counter for swap
%cnt_swap1 = add i64 %cnt_swap0, 1
store i64 %cnt_swap1, ptr addrspace(1) @counter_swap, align 8
; Swap middle two elements of values_vec (the indices for the two compared keys)
%swapped1 = shufflevector <4 x i32> %values_vec, <4 x i32> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
br label %iter1.done
iter1.done:
; Select swapped or original values, and swapped keys
%vals1 = phi <4 x i32> [ %swapped1, %iter1.do_swap ], [ %values_vec, %iter1.check_eq ], [ %values_vec, %iter1.cmp_y ], [ %values_vec, %entry ]
%cnt_eq_out1 = phi i64 [ %cnt_eq0, %iter1.do_swap ], [ %cnt_eq1, %iter1.check_eq ], [ %cnt_eq1, %iter1.cmp_y ], [ %cnt_eq0, %entry ]
%cnt_less_out1 = phi i64 [ %cnt_less1, %iter1.do_swap ], [ %cnt_less1, %iter1.check_eq ], [ %cnt_less2, %iter1.cmp_y ], [ %cnt_less0, %entry ]
; Swapped keys: if swap happened, key_a and key_b switch
%key_lo1 = phi <2 x float> [ %key_b, %iter1.do_swap ], [ %key_a, %iter1.check_eq ], [ %key_a, %iter1.cmp_y ], [ %key_a, %entry ]
%key_hi1 = phi <2 x float> [ %key_a, %iter1.do_swap ], [ %key_b, %iter1.check_eq ], [ %key_b, %iter1.cmp_y ], [ %key_b, %entry ]
br i1 %do_iter2, label %iter2.cmp_x, label %iter2.done
; --- Iteration 2: compare key_hi1 (from iter1) vs key_c ---
iter2.cmp_x:
; PGO: counter for less<float>
%cnt_less3 = add i64 %cnt_less_out1, 1
store i64 %cnt_less3, ptr addrspace(1) @counter_less, align 8
%cx = extractelement <2 x float> %key_c, i64 0
%hi1x = extractelement <2 x float> %key_hi1, i64 0
%x_less2 = fcmp olt float %hi1x, %cx
br i1 %x_less2, label %iter2.do_swap, label %iter2.check_eq
iter2.check_eq:
; PGO: counter for equal_to<float>
%cnt_eq2 = add i64 %cnt_eq_out1, 1
store i64 %cnt_eq2, ptr addrspace(1) @counter_eq, align 8
%x_eq2 = fcmp oeq float %hi1x, %cx
br i1 %x_eq2, label %iter2.cmp_y, label %iter2.done
iter2.cmp_y:
; PGO: counter for less<float>
%cnt_less4 = add i64 %cnt_less_out1, 2
store i64 %cnt_less4, ptr addrspace(1) @counter_less, align 8
%hi1y = extractelement <2 x float> %key_hi1, i64 1
%cy = extractelement <2 x float> %key_c, i64 1
%y_less2 = fcmp olt float %hi1y, %cy
br i1 %y_less2, label %iter2.do_swap, label %iter2.done
iter2.do_swap:
; PGO: counter for swap
%cnt_swap2 = add i64 %cnt_swap0, 2
store i64 %cnt_swap2, ptr addrspace(1) @counter_swap, align 8
; Swap middle two elements of vals1 (the values that resulted from iter1)
%swapped2 = shufflevector <4 x i32> %vals1, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
br label %iter2.done
iter2.done:
%vals2 = phi <4 x i32> [ %swapped2, %iter2.do_swap ], [ %vals1, %iter2.check_eq ], [ %vals1, %iter2.cmp_y ], [ %vals1, %iter1.done ]
store <4 x i32> %vals2, ptr addrspace(1) %out_values, align 16
ret void
}
; After structurizecfg, verify that the shufflevector for iter1's swap
; is NOT hoisted into iter1.cmp_x. It must stay in iter1.do_swap (or
; a Flow block gated by the swap predicate) so that the no-swap path
; never sees the swapped values.
;
; CHECK-LABEL: @sort_two_iters
;
; The shufflevector must NOT appear in iter1.cmp_x (it would be hoisted there by the bug):
; CHECK: iter1.cmp_x:
; CHECK-NOT: shufflevector
; CHECK: br i1
;
; It should remain in iter1.do_swap:
; CHECK: iter1.do_swap:
; CHECK: %swapped1 = shufflevector <4 x i32> %values_vec, <4 x i32> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
;
; The vals1 phi at iter1.done must select between the Flow block result
; (which should carry both swapped and unswapped correctly) and the
; entry bypass. It must NOT directly carry the hoisted shufflevector result
; from a block that's reachable without going through the swap predicate.
; CHECK: iter1.done:
; CHECK: %vals1 = phi <4 x i32>