; test/Transforms/PhaseOrdering/struct-to-vector-before-memcpyopt.ll - llvm-project/llvm - Git at Google

 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -S -passes='default<O3>' %s | FileCheck %s

 ; Aggregate types used for the stack temporaries in the test function:
 ; %pair holds two i64 fields and %quad holds four i64 fields.
 %pair = type { i64, i64 }
 %quad = type { i64, i64, i64, i64 }

 ; Memory intrinsics called by the test input (64-bit length variants).
 declare void @llvm.memcpy.p0.p0.i64(ptr noalias writeonly captures(none), ptr noalias readonly captures(none), i64, i1 immarg)
 declare void @llvm.memset.p0.i64(ptr writeonly captures(none), i8, i64, i1 immarg)

 ; This test verifies that the default O3 pipeline canonicalizes struct allocas
 ; to vectors only after memcpyopt has run. The input pattern is:
 ;
 ;   memcpy tmp, obj, 16
 ;   memset obj + 16, 0, 16
 ;
 ;  ----- SWAP(other, tmp) -----
 ;
 ;   memcpy swap.tmp, tmp, 16
 ;   memcpy tmp, other, 16
 ;   memcpy other, swap.tmp, 16
 ;
 ; It swaps the first 16 bytes of other and tmp, but the first 16 bytes of tmp
 ; are the same as the first 16 bytes of obj. This comes from real code from
 ; DuckDB, where the swap function is inlined. If struct-to-vector canonicalization
 ; runs before memcpyopt, swap.tmp gets promoted to an SSA value and we are stuck
 ; saving tmp to swap.tmp. Delaying canonicalization until after memcpyopt lets
 ; memcpyopt notice that tmp and obj share the same first 16 bytes, so swap.tmp
 ; is no longer needed and the IR collapses to a single load/memmove/store.
 ; Test input: copies 32 bytes from %src into a local object, moves its first
 ; 16 bytes into a temporary, swaps that temporary with %other, and finally
 ; writes the temporary to %dst. The autogenerated assertions expect the
 ; optimized body to be one <2 x i64> load from %other, one 16-byte memmove
 ; from %src to %other, and one <2 x i64> store to %dst.
 define void @move_then_swap(ptr %dst, ptr %src, ptr %other) {
 ; CHECK-LABEL: define void @move_then_swap(
 ; CHECK-SAME: ptr nofree writeonly captures(none) initializes((0, 16)) [[DST:%.*]], ptr nofree readonly captures(none) [[SRC:%.*]], ptr nofree captures(none) [[OTHER:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TMP_SROA_0_0_COPYLOAD:%.*]] = load <2 x i64>, ptr [[OTHER]], align 8
 ; CHECK-NEXT:    tail call void @llvm.memmove.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[OTHER]], ptr noundef nonnull align 8 dereferenceable(16) [[SRC]], i64 16, i1 false)
 ; CHECK-NEXT:    store <2 x i64> [[TMP_SROA_0_0_COPYLOAD]], ptr [[DST]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
   ; Stack temporaries: the moved value, the local source object, and the
   ; scratch slot used by the inlined swap.
   %tmp = alloca %pair, align 8
   %obj = alloca %quad, align 8
   %swap.tmp = alloca %pair, align 8
   ; obj = first 32 bytes of src.
   call void @llvm.memcpy.p0.p0.i64(ptr align 8 %obj, ptr align 8 %src, i64 32, i1 false)
   ; tmp = first 16 bytes of obj; afterwards bytes 16..31 of obj are zeroed.
   call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp, ptr align 8 %obj, i64 16, i1 false)
   %obj.tail = getelementptr inbounds i8, ptr %obj, i64 16
   call void @llvm.memset.p0.i64(ptr align 8 %obj.tail, i8 0, i64 16, i1 false)
   ; Inlined swap of 16 bytes between other and tmp, going through swap.tmp.
   call void @llvm.memcpy.p0.p0.i64(ptr align 8 %swap.tmp, ptr align 8 %tmp, i64 16, i1 false)
   call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp, ptr align 8 %other, i64 16, i1 false)
   call void @llvm.memcpy.p0.p0.i64(ptr align 8 %other, ptr align 8 %swap.tmp, i64 16, i1 false)
   ; dst = tmp, which at this point holds the bytes originally read from other.
   call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %tmp, i64 16, i1 false)
   ret void
 }
	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
	; RUN: opt -S -passes='default<O3>' %s | FileCheck %s

	; Aggregate types used for the stack temporaries in the test function:
	; %pair holds two i64 fields and %quad holds four i64 fields.
	; NOTE(review): the same two types and both intrinsic declarations already
	; appear near the top of this file; confirm whether this second copy is
	; intended to stay.
	%pair = type { i64, i64 }
	%quad = type { i64, i64, i64, i64 }

	; Memory intrinsics called by the test input (64-bit length variants).
	declare void @llvm.memcpy.p0.p0.i64(ptr noalias writeonly captures(none), ptr noalias readonly captures(none), i64, i1 immarg)
	declare void @llvm.memset.p0.i64(ptr writeonly captures(none), i8, i64, i1 immarg)

	; This test verifies that the default O3 pipeline canonicalizes struct allocas
	; to vectors only after memcpyopt has run. The input pattern is:
	;
	; memcpy tmp, obj, 16
	; memset obj + 16, 0, 16
	;
	; ----- SWAP(other, tmp) -----
	;
	; memcpy swap.tmp, tmp, 16
	; memcpy tmp, other, 16
	; memcpy other, swap.tmp, 16
	;
	; It swaps the first 16 bytes of other and tmp, but the first 16 bytes of tmp
	; are the same as the first 16 bytes of obj. This comes from real code from
	; DuckDB, where the swap function is inlined. If struct-to-vector canonicalization
	; runs before memcpyopt, swap.tmp gets promoted to an SSA value and we are stuck
	; saving tmp to swap.tmp. Delaying canonicalization until after memcpyopt lets
	; memcpyopt notice that tmp and obj share the same first 16 bytes, so swap.tmp
	; is no longer needed and the IR collapses to a single load/memmove/store.
	; Test input: copies 32 bytes from %src into a local object, moves its first
	; 16 bytes into a temporary, swaps that temporary with %other, and finally
	; writes the temporary to %dst. The autogenerated assertions expect the
	; optimized body to be one <2 x i64> load from %other, one 16-byte memmove
	; from %src to %other, and one <2 x i64> store to %dst.
	; NOTE(review): @move_then_swap is also defined earlier in this file; two
	; definitions of the same symbol are not expected to parse together, so
	; confirm which copy is meant to stay.
	define void @move_then_swap(ptr %dst, ptr %src, ptr %other) {
	; CHECK-LABEL: define void @move_then_swap(
	; CHECK-SAME: ptr nofree writeonly captures(none) initializes((0, 16)) [[DST:%.*]], ptr nofree readonly captures(none) [[SRC:%.*]], ptr nofree captures(none) [[OTHER:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
	; CHECK-NEXT:  [[ENTRY:.*:]]
	; CHECK-NEXT:    [[TMP_SROA_0_0_COPYLOAD:%.*]] = load <2 x i64>, ptr [[OTHER]], align 8
	; CHECK-NEXT:    tail call void @llvm.memmove.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[OTHER]], ptr noundef nonnull align 8 dereferenceable(16) [[SRC]], i64 16, i1 false)
	; CHECK-NEXT:    store <2 x i64> [[TMP_SROA_0_0_COPYLOAD]], ptr [[DST]], align 8
	; CHECK-NEXT:    ret void
	;
	entry:
	  ; Stack temporaries: the moved value, the local source object, and the
	  ; scratch slot used by the inlined swap.
	  %tmp = alloca %pair, align 8
	  %obj = alloca %quad, align 8
	  %swap.tmp = alloca %pair, align 8
	  ; obj = first 32 bytes of src.
	  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %obj, ptr align 8 %src, i64 32, i1 false)
	  ; tmp = first 16 bytes of obj; afterwards bytes 16..31 of obj are zeroed.
	  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp, ptr align 8 %obj, i64 16, i1 false)
	  %obj.tail = getelementptr inbounds i8, ptr %obj, i64 16
	  call void @llvm.memset.p0.i64(ptr align 8 %obj.tail, i8 0, i64 16, i1 false)
	  ; Inlined swap of 16 bytes between other and tmp, going through swap.tmp.
	  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %swap.tmp, ptr align 8 %tmp, i64 16, i1 false)
	  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp, ptr align 8 %other, i64 16, i1 false)
	  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %other, ptr align 8 %swap.tmp, i64 16, i1 false)
	  ; dst = tmp, which at this point holds the bytes originally read from other.
	  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %tmp, i64 16, i1 false)
	  ret void
	}