test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll - llvm-project/llvm - Git at Google

 ; REQUIRES: aarch64-registered-target

 ; This test needs to be target specific due to the cost estimate in the output.

 ; RUN: opt -lower-matrix-intrinsics -pass-remarks=lower-matrix-intrinsics -mtriple=arm64-apple-iphoneos -S < %s 2>&1 | FileCheck  %s

 ; Test the propagation of matrix expressions along the inlined-at chain. The IR
 ; in the test roughly corresponds to the C++ code below, with the IR containing
 ; references to a few more functions.

 ; matrix.h
 ; template <typename Ty, unsigned R, unsigned C>
 ; struct Matrix {
 ;   using matrix_t = Ty __attribute__((matrix_type(R, C)));
 ;
 ;   matrix_t value;
 ; };
 ;
 ; add.h:
 ; template <typename Ty, unsigned R, unsigned C>
 ; Matrix<Ty, R, C> add(Matrix<Ty, R, C> M1, Matrix<Ty, R, C> M2) {
 ;   Matrix<Ty, R, C> Result;
 ;   Result.value = __builtin_matrix_add(M1.value, M2.value);
 ;   return Result;
 ; }
 ;
 ; load.h:
 ; template <typename Ty, unsigned R, unsigned C>
 ; Matrix<Ty, R, C> load(Ty *Ptr) {
 ;   Matrix<Ty, R, C> Result;
 ;   Result.value = *reinterpret_cast <typename Matrix<Ty, R, C>::matrix_t *>(Ptr);
 ;   return Result;
 ; }
 ;
 ; store.h:
 ; template <typename Ty, unsigned R, unsigned C>
 ; void store(Matrix<Ty, R, C> M1, Ty *Ptr) {
 ;   *reinterpret_cast<typename decltype(M1)::matrix_t *>(Ptr) = M1.value;
 ; }
 ;
 ; toplevel.cpp
 ; void test(double *A, double *B, double *C) {
 ;   store(add(load<double, 3, 5>(A), load<double, 3, 5>(B)), C);
 ; }
 ;

 ; Module-level target description. The triple selects AArch64 on iOS; the
 ; header of this test states that it is target specific because of the cost
 ; estimate in the output.
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "aarch64-apple-ios"

 ; CHECK-LABEL: remark: load.h:41:43: Lowered with 0 stores, 10 loads, 0 compute ops
 ; CHECK-NEXT:  load(addr %A)

 ; CHECK-LABEL: remark: load.h:41:43: Lowered with 0 stores, 10 loads, 0 compute ops
 ; CHECK-NEXT:  column.major.load.3x5.double(addr %B, 5)

 ; CHECK-LABEL: remark: load.h:41:11: Lowered with 0 stores, 1 loads, 0 compute ops
 ; CHECK-NEXT: load(addr %D)

 ; CHECK-LABEL: remark: assign.h:32:43: Lowered with 0 stores, 10 loads, 0 compute ops
 ; CHECK-NEXT:  load(addr %A)

 ; CHECK-LABEL: remark: assign.h:32:43: Lowered with 0 stores, 10 loads, 0 compute ops
 ; CHECK-NEXT:  column.major.load.3x5.double(addr %B, 5)

 ; CHECK-LABEL: remark: toplevel.c:410:0: Lowered with 10 stores, 20 loads, 10 compute ops
 ; CHECK-NEXT:  store(
 ; CHECK-NEXT:   fadd(
 ; CHECK-NEXT:    load(addr %A),
 ; CHECK-NEXT:    column.major.load.3x5.double(addr %B, 5)),
 ; CHECK-NEXT:   addr %C)

 ; CHECK-LABEL: remark: toplevel.c:510:0: Lowered with 1 stores, 1 loads, 8 compute ops
 ; CHECK-NEXT:  store(
 ; CHECK-NEXT:   transpose.1x2.float(transpose.2x1.float(load(addr %D))),
 ; CHECK-NEXT:   addr %D)

 ; CHECK-LABEL: remark: add.h:66:11: Lowered with 0 stores, 0 loads, 10 compute ops
 ; CHECK-NEXT:  fadd(
 ; CHECK-NEXT:   addr %A,
 ; CHECK-NEXT:   scalar)

 ; CHECK-LABEL: remark: store.h:10:11: Lowered with 10 stores, 0 loads, 0 compute ops
 ; CHECK-NEXT:  store(
 ; CHECK-NEXT:   scalar,
 ; CHECK-NEXT:   addr %C)

 ; CHECK-LABEL: remark: store.h:66:11: Lowered with 1 stores, 0 loads, 0 compute ops
 ; CHECK-NEXT:  store(
 ; CHECK-NEXT:  scalar,
 ; CHECK-NEXT:  addr %D)

 ; CHECK-LABEL: remark: transpose.h:13:11: Lowered with 0 stores, 0 loads, 8 compute ops
 ; CHECK-NEXT:  transpose.1x2.float(transpose.2x1.float(addr %D))

 ; Function under test. It contains two independent matrix expressions; every
 ; instruction carries its own !dbg location, and the function itself is
 ; attached to subprogram !16.
 define void @toplevel(<15 x double>* %A, double* %B, <15 x double>* %C, <2 x float>* %D) !dbg !16 {
 entry:
   ; Expression 1: store(fadd(load %A, column-major load of %B), %C).
   ; %a is a plain 15-element vector load; %b goes through the matrix load
   ; intrinsic with operands (pointer, i64 5, i1 false, i32 3, i32 5), which
   ; the expected remarks render as column.major.load.3x5.double(addr %B, 5).
   %a = load <15 x double>, <15 x double> *%A, align 16, !dbg !3791
   %b = call <15 x double> @llvm.matrix.column.major.load(double* %B, i64 5, i1 false, i32 3, i32 5), !dbg !3793
   %c  = fadd <15 x double> %a, %b, !dbg !100
   store <15 x double> %c, <15 x double> *%C, align 16, !dbg !102

   ; Expression 2: load a <2 x float> from %D, transpose it with shape
   ; operands (2, 1) and then (1, 2), and store the result back to %D.
   ; Both transposes share the same debug location (!106).
   %load = load <2 x float>, <2 x float>* %D, !dbg !104
   %t1 = call <2 x float> @llvm.matrix.transpose(<2 x float> %load, i32 2, i32 1), !dbg !106
   %t2 = call <2 x float> @llvm.matrix.transpose(<2 x float> %t1, i32 1, i32 2), !dbg !106
   store <2 x float> %t2, <2 x float>* %D, !dbg !108
   ret void
 }

 ; Declarations of the two matrix intrinsics called by @toplevel.
 declare <15 x double> @llvm.matrix.column.major.load(double*, i64, i1, i32, i32)
 declare <2 x float> @llvm.matrix.transpose(<2 x float>, i32, i32)

 ; Debug-info skeleton: one compile unit (!0) plus one DIFile/DISubprogram pair
 ; for each source file name that appears in the expected remarks.
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4}

 !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
 !1 = !DIFile(filename: "load.h", directory: "/test")
 !2 = !{}
 !3 = !{i32 2, !"Dwarf Version", i32 4}
 !4 = !{i32 2, !"Debug Info Version", i32 3}
 ; load_fn, file load.h (!1).
 !5 = distinct !DISubprogram(name: "load_fn", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
 ; toplevel, file toplevel.c (!17); this is the subprogram attached to @toplevel.
 !17 = !DIFile(filename: "toplevel.c", directory: "/test")
 !16 = distinct !DISubprogram(name: "toplevel", scope: !1, file: !17, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
 ; assign, file assign.h (!18).
 !18 = !DIFile(filename: "assign.h", directory: "/test")
 !19 = distinct !DISubprogram(name: "assign", scope: !1, file: !18, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)

 ; add_fn, file add.h (!20).
 !20 = !DIFile(filename: "add.h", directory: "/test")
 !21 = distinct !DISubprogram(name: "add_fn", scope: !1, file: !20, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)

 ; store_fn, file store.h (!22).
 !22 = !DIFile(filename: "store.h", directory: "/test")
 !23 = distinct !DISubprogram(name: "store_fn", scope: !1, file: !22, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)

 ; transpose, file transpose.h (!24).
 !24 = !DIFile(filename: "transpose.h", directory: "/test")
 !25 = distinct !DISubprogram(name: "transpose", scope: !1, file: !24, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)


 ; Type and variable nodes shared by every subprogram above: all of them use
 ; subroutine type !6 and retainedNodes list !12.
 !6 = !DISubroutineType(types: !7)
 !7 = !{null, !8, !8, !11}
 !8 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !9)
 !9 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 32, align: 32)
 !10 = !DIBasicType(name: "float", size: 32, align: 32, encoding: DW_ATE_float)
 !11 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
 !12 = !{!13}
 !13 = !DILocalVariable(name: "a", arg: 1, scope: !5, file: !1, line: 1, type: !8)
 ; NOTE(review): !14 is not referenced by any instruction in @toplevel.
 !14 = !DILocation(line: 1, column: 27, scope: !5)

 ; Inlined-at chains for the two 3x5 loads (%a uses !3791, %b uses !3793):
 ;   !3791: load.h 41:43 (load_fn) -> !3795 assign.h 32:43 (assign) -> !3792 toplevel.c 405:3
 ;   !3793: load.h 41:43 (load_fn) -> !3796 assign.h 32:43 (assign) -> !3794 toplevel.c 406:11
 !3791 = !DILocation(line: 41, column: 43, scope: !5, inlinedAt: !3795)
 !3792 = !DILocation(line: 405, column: 3, scope: !16)
 !3793 = !DILocation(line: 41, column: 43, scope: !5, inlinedAt: !3796)
 !3794 = !DILocation(line: 406, column: 11, scope: !16)
 !3795 = !DILocation(line: 32, column: 43, scope: !19, inlinedAt: !3792)
 !3796 = !DILocation(line: 32, column: 43, scope: !19, inlinedAt: !3794)

 ; fadd (%c): add.h 66:11 in add_fn, inlined at toplevel.c 410:11.
 !100 = !DILocation(line: 66, column: 11, scope: !21, inlinedAt: !101)
 !101 = !DILocation(line: 410, column: 11, scope: !16)

 ; Store to %C: store.h 10:11 in store_fn, inlined at toplevel.c 410:0.
 !102 = !DILocation(line: 10, column: 11, scope: !23, inlinedAt: !103)
 !103 = !DILocation(line: 410, column: 0, scope: !16)

 ; Load from %D: load.h 41:11 in load_fn, inlined at !101 (toplevel.c 410:11).
 ; NOTE(review): !105 is not referenced by anything visible in this file;
 ; confirm whether !104 was meant to chain to it instead of !101.
 !104 = !DILocation(line: 41, column: 11, scope: !5, inlinedAt: !101)
 !105 = !DILocation(line: 500, column: 11, scope: !16)

 ; Both transposes: transpose.h 13:11, inlined at !101 (toplevel.c 410:11).
 ; NOTE(review): !107 is likewise unreferenced in the visible lines; confirm
 ; whether !106 was meant to chain to it.
 !106 = !DILocation(line: 13, column: 11, scope: !25, inlinedAt: !101)
 !107 = !DILocation(line: 510, column: 11, scope: !16)

 ; Store to %D: store.h 66:11 in store_fn, inlined at toplevel.c 510:0.
 !108 = !DILocation(line: 66, column: 11, scope: !23, inlinedAt: !109)
 !109 = !DILocation(line: 510, column: 0, scope: !16)
	; REQUIRES: aarch64-registered-target

	; This test needs to be target specific due to the cost estimate in the output.

	; RUN: opt -lower-matrix-intrinsics -pass-remarks=lower-matrix-intrinsics -mtriple=arm64-apple-iphoneos -S < %s 2>&1 | FileCheck %s

	; Test the propagation of matrix expressions along the inlined-at chain. The IR
	; in the test roughly corresponds to the C++ code below, with the IR containing
	; references to a few more functions.

	; matrix.h
	; template <typename Ty, unsigned R, unsigned C>
	; struct Matrix {
	; using matrix_t = Ty __attribute__((matrix_type(R, C)));
	;
	; matrix_t value;
	; };
	;
	; add.h:
	; template <typename Ty, unsigned R, unsigned C>
	; Matrix<Ty, R, C> add(Matrix<Ty, R, C> M1, Matrix<Ty, R, C> M2) {
	; Matrix<Ty, R, C> Result;
	; Result.value = __builtin_matrix_add(M1.value, M2.value);
	; return Result;
	; }
	;
	; load.h:
	; template <typename Ty, unsigned R, unsigned C>
	; Matrix<Ty, R, C> load(Ty *Ptr) {
	; Matrix<Ty, R, C> Result;
	; Result.value = *reinterpret_cast <typename Matrix<Ty, R, C>::matrix_t *>(Ptr);
	; return Result;
	; }
	;
	; store.h:
	; template <typename Ty, unsigned R, unsigned C>
	; void store(Matrix<Ty, R, C> M1, Ty *Ptr) {
	; *reinterpret_cast<typename decltype(M1)::matrix_t *>(Ptr) = M1.value;
	; }
	;
	; toplevel.cpp
	; void test(double *A, double *B, double *C) {
	; store(add(load<double, 3, 5>(A), load<double, 3, 5>(B)), C);
	; }
	;

	; Module-level target description. The triple selects AArch64 on iOS; the
	; header of this test states that it is target specific because of the cost
	; estimate in the output.
	target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
	target triple = "aarch64-apple-ios"

	; CHECK-LABEL: remark: load.h:41:43: Lowered with 0 stores, 10 loads, 0 compute ops
	; CHECK-NEXT: load(addr %A)

	; CHECK-LABEL: remark: load.h:41:43: Lowered with 0 stores, 10 loads, 0 compute ops
	; CHECK-NEXT: column.major.load.3x5.double(addr %B, 5)

	; CHECK-LABEL: remark: load.h:41:11: Lowered with 0 stores, 1 loads, 0 compute ops
	; CHECK-NEXT: load(addr %D)

	; CHECK-LABEL: remark: assign.h:32:43: Lowered with 0 stores, 10 loads, 0 compute ops
	; CHECK-NEXT: load(addr %A)

	; CHECK-LABEL: remark: assign.h:32:43: Lowered with 0 stores, 10 loads, 0 compute ops
	; CHECK-NEXT: column.major.load.3x5.double(addr %B, 5)

	; CHECK-LABEL: remark: toplevel.c:410:0: Lowered with 10 stores, 20 loads, 10 compute ops
	; CHECK-NEXT: store(
	; CHECK-NEXT: fadd(
	; CHECK-NEXT: load(addr %A),
	; CHECK-NEXT: column.major.load.3x5.double(addr %B, 5)),
	; CHECK-NEXT: addr %C)

	; CHECK-LABEL: remark: toplevel.c:510:0: Lowered with 1 stores, 1 loads, 8 compute ops
	; CHECK-NEXT: store(
	; CHECK-NEXT: transpose.1x2.float(transpose.2x1.float(load(addr %D))),
	; CHECK-NEXT: addr %D)

	; CHECK-LABEL: remark: add.h:66:11: Lowered with 0 stores, 0 loads, 10 compute ops
	; CHECK-NEXT: fadd(
	; CHECK-NEXT: addr %A,
	; CHECK-NEXT: scalar)

	; CHECK-LABEL: remark: store.h:10:11: Lowered with 10 stores, 0 loads, 0 compute ops
	; CHECK-NEXT: store(
	; CHECK-NEXT: scalar,
	; CHECK-NEXT: addr %C)

	; CHECK-LABEL: remark: store.h:66:11: Lowered with 1 stores, 0 loads, 0 compute ops
	; CHECK-NEXT: store(
	; CHECK-NEXT: scalar,
	; CHECK-NEXT: addr %D)

	; CHECK-LABEL: remark: transpose.h:13:11: Lowered with 0 stores, 0 loads, 8 compute ops
	; CHECK-NEXT: transpose.1x2.float(transpose.2x1.float(addr %D))

	; Function under test. It contains two independent matrix expressions; every
	; instruction carries its own !dbg location, and the function itself is
	; attached to subprogram !16.
	define void @toplevel(<15 x double>* %A, double* %B, <15 x double>* %C, <2 x float>* %D) !dbg !16 {
	entry:
	; Expression 1: store(fadd(load %A, column-major load of %B), %C).
	; %a is a plain 15-element vector load; %b goes through the matrix load
	; intrinsic with operands (pointer, i64 5, i1 false, i32 3, i32 5), which
	; the expected remarks render as column.major.load.3x5.double(addr %B, 5).
	%a = load <15 x double>, <15 x double> *%A, align 16, !dbg !3791
	%b = call <15 x double> @llvm.matrix.column.major.load(double* %B, i64 5, i1 false, i32 3, i32 5), !dbg !3793
	%c = fadd <15 x double> %a, %b, !dbg !100
	store <15 x double> %c, <15 x double> *%C, align 16, !dbg !102

	; Expression 2: load a <2 x float> from %D, transpose it with shape
	; operands (2, 1) and then (1, 2), and store the result back to %D.
	; Both transposes share the same debug location (!106).
	%load = load <2 x float>, <2 x float>* %D, !dbg !104
	%t1 = call <2 x float> @llvm.matrix.transpose(<2 x float> %load, i32 2, i32 1), !dbg !106
	%t2 = call <2 x float> @llvm.matrix.transpose(<2 x float> %t1, i32 1, i32 2), !dbg !106
	store <2 x float> %t2, <2 x float>* %D, !dbg !108
	ret void
	}

	; Declarations of the two matrix intrinsics called by @toplevel.
	declare <15 x double> @llvm.matrix.column.major.load(double*, i64, i1, i32, i32)
	declare <2 x float> @llvm.matrix.transpose(<2 x float>, i32, i32)

	; Debug-info skeleton: one compile unit (!0) plus one DIFile/DISubprogram pair
	; for each source file name that appears in the expected remarks.
	!llvm.dbg.cu = !{!0}
	!llvm.module.flags = !{!3, !4}

	!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
	!1 = !DIFile(filename: "load.h", directory: "/test")
	!2 = !{}
	!3 = !{i32 2, !"Dwarf Version", i32 4}
	!4 = !{i32 2, !"Debug Info Version", i32 3}
	; load_fn, file load.h (!1).
	!5 = distinct !DISubprogram(name: "load_fn", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
	; toplevel, file toplevel.c (!17); this is the subprogram attached to @toplevel.
	!17 = !DIFile(filename: "toplevel.c", directory: "/test")
	!16 = distinct !DISubprogram(name: "toplevel", scope: !1, file: !17, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
	; assign, file assign.h (!18).
	!18 = !DIFile(filename: "assign.h", directory: "/test")
	!19 = distinct !DISubprogram(name: "assign", scope: !1, file: !18, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)

	; add_fn, file add.h (!20).
	!20 = !DIFile(filename: "add.h", directory: "/test")
	!21 = distinct !DISubprogram(name: "add_fn", scope: !1, file: !20, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)

	; store_fn, file store.h (!22).
	!22 = !DIFile(filename: "store.h", directory: "/test")
	!23 = distinct !DISubprogram(name: "store_fn", scope: !1, file: !22, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)

	; transpose, file transpose.h (!24).
	!24 = !DIFile(filename: "transpose.h", directory: "/test")
	!25 = distinct !DISubprogram(name: "transpose", scope: !1, file: !24, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)


	; Type and variable nodes shared by every subprogram above: all of them use
	; subroutine type !6 and retainedNodes list !12.
	!6 = !DISubroutineType(types: !7)
	!7 = !{null, !8, !8, !11}
	!8 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !9)
	!9 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 32, align: 32)
	!10 = !DIBasicType(name: "float", size: 32, align: 32, encoding: DW_ATE_float)
	!11 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
	!12 = !{!13}
	!13 = !DILocalVariable(name: "a", arg: 1, scope: !5, file: !1, line: 1, type: !8)
	; NOTE(review): !14 is not referenced by any instruction in @toplevel.
	!14 = !DILocation(line: 1, column: 27, scope: !5)

	; Inlined-at chains for the two 3x5 loads (%a uses !3791, %b uses !3793):
	;   !3791: load.h 41:43 (load_fn) -> !3795 assign.h 32:43 (assign) -> !3792 toplevel.c 405:3
	;   !3793: load.h 41:43 (load_fn) -> !3796 assign.h 32:43 (assign) -> !3794 toplevel.c 406:11
	!3791 = !DILocation(line: 41, column: 43, scope: !5, inlinedAt: !3795)
	!3792 = !DILocation(line: 405, column: 3, scope: !16)
	!3793 = !DILocation(line: 41, column: 43, scope: !5, inlinedAt: !3796)
	!3794 = !DILocation(line: 406, column: 11, scope: !16)
	!3795 = !DILocation(line: 32, column: 43, scope: !19, inlinedAt: !3792)
	!3796 = !DILocation(line: 32, column: 43, scope: !19, inlinedAt: !3794)

	; fadd (%c): add.h 66:11 in add_fn, inlined at toplevel.c 410:11.
	!100 = !DILocation(line: 66, column: 11, scope: !21, inlinedAt: !101)
	!101 = !DILocation(line: 410, column: 11, scope: !16)

	; Store to %C: store.h 10:11 in store_fn, inlined at toplevel.c 410:0.
	!102 = !DILocation(line: 10, column: 11, scope: !23, inlinedAt: !103)
	!103 = !DILocation(line: 410, column: 0, scope: !16)

	; Load from %D: load.h 41:11 in load_fn, inlined at !101 (toplevel.c 410:11).
	; NOTE(review): !105 is not referenced by anything visible in this file;
	; confirm whether !104 was meant to chain to it instead of !101.
	!104 = !DILocation(line: 41, column: 11, scope: !5, inlinedAt: !101)
	!105 = !DILocation(line: 500, column: 11, scope: !16)

	; Both transposes: transpose.h 13:11, inlined at !101 (toplevel.c 410:11).
	; NOTE(review): !107 is likewise unreferenced in the visible lines; confirm
	; whether !106 was meant to chain to it.
	!106 = !DILocation(line: 13, column: 11, scope: !25, inlinedAt: !101)
	!107 = !DILocation(line: 510, column: 11, scope: !16)

	; Store to %D: store.h 66:11 in store_fn, inlined at toplevel.c 510:0.
	!108 = !DILocation(line: 66, column: 11, scope: !23, inlinedAt: !109)
	!109 = !DILocation(line: 510, column: 0, scope: !16)