; llvm/test/Transforms/InstCombine/matrix-multiplication-negation.ll - llvm-project - Git at Google

 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -passes=instcombine -S | FileCheck %s

 ; The result has the fewest vector elements between the result and the two operands so the negation can be moved there
 ; (%a is 2x3 = 6 elements, %b is 3x1 = 3 elements, the result is 2x1 = 2 elements.)
 ; Note: the rewritten call in the expected output no longer carries the `tail` marker.
 define <2 x double> @test_negation_move_to_result(<6 x double> %a, <3 x double> %b) {
 ; CHECK-LABEL: @test_negation_move_to_result(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.matrix.multiply.v2f64.v6f64.v3f64(<6 x double> [[A:%.*]], <3 x double> [[B:%.*]], i32 2, i32 3, i32 1)
 ; CHECK-NEXT:    [[TMP2:%.*]] = fneg <2 x double> [[TMP1]]
 ; CHECK-NEXT:    ret <2 x double> [[TMP2]]
 ;
   %a.neg = fneg <6 x double> %a
   %res = tail call <2 x double> @llvm.matrix.multiply.v2f64.v6f64.v3f64(<6 x double> %a.neg, <3 x double> %b, i32 2, i32 3, i32 1)
   ret <2 x double> %res
 }

 ; The result has the fewest vector elements between the result and the two operands so the negation can be moved there
 ; Fast flag should be preserved
 ; (In the expected output both the call and the new fneg on the result carry `fast`.)
 define <2 x double> @test_negation_move_to_result_with_fastflags(<6 x double> %a, <3 x double> %b) {
 ; CHECK-LABEL: @test_negation_move_to_result_with_fastflags(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @llvm.matrix.multiply.v2f64.v6f64.v3f64(<6 x double> [[A:%.*]], <3 x double> [[B:%.*]], i32 2, i32 3, i32 1)
 ; CHECK-NEXT:    [[TMP2:%.*]] = fneg fast <2 x double> [[TMP1]]
 ; CHECK-NEXT:    ret <2 x double> [[TMP2]]
 ;
   %a.neg = fneg <6 x double> %a
   %res = tail call fast <2 x double> @llvm.matrix.multiply.v2f64.v6f64.v3f64(<6 x double> %a.neg, <3 x double> %b, i32 2, i32 3, i32 1)
   ret <2 x double> %res
 }

 ; Same shapes as @test_negation_move_to_result, but the call carries only the `nnan` flag.
 ; The expected output keeps `nnan` on the call and also places it on the fneg moved to the result.
 define <2 x double> @test_negation_move_to_result_with_nnan_flag(<6 x double> %a, <3 x double> %b) {
 ; CHECK-LABEL: @test_negation_move_to_result_with_nnan_flag(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call nnan <2 x double> @llvm.matrix.multiply.v2f64.v6f64.v3f64(<6 x double> [[A:%.*]], <3 x double> [[B:%.*]], i32 2, i32 3, i32 1)
 ; CHECK-NEXT:    [[TMP2:%.*]] = fneg nnan <2 x double> [[TMP1]]
 ; CHECK-NEXT:    ret <2 x double> [[TMP2]]
 ;
   %a.neg = fneg <6 x double> %a
   %res = tail call nnan <2 x double> @llvm.matrix.multiply.v2f64.v6f64.v3f64(<6 x double> %a.neg, <3 x double> %b, i32 2, i32 3, i32 1)
   ret <2 x double> %res
 }

 ; Same shapes as @test_negation_move_to_result, but the call carries only the `nsz` flag.
 ; The expected output keeps `nsz` on the call and also places it on the fneg moved to the result.
 define <2 x double> @test_negation_move_to_result_with_nsz_flag(<6 x double> %a, <3 x double> %b) {
 ; CHECK-LABEL: @test_negation_move_to_result_with_nsz_flag(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call nsz <2 x double> @llvm.matrix.multiply.v2f64.v6f64.v3f64(<6 x double> [[A:%.*]], <3 x double> [[B:%.*]], i32 2, i32 3, i32 1)
 ; CHECK-NEXT:    [[TMP2:%.*]] = fneg nsz <2 x double> [[TMP1]]
 ; CHECK-NEXT:    ret <2 x double> [[TMP2]]
 ;
   %a.neg = fneg <6 x double> %a
   %res = tail call nsz <2 x double> @llvm.matrix.multiply.v2f64.v6f64.v3f64(<6 x double> %a.neg, <3 x double> %b, i32 2, i32 3, i32 1)
   ret <2 x double> %res
 }

 ; Here `fast` is on the original fneg rather than on the call.
 ; The expected output still moves the negation to the result, and neither the call
 ; nor the new fneg carries any fast-math flag.
 define <2 x double> @test_negation_move_to_result_with_fastflag_on_negation(<6 x double> %a, <3 x double> %b) {
 ; CHECK-LABEL: @test_negation_move_to_result_with_fastflag_on_negation(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.matrix.multiply.v2f64.v6f64.v3f64(<6 x double> [[A:%.*]], <3 x double> [[B:%.*]], i32 2, i32 3, i32 1)
 ; CHECK-NEXT:    [[TMP2:%.*]] = fneg <2 x double> [[TMP1]]
 ; CHECK-NEXT:    ret <2 x double> [[TMP2]]
 ;
   %a.neg = fneg fast<6 x double> %a
   %res = tail call <2 x double> @llvm.matrix.multiply.v2f64.v6f64.v3f64(<6 x double> %a.neg, <3 x double> %b, i32 2, i32 3, i32 1)
   ret <2 x double> %res
 }

 ; %b has the fewest vector elements between the result and the two operands so the negation can be moved there
 ; (%a is 9x3 = 27 elements, %b is 3x1 = 3 elements, the result is 9x1 = 9 elements.)
 define <9 x double> @test_move_negation_to_second_operand(<27 x double> %a, <3 x double> %b) {
 ; CHECK-LABEL: @test_move_negation_to_second_operand(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fneg <3 x double> [[B:%.*]]
 ; CHECK-NEXT:    [[RES:%.*]] = tail call <9 x double> @llvm.matrix.multiply.v9f64.v27f64.v3f64(<27 x double> [[A:%.*]], <3 x double> [[TMP1]], i32 9, i32 3, i32 1)
 ; CHECK-NEXT:    ret <9 x double> [[RES]]
 ;
   %a.neg = fneg <27 x double> %a
   %res = tail call <9 x double> @llvm.matrix.multiply.v9f64.v27f64.v3f64(<27 x double> %a.neg, <3 x double> %b, i32 9, i32 3, i32 1)
   ret <9 x double> %res
 }

 ; %b has the fewest vector elements between the result and the two operands so the negation can be moved there
 ; Fast flag should be preserved
 ; (In the expected output `fast` stays on the call; the new fneg on %b has no fast-math flags.)
 define <9 x double> @test_move_negation_to_second_operand_with_fast_flags(<27 x double> %a, <3 x double> %b) {
 ; CHECK-LABEL: @test_move_negation_to_second_operand_with_fast_flags(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fneg <3 x double> [[B:%.*]]
 ; CHECK-NEXT:    [[RES:%.*]] = tail call fast <9 x double> @llvm.matrix.multiply.v9f64.v27f64.v3f64(<27 x double> [[A:%.*]], <3 x double> [[TMP1]], i32 9, i32 3, i32 1)
 ; CHECK-NEXT:    ret <9 x double> [[RES]]
 ;
   %a.neg = fneg <27 x double> %a
   %res = tail call fast <9 x double> @llvm.matrix.multiply.v9f64.v27f64.v3f64(<27 x double> %a.neg, <3 x double> %b, i32 9, i32 3, i32 1)
   ret <9 x double> %res
 }

 ; The result has the fewest vector elements between the result and the two operands so the negation can be moved there
 ; (%a is 1x3 = 3 elements, the negated %b is 3x2 = 6 elements, the result is 1x2 = 2 elements.)
 define <2 x double> @test_negation_move_to_result_from_second_operand(<3 x double> %a, <6 x double> %b){
 ; CHECK-LABEL: @test_negation_move_to_result_from_second_operand(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.matrix.multiply.v2f64.v3f64.v6f64(<3 x double> [[A:%.*]], <6 x double> [[B:%.*]], i32 1, i32 3, i32 2)
 ; CHECK-NEXT:    [[TMP2:%.*]] = fneg <2 x double> [[TMP1]]
 ; CHECK-NEXT:    ret <2 x double> [[TMP2]]
 ;
   %b.neg = fneg <6 x double> %b
   %res = tail call <2 x double> @llvm.matrix.multiply.v2f64.v3f64.v6f64(<3 x double> %a, <6 x double> %b.neg, i32 1, i32 3, i32 2)
   ret <2 x double> %res
 }

 ; %a has the fewest vector elements between the result and the two operands so the negation can be moved there
 ; (%a is 1x3 = 3 elements, the negated %b is 3x9 = 27 elements, the result is 1x9 = 9 elements.)
 define <9 x double> @test_move_negation_to_first_operand(<3 x double> %a, <27 x double> %b) {
 ; CHECK-LABEL: @test_move_negation_to_first_operand(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fneg <3 x double> [[A:%.*]]
 ; CHECK-NEXT:    [[RES:%.*]] = tail call <9 x double> @llvm.matrix.multiply.v9f64.v3f64.v27f64(<3 x double> [[TMP1]], <27 x double> [[B:%.*]], i32 1, i32 3, i32 9)
 ; CHECK-NEXT:    ret <9 x double> [[RES]]
 ;
   %b.neg = fneg <27 x double> %b
   %res = tail call <9 x double> @llvm.matrix.multiply.v9f64.v3f64.v27f64(<3 x double> %a, <27 x double> %b.neg, i32 1, i32 3, i32 9)
   ret <9 x double> %res
 }

 ; %a has the fewest vector elements between the result and the two operands so the negation is not moved
 ; (The negation is already on %a: %a is 3x1 = 3 elements, %b is 1x5 = 5 elements, the result is 3x5 = 15 elements.)
 define <15 x double> @test_negation_not_moved(<3 x double> %a, <5 x double> %b) {
 ; CHECK-LABEL: @test_negation_not_moved(
 ; CHECK-NEXT:    [[A_NEG:%.*]] = fneg <3 x double> [[A:%.*]]
 ; CHECK-NEXT:    [[RES:%.*]] = tail call <15 x double> @llvm.matrix.multiply.v15f64.v3f64.v5f64(<3 x double> [[A_NEG]], <5 x double> [[B:%.*]], i32 3, i32 1, i32 5)
 ; CHECK-NEXT:    ret <15 x double> [[RES]]
 ;
   %a.neg = fneg <3 x double> %a
   %res = tail call <15 x double> @llvm.matrix.multiply.v15f64.v3f64.v5f64(<3 x double> %a.neg, <5 x double> %b, i32 3, i32 1, i32 5)
   ret <15 x double> %res
 }

 ; %b has the fewest vector elements between the result and the two operands so the negation is not moved
 ; (The negation is already on %b: %a is 5x1 = 5 elements, %b is 1x3 = 3 elements, the result is 5x3 = 15 elements.)
 define <15 x double> @test_negation_not_moved_second_operand(<5 x double> %a, <3 x double> %b) {
 ; CHECK-LABEL: @test_negation_not_moved_second_operand(
 ; CHECK-NEXT:    [[B_NEG:%.*]] = fneg <3 x double> [[B:%.*]]
 ; CHECK-NEXT:    [[RES:%.*]] = tail call <15 x double> @llvm.matrix.multiply.v15f64.v5f64.v3f64(<5 x double> [[A:%.*]], <3 x double> [[B_NEG]], i32 5, i32 1, i32 3)
 ; CHECK-NEXT:    ret <15 x double> [[RES]]
 ;
   %b.neg = fneg <3 x double> %b
   %res = tail call <15 x double> @llvm.matrix.multiply.v15f64.v5f64.v3f64(<5 x double> %a, <3 x double> %b.neg, i32 5, i32 1, i32 3)
   ret <15 x double> %res
 }

 ; The negation is on the 15-element result, while operand %a has the smallest vector element count (3),
 ; so moving the negation from the result to %a would be cheaper.
 ; NOTE(review): the expected output below leaves the fneg on the result, i.e. the negation is
 ; currently not moved; confirm whether this documents a missed optimization or intended behavior.
 define <15 x double> @test_negation_on_result(<3 x double> %a, <5 x double> %b) {
 ; CHECK-LABEL: @test_negation_on_result(
 ; CHECK-NEXT:    [[RES:%.*]] = tail call <15 x double> @llvm.matrix.multiply.v15f64.v3f64.v5f64(<3 x double> [[A:%.*]], <5 x double> [[B:%.*]], i32 3, i32 1, i32 5)
 ; CHECK-NEXT:    [[RES_2:%.*]] = fneg <15 x double> [[RES]]
 ; CHECK-NEXT:    ret <15 x double> [[RES_2]]
 ;
   %res = tail call <15 x double> @llvm.matrix.multiply.v15f64.v3f64.v5f64(<3 x double> %a, <5 x double> %b, i32 3, i32 1, i32 5)
   %res.2 = fneg <15 x double> %res
   ret <15 x double> %res.2
 }

 ; both negations can be deleted
 ; (both operands are negated, so the two sign flips cancel; here the result is the smallest vector: 6 / 3 -> 2 elements)
 define <2 x double> @test_with_two_operands_negated1(<6 x double> %a, <3 x double> %b){
 ; CHECK-LABEL: @test_with_two_operands_negated1(
 ; CHECK-NEXT:    [[RES:%.*]] = tail call <2 x double> @llvm.matrix.multiply.v2f64.v6f64.v3f64(<6 x double> [[A:%.*]], <3 x double> [[B:%.*]], i32 2, i32 3, i32 1)
 ; CHECK-NEXT:    ret <2 x double> [[RES]]
 ;
   %a.neg = fneg <6 x double> %a
   %b.neg = fneg <3 x double> %b
   %res = tail call <2 x double> @llvm.matrix.multiply.v2f64.v6f64.v3f64(<6 x double> %a.neg, <3 x double> %b.neg, i32 2, i32 3, i32 1)
   ret <2 x double> %res
 }

 ; both negations will be removed
 ; (same as @test_with_two_operands_negated1, but here %b is the smallest vector: 27 / 3 -> 9 elements)
 define <9 x double> @test_with_two_operands_negated2(<27 x double> %a, <3 x double> %b){
 ; CHECK-LABEL: @test_with_two_operands_negated2(
 ; CHECK-NEXT:    [[RES:%.*]] = tail call <9 x double> @llvm.matrix.multiply.v9f64.v27f64.v3f64(<27 x double> [[A:%.*]], <3 x double> [[B:%.*]], i32 9, i32 3, i32 1)
 ; CHECK-NEXT:    ret <9 x double> [[RES]]
 ;
   %a.neg = fneg <27 x double> %a
   %b.neg = fneg <3 x double> %b
   %res = tail call <9 x double> @llvm.matrix.multiply.v9f64.v27f64.v3f64(<27 x double> %a.neg, <3 x double> %b.neg, i32 9, i32 3, i32 1)
   ret <9 x double> %res
 }

 ; both negations will be removed
 ; (the `fast` flag on the call is kept in the expected output)
 define <9 x double> @test_with_two_operands_negated_with_fastflags(<27 x double> %a, <3 x double> %b){
 ; CHECK-LABEL: @test_with_two_operands_negated_with_fastflags(
 ; CHECK-NEXT:    [[RES:%.*]] = tail call fast <9 x double> @llvm.matrix.multiply.v9f64.v27f64.v3f64(<27 x double> [[A:%.*]], <3 x double> [[B:%.*]], i32 9, i32 3, i32 1)
 ; CHECK-NEXT:    ret <9 x double> [[RES]]
 ;
   %a.neg = fneg <27 x double> %a
   %b.neg = fneg <3 x double> %b
   %res = tail call fast <9 x double> @llvm.matrix.multiply.v9f64.v27f64.v3f64(<27 x double> %a.neg, <3 x double> %b.neg, i32 9, i32 3, i32 1)
   ret <9 x double> %res
 }

 ; both negations should be removed
 ; (operand sizes are swapped relative to @test_with_two_operands_negated2: %a has 3 elements, %b has 27)
 define <9 x double> @test_with_two_operands_negated2_commute(<3 x double> %a, <27 x double> %b){
 ; CHECK-LABEL: @test_with_two_operands_negated2_commute(
 ; CHECK-NEXT:    [[RES:%.*]] = call <9 x double> @llvm.matrix.multiply.v9f64.v3f64.v27f64(<3 x double> [[A:%.*]], <27 x double> [[B:%.*]], i32 1, i32 3, i32 9)
 ; CHECK-NEXT:    ret <9 x double> [[RES]]
 ;
   %a.neg = fneg <3 x double> %a
   %b.neg = fneg <27 x double> %b
   %res = call <9 x double> @llvm.matrix.multiply.v9f64.v3f64.v27f64(<3 x double> %a.neg, <27 x double> %b.neg, i32 1, i32 3, i32 9)
   ret <9 x double> %res
 }

 ; Both operands are negated and have the same element count (%a is 2x1, %b is 1x2, the result is 2x2).
 ; The expected output removes both negations.
 define <4 x double> @matrix_multiply_two_operands_negated_with_same_size(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: @matrix_multiply_two_operands_negated_with_same_size(
 ; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.matrix.multiply.v4f64.v2f64.v2f64(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], i32 2, i32 1, i32 2)
 ; CHECK-NEXT:    ret <4 x double> [[RES]]
 ;
   %a.neg = fneg <2 x double> %a
   %b.neg = fneg <2 x double> %b
   %res = call <4 x double> @llvm.matrix.multiply.v4f64.v2f64.v2f64(<2 x double> %a.neg, <2 x double> %b.neg, i32 2, i32 1, i32 2)
   ret <4 x double> %res
 }

 ; Both operands are negated and %a.neg has a second use (the shufflevector feeding the fadd).
 ; In the expected output both fnegs are gone: the multiply uses %a and %b directly, and the
 ; extra use is rewritten as a shuffle of %a that is subtracted from the multiply result.
 define <2 x double> @matrix_multiply_two_operands_with_multiple_uses(<6 x double> %a, <3 x double> %b) {
 ; CHECK-LABEL: @matrix_multiply_two_operands_with_multiple_uses(
 ; CHECK-NEXT:    [[RES:%.*]] = tail call <2 x double> @llvm.matrix.multiply.v2f64.v6f64.v3f64(<6 x double> [[A:%.*]], <3 x double> [[B:%.*]], i32 2, i32 3, i32 1)
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <6 x double> [[A]], <6 x double> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    [[RES_3:%.*]] = fsub <2 x double> [[RES]], [[TMP1]]
 ; CHECK-NEXT:    ret <2 x double> [[RES_3]]
 ;
   %a.neg = fneg <6 x double> %a
   %b.neg = fneg <3 x double> %b
   %res = tail call <2 x double> @llvm.matrix.multiply.v2f64.v6f64.v3f64(<6 x double> %a.neg, <3 x double> %b.neg, i32 2, i32 3, i32 1)
   %res.2 = shufflevector <6 x double> %a.neg, <6 x double> undef,
   <2 x i32>  <i32 0, i32 1>
   %res.3 = fadd <2 x double> %res.2, %res
   ret <2 x double> %res.3
 }

 ; Both operands are negated and both fnegs are also stored to memory.
 ; In the expected output the fnegs remain (they still feed the stores), but the multiply
 ; itself is rewritten to use the un-negated %a and %b.
 define <9 x double> @matrix_multiply_two_operands_with_multiple_uses2(<27 x double> %a, <3 x double> %b, ptr %a_loc, ptr %b_loc){
 ; CHECK-LABEL: @matrix_multiply_two_operands_with_multiple_uses2(
 ; CHECK-NEXT:    [[A_NEG:%.*]] = fneg <27 x double> [[A:%.*]]
 ; CHECK-NEXT:    [[B_NEG:%.*]] = fneg <3 x double> [[B:%.*]]
 ; CHECK-NEXT:    [[RES:%.*]] = tail call <9 x double> @llvm.matrix.multiply.v9f64.v27f64.v3f64(<27 x double> [[A]], <3 x double> [[B]], i32 9, i32 3, i32 1)
 ; CHECK-NEXT:    store <27 x double> [[A_NEG]], ptr [[A_LOC:%.*]], align 256
 ; CHECK-NEXT:    store <3 x double> [[B_NEG]], ptr [[B_LOC:%.*]], align 32
 ; CHECK-NEXT:    ret <9 x double> [[RES]]
 ;
   %a.neg = fneg <27 x double> %a
   %b.neg = fneg <3 x double> %b
   %res = tail call <9 x double> @llvm.matrix.multiply.v9f64.v27f64.v3f64(<27 x double> %a.neg, <3 x double> %b.neg, i32 9, i32 3, i32 1)
   store <27 x double> %a.neg, ptr %a_loc
   store <3 x double> %b.neg, ptr %b_loc
   ret <9 x double> %res
 }

 ; Only %a is negated, and %a.neg has a second use (the shufflevector feeding the fadd).
 ; The result (3x4 = 12 elements) is smaller than %a (3x5 = 15) and %b (5x4 = 20), but the
 ; expected output leaves the fneg on %a and the multiply unchanged.
 define <12 x double> @fneg_with_multiple_uses(<15 x double> %a, <20 x double> %b){
 ; CHECK-LABEL: @fneg_with_multiple_uses(
 ; CHECK-NEXT:    [[A_NEG:%.*]] = fneg <15 x double> [[A:%.*]]
 ; CHECK-NEXT:    [[RES:%.*]] = tail call <12 x double> @llvm.matrix.multiply.v12f64.v15f64.v20f64(<15 x double> [[A_NEG]], <20 x double> [[B:%.*]], i32 3, i32 5, i32 4)
 ; CHECK-NEXT:    [[RES_2:%.*]] = shufflevector <15 x double> [[A_NEG]], <15 x double> undef, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; CHECK-NEXT:    [[RES_3:%.*]] = fadd <12 x double> [[RES_2]], [[RES]]
 ; CHECK-NEXT:    ret <12 x double> [[RES_3]]
 ;
   %a.neg = fneg <15 x double> %a
   %res = tail call <12 x double> @llvm.matrix.multiply.v12f64.v15f64.v20f64(<15 x double> %a.neg, <20 x double> %b, i32 3, i32 5, i32 4)
   %res.2 = shufflevector <15 x double> %a.neg, <15 x double> undef,
   <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %res.3 = fadd <12 x double> %res.2, %res
   ret <12 x double> %res.3
 }

 ; Same multiply as @fneg_with_multiple_uses, but the second use of %a.neg is a store.
 ; The expected output again leaves the fneg on %a and the multiply unchanged.
 define <12 x double> @fneg_with_multiple_uses_2(<15 x double> %a, <20 x double> %b, ptr %a_loc){
 ; CHECK-LABEL: @fneg_with_multiple_uses_2(
 ; CHECK-NEXT:    [[A_NEG:%.*]] = fneg <15 x double> [[A:%.*]]
 ; CHECK-NEXT:    [[RES:%.*]] = tail call <12 x double> @llvm.matrix.multiply.v12f64.v15f64.v20f64(<15 x double> [[A_NEG]], <20 x double> [[B:%.*]], i32 3, i32 5, i32 4)
 ; CHECK-NEXT:    store <15 x double> [[A_NEG]], ptr [[A_LOC:%.*]], align 128
 ; CHECK-NEXT:    ret <12 x double> [[RES]]
 ;
   %a.neg = fneg <15 x double> %a
   %res = tail call <12 x double> @llvm.matrix.multiply.v12f64.v15f64.v20f64(<15 x double> %a.neg, <20 x double> %b, i32 3, i32 5, i32 4)
   store <15 x double> %a.neg, ptr %a_loc
   ret <12 x double> %res
 }
 ; negation should be moved to the second operand of the first multiply given it has the smallest element count
 ; (first multiply: %a has 27 elements, %b has 3, its result has 9; the second multiply is left untouched)
 define <72 x double> @chain_of_matrix_mutliplies(<27 x double> %a, <3 x double> %b, <8 x double> %c) {
 ; CHECK-LABEL: @chain_of_matrix_mutliplies(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fneg <3 x double> [[B:%.*]]
 ; CHECK-NEXT:    [[RES:%.*]] = tail call <9 x double> @llvm.matrix.multiply.v9f64.v27f64.v3f64(<27 x double> [[A:%.*]], <3 x double> [[TMP1]], i32 9, i32 3, i32 1)
 ; CHECK-NEXT:    [[RES_2:%.*]] = tail call <72 x double> @llvm.matrix.multiply.v72f64.v9f64.v8f64(<9 x double> [[RES]], <8 x double> [[C:%.*]], i32 9, i32 1, i32 8)
 ; CHECK-NEXT:    ret <72 x double> [[RES_2]]
 ;
   %a.neg = fneg <27 x double> %a
   %res = tail call <9 x double> @llvm.matrix.multiply.v9f64.v27f64.v3f64(<27 x double> %a.neg, <3 x double> %b, i32 9, i32 3, i32 1)
   %res.2 = tail call <72 x double> @llvm.matrix.multiply.v72f64.v9f64.v8f64(<9 x double> %res, <8 x double> %c, i32 9, i32 1, i32 8)
   ret <72 x double> %res.2
 }

 ; first negation should be moved to %a
 ; second negation should be moved to the result of the second multiplication
 ; (first multiply: %a has 3 elements, %b has 5, result has 15; second multiply: 15 and 10 -> 6 elements)
 define <6 x double> @chain_of_matrix_mutliplies_with_two_negations(<3 x double> %a, <5 x double> %b, <10 x double> %c) {
 ; CHECK-LABEL: @chain_of_matrix_mutliplies_with_two_negations(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fneg <3 x double> [[A:%.*]]
 ; CHECK-NEXT:    [[RES:%.*]] = tail call <15 x double> @llvm.matrix.multiply.v15f64.v3f64.v5f64(<3 x double> [[TMP1]], <5 x double> [[B:%.*]], i32 3, i32 1, i32 5)
 ; CHECK-NEXT:    [[TMP2:%.*]] = call <6 x double> @llvm.matrix.multiply.v6f64.v15f64.v10f64(<15 x double> [[RES]], <10 x double> [[C:%.*]], i32 3, i32 5, i32 2)
 ; CHECK-NEXT:    [[TMP3:%.*]] = fneg <6 x double> [[TMP2]]
 ; CHECK-NEXT:    ret <6 x double> [[TMP3]]
 ;
   %b.neg = fneg <5 x double> %b
   %res = tail call <15 x double> @llvm.matrix.multiply.v15f64.v3f64.v5f64(<3 x double> %a, <5 x double> %b.neg, i32 3, i32 1, i32 5)
   %res.neg = fneg <15 x double> %res
   %res.2 = tail call <6 x double> @llvm.matrix.multiply.v6f64.v15f64.v10f64(<15 x double> %res.neg, <10 x double> %c, i32 3, i32 5, i32 2)
   ret <6 x double> %res.2
 }

 ; negation should be propagated to the result of the second matrix multiplication
 ; (first multiply: 15 and 20 -> 12 elements, so the fneg moves to its result; that result feeds the
 ; second multiply: 12 and 8 -> 6 elements, so the fneg moves again and ends up on the final result)
 define <6 x double> @chain_of_matrix_mutliplies_propagation(<15 x double> %a, <20 x double> %b, <8 x double> %c){
 ; CHECK-LABEL: @chain_of_matrix_mutliplies_propagation(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <12 x double> @llvm.matrix.multiply.v12f64.v15f64.v20f64(<15 x double> [[A:%.*]], <20 x double> [[B:%.*]], i32 3, i32 5, i32 4)
 ; CHECK-NEXT:    [[TMP2:%.*]] = call <6 x double> @llvm.matrix.multiply.v6f64.v12f64.v8f64(<12 x double> [[TMP1]], <8 x double> [[C:%.*]], i32 3, i32 4, i32 2)
 ; CHECK-NEXT:    [[TMP3:%.*]] = fneg <6 x double> [[TMP2]]
 ; CHECK-NEXT:    ret <6 x double> [[TMP3]]
 ;
   %a.neg = fneg <15 x double> %a
   %res = tail call <12 x double> @llvm.matrix.multiply.v12f64.v15f64.v20f64(<15 x double> %a.neg, <20 x double> %b, i32 3, i32 5, i32 4)
   %res.2 = tail call <6 x double> @llvm.matrix.multiply.v6f64.v12f64.v8f64(<12 x double> %res, <8 x double> %c, i32 3, i32 4, i32 2)
   ret <6 x double> %res.2
 }

 ; Declarations of the llvm.matrix.multiply overloads used by the tests in this file.
 ; The three i32 immediates are the matrix dimensions passed at each call site.
 ; NOTE(review): the v21f64.v15f64.v35f64 overload is not called by any test visible here, and
 ; attribute group #1 is referenced without a visible definition - confirm both are intentional.
 declare <2 x double> @llvm.matrix.multiply.v2f64.v6f64.v3f64(<6 x double>, <3 x double>, i32 immarg, i32 immarg, i32 immarg) #1
 declare <4 x double> @llvm.matrix.multiply.v4f64.v2f64.v2f64(<2 x double>, <2 x double>, i32 immarg, i32 immarg, i32 immarg) #1
 declare <2 x double> @llvm.matrix.multiply.v2f64.v3f64.v6f64(<3 x double>, <6 x double>, i32 immarg, i32 immarg, i32 immarg) #1
 declare <9 x double> @llvm.matrix.multiply.v9f64.v27f64.v3f64(<27 x double>, <3 x double>, i32 immarg, i32 immarg, i32 immarg) #1
 declare <9 x double> @llvm.matrix.multiply.v9f64.v3f64.v27f64(<3 x double>, <27 x double>, i32 immarg, i32 immarg, i32 immarg)
 declare <15 x double> @llvm.matrix.multiply.v15f64.v3f64.v5f64(<3 x double>, <5 x double>, i32 immarg, i32 immarg, i32 immarg) #1
 declare <15 x double> @llvm.matrix.multiply.v15f64.v5f64.v3f64(<5 x double>, <3 x double>, i32 immarg, i32 immarg, i32 immarg) #1
 declare <72 x double> @llvm.matrix.multiply.v72f64.v9f64.v8f64(<9 x double>, <8 x double>, i32 immarg, i32 immarg, i32 immarg) #1
 declare <12 x double> @llvm.matrix.multiply.v12f64.v15f64.v20f64(<15 x double>, <20 x double>, i32 immarg, i32 immarg, i32 immarg) #1
 declare <21 x double> @llvm.matrix.multiply.v21f64.v15f64.v35f64(<15 x double>, <35 x double>, i32 immarg, i32 immarg, i32 immarg) #1
 declare <6 x double> @llvm.matrix.multiply.v6f64.v15f64.v10f64(<15 x double>, <10 x double>, i32 immarg, i32 immarg, i32 immarg) #1
 declare <6 x double> @llvm.matrix.multiply.v6f64.v12f64.v8f64(<12 x double>, <8 x double>, i32 immarg, i32 immarg, i32 immarg) #1
	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt < %s -passes=instcombine -S | FileCheck %s

	; The result has the fewest vector elements between the result and the two operands so the negation can be moved there
	define <2 x double> @test_negation_move_to_result(<6 x double> %a, <3 x double> %b) {
	; CHECK-LABEL: @test_negation_move_to_result(
	; CHECK-NEXT: [[TMP1:%.*]] = call <2 x double> @llvm.matrix.multiply.v2f64.v6f64.v3f64(<6 x double> [[A:%.*]], <3 x double> [[B:%.*]], i32 2, i32 3, i32 1)
	; CHECK-NEXT: [[TMP2:%.*]] = fneg <2 x double> [[TMP1]]
	; CHECK-NEXT: ret <2 x double> [[TMP2]]
	;
	%a.neg = fneg <6 x double> %a
	%res = tail call <2 x double> @llvm.matrix.multiply.v2f64.v6f64.v3f64(<6 x double> %a.neg, <3 x double> %b, i32 2, i32 3, i32 1)
	ret <2 x double> %res
	}

	; The result has the fewest vector elements between the result and the two operands so the negation can be moved there
	; Fast flag should be preserved
	define <2 x double> @test_negation_move_to_result_with_fastflags(<6 x double> %a, <3 x double> %b) {
	; CHECK-LABEL: @test_negation_move_to_result_with_fastflags(
	; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @llvm.matrix.multiply.v2f64.v6f64.v3f64(<6 x double> [[A:%.*]], <3 x double> [[B:%.*]], i32 2, i32 3, i32 1)
	; CHECK-NEXT: [[TMP2:%.*]] = fneg fast <2 x double> [[TMP1]]
	; CHECK-NEXT: ret <2 x double> [[TMP2]]
	;
	%a.neg = fneg <6 x double> %a
	%res = tail call fast <2 x double> @llvm.matrix.multiply.v2f64.v6f64.v3f64(<6 x double> %a.neg, <3 x double> %b, i32 2, i32 3, i32 1)
	ret <2 x double> %res
	}

	define <2 x double> @test_negation_move_to_result_with_nnan_flag(<6 x double> %a, <3 x double> %b) {
	; CHECK-LABEL: @test_negation_move_to_result_with_nnan_flag(
	; CHECK-NEXT: [[TMP1:%.*]] = call nnan <2 x double> @llvm.matrix.multiply.v2f64.v6f64.v3f64(<6 x double> [[A:%.*]], <3 x double> [[B:%.*]], i32 2, i32 3, i32 1)
	; CHECK-NEXT: [[TMP2:%.*]] = fneg nnan <2 x double> [[TMP1]]
	; CHECK-NEXT: ret <2 x double> [[TMP2]]
	;
	%a.neg = fneg <6 x double> %a
	%res = tail call nnan <2 x double> @llvm.matrix.multiply.v2f64.v6f64.v3f64(<6 x double> %a.neg, <3 x double> %b, i32 2, i32 3, i32 1)
	ret <2 x double> %res
	}

	define <2 x double> @test_negation_move_to_result_with_nsz_flag(<6 x double> %a, <3 x double> %b) {
	; CHECK-LABEL: @test_negation_move_to_result_with_nsz_flag(
	; CHECK-NEXT: [[TMP1:%.*]] = call nsz <2 x double> @llvm.matrix.multiply.v2f64.v6f64.v3f64(<6 x double> [[A:%.*]], <3 x double> [[B:%.*]], i32 2, i32 3, i32 1)
	; CHECK-NEXT: [[TMP2:%.*]] = fneg nsz <2 x double> [[TMP1]]
	; CHECK-NEXT: ret <2 x double> [[TMP2]]
	;
	%a.neg = fneg <6 x double> %a
	%res = tail call nsz <2 x double> @llvm.matrix.multiply.v2f64.v6f64.v3f64(<6 x double> %a.neg, <3 x double> %b, i32 2, i32 3, i32 1)
	ret <2 x double> %res
	}

	define <2 x double> @test_negation_move_to_result_with_fastflag_on_negation(<6 x double> %a, <3 x double> %b) {
	; CHECK-LABEL: @test_negation_move_to_result_with_fastflag_on_negation(
	; CHECK-NEXT: [[TMP1:%.*]] = call <2 x double> @llvm.matrix.multiply.v2f64.v6f64.v3f64(<6 x double> [[A:%.*]], <3 x double> [[B:%.*]], i32 2, i32 3, i32 1)
	; CHECK-NEXT: [[TMP2:%.*]] = fneg <2 x double> [[TMP1]]
	; CHECK-NEXT: ret <2 x double> [[TMP2]]
	;
	%a.neg = fneg fast<6 x double> %a
	%res = tail call <2 x double> @llvm.matrix.multiply.v2f64.v6f64.v3f64(<6 x double> %a.neg, <3 x double> %b, i32 2, i32 3, i32 1)
	ret <2 x double> %res
	}

	; %b has the fewest vector elements between the result and the two operands so the negation can be moved there
	define <9 x double> @test_move_negation_to_second_operand(<27 x double> %a, <3 x double> %b) {
	; CHECK-LABEL: @test_move_negation_to_second_operand(
	; CHECK-NEXT: [[TMP1:%.*]] = fneg <3 x double> [[B:%.*]]
	; CHECK-NEXT: [[RES:%.*]] = tail call <9 x double> @llvm.matrix.multiply.v9f64.v27f64.v3f64(<27 x double> [[A:%.*]], <3 x double> [[TMP1]], i32 9, i32 3, i32 1)
	; CHECK-NEXT: ret <9 x double> [[RES]]
	;
	%a.neg = fneg <27 x double> %a
	%res = tail call <9 x double> @llvm.matrix.multiply.v9f64.v27f64.v3f64(<27 x double> %a.neg, <3 x double> %b, i32 9, i32 3, i32 1)
	ret <9 x double> %res
	}

	; %b has the fewest vector elements between the result and the two operands so the negation can be moved there
	; Fast flag should be preserved
	define <9 x double> @test_move_negation_to_second_operand_with_fast_flags(<27 x double> %a, <3 x double> %b) {
	; CHECK-LABEL: @test_move_negation_to_second_operand_with_fast_flags(
	; CHECK-NEXT: [[TMP1:%.*]] = fneg <3 x double> [[B:%.*]]
	; CHECK-NEXT: [[RES:%.*]] = tail call fast <9 x double> @llvm.matrix.multiply.v9f64.v27f64.v3f64(<27 x double> [[A:%.*]], <3 x double> [[TMP1]], i32 9, i32 3, i32 1)
	; CHECK-NEXT: ret <9 x double> [[RES]]
	;
	%a.neg = fneg <27 x double> %a
	%res = tail call fast <9 x double> @llvm.matrix.multiply.v9f64.v27f64.v3f64(<27 x double> %a.neg, <3 x double> %b, i32 9, i32 3, i32 1)
	ret <9 x double> %res
	}

	; The result has the fewest vector elements between the result and the two operands so the negation can be moved there
	define <2 x double> @test_negation_move_to_result_from_second_operand(<3 x double> %a, <6 x double> %b){
	; CHECK-LABEL: @test_negation_move_to_result_from_second_operand(
	; CHECK-NEXT: [[TMP1:%.*]] = call <2 x double> @llvm.matrix.multiply.v2f64.v3f64.v6f64(<3 x double> [[A:%.*]], <6 x double> [[B:%.*]], i32 1, i32 3, i32 2)
	; CHECK-NEXT: [[TMP2:%.*]] = fneg <2 x double> [[TMP1]]
	; CHECK-NEXT: ret <2 x double> [[TMP2]]
	;
	%b.neg = fneg <6 x double> %b
	%res = tail call <2 x double> @llvm.matrix.multiply.v2f64.v3f64.v6f64(<3 x double> %a, <6 x double> %b.neg, i32 1, i32 3, i32 2)
	ret <2 x double> %res
	}

	; %a has the fewest vector elements between the result and the two operands so the negation can be moved there
	define <9 x double> @test_move_negation_to_first_operand(<3 x double> %a, <27 x double> %b) {
	; CHECK-LABEL: @test_move_negation_to_first_operand(
	; CHECK-NEXT: [[TMP1:%.*]] = fneg <3 x double> [[A:%.*]]
	; CHECK-NEXT: [[RES:%.*]] = tail call <9 x double> @llvm.matrix.multiply.v9f64.v3f64.v27f64(<3 x double> [[TMP1]], <27 x double> [[B:%.*]], i32 1, i32 3, i32 9)
	; CHECK-NEXT: ret <9 x double> [[RES]]
	;
	%b.neg = fneg <27 x double> %b
	%res = tail call <9 x double> @llvm.matrix.multiply.v9f64.v3f64.v27f64(<3 x double> %a, <27 x double> %b.neg, i32 1, i32 3, i32 9)
	ret <9 x double> %res
	}

	; %a has the fewest vector elements between the result and the two operands so the negation is not moved
	define <15 x double> @test_negation_not_moved(<3 x double> %a, <5 x double> %b) {
	; CHECK-LABEL: @test_negation_not_moved(
	; CHECK-NEXT: [[A_NEG:%.*]] = fneg <3 x double> [[A:%.*]]
	; CHECK-NEXT: [[RES:%.*]] = tail call <15 x double> @llvm.matrix.multiply.v15f64.v3f64.v5f64(<3 x double> [[A_NEG]], <5 x double> [[B:%.*]], i32 3, i32 1, i32 5)
	; CHECK-NEXT: ret <15 x double> [[RES]]
	;
	%a.neg = fneg <3 x double> %a
	%res = tail call <15 x double> @llvm.matrix.multiply.v15f64.v3f64.v5f64(<3 x double> %a.neg, <5 x double> %b, i32 3, i32 1, i32 5)
	ret <15 x double> %res
	}

	; %b as the fewest vector elements between the result and the two operands so the negation is not moved
	define <15 x double> @test_negation_not_moved_second_operand(<5 x double> %a, <3 x double> %b) {
	; CHECK-LABEL: @test_negation_not_moved_second_operand(
	; CHECK-NEXT: [[B_NEG:%.*]] = fneg <3 x double> [[B:%.*]]
	; CHECK-NEXT: [[RES:%.*]] = tail call <15 x double> @llvm.matrix.multiply.v15f64.v5f64.v3f64(<5 x double> [[A:%.*]], <3 x double> [[B_NEG]], i32 5, i32 1, i32 3)
	; CHECK-NEXT: ret <15 x double> [[RES]]
	;
	%b.neg = fneg <3 x double> %b
	%res = tail call <15 x double> @llvm.matrix.multiply.v15f64.v5f64.v3f64(<5 x double> %a, <3 x double> %b.neg, i32 5, i32 1, i32 3)
	ret <15 x double> %res
	}

	; the negation should be moved from the result to operand %a because it has the smallest vector element count
	define <15 x double> @test_negation_on_result(<3 x double> %a, <5 x double> %b) {
	; CHECK-LABEL: @test_negation_on_result(
	; CHECK-NEXT: [[RES:%.*]] = tail call <15 x double> @llvm.matrix.multiply.v15f64.v3f64.v5f64(<3 x double> [[A:%.*]], <5 x double> [[B:%.*]], i32 3, i32 1, i32 5)
	; CHECK-NEXT: [[RES_2:%.*]] = fneg <15 x double> [[RES]]
	; CHECK-NEXT: ret <15 x double> [[RES_2]]
	;
	%res = tail call <15 x double> @llvm.matrix.multiply.v15f64.v3f64.v5f64(<3 x double> %a, <5 x double> %b, i32 3, i32 1, i32 5)
	%res.2 = fneg <15 x double> %res
	ret <15 x double> %res.2
	}

	; both negations can be deleted
	define <2 x double> @test_with_two_operands_negated1(<6 x double> %a, <3 x double> %b){
	; CHECK-LABEL: @test_with_two_operands_negated1(
	; CHECK-NEXT: [[RES:%.*]] = tail call <2 x double> @llvm.matrix.multiply.v2f64.v6f64.v3f64(<6 x double> [[A:%.*]], <3 x double> [[B:%.*]], i32 2, i32 3, i32 1)
	; CHECK-NEXT: ret <2 x double> [[RES]]
	;
	%a.neg = fneg <6 x double> %a
	%b.neg = fneg <3 x double> %b
	%res = tail call <2 x double> @llvm.matrix.multiply.v2f64.v6f64.v3f64(<6 x double> %a.neg, <3 x double> %b.neg, i32 2, i32 3, i32 1)
	ret <2 x double> %res
	}

	; both negations will be removed
	define <9 x double> @test_with_two_operands_negated2(<27 x double> %a, <3 x double> %b){
	; CHECK-LABEL: @test_with_two_operands_negated2(
	; CHECK-NEXT: [[RES:%.*]] = tail call <9 x double> @llvm.matrix.multiply.v9f64.v27f64.v3f64(<27 x double> [[A:%.*]], <3 x double> [[B:%.*]], i32 9, i32 3, i32 1)
	; CHECK-NEXT: ret <9 x double> [[RES]]
	;
	%a.neg = fneg <27 x double> %a
	%b.neg = fneg <3 x double> %b
	%res = tail call <9 x double> @llvm.matrix.multiply.v9f64.v27f64.v3f64(<27 x double> %a.neg, <3 x double> %b.neg, i32 9, i32 3, i32 1)
	ret <9 x double> %res
	}

	; both negations will be removed
	define <9 x double> @test_with_two_operands_negated_with_fastflags(<27 x double> %a, <3 x double> %b){
	; CHECK-LABEL: @test_with_two_operands_negated_with_fastflags(
	; CHECK-NEXT: [[RES:%.*]] = tail call fast <9 x double> @llvm.matrix.multiply.v9f64.v27f64.v3f64(<27 x double> [[A:%.*]], <3 x double> [[B:%.*]], i32 9, i32 3, i32 1)
	; CHECK-NEXT: ret <9 x double> [[RES]]
	;
	%a.neg = fneg <27 x double> %a
	%b.neg = fneg <3 x double> %b
	%res = tail call fast <9 x double> @llvm.matrix.multiply.v9f64.v27f64.v3f64(<27 x double> %a.neg, <3 x double> %b.neg, i32 9, i32 3, i32 1)
	ret <9 x double> %res
	}

	; both negations should be removed
	define <9 x double> @test_with_two_operands_negated2_commute(<3 x double> %a, <27 x double> %b){
	; CHECK-LABEL: @test_with_two_operands_negated2_commute(
	; CHECK-NEXT: [[RES:%.*]] = call <9 x double> @llvm.matrix.multiply.v9f64.v3f64.v27f64(<3 x double> [[A:%.*]], <27 x double> [[B:%.*]], i32 1, i32 3, i32 9)
	; CHECK-NEXT: ret <9 x double> [[RES]]
	;
	%a.neg = fneg <3 x double> %a
	%b.neg = fneg <27 x double> %b
	%res = call <9 x double> @llvm.matrix.multiply.v9f64.v3f64.v27f64(<3 x double> %a.neg, <27 x double> %b.neg, i32 1, i32 3, i32 9)
	ret <9 x double> %res
	}

	define <4 x double> @matrix_multiply_two_operands_negated_with_same_size(<2 x double> %a, <2 x double> %b) {
	; CHECK-LABEL: @matrix_multiply_two_operands_negated_with_same_size(
	; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.matrix.multiply.v4f64.v2f64.v2f64(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], i32 2, i32 1, i32 2)
	; CHECK-NEXT: ret <4 x double> [[RES]]
	;
	%a.neg = fneg <2 x double> %a
	%b.neg = fneg <2 x double> %b
	%res = call <4 x double> @llvm.matrix.multiply.v4f64.v2f64.v2f64(<2 x double> %a.neg, <2 x double> %b.neg, i32 2, i32 1, i32 2)
	ret <4 x double> %res
	}

	; Both operands are negated and %a.neg has a second use in a shufflevector.
	; The expected output multiplies the un-negated operands, shuffles the
	; original %a, and turns the fadd of the negated shuffle into an fsub.
	define <2 x double> @matrix_multiply_two_operands_with_multiple_uses(<6 x double> %a, <3 x double> %b) {
	; CHECK-LABEL: @matrix_multiply_two_operands_with_multiple_uses(
	; CHECK-NEXT: [[RES:%.*]] = tail call <2 x double> @llvm.matrix.multiply.v2f64.v6f64.v3f64(<6 x double> [[A:%.*]], <3 x double> [[B:%.*]], i32 2, i32 3, i32 1)
	; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <6 x double> [[A]], <6 x double> poison, <2 x i32> <i32 0, i32 1>
	; CHECK-NEXT: [[RES_3:%.*]] = fsub <2 x double> [[RES]], [[TMP1]]
	; CHECK-NEXT: ret <2 x double> [[RES_3]]
	;
	%a.neg = fneg <6 x double> %a
	%b.neg = fneg <3 x double> %b
	%res = tail call <2 x double> @llvm.matrix.multiply.v2f64.v6f64.v3f64(<6 x double> %a.neg, <3 x double> %b.neg, i32 2, i32 3, i32 1)
	%res.2 = shufflevector <6 x double> %a.neg, <6 x double> undef,
	<2 x i32> <i32 0, i32 1>
	%res.3 = fadd <2 x double> %res.2, %res
	ret <2 x double> %res.3
	}

	; Both negated operands are also stored to memory, so both fneg
	; instructions must stay; the multiply itself is still expected to use the
	; original, un-negated operands.
	define <9 x double> @matrix_multiply_two_operands_with_multiple_uses2(<27 x double> %a, <3 x double> %b, ptr %a_loc, ptr %b_loc){
	; CHECK-LABEL: @matrix_multiply_two_operands_with_multiple_uses2(
	; CHECK-NEXT: [[A_NEG:%.*]] = fneg <27 x double> [[A:%.*]]
	; CHECK-NEXT: [[B_NEG:%.*]] = fneg <3 x double> [[B:%.*]]
	; CHECK-NEXT: [[RES:%.*]] = tail call <9 x double> @llvm.matrix.multiply.v9f64.v27f64.v3f64(<27 x double> [[A]], <3 x double> [[B]], i32 9, i32 3, i32 1)
	; CHECK-NEXT: store <27 x double> [[A_NEG]], ptr [[A_LOC:%.*]], align 256
	; CHECK-NEXT: store <3 x double> [[B_NEG]], ptr [[B_LOC:%.*]], align 32
	; CHECK-NEXT: ret <9 x double> [[RES]]
	;
	%a.neg = fneg <27 x double> %a
	%b.neg = fneg <3 x double> %b
	%res = tail call <9 x double> @llvm.matrix.multiply.v9f64.v27f64.v3f64(<27 x double> %a.neg, <3 x double> %b.neg, i32 9, i32 3, i32 1)
	store <27 x double> %a.neg, ptr %a_loc
	store <3 x double> %b.neg, ptr %b_loc
	ret <9 x double> %res
	}

	; Only one operand is negated and the fneg has a second use (a
	; shufflevector); the expected output is unchanged, i.e. the negation is
	; not moved.
	define <12 x double> @fneg_with_multiple_uses(<15 x double> %a, <20 x double> %b){
	; CHECK-LABEL: @fneg_with_multiple_uses(
	; CHECK-NEXT: [[A_NEG:%.*]] = fneg <15 x double> [[A:%.*]]
	; CHECK-NEXT: [[RES:%.*]] = tail call <12 x double> @llvm.matrix.multiply.v12f64.v15f64.v20f64(<15 x double> [[A_NEG]], <20 x double> [[B:%.*]], i32 3, i32 5, i32 4)
	; CHECK-NEXT: [[RES_2:%.*]] = shufflevector <15 x double> [[A_NEG]], <15 x double> undef, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
	; CHECK-NEXT: [[RES_3:%.*]] = fadd <12 x double> [[RES_2]], [[RES]]
	; CHECK-NEXT: ret <12 x double> [[RES_3]]
	;
	%a.neg = fneg <15 x double> %a
	%res = tail call <12 x double> @llvm.matrix.multiply.v12f64.v15f64.v20f64(<15 x double> %a.neg, <20 x double> %b, i32 3, i32 5, i32 4)
	%res.2 = shufflevector <15 x double> %a.neg, <15 x double> undef,
	<12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
	%res.3 = fadd <12 x double> %res.2, %res
	ret <12 x double> %res.3
	}

	; Only one operand is negated and the fneg result is also stored to
	; memory; the expected output is unchanged, i.e. the negation is not moved.
	define <12 x double> @fneg_with_multiple_uses_2(<15 x double> %a, <20 x double> %b, ptr %a_loc){
	; CHECK-LABEL: @fneg_with_multiple_uses_2(
	; CHECK-NEXT: [[A_NEG:%.*]] = fneg <15 x double> [[A:%.*]]
	; CHECK-NEXT: [[RES:%.*]] = tail call <12 x double> @llvm.matrix.multiply.v12f64.v15f64.v20f64(<15 x double> [[A_NEG]], <20 x double> [[B:%.*]], i32 3, i32 5, i32 4)
	; CHECK-NEXT: store <15 x double> [[A_NEG]], ptr [[A_LOC:%.*]], align 128
	; CHECK-NEXT: ret <12 x double> [[RES]]
	;
	%a.neg = fneg <15 x double> %a
	%res = tail call <12 x double> @llvm.matrix.multiply.v12f64.v15f64.v20f64(<15 x double> %a.neg, <20 x double> %b, i32 3, i32 5, i32 4)
	store <15 x double> %a.neg, ptr %a_loc
	ret <12 x double> %res
	}
	; The negation should be moved to the second operand of the first multiply,
	; since that operand has the fewest elements (3, versus 27 and 9).
	define <72 x double> @chain_of_matrix_mutliplies(<27 x double> %a, <3 x double> %b, <8 x double> %c) {
	; CHECK-LABEL: @chain_of_matrix_mutliplies(
	; CHECK-NEXT: [[TMP1:%.*]] = fneg <3 x double> [[B:%.*]]
	; CHECK-NEXT: [[RES:%.*]] = tail call <9 x double> @llvm.matrix.multiply.v9f64.v27f64.v3f64(<27 x double> [[A:%.*]], <3 x double> [[TMP1]], i32 9, i32 3, i32 1)
	; CHECK-NEXT: [[RES_2:%.*]] = tail call <72 x double> @llvm.matrix.multiply.v72f64.v9f64.v8f64(<9 x double> [[RES]], <8 x double> [[C:%.*]], i32 9, i32 1, i32 8)
	; CHECK-NEXT: ret <72 x double> [[RES_2]]
	;
	%a.neg = fneg <27 x double> %a
	%res = tail call <9 x double> @llvm.matrix.multiply.v9f64.v27f64.v3f64(<27 x double> %a.neg, <3 x double> %b, i32 9, i32 3, i32 1)
	%res.2 = tail call <72 x double> @llvm.matrix.multiply.v72f64.v9f64.v8f64(<9 x double> %res, <8 x double> %c, i32 9, i32 1, i32 8)
	ret <72 x double> %res.2
	}

	; The first negation (on %b) should be moved to %a, the smaller operand of
	; the first multiplication.
	; The second negation should be moved to the result of the second
	; multiplication.
	define <6 x double> @chain_of_matrix_mutliplies_with_two_negations(<3 x double> %a, <5 x double> %b, <10 x double> %c) {
	; CHECK-LABEL: @chain_of_matrix_mutliplies_with_two_negations(
	; CHECK-NEXT: [[TMP1:%.*]] = fneg <3 x double> [[A:%.*]]
	; CHECK-NEXT: [[RES:%.*]] = tail call <15 x double> @llvm.matrix.multiply.v15f64.v3f64.v5f64(<3 x double> [[TMP1]], <5 x double> [[B:%.*]], i32 3, i32 1, i32 5)
	; CHECK-NEXT: [[TMP2:%.*]] = call <6 x double> @llvm.matrix.multiply.v6f64.v15f64.v10f64(<15 x double> [[RES]], <10 x double> [[C:%.*]], i32 3, i32 5, i32 2)
	; CHECK-NEXT: [[TMP3:%.*]] = fneg <6 x double> [[TMP2]]
	; CHECK-NEXT: ret <6 x double> [[TMP3]]
	;
	%b.neg = fneg <5 x double> %b
	%res = tail call <15 x double> @llvm.matrix.multiply.v15f64.v3f64.v5f64(<3 x double> %a, <5 x double> %b.neg, i32 3, i32 1, i32 5)
	%res.neg = fneg <15 x double> %res
	%res.2 = tail call <6 x double> @llvm.matrix.multiply.v6f64.v15f64.v10f64(<15 x double> %res.neg, <10 x double> %c, i32 3, i32 5, i32 2)
	ret <6 x double> %res.2
	}

	; The negation on %a should be propagated through both multiplications and
	; end up on the result of the second one, which has the fewest elements (6).
	define <6 x double> @chain_of_matrix_mutliplies_propagation(<15 x double> %a, <20 x double> %b, <8 x double> %c){
	; CHECK-LABEL: @chain_of_matrix_mutliplies_propagation(
	; CHECK-NEXT: [[TMP1:%.*]] = call <12 x double> @llvm.matrix.multiply.v12f64.v15f64.v20f64(<15 x double> [[A:%.*]], <20 x double> [[B:%.*]], i32 3, i32 5, i32 4)
	; CHECK-NEXT: [[TMP2:%.*]] = call <6 x double> @llvm.matrix.multiply.v6f64.v12f64.v8f64(<12 x double> [[TMP1]], <8 x double> [[C:%.*]], i32 3, i32 4, i32 2)
	; CHECK-NEXT: [[TMP3:%.*]] = fneg <6 x double> [[TMP2]]
	; CHECK-NEXT: ret <6 x double> [[TMP3]]
	;
	%a.neg = fneg <15 x double> %a
	%res = tail call <12 x double> @llvm.matrix.multiply.v12f64.v15f64.v20f64(<15 x double> %a.neg, <20 x double> %b, i32 3, i32 5, i32 4)
	%res.2 = tail call <6 x double> @llvm.matrix.multiply.v6f64.v12f64.v8f64(<12 x double> %res, <8 x double> %c, i32 3, i32 4, i32 2)
	ret <6 x double> %res.2
	}

	; Declarations of the llvm.matrix.multiply overloads called by the tests in
	; this file. The name suffix spells out the result vector type followed by
	; the two operand vector types; the three trailing i32 arguments are
	; immediates (immarg). Per the LangRef they are the row count of the first
	; matrix, the shared inner dimension, and the column count of the second.
	; NOTE(review): the definition of attribute group #1 is not visible in this
	; part of the file, and the v9f64.v3f64.v27f64 declaration carries no
	; attribute group reference - confirm both are intentional.
	declare <2 x double> @llvm.matrix.multiply.v2f64.v6f64.v3f64(<6 x double>, <3 x double>, i32 immarg, i32 immarg, i32 immarg) #1
	declare <4 x double> @llvm.matrix.multiply.v4f64.v2f64.v2f64(<2 x double>, <2 x double>, i32 immarg, i32 immarg, i32 immarg) #1
	declare <2 x double> @llvm.matrix.multiply.v2f64.v3f64.v6f64(<3 x double>, <6 x double>, i32 immarg, i32 immarg, i32 immarg) #1
	declare <9 x double> @llvm.matrix.multiply.v9f64.v27f64.v3f64(<27 x double>, <3 x double>, i32 immarg, i32 immarg, i32 immarg) #1
	declare <9 x double> @llvm.matrix.multiply.v9f64.v3f64.v27f64(<3 x double>, <27 x double>, i32 immarg, i32 immarg, i32 immarg)
	declare <15 x double> @llvm.matrix.multiply.v15f64.v3f64.v5f64(<3 x double>, <5 x double>, i32 immarg, i32 immarg, i32 immarg) #1
	declare <15 x double> @llvm.matrix.multiply.v15f64.v5f64.v3f64(<5 x double>, <3 x double>, i32 immarg, i32 immarg, i32 immarg) #1
	declare <72 x double> @llvm.matrix.multiply.v72f64.v9f64.v8f64(<9 x double>, <8 x double>, i32 immarg, i32 immarg, i32 immarg) #1
	declare <12 x double> @llvm.matrix.multiply.v12f64.v15f64.v20f64(<15 x double>, <20 x double>, i32 immarg, i32 immarg, i32 immarg) #1
	declare <21 x double> @llvm.matrix.multiply.v21f64.v15f64.v35f64(<15 x double>, <35 x double>, i32 immarg, i32 immarg, i32 immarg) #1
	declare <6 x double> @llvm.matrix.multiply.v6f64.v15f64.v10f64(<15 x double>, <10 x double>, i32 immarg, i32 immarg, i32 immarg) #1
	declare <6 x double> @llvm.matrix.multiply.v6f64.v12f64.v8f64(<12 x double>, <8 x double>, i32 immarg, i32 immarg, i32 immarg) #1