[GlobalISel][AArch64] Add lowering for G_SMULFIX (#196757)

Add lowering for the G_SMULFIX generic opcode. This is needed to compile
`libc/src/stdfix/expk.cpp` with `-O3`.
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index 8bcae5d..9858f5e 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -576,6 +576,7 @@
   LLVM_ABI LegalizeResult lowerMemCpyFamily(MachineInstr &MI,
                                             unsigned MaxLen = 0);
   LLVM_ABI LegalizeResult lowerVAArg(MachineInstr &MI);
+  LLVM_ABI LegalizeResult lowerSmulfix(MachineInstr &MI);
 };
 
 } // End namespace llvm.
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 909decf..070d7ec 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -4968,6 +4968,8 @@
     MI.eraseFromParent();
     return Legalized;
   }
+  case G_SMULFIX:
+    return lowerSmulfix(MI);
   }
 }
 
@@ -10654,6 +10656,35 @@
   return Legalized;
 }
 
+// Lower G_SMULFIX: Dst = (sext(LHS) * sext(RHS)) >> Scale, a signed
+// fixed-point multiply with Scale fractional bits. The product is formed in a
+// type of twice the scalar width so no significant bits are lost before the
+// arithmetic right shift (which rounds toward negative infinity, per the
+// llvm.smul.fix semantics); the result is then truncated back to Ty. The
+// wide operations produced here are legalized by subsequent steps.
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerSmulfix(MachineInstr &MI) {
+  auto [Dst, LHS, RHS] = MI.getFirst3Regs();
+  LLT Ty = MRI.getType(Dst);
+  unsigned Scale = MI.getOperand(3).getImm();
+
+  // A scale of zero is just an ordinary integer multiply.
+  if (Scale == 0) {
+    MIRBuilder.buildMul(Dst, LHS, RHS);
+    MI.eraseFromParent();
+    return Legalized;
+  }
+
+  LLT WideTy = Ty.changeElementSize(Ty.getScalarSizeInBits() * 2);
+  auto SExtLHS = MIRBuilder.buildSExt(WideTy, LHS);
+  auto SExtRHS = MIRBuilder.buildSExt(WideTy, RHS);
+  auto Mul = MIRBuilder.buildMul(WideTy, SExtLHS, SExtRHS);
+  auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Scale);
+  auto Shifted = MIRBuilder.buildAShr(WideTy, Mul, ShiftAmt);
+  MIRBuilder.buildTrunc(Dst, Shifted);
+
+  MI.eraseFromParent();
+  return Legalized;
+}
+
 static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
   // On Darwin, -Os means optimize for size without hurting performance, so
   // only really optimize for size when -Oz (MinSize) is used.
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 14a3f75..4c7abbf 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -297,6 +297,8 @@
       .legalFor({i64, v16i8, v8i16, v4i32})
       .lower();
 
+  getActionDefinitionsBuilder(G_SMULFIX).lower();
+
   getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
       .legalFor({v8i8, v16i8, v4i16, v8i16, v2i32, v4i32})
       .legalFor(HasCSSC, {i32, i64})
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-smulfix.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-smulfix.mir
new file mode 100644
index 0000000..2b660e5
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-smulfix.mir
@@ -0,0 +1,181 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
+---
+name:            smulfix_i32_scale_0
+body:             |
+  bb.1:
+    liveins: $w0, $w1
+
+    ; CHECK-LABEL: name: smulfix_i32_scale_0
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(i32) = COPY $w0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(i32) = COPY $w1
+    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(i32) = G_MUL [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: $w0 = COPY [[MUL]](i32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(i32) = COPY $w0
+    %1:_(i32) = COPY $w1
+    %2:_(i32) = G_SMULFIX %0, %1, 0
+    $w0 = COPY %2(i32)
+    RET_ReallyLR implicit $w0
+...
+---
+name:            smulfix_i32
+body:             |
+  bb.1:
+    liveins: $w0, $w1
+
+    ; CHECK-LABEL: name: smulfix_i32
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(i32) = COPY $w0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(i32) = COPY $w1
+    ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(i64) = G_SEXT [[COPY]](i32)
+    ; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(i64) = G_SEXT [[COPY1]](i32)
+    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(i64) = G_MUL [[SEXT]], [[SEXT1]]
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(i64) = G_CONSTANT i64 15
+    ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(i64) = G_ASHR [[MUL]], [[C]](i64)
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(i32) = G_TRUNC [[ASHR]](i64)
+    ; CHECK-NEXT: $w0 = COPY [[TRUNC]](i32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(i32) = COPY $w0
+    %1:_(i32) = COPY $w1
+    %2:_(i32) = G_SMULFIX %0, %1, 15
+    $w0 = COPY %2(i32)
+    RET_ReallyLR implicit $w0
+...
+---
+name:            smulfix_i64
+body:             |
+  bb.1:
+    liveins: $x0, $x1
+
+    ; CHECK-LABEL: name: smulfix_i64
+    ; CHECK: liveins: $x0, $x1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(i64) = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(i64) = COPY $x1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(i64) = G_CONSTANT i64 63
+    ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(i64) = G_ASHR [[COPY]], [[C]](i64)
+    ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:_(i64) = G_ASHR [[COPY1]], [[C]](i64)
+    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(i64) = G_MUL [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[MUL1:%[0-9]+]]:_(i64) = G_MUL [[ASHR]], [[COPY1]]
+    ; CHECK-NEXT: [[MUL2:%[0-9]+]]:_(i64) = G_MUL [[COPY]], [[ASHR1]]
+    ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(i64) = G_UMULH [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(i64) = G_ADD [[MUL1]], [[MUL2]]
+    ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(i64) = G_ADD [[ADD]], [[UMULH]]
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(i64) = G_CONSTANT i64 15
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(i64) = G_LSHR [[MUL]], [[C1]](i64)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(i64) = G_CONSTANT i64 49
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(i64) = G_SHL [[ADD1]], [[C2]](i64)
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(i64) = G_OR [[LSHR]], [[SHL]]
+    ; CHECK-NEXT: $x0 = COPY [[OR]](i64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(i64) = COPY $x0
+    %1:_(i64) = COPY $x1
+    %2:_(i64) = G_SMULFIX %0, %1, 15
+    $x0 = COPY %2(i64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            smulfix_4xi32
+body:             |
+  bb.1:
+    liveins: $q0, $q1
+
+    ; CHECK-LABEL: name: smulfix_4xi32
+    ; CHECK: liveins: $q0, $q1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x i32>) = COPY $q0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x i32>) = COPY $q1
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x i32>), [[UV1:%[0-9]+]]:_(<2 x i32>) = G_UNMERGE_VALUES [[COPY]](<4 x i32>)
+    ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(<2 x i64>) = G_SEXT [[UV]](<2 x i32>)
+    ; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(<2 x i64>) = G_SEXT [[UV1]](<2 x i32>)
+    ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<2 x i32>), [[UV3:%[0-9]+]]:_(<2 x i32>) = G_UNMERGE_VALUES [[COPY1]](<4 x i32>)
+    ; CHECK-NEXT: [[SEXT2:%[0-9]+]]:_(<2 x i64>) = G_SEXT [[UV2]](<2 x i32>)
+    ; CHECK-NEXT: [[SEXT3:%[0-9]+]]:_(<2 x i64>) = G_SEXT [[UV3]](<2 x i32>)
+    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(<2 x i64>) = G_MUL [[SEXT]], [[SEXT2]]
+    ; CHECK-NEXT: [[MUL1:%[0-9]+]]:_(<2 x i64>) = G_MUL [[SEXT1]], [[SEXT3]]
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(i64) = G_CONSTANT i64 2
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x i64>) = G_BUILD_VECTOR [[C]](i64), [[C]](i64)
+    ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(<2 x i64>) = G_ASHR [[MUL]], [[BUILD_VECTOR]](<2 x i64>)
+    ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:_(<2 x i64>) = G_ASHR [[MUL1]], [[BUILD_VECTOR]](<2 x i64>)
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x i32>) = G_TRUNC [[ASHR]](<2 x i64>)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(<2 x i32>) = G_TRUNC [[ASHR1]](<2 x i64>)
+    ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x i32>) = G_CONCAT_VECTORS [[TRUNC]](<2 x i32>), [[TRUNC1]](<2 x i32>)
+    ; CHECK-NEXT: $q0 = COPY [[CONCAT_VECTORS]](<4 x i32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %0:_(<4 x i32>) = COPY $q0
+    %1:_(<4 x i32>) = COPY $q1
+    %2:_(<4 x i32>) = G_SMULFIX %0, %1, 2
+    $q0 = COPY %2(<4 x i32>)
+    RET_ReallyLR implicit $q0
+...
+---
+name:            smulfix_4xi32_15
+body:             |
+  bb.1:
+    liveins: $q0, $q1
+
+    ; CHECK-LABEL: name: smulfix_4xi32_15
+    ; CHECK: liveins: $q0, $q1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x i32>) = COPY $q0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x i32>) = COPY $q1
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x i32>), [[UV1:%[0-9]+]]:_(<2 x i32>) = G_UNMERGE_VALUES [[COPY]](<4 x i32>)
+    ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(<2 x i64>) = G_SEXT [[UV]](<2 x i32>)
+    ; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(<2 x i64>) = G_SEXT [[UV1]](<2 x i32>)
+    ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<2 x i32>), [[UV3:%[0-9]+]]:_(<2 x i32>) = G_UNMERGE_VALUES [[COPY1]](<4 x i32>)
+    ; CHECK-NEXT: [[SEXT2:%[0-9]+]]:_(<2 x i64>) = G_SEXT [[UV2]](<2 x i32>)
+    ; CHECK-NEXT: [[SEXT3:%[0-9]+]]:_(<2 x i64>) = G_SEXT [[UV3]](<2 x i32>)
+    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(<2 x i64>) = G_MUL [[SEXT]], [[SEXT2]]
+    ; CHECK-NEXT: [[MUL1:%[0-9]+]]:_(<2 x i64>) = G_MUL [[SEXT1]], [[SEXT3]]
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(i64) = G_CONSTANT i64 15
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x i64>) = G_BUILD_VECTOR [[C]](i64), [[C]](i64)
+    ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(<2 x i64>) = G_ASHR [[MUL]], [[BUILD_VECTOR]](<2 x i64>)
+    ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:_(<2 x i64>) = G_ASHR [[MUL1]], [[BUILD_VECTOR]](<2 x i64>)
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x i32>) = G_TRUNC [[ASHR]](<2 x i64>)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(<2 x i32>) = G_TRUNC [[ASHR1]](<2 x i64>)
+    ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x i32>) = G_CONCAT_VECTORS [[TRUNC]](<2 x i32>), [[TRUNC1]](<2 x i32>)
+    ; CHECK-NEXT: $q0 = COPY [[CONCAT_VECTORS]](<4 x i32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %0:_(<4 x i32>) = COPY $q0
+    %1:_(<4 x i32>) = COPY $q1
+    %2:_(<4 x i32>) = G_SMULFIX %0, %1, 15
+    $q0 = COPY %2(<4 x i32>)
+    RET_ReallyLR implicit $q0
+...
+---
+name:            smulfix_4xi32_31
+body:             |
+  bb.1:
+    liveins: $q0, $q1
+    ; CHECK-LABEL: name: smulfix_4xi32_31
+    ; CHECK: liveins: $q0, $q1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x i32>) = COPY $q0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x i32>) = COPY $q1
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x i32>), [[UV1:%[0-9]+]]:_(<2 x i32>) = G_UNMERGE_VALUES [[COPY]](<4 x i32>)
+    ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(<2 x i64>) = G_SEXT [[UV]](<2 x i32>)
+    ; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(<2 x i64>) = G_SEXT [[UV1]](<2 x i32>)
+    ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<2 x i32>), [[UV3:%[0-9]+]]:_(<2 x i32>) = G_UNMERGE_VALUES [[COPY1]](<4 x i32>)
+    ; CHECK-NEXT: [[SEXT2:%[0-9]+]]:_(<2 x i64>) = G_SEXT [[UV2]](<2 x i32>)
+    ; CHECK-NEXT: [[SEXT3:%[0-9]+]]:_(<2 x i64>) = G_SEXT [[UV3]](<2 x i32>)
+    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(<2 x i64>) = G_MUL [[SEXT]], [[SEXT2]]
+    ; CHECK-NEXT: [[MUL1:%[0-9]+]]:_(<2 x i64>) = G_MUL [[SEXT1]], [[SEXT3]]
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(i64) = G_CONSTANT i64 31
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x i64>) = G_BUILD_VECTOR [[C]](i64), [[C]](i64)
+    ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(<2 x i64>) = G_ASHR [[MUL]], [[BUILD_VECTOR]](<2 x i64>)
+    ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:_(<2 x i64>) = G_ASHR [[MUL1]], [[BUILD_VECTOR]](<2 x i64>)
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x i32>) = G_TRUNC [[ASHR]](<2 x i64>)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(<2 x i32>) = G_TRUNC [[ASHR1]](<2 x i64>)
+    ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x i32>) = G_CONCAT_VECTORS [[TRUNC]](<2 x i32>), [[TRUNC1]](<2 x i32>)
+    ; CHECK-NEXT: $q0 = COPY [[CONCAT_VECTORS]](<4 x i32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %0:_(<4 x i32>) = COPY $q0
+    %1:_(<4 x i32>) = COPY $q1
+    %2:_(<4 x i32>) = G_SMULFIX %0, %1, 31
+    $q0 = COPY %2(<4 x i32>)
+    RET_ReallyLR implicit $q0
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index 2e0b781..70dbeb7 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -492,8 +492,8 @@
 # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
 # DEBUG-NEXT: G_SMULFIX (opcode {{[0-9]+}}): 1 type index, 1 imm index
-# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: G_UMULFIX (opcode {{[0-9]+}}): 1 type index, 1 imm index
 # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
diff --git a/llvm/test/CodeGen/AArch64/smul_fix.ll b/llvm/test/CodeGen/AArch64/smul_fix.ll
index dacce72..f99d20a 100644
--- a/llvm/test/CodeGen/AArch64/smul_fix.ll
+++ b/llvm/test/CodeGen/AArch64/smul_fix.ll
@@ -1,37 +1,65 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -global-isel=0 | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 define i32 @func(i32 %x, i32 %y) nounwind {
-; CHECK-LABEL: func:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    smull x8, w0, w1
-; CHECK-NEXT:    lsr x9, x8, #32
-; CHECK-NEXT:    extr w0, w9, w8, #2
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: func:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    smull x8, w0, w1
+; CHECK-SD-NEXT:    lsr x9, x8, #32
+; CHECK-SD-NEXT:    extr w0, w9, w8, #2
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: func:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    smull x8, w0, w1
+; CHECK-GI-NEXT:    asr x0, x8, #2
+; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-GI-NEXT:    ret
   %tmp = call i32 @llvm.smul.fix.i32(i32 %x, i32 %y, i32 2)
   ret i32 %tmp
 }
 
 define i64 @func2(i64 %x, i64 %y) {
-; CHECK-LABEL: func2:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul x8, x0, x1
-; CHECK-NEXT:    smulh x9, x0, x1
-; CHECK-NEXT:    extr x0, x9, x8, #2
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: func2:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mul x8, x0, x1
+; CHECK-SD-NEXT:    smulh x9, x0, x1
+; CHECK-SD-NEXT:    extr x0, x9, x8, #2
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: func2:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    asr x8, x1, #63
+; CHECK-GI-NEXT:    asr x9, x0, #63
+; CHECK-GI-NEXT:    umulh x10, x0, x1
+; CHECK-GI-NEXT:    mul x8, x0, x8
+; CHECK-GI-NEXT:    madd x8, x9, x1, x8
+; CHECK-GI-NEXT:    mul x9, x0, x1
+; CHECK-GI-NEXT:    add x8, x8, x10
+; CHECK-GI-NEXT:    extr x0, x8, x9, #2
+; CHECK-GI-NEXT:    ret
   %tmp = call i64 @llvm.smul.fix.i64(i64 %x, i64 %y, i32 2)
   ret i64 %tmp
 }
 
 define i4 @func3(i4 %x, i4 %y) nounwind {
-; CHECK-LABEL: func3:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sbfx w8, w1, #0, #4
-; CHECK-NEXT:    sbfx w9, w0, #0, #4
-; CHECK-NEXT:    smull x8, w9, w8
-; CHECK-NEXT:    lsr x9, x8, #32
-; CHECK-NEXT:    extr w0, w9, w8, #2
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: func3:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sbfx w8, w1, #0, #4
+; CHECK-SD-NEXT:    sbfx w9, w0, #0, #4
+; CHECK-SD-NEXT:    smull x8, w9, w8
+; CHECK-SD-NEXT:    lsr x9, x8, #32
+; CHECK-SD-NEXT:    extr w0, w9, w8, #2
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: func3:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sbfx w8, w0, #0, #4
+; CHECK-GI-NEXT:    sbfx w9, w1, #0, #4
+; CHECK-GI-NEXT:    mul w8, w8, w9
+; CHECK-GI-NEXT:    sbfx w0, w8, #2, #6
+; CHECK-GI-NEXT:    ret
   %tmp = call i4 @llvm.smul.fix.i4(i4 %x, i4 %y, i32 2)
   ret i4 %tmp
 }
@@ -56,40 +84,69 @@
 }
 
 define i4 @func6(i4 %x, i4 %y) nounwind {
-; CHECK-LABEL: func6:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sbfx w8, w1, #0, #4
-; CHECK-NEXT:    sbfx w9, w0, #0, #4
-; CHECK-NEXT:    mul w0, w9, w8
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: func6:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sbfx w8, w1, #0, #4
+; CHECK-SD-NEXT:    sbfx w9, w0, #0, #4
+; CHECK-SD-NEXT:    mul w0, w9, w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: func6:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mul w0, w0, w1
+; CHECK-GI-NEXT:    ret
   %tmp = call i4 @llvm.smul.fix.i4(i4 %x, i4 %y, i32 0)
   ret i4 %tmp
 }
 
 define i64 @func7(i64 %x, i64 %y) nounwind {
-; CHECK-LABEL: func7:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul x8, x0, x1
-; CHECK-NEXT:    smulh x9, x0, x1
-; CHECK-NEXT:    extr x0, x9, x8, #32
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: func7:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mul x8, x0, x1
+; CHECK-SD-NEXT:    smulh x9, x0, x1
+; CHECK-SD-NEXT:    extr x0, x9, x8, #32
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: func7:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    asr x8, x1, #63
+; CHECK-GI-NEXT:    asr x9, x0, #63
+; CHECK-GI-NEXT:    umulh x10, x0, x1
+; CHECK-GI-NEXT:    mul x8, x0, x8
+; CHECK-GI-NEXT:    madd x8, x9, x1, x8
+; CHECK-GI-NEXT:    mul x9, x0, x1
+; CHECK-GI-NEXT:    add x8, x8, x10
+; CHECK-GI-NEXT:    extr x0, x8, x9, #32
+; CHECK-GI-NEXT:    ret
   %tmp = call i64 @llvm.smul.fix.i64(i64 %x, i64 %y, i32 32)
   ret i64 %tmp
 }
 
 define i64 @func8(i64 %x, i64 %y) nounwind {
-; CHECK-LABEL: func8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul x8, x0, x1
-; CHECK-NEXT:    smulh x9, x0, x1
-; CHECK-NEXT:    extr x0, x9, x8, #63
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: func8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mul x8, x0, x1
+; CHECK-SD-NEXT:    smulh x9, x0, x1
+; CHECK-SD-NEXT:    extr x0, x9, x8, #63
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: func8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    asr x8, x1, #63
+; CHECK-GI-NEXT:    asr x9, x0, #63
+; CHECK-GI-NEXT:    umulh x10, x0, x1
+; CHECK-GI-NEXT:    mul x8, x0, x8
+; CHECK-GI-NEXT:    madd x8, x9, x1, x8
+; CHECK-GI-NEXT:    mul x9, x0, x1
+; CHECK-GI-NEXT:    add x8, x8, x10
+; CHECK-GI-NEXT:    extr x0, x8, x9, #63
+; CHECK-GI-NEXT:    ret
   %tmp = call i64 @llvm.smul.fix.i64(i64 %x, i64 %y, i32 63)
   ret i64 %tmp
 }
 
-define <2 x i32> @vec(<2 x i32> %x, <2 x i32> %y) nounwind {
-; CHECK-LABEL: vec:
+define <2 x i32> @smulfix_2xi32_0(<2 x i32> %x, <2 x i32> %y) nounwind {
+; CHECK-LABEL: smulfix_2xi32_0:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mul v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT:    ret
@@ -97,8 +154,8 @@
   ret <2 x i32> %tmp
 }
 
-define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind {
-; CHECK-LABEL: vec2:
+define <4 x i32> @smulfix_4xi32_0(<4 x i32> %x, <4 x i32> %y) nounwind {
+; CHECK-LABEL: smulfix_4xi32_0:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
@@ -106,47 +163,127 @@
   ret <4 x i32> %tmp
 }
 
-define <4 x i64> @vec3(<4 x i64> %x, <4 x i64> %y) nounwind {
-; CHECK-LABEL: vec3:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, v2.d[1]
-; CHECK-NEXT:    mov x9, v0.d[1]
-; CHECK-NEXT:    fmov x10, d2
-; CHECK-NEXT:    fmov x11, d0
-; CHECK-NEXT:    mov x14, v3.d[1]
-; CHECK-NEXT:    mov x15, v1.d[1]
-; CHECK-NEXT:    mul x12, x11, x10
-; CHECK-NEXT:    mul x13, x9, x8
-; CHECK-NEXT:    smulh x8, x9, x8
-; CHECK-NEXT:    smulh x9, x11, x10
-; CHECK-NEXT:    fmov x10, d3
-; CHECK-NEXT:    fmov x11, d1
-; CHECK-NEXT:    mul x16, x11, x10
-; CHECK-NEXT:    extr x8, x8, x13, #32
-; CHECK-NEXT:    smulh x10, x11, x10
-; CHECK-NEXT:    extr x9, x9, x12, #32
-; CHECK-NEXT:    mul x11, x15, x14
-; CHECK-NEXT:    fmov d0, x9
-; CHECK-NEXT:    smulh x14, x15, x14
-; CHECK-NEXT:    extr x10, x10, x16, #32
-; CHECK-NEXT:    mov v0.d[1], x8
-; CHECK-NEXT:    fmov d1, x10
-; CHECK-NEXT:    extr x11, x14, x11, #32
-; CHECK-NEXT:    mov v1.d[1], x11
-; CHECK-NEXT:    ret
+define <4 x i32> @smulfix_4xi32(<4 x i32> %1, <4 x i32> %2) {
+; CHECK-SD-LABEL: smulfix_4xi32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    smull2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT:    smull v3.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    mul v1.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    uzp2 v2.4s, v3.4s, v2.4s
+; CHECK-SD-NEXT:    shl v0.4s, v2.4s, #17
+; CHECK-SD-NEXT:    usra v0.4s, v1.4s, #15
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: smulfix_4xi32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    smull v2.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    smull2 v1.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT:    shrn v0.2s, v2.2d, #15
+; CHECK-GI-NEXT:    shrn2 v0.4s, v1.2d, #15
+; CHECK-GI-NEXT:    ret
+  %m = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> %1, <4 x i32> %2, i32 15)
+  ret <4 x i32> %m
+}
+
+define <4 x i64> @smulfix_4xi64(<4 x i64> %x, <4 x i64> %y) nounwind {
+; CHECK-SD-LABEL: smulfix_4xi64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov x8, v2.d[1]
+; CHECK-SD-NEXT:    mov x9, v0.d[1]
+; CHECK-SD-NEXT:    fmov x10, d2
+; CHECK-SD-NEXT:    fmov x11, d0
+; CHECK-SD-NEXT:    mov x14, v3.d[1]
+; CHECK-SD-NEXT:    mov x15, v1.d[1]
+; CHECK-SD-NEXT:    mul x12, x11, x10
+; CHECK-SD-NEXT:    mul x13, x9, x8
+; CHECK-SD-NEXT:    smulh x8, x9, x8
+; CHECK-SD-NEXT:    smulh x9, x11, x10
+; CHECK-SD-NEXT:    fmov x10, d3
+; CHECK-SD-NEXT:    fmov x11, d1
+; CHECK-SD-NEXT:    mul x16, x11, x10
+; CHECK-SD-NEXT:    extr x8, x8, x13, #32
+; CHECK-SD-NEXT:    smulh x10, x11, x10
+; CHECK-SD-NEXT:    extr x9, x9, x12, #32
+; CHECK-SD-NEXT:    mul x11, x15, x14
+; CHECK-SD-NEXT:    fmov d0, x9
+; CHECK-SD-NEXT:    smulh x14, x15, x14
+; CHECK-SD-NEXT:    extr x10, x10, x16, #32
+; CHECK-SD-NEXT:    mov v0.d[1], x8
+; CHECK-SD-NEXT:    fmov d1, x10
+; CHECK-SD-NEXT:    extr x11, x14, x11, #32
+; CHECK-SD-NEXT:    mov v1.d[1], x11
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: smulfix_4xi64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    fmov x9, d2
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    mov d2, v2.d[1]
+; CHECK-GI-NEXT:    mov d0, v0.d[1]
+; CHECK-GI-NEXT:    asr x10, x9, #63
+; CHECK-GI-NEXT:    asr x12, x8, #63
+; CHECK-GI-NEXT:    mul x11, x8, x9
+; CHECK-GI-NEXT:    mul x10, x8, x10
+; CHECK-GI-NEXT:    umulh x8, x8, x9
+; CHECK-GI-NEXT:    madd x9, x12, x9, x10
+; CHECK-GI-NEXT:    fmov x12, d2
+; CHECK-GI-NEXT:    fmov x10, d0
+; CHECK-GI-NEXT:    mov d0, v3.d[1]
+; CHECK-GI-NEXT:    asr x13, x12, #63
+; CHECK-GI-NEXT:    asr x15, x10, #63
+; CHECK-GI-NEXT:    mul x14, x10, x12
+; CHECK-GI-NEXT:    mul x13, x10, x13
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    add x8, x9, x8
+; CHECK-GI-NEXT:    extr x8, x8, x11, #32
+; CHECK-GI-NEXT:    umulh x10, x10, x12
+; CHECK-GI-NEXT:    asr x1, x0, #63
+; CHECK-GI-NEXT:    fmov d0, x8
+; CHECK-GI-NEXT:    madd x12, x15, x12, x13
+; CHECK-GI-NEXT:    fmov x15, d3
+; CHECK-GI-NEXT:    fmov x13, d1
+; CHECK-GI-NEXT:    mov d1, v1.d[1]
+; CHECK-GI-NEXT:    asr x16, x15, #63
+; CHECK-GI-NEXT:    asr x18, x13, #63
+; CHECK-GI-NEXT:    mul x17, x13, x15
+; CHECK-GI-NEXT:    mul x16, x13, x16
+; CHECK-GI-NEXT:    add x9, x12, x10
+; CHECK-GI-NEXT:    extr x9, x9, x14, #32
+; CHECK-GI-NEXT:    umulh x13, x13, x15
+; CHECK-GI-NEXT:    mov v0.d[1], x9
+; CHECK-GI-NEXT:    madd x15, x18, x15, x16
+; CHECK-GI-NEXT:    fmov x16, d1
+; CHECK-GI-NEXT:    mul x18, x16, x1
+; CHECK-GI-NEXT:    asr x1, x16, #63
+; CHECK-GI-NEXT:    umulh x2, x16, x0
+; CHECK-GI-NEXT:    add x10, x15, x13
+; CHECK-GI-NEXT:    extr x10, x10, x17, #32
+; CHECK-GI-NEXT:    madd x18, x1, x0, x18
+; CHECK-GI-NEXT:    fmov d1, x10
+; CHECK-GI-NEXT:    mul x16, x16, x0
+; CHECK-GI-NEXT:    add x12, x18, x2
+; CHECK-GI-NEXT:    extr x11, x12, x16, #32
+; CHECK-GI-NEXT:    mov v1.d[1], x11
+; CHECK-GI-NEXT:    ret
   %tmp = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> %x, <4 x i64> %y, i32 32)
   ret <4 x i64> %tmp
 }
 
 define <4 x i16> @widemul(<4 x i16> %x, <4 x i16> %y) nounwind {
-; CHECK-LABEL: widemul:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    smull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT:    shrn v1.4h, v0.4s, #16
-; CHECK-NEXT:    xtn v2.4h, v0.4s
-; CHECK-NEXT:    shl v0.4h, v1.4h, #14
-; CHECK-NEXT:    usra v0.4h, v2.4h, #2
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: widemul:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    smull v0.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    shrn v1.4h, v0.4s, #16
+; CHECK-SD-NEXT:    xtn v2.4h, v0.4s
+; CHECK-SD-NEXT:    shl v0.4h, v1.4h, #14
+; CHECK-SD-NEXT:    usra v0.4h, v2.4h, #2
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: widemul:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    smull v0.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    shrn v0.4h, v0.4s, #2
+; CHECK-GI-NEXT:    ret
   %tmp = call <4 x i16> @llvm.smul.fix.v4i16(<4 x i16> %x, <4 x i16> %y, i32 2)
   ret <4 x i16> %tmp
 }