[X86] LowerTRUNCATE - use LowerTruncateVecPackWithSignBits for prefer-256 bit AVX512 cases during type legalization

If the AVX512 target will split the 512-bit vector truncation then try to use PACKSS/PACKUS first.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2c65984..44654a8 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -23113,14 +23113,16 @@
       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
     }
 
-    // Pre-AVX512 see if we can make use of PACKSS/PACKUS.
-    if (!Subtarget.hasAVX512()) {
+    // Pre-AVX512 (or prefer-256bit) see if we can make use of PACKSS/PACKUS.
+    if (!Subtarget.hasAVX512() ||
+        (InVT.is512BitVector() && VT.is256BitVector()))
       if (SDValue SignPack =
               LowerTruncateVecPackWithSignBits(VT, In, DL, Subtarget, DAG))
         return SignPack;
 
+    // Pre-AVX512 see if we can make use of PACKSS/PACKUS.
+    if (!Subtarget.hasAVX512())
       return LowerTruncateVecPack(VT, In, DL, Subtarget, DAG);
-    }
 
     // Otherwise let default legalization handle it.
     return SDValue();
diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
index 8dffb2c..7dd4af7 100644
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -1186,9 +1186,8 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpsraq $48, 32(%rdi), %ymm0
 ; CHECK-NEXT:    vpsraq $48, (%rdi), %ymm1
-; CHECK-NEXT:    vpmovqd %ymm1, %xmm1
-; CHECK-NEXT:    vpmovqd %ymm0, %xmm0
-; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    vpackssdw %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; CHECK-NEXT:    retq
   %a = load <8 x i64>, <8 x i64>* %x
   %b = ashr <8 x i64> %a, <i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48>