PowerPC32:PIC: Update to bcl to fix branch prediction mis-predict issue (#134140)

Update `bl` to `bcl 20, 31, .+4` for 32bit PIC code gen so the link
stack is 
not corrupted and cause mis-predict for the branch predictor.

fixes: https://github.com/llvm/llvm-project/issues/128644
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index ba6653e..f07331b 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -967,9 +967,9 @@
     // L1$pb:
     MCSymbol *PICBase = MF->getPICBaseSymbol();
 
-    // Emit the 'bl'.
+    // Emit 'bcl 20,31,.+4' so the link stack is not corrupted.
     EmitToStreamer(*OutStreamer,
-                   MCInstBuilder(PPC::BL)
+                   MCInstBuilder(PPC::BCLalways)
                        // FIXME: We would like an efficient form for this, so we
                        // don't have to do a lot of extra uniquing.
                        .addExpr(MCSymbolRefExpr::create(PICBase, OutContext)));
diff --git a/llvm/test/CodeGen/PowerPC/mcm-5.ll b/llvm/test/CodeGen/PowerPC/mcm-5.ll
index b88f405..f9629b5 100644
--- a/llvm/test/CodeGen/PowerPC/mcm-5.ll
+++ b/llvm/test/CodeGen/PowerPC/mcm-5.ll
@@ -51,7 +51,7 @@
   ret i32 %5
 }
 ; CHECK-LABEL: test_jump_table:
-; CHECK-NOT:       bl .L0$pb
+; CHECK-NOT:       bcl 20, 31, .L0$pb
 
 ; CHECK:       addis [[REG1:[0-9]+]], 2, .LC[[TOCNUM:[0-9]+]]@toc@ha
 ; CHECK:       ld [[REG2:[0-9]+]], .LC[[TOCNUM]]@toc@l([[REG1]])
@@ -64,7 +64,7 @@
 ; CHECK-NEXT: .long	.LBB0_{{[0-9]+}}-.LJTI0_0
 
 ; LARGE-LABEL: test_jump_table:
-; LARGE:       bl .L0$pb
+; LARGE:       bcl 20, 31, .L0$pb
 ; LARGE-NEXT:  .L0$pb:
 ; LARGE:       mflr [[REGBASE:[0-9]+]]
 
diff --git a/llvm/test/CodeGen/PowerPC/ppc32-pic-bcl.ll b/llvm/test/CodeGen/PowerPC/ppc32-pic-bcl.ll
new file mode 100644
index 0000000..1e938b1
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/ppc32-pic-bcl.ll
@@ -0,0 +1,29 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=powerpc -relocation-model=pic | \
+; RUN:    FileCheck -check-prefixes=SMALL %s
+
+@val = global i8 0, align 1
+
+define zeroext i8 @testbcl() nounwind {
+; SMALL-LABEL: testbcl:
+; SMALL:       # %bb.0: # %entry
+; SMALL-NEXT:    mflr 0
+; SMALL-NEXT:    stwu 1, -16(1)
+; SMALL-NEXT:    stw 30, 8(1)
+; SMALL-NEXT:    stw 0, 20(1)
+; SMALL-NEXT:    bcl 20, 31, .L0$pb
+; SMALL-NEXT:  .L0$pb:
+; SMALL-NEXT:    mflr 30
+; SMALL-NEXT:    lwz 3, .L0$poff-.L0$pb(30)
+; SMALL-NEXT:    add 30, 3, 30
+; SMALL-NEXT:    lwz 3, .LC0-.LTOC(30)
+; SMALL-NEXT:    lbz 3, 0(3)
+; SMALL-NEXT:    lwz 0, 20(1)
+; SMALL-NEXT:    lwz 30, 8(1)
+; SMALL-NEXT:    addi 1, 1, 16
+; SMALL-NEXT:    mtlr 0
+; SMALL-NEXT:    blr
+entry:
+  %0 = load i8, ptr @val, align 1
+  ret i8 %0
+}
diff --git a/llvm/test/CodeGen/PowerPC/ppc32-pic-large.ll b/llvm/test/CodeGen/PowerPC/ppc32-pic-large.ll
index 2f0b929..7be1a80 100644
--- a/llvm/test/CodeGen/PowerPC/ppc32-pic-large.ll
+++ b/llvm/test/CodeGen/PowerPC/ppc32-pic-large.ll
@@ -55,7 +55,7 @@
 ; LARGE-BSS-NEXT:  foo:
 ; LARGE-BSS:         stwu 1, -32(1)
 ; LARGE-BSS:         stw 30, 24(1)
-; LARGE-BSS:         bl [[PB]]
+; LARGE-BSS:         bcl 20, 31, [[PB]]
 ; LARGE-BSS-NEXT:  [[PB]]:
 ; LARGE-BSS:         mflr 30
 ; LARGE-BSS:         lwz [[REG:[0-9]+]], [[POFF]]-[[PB]](30)
diff --git a/llvm/test/CodeGen/PowerPC/ppc32-pic.ll b/llvm/test/CodeGen/PowerPC/ppc32-pic.ll
index aed9941..f7d8df9 100644
--- a/llvm/test/CodeGen/PowerPC/ppc32-pic.ll
+++ b/llvm/test/CodeGen/PowerPC/ppc32-pic.ll
@@ -19,7 +19,7 @@
 ; SMALL:         stwu 1, -32(1)
 ; SMALL:         stw 30, 24(1)
 ; SMALL-BSS:     bl _GLOBAL_OFFSET_TABLE_@local-4
-; SMALL-SECURE:  bl .L0$pb
+; SMALL-SECURE:  bcl 20, 31, .L0$pb
 ; SMALL:         mflr 30
 ; SMALL-SECURE:  addis 30, 30, _GLOBAL_OFFSET_TABLE_-.L0$pb@ha
 ; SMALL-SECURE:  addi 30, 30, _GLOBAL_OFFSET_TABLE_-.L0$pb@l