[RISCV][PowerPC] Fix google/benchmark benchmark::cycleclock::Now

This is a cherrypick of the upstream fix commit a77d5f7 onto
the llvm-test-suite's `MicroBenchmarks/libs/benchmark-1.3.0`,
to match the same cherrypick in the LLVM monorepo.
This fixes 32-bit RISC-V compilation and the issues
mentioned in https://github.com/google/benchmark/pull/955.
An additional cherrypick of ecc1685 fixes some minor formatting
issues introduced by the preceding commit.

Differential Revision: https://reviews.llvm.org/D78456
diff --git a/MicroBenchmarks/libs/benchmark-1.3.0/README.llvm b/MicroBenchmarks/libs/benchmark-1.3.0/README.llvm
index 8654e62..daac9ad 100644
--- a/MicroBenchmarks/libs/benchmark-1.3.0/README.llvm
+++ b/MicroBenchmarks/libs/benchmark-1.3.0/README.llvm
@@ -12,3 +12,9 @@
   is applied on top of v1.3.0 to add NetBSD Support.
 * https://github.com/google/benchmark/commit/4abdfbb802d1b514703223f5f852ce4a507d32d2
   is applied on top of v1.3.0 to add RISC-V timer support.
+* https://github.com/google/benchmark/commit/a77d5f70efaebe2b7e8c10134526a23a7ce7ef35
+  and
+  https://github.com/google/benchmark/commit/ecc1685340f58f7fe6b707036bc0bb1fccabb0c1
+  are applied on top of v1.3.0 to fix timestamp-related inline asm issues and
+  32-bit RISC-V build failures. The second cherrypicked commit fixes formatting
+  issues introduced by the first one.
diff --git a/MicroBenchmarks/libs/benchmark-1.3.0/src/cycleclock.h b/MicroBenchmarks/libs/benchmark-1.3.0/src/cycleclock.h
index 14ffd97..08269b1 100644
--- a/MicroBenchmarks/libs/benchmark-1.3.0/src/cycleclock.h
+++ b/MicroBenchmarks/libs/benchmark-1.3.0/src/cycleclock.h
@@ -84,13 +84,21 @@
   return (high << 32) | low;
 #elif defined(__powerpc__) || defined(__ppc__)
   // This returns a time-base, which is not always precisely a cycle-count.
-  int64_t tbl, tbu0, tbu1;
-  asm("mftbu %0" : "=r"(tbu0));
-  asm("mftb  %0" : "=r"(tbl));
-  asm("mftbu %0" : "=r"(tbu1));
-  tbl &= -static_cast<int64_t>(tbu0 == tbu1);
-  // high 32 bits in tbu1; low 32 bits in tbl  (tbu0 is garbage)
-  return (tbu1 << 32) | tbl;
+#if defined(__powerpc64__) || defined(__ppc64__)
+  int64_t tb;
+  asm volatile("mfspr %0, 268" : "=r"(tb));
+  return tb;
+#else
+  uint32_t tbl, tbu0, tbu1;
+  asm volatile(
+      "mftbu %0\n"
+      "mftbl %1\n"
+      "mftbu %2"
+      : "=r"(tbu0), "=r"(tbl), "=r"(tbu1));
+  tbl &= -static_cast<int32_t>(tbu0 == tbu1);
+  // high 32 bits in tbu1; low 32 bits in tbl  (tbu0 is no longer needed)
+  return (static_cast<uint64_t>(tbu1) << 32) | tbl;
+#endif
 #elif defined(__sparc__)
   int64_t tick;
   asm(".byte 0x83, 0x41, 0x00, 0x00");
@@ -167,16 +175,22 @@
 #elif defined(__riscv) // RISC-V
   // Use RDCYCLE (and RDCYCLEH on riscv32)
 #if __riscv_xlen == 32
-  uint64_t cycles_low, cycles_hi0, cycles_hi1;
-  asm("rdcycleh %0" : "=r"(cycles_hi0));
-  asm("rdcycle %0" : "=r"(cycles_lo));
-  asm("rdcycleh %0" : "=r"(cycles_hi1));
-  // This matches the PowerPC overflow detection, above
-  cycles_lo &= -static_cast<int64_t>(cycles_hi0 == cycles_hi1);
-  return (cycles_hi1 << 32) | cycles_lo;
+  uint32_t cycles_lo, cycles_hi0, cycles_hi1;
+  // This asm also includes the PowerPC overflow handling strategy, as above.
+  // Implemented in assembly because Clang insisted on branching.
+  asm volatile(
+      "rdcycleh %0\n"
+      "rdcycle %1\n"
+      "rdcycleh %2\n"
+      "sub %0, %0, %2\n"
+      "seqz %0, %0\n"
+      "sub %0, zero, %0\n"
+      "and %1, %1, %0\n"
+      : "=r"(cycles_hi0), "=r"(cycles_lo), "=r"(cycles_hi1));
+  return (static_cast<uint64_t>(cycles_hi1) << 32) | cycles_lo;
 #else
   uint64_t cycles;
-  asm("rdcycle %0" : "=r"(cycles));
+  asm volatile("rdcycle %0" : "=r"(cycles));
   return cycles;
 #endif
 #else