| //===- Target/X86/X86PreAMXConfig.cpp - ------------------------*- C++ -*-===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| /// Insert tilecfg for each key AMX intrinsic area. |
| /// Every tile operand of a key AMX intrinsic must come from a tile load, and |
| /// the tile defined by a key AMX intrinsic must be consumed by a tile store. |
| /// Take tdpbssd for example: |
| /// -------------------------------------------------------------------------- |
| /// %t1 = call x86_amx @llvm.x86.tileloadd64.internal(...) key |
| /// %t2 = call x86_amx @llvm.x86.tileloadd64.internal(...) | |
| /// %t3 = call x86_amx @llvm.x86.tileloadd64.internal(...) amx |
| /// %td = tail call x86_amx @llvm.x86.tdpbssd.internal(t1, t2, t3) | |
| /// call void @llvm.x86.tilestored64.internal(... td) area |
| /// -------------------------------------------------------------------------- |
| /// This pass inserts a tilecfg before every key AMX area, something like: |
| /// -------------------------------------------------------------------------- |
| /// %cfgmem = alloca <16 x i32>, align 4 * allocate mem |
| /// store <16 x i32> zeroinitializer, <16 x i32>* %cfgmem * zero init |
| /// ... |
| /// ... pre-config shape of %t1 * |
| /// store volatile i8 %m, i8* %amx.tmm.0.shape.row, align 1 * |
| /// store volatile i16 %k, i16* %amx.tmm.0.shape.col, align 2 * pre-config |
| /// ... * |
| /// ... pre-config shape of %t2 * shapes |
| /// store volatile i8 %k, i8* %amx.tmm.1.shape.row, align 1 * |
| /// store volatile i16 %n, i16* %amx.tmm.1.shape.col, align 2 * |
| /// ... |
| /// call void @llvm.x86.ldtilecfg(i8* %cfgmem) * tile config |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| #include "X86.h" |
| #include "llvm/ADT/SmallSet.h" |
| #include "llvm/Analysis/TargetTransformInfo.h" |
| #include "llvm/CodeGen/Passes.h" |
| #include "llvm/CodeGen/TargetPassConfig.h" |
| #include "llvm/CodeGen/ValueTypes.h" |
| #include "llvm/IR/DataLayout.h" |
| #include "llvm/IR/Function.h" |
| #include "llvm/IR/IRBuilder.h" |
| #include "llvm/IR/Instructions.h" |
| #include "llvm/IR/IntrinsicInst.h" |
| #include "llvm/IR/IntrinsicsX86.h" |
| #include "llvm/IR/PatternMatch.h" |
| #include "llvm/InitializePasses.h" |
| #include "llvm/Pass.h" |
| #include "llvm/Support/raw_ostream.h" |
| #include "llvm/Target/TargetMachine.h" |
| |
| using namespace llvm; |
| using namespace PatternMatch; |
| |
| #define DEBUG_TYPE "pre-amx-config" |
| |
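| // An intrinsic is treated as an AMX intrinsic if it consumes or produces an |
| // x86_amx tile value. |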
| static bool isAMXIntrinsic(IntrinsicInst *II) { |
| for (Value *Operand : II->operands()) |
| if (Operand->getType()->isX86_AMXTy()) |
| return true; |
| return II->getType()->isX86_AMXTy(); |
| } |
| |
| static bool isTileLoad(IntrinsicInst *II) { |
| return II->getIntrinsicID() == Intrinsic::x86_tileloadd64_internal || |
| II->getIntrinsicID() == Intrinsic::x86_tileloaddt164_internal; |
| } |
| |
| static bool isTileStore(IntrinsicInst *II) { |
| return II->getIntrinsicID() == Intrinsic::x86_tilestored64_internal; |
| } |
| |
| #ifndef NDEBUG |
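| // A pure tile def: the intrinsic produces an x86_amx value without consuming |
| // any tile operands (e.g. a tile load). |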
| static bool onlyTileDef(IntrinsicInst *II) { |
| for (Value *Operand : II->operands()) |
| if (Operand->getType()->isX86_AMXTy()) |
| return false; |
| return II->getType()->isX86_AMXTy(); |
| } |
| |
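| // Instructions that may break the volatile model: ordinary calls (which might |
| // clobber the tile configuration) and terminators. |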
| static bool brokenVolatile(Instruction *I) { |
| // Todo: this is a weak way to identify a normal call. |
| if ((isa<CallInst>(I) && !isa<IntrinsicInst>(I)) || I->isTerminator()) |
| return true; |
| return false; |
| } |
| #endif |
| |
| namespace { |
| class X86PreAMXConfig { |
| Function &F; |
| |
| public: |
| X86PreAMXConfig(Function &Func) : F(Func) {} |
| bool preTileConfig(); |
| bool addTileConfig(Instruction *ModelStart, SmallVector<Value *, 8> &Shapes); |
| bool findConfigShapes( |
| DenseMap<Instruction *, SmallVector<Value *, 8>> &PosAndShapes); |
| bool getKeyAMXShapes(IntrinsicInst *KeyAMX, SmallVector<Value *, 8> &Shapes); |
| bool preWriteTileCfg(Value *I8Ptr, Instruction *Pos, |
| SmallVector<Value *, 8> &Shapes); |
| BasicBlock::iterator |
| getShapesAndConfigPosEnd(BasicBlock::iterator Iter, |
| SmallVector<Value *, 8> &Shapes); |
| bool checkVolatileModel(SmallSet<Value *, 4> &Loads, IntrinsicInst *Store, |
| IntrinsicInst *KeyAMX); |
| }; |
| |
| // Write the shapes into the tilecfg memory in order. This may not be the |
| // final assignment, because the first shape may not correspond to the first |
| // tmm register, so we fix that up in X86FastTileConfig::materializeTileCfg() |
| // after register allocation. |
| // For example: |
| // -------------------------------------------------------------------------- |
| // zeroinitialize tilecfg's mem (of ldtilecfg) |
| // -------------------------------------------------------------------------- |
| // ... pre-config shape of %t1 * |
| // %amx.tmm.0.shape.row = getelementptr i8, i8* %mem, i64 48 * |
| // %amx.tmm.0.shape.col = getelementptr i16, i16* %mem, i64 16 * |
| // store volatile i8 %m, i8* %amx.tmm.0.shape.row, align 1 * |
| // store volatile i16 %k, i16* %amx.tmm.0.shape.col, align 2 * pre-config |
| // ... * |
| // ... pre-config shape of %t2 * |
| // %amx.tmm.1.shape.row = getelementptr i8, i8* %mem, i64 49 * |
| // %amx.tmm.1.shape.col = getelementptr i16, i16* %mem, i64 18 * |
| // store volatile i8 %k, i8* %amx.tmm.1.shape.row, align 1 * shapes |
| // store volatile i16 %n, i16* %amx.tmm.1.shape.col, align 2 * |
| // ... * |
| // ... pre-config shape of %t3 * of |
| // %amx.tmm.2.shape.row = getelementptr i8, i8* %mem, i64 50 * |
| // %amx.tmm.2.shape.col = getelementptr i16, i16* %mem, i64 20 * |
| // store volatile i8 %m, i8* %amx.tmm.2.shape.row, align 1 * |
| // store volatile i16 %n, i16* %amx.tmm.2.shape.col, align 2 * |
| // ... * tiles |
| // ... pre-config shape of %td * |
| // %amx.tmm.3.shape.row = getelementptr i8, i8* %mem, i64 51 * |
| // %amx.tmm.3.shape.col = getelementptr i16, i16* %mem, i64 22 * |
| // store volatile i8 %m, i8* %amx.tmm.3.shape.row, align 1 * |
| // store volatile i16 %n, i16* %amx.tmm.3.shape.col, align 2 * |
| // -------------------------------------------------------------------------- |
| // call void @llvm.x86.ldtilecfg(i8* %mem) * tile config |
| // -------------------------------------------------------------------------- |
| // %t1 = call x86_amx @llvm.x86.tileloadd64.internal(m, k, ...) key |
| // %t2 = call x86_amx @llvm.x86.tileloadd64.internal(k, n, ...) |
| // %t3 = call x86_amx @llvm.x86.tileloadd64.internal(m, n, ...) amx |
| // %td = tail call x86_amx @llvm.x86.tdpbssd.internal(m, n, k, t1, t2, t3) |
| // call void @llvm.x86.tilestored64.internal(... td) area |
| // -------------------------------------------------------------------------- |
| bool X86PreAMXConfig::preWriteTileCfg(Value *I8Ptr, Instruction *Pos, |
| SmallVector<Value *, 8> &Shapes) { |
| bool Write = false; |
| LLVMContext &Ctx = Pos->getParent()->getContext(); |
| Type *I8Ty = Type::getInt8Ty(Ctx); |
| Type *I16Ty = Type::getInt16Ty(Ctx); |
| |
| // TODO: Currently we set Palette = 1 by default; other values may be |
| // assigned in the future. |
| Value *PaletteOffset = ConstantInt::get(Type::getInt64Ty(Ctx), 0); |
| Value *PaletteValue = ConstantInt::get(Type::getInt8Ty(Ctx), 1); |
| Value *PalettePos = |
| GetElementPtrInst::Create(I8Ty, I8Ptr, PaletteOffset, "", Pos); |
| new StoreInst(PaletteValue, PalettePos, Pos); |
| |
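| // Layout of the 64-byte ldtilecfg memory used below: byte 0 holds the |
| // palette, bytes 16..47 hold the 16-bit column sizes (colsb) and bytes |
| // 48..63 hold the 8-bit row counts of tmm0..tmm15. |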
| for (int I = 0, E = Shapes.size() / 2; I < E; I++) { |
| Value *RowOffset = ConstantInt::get(Type::getInt64Ty(Ctx), 48 + I); |
| Value *ColOffset = ConstantInt::get(Type::getInt64Ty(Ctx), 16 + I * 2); |
| const std::string ShapeName = "amx.tmm." + itostr(I); |
| Value *RowPos = GetElementPtrInst::Create(I8Ty, I8Ptr, RowOffset, |
| ShapeName + ".shape.row", Pos); |
| Value *ColPos = GetElementPtrInst::Create(I8Ty, I8Ptr, ColOffset, "", Pos); |
| ColPos = new BitCastInst(ColPos, PointerType::get(I16Ty, 0), |
| ShapeName + ".shape.col", Pos); |
| Value *Row = Shapes[I * 2]; |
| Value *Col = Shapes[I * 2 + 1]; |
| Row = new TruncInst(Row, I8Ty, "", Pos); |
| new StoreInst(Row, RowPos, Pos); |
| new StoreInst(Col, ColPos, Pos); |
| Write = true; |
| } |
| return Write; |
| } |
| |
| bool X86PreAMXConfig::addTileConfig(Instruction *ModelStart, |
| SmallVector<Value *, 8> &Shapes) { |
| Module *M = F.getParent(); |
| IRBuilder<> Builder(ModelStart); |
| const DataLayout &DL = M->getDataLayout(); |
| unsigned AddrSpace = DL.getAllocaAddrSpace(); |
| LLVMContext &Ctx = Builder.getContext(); |
| Type *V512Ty = VectorType::get(Builder.getInt32Ty(), 16, false); |
| Align Alignment = DL.getPrefTypeAlign(Type::getInt32Ty(Ctx)); |
| |
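| // Allocate the 64-byte (<16 x i32>) config memory in the entry block so the |
| // stack slot dominates every use. |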
| AllocaInst *Addr = |
| new AllocaInst(V512Ty, AddrSpace, "", &F.getEntryBlock().front()); |
| Addr->setAlignment(Alignment); |
| Value *I8Ptr = Builder.CreateBitCast(Addr, Builder.getInt8PtrTy()); |
| |
| std::array<Value *, 1> Args = {I8Ptr}; |
| Instruction *Cfg = |
| Builder.CreateIntrinsic(Intrinsic::x86_ldtilecfg_internal, None, Args); |
| |
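| // Zero-initialize the config memory; the shape stores created by |
| // preWriteTileCfg below are inserted between this store and the ldtilecfg. |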
| Value *Val0 = Constant::getNullValue(V512Ty); |
| Instruction *Init0 = new StoreInst(Val0, Addr, false, Alignment, Cfg); |
| assert(Init0 && "Failed to zero-initialize the cfg mem!"); |
| |
| preWriteTileCfg(I8Ptr, Cfg, Shapes); |
| |
| return Init0; |
| } |
| |
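| // Check the "volatile model": every tile load of the area must feed the key |
| // AMX intrinsic, and the tile defined by the key AMX intrinsic must be the |
| // value that is tile-stored. |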
| // Todo: We may need to handle the "more than one store" case in the future. |
| bool X86PreAMXConfig::checkVolatileModel(SmallSet<Value *, 4> &Loads, |
| IntrinsicInst *Store, |
| IntrinsicInst *KeyAMX) { |
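| // Operand 4 of tilestored64.internal is the x86_amx value being stored. |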
| Value *ST = Store->getOperand(4); |
| |
| // Only has tileload and tilestore. |
| if (!KeyAMX) |
| return (Loads.size() == 1) && Loads.contains(ST); |
| |
| // All Loads should be operands of KeyAMX. |
| // All tile operands of KeyAMX should come from Loads. |
| for (Value *Op : KeyAMX->operands()) { |
| if (Op->getType()->isX86_AMXTy()) |
| if (!Loads.erase(Op)) |
| return false; |
| } |
| |
| // The def of KeyAMX should be stored into mem. |
| // Todo: can a key amx intrinsic have no def? |
| return Loads.empty() && (ST == cast<Value>(KeyAMX)); |
| } |
| |
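| // Collect the (row, col) shapes of the key AMX intrinsic's tile operands and, |
| // if the key is not a tile store, the shape of its def. |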
| bool X86PreAMXConfig::getKeyAMXShapes(IntrinsicInst *KeyAMX, |
| SmallVector<Value *, 8> &Shapes) { |
| for (unsigned I = 0; I < KeyAMX->getNumOperands(); I++) { |
| Value *Op = KeyAMX->getOperand(I); |
| if (!Op->getType()->isX86_AMXTy()) |
| continue; |
| IntrinsicInst *TileDef = dyn_cast<IntrinsicInst>(Op); |
| assert((TileDef && isTileLoad(TileDef)) && |
| "All of KeyAMX's tile definitions should come from TileLoad!"); |
| Shapes.push_back(TileDef->getOperand(0)); |
| Shapes.push_back(TileDef->getOperand(1)); |
| } |
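| // If the key intrinsic is not the tile store itself, its first two operands |
| // give the shape of the tile it defines. |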
| if (!isTileStore(KeyAMX)) { |
| Shapes.push_back(KeyAMX->getOperand(0)); |
| Shapes.push_back(KeyAMX->getOperand(1)); |
| } |
| return !Shapes.empty(); |
| } |
| |
| // Collect the shapes and skip over the area of the current key amx intrinsic. |
| // |
| // For example: |
| // ... |
| // -------------------------------------------------------------------------- |
| // %t1 = call x86_amx @llvm.x86.tileloadd64.internal(m, k, ...) record (m,k) |
| // %t2 = call x86_amx @llvm.x86.tileloadd64.internal(k, n, ...) record (k,n) |
| // %t3 = call x86_amx @llvm.x86.tileloadd64.internal(m, n, ...) record (m,n) |
| // %td = call x86_amx @llvm.x86.tdpbssd.internal(...t1, t2, t3) |
| // call void @llvm.x86.tilestored64.internal(m, n,... td) <--PosEnd record (m,n) |
| // -------------------------------------------------------------------------- |
| BasicBlock::iterator |
| X86PreAMXConfig::getShapesAndConfigPosEnd(BasicBlock::iterator Iter, |
| SmallVector<Value *, 8> &Shapes) { |
| IntrinsicInst *KeyAMX = nullptr; |
| BasicBlock *BB = Iter->getParent(); |
| BasicBlock::iterator PosEnd = BB->end(); |
| SmallSet<Value *, 4> Loads; |
| |
| // Treat the TileStore as the "config position end" and check the volatile model. |
| for (auto I = Iter, E = BB->end(); I != E; ++I) { |
| assert(!brokenVolatile(&*I) && "Did not reach the tile store!"); |
| IntrinsicInst *II = dyn_cast<IntrinsicInst>(&*I); |
| if (!II || !isAMXIntrinsic(II)) |
| continue; |
| |
| if (isTileLoad(II)) { |
| Loads.insert(II); |
| } else if (isTileStore(II)) { |
| if (!checkVolatileModel(Loads, II, KeyAMX)) |
| report_fatal_error("Not Volatile AMX Model!"); |
| PosEnd = I; |
| break; |
| } else { |
| assert(!KeyAMX && "Too many key amx intrinsics!"); |
| KeyAMX = II; |
| } |
| } |
| assert(PosEnd != BB->end() && "Did not find a TileStore!"); |
| |
| // Treat the TileStore as the KeyAMX if the area only contains TileLoad and |
| // TileStore. |
| if (!KeyAMX) |
| KeyAMX = dyn_cast<IntrinsicInst>(&*PosEnd); |
| |
| // Get Shapes in order. |
| assert(Shapes.empty() && "Shapes should be clean."); |
| getKeyAMXShapes(KeyAMX, Shapes); |
| |
| return PosEnd; |
| } |
| |
| // Record each key amx area's shapes together with its config position. |
| // The first tileload of the area is used as the position. |
| // For example: |
| // ... |
| // -------------------------------------------------------------------------- |
| // %t1 = call x86_amx @llvm.x86.tileloadd64.internal(m, k, ...) <-- pos |
| // %t2 = call x86_amx @llvm.x86.tileloadd64.internal(k, n, ...) / |
| // %t3 = call x86_amx @llvm.x86.tileloadd64.internal(m, n, ...) shapes: |
| // %td = call x86_amx @llvm.x86.tdpbssd.internal(...t1, t2, t3) (m,k)(k,n) |
| // call void @llvm.x86.tilestored64.internal(m, n,... td) (m,n)(m,n) |
| // -------------------------------------------------------------------------- |
| bool X86PreAMXConfig::findConfigShapes( |
| DenseMap<Instruction *, SmallVector<Value *, 8>> &PosAndShapes) { |
| bool Find = false; |
| for (BasicBlock &BB : F) { |
| for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ++I) { |
| IntrinsicInst *II = dyn_cast<IntrinsicInst>(&*I); |
| if (!II) |
| continue; |
| if (!isAMXIntrinsic(II)) |
| continue; |
| assert(onlyTileDef(II) && "Not volatile model for AMX at O0!"); |
| |
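| // The first AMX intrinsic of the area is the config insertion point; advance |
| // the iterator to the tile store that ends the area. |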
| I = getShapesAndConfigPosEnd(I, PosAndShapes[&*I]); |
| Find = true; |
| } |
| } |
| return Find; |
| } |
| |
| // Insert ldtilecfg and pre-config the shapes for each key AMX intrinsic area. |
| // e.g. (key amx = tdpbssd) |
| // -------------------------------------------------------------------------- |
| // %cfgmem = alloca <16 x i32>, align 4 * allocate mem |
| // store <16 x i32> zeroinitializer, <16 x i32>* %cfgmem * zero init |
| // ... |
| // ... pre-config shape of %t1 * |
| // store volatile i8 %m, i8* %amx.tmm.0.shape.row, align 1 * |
| // store volatile i16 %k, i16* %amx.tmm.0.shape.col, align 2 * pre-config |
| // ... * |
| // ... pre-config shape of %t2 * |
| // store volatile i8 %k, i8* %amx.tmm.1.shape.row, align 1 * shapes |
| // store volatile i16 %n, i16* %amx.tmm.1.shape.col, align 2 * |
| // ... * |
| // ... pre-config shape of %t3 * of |
| // store volatile i8 %m, i8* %amx.tmm.2.shape.row, align 1 * |
| // store volatile i16 %n, i16* %amx.tmm.2.shape.col, align 2 * |
| // ... * tiles |
| // ... pre-config shape of %td * |
| // store volatile i8 %m, i8* %amx.tmm.3.shape.row, align 1 * |
| // store volatile i16 %n, i16* %amx.tmm.3.shape.col, align 2 * |
| // |
| // call void @llvm.x86.ldtilecfg(i8* %cfgmem) * pre-config |
| // -------------------------------------------------------------------------- |
| // %t1 = call x86_amx @llvm.x86.tileloadd64.internal(m, k, ...) key |
| // %t2 = call x86_amx @llvm.x86.tileloadd64.internal(k, n, ...) |
| // %t3 = call x86_amx @llvm.x86.tileloadd64.internal(m, n, ...) amx |
| // %td = tail call x86_amx @llvm.x86.tdpbssd.internal(m, n, k, t1, t2, t3) |
| // call void @llvm.x86.tilestored64.internal(... td) area |
| // -------------------------------------------------------------------------- |
| bool X86PreAMXConfig::preTileConfig() { |
| DenseMap<Instruction *, SmallVector<Value *, 8>> PosAndShapes; |
| bool NeedCfg = findConfigShapes(PosAndShapes); |
| if (!NeedCfg) |
| return false; |
| for (auto &IPAndShapes : PosAndShapes) |
| addTileConfig(IPAndShapes.first, IPAndShapes.second); |
| |
| return true; |
| } |
| } // anonymous namespace |
| |
| namespace { |
| |
| class X86PreAMXConfigPass : public FunctionPass { |
| public: |
| static char ID; |
| |
| X86PreAMXConfigPass() : FunctionPass(ID) { |
| initializeX86PreAMXConfigPassPass(*PassRegistry::getPassRegistry()); |
| } |
| |
| bool runOnFunction(Function &F) override { |
| TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>(); |
| bool C = false; |
| |
| // Prepare for fast register allocation at O0. |
| if (TM->getOptLevel() == CodeGenOpt::None) { |
| |
| // We pre-config each key AMX intrinsic at O0. |
| // In theory, one tile config can cover several AMX intrinsics, but |
| // it is very difficult to classify the tile shapes at O0. So here we |
| // keep things simple and pre-config every key AMX intrinsic. |
| X86PreAMXConfig PCFG(F); |
| C = PCFG.preTileConfig(); |
| } |
| |
| return C; |
| } |
| |
| void getAnalysisUsage(AnalysisUsage &AU) const override { |
| AU.setPreservesCFG(); |
| AU.addRequired<TargetPassConfig>(); |
| } |
| }; |
| |
| } // anonymous namespace |
| |
| static const char PassName[] = "Pre AMX Tile Config"; |
| char X86PreAMXConfigPass::ID = 0; |
| INITIALIZE_PASS_BEGIN(X86PreAMXConfigPass, DEBUG_TYPE, PassName, false, false) |
| INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) |
| INITIALIZE_PASS_END(X86PreAMXConfigPass, DEBUG_TYPE, PassName, false, false) |
| |
| FunctionPass *llvm::createX86PreAMXConfigPass() { |
| return new X86PreAMXConfigPass(); |
| } |