[GlobalISel][AMDGPU] Legalize saturating add/subtract
Add support in LegalizerHelper for lowering G_SADDSAT etc. either
using add/subtract-with-overflow or using max/min instructions.
Enable this lowering for AMDGPU so it can be tested. The legalization
rules are still approximate and skips out on using the clamp bit to
treat these as legal, which has never been used before. This also
doesn't yet try to deal with expanding SALU cases.
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index e420e75..5dcb5b3 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -2728,6 +2728,27 @@
case G_READ_REGISTER:
case G_WRITE_REGISTER:
return lowerReadWriteRegister(MI);
+ case G_UADDSAT:
+ case G_USUBSAT: {
+ // Try to make a reasonable guess about which lowering strategy to use. The
+ // target can override this with custom lowering and calling the
+ // implementation functions.
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+ if (LI.isLegalOrCustom({G_UMIN, Ty}))
+ return lowerAddSubSatToMinMax(MI);
+ return lowerAddSubSatToAddoSubo(MI);
+ }
+ case G_SADDSAT:
+ case G_SSUBSAT: {
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+
+ // FIXME: It would probably make more sense to see if G_SADDO is preferred,
+ // since it's a shorter expansion. However, we would need to figure out the
+ // preferred boolean type for the carry out for the query.
+ if (LI.isLegalOrCustom({G_SMIN, Ty}) && LI.isLegalOrCustom({G_SMAX, Ty}))
+ return lowerAddSubSatToMinMax(MI);
+ return lowerAddSubSatToAddoSubo(MI);
+ }
}
}
@@ -5316,6 +5337,151 @@
}
LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerAddSubSatToMinMax(MachineInstr &MI) {
+ Register Res = MI.getOperand(0).getReg();
+ Register LHS = MI.getOperand(1).getReg();
+ Register RHS = MI.getOperand(2).getReg();
+ LLT Ty = MRI.getType(Res);
+ bool IsSigned;
+ bool IsAdd;
+ unsigned BaseOp;
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("unexpected addsat/subsat opcode");
+ case TargetOpcode::G_UADDSAT:
+ IsSigned = false;
+ IsAdd = true;
+ BaseOp = TargetOpcode::G_ADD;
+ break;
+ case TargetOpcode::G_SADDSAT:
+ IsSigned = true;
+ IsAdd = true;
+ BaseOp = TargetOpcode::G_ADD;
+ break;
+ case TargetOpcode::G_USUBSAT:
+ IsSigned = false;
+ IsAdd = false;
+ BaseOp = TargetOpcode::G_SUB;
+ break;
+ case TargetOpcode::G_SSUBSAT:
+ IsSigned = true;
+ IsAdd = false;
+ BaseOp = TargetOpcode::G_SUB;
+ break;
+ }
+
+ if (IsSigned) {
+ // sadd.sat(a, b) ->
+ // hi = 0x7fffffff - smax(a, 0)
+ // lo = 0x80000000 - smin(a, 0)
+ // a + smin(smax(lo, b), hi)
+ // ssub.sat(a, b) ->
+ // lo = smax(a, -1) - 0x7fffffff
+ // hi = smin(a, -1) - 0x80000000
+ // a - smin(smax(lo, b), hi)
+ // TODO: AMDGPU can use a "median of 3" instruction here:
+ // a +/- med3(lo, b, hi)
+ uint64_t NumBits = Ty.getScalarSizeInBits();
+ auto MaxVal =
+ MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(NumBits));
+ auto MinVal =
+ MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
+ MachineInstrBuilder Hi, Lo;
+ if (IsAdd) {
+ auto Zero = MIRBuilder.buildConstant(Ty, 0);
+ Hi = MIRBuilder.buildSub(Ty, MaxVal, MIRBuilder.buildSMax(Ty, LHS, Zero));
+ Lo = MIRBuilder.buildSub(Ty, MinVal, MIRBuilder.buildSMin(Ty, LHS, Zero));
+ } else {
+ auto NegOne = MIRBuilder.buildConstant(Ty, -1);
+ Lo = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMax(Ty, LHS, NegOne),
+ MaxVal);
+ Hi = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMin(Ty, LHS, NegOne),
+ MinVal);
+ }
+ auto RHSClamped =
+ MIRBuilder.buildSMin(Ty, MIRBuilder.buildSMax(Ty, Lo, RHS), Hi);
+ MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, RHSClamped});
+ } else {
+ // uadd.sat(a, b) -> a + umin(~a, b)
+ // usub.sat(a, b) -> a - umin(a, b)
+ Register Not = IsAdd ? MIRBuilder.buildNot(Ty, LHS).getReg(0) : LHS;
+ auto Min = MIRBuilder.buildUMin(Ty, Not, RHS);
+ MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, Min});
+ }
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerAddSubSatToAddoSubo(MachineInstr &MI) {
+ Register Res = MI.getOperand(0).getReg();
+ Register LHS = MI.getOperand(1).getReg();
+ Register RHS = MI.getOperand(2).getReg();
+ LLT Ty = MRI.getType(Res);
+ LLT BoolTy = Ty.changeElementSize(1);
+ bool IsSigned;
+ bool IsAdd;
+ unsigned OverflowOp;
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("unexpected addsat/subsat opcode");
+ case TargetOpcode::G_UADDSAT:
+ IsSigned = false;
+ IsAdd = true;
+ OverflowOp = TargetOpcode::G_UADDO;
+ break;
+ case TargetOpcode::G_SADDSAT:
+ IsSigned = true;
+ IsAdd = true;
+ OverflowOp = TargetOpcode::G_SADDO;
+ break;
+ case TargetOpcode::G_USUBSAT:
+ IsSigned = false;
+ IsAdd = false;
+ OverflowOp = TargetOpcode::G_USUBO;
+ break;
+ case TargetOpcode::G_SSUBSAT:
+ IsSigned = true;
+ IsAdd = false;
+ OverflowOp = TargetOpcode::G_SSUBO;
+ break;
+ }
+
+ auto OverflowRes =
+ MIRBuilder.buildInstr(OverflowOp, {Ty, BoolTy}, {LHS, RHS});
+ Register Tmp = OverflowRes.getReg(0);
+ Register Ov = OverflowRes.getReg(1);
+ MachineInstrBuilder Clamp;
+ if (IsSigned) {
+ // sadd.sat(a, b) ->
+ // {tmp, ov} = saddo(a, b)
+ // ov ? (tmp >>s 31) + 0x80000000 : r
+ // ssub.sat(a, b) ->
+ // {tmp, ov} = ssubo(a, b)
+ // ov ? (tmp >>s 31) + 0x80000000 : r
+ uint64_t NumBits = Ty.getScalarSizeInBits();
+ auto ShiftAmount = MIRBuilder.buildConstant(Ty, NumBits - 1);
+ auto Sign = MIRBuilder.buildAShr(Ty, Tmp, ShiftAmount);
+ auto MinVal =
+ MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
+ Clamp = MIRBuilder.buildAdd(Ty, Sign, MinVal);
+ } else {
+ // uadd.sat(a, b) ->
+ // {tmp, ov} = uaddo(a, b)
+ // ov ? 0xffffffff : tmp
+ // usub.sat(a, b) ->
+ // {tmp, ov} = usubo(a, b)
+ // ov ? 0 : tmp
+ Clamp = MIRBuilder.buildConstant(Ty, IsAdd ? -1 : 0);
+ }
+ MIRBuilder.buildSelect(Res, Ov, Clamp, Tmp);
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
LegalizerHelper::lowerBswap(MachineInstr &MI) {
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();