mlir/include/mlir/Dialect/Quant/UniformSupport.h - llvm-project - Git at Google

 //===- UniformSupport.h - Support utilities for uniform quant ---*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//

 #ifndef MLIR_DIALECT_QUANT_UNIFORMSUPPORT_H_
 #define MLIR_DIALECT_QUANT_UNIFORMSUPPORT_H_

 #include "mlir/Dialect/Quant/QuantTypes.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Types.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/APSInt.h"
 #include <cmath>

 namespace mlir {
 namespace quant {

 /// Converts an arbitrary input type into a type that is expressed by a
 /// QuantizedType.
 ///
 /// The inputType may be a supported primitive type (e.g. f32, bf16, etc) or a
 /// vector/tensor type that is based on a supported elemental type.
 ///
 /// Conversion is a two step process, because deciding how to represent the
 /// input type often requires introspecting some of its attributes first.
 struct ExpressedToQuantizedConverter {
   /// Builds the converter that corresponds to `inputType`.
   static const ExpressedToQuantizedConverter forInputType(Type inputType);

   /// Rebases inputType onto `elementalType` and returns the resulting type
   /// (or nullptr, after emitting an error, on failure).
   Type convert(QuantizedType elementalType) const;

   /// True when the conversion is legal, i.e. when expressedType is non-null.
   explicit operator bool() const { return static_cast<bool>(expressedType); }

   /// The type being converted from; either an elemental or a composite type.
   const Type inputType;

   /// The supported elemental expressed type (e.g. f32).
   /// nullptr when the conversion is not supported.
   const Type expressedType;
 };

 /// Reference implementation of converting between real numbers and values
 /// represented by a UniformQuantizedType.
 ///
 /// The generic conversion computes
 ///   clamp(round(expressed / scale) + zeroPoint, clampMin, clampMax)
 /// where round() is round-to-nearest with ties away from zero.
 ///
 /// Note that this is not expected to be speedy and may be superseded eventually
 /// by a more optimal implementation.
 /// Also, the interface assumes that quantization is done per-layer and will
 /// need to be wider for various per-channel schemes. As such, this is a
 /// placeholder.
 class UniformQuantizedValueConverter {
 public:
   /// Builds a converter from the scale, zero point, storage range, storage
   /// width and signedness of `uniformType`. The expressed type is asserted to
   /// be a float type and the storage type a signless integer.
   explicit UniformQuantizedValueConverter(UniformQuantizedType uniformType)
       : UniformQuantizedValueConverter(
             uniformType.getScale(),
             static_cast<double>(uniformType.getZeroPoint()),
             static_cast<double>(uniformType.getStorageTypeMin()),
             static_cast<double>(uniformType.getStorageTypeMax()),
             uniformType.getStorageTypeIntegralWidth(), uniformType.isSigned()) {
     assert(uniformType.getExpressedType().isa<FloatType>());
     assert(uniformType.getStorageType().isSignlessInteger());
   }

   /// Builds a converter from explicit quantization parameters, with the clamp
   /// range given as doubles.
   UniformQuantizedValueConverter(double scale, double zeroPoint,
                                  double clampMin, double clampMax,
                                  uint32_t storageBitWidth, bool isSigned)
       : scale(scale), zeroPoint(zeroPoint), clampMin(clampMin),
         clampMax(clampMax), scaleDouble(scale), zeroPointDouble(zeroPoint),
         clampMinDouble(clampMin), clampMaxDouble(clampMax),
         storageBitWidth(storageBitWidth), isSigned(isSigned),
         roundMode(APFloat::rmNearestTiesToAway) {}

   /// Builds a converter from explicit quantization parameters, with the clamp
   /// range given as APFloats. The double copies of the clamp range are
   /// obtained with APFloat::convertToDouble().
   UniformQuantizedValueConverter(double scale, double zeroPoint,
                                  APFloat clampMin, APFloat clampMax,
                                  uint32_t storageBitWidth, bool isSigned)
       : scale(scale), zeroPoint(zeroPoint), clampMin(clampMin),
         clampMax(clampMax), scaleDouble(scale), zeroPointDouble(zeroPoint),
         clampMinDouble(clampMin.convertToDouble()),
         clampMaxDouble(clampMax.convertToDouble()),
         storageBitWidth(storageBitWidth), isSigned(isSigned),
         roundMode(APFloat::rmNearestTiesToAway) {}

   /// Quantizes `expressedValue` and returns the result as an APInt that is
   /// `storageBitWidth` bits wide.
   virtual APInt quantizeFloatToInt(APFloat expressedValue) const {
     // This function is a performance critical code path in quantization
     // since it runs for each single float parameter value.

     // Specialize f32->u8/i8 case to optimize performance.
     if (&expressedValue.getSemantics() == &APFloat::IEEEsingle() &&
         storageBitWidth == 8 &&
         roundMode == llvm::APFloatBase::rmNearestTiesToAway) {
       return quantizeF32ToInt8(expressedValue);
     }

     // Bring the input to the semantics of `scale` so they can be divided.
     bool lossy = false;
     expressedValue.convert(scale.getSemantics(), roundMode, &lossy);
     // fixedpoint = clamp(clampMin, clampMax, (
     //   round(expressed / scale) + zeroPoint))
     // where round() follows `roundMode` (nearest, ties away from zero).
     APFloat scaled = (expressedValue / scale);
     scaled.roundToIntegral(roundMode);
     scaled.add(zeroPoint, roundMode);
     APFloat fixedpoint = llvm::minimum(scaled, clampMax);
     fixedpoint = llvm::maximum(fixedpoint, clampMin);

     llvm::APSInt result(storageBitWidth, !isSigned);
     fixedpoint.convertToInteger(result, roundMode, &lossy);

     // `result` is an APSInt; move it into the returned APInt.
     return std::move(result);
   }

   /// Quantizes `expressedValue` and widens the result to int64_t, sign- or
   /// zero-extending according to `isSigned`.
   int64_t quantizeFloatToInt64(APFloat expressedValue) const {
     APInt qValue = quantizeFloatToInt(expressedValue);
     return isSigned ? qValue.getSExtValue() : qValue.getZExtValue();
   }

   virtual ~UniformQuantizedValueConverter() = default;

 private:
   // An optimized implementation to quantize f32 to i8/u8 with C++ native
   // arithmetic.
   virtual APInt quantizeF32ToInt8(APFloat expressedValue) const {
     assert(&expressedValue.getSemantics() == &APFloat::IEEEsingle());
     assert(storageBitWidth == 8);
     assert(roundMode == llvm::APFloatBase::rmNearestTiesToAway);

     const float realValue = expressedValue.convertToFloat();

     // NOTE(review): the zero point is added *before* rounding here, whereas
     // the generic path in quantizeFloatToInt rounds first and adds the zero
     // point afterwards. With ties-away-from-zero rounding the two can differ
     // on exact halfway values - confirm which one is intended.
     const double scaled = realValue / scaleDouble + zeroPointDouble;
     // Round to nearest integer with halfway cases rounded away from zero.
     const double scaledRounded = std::round(scaled);
     const double clamped =
         std::min(std::max(scaledRounded, clampMinDouble), clampMaxDouble);

     // A NaN passes through the min/max clamp above unchanged, and converting
     // NaN to an integer type is undefined behavior, so map it to zero.
     // NOTE(review): believed to match what APFloat::convertToInteger yields
     // for NaN on the generic path - confirm.
     if (std::isnan(clamped))
       return APInt(storageBitWidth, 0);

     // Hand APInt only the low 8 bits of the result. A negative value that is
     // sign-extended to 64 bits does not fit in 8 unsigned bits and relies on
     // APInt silently truncating it, which recent LLVM versions may assert on.
     const uint64_t signlessResult =
         isSigned ? static_cast<uint8_t>(static_cast<int8_t>(clamped))
                  : static_cast<uint8_t>(clamped);
     return APInt(storageBitWidth, signlessResult);
   }

   // Keep both APFloat and double versions of the quantization parameters
   // around since they will be used in generic and specialized arithmetic,
   // respectively.
   const APFloat scale;
   const APFloat zeroPoint;
   const APFloat clampMin;
   const APFloat clampMax;

   const double scaleDouble;
   const double zeroPointDouble;
   const double clampMinDouble;
   const double clampMaxDouble;

   const uint32_t storageBitWidth;
   const bool isSigned;
   const llvm::APFloat::roundingMode roundMode;
 };

 /// A utility class that quantizes an attribute using per-axis quantization
 /// parameters. The size of the quantization dim in the converted elements
 /// attribute should match the size of the scales/zeroPoints vectors in the
 /// quantization parameters.
 class UniformQuantizedPerAxisValueConverter {
 public:
   /// Captures the scales, zero points, storage range, storage width,
   /// signedness and quantized dimension of `uniformType`.
   explicit UniformQuantizedPerAxisValueConverter(
       UniformQuantizedPerAxisType uniformType)
       : scales(uniformType.getScales()),
         zeroPoints(uniformType.getZeroPoints()),
         clampMin(static_cast<double>(uniformType.getStorageTypeMin())),
         clampMax(static_cast<double>(uniformType.getStorageTypeMax())),
         storageBitWidth(uniformType.getStorageTypeIntegralWidth()),
         isSigned(uniformType.isSigned()),
         quantizationDim(uniformType.getQuantizedDimension()) {
     assert(uniformType.getExpressedType().isa<FloatType>());
     assert(uniformType.getStorageType().isSignlessInteger());
     assert(scales.size() == zeroPoints.size());
   }

   /// Quantizes `realValue` with the quantization parameters. Returns nullptr
   /// if the conversion fails or the input isn't an ElementsAttr.
   ElementsAttr convert(Attribute realValue);

 private:
   /// Quantizes a DenseFPElementsAttr with the quantization parameters.
   DenseElementsAttr convert(DenseFPElementsAttr attr);

   /// Returns the uniform converter for the index-th chunk along the
   /// quantizationDim. Every element in that chunk is quantized by the
   /// returned converter.
   UniformQuantizedValueConverter getPerChunkConverter(int index) const {
     return UniformQuantizedValueConverter(scales[index], zeroPoints[index],
                                           clampMin, clampMax, storageBitWidth,
                                           isSigned);
   }

   // Per-axis parameters; one scale and one zero point per chunk.
   const ArrayRef<double> scales;
   const ArrayRef<int64_t> zeroPoints;
   // Storage range shared by all chunks.
   const APFloat clampMin;
   const APFloat clampMax;
   const uint32_t storageBitWidth;
   const bool isSigned;
   int32_t quantizationDim;
 };

 } // namespace quant
 } // namespace mlir

 #endif // MLIR_DIALECT_QUANT_UNIFORMSUPPORT_H_
	//===- UniformSupport.h - Support utilities for uniform quant ---*- C++ -*-===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//

	#ifndef MLIR_DIALECT_QUANT_UNIFORMSUPPORT_H_
	#define MLIR_DIALECT_QUANT_UNIFORMSUPPORT_H_

	#include "mlir/Dialect/Quant/QuantTypes.h"
	#include "mlir/IR/BuiltinTypes.h"
	#include "mlir/IR/Types.h"
	#include "llvm/ADT/APFloat.h"
	#include "llvm/ADT/APInt.h"
	#include "llvm/ADT/APSInt.h"

	namespace mlir {
	namespace quant {

	/// Converts an arbitrary input type into a type that is expressed by a
	/// QuantizedType.
	///
	/// The inputType may be a supported primitive type (e.g. f32, bf16, etc) or a
	/// vector/tensor type that is based on a supported elemental type.
	///
	/// Conversion is a two step process, because deciding how to represent the
	/// input type often requires introspecting some of its attributes first.
	struct ExpressedToQuantizedConverter {
	  /// Builds the converter that corresponds to `inputType`.
	  static const ExpressedToQuantizedConverter forInputType(Type inputType);

	  /// Rebases inputType onto `elementalType` and returns the resulting type
	  /// (or nullptr, after emitting an error, on failure).
	  Type convert(QuantizedType elementalType) const;

	  /// True when the conversion is legal, i.e. when expressedType is non-null.
	  explicit operator bool() const { return static_cast<bool>(expressedType); }

	  /// The type being converted from; either an elemental or a composite type.
	  const Type inputType;

	  /// The supported elemental expressed type (e.g. f32).
	  /// nullptr when the conversion is not supported.
	  const Type expressedType;
	};

	/// Reference implementation of converting between real numbers and values
	/// represented by a UniformQuantizedType.
	///
	/// The generic conversion computes
	///   clamp(round(expressed / scale) + zeroPoint, clampMin, clampMax)
	/// where round() is round-to-nearest with ties away from zero.
	///
	/// Note that this is not expected to be speedy and may be superseded eventually
	/// by a more optimal implementation.
	/// Also, the interface assumes that quantization is done per-layer and will
	/// need to be wider for various per-channel schemes. As such, this is a
	/// placeholder.
	class UniformQuantizedValueConverter {
	public:
	  /// Builds a converter from the scale, zero point, storage range, storage
	  /// width and signedness of `uniformType`. The expressed type is asserted to
	  /// be a float type and the storage type a signless integer.
	  explicit UniformQuantizedValueConverter(UniformQuantizedType uniformType)
	      : UniformQuantizedValueConverter(
	            uniformType.getScale(),
	            static_cast<double>(uniformType.getZeroPoint()),
	            static_cast<double>(uniformType.getStorageTypeMin()),
	            static_cast<double>(uniformType.getStorageTypeMax()),
	            uniformType.getStorageTypeIntegralWidth(), uniformType.isSigned()) {
	    assert(uniformType.getExpressedType().isa<FloatType>());
	    assert(uniformType.getStorageType().isSignlessInteger());
	  }

	  /// Builds a converter from explicit quantization parameters, with the clamp
	  /// range given as doubles.
	  UniformQuantizedValueConverter(double scale, double zeroPoint,
	                                 double clampMin, double clampMax,
	                                 uint32_t storageBitWidth, bool isSigned)
	      : scale(scale), zeroPoint(zeroPoint), clampMin(clampMin),
	        clampMax(clampMax), scaleDouble(scale), zeroPointDouble(zeroPoint),
	        clampMinDouble(clampMin), clampMaxDouble(clampMax),
	        storageBitWidth(storageBitWidth), isSigned(isSigned),
	        roundMode(APFloat::rmNearestTiesToAway) {}

	  /// Builds a converter from explicit quantization parameters, with the clamp
	  /// range given as APFloats. The double copies of the clamp range are
	  /// obtained with APFloat::convertToDouble().
	  UniformQuantizedValueConverter(double scale, double zeroPoint,
	                                 APFloat clampMin, APFloat clampMax,
	                                 uint32_t storageBitWidth, bool isSigned)
	      : scale(scale), zeroPoint(zeroPoint), clampMin(clampMin),
	        clampMax(clampMax), scaleDouble(scale), zeroPointDouble(zeroPoint),
	        clampMinDouble(clampMin.convertToDouble()),
	        clampMaxDouble(clampMax.convertToDouble()),
	        storageBitWidth(storageBitWidth), isSigned(isSigned),
	        roundMode(APFloat::rmNearestTiesToAway) {}

	  /// Quantizes `expressedValue` and returns the result as an APInt that is
	  /// `storageBitWidth` bits wide.
	  virtual APInt quantizeFloatToInt(APFloat expressedValue) const {
	    // This function is a performance critical code path in quantization
	    // since it runs for each single float parameter value.

	    // Specialize f32->u8/i8 case to optimize performance.
	    if (&expressedValue.getSemantics() == &APFloat::IEEEsingle() &&
	        storageBitWidth == 8 &&
	        roundMode == llvm::APFloatBase::rmNearestTiesToAway) {
	      return quantizeF32ToInt8(expressedValue);
	    }

	    // Bring the input to the semantics of `scale` so they can be divided.
	    bool lossy = false;
	    expressedValue.convert(scale.getSemantics(), roundMode, &lossy);
	    // fixedpoint = clamp(clampMin, clampMax, (
	    //   round(expressed / scale) + zeroPoint))
	    // where round() follows `roundMode` (nearest, ties away from zero).
	    APFloat scaled = (expressedValue / scale);
	    scaled.roundToIntegral(roundMode);
	    scaled.add(zeroPoint, roundMode);
	    APFloat fixedpoint = llvm::minimum(scaled, clampMax);
	    fixedpoint = llvm::maximum(fixedpoint, clampMin);

	    llvm::APSInt result(storageBitWidth, !isSigned);
	    fixedpoint.convertToInteger(result, roundMode, &lossy);

	    // `result` is an APSInt; move it into the returned APInt.
	    return std::move(result);
	  }

	  /// Quantizes `expressedValue` and widens the result to int64_t, sign- or
	  /// zero-extending according to `isSigned`.
	  int64_t quantizeFloatToInt64(APFloat expressedValue) const {
	    APInt qValue = quantizeFloatToInt(expressedValue);
	    return isSigned ? qValue.getSExtValue() : qValue.getZExtValue();
	  }

	  virtual ~UniformQuantizedValueConverter() = default;

	private:
	  // An optimized implementation to quantize f32 to i8/u8 with C++ native
	  // arithmetic.
	  virtual APInt quantizeF32ToInt8(APFloat expressedValue) const {
	    assert(&expressedValue.getSemantics() == &APFloat::IEEEsingle());
	    assert(storageBitWidth == 8);
	    assert(roundMode == llvm::APFloatBase::rmNearestTiesToAway);

	    const float realValue = expressedValue.convertToFloat();

	    // NOTE(review): the zero point is added *before* rounding here, whereas
	    // the generic path in quantizeFloatToInt rounds first and adds the zero
	    // point afterwards. With ties-away-from-zero rounding the two can differ
	    // on exact halfway values - confirm which one is intended.
	    const double scaled = realValue / scaleDouble + zeroPointDouble;
	    // Round to nearest integer with halfway cases rounded away from zero.
	    const double scaledRounded = std::round(scaled);
	    const double clamped =
	        std::min(std::max(scaledRounded, clampMinDouble), clampMaxDouble);

	    // A NaN passes through the min/max clamp above unchanged, and converting
	    // NaN to an integer type is undefined behavior, so map it to zero.
	    // NOTE(review): believed to match what APFloat::convertToInteger yields
	    // for NaN on the generic path - confirm.
	    if (std::isnan(clamped))
	      return APInt(storageBitWidth, 0);

	    // Hand APInt only the low 8 bits of the result. A negative value that is
	    // sign-extended to 64 bits does not fit in 8 unsigned bits and relies on
	    // APInt silently truncating it, which recent LLVM versions may assert on.
	    const uint64_t signlessResult =
	        isSigned ? static_cast<uint8_t>(static_cast<int8_t>(clamped))
	                 : static_cast<uint8_t>(clamped);
	    return APInt(storageBitWidth, signlessResult);
	  }

	  // Keep both APFloat and double versions of the quantization parameters
	  // around since they will be used in generic and specialized arithmetic,
	  // respectively.
	  const APFloat scale;
	  const APFloat zeroPoint;
	  const APFloat clampMin;
	  const APFloat clampMax;

	  const double scaleDouble;
	  const double zeroPointDouble;
	  const double clampMinDouble;
	  const double clampMaxDouble;

	  const uint32_t storageBitWidth;
	  const bool isSigned;
	  const llvm::APFloat::roundingMode roundMode;
	};

	/// A utility class that quantizes an attribute using per-axis quantization
	/// parameters. The size of the quantization dim in the converted elements
	/// attribute should match the size of the scales/zeroPoints vectors in the
	/// quantization parameters.
	class UniformQuantizedPerAxisValueConverter {
	public:
	  /// Captures the scales, zero points, storage range, storage width,
	  /// signedness and quantized dimension of `uniformType`.
	  explicit UniformQuantizedPerAxisValueConverter(
	      UniformQuantizedPerAxisType uniformType)
	      : scales(uniformType.getScales()),
	        zeroPoints(uniformType.getZeroPoints()),
	        clampMin(static_cast<double>(uniformType.getStorageTypeMin())),
	        clampMax(static_cast<double>(uniformType.getStorageTypeMax())),
	        storageBitWidth(uniformType.getStorageTypeIntegralWidth()),
	        isSigned(uniformType.isSigned()),
	        quantizationDim(uniformType.getQuantizedDimension()) {
	    assert(uniformType.getExpressedType().isa<FloatType>());
	    assert(uniformType.getStorageType().isSignlessInteger());
	    assert(scales.size() == zeroPoints.size());
	  }

	  /// Quantizes `realValue` with the quantization parameters. Returns nullptr
	  /// if the conversion fails or the input isn't an ElementsAttr.
	  ElementsAttr convert(Attribute realValue);

	private:
	  /// Quantizes a DenseFPElementsAttr with the quantization parameters.
	  DenseElementsAttr convert(DenseFPElementsAttr attr);

	  /// Returns the uniform converter for the index-th chunk along the
	  /// quantizationDim. Every element in that chunk is quantized by the
	  /// returned converter.
	  UniformQuantizedValueConverter getPerChunkConverter(int index) const {
	    return UniformQuantizedValueConverter(scales[index], zeroPoints[index],
	                                          clampMin, clampMax, storageBitWidth,
	                                          isSigned);
	  }

	  // Per-axis parameters; one scale and one zero point per chunk.
	  const ArrayRef<double> scales;
	  const ArrayRef<int64_t> zeroPoints;
	  // Storage range shared by all chunks.
	  const APFloat clampMin;
	  const APFloat clampMax;
	  const uint32_t storageBitWidth;
	  const bool isSigned;
	  int32_t quantizationDim;
	};

	} // namespace quant
	} // namespace mlir

	#endif // MLIR_DIALECT_QUANT_UNIFORMSUPPORT_H_