/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// edited from tensorflow - quantization_utils.cc by MNN.

#ifndef QUANTIZATION_HPP
#define QUANTIZATION_HPP

#include <math.h>
#include <stdio.h>
#include <algorithm>   // std::min / std::max
#include <cmath>
#include <cstdint>     // fixed-width integer types
#include <limits>
#include <type_traits> // std::is_same
#include "TFQuantizeOp_generated.h"

namespace MNN {

inline int CalculateInputRadius(int inputIntegerBits, int inputLeftShift) {
    const double maxInputRescaled =
        1.0 * ((1 << inputIntegerBits) - 1) * (1ll << (31 - inputIntegerBits)) / (1ll << inputLeftShift);
    return static_cast<int>(std::floor(maxInputRescaled));
}

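// Illustrative usage of CalculateInputRadius (example added for reference): with
// 4 integer bits and no left shift, the radius is 15 * 2^27 == 2013265920.
//   int radius = CalculateInputRadius(4, 0);
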
inline void QuantizeMultiplier(double doubleMultiplier, int32_t* quantizedMultiplier, int* shift) {
    if (doubleMultiplier == 0.) {
        *quantizedMultiplier = 0;
        *shift = 0;
        return;
    }
    const double q = std::frexp(doubleMultiplier, shift);
    auto qFixed = static_cast<int64_t>(round(q * (1ll << 31)));
    MNN_ASSERT(qFixed <= (1ll << 31));
    if (qFixed == (1ll << 31)) {
        qFixed /= 2;
        ++*shift;
    }
    MNN_ASSERT(qFixed <= std::numeric_limits<int32_t>::max());
    *quantizedMultiplier = static_cast<int32_t>(qFixed);
}

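// Illustrative usage of QuantizeMultiplier (example added for reference): the
// multiplier is returned as a Q0.31 fixed-point value plus a power-of-two shift.
//   int32_t m;
//   int s;
//   QuantizeMultiplier(0.25, &m, &s); // m == 1 << 30, s == -1 (0.25 == 0.5  * 2^-1)
//   QuantizeMultiplier(3.0, &m, &s);  // m == 3 << 29, s ==  2 (3.0  == 0.75 * 2^2)
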
inline void QuantizeMultiplierGreaterThanOne(double doubleMultiplier, int32_t* quantizedMultiplier, int* leftShift) {
    MNN_ASSERT(doubleMultiplier > 1.);
    QuantizeMultiplier(doubleMultiplier, quantizedMultiplier, leftShift);
    MNN_ASSERT(*leftShift >= 0);
}

inline void PreprocessSoftmaxScaling(double beta, double inputScale, int inputIntegerBits, int32_t* quantizedMultiplier,
                                     int* leftShift) {
    const double inputBetaRealMultiplier =
        std::min(beta * inputScale * (1 << (31 - inputIntegerBits)), (1ll << 31) - 1.0);

    QuantizeMultiplierGreaterThanOne(inputBetaRealMultiplier, quantizedMultiplier, leftShift);
}

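// Illustrative usage of PreprocessSoftmaxScaling (example added, values chosen to
// be round numbers): with beta == 1.0, inputScale == 1/16 and 5 integer bits, the
// real multiplier is 2^22, which quantizes to (1 << 30) with a left shift of 23.
//   int32_t m;
//   int leftShift;
//   PreprocessSoftmaxScaling(1.0, 0.0625, 5, &m, &leftShift); // m == 1 << 30, leftShift == 23
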
inline void CalculateActivationRangeUint8(FusedActivation activation, int outputZeropoint, float inputScale,
                                          int32_t* actMin, int32_t* actMax) {
    const int32_t qmin = std::numeric_limits<uint8_t>::min();
    const int32_t qmax = std::numeric_limits<uint8_t>::max();

    const auto scale = inputScale;
    const auto zeroPoint = outputZeropoint;

    auto quantize = [scale, zeroPoint](float f) { return zeroPoint + static_cast<int32_t>(round(f / scale)); };

    if (activation == FusedActivation_kTfLiteActRelu) {
        *actMin = std::max(qmin, quantize(0.0));
        *actMax = qmax;
    } else if (activation == FusedActivation_kTfLiteActRelu6) {
        *actMin = std::max(qmin, quantize(0.0));
        *actMax = std::min(qmax, quantize(6.0));
    } else if (activation == FusedActivation_kTfLiteActRelu1) {
        *actMin = std::max(qmin, quantize(-1.0));
        *actMax = std::min(qmax, quantize(1.0));
    } else {
        *actMin = qmin;
        *actMax = qmax;
    }
}

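// Illustrative usage of CalculateActivationRangeUint8 (example added for
// reference): with a zero point of 0 and a scale of 0.05, ReLU6 clamps the
// quantized range to [0, 120], since 6.0 / 0.05 == 120.
//   int32_t actMin, actMax;
//   CalculateActivationRangeUint8(FusedActivation_kTfLiteActRelu6, 0, 0.05f, &actMin, &actMax);
//   // actMin == 0, actMax == 120
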
inline void QuantizeMultiplierSmallerThanOne(double doubleMultiplier, int32_t* quantizedMultiplier, int* rightShift) {
    MNN_ASSERT(doubleMultiplier < 1.);
    MNN_ASSERT(doubleMultiplier > 0.);
    int shift;
    QuantizeMultiplier(doubleMultiplier, quantizedMultiplier, &shift);
    MNN_ASSERT(shift <= 0);
    *rightShift = -shift;
}

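// Illustrative usage of QuantizeMultiplierSmallerThanOne (example added):
// multipliers in (0, 1) come back as a Q0.31 value plus a right shift.
//   int32_t m;
//   int rightShift;
//   QuantizeMultiplierSmallerThanOne(0.125, &m, &rightShift); // m == 1 << 30, rightShift == 2
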
template <class T>
float FloatForOneQuantizedLevel(float rangeMin, float rangeMax) {
    const int64_t highest = static_cast<int64_t>(std::numeric_limits<T>::max());
    const int64_t lowest  = static_cast<int64_t>(std::numeric_limits<T>::min());
    const float floatForOneQuantizedLevel = (rangeMax - rangeMin) / (highest - lowest);
    return floatForOneQuantizedLevel;
}

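// Illustrative usage of FloatForOneQuantizedLevel (example added): one uint8 step
// over the range [0, 6] is 6 / 255, roughly 0.0235.
//   float step = FloatForOneQuantizedLevel<uint8_t>(0.0f, 6.0f);
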
template <class T1, class T2, class T3>
void QuantizationRangeForMultiplication(float minA, float maxA, float minB, float maxB, float* minC, float* maxC) {
    const float aFloatForOneQuantLevel = FloatForOneQuantizedLevel<T1>(minA, maxA);
    const float bFloatForOneQuantLevel = FloatForOneQuantizedLevel<T2>(minB, maxB);

    const int64_t cHighest = static_cast<int64_t>(std::numeric_limits<T3>::max());
    const int64_t cLowest  = static_cast<int64_t>(std::numeric_limits<T3>::min());
    const float cFloatForOneQuantLevel = aFloatForOneQuantLevel * bFloatForOneQuantLevel;

    *minC = cFloatForOneQuantLevel * cLowest;
    *maxC = cFloatForOneQuantLevel * cHighest;
}

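// Illustrative usage of QuantizationRangeForMultiplication (example added):
// multiplying two uint8 tensors quantized over [0, 1] into an int32 accumulator.
// One output step is (1/255) * (1/255), so the int32 limits map to roughly
// +/-33025.6.
//   float minC, maxC;
//   QuantizationRangeForMultiplication<uint8_t, uint8_t, int32_t>(0.0f, 1.0f, 0.0f, 1.0f, &minC, &maxC);
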
template <class T>
int64_t FloatToQuantizedUnclamped(float input, float rangeMin, float rangeMax) {
    const int64_t lowestQuantized = static_cast<int64_t>(std::numeric_limits<T>::min());
    if (rangeMin == rangeMax) {
        return lowestQuantized;
    }
    const int numberOfBits = sizeof(T) * 8;
    const int64_t numberOfSteps = static_cast<int64_t>(1) << numberOfBits;
    const double rangeAdjust = (numberOfSteps / (numberOfSteps - 1.0));
    const double range = ((rangeMax - rangeMin) * rangeAdjust);
    const double rangeScale = (numberOfSteps / range);
    int64_t quantized = (round(input * rangeScale) - round(rangeMin * rangeScale));
    quantized += lowestQuantized;
    return quantized;
}

template <class T>
int64_t FloatToQuantizedUnclampedOpt(float input, float rangeMin, float rangeMax) {
    const double rangeScale = (((static_cast<int64_t>(1) << 32) - 1.0) / (rangeMax - rangeMin));
    int64_t quantized = (round(input * rangeScale) - round(rangeMin * rangeScale));
    quantized += -(static_cast<int64_t>(1) << 31);
    return quantized;
}

template <class T>
T FloatToQuantized(float input, float rangeMin, float rangeMax) {
    if (std::is_same<T, float>::value) {
        // Specialization for float. This is used in the reference implementation
        // for float, which is useful for comparing performance between the float
        // and quantized paths.
        return input;
    }
    int64_t quantized = FloatToQuantizedUnclamped<T>(input, rangeMin, rangeMax);
    const int64_t lowestQuantized  = static_cast<int64_t>(std::numeric_limits<T>::min());
    const int64_t highestQuantized = static_cast<int64_t>(std::numeric_limits<T>::max());
    quantized = std::max(quantized, lowestQuantized);
    quantized = std::min(quantized, highestQuantized);
    return static_cast<T>(static_cast<int32_t>(quantized));
}

template <class T>
float QuantizedToFloat(T input, float rangeMin, float rangeMax) {
    if (std::is_same<T, float>::value) {
        // Specialization for float. This is used in the reference implementation
        // for float, which is useful for comparing performance between the float
        // and quantized paths.
        return input;
    }
    if (rangeMin == rangeMax) {
        return rangeMin;
    }
    const int numberOfBits = sizeof(T) * 8;
    const int64_t numberOfSteps = static_cast<int64_t>(1) << numberOfBits;
    const double rangeAdjust = (numberOfSteps / (numberOfSteps - 1.0));
    const double range = ((rangeMax - rangeMin) * rangeAdjust);
    const double rangeScale = (range / numberOfSteps);
    const int64_t lowestQuantized = static_cast<int64_t>(std::numeric_limits<T>::min());
    const double offsetInput = static_cast<double>(input) - lowestQuantized;
    // For compatibility with DEQUANTIZE_WITH_EIGEN, we should convert
    // rangeScale to a float, otherwise rangeMinRounded might be slightly
    // different.
    const double rangeMinRounded = round(rangeMin / static_cast<float>(rangeScale)) * static_cast<float>(rangeScale);
    const double result = rangeMinRounded + (offsetInput * rangeScale);
    return static_cast<float>(result);
}

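// Illustrative round trip through FloatToQuantized / QuantizedToFloat (example
// added for reference): quantize 0.4 into the uint8 range [0, 1] and convert it
// back; the reconstruction comes back within one quantization step (1/255).
//   uint8_t q = FloatToQuantized<uint8_t>(0.4f, 0.0f, 1.0f); // q == 102
//   float back = QuantizedToFloat<uint8_t>(q, 0.0f, 1.0f);   // back ~= 0.4 (102 / 255)
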
template <class T>
float QuantizedToFloatOpt(T input, float rangeMin, float rangeMax) {
    if (std::is_same<T, float>::value) {
        // Specialization for float. This is used in the reference implementation
        // for float, which is useful for comparing performance between the float
        // and quantized paths.
        return input;
    }
    if (rangeMin == rangeMax) {
        return rangeMin;
    }
    const int numberOfBits = sizeof(int32_t) * 8;
    const int64_t numberOfSteps = static_cast<int64_t>(1) << numberOfBits;
    const int64_t lowestQuantized = static_cast<int64_t>(1) << (numberOfBits - 1);
    const double rangeScale = ((rangeMax - rangeMin) / (numberOfSteps - 1.0));
    const double result = rangeMin + ((input + lowestQuantized) * rangeScale);
    return static_cast<float>(result);
}

template <class T1, class T2>
inline T2 RequantizeInNewRange(T1 input, float minInput, float maxInput, float minNew, float maxNew) {
    const float inputFloat = QuantizedToFloat<T1>(input, minInput, maxInput);
    T2 result = FloatToQuantized<T2>(inputFloat, minNew, maxNew);
    return result;
}

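// Illustrative usage of RequantizeInNewRange (example added): mapping a code from
// the range [0, 1] into the wider range [0, 2] roughly halves it, since the new
// range covers twice the span.
//   uint8_t r = RequantizeInNewRange<uint8_t, uint8_t>(200, 0.0f, 1.0f, 0.0f, 2.0f); // r == 100
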
// Because converting 32-bit accumulated results down to eight bit is a common
// case, we have a specialized code path to handle it as efficiently as
// possible using only fixed-point math for the inner loop.
inline void RequantizeManyInNewRangeReference(const int32_t* input, int64_t count, float minInput, float maxInput,
                                              float minOutput, float maxOutput, uint8_t* output) {
    // Initially we calculate all the constants we need once, before we go into
    // the inner loop. If this is updated, also update the Eigen version.
    const int fpShift = 16;
    const float inputRange = maxInput - minInput;
    const float outputRange = maxOutput - minOutput;
    const float recipOutputRange = outputRange == 0.0 ? 0.0 : (255.0 / outputRange);
    const float inputRezero = (minInput + maxInput) / 2.0;
    const int64_t rangeScaleFp =
        outputRange == 0.0 ? 0.0 : static_cast<int64_t>(255.0 * (1 << fpShift) * inputRange / outputRange);
    const int64_t inputOffsetFp = static_cast<int64_t>(inputRezero * recipOutputRange * (1 << fpShift));
    const int64_t outputOffsetFp =
        outputRange == 0.0 ? 0 : static_cast<int64_t>((1 << fpShift) * (minOutput * 255.0) / outputRange);
    const int64_t roundingDelta = 1 << (fpShift - 1);

    // Inside this loop we just do minimal adds, multiplies, and shifts, in a way
    // that could be easily adapted for a SIMD implementation. It should also be
    // possible to perform all the calculations in 32-bit rather than 64, but
    // that's not been implemented yet.
    for (int64_t index = 0; index < count; ++index) {
        const int64_t inputValue = static_cast<int64_t>(input[index]);
        const int64_t fpValue = ((inputValue * rangeScaleFp) >> 32) + inputOffsetFp;
        const int64_t offsetIntermediate = fpValue - outputOffsetFp;
        const int64_t roundIntermediate = offsetIntermediate + roundingDelta;
        int64_t quantizedInt64 = roundIntermediate >> fpShift;
        quantizedInt64 = std::max(quantizedInt64, int64_t(0));
        quantizedInt64 = std::min(quantizedInt64, int64_t(255));
        output[index] = static_cast<uint8_t>(static_cast<int32_t>(quantizedInt64));
    }
}

// Another common case is converting eight bit inputs up to thirty two bits, so
// we have specialized fixed-point code to accelerate that. (The upstream
// TensorFlow source also provides a NEON version for ARM devices; it is not
// part of this file.)
inline void RequantizeManyInNewRange8To32BitReference(const uint8_t* input, int64_t count, float minInput,
                                                      float maxInput, float minOutput, float maxOutput,
                                                      int32_t* output) {
    const float code0Float = QuantizedToFloat<uint8_t>(0, minInput, maxInput);
    const float code1Float = QuantizedToFloat<uint8_t>(1, minInput, maxInput);
    const int64_t code0Int64 = FloatToQuantizedUnclamped<int32_t>(code0Float, minOutput, maxOutput);
    const int64_t code1Int64 = FloatToQuantizedUnclamped<int32_t>(code1Float, minOutput, maxOutput);
    const int32_t multInt32 = static_cast<int32_t>(code1Int64 - code0Int64);
    const int64_t lowestQuantized  = static_cast<int64_t>(std::numeric_limits<int32_t>::min());
    const int64_t highestQuantized = static_cast<int64_t>(std::numeric_limits<int32_t>::max());
    for (int64_t i = 0; i < count; ++i) {
        const int64_t inputValue = static_cast<int64_t>(input[i]);
        int64_t outputValue = code0Int64 + (inputValue * multInt32);
        outputValue = std::max(outputValue, lowestQuantized);
        outputValue = std::min(outputValue, highestQuantized);
        output[i] = static_cast<int32_t>(outputValue);
    }
}

template <class T1, class T2>
inline void RequantizeManyInNewRange(const T1* input, int64_t count, float minInput, float maxInput, float minOutput,
                                     float maxOutput, T2* output) {
    for (int64_t index = 0; index < count; ++index) {
        const float inputFloat = QuantizedToFloat<T1>(input[index], minInput, maxInput);
        output[index] = FloatToQuantized<T2>(inputFloat, minOutput, maxOutput);
    }
}

// template <>
// inline void RequantizeManyInNewRange<int32_t, uint8_t>(
//     const int32_t* input, int64_t count, float minInput, float maxInput,
//     float minOutput, float maxOutput, uint8_t* output) {
//     RequantizeManyInNewRangeReference(input, count, minInput, maxInput,
//                                       minOutput, maxOutput, output);
// }

template <>
inline void RequantizeManyInNewRange<uint8_t, int32_t>(const uint8_t* input, int64_t count, float minInput,
                                                       float maxInput, float minOutput, float maxOutput,
                                                       int32_t* output) {
    RequantizeManyInNewRange8To32BitReference(input, count, minInput, maxInput, minOutput, maxOutput, output);
}

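// Computes the smallest and largest quantized values that actually occur in the
// given tensor, reading its contents as int32 codes.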
inline void CalculateUsedRange(Tensor* input, int32_t* usedMinQuantized, int32_t* usedMaxQuantized) {
    int inputDataSize = 1;
    for (int i = 0; i < input->buffer().dimensions; i++) {
        inputDataSize *= input->buffer().dim[i].extent;
    }
    int32_t* inputData = (int32_t*)input->buffer().host;

    usedMinQuantized[0] = inputData[0];
    usedMaxQuantized[0] = inputData[0];
    for (int i = 0; i < inputDataSize; i++) {
        if (inputData[i] < usedMinQuantized[0]) {
            usedMinQuantized[0] = inputData[i];
        }
        if (inputData[i] > usedMaxQuantized[0]) {
            usedMaxQuantized[0] = inputData[i];
        }
    }
}

inline void GetOutputMinAndMaxForQuantizedAdd(float inputMin, float inputMax, float smallerInputMin,
                                              float smallerInputMax, float* outputMin, float* outputMax) {
    // We need to have a good range to add our two arguments together in. This
    // is surprisingly tricky, since it has to satisfy a few different needs:
    // - Must be symmetrical around zero, so that 0 + 0 = 0.
    // - Must hold the largest of the argument ranges.
    // - Should have enough range that the bits of the lowest and highest
    //   arguments overlap if possible without the lower getting truncated.
    // - Should have some headroom so that there's no overflow.
    // - Needs to be signed.
    // This leads us to use a scheme where we (assuming the inputs are eight bit
    // and the output is 32-bit) use the bottom 32 - 17 = 15 bits to store the
    // accumulated results. This gives us all the properties we need.
    *outputMax = std::max(inputMax, std::max(-inputMin, std::max(smallerInputMax, -smallerInputMin))) * (1 << 17);
    *outputMin = -(*outputMax);
}

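// Illustrative usage of GetOutputMinAndMaxForQuantizedAdd (example added): adding
// a tensor quantized over [-1, 1] to one quantized over [-0.5, 0.5] yields the
// symmetric output range +/-(1 << 17), i.e. +/-131072.
//   float outMin, outMax;
//   GetOutputMinAndMaxForQuantizedAdd(-1.0f, 1.0f, -0.5f, 0.5f, &outMin, &outMax);
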
} // namespace MNN

#endif /* QUANTIZATION_HPP */