mirror of https://github.com/alibaba/MNN.git
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// edited from tensorflow - quantization_utils.cc by MNN.

#ifndef QUANTIZATION_HPP
#define QUANTIZATION_HPP

#include <math.h>
#include <stdio.h>
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>
#include <type_traits>
#include "TFQuantizeOp_generated.h"

namespace MNN {

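// Computes floor(((2^inputIntegerBits) - 1) * 2^(31 - inputIntegerBits) / 2^inputLeftShift):
// the largest input magnitude that still fits in the 31-bit fixed-point
// intermediate once it has been shifted left by inputLeftShift (e.g. when
// preparing inputs for a quantized softmax).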
inline int CalculateInputRadius(int inputIntegerBits, int inputLeftShift) {
    const double maxInputRescaled =
        1.0 * ((1 << inputIntegerBits) - 1) * (1ll << (31 - inputIntegerBits)) / (1ll << inputLeftShift);
    return static_cast<int>(std::floor(maxInputRescaled));
}

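// Decomposes a real-valued multiplier into a 31-bit fixed-point multiplier and a
// power-of-two exponent, so that doubleMultiplier ~= quantizedMultiplier * 2^(shift - 31).
// Example: 0.75 yields quantizedMultiplier = round(0.75 * 2^31) = 1610612736 with
// shift = 0; 1.5 yields the same quantizedMultiplier with shift = 1.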
inline void QuantizeMultiplier(double doubleMultiplier, int32_t* quantizedMultiplier, int* shift) {
    if (doubleMultiplier == 0.) {
        *quantizedMultiplier = 0;
        *shift               = 0;
        return;
    }
    const double q = std::frexp(doubleMultiplier, shift);
    auto qFixed    = static_cast<int64_t>(round(q * (1ll << 31)));
    MNN_ASSERT(qFixed <= (1ll << 31));
    if (qFixed == (1ll << 31)) {
        qFixed /= 2;
        ++*shift;
    }
    MNN_ASSERT(qFixed <= std::numeric_limits<int32_t>::max());
    *quantizedMultiplier = static_cast<int32_t>(qFixed);
}

inline void QuantizeMultiplierGreaterThanOne(double doubleMultiplier, int32_t* quantizedMultiplier, int* leftShift) {
    MNN_ASSERT(doubleMultiplier > 1.);
    QuantizeMultiplier(doubleMultiplier, quantizedMultiplier, leftShift);
    MNN_ASSERT(*leftShift >= 0);
}

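// Folds the softmax beta into the input scale and converts the product into the
// fixed-point (multiplier, left shift) pair expected by a quantized softmax
// kernel: the value is scaled by 2^(31 - inputIntegerBits), capped at 2^31 - 1,
// and then passed through QuantizeMultiplierGreaterThanOne.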
inline void PreprocessSoftmaxScaling(double beta, double inputScale, int inputIntegerBits, int32_t* quantizedMultiplier,
                                     int* leftShift) {
    const double inputBetaRealMultiplier =
        std::min(beta * inputScale * (1 << (31 - inputIntegerBits)), (1ll << 31) - 1.0);

    QuantizeMultiplierGreaterThanOne(inputBetaRealMultiplier, quantizedMultiplier, leftShift);
}

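// Translates a fused activation (none / ReLU / ReLU6 / ReLU1) into clamping
// bounds in the uint8 quantized domain: the float limits 0, 6, -1 and 1 are
// quantized with the given scale and zero point, then clipped to [0, 255].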
inline void CalculateActivationRangeUint8(FusedActivation activation, int outputZeropoint, float inputScale,
                                          int32_t* actMin, int32_t* actMax) {
    const int32_t qmin = std::numeric_limits<uint8_t>::min();
    const int32_t qmax = std::numeric_limits<uint8_t>::max();

    const auto scale     = inputScale;
    const auto zeroPoint = outputZeropoint;

    auto quantize = [scale, zeroPoint](float f) { return zeroPoint + static_cast<int32_t>(round(f / scale)); };

    if (activation == FusedActivation_kTfLiteActRelu) {
        *actMin = std::max(qmin, quantize(0.0));
        *actMax = qmax;
    } else if (activation == FusedActivation_kTfLiteActRelu6) {
        *actMin = std::max(qmin, quantize(0.0));
        *actMax = std::min(qmax, quantize(6.0));
    } else if (activation == FusedActivation_kTfLiteActRelu1) {
        *actMin = std::max(qmin, quantize(-1.0));
        *actMax = std::min(qmax, quantize(1.0));
    } else {
        *actMin = qmin;
        *actMax = qmax;
    }
}

inline void QuantizeMultiplierSmallerThanOne(double doubleMultiplier, int32_t* quantizedMultiplier, int* rightShift) {
    MNN_ASSERT(doubleMultiplier < 1.);
    MNN_ASSERT(doubleMultiplier > 0.);
    int shift;
    QuantizeMultiplier(doubleMultiplier, quantizedMultiplier, &shift);
    MNN_ASSERT(shift <= 0);
    *rightShift = -shift;
}

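// Size of one quantization step for type T over [rangeMin, rangeMax], i.e. the
// float increment between consecutive quantized codes. For example, for
// uint8_t over [-1.0f, 1.0f] this is 2.0f / 255 ~= 0.00784.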
template <class T>
float FloatForOneQuantizedLevel(float rangeMin, float rangeMax) {
    const int64_t highest                 = static_cast<int64_t>(std::numeric_limits<T>::max());
    const int64_t lowest                  = static_cast<int64_t>(std::numeric_limits<T>::min());
    const float floatForOneQuantizedLevel = (rangeMax - rangeMin) / (highest - lowest);
    return floatForOneQuantizedLevel;
}

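// Chooses the output range for a product C = A * B so that one quantized level
// of C (in type T3) equals the product of one level of A (T1) and one level of
// B (T2); the resulting range is [level * T3::min, level * T3::max].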
template <class T1, class T2, class T3>
void QuantizationRangeForMultiplication(float minA, float maxA, float minB, float maxB, float* minC, float* maxC) {
    const float aFloatForOneQuantLevel = FloatForOneQuantizedLevel<T1>(minA, maxA);
    const float bFloatForOneQuantLevel = FloatForOneQuantizedLevel<T2>(minB, maxB);

    const int64_t cHighest             = static_cast<int64_t>(std::numeric_limits<T3>::max());
    const int64_t cLowest              = static_cast<int64_t>(std::numeric_limits<T3>::min());
    const float cFloatForOneQuantLevel = aFloatForOneQuantLevel * bFloatForOneQuantLevel;

    *minC = cFloatForOneQuantLevel * cLowest;
    *maxC = cFloatForOneQuantLevel * cHighest;
}

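// Maps a float to its quantized code for type T using the nudged-range affine
// mapping inherited from TensorFlow, without clamping to T's limits; the result
// may fall outside [T::min, T::max] when the input lies outside [rangeMin, rangeMax].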
template <class T>
int64_t FloatToQuantizedUnclamped(float input, float rangeMin, float rangeMax) {
    const int64_t lowestQuantized = static_cast<int64_t>(std::numeric_limits<T>::min());
    if (rangeMin == rangeMax) {
        return lowestQuantized;
    }
    const int numberOfBits      = sizeof(T) * 8;
    const int64_t numberOfSteps = static_cast<int64_t>(1) << numberOfBits;
    const double rangeAdjust    = (numberOfSteps / (numberOfSteps - 1.0));
    const double range          = ((rangeMax - rangeMin) * rangeAdjust);
    const double rangeScale     = (numberOfSteps / range);
    int64_t quantized           = (round(input * rangeScale) - round(rangeMin * rangeScale));
    quantized += lowestQuantized;
    return quantized;
}

template <class T>
int64_t FloatToQuantizedUnclampedOpt(float input, float rangeMin, float rangeMax) {
    const double rangeScale = (((static_cast<int64_t>(1) << 32) - 1.0) / (rangeMax - rangeMin));
    int64_t quantized       = (round(input * rangeScale) - round(rangeMin * rangeScale));
    quantized += -(static_cast<int64_t>(1) << 31);
    return quantized;
}

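// Clamped version of FloatToQuantizedUnclamped: quantizes a float into type T
// over [rangeMin, rangeMax], saturating at T's numeric limits. For example,
// FloatToQuantized<uint8_t>(0.5f, 0.0f, 1.0f) evaluates to 128.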
template <class T>
T FloatToQuantized(float input, float rangeMin, float rangeMax) {
    if (std::is_same<T, float>::value) {
        // Specialization for float. This is used in reference implementation
        // for float which is useful to compare performance between float
        // and quantized type.
        return input;
    }
    int64_t quantized              = FloatToQuantizedUnclamped<T>(input, rangeMin, rangeMax);
    const int64_t lowestQuantized  = static_cast<int64_t>(std::numeric_limits<T>::min());
    const int64_t highestQuantized = static_cast<int64_t>(std::numeric_limits<T>::max());
    quantized                      = std::max(quantized, lowestQuantized);
    quantized                      = std::min(quantized, highestQuantized);
    return static_cast<T>(static_cast<int32_t>(quantized));
}

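// Inverse of FloatToQuantized: maps a quantized code of type T back to a float
// in [rangeMin, rangeMax]. Round-tripping only re-introduces quantization error,
// e.g. QuantizedToFloat<uint8_t>(128, 0.0f, 1.0f) ~= 0.502, recovering the 0.5f
// quantized in the example above to within one step.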
template <class T>
float QuantizedToFloat(T input, float rangeMin, float rangeMax) {
    if (std::is_same<T, float>::value) {
        // Specialization for float. This is used in reference implementation
        // for float which is useful to compare performance between float
        // and quantized type.
        return input;
    }
    if (rangeMin == rangeMax) {
        return rangeMin;
    }
    const int numberOfBits        = sizeof(T) * 8;
    const int64_t numberOfSteps   = static_cast<int64_t>(1) << numberOfBits;
    const double rangeAdjust      = (numberOfSteps / (numberOfSteps - 1.0));
    const double range            = ((rangeMax - rangeMin) * rangeAdjust);
    const double rangeScale       = (range / numberOfSteps);
    const int64_t lowestQuantized = static_cast<int64_t>(std::numeric_limits<T>::min());
    const double offsetInput      = static_cast<double>(input) - lowestQuantized;
    // For compatibility with DEQUANTIZE_WITH_EIGEN, we should convert
    // rangeScale to a float, otherwise rangeMinRounded might be slightly
    // different.
    const double rangeMinRounded = round(rangeMin / static_cast<float>(rangeScale)) * static_cast<float>(rangeScale);
    const double result          = rangeMinRounded + (offsetInput * rangeScale);
    return static_cast<float>(result);
}

template <class T>
float QuantizedToFloatOpt(T input, float rangeMin, float rangeMax) {
    if (std::is_same<T, float>::value) {
        // Specialization for float. This is used in reference implementation
        // for float which is useful to compare performance between float
        // and quantized type.
        return input;
    }
    if (rangeMin == rangeMax) {
        return rangeMin;
    }
    const int numberOfBits        = sizeof(int32_t) * 8;
    const int64_t numberOfSteps   = static_cast<int64_t>(1) << numberOfBits;
    const int64_t lowestQuantized = static_cast<int64_t>(1) << (numberOfBits - 1);
    const double rangeScale       = ((rangeMax - rangeMin) / (numberOfSteps - 1.0));
    const double result           = rangeMin + ((input + lowestQuantized) * rangeScale);
    return static_cast<float>(result);
}

template <class T1, class T2>
inline T2 RequantizeInNewRange(T1 input, float minInput, float maxInput, float minNew, float maxNew) {
    const float inputFloat = QuantizedToFloat<T1>(input, minInput, maxInput);
    T2 result              = FloatToQuantized<T2>(inputFloat, minNew, maxNew);
    return result;
}

// Because converting 32-bit accumulated results down to eight bit is a common
// case, we have a specialized code path to handle it as efficiently as
// possible using only fixed-point math for the inner loop.
inline void RequantizeManyInNewRangeReference(const int32_t* input, int64_t count, float minInput, float maxInput,
                                              float minOutput, float maxOutput, uint8_t* output) {
    // Initially we calculate all the constants we need once, before we go into
    // the inner loop.  If this is updated, also update the Eigen version.
    const int fpShift            = 16;
    const float inputRange       = maxInput - minInput;
    const float outputRange      = maxOutput - minOutput;
    const float recipOutputRange = outputRange == 0.0 ? 0.0 : (255.0 / outputRange);
    const float inputRezero      = (minInput + maxInput) / 2.0;
    const int64_t rangeScaleFp =
        outputRange == 0.0 ? 0 : static_cast<int64_t>(255.0 * (1 << fpShift) * inputRange / outputRange);
    const int64_t inputOffsetFp = static_cast<int64_t>(inputRezero * recipOutputRange * (1 << fpShift));
    const int64_t outputOffsetFp =
        outputRange == 0.0 ? 0 : static_cast<int64_t>((1 << fpShift) * (minOutput * 255.0) / outputRange);
    const int64_t roundingDelta = 1 << (fpShift - 1);

    // Inside this loop we just do minimal adds, multiplies, and shifts, in a way
    // that could be easily adapted for a SIMD implementation. It should also be
    // possible to perform all the calculations in 32-bit rather than 64, but
    // that's not been implemented yet.
    for (int64_t index = 0; index < count; ++index) {
        const int64_t inputValue         = static_cast<int64_t>(input[index]);
        const int64_t fpValue            = ((inputValue * rangeScaleFp) >> 32) + inputOffsetFp;
        const int64_t offsetIntermediate = fpValue - outputOffsetFp;
        const int64_t roundIntermediate  = offsetIntermediate + roundingDelta;
        int64_t quantizedInt64           = roundIntermediate >> fpShift;
        quantizedInt64                   = std::max(quantizedInt64, int64_t(0));
        quantizedInt64                   = std::min(quantizedInt64, int64_t(255));
        output[index]                    = static_cast<uint8_t>(static_cast<int32_t>(quantizedInt64));
    }
}

// Another common case is converting eight bit inputs up to thirty two bits, so
// we have specialized fixed-point code to accelerate that. (The original
// TensorFlow sources also provide a NEON version of this path for ARM devices.)
inline void RequantizeManyInNewRange8To32BitReference(const uint8_t* input, int64_t count, float minInput,
                                                      float maxInput, float minOutput, float maxOutput,
                                                      int32_t* output) {
    const float code0Float         = QuantizedToFloat<uint8_t>(0, minInput, maxInput);
    const float code1Float         = QuantizedToFloat<uint8_t>(1, minInput, maxInput);
    const int64_t code0Int64       = FloatToQuantizedUnclamped<int32_t>(code0Float, minOutput, maxOutput);
    const int64_t code1Int64       = FloatToQuantizedUnclamped<int32_t>(code1Float, minOutput, maxOutput);
    const int32_t multInt32        = static_cast<int32_t>(code1Int64 - code0Int64);
    const int64_t lowestQuantized  = static_cast<int64_t>(std::numeric_limits<int32_t>::min());
    const int64_t highestQuantized = static_cast<int64_t>(std::numeric_limits<int32_t>::max());
    for (int64_t i = 0; i < count; ++i) {
        const int64_t inputValue = static_cast<int64_t>(input[i]);
        int64_t outputValue      = code0Int64 + (inputValue * multInt32);
        outputValue              = std::max(outputValue, lowestQuantized);
        outputValue              = std::min(outputValue, highestQuantized);
        output[i]                = static_cast<int32_t>(outputValue);
    }
}

template <class T1, class T2>
inline void RequantizeManyInNewRange(const T1* input, int64_t count, float minInput, float maxInput, float minOutput,
                                     float maxOutput, T2* output) {
    for (int64_t index = 0; index < count; ++index) {
        const float inputFloat = QuantizedToFloat<T1>(input[index], minInput, maxInput);
        output[index]          = FloatToQuantized<T2>(inputFloat, minOutput, maxOutput);
    }
}

//    template <>
//    inline void RequantizeManyInNewRange<int32_t, uint8_t>(
//        const int32_t* input, int64_t count, float minInput, float maxInput,
//        float minOutput, float maxOutput, uint8_t* output) {
//        RequantizeManyInNewRangeReference(input, count, minInput, maxInput,
//                                          minOutput, maxOutput, output);
//    }

template <>
inline void RequantizeManyInNewRange<uint8_t, int32_t>(const uint8_t* input, int64_t count, float minInput,
                                                       float maxInput, float minOutput, float maxOutput,
                                                       int32_t* output) {
    RequantizeManyInNewRange8To32BitReference(input, count, minInput, maxInput, minOutput, maxOutput, output);
}

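// Scans a tensor (read as int32 data) and reports the smallest and largest
// quantized values actually present, writing them to usedMinQuantized[0] and
// usedMaxQuantized[0]; a caller can use these to pick a tighter requantization
// range than the theoretical one.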
inline void CalculateUsedRange(Tensor* input, int32_t* usedMinQuantized, int32_t* usedMaxQuantized) {
    int inputDataSize = 1;
    for (int i = 0; i < input->buffer().dimensions; i++) {
        inputDataSize *= input->buffer().dim[i].extent;
    }
    int32_t* inputData = (int32_t*)input->buffer().host;

    usedMinQuantized[0] = inputData[0];
    usedMaxQuantized[0] = inputData[0];
    for (int i = 0; i < inputDataSize; i++) {
        if (inputData[i] < usedMinQuantized[0]) {
            usedMinQuantized[0] = inputData[i];
        }
        if (inputData[i] > usedMaxQuantized[0]) {
            usedMaxQuantized[0] = inputData[i];
        }
    }
}

inline void GetOutputMinAndMaxForQuantizedAdd(float inputMin, float inputMax, float smallerInputMin,
                                              float smallerInputMax, float* outputMin, float* outputMax) {
    // We need to have a good range to add our two arguments together in. This
    // is surprisingly tricky, since it has to satisfy a few different needs:
    //  - Must be symmetrical around zero, so that 0 + 0 = 0.
    //  - Must hold the largest of the argument ranges.
    //  - Should have enough range that the bits of the lowest and highest
    //    arguments overlap if possible without the lower getting truncated.
    //  - Should have some headroom so that there's no overflow.
    //  - Needs to be signed.
    // This leads us to use a scheme where we (assuming the inputs are eight bit
    // and the output is 32-bit) use the bottom 32 - 17 = 15 bits to store the
    // accumulated results. This gives us all the properties we need.
    *outputMax = std::max(inputMax, std::max(-inputMin, std::max(smallerInputMax, -smallerInputMin))) * (1 << 17);
    *outputMin = -(*outputMax);
}
} // namespace MNN

#endif /* QUANTIZATION_HPP */