MNN/source/backend/cpu/CPUQuantizedMaxPool.cpp

//
//  CPUQuantizedMaxPool.cpp
//  MNN
//
//  Created by MNN on 2018/08/08.
//  Copyright © 2018, Alibaba Group Holding Limited
//
#ifdef MNN_SUPPORT_TFLITE_QUAN
#include "backend/cpu/CPUQuantizedMaxPool.hpp"

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <limits>

#include "backend/cpu/CPUBackend.hpp"
#include "backend/cpu/CPUQuantizationUtils.hpp"
#include "backend/cpu/compute/CommonOptFunction.h"
#include "core/Macro.h"

namespace MNN {

CPUQuantizedMaxPool::CPUQuantizedMaxPool(Backend *backend, const Op *op) : Execution(backend) {
    auto mp       = op->main_as_QuantizedMaxPool();
    mIstflite     = (mp->modelFormat() == ModeFormat_TFLITE);
    mKernelWidth  = mp->kernelX();
    mKernelHeight = mp->kernelY();
    mPadWidth     = mp->padX();
    mPadHeight    = mp->padY();
    mStrideWidth  = mp->strideX();
    mStrideHeight = mp->strideY();
    mPadMode      = mp->padType();
}

ErrorCode CPUQuantizedMaxPool::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    auto input  = inputs[0];
    auto output = outputs[0];

    MNN_ASSERT(input->buffer().dimensions == 4);

    if (!mIstflite) {
        MNN_ASSERT(inputs.size() == 3);
        MNN_ASSERT(outputs.size() == 3);
        const float minInput                    = inputs[1]->host<float>()[0];
        const float maxInput                    = inputs[2]->host<float>()[0];
        ((float *)outputs[1]->buffer().host)[0] = minInput;
        ((float *)outputs[2]->buffer().host)[0] = maxInput;
    }

    // input : nhwc
    const int32_t inBatch   = input->buffer().dim[0].extent;
    const int32_t inRows    = input->buffer().dim[1].extent;
    const int32_t inCols    = input->buffer().dim[2].extent;
    const int32_t inChannel = input->buffer().dim[3].extent;

    int32_t padRows          = mPadHeight;
    int32_t padCols          = mPadWidth;
    const int32_t windowRows = mKernelHeight;
    const int32_t windowCols = mKernelWidth;
    const int32_t rowStride  = mStrideHeight;
    const int32_t colStride  = mStrideWidth;
    const int32_t outHeight  = output->buffer().dim[1].extent;
    const int32_t outWidth   = output->buffer().dim[2].extent;

    switch (mPadMode) {
        case PoolPadType_VALID:
            padRows = padCols = 0;
            break;
        case PoolPadType_SAME: {
            auto widthNeeded  = (outWidth - 1) * colStride + windowCols - inCols;
            auto heightNeeded = (outHeight - 1) * rowStride + windowRows - inRows;
            mPadWidth         = widthNeeded > 0 ? widthNeeded / 2 : 0;
            mPadHeight        = heightNeeded > 0 ? heightNeeded / 2 : 0;
            break;
        }
        default:
            MNN_ASSERT(false);
            break;
    }

    uint8_t *inputPtr            = (uint8_t *)input->buffer().host;
    uint8_t *outputPtr           = (uint8_t *)output->buffer().host;
    const uint8_t minAsQuantized = 0;

    for (int batchIndex = 0; batchIndex < inBatch; batchIndex++) {
        uint8_t *outputBatchPtr = outputPtr + batchIndex * outWidth * outHeight * inChannel;
        uint8_t *inputBatchPtr  = inputPtr + batchIndex * inCols * inRows * inChannel;

        for (int channelIndex = 0; channelIndex < inChannel; channelIndex++) {
            for (int outHeightIndex = 0; outHeightIndex < outHeight; outHeightIndex++) {
                for (int outWidthIndex = 0; outWidthIndex < outWidth; outWidthIndex++) {
                    uint8_t maxTemp          = std::numeric_limits<uint8_t>::min();
                    int32_t inputHeightIndex = outHeightIndex * rowStride - padRows;
                    int32_t inputWidthIndex  = outWidthIndex * colStride - padCols;
                    uint8_t *outputTemp      = (uint8_t *)(outputBatchPtr + outHeightIndex * outWidth * inChannel +
                                                      outWidthIndex * inChannel + channelIndex);
                    for (int windowRowsIndex = 0; windowRowsIndex < windowRows; windowRowsIndex++) {
                        for (int windowColsIndex = 0; windowColsIndex < windowCols; windowColsIndex++) {
                            if (((inputWidthIndex + windowColsIndex) < 0) ||
                                ((inputWidthIndex + windowColsIndex) >= inCols) ||
                                ((inputHeightIndex + windowRowsIndex) < 0) ||
                                ((inputHeightIndex + windowRowsIndex) >= inRows)) {
                                maxTemp = std::max(minAsQuantized, maxTemp);
                            } else {
                                maxTemp = std::max(
                                    inputBatchPtr[(inputHeightIndex + windowRowsIndex) * inCols * inChannel +
                                                  (inputWidthIndex + windowColsIndex) * inChannel + channelIndex],
                                    maxTemp);
                            }
                        }
                    }
                    *outputTemp = maxTemp;
                }
            }
        }
    }

    return NO_ERROR;
}

class CPUQuantizedMaxPoolCreator : public CPUBackend::Creator {
public:
    virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
                                const MNN::Op *op, Backend *backend) const {
        return new CPUQuantizedMaxPool(backend, op);
    }
};
REGISTER_CPU_OP_CREATOR(CPUQuantizedMaxPoolCreator, OpType_QuantizedMaxPool);
} // namespace MNN
#endif
beta 0.1.0 2019-04-17 10:49:11 +08:00			`//`
			`// CPUQuantizedMaxPool.cpp`
			`// MNN`
			`//`
			`// Created by MNN on 2018/08/08.`
			`// Copyright © 2018, Alibaba Group Holding Limited`
			`//`
- build: - unify schema building in core and converter; - add more build script for android; - add linux build script for python; - ops impl: - add floor mod support in binary; - use eltwise impl in add/max/sub/mul binary for optimization; - remove fake double support in cast; - fix 5d support for concat; - add adjX and adjY support for batch matmul; - optimize conv2d back prop filter; - add pad mode support for conv3d; - fix bug in conv2d & conv depthwise with very small feature map; - optimize binary without broacast; - add data types support for gather; - add gather ND support; - use uint8 data type in gather v2; - add transpose support for matmul; - add matrix band part; - add dim != 4 support for padding, reshape & tensor convert; - add pad type support for pool3d; - make ops based on TensorFlow Lite quantization optional; - add all & any support for reduction; - use type in parameter as output type in reduction; - add int support for unary; - add variable weight support for conv2d; - fix conv2d depthwise weights initialization; - fix type support for transpose; - fix grad outputs count for reduce grad and reshape grad; - fix priorbox & detection output; - fix metal softmax error; - python: - add runSessionWithCallBackInfo interface; - add max nodes limit (1400) for visualization tool; - fix save error in python3; - align default dim; - convert: - add extra design for optimization; - add more post converting optimizers; - add caffe v1 weights blob support; - add cast, unary, conv transpose support for onnx model; - optimize batchnorm, conv with variable weights, prelu, reshape, slice, upsample for onnx model; - add cos/sin/atan/tan support for unary for tensorflow model; - add any/all support for reduction for tensorflow model; - add elu, conv3d, pool3d support for tensorflow model; - optimize argmax, batchnorm, concat, batch to space, conv with variable weights, prelu, slice for tensorflow model; - others: - fix size computer lock; - fix thread pool deadlock; 
- add express & parameters in express; - rewrite blitter chooser without static map; - add tests for expr; 2019-10-29 13:37:26 +08:00			`#ifdef MNN_SUPPORT_TFLITE_QUAN`
Update 2019-12-27 22:16:57 +08:00			`#include "backend/cpu/CPUQuantizedMaxPool.hpp"`
			`#include "backend/cpu/CPUBackend.hpp"`
			`#include "backend/cpu/CPUQuantizationUtils.hpp"`
			`#include "backend/cpu/compute/CommonOptFunction.h"`
			`#include "core/Macro.h"`
beta 0.1.0 2019-04-17 10:49:11 +08:00
			`namespace MNN {`

			`CPUQuantizedMaxPool::CPUQuantizedMaxPool(Backend backend, const Op op) : Execution(backend) {`
			`auto mp = op->main_as_QuantizedMaxPool();`
			`mIstflite = (mp->modelFormat() == ModeFormat_TFLITE);`
			`mKernelWidth = mp->kernelX();`
			`mKernelHeight = mp->kernelY();`
			`mPadWidth = mp->padX();`
			`mPadHeight = mp->padY();`
			`mStrideWidth = mp->strideX();`
			`mStrideHeight = mp->strideY();`
			`mPadMode = mp->padType();`
			`}`

			`ErrorCode CPUQuantizedMaxPool::onExecute(const std::vector<Tensor > &inputs, const std::vector<Tensor > &outputs) {`
			`auto input = inputs[0];`
			`auto output = outputs[0];`

			`MNN_ASSERT(input->buffer().dimensions == 4);`

			`if (!mIstflite) {`
			`MNN_ASSERT(inputs.size() == 3);`
			`MNN_ASSERT(outputs.size() == 3);`
			`const float minInput = inputs[1]->host<float>()[0];`
			`const float maxInput = inputs[2]->host<float>()[0];`
			`((float *)outputs[1]->buffer().host)[0] = minInput;`
			`((float *)outputs[2]->buffer().host)[0] = maxInput;`
			`}`

			`// input : nhwc`
			`const int32_t inBatch = input->buffer().dim[0].extent;`
			`const int32_t inRows = input->buffer().dim[1].extent;`
			`const int32_t inCols = input->buffer().dim[2].extent;`
			`const int32_t inChannel = input->buffer().dim[3].extent;`

			`int32_t padRows = mPadHeight;`
			`int32_t padCols = mPadWidth;`
			`const int32_t windowRows = mKernelHeight;`
			`const int32_t windowCols = mKernelWidth;`
			`const int32_t rowStride = mStrideHeight;`
			`const int32_t colStride = mStrideWidth;`
			`const int32_t outHeight = output->buffer().dim[1].extent;`
			`const int32_t outWidth = output->buffer().dim[2].extent;`

			`switch (mPadMode) {`
			`case PoolPadType_VALID:`
			`padRows = padCols = 0;`
			`break;`
			`case PoolPadType_SAME: {`
			`auto widthNeeded = (outWidth - 1) * colStride + windowCols - inCols;`
			`auto heightNeeded = (outHeight - 1) * rowStride + windowRows - inRows;`
			`mPadWidth = widthNeeded > 0 ? widthNeeded / 2 : 0;`
			`mPadHeight = heightNeeded > 0 ? heightNeeded / 2 : 0;`
			`break;`
			`}`
			`default:`
			`MNN_ASSERT(false);`
			`break;`
			`}`

			`uint8_t inputPtr = (uint8_t )input->buffer().host;`
			`uint8_t outputPtr = (uint8_t )output->buffer().host;`
			`const uint8_t minAsQuantized = 0;`

			`for (int batchIndex = 0; batchIndex < inBatch; batchIndex++) {`
			`uint8_t outputBatchPtr = outputPtr + batchIndex outWidth * outHeight * inChannel;`
			`uint8_t inputBatchPtr = inputPtr + batchIndex inCols * inRows * inChannel;`

			`for (int channelIndex = 0; channelIndex < inChannel; channelIndex++) {`
			`for (int outHeightIndex = 0; outHeightIndex < outHeight; outHeightIndex++) {`
			`for (int outWidthIndex = 0; outWidthIndex < outWidth; outWidthIndex++) {`
			`uint8_t maxTemp = std::numeric_limits<uint8_t>::min();`
			`int32_t inputHeightIndex = outHeightIndex * rowStride - padRows;`
			`int32_t inputWidthIndex = outWidthIndex * colStride - padCols;`
			`uint8_t outputTemp = (uint8_t )(outputBatchPtr + outHeightIndex * outWidth * inChannel +`
			`outWidthIndex * inChannel + channelIndex);`
			`for (int windowRowsIndex = 0; windowRowsIndex < windowRows; windowRowsIndex++) {`
			`for (int windowColsIndex = 0; windowColsIndex < windowCols; windowColsIndex++) {`
			`if (((inputWidthIndex + windowColsIndex) < 0) \|\|`
			`((inputWidthIndex + windowColsIndex) >= inCols) \|\|`
			`((inputHeightIndex + windowRowsIndex) < 0) \|\|`
			`((inputHeightIndex + windowRowsIndex) >= inRows)) {`
			`maxTemp = std::max(minAsQuantized, maxTemp);`
			`} else {`
			`maxTemp = std::max(`
			`inputBatchPtr[(inputHeightIndex + windowRowsIndex) * inCols * inChannel +`
			`(inputWidthIndex + windowColsIndex) * inChannel + channelIndex],`
			`maxTemp);`
			`}`
			`}`
			`}`
			`*outputTemp = maxTemp;`
			`}`
			`}`
			`}`
			`}`

			`return NO_ERROR;`
			`}`

			`class CPUQuantizedMaxPoolCreator : public CPUBackend::Creator {`
			`public:`
			`virtual Execution onCreate(const std::vector<Tensor > &inputs, const std::vector<Tensor *> &outputs,`
			`const MNN::Op op, Backend backend) const {`
			`return new CPUQuantizedMaxPool(backend, op);`
			`}`
			`};`
			`REGISTER_CPU_OP_CREATOR(CPUQuantizedMaxPoolCreator, OpType_QuantizedMaxPool);`
			`} // namespace MNN`
- build: - unify schema building in core and converter; - add more build script for android; - add linux build script for python; - ops impl: - add floor mod support in binary; - use eltwise impl in add/max/sub/mul binary for optimization; - remove fake double support in cast; - fix 5d support for concat; - add adjX and adjY support for batch matmul; - optimize conv2d back prop filter; - add pad mode support for conv3d; - fix bug in conv2d & conv depthwise with very small feature map; - optimize binary without broacast; - add data types support for gather; - add gather ND support; - use uint8 data type in gather v2; - add transpose support for matmul; - add matrix band part; - add dim != 4 support for padding, reshape & tensor convert; - add pad type support for pool3d; - make ops based on TensorFlow Lite quantization optional; - add all & any support for reduction; - use type in parameter as output type in reduction; - add int support for unary; - add variable weight support for conv2d; - fix conv2d depthwise weights initialization; - fix type support for transpose; - fix grad outputs count for reduce grad and reshape grad; - fix priorbox & detection output; - fix metal softmax error; - python: - add runSessionWithCallBackInfo interface; - add max nodes limit (1400) for visualization tool; - fix save error in python3; - align default dim; - convert: - add extra design for optimization; - add more post converting optimizers; - add caffe v1 weights blob support; - add cast, unary, conv transpose support for onnx model; - optimize batchnorm, conv with variable weights, prelu, reshape, slice, upsample for onnx model; - add cos/sin/atan/tan support for unary for tensorflow model; - add any/all support for reduction for tensorflow model; - add elu, conv3d, pool3d support for tensorflow model; - optimize argmax, batchnorm, concat, batch to space, conv with variable weights, prelu, slice for tensorflow model; - others: - fix size computer lock; - fix thread pool deadlock; 
- add express & parameters in express; - rewrite blitter chooser without static map; - add tests for expr; 2019-10-29 13:37:26 +08:00			`#endif`