//
//  CPUPoolInt8.cpp
//  MNN
//
//  Created by MNN on 2019/06/10.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "backend/cpu/CPUPoolInt8.hpp"
#include "core/Macro.h"
#include <math.h>
#ifdef MNN_USE_NEON
#include <arm_neon.h>
#endif
#include "compute/Int8FunctionsOpt.h"
#include "core/Concurrency.h"
#include "backend/cpu/compute/CommonOptFunction.h"

namespace MNN {

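// The pooling helpers below work on data packed as NC16HW16: channels are grouped
// into blocks of 16 int8 lanes, and each (x, y) position stores one 16-lane block
// contiguously. `poolfunc` is a backend-provided kernel (here core->MNNAvgPoolInt8
// or core->MNNMaxPoolInt8 from Int8FunctionsOpt) that, as used here, pools a run of
// consecutive output columns for one 16-channel block. Each output row is handled
// in three pieces: columns whose window hangs over the left edge, fully interior
// columns (one batched kernel call), and columns whose window reaches the right edge.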
static void poolingAvgNC16HW16Int8(void poolfunc(int8_t*, int8_t*, size_t, size_t, size_t, size_t, size_t, ssize_t, ssize_t), const Tensor *src, Tensor *dst,
                                   int stridesx, int stridesy, int kernelx, int kernely, int paddingx, int paddingy)
{
    const int inputHeight = src->height();
    const int inputWidth = src->width();
    const int outputHeight = dst->height();
    const int outputWidth = dst->width();
    const int channel = dst->channel();
    const int batchsize = src->batch();

    const auto srcPtr = src->host<int8_t>();
    auto dstPtr       = dst->host<int8_t>();
    int pack = 16;
    int thred0 = UP_DIV(paddingx, stridesx);
    int thred1 = inputWidth + paddingx - kernelx;
    thred1 = UP_DIV(thred1, stridesx);     // ix + kernelx >= inputWidth;
    // int factor = static_cast<int>((1 << 24)/(kernelx * kernely));

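    // thred0 / thred1 split each output row into three column ranges:
    //   [0, thred0)            -> the window starts left of the input (ix < 0)
    //   [thred0, thred1)       -> the window lies fully inside the input
    //   [thred1, outputWidth)  -> the window may extend past the right edge
    // Illustrative example (not from the source): inputWidth = 7, kernelx = 3,
    // stridesx = 2, paddingx = 1 gives thred0 = UP_DIV(1, 2) = 1 and
    // thred1 = UP_DIV(7 + 1 - 3, 2) = 3, so ox = 0 clips on the left,
    // ox = 1..2 are interior, and ox = 3 clips on the right.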
    const int channel_ = UP_DIV(channel, pack);
    for (int oc = 0; oc < channel_; ++oc) {
        for(int ob = 0; ob < batchsize; ++ob) {
            for (int oy = 0; oy < outputHeight; ++oy) {
                int iy = oy * stridesy - paddingy;
                const int kernely_ = std::min(iy + kernely, inputHeight) - std::max(iy, 0);
                iy = std::max(iy, 0);
                int ox = 0;
                for (ox = 0; ox < thred0; ++ox) { // ix < 0;
                    int ix = ox * stridesx - paddingx;
                    const int kernelx_ = std::min(ix + kernelx, inputWidth) - std::max(ix, 0);
                    ix = std::max(ix, 0);

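                    // The per-window average is done in fixed point. A sketch of
                    // what the avg-pool kernel is expected to compute per lane
                    // (the exact rounding lives inside MNNAvgPoolInt8):
                    //   mul = (1 << 24) / (kernelx_ * kernely_);
                    //   avg = (sum_of_window * mul) >> 24;   // ~ sum / window_area
                    // Recomputing mul per column range keeps the average correct
                    // when the window is clipped at the borders.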
                    int mul = static_cast<int>((1 << 24)/(kernelx_ * kernely_));

                    const int indexOutput = pack * (ox + outputWidth * (oy + outputHeight * (ob + batchsize * oc)));
                    const int indexInput = pack * (ix + inputWidth * (iy + inputHeight * (ob + batchsize * oc)));
                    int8_t* dstCur = dstPtr + indexOutput;
                    int8_t* srcCur = srcPtr + indexInput;

                    poolfunc(dstCur, srcCur, 1, inputWidth, kernelx_, kernely_, stridesx, paddingx, mul);
                } // ix < 0;

                // ix > 0 && ix + kernelx < inputWidth;
                if (thred1 - thred0 > 0) {
                    int ix = ox * stridesx - paddingx;
                    const int kernelx_ = std::min(ix + kernelx, inputWidth) - std::max(ix, 0);
                    ix = std::max(ix, 0);
                    int mul = static_cast<int>((1 << 24)/(kernelx_ * kernely_));

                    const int indexOutput = pack * (ox + outputWidth * (oy + outputHeight * (ob + batchsize * oc)));
                    const int indexInput = pack * (ix + inputWidth * (iy + inputHeight * (ob + batchsize * oc)));

                    int8_t* dstCur = dstPtr + indexOutput;
                    int8_t* srcCur = srcPtr + indexInput;

                    poolfunc(dstCur, srcCur, thred1 - thred0, inputWidth, kernelx_, kernely_, stridesx, 0, mul);
                }

                for (ox = thred1; ox < outputWidth; ++ox) { // ix + kernelx > inputWidth;
                    int ix = ox * stridesx - paddingx;
                    const int kernelx_ = std::min(ix + kernelx, inputWidth) - std::max(ix, 0);
                    ix = std::max(ix, 0);

                    int mul = static_cast<int>((1 << 24)/(kernelx_ * kernely_));

                    const int indexOutput = pack * (ox + outputWidth * (oy + outputHeight * (ob + batchsize * oc)));
                    const int indexInput = pack * (ix + inputWidth * (iy + inputHeight * (ob + batchsize * oc)));
                    int8_t* dstCur = dstPtr + indexOutput;
                    int8_t* srcCur = srcPtr + indexInput;

                    poolfunc(dstCur, srcCur, 1, inputWidth, kernelx_, kernely_, stridesx, paddingx, mul);
                }
            }
        }
    }
}

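// Max pooling uses the same NC16HW16 layout and the same three-way split over
// output columns as the average version above; no fixed-point multiplier is
// needed because only the maximum of each window is taken.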
static void poolingMaxNC16HW16Int8(void poolfunc(int8_t*, int8_t*, size_t, size_t, size_t, size_t, size_t), const Tensor *src, Tensor *dst, int stridesx, int stridesy, int kernelx, int kernely, int paddingx, int paddingy)
{
    const int inputHeight = src->height();
    const int inputWidth = src->width();
    const int outputHeight = dst->height();
    const int outputWidth = dst->width();
    const int channel = dst->channel();
    const int batchsize = src->batch();
    int pack = 16;
    int thred0 = UP_DIV(paddingx, stridesx);
    int thred1 = inputWidth + paddingx - kernelx;
    thred1 = UP_DIV(thred1, stridesx);     // ix + kernelx >= inputWidth;

    const auto srcPtr = src->host<int8_t>();
    auto dstPtr       = dst->host<int8_t>();

    const int channel16 = UP_DIV(channel, pack);
    for (int oc = 0; oc < channel16; ++oc){
        for(int ob = 0; ob < batchsize; ++ob){
            for (int oy = 0; oy < outputHeight; ++oy) {
                int iy = oy * stridesy - paddingy;
                const int kernely_ = std::min(iy + kernely, inputHeight) - std::max(iy, 0);
                iy = std::max(iy, 0);
                int ox = 0;
                for (ox = 0; ox < thred0; ++ox) { // ix < 0;
                    int ix = ox * stridesx - paddingx;
                    const int kernelx_ = std::min(ix + kernelx, inputWidth) - std::max(ix, 0);
                    ix = std::max(ix, 0);

                    const int indexOutput = pack * (ox + outputWidth * (oy + outputHeight * (ob + batchsize * oc)));
                    const int indexInput = pack * (ix + inputWidth * (iy + inputHeight * (ob + batchsize * oc)));
                    int8_t* dstCur = dstPtr + indexOutput;
                    int8_t* srcCur = srcPtr + indexInput;

                    poolfunc(dstCur, srcCur, 1, inputWidth, kernelx_, kernely_, stridesx);
                } // ix < 0;

                // ix > 0 && ix + kernelx < inputWidth;
                if (thred1 - thred0 > 0) {
                    int ix = ox * stridesx - paddingx;
                    const int kernelx_ = std::min(ix + kernelx, inputWidth) - std::max(ix, 0);
                    ix = std::max(ix, 0);

                    const int indexOutput = pack * (ox + outputWidth * (oy + outputHeight * (ob + batchsize * oc)));
                    const int indexInput = pack * (ix + inputWidth * (iy + inputHeight * (ob + batchsize * oc)));

                    int8_t* dstCur = dstPtr + indexOutput;
                    int8_t* srcCur = srcPtr + indexInput;

                    poolfunc(dstCur, srcCur, thred1 - thred0, inputWidth, kernelx_, kernely_, stridesx);
                }

                for (ox = thred1; ox < outputWidth; ++ox) { // ix + kernelx > inputWidth;
                    int ix = ox * stridesx - paddingx;
                    const int kernelx_ = std::min(ix + kernelx, inputWidth) - std::max(ix, 0);
                    ix = std::max(ix, 0);

                    const int indexOutput = pack * (ox + outputWidth * (oy + outputHeight * (ob + batchsize * oc)));
                    const int indexInput = pack * (ix + inputWidth * (iy + inputHeight * (ob + batchsize * oc)));
                    int8_t* dstCur = dstPtr + indexOutput;
                    int8_t* srcCur = srcPtr + indexInput;

                    poolfunc(dstCur, srcCur, 1, inputWidth, kernelx_, kernely_, stridesx);
                }
            }
        }
    }
}

CPUPoolInt8::CPUPoolInt8(Backend *backend, const Pool *parameter) : Execution(backend), mParameter(parameter) {
}

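// onResize precomputes everything that does not depend on tensor contents: the
// effective kernel/stride/padding (global pooling and SAME padding are resolved
// here), the pooling lambda bound to the backend's int8 kernels, and temporary
// NC16HW16-packed buffers used when the backend's native pack differs from 16.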
ErrorCode CPUPoolInt8::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    const auto input = inputs[0];
    auto output      = outputs[0];

    auto core = static_cast<CPUBackend*>(backend())->int8Functions();

    int strideWidth  = mParameter->strideX();
    int strideHeight = mParameter->strideY();
    int padWidth     = mParameter->padX();
    int padHeight    = mParameter->padY();
    int kernelWidth  = mParameter->kernelX();
    int kernelHeight = mParameter->kernelY();

    const int inputWidth   = input->width();
    const int inputHeight  = input->height();
    const int outputWidth  = output->width();
    const int outputHeight = output->height();

    kernelWidth  = std::min(kernelWidth, inputWidth);
    kernelHeight = std::min(kernelHeight, inputHeight);
    if (mParameter->isGlobal()) {
        kernelWidth  = inputWidth;
        kernelHeight = inputHeight;
        strideWidth  = inputWidth;
        strideHeight = inputHeight;
        padWidth     = 0;
        padHeight    = 0;
    }
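    // For SAME padding, the total padding is whatever is needed for all output
    // positions to fit; half of it is applied on the left/top, and any odd
    // remainder effectively falls on the right/bottom through border clipping.
    // Illustrative example (not from the source): inputWidth = 5, kernelWidth = 3,
    // strideWidth = 2, outputWidth = 3 -> padNeededWidth = (3 - 1) * 2 + 3 - 5 = 2,
    // so padWidth = 1.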
    if (mParameter->padType() == PoolPadType_SAME) {
        int padNeededWidth  = (outputWidth - 1) * strideWidth + kernelWidth - inputWidth;
        int padNeededHeight = (outputHeight - 1) * strideHeight + kernelHeight - inputHeight;
        padWidth            = padNeededWidth > 0 ? padNeededWidth / 2 : 0;
        padHeight           = padNeededHeight > 0 ? padNeededHeight / 2 : 0;
    }

    const int channel = input->channel();

    mThreadFunction = [=](const Tensor *src, Tensor *dst) {
        poolingMaxNC16HW16Int8(core->MNNMaxPoolInt8, src, dst, strideWidth, strideHeight, kernelWidth, kernelHeight, padWidth, padHeight);
    };
    if (mParameter->type() == MNN::PoolType_AVEPOOL) {
        mThreadFunction = [=](const Tensor *src, Tensor *dst) {
            poolingAvgNC16HW16Int8(core->MNNAvgPoolInt8, src, dst, strideWidth, strideHeight, kernelWidth, kernelHeight, padWidth, padHeight);
        };
    }

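    // The int8 pooling kernels expect 16-channel packed data, so scratch tensors
    // padded up to a multiple of 16 channels are planned here. Acquiring and then
    // immediately releasing them with Backend::DYNAMIC is the usual pattern in MNN
    // for reserving scratch memory that later ops may reuse after this op runs.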
    mInputTemp.reset(Tensor::createDevice<int8_t>({input->batch(), inputHeight, inputWidth, UP_DIV(channel, 16) * 16}));
    mOutputTemp.reset(Tensor::createDevice<int8_t>({output->batch(), outputHeight, outputWidth, UP_DIV(channel, 16) * 16}));

    bool allocSucc = backend()->onAcquireBuffer(mInputTemp.get(), Backend::DYNAMIC);
    allocSucc      = allocSucc && backend()->onAcquireBuffer(mOutputTemp.get(), Backend::DYNAMIC);
    if (!allocSucc) {
        return OUT_OF_MEMORY;
    }

    backend()->onReleaseBuffer(mInputTemp.get(), Backend::DYNAMIC);
    backend()->onReleaseBuffer(mOutputTemp.get(), Backend::DYNAMIC);
    return NO_ERROR;
}

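// onExecute bridges between the backend's native channel packing (core->pack) and
// the 16-channel packing the pooling kernels expect: for pack 4 or 8 the input is
// repacked into mInputTemp, pooled into mOutputTemp, and unpacked back; for pack 16
// the tensors are already in the right layout and are pooled directly.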
ErrorCode CPUPoolInt8::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    auto input  = inputs[0];
    auto output = outputs[0];
    auto channel_input = input->channel();
    auto plane_in = input->width() * input->height() * input->batch();
    auto plane_out = output->width() * output->height() * output->batch();
    auto core = static_cast<CPUBackend*>(backend())->functions();
    auto depth = UP_DIV(channel_input, core->pack);

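    // The repack helpers move whole channel blocks, so the int8 data is
    // reinterpreted as wider elements: with pack == 8 one block is 8 bytes and is
    // moved as a double via MNNPackC2Origin, with pack == 4 one block is 4 bytes
    // and is moved as a float via MNNPackC4Origin; the matching Unpack routines
    // restore the native layout afterwards.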
    if (core->pack == 8) {
        MNNPackC2Origin(mInputTemp.get()->host<double>(), input->host<double>(), plane_in, depth, plane_in);
        mThreadFunction(mInputTemp.get(), mOutputTemp.get());
        MNNUnpackC2Origin(output->host<double>(), mOutputTemp.get()->host<double>(), plane_out, depth, plane_out);
    }
    else if (core->pack == 4) {
        MNNPackC4Origin(mInputTemp.get()->host<float>(), input->host<float>(), plane_in, depth, plane_in);
        mThreadFunction(mInputTemp.get(), mOutputTemp.get());
        MNNUnpackC4Origin(output->host<float>(), mOutputTemp.get()->host<float>(), plane_out, depth, plane_out);
    }
    else if (core->pack == 16) {
        mThreadFunction(input, output);
    }
    return NO_ERROR;
}

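// Factory that creates CPUPoolInt8 executions; registered below for OpType_PoolInt8.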
class CPUPoolInt8Creator : public CPUBackend::Creator {
public:
    virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
                                const MNN::Op *op, Backend *backend) const override {
        return new CPUPoolInt8(backend, op->main_as_Pool());
    }
};

REGISTER_CPU_OP_CREATOR(CPUPoolInt8Creator, OpType_PoolInt8);

} // namespace MNN