mirror of https://github.com/alibaba/MNN.git
				
				
				
			
		
			
				
	
	
		
			270 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			C++
		
	
	
	
			
		
		
	
	
			270 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			C++
		
	
	
	
| //
 | |
| //  CPUPoolInt8.cpp
 | |
| //  MNN
 | |
| //
 | |
| //  Created by MNN on 2019/06/10.
 | |
| //  Copyright © 2018, Alibaba Group Holding Limited
 | |
| //
 | |
| 
 | |
| #include "backend/cpu/CPUPoolInt8.hpp"
 | |
| #include "core/Macro.h"
 | |
| #include <math.h>
 | |
| #ifdef MNN_USE_NEON
 | |
| #include <arm_neon.h>
 | |
| #endif
 | |
| #include "compute/Int8FunctionsOpt.h"
 | |
| #include "core/Concurrency.h"
 | |
| #include "backend/cpu/compute/CommonOptFunction.h"
 | |
| 
 | |
| namespace MNN {
 | |
| 
 | |
| static void poolingAvgNC16HW16Int8(void poolfunc(int8_t*, int8_t*, size_t, size_t, size_t, size_t, size_t, ssize_t, ssize_t), const Tensor *src, Tensor *dst,
 | |
|                                    int stridesx, int stridesy, int kernelx, int kernely, int paddingx, int paddingy)
 | |
| {
 | |
|     const int inputHeight = src->height();
 | |
|     const int inputWidth = src->width();
 | |
|     const int outputHeight = dst->height();
 | |
|     const int outputWidth = dst->width();
 | |
|     const int channel = dst->channel();
 | |
|     const int batchsize = src->batch();
 | |
| 
 | |
|     const auto srcPtr = src->host<int8_t>();
 | |
|     auto dstPtr       = dst->host<int8_t>();
 | |
|     int pack = 16;
 | |
|     int thred0 = UP_DIV(paddingx, stridesx);
 | |
|     int thred1 = inputWidth + paddingx - kernelx;
 | |
|     thred1 = UP_DIV(thred1, stridesx);     // ix + kernelx >= inputWidth;
 | |
|     // int factor = static_cast<int>((1 << 24)/(kernelx * kernely));
 | |
| 
 | |
|     const int channel_ = UP_DIV(channel, pack);
 | |
|     for (int oc = 0; oc < channel_; ++oc) {
 | |
|         for(int ob = 0; ob < batchsize; ++ob) {
 | |
|             for (int oy = 0; oy < outputHeight; ++oy) {
 | |
|                 int iy = oy * stridesy - paddingy;
 | |
|                 const int kernely_ = std::min(iy + kernely, inputHeight) - std::max(iy, 0);
 | |
|                 iy = std::max(iy, 0);
 | |
|                 int ox = 0;
 | |
|                 for (ox = 0; ox < thred0; ++ox) { // ix < 0;
 | |
|                     int ix = ox * stridesx - paddingx;
 | |
|                     const int kernelx_ = std::min(ix + kernelx, inputWidth) - std::max(ix, 0);
 | |
|                     ix = std::max(ix, 0);
 | |
| 
 | |
|                     int mul = static_cast<int>((1 << 24)/(kernelx_ * kernely_));
 | |
| 
 | |
|                     const int indexOutput = pack* (ox + outputWidth * (oy + outputHeight * (ob + batchsize * oc)));
 | |
|                     const int indexInput = pack * (ix + inputWidth * (iy + inputHeight * (ob + batchsize * oc)));
 | |
|                     int8_t* dstCur = dstPtr + indexOutput;
 | |
|                     int8_t* srcCur = srcPtr + indexInput;
 | |
| 
 | |
|                     poolfunc(dstCur, srcCur, 1, inputWidth, kernelx_, kernely_, stridesx, paddingx, mul);
 | |
| 
 | |
|                 } // ix < 0;
 | |
| 
 | |
|                 // ix > 0 && ix + kernelx < inputWidth;
 | |
|                 if (thred1 - thred0 > 0) {
 | |
|                     int ix = ox * stridesx - paddingx;
 | |
|                     const int kernelx_ = std::min(ix + kernelx, inputWidth) - std::max(ix, 0);
 | |
|                     ix = std::max(ix, 0);
 | |
|                     int mul = static_cast<int>((1 << 24)/(kernelx_ * kernely_));
 | |
| 
 | |
|                     const int indexOutput = pack * (ox + outputWidth * (oy + outputHeight * (ob + batchsize * oc)));    
 | |
|                     const int indexInput = pack * (ix + inputWidth * (iy + inputHeight * (ob + batchsize * oc)));
 | |
| 
 | |
|                     int8_t* dstCur = dstPtr + indexOutput;
 | |
|                     int8_t* srcCur = srcPtr + indexInput;
 | |
| 
 | |
|                     poolfunc(dstCur, srcCur, thred1 - thred0, inputWidth, kernelx_, kernely_, stridesx, 0, mul);
 | |
|                 }
 | |
| 
 | |
|                 for (ox = thred1; ox < outputWidth; ++ox) { // ix + kernelx > inputWidth;
 | |
|                     int ix = ox * stridesx - paddingx;
 | |
|                     const int kernelx_ = std::min(ix + kernelx, inputWidth) - std::max(ix, 0);
 | |
|                     ix = std::max(ix, 0);
 | |
| 
 | |
|                     int mul = static_cast<int>((1 << 24)/(kernelx_ * kernely_));
 | |
| 
 | |
|                     const int indexOutput = pack* (ox + outputWidth * (oy + outputHeight * (ob + batchsize * oc)));
 | |
|                     const int indexInput = pack * (ix + inputWidth * (iy + inputHeight * (ob + batchsize * oc)));
 | |
|                     int8_t* dstCur = dstPtr + indexOutput;
 | |
|                     int8_t* srcCur = srcPtr + indexInput;
 | |
| 
 | |
|                     poolfunc(dstCur, srcCur, 1, inputWidth, kernelx_, kernely_, stridesx, paddingx, mul);
 | |
| 
 | |
|                 } 
 | |
|             }
 | |
|         }
 | |
|     }
 | |
| }
 | |
| 
 | |
| static void poolingMaxNC16HW16Int8(void poolfunc(int8_t*, int8_t*, size_t, size_t, size_t, size_t, size_t), const Tensor *src, Tensor *dst, int stridesx, int stridesy, int kernelx, int kernely, int paddingx, int paddingy)
 | |
| {
 | |
|     const int inputHeight = src->height();
 | |
|     const int inputWidth = src->width();
 | |
|     const int outputHeight = dst->height();
 | |
|     const int outputWidth = dst->width();
 | |
|     const int channel = dst->channel();
 | |
|     const int batchsize = src->batch();
 | |
|     int pack = 16;
 | |
|     int thred0 = UP_DIV(paddingx, stridesx);
 | |
|     int thred1 = inputWidth + paddingx - kernelx;
 | |
|     thred1 = UP_DIV(thred1, stridesx);     // ix + kernelx >= inputWidth;
 | |
| 
 | |
|     const auto srcPtr = src->host<int8_t>();
 | |
|     auto dstPtr       = dst->host<int8_t>();
 | |
| 
 | |
|     const int channel16 = UP_DIV(channel, pack);
 | |
|     for (int oc = 0; oc < channel16; ++oc){
 | |
|         for(int ob = 0; ob < batchsize; ++ob){
 | |
|             for (int oy = 0; oy < outputHeight; ++oy) {
 | |
|                 
 | |
|                 int iy = oy * stridesy - paddingy;
 | |
|                 const int kernely_ = std::min(iy + kernely, inputHeight) - std::max(iy, 0);
 | |
|                 iy = std::max(iy, 0);
 | |
|                 int ox = 0;
 | |
|                 for (ox = 0; ox < thred0; ++ox) { // ix < 0;
 | |
|                     int ix = ox * stridesx - paddingx;
 | |
|                     const int kernelx_ = std::min(ix + kernelx, inputWidth) - std::max(ix, 0);
 | |
|                     ix = std::max(ix, 0);
 | |
| 
 | |
|                     const int indexOutput = pack* (ox + outputWidth * (oy + outputHeight * (ob + batchsize * oc)));
 | |
|                     const int indexInput = pack * (ix + inputWidth * (iy + inputHeight * (ob + batchsize * oc)));
 | |
|                     int8_t* dstCur = dstPtr + indexOutput;
 | |
|                     int8_t* srcCur = srcPtr + indexInput;
 | |
| 
 | |
|                     poolfunc(dstCur, srcCur, 1, inputWidth, kernelx_, kernely_, stridesx);
 | |
| 
 | |
|                 } // ix < 0;
 | |
| 
 | |
|                 // ix > 0 && ix + kernelx < inputWidth;
 | |
|                 if (thred1 - thred0 > 0) {
 | |
|                     int ix = ox * stridesx - paddingx;
 | |
|                     const int kernelx_ = std::min(ix + kernelx, inputWidth) - std::max(ix, 0);
 | |
|                     ix = std::max(ix, 0);
 | |
| 
 | |
|                     const int indexOutput = pack * (ox + outputWidth * (oy + outputHeight * (ob + batchsize * oc)));    
 | |
|                     const int indexInput = pack * (ix + inputWidth * (iy + inputHeight * (ob + batchsize * oc)));
 | |
| 
 | |
|                     int8_t* dstCur = dstPtr + indexOutput;
 | |
|                     int8_t* srcCur = srcPtr + indexInput;
 | |
| 
 | |
|                     poolfunc(dstCur, srcCur, thred1 - thred0, inputWidth, kernelx_, kernely_, stridesx);
 | |
|                 }
 | |
| 
 | |
|                 for (ox = thred1; ox < outputWidth; ++ox) { // ix + kernelx > inputWidth;
 | |
|                     int ix = ox * stridesx - paddingx;
 | |
|                     const int kernelx_ = std::min(ix + kernelx, inputWidth) - std::max(ix, 0);
 | |
|                     ix = std::max(ix, 0);
 | |
| 
 | |
|                     const int indexOutput = pack* (ox + outputWidth * (oy + outputHeight * (ob + batchsize * oc)));
 | |
|                     const int indexInput = pack * (ix + inputWidth * (iy + inputHeight * (ob + batchsize * oc)));
 | |
|                     int8_t* dstCur = dstPtr + indexOutput;
 | |
|                     int8_t* srcCur = srcPtr + indexInput;
 | |
| 
 | |
|                     poolfunc(dstCur, srcCur, 1, inputWidth, kernelx_, kernely_, stridesx);
 | |
| 
 | |
|                 }
 | |
|             }
 | |
|         }
 | |
|     }
 | |
| }
 | |
| 
 | |
| CPUPoolInt8::CPUPoolInt8(Backend *backend, const Pool *parameter) : Execution(backend), mParameter(parameter) {
 | |
| }
 | |
| 
 | |
| ErrorCode CPUPoolInt8::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
 | |
|     const auto input = inputs[0];
 | |
|     auto output      = outputs[0];
 | |
| 
 | |
|     auto core = static_cast<CPUBackend*>(backend())->int8Functions();
 | |
| 
 | |
|     int strideWidth  = mParameter->strideX();
 | |
|     int strideHeight = mParameter->strideY();
 | |
|     int padWidth     = mParameter->padX();
 | |
|     int padHeight    = mParameter->padY();
 | |
|     int kernelWidth  = mParameter->kernelX();
 | |
|     int kernelHeight = mParameter->kernelY();
 | |
| 
 | |
|     const int inputWidth   = input->width();
 | |
|     const int inputHeight  = input->height();
 | |
|     const int outputWidth  = output->width();
 | |
|     const int outputHeight = output->height();
 | |
| 
 | |
|     kernelWidth  = std::min(kernelWidth, inputWidth);
 | |
|     kernelHeight = std::min(kernelHeight, inputHeight);
 | |
|     if (mParameter->isGlobal()) {
 | |
|         kernelWidth  = inputWidth;
 | |
|         kernelHeight = inputHeight;
 | |
|         strideWidth  = inputWidth;
 | |
|         strideHeight = inputHeight;
 | |
|         padWidth     = 0;
 | |
|         padHeight    = 0;
 | |
|     }
 | |
|     if (mParameter->padType() == PoolPadType_SAME) {
 | |
|         int padNeededWidth  = (outputWidth - 1) * strideWidth + kernelWidth - inputWidth;
 | |
|         int padNeededHeight = (outputHeight - 1) * strideHeight + kernelHeight - inputHeight;
 | |
|         padWidth            = padNeededWidth > 0 ? padNeededWidth / 2 : 0;
 | |
|         padHeight           = padNeededHeight > 0 ? padNeededHeight / 2 : 0;
 | |
|     }
 | |
| 
 | |
|     const int channel = input->channel();
 | |
|     
 | |
|     mThreadFunction = [=](const Tensor *src, Tensor *dst) {
 | |
|         poolingMaxNC16HW16Int8(core->MNNMaxPoolInt8, src, dst, strideWidth, strideHeight, kernelWidth, kernelHeight, padWidth, padHeight);
 | |
|     };
 | |
|     if (mParameter->type() == MNN::PoolType_AVEPOOL) {
 | |
|         mThreadFunction = [=](const Tensor *src, Tensor *dst) {
 | |
|             poolingAvgNC16HW16Int8(core->MNNAvgPoolInt8, src, dst, strideWidth, strideHeight, kernelWidth, kernelHeight, padWidth, padHeight);
 | |
|         };
 | |
|     }
 | |
| 
 | |
|     mInputTemp.reset(Tensor::createDevice<int8_t>({input->batch(), inputHeight, inputWidth, UP_DIV(channel, 16) * 16}));
 | |
|     mOutputTemp.reset(Tensor::createDevice<int8_t>({output->batch(), outputHeight, outputWidth, UP_DIV(channel, 16) * 16}));
 | |
| 
 | |
|     bool allocSucc = backend()->onAcquireBuffer(mInputTemp.get(), Backend::DYNAMIC);
 | |
|     allocSucc      = allocSucc && backend()->onAcquireBuffer(mOutputTemp.get(), Backend::DYNAMIC);
 | |
|     if (!allocSucc) {
 | |
|         return OUT_OF_MEMORY;
 | |
|     }
 | |
| 
 | |
|     backend()->onReleaseBuffer(mInputTemp.get(), Backend::DYNAMIC);
 | |
|     backend()->onReleaseBuffer(mOutputTemp.get(), Backend::DYNAMIC);
 | |
|     return NO_ERROR;
 | |
| }
 | |
| 
 | |
| ErrorCode CPUPoolInt8::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
 | |
|     auto input  = inputs[0];
 | |
|     auto output = outputs[0];
 | |
|     auto channel_input = input->channel();
 | |
|     auto plane_in = input->width() * input->height() * input->batch();
 | |
|     auto plane_out = output->width() * output->height() * output->batch();
 | |
|     auto core = static_cast<CPUBackend*>(backend())->functions();
 | |
|     auto depth = UP_DIV(channel_input, core->pack);
 | |
|     
 | |
|     if (core->pack == 8) {
 | |
|         MNNPackC2Origin(mInputTemp.get()->host<double>(), input->host<double>(), plane_in, depth, plane_in);
 | |
|         mThreadFunction(mInputTemp.get(), mOutputTemp.get());
 | |
|         MNNUnpackC2Origin(output->host<double>(), mOutputTemp.get()->host<double>(), plane_out, depth, plane_out);
 | |
|     }
 | |
|     else if (core->pack == 4) {
 | |
|         MNNPackC4Origin(mInputTemp.get()->host<float>(), input->host<float>(), plane_in, depth, plane_in);
 | |
|         mThreadFunction(mInputTemp.get(), mOutputTemp.get());
 | |
|         MNNUnpackC4Origin(output->host<float>(), mOutputTemp.get()->host<float>(), plane_out, depth, plane_out);
 | |
|     }
 | |
|     else if (core->pack == 16) {
 | |
|         mThreadFunction(input, output);
 | |
|     }
 | |
|     return NO_ERROR;
 | |
| }
 | |
| 
 | |
| class CPUPoolInt8Creator : public CPUBackend::Creator {
 | |
| public:
 | |
|     virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
 | |
|                                 const MNN::Op *op, Backend *backend) const override {
 | |
|         return new CPUPoolInt8(backend, op->main_as_Pool());
 | |
|     }
 | |
| };
 | |
| 
 | |
| REGISTER_CPU_OP_CREATOR(CPUPoolInt8Creator, OpType_PoolInt8);
 | |
| 
 | |
| } // namespace MNN
 |