//
//  ConvolutionDepthwise3x3.cpp
//  MNN
//
//  Created by MNN on 2019/4/3.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "backend/cpu/compute/ConvolutionDepthwise3x3.hpp"
#include "backend/cpu/CPUBackend.hpp"
#include "CommonOptFunction.h"
#include "core/Concurrency.h"
#include "core/Macro.h"

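// Depthwise 3x3 convolution (stride 1, dilation 1) implemented with the
// one-dimensional Winograd F(2,3) transform: each pair of horizontal outputs
// is produced from 4 transformed input taps instead of 6 direct multiplies
// per kernel row.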
namespace MNN {
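// Shallow constructor: adopts an already-transformed weight/bias resource.
// Used by onClone() so cloned executions share the same STATIC buffers.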
ConvolutionDepthwise3x3::ConvolutionDepthwise3x3(std::shared_ptr<CPUConvolution::Resource> resource, const Convolution2DCommon* common, Backend* b) : CPUConvolution(common, b) {
    mResource = resource;
}

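// Deep constructor: verifies the layer really is a 3x3, stride-1, dilation-1
// convolution, then transforms the raw weights into the F(2,3) layout that
// onExecute() consumes.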
ConvolutionDepthwise3x3::ConvolutionDepthwise3x3(const Convolution2DCommon *common, Backend *b,
                                                 const float *originWeight, size_t originWeightSize, const float *bias,
                                                 size_t biasSize)
    : CPUConvolution(common, b) {
    MNN_ASSERT(3 == common->kernelX() && 3 == common->kernelY());
    MNN_ASSERT(1 == common->strideX() && 1 == common->strideY());
    MNN_ASSERT(1 == common->dilateX() && 1 == common->dilateY());
    mResource.reset(new Resource);
    mResource->backend = b;
    auto core = static_cast<CPUBackend*>(b)->functions();
    auto pack = core->pack;
    auto bytes = core->bytes;
    auto success = mResource->copyBiasAlign(bias, biasSize);
    if (!success) {
        mValid = false;
        return;
    }
    auto channel   = common->outputCount();
    auto channelC4 = UP_DIV(channel, pack);
    // Each channel stores 3 kernel rows * 4 transformed taps.
    auto unitSize = channelC4 * pack * 3 * 4;
    mResource->mWeight.reset(Tensor::createDevice<uint8_t>({unitSize * bytes}));
    mValid = backend()->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC);
    if (!mValid) {
        return;
    }
    AutoStorage<float> tempWeightStorage;
    auto weightHost = mResource->mWeight->host<float>();
    if (bytes < 4) {
        // Low-precision backends need extra float storage for the transform.
        tempWeightStorage.reset(unitSize);
        if (nullptr == tempWeightStorage.get()) {
            mValid = false;
            return;
        }
        weightHost = tempWeightStorage.get();
    }
    ::memset(weightHost, 0, unitSize * sizeof(float));
    /* 1D-Winograd F(2,3) and tiling */
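    // Weight transform: each 3-tap kernel row k = (k0, k1, k2) is mapped to
    // m = G * k with the standard F(2,3) matrix
    //   G = | 1    0    0   |
    //       | 0.5  0.5  0.5 |
    //       | 0.5 -0.5  0.5 |
    //       | 0    0    1   |
    // so every kernel row yields the 4 values stored below.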
    for (int c = 0; c < channel; ++c) {
        auto cIndex     = c / pack;
        auto cRemain    = c % pack;
        auto weightDstZ = weightHost + cIndex * pack * 4 * 3 + cRemain;
        auto weightSrcZ = originWeight + c * 9;
        for (int y = 0; y < 3; ++y) {
            auto k0 = weightSrcZ[3 * y + 0];
            auto k1 = weightSrcZ[3 * y + 1];
            auto k2 = weightSrcZ[3 * y + 2];

            auto m0 = k0;
            auto m1 = 0.5f * (k0 + k1 + k2);
            auto m2 = 0.5f * (k0 - k1 + k2);
            auto m3 = k2;

            weightDstZ[(y * 4 + 0) * pack] = m0;
            weightDstZ[(y * 4 + 1) * pack] = m1;
            weightDstZ[(y * 4 + 2) * pack] = m2;
            weightDstZ[(y * 4 + 3) * pack] = m3;
        }
    }
    if (bytes < 4) {
        core->MNNFp32ToLowp(weightHost, mResource->mWeight->host<int16_t>(), unitSize);
    }
}

ConvolutionDepthwise3x3::~ConvolutionDepthwise3x3() {
    // Do nothing
}

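// onClone passes mResource to the shallow constructor above, so clones reuse
// the transformed weights instead of copying them.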
bool ConvolutionDepthwise3x3::onClone(Backend* bn, const Op* op, Execution** dst) {
    if (nullptr == dst) {
        return true;
    }
    auto dstExe = new ConvolutionDepthwise3x3(mResource, op->main_as_Convolution2D()->common(), bn);
    *dst = dstExe;
    return true;
}

ErrorCode ConvolutionDepthwise3x3::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    CPUConvolution::onResize(inputs, outputs);
    int numberThread = ((CPUBackend *)backend())->threadNumber();
    auto output      = outputs[0];
    auto owUnit      = UP_DIV(output->width(), 2);
    auto core        = static_cast<CPUBackend*>(backend())->functions();
    // 3 cache lines per thread, each holding 4 transformed taps for owUnit output units.
    mCacheLine.reset(Tensor::createDevice<uint8_t>({numberThread, 3 * 4 * owUnit * core->pack * core->bytes}));
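    // The scratch tensor is acquired as DYNAMIC and released again right away:
    // in MNN's memory plan this keeps the address valid for onExecute() while
    // letting the allocator reuse the memory for operators that run later.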
    auto valid = backend()->onAcquireBuffer(mCacheLine.get(), Backend::DYNAMIC);
    if (!valid) {
        return OUT_OF_MEMORY;
    }
    backend()->onReleaseBuffer(mCacheLine.get(), Backend::DYNAMIC);
    auto iw       = inputs[0]->width();
    // Range of output units whose 4-tap source window lies fully inside the
    // input, letting the source transform skip border handling.
    mSourceStartX = UP_DIV(mPadX, 2);
    mSourceEndX   = std::max((iw + mPadX - 4) / 2, mSourceStartX);
    mPostParameters = getPostParameters();
    // auto rate = (float)(mSourceEndX-mSourceStartX) / (float)owUnit;
    // FUNC_PRINT_ALL(rate, f);
    return NO_ERROR;
}

ErrorCode ConvolutionDepthwise3x3::onExecute(const std::vector<Tensor *> &inputs,
                                             const std::vector<Tensor *> &outputs) {
    auto input    = inputs[0];
    auto output   = outputs[0];
    auto core     = static_cast<CPUBackend*>(backend())->functions();

    int channelC4 = UP_DIV(input->channel(), core->pack);
    int initSize  = std::min(input->height(), 2);
    int batch     = input->batch();
    int ow        = output->width();
    int oh        = output->height();
    int owUnit    = UP_DIV(ow, 2);

    auto iw           = input->width();
    auto ih           = input->height();
    auto kernelOrigin = mResource->mWeight->host<uint8_t>();

    /* oy - mPadY >= 0 */
    int middleYStart = mPadY;

    /* oy - mPadY + 3 - 1 < ih */
    int middleYEnd = std::max(ih - 2 + mPadY, middleYStart);

    int threadNumber = ((CPUBackend *)backend())->threadNumber();
    auto maxKernelH  = std::min(mPadY + ih, 3);
    auto total = channelC4 * batch;
    auto inputOrigin  = input->host<uint8_t>();
    auto outputOrigin = output->host<uint8_t>();
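    // Threads split the work by whole (batch, channel-pack) slices. Each slice
    // is produced in three phases: top rows whose kernel window overlaps the
    // upper padding, middle rows handled by the fast F(2,3) kernel, and bottom
    // rows that run out of input. Three transformed cache lines are rotated so
    // each new output row costs only one fresh source transform.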
    MNN_CONCURRENCY_BEGIN(tId, threadNumber) {
        auto cacheLineStart = mCacheLine->host<uint8_t>() + tId * mCacheLine->stride(0);
        for (int index = (int)tId; index < total; index += threadNumber) {
            int z = index / batch;
            auto biasPtr = (const float*)(mResource->mBias->host<uint8_t>() + core->bytes * core->pack * z);
            auto inputZ     = inputOrigin + core->pack * index * iw * ih * core->bytes;
            auto outputZ    = outputOrigin + core->pack * index * ow * oh * core->bytes;
            auto kernelZ    = kernelOrigin + z * core->pack * core->bytes * 4 * 3;
            auto cacheLine0 = cacheLineStart + 4 * core->pack * core->bytes * owUnit * 0;
            auto cacheLine1 = cacheLineStart + 4 * core->pack * core->bytes * owUnit * 1;
            auto cacheLine2 = cacheLineStart + 4 * core->pack * core->bytes * owUnit * 2;

            float *cacheLine[3] = {(float*)cacheLine0, (float*)cacheLine1, (float*)cacheLine2};

            // Init
            for (int i = 0; i < initSize; ++i) {
                core->MNNSourceTransformCommonF23((const float*)(inputZ + i * iw * core->bytes * core->pack), cacheLine[i], owUnit, iw, mPadX, mSourceStartX,
                                       mSourceEndX);
            }
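            // initSize = min(ih, 2): the first (up to) two input rows are
            // pre-transformed here; the middle loop streams in one new row
            // per output row.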

            // Compute Top
            for (int y = 0; y < middleYStart; ++y) {
                auto outputY      = outputZ + y * core->bytes * core->pack * ow;
                int cacheLineSize = y - mPadY + maxKernelH;
                if (cacheLineSize <= 0) {
                    // The kernel window misses the input entirely: the output
                    // row is bias plus post ops applied to zeros.
                    ::memset(outputY, 0, core->bytes * ow * core->pack);
                    core->MNNAxByClampBroadcastUnit((float*)outputY, (float*)outputY, biasPtr, ow, 0, 0, 1, mPostParameters.data());
                    continue;
                }
                auto kernelPtr = kernelZ + (maxKernelH - cacheLineSize) * 4 * core->pack * core->bytes;
                cacheLineSize = std::min(cacheLineSize, ih);
                core->MNNMultiAndDestTransformCommon23(cacheLine, (float*)kernelPtr, (float*)outputY, cacheLineSize, ow, biasPtr, mPostParameters.data());
            }

            // Compute Mid
            for (int y = middleYStart; y < middleYEnd; ++y) {
                auto outputY = outputZ + y * core->bytes * core->pack * ow;
                auto iy      = y - mPadY + 2;
                core->MNNSourceTransformCommonF23((float*)(inputZ + core->bytes * core->pack * iy * iw), cacheLine[2], owUnit, iw, mPadX, mSourceStartX,
                                       mSourceEndX);
                // FUNC_PRINT(ow);
                core->MNNConvDwF23MulTransUnit(cacheLine, (float*)kernelZ, (float*)outputY, ow, biasPtr, mPostParameters.data());

                // Rotate the cache lines: the oldest row's buffer becomes the
                // slot for the next freshly transformed row.
                auto temp    = cacheLine[0];
                cacheLine[0] = cacheLine[1];
                cacheLine[1] = cacheLine[2];
                cacheLine[2] = temp;
            }

            // Compute Bottom
            for (int y = middleYEnd; y < oh; ++y) {
                auto outputY      = outputZ + y * core->bytes * core->pack * ow;
                int cacheLineSize = (ih - y + mPadY);
                if (cacheLineSize <= 0) {
                    ::memset(outputY, 0, ow * core->bytes * core->pack);
                    core->MNNAxByClampBroadcastUnit((float*)outputY, (float*)outputY, biasPtr, ow, 0, 0, 1, mPostParameters.data());
                    continue;
                }
                core->MNNMultiAndDestTransformCommon23(cacheLine, (float*)kernelZ, (float*)outputY, cacheLineSize, ow, biasPtr, mPostParameters.data());
                // No new input rows remain: shift the window down and let
                // cacheLineSize shrink.
                cacheLine[0] = cacheLine[1];
                cacheLine[1] = cacheLine[2];
            }
        }
    } MNN_CONCURRENCY_END();
    return NO_ERROR;
}
} // namespace MNN