MNN/source/backend/opencl/execution/image/RoiPoolingExecution.cpp

//
// RoiPoolingExecution.cpp
// MNN
//
// Created by MNN on 2019/02/28.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "backend/opencl/execution/image/RoiPoolingExecution.hpp"
#include "core/Macro.h"
#include <float.h>
#include "core/TensorUtils.hpp"
namespace MNN {
namespace OpenCL {
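
// The constructor reads the pooled output size and spatial scale from the op's
// RoiParameters and compiles the "roi_pooling" kernel once, picking a build
// option from the ROI tensor's layout (see below).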
RoiPooling::RoiPooling(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend) : Execution(backend) {
#ifdef LOG_VERBOSE
    MNN_PRINT("start RoiPooling init !\n");
#endif
    mOpenCLBackend = static_cast<OpenCLBackend *>(backend);
    auto roi       = op->main_as_RoiParameters();

    mPooledWidth  = roi->pooledWidth();
    mPooledHeight = roi->pooledHeight();
    mSpatialScale = roi->spatialScale();
    mAreadySetArg = false;

    std::set<std::string> buildOptions;
    std::string kernelName = "roi_pooling";
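
    // Each ROI is expected to carry five values (batch index plus the box
    // corners x1, y1, x2, y2). Depending on how the ROI tensor was produced,
    // those five values sit either on the width axis (1x5 per ROI) or on the
    // channel axis (5x1 per ROI); the build option tells the kernel which
    // layout to read.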
    std::vector<int> roiShape = tensorShapeFormat(inputs[1]);
    const int roiHeight       = roiShape.at(1);
    const int roiWidth        = roiShape.at(2);
    const int roiChannels     = roiShape.at(3);
    if (roiWidth == 5) {
        buildOptions.emplace("-DROI_C1H1W5");
    } else if (roiChannels == 5) {
        buildOptions.emplace("-DROI_C5H1W1");
    }
    mKernel           = mOpenCLBackend->getOpenCLRuntime()->buildKernel("roi_pooling", kernelName, buildOptions);
    mMaxWorkGroupSize = static_cast<uint32_t>(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(mKernel));
#ifdef LOG_VERBOSE
    MNN_PRINT("end RoiPooling init !\n");
#endif
}
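
// onResize recomputes shapes and (re)binds every kernel argument; when the
// backend's record queue is enabled, the kernel launch recorded here is
// simply replayed in onExecute.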
ErrorCode RoiPooling::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    Tensor *input  = inputs[0];
    Tensor *output = outputs[0];
    Tensor *roi    = inputs[1];

    auto runtime = mOpenCLBackend->getOpenCLRuntime();
    startRecord(runtime, mRecording);

    std::vector<int> inputShape  = tensorShapeFormat(input);
    std::vector<int> outputShape = tensorShapeFormat(output);
    std::vector<int> roiShape    = tensorShapeFormat(roi);
    const int batch        = outputShape.at(0);
    const int outputHeight = outputShape.at(1);
    const int outputWidth  = outputShape.at(2);
    const int channels     = outputShape.at(3);

    const int inputBatch    = inputShape.at(0);
    const int inputHeight   = inputShape.at(1);
    const int inputWidth    = inputShape.at(2);
    const int inputChannels = inputShape.at(3);

    // The image backend packs four channels per pixel, so the kernel walks
    // channel blocks of 4 rather than raw channels.
    int channelBlocks = (channels + 3) / 4;

    mGWS = {static_cast<uint32_t>(channelBlocks),
            static_cast<uint32_t>(outputWidth),
            static_cast<uint32_t>(batch * outputHeight)};
    uint32_t idx = 0;
    mKernel.setArg(idx++, mGWS[0]);
    mKernel.setArg(idx++, mGWS[1]);
    mKernel.setArg(idx++, mGWS[2]);
    mKernel.setArg(idx++, openCLImage(input));
    mKernel.setArg(idx++, openCLImage(roi));
    mKernel.setArg(idx++, static_cast<int32_t>(inputHeight));
    mKernel.setArg(idx++, static_cast<int32_t>(inputWidth));
    mKernel.setArg(idx++, static_cast<int32_t>(inputBatch));
    mKernel.setArg(idx++, static_cast<int32_t>(outputHeight));
    mKernel.setArg(idx++, static_cast<int32_t>(outputWidth));
    mKernel.setArg(idx++, static_cast<float>(mSpatialScale));
    mKernel.setArg(idx++, openCLImage(output));

    mLWS = roiPoolingLocalWS(mGWS, mMaxWorkGroupSize);
    recordKernel3d(mKernel, mGWS, mLWS, runtime);
    endRecord(runtime, mRecording);
    return NO_ERROR;
}
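
// Heuristic for picking a local work-group size: for each global dimension,
// first try to split the work evenly across the device's compute units; if
// that does not divide evenly, fall back to the largest divisor of the global
// size, then clamp each dimension so the product of all three stays within
// the device's work-group limit.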
std::vector<uint32_t> RoiPooling::roiPoolingLocalWS(const std::vector<uint32_t> &gws, const uint32_t maxWorkGroupSize) {
    std::vector<uint32_t> lws(4, 0);
    const uint32_t deviceComputeUnits = mOpenCLBackend->getOpenCLRuntime()->deviceComputeUnits();

    int coreNum = deviceComputeUnits;
    for (int i = 0, totalSizeNow = 1; i < static_cast<int>(gws.size()); ++i) {
        int remain = gws[i] % coreNum, groupSize = gws[i] / coreNum;
        if (remain == 0) {
            lws[i] = groupSize;
        } else {
            while (groupSize > 0) {
                // Largest divisor of gws[i] no bigger than the even split.
                if (gws[i] % groupSize == 0 && (i > 0 || groupSize <= maxWorkGroupSize)) {
                    lws[i] = groupSize;
                    break;
                }
                --groupSize;
            }
        }
        lws[i] = std::max<uint32_t>(std::min<uint32_t>(lws[i], maxWorkGroupSize / totalSizeNow), 1);
        totalSizeNow *= lws[i];
    }
    return lws;
}
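
// onExecute either profiles the kernel through an event, replays the launch
// recorded in onResize when the record queue is in use, or dispatches the
// kernel directly.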
ErrorCode RoiPooling::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
#ifdef LOG_VERBOSE
    MNN_PRINT("start RoiPooling onExecute !\n");
#endif

#ifdef ENABLE_OPENCL_TIME_PROFILER
    cl::Event event;
    run3DKernelDefault(mKernel, mGWS, mLWS,
                       mOpenCLBackend->getOpenCLRuntime(), &event);

    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
    MNN_PRINT("kernel cost:%d us RoiPooling\n", costTime);
#else
    if (mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()) {
        mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording);
#ifdef LOG_VERBOSE
        MNN_PRINT("End RoiPooling onExecute... \n");
#endif
        return NO_ERROR;
    }
    run3DKernelDefault(mKernel, mGWS, mLWS, mOpenCLBackend->getOpenCLRuntime());
#endif

#ifdef LOG_VERBOSE
    MNN_PRINT("end RoiPooling onExecute !\n");
#endif
    return NO_ERROR;
}
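
// Register this Execution as the image-memory implementation for ROIPooling ops.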
OpenCLCreatorRegister<TypedCreator<RoiPooling>> __roi_pooling_op(OpType_ROIPooling, IMAGE);
} // namespace OpenCL
} // namespace MNN