MNN/source/backend/opencl/execution/buffer/SoftmaxBufExecution.cpp

//
//  SoftmaxBufExecution.cpp
//  MNN
//
//  Created by MNN on 2019/02/28.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#ifndef MNN_OPENCL_BUFFER_CLOSED

#include "backend/opencl/execution/buffer/SoftmaxBufExecution.hpp"

namespace MNN {
namespace OpenCL {

SoftmaxBufExecution::SoftmaxBufExecution(const std::vector<Tensor *> &inputs, int axis, const MNN::Op* Op, Backend *backend)
    : CommonExecution(backend, Op) {
    mAxis          = axis;
    mOpenCLBackend = static_cast<OpenCLBackend *>(backend);
    auto kernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("softmax_buf", "softmax_buf", {"-DSOFTMAX_LOCAL_SIZE=512"}, mOpenCLBackend->getPrecision());
    mMaxWorkGroupSize = static_cast<uint32_t>(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(kernel));
}

int SoftmaxBufExecution::getLocalSize(int size, int maxGroupSize){
    int local_size = 1;
    while(local_size * 2 <= maxGroupSize && local_size * 2 <= size){
        local_size *= 2;
    }
    return local_size;
}

ErrorCode SoftmaxBufExecution::onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    mUnits.clear();
    Tensor *input  = inputs[0];
    Tensor *output = outputs[0];
    
    const auto dims = input->buffer().dimensions;
    auto runtime       = mOpenCLBackend->getOpenCLRuntime();

    auto MaxLocalSize = std::min(std::min(runtime->getMaxWorkItemSizes()[0], mMaxWorkGroupSize), static_cast<uint32_t>(256));

    const auto layout = TensorUtils::getDescribe(input)->dimensionFormat;
    mNeedUnpackC4     = layout == MNN_DATA_FORMAT_NC4HW4;
    if (mNeedUnpackC4) {
        int totalSize = 1;
        for (int i = 1; i < dims; ++i) {
            totalSize *= input->length(i);
        }
        mTempTensor.reset(Tensor::createDevice<float>({totalSize}));
        mOpenCLBackend->onAcquireBuffer(mTempTensor.get(), Backend::DYNAMIC);
        mOpenCLBackend->onReleaseBuffer(mTempTensor.get(), Backend::DYNAMIC);
    }
    
    int inside  = 1;
    int outside = 1;
    int channel = 1;
    for (int i = 0; i < mAxis; ++i) {
        outside *= input->length(i);
    }
    channel = input->length(mAxis);
    for (int i = mAxis + 1; i < dims; ++i) {
        inside *= input->length(i);
    }
    
    // NC4HW4 -> NCHW
    if(mNeedUnpackC4){
        Unit unit;
        std::vector<int> outputShape = tensorShapeFormat(input);
        int shape[4] = {outputShape[0], outputShape[3], outputShape[1], outputShape[2]};//N C H W
        std::set<std::string> buildOptions;
        buildOptions.emplace("-DINPUT_FORMAT=MNN_DATA_FORMAT_NC4HW4");
        buildOptions.emplace("-DOUTPUT_FORMAT=MNN_DATA_FORMAT_NCHW");
        unit.kernel = runtime->buildKernel("buffer_convert_buf", "buffer_convert_to_buffer", buildOptions, mOpenCLBackend->getPrecision(), input, output);
        mGlobalWorkSize = {static_cast<uint32_t>(shape[2] * shape[3]), static_cast<uint32_t>(shape[1]), static_cast<uint32_t>(shape[0])};
        cl_int ret = CL_SUCCESS;
        uint32_t idx = 0;
        ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]);
        ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[1]);
        ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[2]);
        ret |= unit.kernel->get().setArg(idx++, openCLBuffer(input));
        ret |= unit.kernel->get().setArg(idx++, sizeof(shape), shape);
        ret |= unit.kernel->get().setArg(idx++, openCLBuffer(output));
        MNN_CHECK_CL_SUCCESS(ret, "setArg buffer_convert_to_buffer");

        const uint32_t maxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(unit.kernel));
        mLocalWorkSize = {16, std::max((uint32_t)1, maxWorkGroupSize / 16), 1};
        
        mOpenCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalWorkSize);
        unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]};
        unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]};
        mUnits.emplace_back(unit);
    }
    
    // softmax
    {
        Unit unit;
        int localSize = getLocalSize(channel, MaxLocalSize);
        if(localSize < 4){
            localSize = 1;
        }
        std::set<std::string> buildOptions = mBuildOptions;
        buildOptions.emplace("-DARGMAX_LOCAL_SIZE=" + std::to_string(localSize));
        std::string kernelName;
        if(inside == 1){
            buildOptions.emplace("-DSOFTMAX_LOCAL_SIZE=" + std::to_string(localSize));
            unit.kernel = runtime->buildKernel("self_attention_buf", "softmax_inside", buildOptions, mOpenCLBackend->getPrecision(), inputs[0], outputs[0]);
            mGlobalWorkSize = {static_cast<uint32_t>(localSize), static_cast<uint32_t>(outside), static_cast<uint32_t>(1)};
        }
        else if(inside % 4 == 0){
            unit.kernel = runtime->buildKernel("softmax_buf", "softmax_v4_buf", buildOptions, mOpenCLBackend->getPrecision());
            mGlobalWorkSize = {static_cast<uint32_t>(localSize), static_cast<uint32_t>(UP_DIV(inside, 4)), static_cast<uint32_t>(outside)};
        }else {
            unit.kernel = runtime->buildKernel("softmax_buf", "softmax_buf", buildOptions, mOpenCLBackend->getPrecision());
            mGlobalWorkSize = {static_cast<uint32_t>(localSize), static_cast<uint32_t>(inside), static_cast<uint32_t>(outside)};
        }
        mMaxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(unit.kernel));
        mLocalWorkSize = {(uint32_t)(localSize), 1, 1};
        
        cl_int ret = CL_SUCCESS;
        
        uint32_t idx    = 0;
        ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]);
        ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[1]);
        ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[2]);
        if(mNeedUnpackC4){
            ret |= unit.kernel->get().setArg(idx++, openCLImage(output));
            ret |= unit.kernel->get().setArg(idx++, openCLImage(mTempTensor.get()));
        }else{
            ret |= unit.kernel->get().setArg(idx++, openCLImage(input));
            ret |= unit.kernel->get().setArg(idx++, openCLImage(output));
        }
        if(inside == 1){
            ret |= unit.kernel->get().setArg(idx++, channel);
            int shape[4] = {1, outside, channel, 1};
            ret |= unit.kernel->get().setArg(idx++, shape);
        } else {
            ret |= unit.kernel->get().setArg(idx++, inside);
            ret |= unit.kernel->get().setArg(idx++, outside);
            ret |= unit.kernel->get().setArg(idx++, channel);
        }
        MNN_CHECK_CL_SUCCESS(ret, "setArg SoftmaxBufExecution");
        if(localSize == 1){
            std::string programName = "softmax_buf";
            if(inside == 1){
                programName = "self_attention_buf";
            }
            mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), "softmax_buf", unit.kernel, mOpenCLBackend->getCLTuneLevel(), programName).first;
        }
        
        mOpenCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalWorkSize);
        unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]};
        unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]};
        mUnits.emplace_back(unit);
    }
    
    // NCHW -> NC4HW4
    if(mNeedUnpackC4){
        Unit unit;
        std::vector<int> outputShape = tensorShapeFormat(output);
        int shape[4] = {outputShape[0], outputShape[3], outputShape[1], outputShape[2]};//N C H W
        std::set<std::string> buildOptions;
        buildOptions.emplace("-DINPUT_FORMAT=MNN_DATA_FORMAT_NCHW");
        buildOptions.emplace("-DOUTPUT_FORMAT=MNN_DATA_FORMAT_NC4HW4");
        unit.kernel = runtime->buildKernel("buffer_convert_buf", "buffer_convert_to_buffer", buildOptions, mOpenCLBackend->getPrecision(), input, output);
        mGlobalWorkSize = {static_cast<uint32_t>(shape[2] * shape[3]), static_cast<uint32_t>(shape[1]), static_cast<uint32_t>(shape[0])};
        cl_int ret = CL_SUCCESS;
        uint32_t idx = 0;
        ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]);
        ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[1]);
        ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[2]);
        ret |= unit.kernel->get().setArg(idx++, openCLBuffer(mTempTensor.get()));
        ret |= unit.kernel->get().setArg(idx++, sizeof(shape), shape);
        ret |= unit.kernel->get().setArg(idx++, openCLBuffer(output));
        MNN_CHECK_CL_SUCCESS(ret, "setArg buffer_convert_to_buffer");

        const uint32_t maxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(unit.kernel));
        mLocalWorkSize = {16, std::max((uint32_t)1, maxWorkGroupSize / 16), 1};
        
        mOpenCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalWorkSize);
        unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]};
        unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]};
        mUnits.emplace_back(unit);
    }
    
    return NO_ERROR;
}

class SoftmaxBufCreator : public OpenCLBackend::Creator {
public:
    virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
                                const MNN::Op *op, Backend *backend) const override {
        for (int i = 0; i < inputs.size(); ++i) {
            TensorUtils::setTensorSupportPack(inputs[i], false);
        }
        for (int i = 0; i < outputs.size(); ++i) {
            TensorUtils::setTensorSupportPack(outputs[i], false);
        }
        auto dimType = inputs[0]->getDimensionType();
        if (dimType == Tensor::TENSORFLOW && inputs[0]->dimensions() == 4) {
            int index[4] = {0, 2, 3, 1};
            auto axis = op->main_as_Axis()->axis();
            if (axis < 0) {
                axis = inputs[0]->dimensions() + axis;
            }

            axis = index[axis];
            //1 : channel //2 : height
            if (1 == axis || 2 == axis || 3 == axis) {
                return new SoftmaxBufExecution(inputs, axis, op, backend);
            }
            return nullptr;
        } else {
            auto axis = op->main_as_Axis()->axis();
            if (axis < 0) {
                axis = inputs[0]->dimensions() + axis;
            }

            if (1 == axis || 2 == axis || 3 == axis) {
                return new SoftmaxBufExecution(inputs, axis, op, backend);
            }
            return nullptr;
        }
    }
};
REGISTER_OPENCL_OP_CREATOR(SoftmaxBufCreator, OpType_Softmax, BUFFER);

} // namespace OpenCL
} // namespace MNN
#endif/* MNN_OPENCL_BUFFER_CLOSED */
[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more auto-turning mode 2021-03-12 18:41:50 +08:00			`//`
			`// SoftmaxBufExecution.cpp`
			`// MNN`
			`//`
			`// Created by MNN on 2019/02/28.`
			`// Copyright © 2018, Alibaba Group Holding Limited`
			`//`

			`#ifndef MNN_OPENCL_BUFFER_CLOSED`

			`#include "backend/opencl/execution/buffer/SoftmaxBufExecution.hpp"`

			`namespace MNN {`
			`namespace OpenCL {`

[MNN:Sync] Sync Internal 2.8.4 2024-04-19 11:58:21 +08:00			`SoftmaxBufExecution::SoftmaxBufExecution(const std::vector<Tensor > &inputs, int axis, const MNN::Op Op, Backend *backend)`
			`: CommonExecution(backend, Op) {`
[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more auto-turning mode 2021-03-12 18:41:50 +08:00			`mAxis = axis;`
			`mOpenCLBackend = static_cast<OpenCLBackend *>(backend);`
MNN:Sync: Sync Internal 3.1.3 2025-04-28 11:38:44 +08:00			`auto kernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("softmax_buf", "softmax_buf", {"-DSOFTMAX_LOCAL_SIZE=512"}, mOpenCLBackend->getPrecision());`
[MNN:Sync] Sync Internal 2.8.1 2023-12-27 17:26:44 +08:00			`mMaxWorkGroupSize = static_cast<uint32_t>(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(kernel));`
[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more auto-turning mode 2021-03-12 18:41:50 +08:00			`}`

[MNN:Sync] Sync Internal 2.7.1 2023-09-20 20:16:25 +08:00			`int SoftmaxBufExecution::getLocalSize(int size, int maxGroupSize){`
			`int local_size = 1;`
			`while(local_size * 2 <= maxGroupSize && local_size * 2 <= size){`
			`local_size *= 2;`
			`}`
			`return local_size;`
			`}`

[MNN:Sync] Sync Internal 2.8.4 2024-04-19 11:58:21 +08:00			`ErrorCode SoftmaxBufExecution::onEncode(const std::vector<Tensor > &inputs, const std::vector<Tensor > &outputs) {`
MNN:Sync: Sync Internal 2.9.5 2024-09-12 12:57:57 +08:00			`mUnits.clear();`
[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more auto-turning mode 2021-03-12 18:41:50 +08:00			`Tensor *input = inputs[0];`
			`Tensor *output = outputs[0];`
[MNN:Sync] Sync Internal 2.7.0 2023-09-04 10:42:11 +08:00
			`const auto dims = input->buffer().dimensions;`
[MNN:Sync] Sync Internal 2.8.1 2023-12-27 17:26:44 +08:00			`auto runtime = mOpenCLBackend->getOpenCLRuntime();`
MNN:Sync Sync Internal 2.9.0 2024-05-11 19:17:02 +08:00
[MNN:Sync] Sync Internal 2.9.1 2024-06-03 20:09:34 +08:00			`auto MaxLocalSize = std::min(std::min(runtime->getMaxWorkItemSizes()[0], mMaxWorkGroupSize), static_cast<uint32_t>(256));`
MNN:Sync Sync Internal 2.9.0 2024-05-11 19:17:02 +08:00
MNN:Sync: Sync Internal 2.9.5 2024-09-12 12:57:57 +08:00			`const auto layout = TensorUtils::getDescribe(input)->dimensionFormat;`
			`mNeedUnpackC4 = layout == MNN_DATA_FORMAT_NC4HW4;`
			`if (mNeedUnpackC4) {`
			`int totalSize = 1;`
			`for (int i = 1; i < dims; ++i) {`
			`totalSize *= input->length(i);`
			`}`
			`mTempTensor.reset(Tensor::createDevice<float>({totalSize}));`
			`mOpenCLBackend->onAcquireBuffer(mTempTensor.get(), Backend::DYNAMIC);`
			`mOpenCLBackend->onReleaseBuffer(mTempTensor.get(), Backend::DYNAMIC);`
			`}`

[MNN:Sync] Sync Internal 2.7.0 2023-09-04 10:42:11 +08:00			`int inside = 1;`
			`int outside = 1;`
			`int channel = 1;`
			`for (int i = 0; i < mAxis; ++i) {`
			`outside *= input->length(i);`
			`}`
			`channel = input->length(mAxis);`
			`for (int i = mAxis + 1; i < dims; ++i) {`
			`inside *= input->length(i);`
			`}`

MNN:Sync: Sync Internal 2.9.5 2024-09-12 12:57:57 +08:00			`// NC4HW4 -> NCHW`
			`if(mNeedUnpackC4){`
			`Unit unit;`
			`std::vector<int> outputShape = tensorShapeFormat(input);`
			`int shape[4] = {outputShape[0], outputShape[3], outputShape[1], outputShape[2]};//N C H W`
			`std::set<std::string> buildOptions;`
			`buildOptions.emplace("-DINPUT_FORMAT=MNN_DATA_FORMAT_NC4HW4");`
			`buildOptions.emplace("-DOUTPUT_FORMAT=MNN_DATA_FORMAT_NCHW");`
MNN:Sync: Sync Internal 3.1.3 2025-04-28 11:38:44 +08:00			`unit.kernel = runtime->buildKernel("buffer_convert_buf", "buffer_convert_to_buffer", buildOptions, mOpenCLBackend->getPrecision(), input, output);`
MNN:Sync: Sync Internal 2.9.5 2024-09-12 12:57:57 +08:00			`mGlobalWorkSize = {static_cast<uint32_t>(shape[2] * shape[3]), static_cast<uint32_t>(shape[1]), static_cast<uint32_t>(shape[0])};`
			`cl_int ret = CL_SUCCESS;`
			`uint32_t idx = 0;`
			`ret \|= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]);`
			`ret \|= unit.kernel->get().setArg(idx++, mGlobalWorkSize[1]);`
			`ret \|= unit.kernel->get().setArg(idx++, mGlobalWorkSize[2]);`
			`ret \|= unit.kernel->get().setArg(idx++, openCLBuffer(input));`
			`ret \|= unit.kernel->get().setArg(idx++, sizeof(shape), shape);`
			`ret \|= unit.kernel->get().setArg(idx++, openCLBuffer(output));`
			`MNN_CHECK_CL_SUCCESS(ret, "setArg buffer_convert_to_buffer");`
[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more auto-turning mode 2021-03-12 18:41:50 +08:00
MNN:Sync: Sync Internal 2.9.5 2024-09-12 12:57:57 +08:00			`const uint32_t maxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(unit.kernel));`
			`mLocalWorkSize = {16, std::max((uint32_t)1, maxWorkGroupSize / 16), 1};`

			`mOpenCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalWorkSize);`
			`unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]};`
			`unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]};`
			`mUnits.emplace_back(unit);`
[MNN:Sync] Sync Internal 2.7.0 2023-09-04 10:42:11 +08:00			`}`
[MNN:Sync] Sync Internal 2.9.1 2024-06-03 20:09:34 +08:00
MNN:Sync: Sync Internal 2.9.5 2024-09-12 12:57:57 +08:00			`// softmax`
			`{`
			`Unit unit;`
			`int localSize = getLocalSize(channel, MaxLocalSize);`
			`if(localSize < 4){`
			`localSize = 1;`
			`}`
			`std::set<std::string> buildOptions = mBuildOptions;`
			`buildOptions.emplace("-DARGMAX_LOCAL_SIZE=" + std::to_string(localSize));`
			`std::string kernelName;`
			`if(inside == 1){`
			`buildOptions.emplace("-DSOFTMAX_LOCAL_SIZE=" + std::to_string(localSize));`
MNN:Sync: Sync Internal 3.1.3 2025-04-28 11:38:44 +08:00			`unit.kernel = runtime->buildKernel("self_attention_buf", "softmax_inside", buildOptions, mOpenCLBackend->getPrecision(), inputs[0], outputs[0]);`
MNN:Sync: Sync Internal 2.9.5 2024-09-12 12:57:57 +08:00			`mGlobalWorkSize = {static_cast<uint32_t>(localSize), static_cast<uint32_t>(outside), static_cast<uint32_t>(1)};`
			`}`
			`else if(inside % 4 == 0){`
MNN:Sync: Sync Internal 3.1.3 2025-04-28 11:38:44 +08:00			`unit.kernel = runtime->buildKernel("softmax_buf", "softmax_v4_buf", buildOptions, mOpenCLBackend->getPrecision());`
MNN:Sync: Sync Internal 2.9.5 2024-09-12 12:57:57 +08:00			`mGlobalWorkSize = {static_cast<uint32_t>(localSize), static_cast<uint32_t>(UP_DIV(inside, 4)), static_cast<uint32_t>(outside)};`
			`}else {`
MNN:Sync: Sync Internal 3.1.3 2025-04-28 11:38:44 +08:00			`unit.kernel = runtime->buildKernel("softmax_buf", "softmax_buf", buildOptions, mOpenCLBackend->getPrecision());`
MNN:Sync: Sync Internal 2.9.5 2024-09-12 12:57:57 +08:00			`mGlobalWorkSize = {static_cast<uint32_t>(localSize), static_cast<uint32_t>(inside), static_cast<uint32_t>(outside)};`
			`}`
			`mMaxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(unit.kernel));`
			`mLocalWorkSize = {(uint32_t)(localSize), 1, 1};`

			`cl_int ret = CL_SUCCESS;`

			`uint32_t idx = 0;`
			`ret \|= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]);`
			`ret \|= unit.kernel->get().setArg(idx++, mGlobalWorkSize[1]);`
			`ret \|= unit.kernel->get().setArg(idx++, mGlobalWorkSize[2]);`
			`if(mNeedUnpackC4){`
			`ret \|= unit.kernel->get().setArg(idx++, openCLImage(output));`
			`ret \|= unit.kernel->get().setArg(idx++, openCLImage(mTempTensor.get()));`
			`}else{`
			`ret \|= unit.kernel->get().setArg(idx++, openCLImage(input));`
			`ret \|= unit.kernel->get().setArg(idx++, openCLImage(output));`
			`}`
			`if(inside == 1){`
			`ret \|= unit.kernel->get().setArg(idx++, channel);`
			`int shape[4] = {1, outside, channel, 1};`
			`ret \|= unit.kernel->get().setArg(idx++, shape);`
			`} else {`
			`ret \|= unit.kernel->get().setArg(idx++, inside);`
			`ret \|= unit.kernel->get().setArg(idx++, outside);`
			`ret \|= unit.kernel->get().setArg(idx++, channel);`
			`}`
			`MNN_CHECK_CL_SUCCESS(ret, "setArg SoftmaxBufExecution");`
			`if(localSize == 1){`
MNN:Sync: Sync Internal 3.2.1 2025-06-17 11:08:21 +08:00			`std::string programName = "softmax_buf";`
			`if(inside == 1){`
			`programName = "self_attention_buf";`
			`}`
			`mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), "softmax_buf", unit.kernel, mOpenCLBackend->getCLTuneLevel(), programName).first;`
MNN:Sync: Sync Internal 2.9.5 2024-09-12 12:57:57 +08:00			`}`

			`mOpenCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalWorkSize);`
			`unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]};`
			`unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]};`
			`mUnits.emplace_back(unit);`
			`}`
[MNN:Sync] Sync Internal 2.7.1 2023-09-20 20:16:25 +08:00
MNN:Sync: Sync Internal 2.9.5 2024-09-12 12:57:57 +08:00			`// NCHW -> NC4HW4`
			`if(mNeedUnpackC4){`
			`Unit unit;`
			`std::vector<int> outputShape = tensorShapeFormat(output);`
			`int shape[4] = {outputShape[0], outputShape[3], outputShape[1], outputShape[2]};//N C H W`
			`std::set<std::string> buildOptions;`
			`buildOptions.emplace("-DINPUT_FORMAT=MNN_DATA_FORMAT_NCHW");`
			`buildOptions.emplace("-DOUTPUT_FORMAT=MNN_DATA_FORMAT_NC4HW4");`
MNN:Sync: Sync Internal 3.1.3 2025-04-28 11:38:44 +08:00			`unit.kernel = runtime->buildKernel("buffer_convert_buf", "buffer_convert_to_buffer", buildOptions, mOpenCLBackend->getPrecision(), input, output);`
MNN:Sync: Sync Internal 2.9.5 2024-09-12 12:57:57 +08:00			`mGlobalWorkSize = {static_cast<uint32_t>(shape[2] * shape[3]), static_cast<uint32_t>(shape[1]), static_cast<uint32_t>(shape[0])};`
			`cl_int ret = CL_SUCCESS;`
			`uint32_t idx = 0;`
			`ret \|= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]);`
			`ret \|= unit.kernel->get().setArg(idx++, mGlobalWorkSize[1]);`
			`ret \|= unit.kernel->get().setArg(idx++, mGlobalWorkSize[2]);`
			`ret \|= unit.kernel->get().setArg(idx++, openCLBuffer(mTempTensor.get()));`
			`ret \|= unit.kernel->get().setArg(idx++, sizeof(shape), shape);`
			`ret \|= unit.kernel->get().setArg(idx++, openCLBuffer(output));`
			`MNN_CHECK_CL_SUCCESS(ret, "setArg buffer_convert_to_buffer");`
[MNN:Sync] Sync Internal 2.8.4 2024-04-19 11:58:21 +08:00
MNN:Sync: Sync Internal 2.9.5 2024-09-12 12:57:57 +08:00			`const uint32_t maxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(unit.kernel));`
			`mLocalWorkSize = {16, std::max((uint32_t)1, maxWorkGroupSize / 16), 1};`

			`mOpenCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalWorkSize);`
			`unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]};`
			`unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]};`
			`mUnits.emplace_back(unit);`
[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more auto-turning mode 2021-03-12 18:41:50 +08:00			`}`
[MNN:Sync] Sync Internal 2.8.1 2023-12-27 17:26:44 +08:00
[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more auto-turning mode 2021-03-12 18:41:50 +08:00			`return NO_ERROR;`
			`}`

			`class SoftmaxBufCreator : public OpenCLBackend::Creator {`
			`public:`
			`virtual Execution onCreate(const std::vector<Tensor > &inputs, const std::vector<Tensor *> &outputs,`
			`const MNN::Op op, Backend backend) const override {`
[MNN:Sync] Sync Internal 2.6.2 2023-07-31 14:24:48 +08:00			`for (int i = 0; i < inputs.size(); ++i) {`
			`TensorUtils::setTensorSupportPack(inputs[i], false);`
			`}`
			`for (int i = 0; i < outputs.size(); ++i) {`
			`TensorUtils::setTensorSupportPack(outputs[i], false);`
			`}`
[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more auto-turning mode 2021-03-12 18:41:50 +08:00			`auto dimType = inputs[0]->getDimensionType();`
			`if (dimType == Tensor::TENSORFLOW && inputs[0]->dimensions() == 4) {`
			`int index[4] = {0, 2, 3, 1};`
			`auto axis = op->main_as_Axis()->axis();`
			`if (axis < 0) {`
			`axis = inputs[0]->dimensions() + axis;`
			`}`

			`axis = index[axis];`
			`//1 : channel //2 : height`
			`if (1 == axis \|\| 2 == axis \|\| 3 == axis) {`
[MNN:Sync] Sync Internal 2.8.4 2024-04-19 11:58:21 +08:00			`return new SoftmaxBufExecution(inputs, axis, op, backend);`
[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more auto-turning mode 2021-03-12 18:41:50 +08:00			`}`
			`return nullptr;`
			`} else {`
			`auto axis = op->main_as_Axis()->axis();`
			`if (axis < 0) {`
			`axis = inputs[0]->dimensions() + axis;`
			`}`

			`if (1 == axis \|\| 2 == axis \|\| 3 == axis) {`
[MNN:Sync] Sync Internal 2.8.4 2024-04-19 11:58:21 +08:00			`return new SoftmaxBufExecution(inputs, axis, op, backend);`
[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more auto-turning mode 2021-03-12 18:41:50 +08:00			`}`
			`return nullptr;`
			`}`
			`}`
			`};`
[MNN:Sync] Sync Internal 2.8.1 2023-12-27 17:26:44 +08:00			`REGISTER_OPENCL_OP_CREATOR(SoftmaxBufCreator, OpType_Softmax, BUFFER);`
[MNN:Sync] Sync opencl from Internal Gitlab, support Buffer and more auto-turning mode 2021-03-12 18:41:50 +08:00
			`} // namespace OpenCL`
			`} // namespace MNN`
			`#endif/* MNN_OPENCL_BUFFER_CLOSED */`