MNN/source/backend/cpu/CPUConcat.cpp

//
//  CPUConcat.cpp
//  MNN
//
//  Created by MNN on 2018/07/06.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "CPUConcat.hpp"
#include "CPUBackend.hpp"
#include "CommonOptFunction.h"
#include "Macro.h"
#include "TensorUtils.hpp"
using namespace std;

namespace MNN {

// Concatenates the inputs along dimension 3 (the "width" extent) into the output.
//
// Data is addressed as floats grouped in channel blocks of four: each of the
// UP_DIV(channel, 4) planes holds rows of (4 * width) floats. The dispatcher in
// CPUConcat::onExecute calls this for NC4HW4 tensors with axis == 3.
//
// The plane count and row count are taken from the output; every input is read
// with its own row/plane lengths. Always returns 0.
static int _concatWidth(const Tensor* outputTensor, const vector<Tensor*>& inputTensors) {
    auto outDims          = outputTensor->buffer().dim;
    const int batches     = outDims[0].extent;
    const int quadCount   = UP_DIV(outDims[1].extent, 4);
    const int rows        = outDims[2].extent;
    const int outRowLen   = 4 * outDims[3].extent;
    const int outPlaneLen = 4 * rows * outDims[3].extent;
    float* const outBase  = reinterpret_cast<float*>(outputTensor->buffer().host);

    for (int n = 0; n < batches; ++n) {
        float* batchDst = outBase + outDims[0].stride * n;
        // Offset (in floats) inside an output row where the next input starts.
        int columnOffset = 0;

        for (auto input : inputTensors) {
            auto& src             = input->buffer();
            const float* batchSrc = reinterpret_cast<float*>(src.host) + src.dim[0].stride * n;
            const int srcRowLen   = src.dim[3].extent * 4;
            const int srcPlaneLen = src.dim[3].extent * src.dim[2].extent * 4;
            const size_t rowBytes = srcRowLen * sizeof(float);

            for (int q = 0; q < quadCount; ++q) {
                float* planeDst       = batchDst + outPlaneLen * q + columnOffset;
                const float* planeSrc = batchSrc + srcPlaneLen * q;
                // Output rows are wider than input rows, so copy one row at a time.
                for (int r = 0; r < rows; ++r) {
                    ::memcpy(planeDst + outRowLen * r, planeSrc + srcRowLen * r, rowBytes);
                }
            }
            columnOffset += srcRowLen;
        }
    }
    return 0;
}

// Concatenates the inputs along dimension 2 (the "height" extent) into the output.
//
// Data is addressed as floats grouped in channel blocks of four. Within one
// channel block an input's rows are contiguous, so a whole input plane is
// copied with a single memcpy into the output plane, starting at the row where
// the previous input ended. The dispatcher in CPUConcat::onExecute calls this
// for NC4HW4 tensors with axis == 2. Always returns 0.
static int _concatHeight(const Tensor* outputTensor, const vector<Tensor*>& inputTensors) {
    auto outDims          = outputTensor->buffer().dim;
    const int batches     = outDims[0].extent;
    const int quadCount   = UP_DIV(outDims[1].extent, 4);
    const int outRowLen   = 4 * outDims[3].extent;
    const int outPlaneLen = 4 * outDims[2].extent * outDims[3].extent;
    float* const outBase  = reinterpret_cast<float*>(outputTensor->buffer().host);

    for (int n = 0; n < batches; ++n) {
        float* batchDst = outBase + outDims[0].stride * n;
        // Offset (in floats) inside an output plane where the next input starts.
        int rowOffset = 0;

        for (auto input : inputTensors) {
            auto& src               = input->buffer();
            const float* batchSrc   = reinterpret_cast<float*>(src.host) + src.dim[0].stride * n;
            const int srcRows       = src.dim[2].extent;
            const int srcPlaneLen   = src.dim[2].extent * src.dim[3].extent * 4;
            const size_t planeBytes = srcPlaneLen * sizeof(float);

            for (int q = 0; q < quadCount; ++q) {
                ::memcpy(batchDst + outPlaneLen * q + rowOffset, batchSrc + srcPlaneLen * q, planeBytes);
            }
            rowOffset += srcRows * outRowLen;
        }
    }
    return 0;
}

// Concatenates the inputs along dimension 0 (the batch extent) into the output.
//
// Each input contributes dim[0].extent batches of dim[0].stride floats; they
// are appended to the output one batch after another, in input order.
//
// The previous implementation iterated over the *output* batch index and, for
// each index, copied every input to the same destination. Later inputs
// therefore overwrote earlier ones, and inputs holding fewer batches than the
// output were read out of bounds. This version walks each input's own batches
// and advances a running output batch index instead.
//
// NOTE(review): assumes the sum of the inputs' batch extents equals the
// output's batch extent (i.e. shape inference already ran) — confirm at caller.
// Always returns 0.
static int _concatBatch(const Tensor* outputTensor, const vector<Tensor*>& inputTensors) {
    auto outputDim            = outputTensor->buffer().dim;
    float* const outputOrigin = reinterpret_cast<float*>(outputTensor->buffer().host);

    // Index of the next output batch to be written.
    int outputBatch = 0;
    for (auto input : inputTensors) {
        auto& inputTensor        = input->buffer();
        const int inputBatches   = inputTensor.dim[0].extent;
        const int inputStride    = inputTensor.dim[0].stride;
        const float* inputOrigin = reinterpret_cast<float*>(inputTensor.host);
        const size_t batchBytes  = inputStride * sizeof(float);

        for (int batchIndex = 0; batchIndex < inputBatches; ++batchIndex, ++outputBatch) {
            ::memcpy(outputOrigin + outputDim[0].stride * outputBatch, inputOrigin + inputStride * batchIndex,
                     batchBytes);
        }
    }
    return 0;
}

// Concatenates the inputs along dimension 1 (the channel extent) into the output.
//
// Two strategies, selected by useSlowMethod:
//   - fast path (false): every input occupies UP_DIV(channel, 4) whole channel
//     blocks, which are copied straight into the output behind the blocks of
//     the previous inputs;
//   - slow path (true): for each batch, every input is unpacked with
//     MNNUnpackC4 back to back into tempOutputTensor, and the combined result
//     is packed into the output with MNNPackC4.
//
// tempOutputTensor is only dereferenced on the slow path. Always returns 0.
static int _concatChannel(const Tensor* outputTensor, const vector<Tensor*>& inputTensors, bool useSlowMethod,
                          const Tensor* tempOutputTensor) {
    auto outDims      = outputTensor->buffer().dim;
    const int batches = outDims[0].extent;

    if (!useSlowMethod) {
        float* const outBase = reinterpret_cast<float*>(outputTensor->buffer().host);
        for (int n = 0; n < batches; ++n) {
            // Number of channel blocks already written for this batch.
            int quadOffset = 0;
            for (auto input : inputTensors) {
                auto& src         = input->buffer();
                const float* from = reinterpret_cast<float*>(src.host) + src.dim[0].stride * n;
                const int quads   = UP_DIV(src.dim[1].extent, 4);
                float* to         = outBase + outDims[1].stride * quadOffset * 4 + outDims[0].stride * n;

                // One channel block spans outDims[1].stride * 4 floats.
                ::memcpy(to, from, outDims[1].stride * 4 * quads * sizeof(float));
                quadOffset += quads;
            }
        }
        return 0;
    }

    auto scratch = tempOutputTensor->host<float>();
    MNN_ASSERT(nullptr != scratch);
    for (int n = 0; n < batches; ++n) {
        // Unpack all inputs of this batch contiguously into the scratch buffer.
        float* cursor = scratch;
        for (auto input : inputTensors) {
            const int area     = input->width() * input->height();
            const int channels = input->channel();
            MNNUnpackC4(cursor, input->host<float>() + input->stride(0) * n, area, channels);
            cursor += area * channels;
        }
        // Re-pack the concatenated channels into the output batch.
        MNNPackC4(outputTensor->host<float>() + n * outputTensor->stride(0), scratch,
                  outputTensor->width() * outputTensor->height(), outputTensor->channel());
    }
    return 0;
}

// Generic byte-wise concatenation along `axis` for densely laid-out tensors
// (used by CPUConcat::onExecute when the first input is not NC4HW4).
//
// The output is viewed as [outerCount][axis extent][innerBytes], where
// outerCount is the product of the extents before `axis` and innerBytes is the
// element size times the product of the extents after `axis`. Each input
// supplies, for every outer index, one chunk of (its axis extent * innerBytes)
// bytes, which lands behind the chunks of the previous inputs.
//
// Inputs that report zero dimensions are skipped. Always returns 0.
static int _concatTf(const Tensor* outputTensor, const vector<Tensor*>& inputTensors, int axis) {
    auto& out = outputTensor->buffer();

    int outerCount = 1;
    for (int d = 0; d < axis; ++d) {
        outerCount *= out.dim[d].extent;
    }
    int innerBytes = out.type.bytes();
    for (int d = axis + 1; d < out.dimensions; ++d) {
        innerBytes *= out.dim[d].extent;
    }
    const int outSliceBytes = innerBytes * out.dim[axis].extent;

    uint8_t* const dstBase = reinterpret_cast<uint8_t*>(outputTensor->buffer().host);
    // Running position along `axis` (in elements of that axis) inside the output.
    int axisOffset = 0;
    for (auto input : inputTensors) {
        auto& in = input->buffer();
        if (0 == in.dimensions) {
            continue;
        }
        const uint8_t* srcBase = reinterpret_cast<uint8_t*>(in.host);
        const int axisExtent   = in.dim[axis].extent;
        const int chunkBytes   = axisExtent * innerBytes;

        for (int outer = 0; outer < outerCount; ++outer) {
            uint8_t* dst       = dstBase + outSliceBytes * outer + axisOffset * innerBytes;
            const uint8_t* src = srcBase + chunkBytes * outer;
            ::memcpy(dst, src, chunkBytes);
        }
        axisOffset += axisExtent;
    }
    return 0;
}

// Decides whether channel concatenation needs the unpack/pack ("slow") path
// and, if so, sets up the scratch tensor it uses.
//
// The slow path is only considered when the output has more than one
// dimension, is in NC4HW4 format and the concat axis is 1. It is selected when
// any input except the last has a channel count that is not a multiple of 4.
//
// Returns OUT_OF_MEMORY if the scratch buffer cannot be acquired, NO_ERROR
// otherwise.
ErrorCode CPUConcat::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    MNN_ASSERT(outputs.size() == 1);
    MNN_ASSERT(inputs.size() >= 2);
    auto output    = outputs[0];
    mUseSlowMethod = false;
    mTempOutput.reset();

    const bool packedLayout = output->buffer().dimensions > 1 &&
                              TensorUtils::getDescribe(output)->dimensionFormat == MNN_DATA_FORMAT_NC4HW4;
    if (!packedLayout || 1 != mAxis) {
        return NO_ERROR;
    }

    // The last tensor needn't be aligned
    const size_t mustBeAligned = inputs.size() - 1;
    for (size_t i = 0; i < mustBeAligned; ++i) {
        if (0 != inputs[i]->length(1) % 4) {
            mUseSlowMethod = true;
            break;
        }
    }
    if (!mUseSlowMethod) {
        return NO_ERROR;
    }

    // Scratch tensor shaped like the output but with a single batch; the slow
    // path processes one batch at a time.
    mTempOutput.reset(Tensor::createDevice<float>(output->shape()));
    mTempOutput->setLength(0, 1);
    if (!backend()->onAcquireBuffer(mTempOutput.get(), Backend::DYNAMIC)) {
        return OUT_OF_MEMORY;
    }
    // Released right after acquisition; presumably this lets the backend reuse
    // the dynamic memory for later ops — confirm against the Backend contract.
    backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC);
    return NO_ERROR;
}

// Runs the concatenation of `inputs` into outputs[0].
//
// If the first input has more than one dimension and is in NC4HW4 format, the
// axis-specific helper for mAxis in [0, 3] is used; any other axis value is
// silently ignored (nothing is written). Otherwise the generic byte-wise
// concat is used. Always returns NO_ERROR.
ErrorCode CPUConcat::onExecute(const vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    MNN_ASSERT(1 == outputs.size());
    MNN_ASSERT(inputs.size() >= 2);

    auto first        = inputs[0];
    const bool packed = first->buffer().dimensions > 1 &&
                        TensorUtils::getDescribe(first)->dimensionFormat == MNN_DATA_FORMAT_NC4HW4;
    if (!packed) {
        // tf concat
        int axis = mAxis;
        _concatTf(outputs[0], inputs, axis);
        return NO_ERROR;
    }

    if (0 == mAxis) {
        _concatBatch(outputs[0], inputs);
    } else if (1 == mAxis) {
        _concatChannel(outputs[0], inputs, mUseSlowMethod, mTempOutput.get());
    } else if (2 == mAxis) {
        _concatHeight(outputs[0], inputs);
    } else if (3 == mAxis) {
        _concatWidth(outputs[0], inputs);
    }
    // Other axis values fall through without touching the output.

    return NO_ERROR;
}

// Factory that builds a CPUConcat execution from a Concat op description.
class CPUConcatCreator : public CPUBackend::Creator {
public:
    // Reads the concat axis from the op's Axis parameter. A negative axis is
    // counted from the end using the dimension count of outputs[0]; when the
    // op carries no Axis parameter, axis 0 is used.
    virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                const MNN::Op* op, Backend* backend) const {
        auto param = op->main_as_Axis();
        if (nullptr == param) {
            return new CPUConcat(backend, 0);
        }
        if (param->axis() >= 0) {
            return new CPUConcat(backend, param->axis());
        }
        return new CPUConcat(backend, outputs[0]->dimensions() + param->axis());
    }
};

// Associates CPUConcatCreator with OpType_Concat via the CPU op creator registration macro.
REGISTER_CPU_OP_CREATOR(CPUConcatCreator, OpType_Concat);
} // namespace MNN
beta 0.1.0 2019-04-17 10:49:11 +08:00			`//`
			`// CPUConcat.cpp`
			`// MNN`
			`//`
			`// Created by MNN on 2018/07/06.`
			`// Copyright © 2018, Alibaba Group Holding Limited`
			`//`

			`#include "CPUConcat.hpp"`
			`#include "CPUBackend.hpp"`
			`#include "CommonOptFunction.h"`
			`#include "Macro.h"`
beta 0.2.0.8 - add NaN check-up - add quantification support for ScaleAdd Op - add binary to eltwise optimization - add console logs for quantization tool - better document for quantization tool - replace redundant dimension flags with dimension format - optimize performance of TensorFlow Lite Quantized Convolution - fix axis support for ONNX softmax - fix get performance compile error on Windows 2019-08-22 20:13:46 +08:00			`#include "TensorUtils.hpp"`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`using namespace std;`

			`namespace MNN {`

			`static int _concatWidth(const Tensor* outputTensor, const vector<Tensor*>& inputTensors) {`
			`auto outputDim = outputTensor->buffer().dim;`
			`const int depthQuad = UP_DIV(outputDim[1].extent, 4);`
			`const int height = outputDim[2].extent;`
			`const int width = outputDim[3].extent;`
			`const int outputPlaneStride = 4 * height * width;`
			`const int outputLineStride = 4 * width;`

			`int batchSize = outputDim[0].extent;`

			`for (int batchIndex = 0; batchIndex < batchSize; ++batchIndex) {`
			`int currentPositionW = 0;`
			`float* outputOrigin = reinterpret_cast<float>(outputTensor->buffer().host) + outputDim[0].stride batchIndex;`

			`for (size_t b = 0; b < inputTensors.size(); b++) {`
			`auto& inputTensor = inputTensors[b]->buffer();`
			`float* inputOrigin = reinterpret_cast<float>(inputTensor.host) + inputTensor.dim[0].stride batchIndex;`
			`int inputPlaneStride = inputTensor.dim[3].extent * inputTensor.dim[2].extent * 4;`
			`int inputLineStride = inputTensor.dim[3].extent * 4;`
			`int inputW = inputTensor.dim[3].extent;`
			`for (int z = 0; z < depthQuad; ++z) {`
			`float* dstZ = outputOrigin + outputPlaneStride * z;`
			`float* srcZ = inputOrigin + inputPlaneStride * z;`
			`for (int y = 0; y < height; ++y) {`
			`float* dstY = dstZ + outputLineStride * y + currentPositionW * 4;`
			`float* srcY = srcZ + inputLineStride * y;`
			`memcpy(dstY, srcY, 4 * inputW * sizeof(float));`
			`}`
			`}`
			`currentPositionW += inputW;`
			`}`
			`}`
			`return 0;`
			`}`

			`static int _concatHeight(const Tensor* outputTensor, const vector<Tensor*>& inputTensors) {`
			`auto outputDim = outputTensor->buffer().dim;`
			`const int batchSize = outputDim[0].extent;`
			`const int depthQuad = UP_DIV(outputDim[1].extent, 4);`
			`const int height = outputDim[2].extent;`
			`const int width = outputDim[3].extent;`
			`const int outputPlaneStride = 4 * height * width;`
			`const int outputLineStride = 4 * width;`
			`for (int batchIndex = 0; batchIndex < batchSize; ++batchIndex) {`
			`float* outputOrigin = reinterpret_cast<float>(outputTensor->buffer().host) + outputDim[0].stride batchIndex;`
			`int currentPositionH = 0;`
			`for (size_t b = 0; b < inputTensors.size(); b++) {`
			`auto& inputTensor = inputTensors[b]->buffer();`
			`float* inputOrigin = reinterpret_cast<float>(inputTensor.host) + inputTensor.dim[0].stride batchIndex;`
			`int inputPlaneStride = inputTensor.dim[2].extent * inputTensor.dim[3].extent * 4;`
			`int inputH = inputTensor.dim[2].extent;`
			`for (int z = 0; z < depthQuad; ++z) {`
			`float* dstZ = outputOrigin + outputPlaneStride * z;`
			`float* srcZ = inputOrigin + inputPlaneStride * z;`

			`memcpy(dstZ + currentPositionH * outputLineStride, srcZ, inputPlaneStride * sizeof(float));`
			`}`
			`currentPositionH += inputH;`
			`}`
			`}`
			`return 0;`
			`}`

			`static int _concatBatch(const Tensor* outputTensor, const vector<Tensor*>& inputTensors) {`
			`auto outputDim = outputTensor->buffer().dim;`
			`const int batchSize = outputDim[0].extent;`
			`for (int batchIndex = 0; batchIndex < batchSize; ++batchIndex) {`
			`float* outputOrigin = reinterpret_cast<float>(outputTensor->buffer().host) + outputDim[0].stride batchIndex;`
			`for (size_t b = 0; b < inputTensors.size(); b++) {`
			`auto& inputTensor = inputTensors[b]->buffer();`
			`float* inputOrigin = reinterpret_cast<float>(inputTensor.host) + inputTensor.dim[0].stride batchIndex;`
			`::memcpy(outputOrigin, inputOrigin, inputTensor.dim[0].stride * sizeof(float));`
			`}`
			`}`
			`return 0;`
			`}`

			`static int _concatChannel(const Tensor* outputTensor, const vector<Tensor*>& inputTensors, bool useSlowMethod,`
			`const Tensor* tempOutputTensor) {`
			`auto outputDim = outputTensor->buffer().dim;`
			`float* outputOrigin = reinterpret_cast<float*>(outputTensor->buffer().host);`
			`int batchSize = outputDim[0].extent;`

			`if (useSlowMethod) {`
			`auto tempOutput = tempOutputTensor->host<float>();`
			`MNN_ASSERT(nullptr != tempOutput);`
			`for (int batchIndex = 0; batchIndex < batchSize; ++batchIndex) {`
			`float* currentOutput = tempOutput;`
			`for (int b = 0; b < inputTensors.size(); b++) {`
			`auto inputTensor = inputTensors[b];`

			`int size = inputTensor->width() * inputTensor->height() * inputTensor->channel();`
			`MNNUnpackC4(currentOutput, inputTensor->host<float>() + inputTensor->stride(0) * batchIndex,`
			`inputTensor->width() * inputTensor->height(), inputTensor->channel());`
			`currentOutput += size;`
			`}`
			`MNNPackC4(outputTensor->host<float>() + batchIndex * outputTensor->stride(0), tempOutput,`
			`outputTensor->width() * outputTensor->height(), outputTensor->channel());`
			`}`
			`return 0;`
			`}`
			`for (int batchIndex = 0; batchIndex < batchSize; ++batchIndex) {`
			`int currentPositionZ = 0;`
			`for (size_t b = 0; b < inputTensors.size(); b++) {`
			`auto& inputTensor = inputTensors[b]->buffer();`
			`float* inputOrigin = reinterpret_cast<float>(inputTensor.host) + inputTensor.dim[0].stride batchIndex;`
			`int inputZ = UP_DIV(inputTensor.dim[1].extent, 4);`
- build: - unify schema building in core and converter; - add more build script for android; - add linux build script for python; - ops impl: - add floor mod support in binary; - use eltwise impl in add/max/sub/mul binary for optimization; - remove fake double support in cast; - fix 5d support for concat; - add adjX and adjY support for batch matmul; - optimize conv2d back prop filter; - add pad mode support for conv3d; - fix bug in conv2d & conv depthwise with very small feature map; - optimize binary without broacast; - add data types support for gather; - add gather ND support; - use uint8 data type in gather v2; - add transpose support for matmul; - add matrix band part; - add dim != 4 support for padding, reshape & tensor convert; - add pad type support for pool3d; - make ops based on TensorFlow Lite quantization optional; - add all & any support for reduction; - use type in parameter as output type in reduction; - add int support for unary; - add variable weight support for conv2d; - fix conv2d depthwise weights initialization; - fix type support for transpose; - fix grad outputs count for reduce grad and reshape grad; - fix priorbox & detection output; - fix metal softmax error; - python: - add runSessionWithCallBackInfo interface; - add max nodes limit (1400) for visualization tool; - fix save error in python3; - align default dim; - convert: - add extra design for optimization; - add more post converting optimizers; - add caffe v1 weights blob support; - add cast, unary, conv transpose support for onnx model; - optimize batchnorm, conv with variable weights, prelu, reshape, slice, upsample for onnx model; - add cos/sin/atan/tan support for unary for tensorflow model; - add any/all support for reduction for tensorflow model; - add elu, conv3d, pool3d support for tensorflow model; - optimize argmax, batchnorm, concat, batch to space, conv with variable weights, prelu, slice for tensorflow model; - others: - fix size computer lock; - fix thread pool deadlock; 
- add express & parameters in express; - rewrite blitter chooser without static map; - add tests for expr; 2019-10-29 13:37:26 +08:00			`float* dst = outputOrigin + outputDim[1].stride * currentPositionZ * 4 + outputDim[0].stride * batchIndex;`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`float* src = inputOrigin;`

- build: - unify schema building in core and converter; - add more build script for android; - add linux build script for python; - ops impl: - add floor mod support in binary; - use eltwise impl in add/max/sub/mul binary for optimization; - remove fake double support in cast; - fix 5d support for concat; - add adjX and adjY support for batch matmul; - optimize conv2d back prop filter; - add pad mode support for conv3d; - fix bug in conv2d & conv depthwise with very small feature map; - optimize binary without broacast; - add data types support for gather; - add gather ND support; - use uint8 data type in gather v2; - add transpose support for matmul; - add matrix band part; - add dim != 4 support for padding, reshape & tensor convert; - add pad type support for pool3d; - make ops based on TensorFlow Lite quantization optional; - add all & any support for reduction; - use type in parameter as output type in reduction; - add int support for unary; - add variable weight support for conv2d; - fix conv2d depthwise weights initialization; - fix type support for transpose; - fix grad outputs count for reduce grad and reshape grad; - fix priorbox & detection output; - fix metal softmax error; - python: - add runSessionWithCallBackInfo interface; - add max nodes limit (1400) for visualization tool; - fix save error in python3; - align default dim; - convert: - add extra design for optimization; - add more post converting optimizers; - add caffe v1 weights blob support; - add cast, unary, conv transpose support for onnx model; - optimize batchnorm, conv with variable weights, prelu, reshape, slice, upsample for onnx model; - add cos/sin/atan/tan support for unary for tensorflow model; - add any/all support for reduction for tensorflow model; - add elu, conv3d, pool3d support for tensorflow model; - optimize argmax, batchnorm, concat, batch to space, conv with variable weights, prelu, slice for tensorflow model; - others: - fix size computer lock; - fix thread pool deadlock; 
- add express & parameters in express; - rewrite blitter chooser without static map; - add tests for expr; 2019-10-29 13:37:26 +08:00			`memcpy(dst, src, outputDim[1].stride * 4 * inputZ * sizeof(float));`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`currentPositionZ += inputZ;`
			`}`
			`}`

			`return 0;`
			`}`

			`static int _concatTf(const Tensor* outputTensor, const vector<Tensor*>& inputTensors, int axis) {`
			`auto& ob = outputTensor->buffer();`
			`int outsideSize = 1;`
			`for (int i = 0; i < axis; ++i) {`
			`outsideSize *= ob.dim[i].extent;`
			`}`
			`int insideStride = ob.type.bytes();`
			`for (int i = axis + 1; i < ob.dimensions; ++i) {`
			`insideStride *= ob.dim[i].extent;`
			`}`
			`int outsideStride = insideStride * ob.dim[axis].extent;`

			`int sumAxis = 0;`
			`uint8_t* outputOrigin = reinterpret_cast<uint8_t*>(outputTensor->buffer().host);`
			`for (size_t b = 0; b < inputTensors.size(); b++) {`
			`auto& inputTensor = inputTensors[b]->buffer();`
			`if (0 == inputTensor.dimensions) {`
			`continue;`
			`}`
			`uint8_t* inputOrigin = reinterpret_cast<uint8_t*>(inputTensor.host);`
			`int inputPlaneStride = inputTensor.dim[axis].extent * insideStride;`

			`for (int z = 0; z < outsideSize; ++z) {`
			`uint8_t* dstZ = outputOrigin + outsideStride * z + sumAxis * insideStride;`
			`uint8_t* srcZ = inputOrigin + inputPlaneStride * z;`

			`memcpy(dstZ, srcZ, inputPlaneStride);`
			`}`
			`sumAxis += inputTensor.dim[axis].extent;`
			`}`
			`return 0;`
			`}`

			`ErrorCode CPUConcat::onResize(const std::vector<Tensor>& inputs, const std::vector<Tensor>& outputs) {`
			`MNN_ASSERT(outputs.size() == 1);`
			`MNN_ASSERT(inputs.size() >= 2);`
beta 0.2.0.1 - support both armv7/arm64 in podspec (pod version >= 1.5.0 required) - refactor neg axis support - fix memory overlap in de-conv - fix CONVOLUTION_TILED_NUMBER spell error - fix few warnings - add binary / interp / permute / relu / reshape / softmax support and optimize conv for OpenGL backend - add clean in nmake build script 2019-06-24 11:32:41 +08:00			`auto output = outputs[0];`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`mUseSlowMethod = false;`
			`mTempOutput.reset();`
beta 0.2.0.8 - add NaN check-up - add quantification support for ScaleAdd Op - add binary to eltwise optimization - add console logs for quantization tool - better document for quantization tool - replace redundant dimension flags with dimension format - optimize performance of TensorFlow Lite Quantized Convolution - fix axis support for ONNX softmax - fix get performance compile error on Windows 2019-08-22 20:13:46 +08:00			`if (output->buffer().dimensions > 1 && TensorUtils::getDescribe(output)->dimensionFormat == MNN_DATA_FORMAT_NC4HW4) {`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`if (1 == mAxis) {`
			`// The last tensor needn't be aligned`
			`for (size_t b = 0; b < inputs.size() - 1; b++) {`
			`if (inputs[b]->length(1) % 4 != 0) {`
			`mUseSlowMethod = true;`
			`break;`
			`}`
			`}`
			`if (mUseSlowMethod) {`
			`mTempOutput.reset(Tensor::createDevice<float>(output->shape()));`
			`mTempOutput->setLength(0, 1);`
			`bool success = backend()->onAcquireBuffer(mTempOutput.get(), Backend::DYNAMIC);`
			`if (false == success) {`
			`return OUT_OF_MEMORY;`
			`}`
			`backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC);`
			`}`
			`}`
			`}`

			`return NO_ERROR;`
			`}`

			`ErrorCode CPUConcat::onExecute(const vector<Tensor>& inputs, const std::vector<Tensor>& outputs) {`
			`MNN_ASSERT(1 == outputs.size());`
			`MNN_ASSERT(inputs.size() >= 2);`
			`auto input = inputs[0];`
beta 0.2.0.8 - add NaN check-up - add quantification support for ScaleAdd Op - add binary to eltwise optimization - add console logs for quantization tool - better document for quantization tool - replace redundant dimension flags with dimension format - optimize performance of TensorFlow Lite Quantized Convolution - fix axis support for ONNX softmax - fix get performance compile error on Windows 2019-08-22 20:13:46 +08:00			`if (input->buffer().dimensions > 1 && TensorUtils::getDescribe(input)->dimensionFormat == MNN_DATA_FORMAT_NC4HW4) {`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`switch (mAxis) {`
			`case 0:`
			`_concatBatch(outputs[0], inputs);`
			`break;`
			`case 1:`
			`_concatChannel(outputs[0], inputs, mUseSlowMethod, mTempOutput.get());`
			`break;`
			`case 2:`
			`_concatHeight(outputs[0], inputs);`
			`break;`
			`case 3:`
			`_concatWidth(outputs[0], inputs);`
			`break;`

			`default:`
			`break;`
			`}`
			`} else {`
			`int axis = mAxis;`
			`// tf concat`
			`_concatTf(outputs[0], inputs, axis);`
			`}`

			`return NO_ERROR;`
			`}`

			`class CPUConcatCreator : public CPUBackend::Creator {`
			`public:`
			`virtual Execution* onCreate(const std::vector<Tensor>& inputs, const std::vector<Tensor>& outputs,`
			`const MNN::Op* op, Backend* backend) const {`
			`auto axis = op->main_as_Axis();`
			`if (nullptr != axis) {`
beta 0.2.0.1 - support both armv7/arm64 in podspec (pod version >= 1.5.0 required) - refactor neg axis support - fix memory overlap in de-conv - fix CONVOLUTION_TILED_NUMBER spell error - fix few warnings - add binary / interp / permute / relu / reshape / softmax support and optimize conv for OpenGL backend - add clean in nmake build script 2019-06-24 11:32:41 +08:00			`if (axis->axis() < 0) {`
			`return new CPUConcat(backend, outputs[0]->dimensions() + axis->axis());`
			`}`
beta 0.1.0 2019-04-17 10:49:11 +08:00			`return new CPUConcat(backend, axis->axis());`
			`}`
			`return new CPUConcat(backend, 0);`
			`}`
			`};`

			`REGISTER_CPU_OP_CREATOR(CPUConcatCreator, OpType_Concat);`
			`} // namespace MNN`