//
//  CPUPermute.cpp
//  MNN
//
//  Created by MNN on 2018/07/18.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "backend/cpu/CPUPermute.hpp"
|
|
#include "backend/cpu/CPUTranspose.hpp"
|
|
#include "backend/cpu/compute/CommonOptFunction.h"
|
|
#include "core/Macro.h"
|
|
#include "core/TensorUtils.hpp"
|
|
|
|
namespace MNN {
|
|
|
|
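// Direct permute on an NC4HW4 tensor (channels packed in groups of 4): mDims[i] is the input
// dimension that feeds output dimension i, taken from the Permute op's dims attribute.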
CPUPermute::CPUPermute(Backend *b, const MNN::Op *op) : MNN::Execution(b) {
    auto shape = op->main_as_Permute()->dims();
    for (int i = 0; i < shape->size(); ++i) {
        mDims.push_back(shape->data()[i]);
    }
}

ErrorCode CPUPermute::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    return NO_ERROR;
}

ErrorCode CPUPermute::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    MNN_ASSERT(1 == inputs.size());
    MNN_ASSERT(1 == outputs.size());

    auto input  = inputs[0];
    auto output = outputs[0];

    MNN_ASSERT(output->dimensions() == input->dimensions());
    MNN_ASSERT(2 <= output->dimensions() && output->dimensions() <= 5); // 2 <= tensor dim <= 5

    auto originInput  = input->host<float>();
    auto originOutput = output->host<float>();

    {
        bool noChange = true;
        for (int i = 0; i < (int)mDims.size(); ++i) {
            if (mDims[i] != i) {
                noChange = false;
                break;
            }
        }
        // mDims[i] == i for every i: the permutation is an identity, so a straight copy is enough
        if (noChange) {
            ::memcpy(originOutput, originInput, inputs[0]->size());
            return NO_ERROR;
        }
    }

    const int outputChannel = output->length(1);

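    // Indexing sketch for the loops below: the output is filled in its own packed NC4HW4 memory order,
    // and strides[i][k] tells how far the flat input index moves when the output coordinate along
    // dimension i advances by 1 while that coordinate is congruent to k mod 4. Only the output
    // dimension that reads from the input channel axis needs distinct entries (step 1 inside a pack
    // of 4, then a jump into the next pack); for every other dimension the four entries are identical.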
    int strides[5][4]; // map from change of output index to change of input index on N, C4, H and W

for (int i = 0; i < 5; ++i) {
|
|
if (i >= input->dimensions()) {
|
|
strides[i][0] = strides[i][1] = strides[i][2] = strides[i][3] = 0;
|
|
continue;
|
|
}
|
|
int dim = mDims[i];
|
|
int temp = input->stride(dim);
|
|
if (dim >= 1) {
|
|
temp *= 4;
|
|
}
|
|
if (dim == 1) {
|
|
strides[i][0] = strides[i][1] = strides[i][2] = 1;
|
|
strides[i][3] = temp - 3;
|
|
} else {
|
|
strides[i][0] = strides[i][1] = strides[i][2] = strides[i][3] = temp;
|
|
}
|
|
}
    const int ocTotalStride = strides[1][0] + strides[1][1] + strides[1][2] + strides[1][3];
    // turn the channel-dim (dim 1) strides into prefix sums to avoid re-accumulating offsets in the deepest loops
    for (int i = 1; i < 4; ++i) {
        strides[1][i] += strides[1][i - 1];
    }
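
    // The PTR_* macros below expand to one nested loop per trailing output dimension (macro index 3/4/5
    // maps to output dims 2/3/4). Each level captures the input index at the start of an iteration and,
    // when the iteration ends, rewinds to that captured value plus its own stride, discarding whatever
    // the inner loops added, so the body only ever reads originInput[inputIndex + offset].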
#define PTR_DEFINE(i) \
    const int outputLength##i = ALIMAX(output->length(i - 1), 1);

#define PTR_BEGIN(var, i) \
    for (int var = 0; var < outputLength##i; ++var) { \
        const int inputIndex##i = inputIndex;

#define PTR_END(var, i) \
        inputIndex = inputIndex##i + strides[i - 1][var % 4]; \
    }

    PTR_DEFINE(3)
    PTR_DEFINE(4)
    PTR_DEFINE(5)
    for (int ob = 0, outputIndex = 0, inputIndex = 0; ob < output->length(0); ++ob) {
        const int inputIndex1 = inputIndex;
        // full packs of 4 output channels
        for (int oz = 0; oz <= outputChannel - 4; oz += 4) {
            const int inputIndex2 = inputIndex;
            PTR_BEGIN(od, 3)
            PTR_BEGIN(oy, 4)
            PTR_BEGIN(ox, 5)
            originOutput[outputIndex++] = originInput[inputIndex];
            originOutput[outputIndex++] = originInput[inputIndex + strides[1][0]];
            originOutput[outputIndex++] = originInput[inputIndex + strides[1][1]];
            originOutput[outputIndex++] = originInput[inputIndex + strides[1][2]];
            PTR_END(ox, 5)
            PTR_END(oy, 4)
            PTR_END(od, 3)
            inputIndex = inputIndex2 + ocTotalStride;
        }
        // trailing partial pack: copy the remaining channels and zero-pad the pack out to 4
        if (outputChannel % 4 != 0) {
            PTR_BEGIN(od, 3)
            PTR_BEGIN(oy, 4)
            PTR_BEGIN(ox, 5)
            originOutput[outputIndex++] = originInput[inputIndex];
            for (int oz = 0; oz < outputChannel % 4 - 1; ++oz) {
                originOutput[outputIndex++] = originInput[inputIndex + strides[1][oz]];
            }
            for (int oz = outputChannel % 4; oz < 4; ++oz) {
                originOutput[outputIndex++] = 0.0f;
            }
            PTR_END(ox, 5)
            PTR_END(oy, 4)
            PTR_END(od, 3)
        }
        inputIndex = inputIndex1 + strides[0][ob % 4];
    }
#undef PTR_DEFINE
#undef PTR_BEGIN
#undef PTR_END

    return NO_ERROR;
}

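// Fallback path: CPUWrapPermute hands the permutation to CPUTranspose, passing the permutation order
// as a second int tensor. For NC4HW4 inputs it unpacks into temporary plain (CAFFE-layout) buffers via
// onCopyBuffer and packs the result back, so the transpose kernel can work on an unpacked layout.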
class CPUWrapPermute : public Execution {
public:
    CPUWrapPermute(Backend *bn, const MNN::Op *op) : Execution(bn) {
        auto shape = op->main_as_Permute()->dims();
        mDims.reset(Tensor::create<int>({(int)shape->size()}));
        if (nullptr == mDims->host<int>()) {
            mValid = false;
            return;
        }
        ::memcpy(mDims->host<int>(), shape->data(), mDims->size());
        mTranspose.reset(new CPUTranspose(bn, DataType_DT_FLOAT));
    }
    virtual ~CPUWrapPermute() = default;
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override {
        auto format = TensorUtils::getDescribe(inputs[0])->dimensionFormat;
        if (MNN_DATA_FORMAT_NC4HW4 == format) {
            mTempSource.reset(Tensor::createDevice<float>(inputs[0]->shape(), Tensor::CAFFE));
            mTempDest.reset(Tensor::createDevice<float>(outputs[0]->shape(), Tensor::CAFFE));

            bool valid = backend()->onAcquireBuffer(mTempSource.get(), Backend::DYNAMIC);
            valid      = valid && backend()->onAcquireBuffer(mTempDest.get(), Backend::DYNAMIC);
            if (!valid) {
                return OUT_OF_MEMORY;
            }

            // released right away so the memory planner can reuse this space for later ops; the buffers stay valid for onExecute
            backend()->onReleaseBuffer(mTempSource.get(), Backend::DYNAMIC);
            backend()->onReleaseBuffer(mTempDest.get(), Backend::DYNAMIC);

            mWrapInputs  = {mTempSource.get(), mDims.get()};
            mWrapOutputs = {mTempDest.get()};
            mNeedCopy    = true;
        } else {
            mWrapOutputs = outputs;
            mWrapInputs  = {inputs[0], mDims.get()};
            mNeedCopy    = false;
        }
        return mTranspose->onResize(mWrapInputs, mWrapOutputs);
    }
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override {
        if (mNeedCopy) {
            backend()->onCopyBuffer(inputs[0], mTempSource.get());
        }
        auto code = mTranspose->onExecute(mWrapInputs, mWrapOutputs);
        if (NO_ERROR != code) {
            return code;
        }
        if (mNeedCopy) {
            backend()->onCopyBuffer(mTempDest.get(), outputs[0]);
        }
        return NO_ERROR;
    }

private:
    std::shared_ptr<Tensor> mDims;
    std::shared_ptr<Tensor> mTempSource;
    std::shared_ptr<Tensor> mTempDest;
    std::shared_ptr<Execution> mTranspose;
    std::vector<Tensor *> mWrapInputs;
    std::vector<Tensor *> mWrapOutputs;
    bool mNeedCopy = false;
};

class CPUPermuteCreator : public CPUBackend::Creator {
public:
    virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
                                const MNN::Op *op, Backend *backend) const override {
        // the direct NC4HW4 kernel only handles up to 5 dims; anything else goes through the transpose-based wrapper
        if (op->main_as_Permute()->dims()->size() > 5 ||
            TensorUtils::getDescribe(inputs[0])->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) {
            return new CPUWrapPermute(backend, op);
        }
        return new CPUPermute(backend, op);
    }
};

REGISTER_CPU_OP_CREATOR(CPUPermuteCreator, OpType_Permute);

} // namespace MNN