//
// StaticModule.cpp
// MNN
//
// Created by MNN on 2020/09/10.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "StaticModule.hpp"
#include <MNN/AutoTime.hpp>
#include <MNN/expr/Executor.hpp>
#include <MNN/expr/ExecutorScope.hpp>
#include <MNN/expr/ExprCreator.hpp>
#include "Utils.hpp"
#include "core/WrapExecution.hpp"
#include "core/MNNMemoryUtils.h"
#include "RuntimeAttr.hpp"
#include "core/TensorUtils.hpp"
namespace MNN {
namespace Express {
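// Pre-create executions for weight-carrying ops (currently convolutions) so the
// backend can rearrange their weights once up front; the raw weight/bias blobs
// are then stripped from the re-serialized op to reduce memory. Ops whose
// executions cannot be created or cloned keep their original buffers.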
static std::vector<std::shared_ptr<BufferStorage>> preRearrangeWeights( // NOLINT
Schedule::ScheduleInfo& scheduleInfo, Backend* backend, Backend* backupBackend) {
auto&& pipelineInfo = scheduleInfo.pipelineInfo[0].second;
std::vector<std::shared_ptr<BufferStorage>> splitOps(pipelineInfo.size());
for (int i = 0; i < pipelineInfo.size(); ++i) {
auto& info = pipelineInfo[i];
auto op = pipelineInfo[i].op;
std::unique_ptr<OpT> op_table(op->UnPack());
std::shared_ptr<Execution> exe;
switch (op->type()) {
case MNN::OpType_DepthwiseConvInt8:
case MNN::OpType_ConvInt8:
case MNN::OpType_ConvolutionDepthwise:
case MNN::OpType_Convolution: {
DataType type = DataType_DT_FLOAT;
auto conv2d = op->main_as_Convolution2D();
// Create Default Inputs and Outputs
auto tempInput = info.inputs[0];
auto tempOutput = info.outputs[0];
auto common = conv2d->common();
if (scheduleInfo.pipelineInfo[0].first.needComputeGeometry) {
// Set default shape to create execution
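// Invert the conv size relation o = (i - (k - 1) * d - 1) / s + 1 for a fixed
// 2x2 output to get the smallest valid input extent. For example, kernel 3,
// stride 2, dilation 1 gives i = (3 - 1) * 1 + 2 * (2 - 1) + 1 = 5, and
// indeed o = (5 - 2 - 1) / 2 + 1 = 2.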
int ow = 2, oh = 2;
int iw = (common->kernelX() - 1) * common->dilateX() + common->strideX() * (ow - 1) + 1;
int ih = (common->kernelY() - 1) * common->dilateY() + common->strideY() * (oh - 1) + 1;
TensorUtils::getDescribe(tempInput)->dimensionFormat = MNN_DATA_FORMAT_NC4HW4;
tempInput->setLength(0, 1);
tempInput->setLength(1, common->inputCount());
tempInput->setLength(2, ih);
tempInput->setLength(3, iw);
TensorUtils::getDescribe(tempOutput)->dimensionFormat = MNN_DATA_FORMAT_NC4HW4;
tempOutput->setLength(0, 1);
tempOutput->setLength(1, common->outputCount());
tempOutput->setLength(2, oh);
tempOutput->setLength(3, ow);
if (conv2d->quanParameter()) {
type = DataType_DT_INT8;
auto& inputQuantAttr = TensorUtils::getDescribe(tempInput)->quantAttr;
if (nullptr != inputQuantAttr.get()) {
TensorUtils::getDescribe(tempInput)->type = DataType_DT_INT8;
}
auto& outputQuantAttr = TensorUtils::getDescribe(tempOutput)->quantAttr;
if (nullptr != outputQuantAttr.get()) {
TensorUtils::getDescribe(tempOutput)->type = DataType_DT_INT8;
}
}
}
exe.reset(backend->onCreate(info.inputs, info.outputs, op));
if (exe.get() == nullptr) {
exe.reset(backupBackend->onCreate(info.inputs, info.outputs, op));
}
if (nullptr == exe) {
break;
}
// Skip ops whose execution cannot be cloned; they can't be cached here
if (!exe->onClone(nullptr, op, nullptr)) {
exe = nullptr;
break;
}
if (OpParameter_Convolution2D == op_table->main.type) {
op_table->main.AsConvolution2D()->bias.clear();
op_table->main.AsConvolution2D()->weight.clear();
if (nullptr != op_table->main.AsConvolution2D()->symmetricQuan) {
op_table->main.AsConvolution2D()->symmetricQuan->bias.clear();
op_table->main.AsConvolution2D()->symmetricQuan->weight.clear();
}
if (nullptr != op_table->main.AsConvolution2D()->quanParameter) {
op_table->main.AsConvolution2D()->quanParameter->alpha.clear();
op_table->main.AsConvolution2D()->quanParameter->buffer.clear();
}
}
break;
}
default: {
break;
}
}
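// Re-serialize the (possibly weight-stripped) op and key the cloned execution
// by the new Op pointer so the session can find it in executionCache.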
flatbuffers::FlatBufferBuilder opBuilder;
opBuilder.Finish(Op::Pack(opBuilder, op_table.get()));
std::shared_ptr<BufferStorage> buf(new BufferStorage);
buf->storage = opBuilder.ReleaseRaw(buf->allocated_size, buf->offset);
info.op = flatbuffers::GetRoot<Op>(buf->buffer());
if (nullptr != exe) {
// Clone Execution to reset op info
Execution* dstExe;
exe->onClone(exe->backend(), info.op, &dstExe);
std::shared_ptr<Execution> dstExeP(dstExe);
info.executionCache.insert(std::make_pair(info.op, dstExeP));
}
splitOps[i] = buf;
}
return splitOps;
}
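// Return true when `tensor`'s shape differs from `dims` and a resize is needed.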
static bool _reshapeTensor(Tensor* tensor, const Tensor* dims) {
bool dirty = false;
if (tensor->buffer().dimensions != dims->dimensions()) {
dirty = true;
} else {
for (int i = 0; i < dims->dimensions(); ++i) {
if (tensor->buffer().dim[i].extent != dims->length(i)) {
dirty = true;
break;
}
}
}
return dirty;
}
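// Copy `dims`'s shape onto `tensor` (and onto the cached device tensor, if
// any), then flag the session for resize. No-op when the shape is unchanged.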
static void _resizeTensor(Tensor* tensor, const Tensor* dims, Session* session, Schedule::TENSORCACHE* cacheTensor) {
MNN_ASSERT(nullptr != tensor);
bool dirty = _reshapeTensor(tensor, dims);
if (!dirty) {
return;
}
tensor->buffer().dimensions = (int)dims->dimensions();
for (int i = 0; i < dims->dimensions(); ++i) {
tensor->buffer().dim[i].extent = dims->length(i);
tensor->buffer().dim[i].stride = dims->stride(i);
}
if (nullptr != cacheTensor) {
auto t = std::get<1>(*cacheTensor).get();
if (nullptr != t) {
t->buffer().dimensions = (int)dims->dimensions();
for (int i = 0; i < dims->dimensions(); ++i) {
t->buffer().dim[i].extent = dims->length(i);
t->buffer().dim[i].stride = dims->stride(i);
}
std::get<2>(*cacheTensor) = true;
}
}
session->setNeedResize();
}
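// Rebind the module's input/output tensor pointers to the current session and
// reset the per-input copy cache; used after construction and after clone.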
void StaticModule::resetInputOutputs() {
mPrevInputTensor.resize(mResource->mInputs.size());
mInputTensors.resize(mResource->mInputs.size());
auto& pipelineInfo = mSession->getPipelineInfo(0);
for (int i = 0; i < mResource->mInputs.size(); ++i) {
mInputTensors[i] = mSession->getTensor(mResource->mInputs[i]);
auto des = TensorUtils::getDescribe(mInputTensors[i]);
if (des->usage != Tensor::InsideDescribe::CONSTANT && des->usage != Tensor::InsideDescribe::TRAINABLE) {
des->usage = Tensor::InsideDescribe::INPUT;
}
pipelineInfo.first.inputTensorCopyCache.insert(std::make_pair(mInputTensors[i], std::make_tuple(nullptr, nullptr, true, true)));
mPrevInputTensor[i] = nullptr;
}
mOutputTensors.resize(mResource->mOutputFromTensor.size());
for (int i = 0; i < mResource->mOutputFromTensor.size(); ++i) {
mOutputTensors[i] = mSession->getTensor(mResource->mOutputs[mResource->mOutputFromTensor[i]]);
auto des = TensorUtils::getDescribe(mOutputTensors[i]);
if (des->usage == Tensor::InsideDescribe::NORMAL) {
des->usage = Tensor::InsideDescribe::OUTPUT;
}
}
}
StaticModule::StaticModule(std::vector<int> inputs,
std::vector<int> outputs,
std::vector<std::shared_ptr<BufferStorage>>&& buffer,
Schedule::ScheduleInfo&& scheduleInfo,
std::shared_ptr<Schedule::ScheduleInfo> sharedConst,
Session::ModeGroup&& mode,
RuntimeInfo&& rt,
const Module::Config& config
) {
setType("StaticModule");
mResource.reset(new Resource);
mResource->mSharedConst = sharedConst;
mResource->mModes = std::move(mode);
mResource->mBnInfo.user = &mResource->mBnConfig;
mResource->mModes.inputMode = config.shapeMutable ? Interpreter::Session_Input_User : Interpreter::Session_Input_Inside;
mResource->mModes.outputMode = Interpreter::Session_Output_User;
std::shared_ptr<BufferStorage> net_storage;
MNN_ASSERT(1 == scheduleInfo.pipelineInfo.size());
auto& bnCache = scheduleInfo.pipelineInfo[0].first;
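// Create the primary backend from the scheduled runtime; if it is not the CPU
// backend, also create a CPU backend as a fallback for unsupported ops.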
bnCache.cache.first.reset(rt.first[bnCache.info.type]->onCreate(bnCache.info.user));
if (bnCache.cache.first->type() == MNN_FORWARD_CPU) {
bnCache.cache.second = bnCache.cache.first;
} else {
BackendConfig defaultConfig;
defaultConfig.flags = 4;
bnCache.cache.second.reset(rt.second->onCreate(&defaultConfig));
}
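// Optionally rewrite the model buffer with pre-rearranged weights (see
// preRearrangeWeights above); otherwise keep the caller's buffer as-is.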
if (config.rearrange) {
mResource->mBuffer = preRearrangeWeights(scheduleInfo, bnCache.cache.first.get(), bnCache.cache.second.get());
} else {
mResource->mBuffer = std::move(buffer);
}
mResource->mOutputNumbers = (int)outputs.size();
/** Compute:
 std::vector<int> mOutputFromTensor;
 std::vector<std::pair<int, int>> mOutputFromInput;
 */
for (int i = 0; i < outputs.size(); ++i) {
auto& t = outputs[i];
bool fromInput = false;
for (int j = 0; j < inputs.size(); ++j) {
if (inputs[j] == t) {
fromInput = true;
mResource->mOutputFromInput.emplace_back(std::make_pair(i, j));
break;
}
}
if (fromInput) {
continue;
}
mResource->mOutputFromTensor.emplace_back(i);
}
if (mResource->mOutputFromTensor.empty()) {
return;
}
mResource->mUseContentInputs = scheduleInfo.needInputContentForShape;
if (mResource->mUseContentInputs) {
mResource->mModes.inputMode = Interpreter::Session_Input_User;
}
mResource->mInputs = std::move(inputs);
mResource->mInputNeedCPU.resize(mResource->mInputs.size());
for (int i=0; i<mResource->mInputs.size(); ++i) {
mResource->mInputNeedCPU[i] = false;
}
if (mResource->mUseContentInputs) {
for (int i=0; i<mResource->mInputs.size(); ++i) {
auto subT = scheduleInfo.allTensors[mResource->mInputs[i]].get();
if (TensorUtils::getDescribe(subT)->usage == Tensor::InsideDescribe::CONSTANT) {
mResource->mInputNeedCPU[i] = true;
}
}
}
mResource->mOutputs = std::move(outputs);
bool needResize = scheduleInfo.validForResize && mResource->mModes.inputMode == Interpreter::Session_Input_Inside;
mSession.reset(new Session(std::move(scheduleInfo), mResource->mModes, std::move(rt)));
resetInputOutputs();
if (needResize) {
mSession->resize();
}
}
StaticModule::~StaticModule() {
mSession = nullptr;
}
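// Forget previously bound input tensors so the next onForward re-validates
// every entry of the input copy cache.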
void StaticModule::onClearCache() {
if (nullptr != mSession) {
for (int i=0; i<mPrevInputTensor.size(); ++i) {
mPrevInputTensor[i] = nullptr;
}
for (auto& iter : mSession->getPipelineInfo(0).first.inputTensorCopyCache) {
std::get<3>(iter.second) = true;
}
}
}
std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VARP>& inputs) {
AUTOTIME;
std::vector<Express::VARP> outputs(mResource->mOutputNumbers);
for (auto& iter : mResource->mOutputFromInput) {
outputs[iter.first] = inputs[iter.second];
}
if (mResource->mOutputFromTensor.empty()) {
return outputs;
}
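// Make sure every input variable has been computed before its tensor is read.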
Variable::compute(inputs);
#ifdef MNN_DUMP_MEMORY
auto rt = Executor::getRuntime();
auto mem = rt.second->onGetMemoryInMB();
for (auto iter : rt.first) {
if (iter.second.get() != rt.second.get()) {
mem += iter.second->onGetMemoryInMB();
}
}
FUNC_PRINT_ALL(mem, f);
#endif
MNN_ASSERT(inputs.size() == mInputTensors.size());
auto& pipelineInfo = mSession->getPipelineInfo(0);
if (mResource->mModes.inputMode == Interpreter::Session_Input_User) {
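// Session_Input_User: alias or copy each user tensor into the session's input,
// using inputTensorCopyCache to skip work when the same tensor is passed again.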
for (int i = 0; i < inputs.size(); ++i) {
if (nullptr == mInputTensors[i]) {
continue;
}
auto inputTensor = Utils::getTensor(inputs[i]);
Schedule::TENSORCACHE* cacheTensor = nullptr;
if (mPrevInputTensor[i] != inputTensor) {
auto cacheIter = pipelineInfo.first.inputTensorCopyCache.find(mInputTensors[i]);
MNN_ASSERT(cacheIter != pipelineInfo.first.inputTensorCopyCache.end());
cacheTensor = &cacheIter->second;
std::get<3>(cacheIter->second) = true;
mPrevInputTensor[i] = inputTensor;
if (std::get<1>(*cacheTensor) != nullptr) {
if (!WrapExecution::needWrap(inputTensor, TensorUtils::getDescribe(std::get<0>(*cacheTensor))->getBackend())) {
// Copy is no longer needed; reset the cache entry
cacheIter->second = std::make_tuple(nullptr, nullptr, true, true);
}
}
}
auto srcDes = TensorUtils::getDescribe(inputTensor);
auto des = TensorUtils::getDescribe(mInputTensors[i]);
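// A content copy is required when the source carries quant attributes the
// session input lacks, or when shape inference must read the data on CPU
// but the source lives on a device.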
bool needCopy = false;
if (nullptr != srcDes->quantAttr.get()) {
if (nullptr == des->quantAttr.get()) {
needCopy = true;
}
}
if (mResource->mInputNeedCPU[i]) {
if (0 != inputTensor->buffer().device) {
needCopy = true;
}
}
if (srcDes->tensorArrayAttr.get() != nullptr) {
// TensorArray inputs carry no tensor content; nothing to copy
needCopy = false;
}
bool needMalloc;
if (needCopy) {
auto srcPtr = (uint8_t*)inputs[i]->readMap<uint8_t>();
needMalloc = mInputTensors[i]->buffer().host != srcPtr;
mInputTensors[i]->buffer().host = srcPtr;
mInputTensors[i]->buffer().device = 0;
des->setBackend(pipelineInfo.first.cache.second.get());
if (nullptr == srcDes->quantAttr.get()) {
// The source may need a device copy; cache it so the copy can be reused
auto cacheIter = pipelineInfo.first.inputTensorCopyCache.find(mInputTensors[i]);
MNN_ASSERT(cacheIter != pipelineInfo.first.inputTensorCopyCache.end());
std::get<0>(cacheIter->second) = inputTensor;
std::get<1>(cacheIter->second) = nullptr;
std::get<2>(cacheIter->second) = false;
std::get<3>(cacheIter->second) = false;
}
} else {
needMalloc = TensorUtils::refTensorContent(mInputTensors[i], inputTensor);
}
des->type = srcDes->type;
des->dimensionFormat = srcDes->dimensionFormat;
des->tensorArrayAttr = srcDes->tensorArrayAttr;
mInputTensors[i]->buffer().type = inputTensor->buffer().type;
_resizeTensor(mInputTensors[i], inputTensor, mSession.get(), cacheTensor);
if (needMalloc) {
mSession->setNeedMalloc();
}
}
if (mResource->mUseContentInputs) {
mSession->setNeedResize();
}
mSession->resize();
} else {
// Session_Input_Inside: first resize the session inputs to the user shapes
for (int i = 0; i < inputs.size(); ++i) {
if (nullptr == mInputTensors[i]) {
continue;
}
auto inputTensor = Utils::getTensor(inputs[i]);
auto srcDes = TensorUtils::getDescribe(inputTensor);
auto des = TensorUtils::getDescribe(mInputTensors[i]);
des->dimensionFormat = srcDes->dimensionFormat;
mInputTensors[i]->buffer().type = inputTensor->buffer().type;
_resizeTensor(mInputTensors[i], inputTensor, mSession.get(), nullptr);
}
mSession->resize();
// Then copy the user content into the session-owned buffers
for (int i = 0; i < inputs.size(); ++i) {
if (nullptr == mInputTensors[i]) {
continue;
}
auto inputTensor = Utils::getTensor(inputs[i]);
mInputTensors[i]->copyFromHostTensor(inputTensor);
}
}
#ifdef LOG_VERBOSE
for (auto& inputTensor : mInputTensors) {
MNN_PRINT("static module, before run, input ptr:%p, hostPtr:%p, shape:", inputTensor, inputTensor->host<void>());
inputTensor->printShape();
MNN_PRINT("\n");
}
MNN_PRINT("staticmodule before run\n");
#endif
ErrorCode code;
if (mResource->mModes.callBackMode == Interpreter::Session_Debug) {
auto globalExecutor = ExecutorScope::Current();
auto debug = globalExecutor->getDebugTools();
if (debug->after != nullptr && debug->before != nullptr) {
code = mSession->runWithCallBack(debug->before, debug->after);
} else {
code = mSession->run();
}
} else {
code = mSession->run();
}
if (NO_ERROR != code) {
return {};
}
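// Wrap each session output in a fresh expression and hold a reference to the
// backend that owns its memory, keeping the data alive past this call.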
for (int i = 0; i < mOutputTensors.size(); ++i) {
auto tensor = Tensor::clone(mOutputTensors[i]);
outputs[mResource->mOutputFromTensor[i]] = Express::Variable::create(Express::Expr::create(tensor, true));
auto backend = TensorUtils::getDescribe(tensor)->getBackend();
if (backend == pipelineInfo.first.cache.first.get()) {
outputs[mResource->mOutputFromTensor[i]]->expr().first->inside()->mHoldBackend = pipelineInfo.first.cache.first;
} else if (backend == pipelineInfo.first.cache.second.get()) {
outputs[mResource->mOutputFromTensor[i]]->expr().first->inside()->mHoldBackend = pipelineInfo.first.cache.second;
} else if (backend == mResource->mSharedConst->defaultBackend.get()) {
outputs[mResource->mOutputFromTensor[i]]->expr().first->inside()->mHoldBackend = mResource->mSharedConst->defaultBackend;
} else if (backend == mResource->mSharedConst->constReplaceBackend.get()) {
outputs[mResource->mOutputFromTensor[i]]->expr().first->inside()->mHoldBackend = mResource->mSharedConst->constReplaceBackend;
}
}
#ifdef MNN_INTERNAL_ENABLED
auto glo = ExecutorScope::Current();
float flops = 0.0f;
mSession->getInfo(Interpreter::FLOPS, &flops);
glo->getDebugTools()->flops += flops;
#endif
return outputs;
}
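// Clones share the immutable Resource; only the Session (and its tensors) is
// duplicated, using the current Executor's runtime.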
Module* StaticModule::clone(CloneContext* ctx) const {
StaticModule* module(new StaticModule);
module->mResource = mResource;
if (mResource->mOutputFromTensor.empty()) {
return this->cloneBaseTo(ctx, module);
}
// TODO: If the RuntimeManager's runtime differs from the Executor's runtime, the cloned session may copy incorrectly
auto rt = Executor::getRuntime();
module->mSession.reset(mSession->clone(std::move(rt), mResource->mSharedConst));
module->resetInputOutputs();
return this->cloneBaseTo(ctx, module);
}
} // namespace Express
} // namespace MNN