mirror of https://github.com/alibaba/MNN.git
Synchronize internal master to Github
This commit is contained in:
parent 6b0c16f24f
commit ab711d484c
@@ -512,6 +512,8 @@
#if defined(_MSC_VER)
# define CL_HPP_DEFINE_STATIC_MEMBER_ __declspec(selectany)
#elif defined(__MINGW32__)
# define CL_HPP_DEFINE_STATIC_MEMBER_ __attribute__((selectany))
#else
# define CL_HPP_DEFINE_STATIC_MEMBER_ __attribute__((weak))
#endif // !_MSC_VER

@@ -368,6 +368,9 @@ list(APPEND MNN_EXPR_PUB_HDRS "${CMAKE_CURRENT_SOURCE_DIR}/include/MNN/expr/Math
list(APPEND MNN_EXPR_PUB_HDRS "${CMAKE_CURRENT_SOURCE_DIR}/include/MNN/expr/NeuralNetWorkOp.hpp")
list(APPEND MNN_EXPR_PUB_HDRS "${CMAKE_CURRENT_SOURCE_DIR}/include/MNN/expr/Optimizer.hpp")
list(APPEND MNN_EXPR_PUB_HDRS "${CMAKE_CURRENT_SOURCE_DIR}/include/MNN/expr/Executor.hpp")
list(APPEND MNN_EXPR_PUB_HDRS "${CMAKE_CURRENT_SOURCE_DIR}/include/MNN/expr/NN.hpp")
list(APPEND MNN_EXPR_PUB_HDRS "${CMAKE_CURRENT_SOURCE_DIR}/include/MNN/expr/Module.hpp")
list(APPEND MNN_EXPR_PUB_HDRS "${CMAKE_CURRENT_SOURCE_DIR}/include/MNN/expr/NeuralNetWorkOp.hpp")

set(MNN_DEPS "")
set(MNN_EXTRA_DEPENDS "")

@@ -552,7 +555,7 @@ if(CMAKE_SYSTEM_NAME MATCHES "^Linux")
# https://stackoverflow.com/questions/23250863/difference-between-pthread-and-lpthread-while-compiling
target_link_libraries(MNN PUBLIC -pthread dl)
elseif(CMAKE_SYSTEM_NAME MATCHES "^Android")
target_link_libraries(MNN PUBLIC log android m)
target_link_libraries(MNN PUBLIC log m)
else()
endif()
if (NOT MNN_BUILD_SHARED_LIBS)

@@ -1,6 +1,6 @@
Pod::Spec.new do |s|
s.name = "MNN"
s.version = "1.1.0"
s.version = "1.1.1"
s.summary = "MNN"

s.description = <<-DESC

@@ -42,6 +42,7 @@ using namespace MNN;
input_2 --> region_2 --/

3. This example read a json file and construct some Rasters and compute.
Example input file at $<MNN-ROOT>/resource/exec/rasterDemo_transpose.json
The input json file format is as below:
{
"inputs" : [
Binary file not shown. (Before size: 104 KiB, after size: 341 KiB)
@@ -126,7 +126,7 @@ void Expr::_addLinkForInputs(EXPRP expr) {
}
}
}
EXPRP Expr::create(Variable::Info&& info, const void* ptr, VARP::InputType type, bool copy) {
EXPRP Expr::create(Variable::Info&& info, const void* ptr, VARP::InputType type, Expr::MemoryType memtype) {
EXPRP expr(new Expr(1));
expr->mOp = nullptr;
auto originPtr = ptr;

@@ -144,7 +144,7 @@ EXPRP Expr::create(Variable::Info&& info, const void* ptr, VARP::InputType type,
// VARP::TRAINABLE
TensorUtils::getDescribe(expr->mInside->mOutputTensors[0])->usage = Tensor::InsideDescribe::TRAINABLE;
}
if (dstInfo.size > 0 && copy) {
if (dstInfo.size > 0 && memtype == COPY) {
auto res = Utils::allocMemoryForHostTensor(expr->mInside->mOutputTensors[0]);
if (!res) {
MNN_ASSERT(false);

@@ -160,11 +160,13 @@ EXPRP Expr::create(Variable::Info&& info, const void* ptr, VARP::InputType type,
return expr;
}
expr->mInside->mContentDirty = false;
if (copy) {
if (memtype == COPY) {
::memcpy(expr->mInside->mOutputTensors[0]->buffer().host, originPtr, dstInfo.size * dstInfo.type.bytes());
} else {
TensorUtils::getDescribe(expr->mInside->mOutputTensors[0])->memoryType = Tensor::InsideDescribe::MEMORY_OUTSIDE;
expr->mInside->mOutputTensors[0]->buffer().host = (uint8_t*)originPtr;
if (memtype == REF) {
TensorUtils::getDescribe(expr->mInside->mOutputTensors[0])->memoryType = Tensor::InsideDescribe::MEMORY_OUTSIDE;
}
}
return expr;
}

@@ -813,7 +815,6 @@ void Variable::save(const std::vector<VARP>& vars, NetT* dest) {
} else if (info.type.code == halide_type_int && info.type.bits == 8) {
blob->dataType = DataType_DT_INT8;
blob->int8s.resize(info.size);
auto pptr = (int8_t *)ptr;
::memcpy(blob->int8s.data(), ptr, info.size * sizeof(int8_t));
} else if (info.type.code == halide_type_uint && info.type.bits == 8) {
blob->dataType = DataType_DT_UINT8;
@@ -115,7 +115,7 @@ VARP _InnerProduct(std::vector<float>&& weight, std::vector<float>&& bias, VARP
if(!bias.empty()) {
ipParam->biasTerm = 1;
}
ipParam->weightSize = weight.size();
ipParam->weightSize = (int)weight.size();

ipParam->weight = std::move(weight);
ipParam->bias = std::move(bias);

@@ -118,7 +118,7 @@ void Module::clearCache() {
this->onClearCache();
}

Module* Module::load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const char* fileName, bool dynamic) {
Module* Module::load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const char* fileName, const Module::Config* config) {
AutoStorage<uint8_t> buffer;
{
FileLoader loader(fileName);

@@ -135,11 +135,15 @@ Module* Module::load(const std::vector<std::string>& inputs, const std::vector<s
return {};
}
}
return load(inputs, outputs, buffer.get(), buffer.size(), dynamic);
return load(inputs, outputs, buffer.get(), buffer.size(), config);
}

Module* Module::load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, bool dynamic) {
return PipelineModule::load(inputs, outputs, buffer, length, dynamic);
Module* Module::load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, const Module::Config* config) {
return PipelineModule::load(inputs, outputs, buffer, length, config);
}

Module* Module::extract(std::vector<Express::VARP> inputs, std::vector<Express::VARP> outputs, bool fortrain, const std::map<std::string, SubGraph>& subGraph) {
return PipelineModule::extract(inputs, outputs, fortrain, subGraph);
}

EXPRP Module::CloneContext::getOrClone(EXPRP expr) {
@@ -396,7 +396,7 @@ void PipelineModule::onClearCache() {
// Do nothing
}

static std::map<std::string, SubGraph> _createSubGraph(const MNN::Net* net, bool dynamic) {
static std::map<std::string, SubGraph> _createSubGraph(const MNN::Net* net, const Module::Config* config) {
std::map<std::string, SubGraph> subGraphMap;
auto subGraphs = net->subgraphs();
if (nullptr == subGraphs) {

@@ -426,10 +426,10 @@ static std::map<std::string, SubGraph> _createSubGraph(const MNN::Net* net, bool
flatbuffers::FlatBufferBuilder builder(1024);
auto offset = Net::Pack(builder, _tempNet.get());
builder.Finish(offset);
if (dynamic) {
submodule.reset(PipelineModule::load(subInputs, subOutputs, (const uint8_t*)builder.GetBufferPointer(), builder.GetSize(), dynamic));
if (config->dynamic) {
submodule.reset(PipelineModule::load(subInputs, subOutputs, (const uint8_t*)builder.GetBufferPointer(), builder.GetSize(), config));
} else {
submodule.reset(new StaticModule((const uint8_t*)builder.GetBufferPointer(), builder.GetSize(), subInputs, subOutputs));
submodule.reset(new StaticModule((const uint8_t*)builder.GetBufferPointer(), builder.GetSize(), subInputs, subOutputs, !config->shapeMutable));
}
if (graph->name() != nullptr) {
submodule->setName(graph->name()->str());

@@ -569,6 +569,11 @@ static std::vector<SubModuleInfo> _createSubModuleInfo(const MNN::Net* net, cons
break;
}
}
if (!find) {
if (net->tensorName() != nullptr) {
MNN_PRINT("%d tensor [ %s ] is input but not found\n", index, net->tensorName()->GetAsString(index)->c_str());
}
}
MNN_ASSERT(find);
}
}

@@ -578,7 +583,7 @@ static std::vector<SubModuleInfo> _createSubModuleInfo(const MNN::Net* net, cons
return submodule;
}

static Module* _createSubModule(const MNN::Net* net, const SubModuleInfo& info, const std::map<std::string, SubGraph>& subs) {
static Module* _createSubModule(const MNN::Net* net, const SubModuleInfo& info, const std::map<std::string, SubGraph>& subs, bool shapeFix) {
if (1 == info.opList.size()) {
auto op = net->oplists()->GetAs<Op>(info.opList[0]);
if (OpType_If == op->type()) {

@@ -622,25 +627,29 @@ static Module* _createSubModule(const MNN::Net* net, const SubModuleInfo& info,
auto offset = Net::Pack(builder, _tempNet.get());
builder.Finish(offset);
_tempNet.reset();
return new StaticModule((const uint8_t*)builder.GetBufferPointer(), builder.GetSize(), inputNames, outputNames);
return new StaticModule((const uint8_t*)builder.GetBufferPointer(), builder.GetSize(), inputNames, outputNames, shapeFix);
}

Module* PipelineModule::load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, bool dynamic) {
Module* PipelineModule::load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, const Module::Config* config) {
// Create Subgraph
auto net = GetNet(buffer);
Module::Config defaultConfig;
if (nullptr == config) {
config = &defaultConfig;
}
auto subGraphs = net->subgraphs();
if (nullptr == net->oplists() || nullptr == net->tensorName()) {
MNN_ERROR("Invalid net, for null oplist or tensorName\n");
return nullptr;
}
if (!dynamic) {
if (!config->dynamic) {
if (nullptr == subGraphs) {
// Has no control flow, can just use static module
return new StaticModule(buffer, length, inputs, outputs);
return new StaticModule(buffer, length, inputs, outputs, !config->shapeMutable);
}
}
auto subGraphMap = _createSubGraph(net, dynamic);
if (dynamic) {
auto subGraphMap = _createSubGraph(net, config);
if (config->dynamic) {
// For dynamic mode
auto varMaps = Variable::loadMap(buffer, length);
std::vector<VARP> inputVars(inputs.size());

@@ -686,7 +695,7 @@ Module* PipelineModule::load(const std::vector<std::string>& inputs, const std::
auto subModulesInfo = _createSubModuleInfo(net, inputIndexes, outputIndexes);
std::vector<std::shared_ptr<Module>> subModules(subModulesInfo.size());
for (int i=0; i<subModulesInfo.size(); ++i) {
subModules[i].reset(_createSubModule(net, subModulesInfo[i], subGraphMap));
subModules[i].reset(_createSubModule(net, subModulesInfo[i], subGraphMap, !config->shapeMutable));
}
auto result = new PipelineModule;
/**
@@ -17,11 +17,8 @@ namespace Express {
class MNN_PUBLIC PipelineModule : public Module {
public:
typedef std::function<std::pair<std::vector<int>, std::shared_ptr<Module>>(Express::EXPRP)> Transformer;
static Module* load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, bool dynamic = false);
static Module* load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, const Module::Config* config = nullptr);
static Module* extract(std::vector<Express::VARP> inputs, std::vector<Express::VARP> outputs, bool fortrain, const std::map<std::string, SubGraph>& subGraph = {});
static Module* extractOrigin(std::vector<Express::VARP> inputs, std::vector<Express::VARP> outputs, bool fortrain) {
return extract(inputs, outputs, fortrain);
}
static bool turnQuantize(Module* module, const int bits = 8, NN::FeatureScaleStatMethod featureScaleStatMethod = NN::PerTensor, NN::ScaleUpdateMethod scaleUpdateMethod = NN::MovingAverage);
void toTrainQuant(const int bits = 8, NN::FeatureScaleStatMethod featureScaleStatMethod = NN::PerTensor,
NN::ScaleUpdateMethod scaleUpdateMethod = NN::MovingAverage);

@@ -14,6 +14,9 @@
#include <MNN/expr/Executor.hpp>
#include <MNN/AutoTime.hpp>
#include <MNN/expr/ExecutorScope.hpp>
#include "core/MNNMemoryUtils.h"
#include "Utils.hpp"

namespace MNN {
namespace Express {
StaticModule::StaticModule(const void* buffer, size_t length, const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, bool shapeFix) : mInputs(inputs), mOutputs(outputs) {

@@ -53,6 +56,7 @@ StaticModule::StaticModule(const void* buffer, size_t length, const std::vector<
} else {
mNet->setSessionMode(Interpreter::Session_Input_User);
}

auto rt = Express::ExecutorScope::Current()->getRuntime();
// TODO: Add Config
ScheduleConfig config;

@@ -107,9 +111,16 @@ std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VA
mNet->resizeSession(mSession);
if (mShapeFix) {
for (int i=0; i<inputs.size(); ++i) {
auto srcPtr = inputs[i]->readMap<void>();
// For Shape only usage input, don't alloc memory
if (nullptr != mInputTensors[i]->host<void>()) {
::memcpy(mInputTensors[i]->host<void>(), inputs[i]->readMap<void>(), mInputTensors[i]->size());
if (nullptr != mInputTensors[i]->host<void>() && nullptr != srcPtr) {
::memcpy(mInputTensors[i]->host<void>(), srcPtr, mInputTensors[i]->size());
} else if (mInputTensors[i]->deviceId() != 0) {
// Other backend
// TODO: Non-copy method
auto exprInfo = inputs[i]->expr();
auto inside = exprInfo.first->inside();
mInputTensors[i]->copyFromHostTensor(inside->mOutputTensors[exprInfo.second]);
}
}
}

@@ -132,8 +143,9 @@ std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VA
#endif
for (int i=0; i<mOutputTensors.size(); ++i) {
Express::Variable::Info info;
info.dim = mOutputTensors[i]->shape();
info.type = mOutputTensors[i]->getType();
auto currentTensor = mOutputTensors[i];
info.dim = currentTensor->shape();
info.type = currentTensor->getType();
auto format = TensorUtils::getDescribe(mOutputTensors[i])->dimensionFormat;
info.order = Express::NHWC;
if (format == MNN_DATA_FORMAT_NCHW) {

@@ -141,8 +153,14 @@ std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VA
} else if (format == MNN_DATA_FORMAT_NC4HW4) {
info.order = Express::NC4HW4;
}
outputs[mOutputFromTensor[i]] = Express::Variable::create(Express::Expr::create(std::move(info), mOutputTensors[i]->host<void>(), Express::VARP::CONSTANT, true), 0);
//::memcpy(outputs[i]->writeMap<void>(), mOutputTensors[i]->host<void>(), mOutputTensors[i]->size());
if (currentTensor->buffer().device != 0) {
std::shared_ptr<Tensor> tmpTensor(new Tensor(currentTensor, Tensor::CAFFE, false));
tmpTensor->buffer().host = (uint8_t*)MNNMemoryAllocAlign(currentTensor->size(), MNN_MEMORY_ALIGN_DEFAULT);
currentTensor->copyToHostTensor(tmpTensor.get());
outputs[mOutputFromTensor[i]] = Express::Variable::create(Express::Expr::create(std::move(info), tmpTensor->host<void>(), Express::VARP::CONSTANT, Expr::MemoryType::MOVE), 0);
} else {
outputs[mOutputFromTensor[i]] = Express::Variable::create(Express::Expr::create(std::move(info), mOutputTensors[i]->host<void>(), Express::VARP::CONSTANT, Expr::MemoryType::REF), 0);
}
}
return outputs;
}
@@ -127,7 +127,6 @@ public:
/**
* @brief The API should be called before creating a session.
* @param mode session mode
* @return void
*/
void setSessionMode(SessionMode mode);

@@ -137,14 +136,13 @@ public:
* After createSession, try to save cache to file.
* @param cacheFile cache file name
* @param keySize the first `keySize` bytes used as the key to check if the `cacheFile` exists.
* @return void
*/
void setCacheFile(const char* cacheFile, size_t keySize = 128);

public:
/**
* @brief create runtimeInfo separately with schedule config.
* @param config session schedule configs.
* @param configs session schedule configs.
*/
static RuntimeInfo createRuntime(const std::vector<ScheduleConfig>& configs);

@@ -275,7 +273,7 @@ public:
* @brief get session info
* @param session given session.
* @param code given info code.
* @param void* given info ptr, see SessionInfoCode for detail
* @param ptr given info ptr, see SessionInfoCode for detail
* @return true if support the code, false otherwise.
*/
bool getSessionInfo(const Session* session, SessionInfoCode code, void* ptr);
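The hunks above only touch doc-comment wording in Interpreter.hpp, but since they document the session API, here is a minimal, hedged usage sketch of the methods involved. The model path, thread count, and the MEMORY info code are assumptions for illustration, not taken from this diff:

```cpp
#include <memory>
#include <MNN/Interpreter.hpp>

int main() {
    // "model.mnn" is a placeholder path.
    std::shared_ptr<MNN::Interpreter> net(MNN::Interpreter::createFromFile("model.mnn"));
    if (nullptr == net) {
        return 1;
    }
    // setSessionMode must be called before createSession (see the doc comment above).
    net->setSessionMode(MNN::Interpreter::Session_Input_User);
    // Cache compiled kernels; the first 128 bytes of the file act as the cache key by default.
    net->setCacheFile("model.cache");

    MNN::ScheduleConfig config;
    config.numThread = 4;
    // createRuntime builds a RuntimeInfo that can be shared between interpreters.
    auto runtime = MNN::Interpreter::createRuntime({config});
    auto session = net->createSession(config);

    // getSessionInfo: MEMORY is assumed to be a valid SessionInfoCode; check Interpreter.hpp.
    float memoryMB = 0.0f;
    net->getSessionInfo(session, MNN::Interpreter::MEMORY, &memoryMB);
    return 0;
}
```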
@@ -14,7 +14,7 @@
namespace MNN {
namespace Express {

struct ExecutorScope final {
struct MNN_PUBLIC ExecutorScope final {
public:
ExecutorScope() = delete;
explicit ExecutorScope(const ExecutorScope&) = delete;
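ExecutorScope is now exported with MNN_PUBLIC, so code built outside libMNN (like StaticModule above, which calls ExecutorScope::Current()->getRuntime()) can use it. A rough sketch of the intended pattern follows; the Executor::newExecutor factory and the scope constructor taking a shared_ptr are assumptions based on the Express API, not shown in this diff:

```cpp
#include <MNN/MNNForwardType.h>
#include <MNN/expr/Executor.hpp>
#include <MNN/expr/ExecutorScope.hpp>

void runWithPrivateExecutor() {
    MNN::BackendConfig backendConfig;
    // Assumed factory call: a CPU executor with 4 threads.
    std::shared_ptr<MNN::Express::Executor> executor =
        MNN::Express::Executor::newExecutor(MNN_FORWARD_CPU, backendConfig, 4);
    // While `scope` is alive, ExecutorScope::Current() returns this executor,
    // so modules and expressions created here pick up its runtime.
    MNN::Express::ExecutorScope scope(executor);
    // ... build or run Modules / expressions here ...
}
```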
@@ -173,7 +173,12 @@ private:
class MNN_PUBLIC Expr {
public:
struct Inside;
static EXPRP create(Variable::Info&& info, const void* ptr, VARP::InputType type, bool copy = true);
enum MemoryType {
COPY,
MOVE,
REF
};
static EXPRP create(Variable::Info&& info, const void* ptr, VARP::InputType type, MemoryType copy = COPY);
static EXPRP create(const OpT* op, std::vector<VARP> inputs, int outputSize = 1);
static EXPRP create(std::pair<std::shared_ptr<char>, int> extra, std::vector<VARP>&& inputs, int outputSize = 1);
static EXPRP create(std::unique_ptr<OpT>&& op, std::vector<VARP> inputs, int outputSize = 1) {
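The new MemoryType argument replaces the old bool copy flag. Based on the Expr.cpp and StaticModule.cpp changes earlier in this commit, COPY (the default) memcpys the data, REF points the output tensor at external memory (marked MEMORY_OUTSIDE), and MOVE hands the buffer over to the expression. A minimal sketch of wrapping an existing host buffer without copying; the helper and its arguments are hypothetical:

```cpp
#include <MNN/expr/Expr.hpp>
using namespace MNN::Express;

// Wrap an existing float buffer as a constant VARP, mirroring the pattern
// used in StaticModule::onForward above. The caller keeps ownership of hostPtr.
VARP wrapHostBuffer(float* hostPtr, const std::vector<int>& shape) {
    Variable::Info info;
    info.dim   = shape;
    info.order = NHWC;
    info.type  = halide_type_of<float>();
    // REF: reference hostPtr instead of copying it; use MOVE to transfer ownership,
    // or the default COPY to keep the old copying behaviour.
    return Variable::create(
        Expr::create(std::move(info), hostPtr, VARP::CONSTANT, Expr::MemoryType::REF), 0);
}
```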
@@ -226,14 +231,6 @@ public:
return mValid;
}

void setEntry(const std::vector<VARP>& entries) {
mEntries = entries;
}

const std::vector<VARP>& getEntry() const {
return mEntries;
}

private:
static void _addLinkForInputs(EXPRP expr);

@@ -254,9 +251,6 @@ private:
bool mVisited = false;
std::vector<WeakEXPRP> mTo;

// Only the enter input has entries, and it helps to get info for enter
// input expression.
std::vector<VARP> mEntries;
};
} // namespace Express
} // namespace MNN

@@ -16,6 +16,7 @@

namespace MNN {
namespace Express {
struct SubGraph;
class MNN_PUBLIC Module {
public:
Module() = default;
@@ -45,8 +46,17 @@ public:

void setParameter(Express::VARP parameter, int index);
static Module* createEmpty(const std::vector<Express::VARP>& parameters);
static Module* load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, bool dynamic = false);
static Module* load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const char* fileName, bool dynamic = false);

struct Config {
// Load module as dynamic, default static
bool dynamic = false;

// for static mode: if the shape is mutable, set true; otherwise set false to avoid calling resizeSession frequently
bool shapeMutable = true;
};
static Module* load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, const Config* config = nullptr);
static Module* load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const char* fileName, const Config* config = nullptr);
static Module* extract(std::vector<Express::VARP> inputs, std::vector<Express::VARP> outputs, bool fortrain, const std::map<std::string, SubGraph>& subGraph = {});

static Module* clone(const Module* module, const bool shareParams = false);
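The old bool dynamic parameter of Module::load is replaced by this Config struct. A minimal usage sketch based on the signatures above; the model path, tensor names, and chosen flag values are hypothetical:

```cpp
#include <memory>
#include <MNN/expr/Module.hpp>
using namespace MNN::Express;

int main() {
    Module::Config config;
    config.dynamic      = false; // static execution, as before
    config.shapeMutable = false; // fixed input shapes: lets StaticModule skip repeated resizeSession calls
    std::shared_ptr<Module> net(
        Module::load({"input"}, {"output"}, "model.mnn", &config));
    // Passing nullptr (or omitting the argument) keeps the defaults, matching the old behaviour.
    return net ? 0 : 1;
}
```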
@@ -14,7 +14,7 @@

#include <MNN/Interpreter.hpp> // Backend
#include <MNN/Tensor.hpp>
#include <Tensor_generated.h>
#include "Tensor_generated.h"

namespace MNN {
namespace plugin {

@@ -1,6 +1,3 @@
# ./package_scripts/linux/build_lib.sh -o MNN-CPU/lib
# ./package_scripts/linux/build_lib.sh -o MNN-CPU-OPENCL/lib -b

# MNN
# |--- Debug
# |   |--- libMNN.a

@@ -1,6 +1,3 @@
# ./package_scripts/linux/build_tools.sh -o MNN-CPU/tools
# ./package_scripts/linux/build_tools.sh -o MNN-CPU-OPENCL/tools -b

set -e

usage() {

@@ -1,11 +1,9 @@
# ./package_scripts/linux/build_whl.sh -o MNN-CPU/py_whl
# ./package_scripts/linux/build_whl.sh -o MNN-CPU-OPENCL/py_whl -b

set -e

usage() {
echo "Usage: $0 -o path [-b]"
echo -e "\t-o package files output directory"
echo -e "\t-v MNN dist version"
echo -e "\t-b opencl backend"
exit 1
}

@@ -13,6 +11,7 @@ usage() {
while getopts "o:v:hb" opt; do
case "$opt" in
o ) path=$OPTARG ;;
v ) mnn_version=$OPTARG ;;
b ) opencl=true ;;
h|? ) usage ;;
esac

@@ -38,7 +37,7 @@ rm -rf wheelhouse && mkdir wheelhouse
#Compile wheels
for PYBIN in /opt/python/*/bin; do
"${PYBIN}/pip" install -U numpy
"${PYBIN}/python" setup.py bdist_wheel
"${PYBIN}/python" setup.py bdist_wheel --version $mnn_version
done

# Bundle external shared libraries into the wheels

@@ -1,6 +1,3 @@
# ./package_scripts/mac/build_lib.sh -o MNN-CPU/lib
# ./package_scripts/mac/build_lib.sh -o MNN-CPU-OPENCL/lib -b

# MNN
# |--- Debug
# |   |--- Dynamic

@@ -1,6 +1,3 @@
# ./package_scripts/mac/build_tools.sh -o MNN-CPU/tools
# ./package_scripts/mac/build_tools.sh -o MNN-CPU-OPENCL/tools -b

set -e

usage() {

@@ -1,22 +1,21 @@
# ./package_scripts/mac/build_whl.sh -o MNN-CPU/py_whl -v 2.7.17,3.5.7,3.6.9,3.7.4,3.8.0
# ./package_scripts/mac/build_whl.sh -o MNN-CPU-OPENCL/py_whl -v 2.7.17,3.5.7,3.6.9,3.7.4,3.8.0 -b

set -e

usage() {
echo "Usage: $0 -o path -v python_versions [-b]"
echo -e "\t-o package files output directory"
echo -e "\t-v python versions in pyenv"
echo -e "\t-p python versions in pyenv"
echo -e "\t-v MNN dist version"
echo -e "\t-b opencl backend"
exit 1
}

while getopts "o:v:hb" opt; do
while getopts "o:p:v:b" opt; do
case "$opt" in
o ) path=$OPTARG ;;
v ) IFS="," read -a python_versions <<< $OPTARG ;;
p ) IFS="," read -a python_versions <<< $OPTARG ;;
v ) mnn_version=$OPTARG ;;
b ) opencl=true ;;
h|? ) usage ;;
* ) usage ;;
esac
done

@@ -38,7 +37,7 @@ pushd pymnn/pip_package
rm -rf dist && mkdir dist
for env in $python_versions; do
pyenv global $env
python build_wheel.py
python build_wheel.py --version $mnn_version
done
cp dist/* $PACKAGE_PATH

@@ -1,8 +1,3 @@
# .\package_scripts\win\build_lib.ps1 -path MNN-CPU/lib/x64
# .\package_scripts\win\build_lib.ps1 -path MNN-CPU/lib/x86
# .\package_scripts\win\build_lib.ps1 -path MNN-CPU-OPENCL/lib/x64 -opencl
# .\package_scripts\win\build_lib.ps1 -path MNN-CPU-OPENCL/lib/x86 -opencl

# MNN
# |-- Debug
# |   |--- MD

@@ -1,8 +1,3 @@
# .\package_scripts\win\build_tools.ps1 -path MNN-CPU/tools/x64
# .\package_scripts\win\build_tools.ps1 -path MNN-CPU/tools/x86
# .\package_scripts\win\build_tools.ps1 -path MNN-CPU-OPENCL/tools/x64 -opencl
# .\package_scripts\win\build_tools.ps1 -path MNN-CPU-OPENCL/tools/x86 -opencl

Param(
[Parameter(Mandatory=$true)][String]$path,
[Switch]$opencl

@@ -1,8 +1,5 @@
# .\package_scripts\win_pymm_package.ps1 -path MNN-CPU/py_whl/x64 -pyenvs "2.7.17,3.5.4,2.6.8,3.7.7,3.8.2"
# .\package_scripts\win_pymm_package.ps1 -x86 -path MNN-CPU/py_whl/x86 -pyenvs "2.7.17-win32,3.5.4-win32,2.6.8-win32,3.7.7-win32,3.8.2-win32"
# .\package_scripts\win_pymm_package.ps1 -path MNN-CPU-OPENCL/py_whl/x64 -pyenvs "2.7.17,3.5.4,2.6.8,3.7.7,3.8.2"
# .\package_scripts\win_pymm_package.ps1 -x86 -path MNN-CPU-OPENCL/py_whl/x86 -pyenvs "2.7.17-win32,3.5.4-win32,2.6.8-win32,3.7.7-win32,3.8.2-win32"
Param(
[Parameter(Mandatory=$true)][String]$version,
[Parameter(Mandatory=$true)][String]$pyenvs,
[Parameter(Mandatory=$true)][String]$path,
[Switch]$x86,

@@ -15,7 +12,7 @@ $python_versions = $pyenvs.Split(",")
Remove-Item $path -Recurse -ErrorAction Ignore
mkdir -p $path
$PACKAGE_PATH = $(Resolve-Path $path).Path
$ARGS = ""
$ARGS = "--version $version"
if ($x86) {
$ARGS = " --x86"
}
@@ -7,6 +7,10 @@
objects = {

/* Begin PBXBuildFile section */
11A01A06258785EA00745FA7 /* MNNVectorTop1Int32.S in Sources */ = {isa = PBXBuildFile; fileRef = 11A01A04258785EA00745FA7 /* MNNVectorTop1Int32.S */; };
11A01A07258785EA00745FA7 /* MNNVectorTop1Float.S in Sources */ = {isa = PBXBuildFile; fileRef = 11A01A05258785EA00745FA7 /* MNNVectorTop1Float.S */; };
11A01A0C258785FB00745FA7 /* MNNVectorTop1Float.S in Sources */ = {isa = PBXBuildFile; fileRef = 11A01A0A258785FB00745FA7 /* MNNVectorTop1Float.S */; };
11A01A0D258785FB00745FA7 /* MNNVectorTop1Int32.S in Sources */ = {isa = PBXBuildFile; fileRef = 11A01A0B258785FB00745FA7 /* MNNVectorTop1Int32.S */; };
1F501F7F2397BA5B004E8721 /* HalideRuntime.h in Headers */ = {isa = PBXBuildFile; fileRef = 1F501F722397BA5A004E8721 /* HalideRuntime.h */; settings = {ATTRIBUTES = (Public, ); }; };
1F501F802397BA5B004E8721 /* MNNDefine.h in Headers */ = {isa = PBXBuildFile; fileRef = 1F501F732397BA5A004E8721 /* MNNDefine.h */; settings = {ATTRIBUTES = (Public, ); }; };
1F501F812397BA5B004E8721 /* AutoTime.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 1F501F742397BA5A004E8721 /* AutoTime.hpp */; settings = {ATTRIBUTES = (Public, ); }; };

@@ -45,6 +49,7 @@
4829A2DE23CC26AE00623BF5 /* ReverseSequenceTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4829A2D323CC26AD00623BF5 /* ReverseSequenceTest.cpp */; };
4829A2DF23CC26AE00623BF5 /* ReplaceTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4829A2D423CC26AD00623BF5 /* ReplaceTest.cpp */; };
4829A2E023CC26AE00623BF5 /* PaddingTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4829A2D523CC26AD00623BF5 /* PaddingTest.cpp */; };
4836CEE5257744120068F6CE /* ShapePlugin.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4836CEE4257744120068F6CE /* ShapePlugin.cpp */; };
48417FF024D13BF50056D9A7 /* GeometryThreshold.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48417FEC24D13BF50056D9A7 /* GeometryThreshold.cpp */; };
48417FF124D13BF50056D9A7 /* GeometryELU.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48417FED24D13BF50056D9A7 /* GeometryELU.cpp */; };
48417FF224D13BF50056D9A7 /* GeometrySelect.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48417FEE24D13BF50056D9A7 /* GeometrySelect.cpp */; };

@@ -251,6 +256,8 @@
48FD034A246AA40300456AF5 /* GeometryConvert.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48FD0349246AA40300456AF5 /* GeometryConvert.cpp */; };
48FD12BE2466A88D009E9102 /* GeometryImageOp.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48FD12BC2466A88C009E9102 /* GeometryImageOp.cpp */; };
48FD12BF2466A88D009E9102 /* GeometryConv2DBackPropFilter.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48FD12BD2466A88D009E9102 /* GeometryConv2DBackPropFilter.cpp */; };
6A131E3F25823349002EC3D6 /* PluginShapeInference.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 6A131E3D25823349002EC3D6 /* PluginShapeInference.cpp */; };
6A131E4025823349002EC3D6 /* PluginKernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 6A131E3E25823349002EC3D6 /* PluginKernel.cpp */; };
9200049921EDBDF600BCE892 /* TensorTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 9200045D21EDBDF600BCE892 /* TensorTest.cpp */; };
9200049A21EDBDF600BCE892 /* ImageProcessTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 9200045F21EDBDF600BCE892 /* ImageProcessTest.cpp */; };
9200049B21EDBDF600BCE892 /* MatrixTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 9200046021EDBDF600BCE892 /* MatrixTest.cpp */; };

@@ -731,6 +738,10 @@
0F1465B71FA18D1000F9860A /* MNN.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = MNN.framework; sourceTree = BUILT_PRODUCTS_DIR; };
0F1465BB1FA18D1000F9860A /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
0F78AC261FCD495800205A7C /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; };
11A01A04258785EA00745FA7 /* MNNVectorTop1Int32.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNVectorTop1Int32.S; sourceTree = "<group>"; };
11A01A05258785EA00745FA7 /* MNNVectorTop1Float.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNVectorTop1Float.S; sourceTree = "<group>"; };
11A01A0A258785FB00745FA7 /* MNNVectorTop1Float.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNVectorTop1Float.S; sourceTree = "<group>"; };
11A01A0B258785FB00745FA7 /* MNNVectorTop1Int32.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNVectorTop1Int32.S; sourceTree = "<group>"; };
1F501F722397BA5A004E8721 /* HalideRuntime.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = HalideRuntime.h; path = MNN/HalideRuntime.h; sourceTree = "<group>"; };
1F501F732397BA5A004E8721 /* MNNDefine.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = MNNDefine.h; path = MNN/MNNDefine.h; sourceTree = "<group>"; };
1F501F742397BA5A004E8721 /* AutoTime.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = AutoTime.hpp; path = MNN/AutoTime.hpp; sourceTree = "<group>"; };

@@ -767,6 +778,7 @@
4829A2D323CC26AD00623BF5 /* ReverseSequenceTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ReverseSequenceTest.cpp; sourceTree = "<group>"; };
4829A2D423CC26AD00623BF5 /* ReplaceTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ReplaceTest.cpp; sourceTree = "<group>"; };
4829A2D523CC26AD00623BF5 /* PaddingTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = PaddingTest.cpp; sourceTree = "<group>"; };
4836CEE4257744120068F6CE /* ShapePlugin.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ShapePlugin.cpp; sourceTree = "<group>"; };
48417FEC24D13BF50056D9A7 /* GeometryThreshold.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryThreshold.cpp; sourceTree = "<group>"; };
48417FED24D13BF50056D9A7 /* GeometryELU.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryELU.cpp; sourceTree = "<group>"; };
48417FEE24D13BF50056D9A7 /* GeometrySelect.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometrySelect.cpp; sourceTree = "<group>"; };

@@ -973,6 +985,8 @@
48FD0349246AA40300456AF5 /* GeometryConvert.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryConvert.cpp; sourceTree = "<group>"; };
48FD12BC2466A88C009E9102 /* GeometryImageOp.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryImageOp.cpp; sourceTree = "<group>"; };
48FD12BD2466A88D009E9102 /* GeometryConv2DBackPropFilter.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryConv2DBackPropFilter.cpp; sourceTree = "<group>"; };
6A131E3D25823349002EC3D6 /* PluginShapeInference.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = PluginShapeInference.cpp; sourceTree = "<group>"; };
6A131E3E25823349002EC3D6 /* PluginKernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = PluginKernel.cpp; sourceTree = "<group>"; };
9200045321EDBCF700BCE892 /* MNNTestSuite.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = MNNTestSuite.h; path = ../../../test/MNNTestSuite.h; sourceTree = "<group>"; };
9200045521EDBCF700BCE892 /* TestUtils.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = TestUtils.h; path = ../../../test/TestUtils.h; sourceTree = "<group>"; };
9200045721EDBCF700BCE892 /* TestUtils.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; name = TestUtils.mm; path = ../../../test/TestUtils.mm; sourceTree = "<group>"; };

@@ -1619,6 +1633,7 @@
488873A8215B639D0079B12E /* source */ = {
isa = PBXGroup;
children = (
6A131E3C2582331C002EC3D6 /* plugin */,
489D7A152550FDC800AD896A /* metal */,
48C84B9D250F725600EE7666 /* utils */,
48747D51245D9E33000B9709 /* geometry */,

@@ -2014,6 +2029,15 @@
path = ../../../test/speed;
sourceTree = "<group>";
};
6A131E3C2582331C002EC3D6 /* plugin */ = {
isa = PBXGroup;
children = (
6A131E3E25823349002EC3D6 /* PluginKernel.cpp */,
6A131E3D25823349002EC3D6 /* PluginShapeInference.cpp */,
);
path = plugin;
sourceTree = "<group>";
};
9200045021EDBCEC00BCE892 /* Tests */ = {
isa = PBXGroup;
children = (

@@ -2160,6 +2184,8 @@
92FF013A23AA0B4E00AC97F6 /* arm32 */ = {
isa = PBXGroup;
children = (
11A01A05258785EA00745FA7 /* MNNVectorTop1Float.S */,
11A01A04258785EA00745FA7 /* MNNVectorTop1Int32.S */,
48034562254157CE004738E3 /* MNNNV21ToBGRAUnit.S */,
48BB6EF525220AA80056E195 /* MNNTranspose32Bit4x4.S */,
C43C81EB2518947700A0FF84 /* MNNGemmInt8toFloat32_8x4_Common.S */,

@@ -2231,6 +2257,8 @@
92FF017C23AA0B4E00AC97F6 /* arm64 */ = {
isa = PBXGroup;
children = (
11A01A0A258785FB00745FA7 /* MNNVectorTop1Float.S */,
11A01A0B258785FB00745FA7 /* MNNVectorTop1Int32.S */,
48034566254157DF004738E3 /* MNNNV21ToBGRAUnit.S */,
48BB6EEF25220A930056E195 /* MNNTranspose32Bit4x4.S */,
C43C81F02518948800A0FF84 /* MNNGemmint8to32_8x4_Common.S */,

@@ -2350,6 +2378,7 @@
EBB38EC621E748B9005F76D7 /* shape */ = {
isa = PBXGroup;
children = (
4836CEE4257744120068F6CE /* ShapePlugin.cpp */,
48C84B6B250F709E00EE7666 /* SizeComputer.cpp */,
48C84B6A250F709E00EE7666 /* SizeComputer.hpp */,
486E1A9B24F507A600C16006 /* ShapeRandomUniform.cpp */,

@@ -2828,6 +2857,7 @@
92FF030323AA0B5A00AC97F6 /* MNNLoadU8AndSum.S in Sources */,
92FF02D223AA0B5A00AC97F6 /* MNNNV21ToRGBAUnit.S in Sources */,
48747D66245D9E33000B9709 /* GeometryDepthToSpace.cpp in Sources */,
6A131E3F25823349002EC3D6 /* PluginShapeInference.cpp in Sources */,
92FF025723AA0B5A00AC97F6 /* CPUQuanConvolutionDepthwise.cpp in Sources */,
48034563254157CE004738E3 /* MNNNV21ToBGRAUnit.S in Sources */,
48FA474823AA127B00172C3B /* Expr.cpp in Sources */,

@@ -2836,6 +2866,7 @@
92FF042923AA0B7100AC97F6 /* ShapeLinSpace.cpp in Sources */,
92FF03A723AA0B5A00AC97F6 /* ConvolutionIntFactory.cpp in Sources */,
48FB9DC224A8445A008E1A2D /* MNNPackedMatMulRemain.S in Sources */,
4836CEE5257744120068F6CE /* ShapePlugin.cpp in Sources */,
92FF027523AA0B5A00AC97F6 /* CPUConvolution.cpp in Sources */,
48747D61245D9E33000B9709 /* ConvertUtils.cpp in Sources */,
EBECA3A624643D5D0062C7A3 /* MNNLineDepthWiseFp16C8Unit.S in Sources */,

@@ -2843,6 +2874,7 @@
48417FF124D13BF50056D9A7 /* GeometryELU.cpp in Sources */,
92FF03A023AA0B5A00AC97F6 /* ConvolutionWinograd.cpp in Sources */,
48C84B9A250F720C00EE7666 /* CPULayerNorm.cpp in Sources */,
6A131E4025823349002EC3D6 /* PluginKernel.cpp in Sources */,
92FF04A823AA0BFB00AC97F6 /* AutoTime.cpp in Sources */,
92FF04AE23AA0BFB00AC97F6 /* Backend.cpp in Sources */,
92FF041E23AA0B7100AC97F6 /* ShapeRange.cpp in Sources */,

@@ -2855,6 +2887,7 @@
92FF030823AA0B5A00AC97F6 /* MNNCopyC4WithStride.S in Sources */,
EBECA39B24643D320062C7A3 /* Arm82Backend.cpp in Sources */,
92FF030023AA0B5A00AC97F6 /* MNNSamplerC4NearestOpt.S in Sources */,
11A01A0C258785FB00745FA7 /* MNNVectorTop1Float.S in Sources */,
48FB9DC924A848D0008E1A2D /* MNNPackedMatMulRemain.S in Sources */,
92FF044023AA0B7100AC97F6 /* ShapeSlice.cpp in Sources */,
92FF044723AA0B7100AC97F6 /* ShapeSqueeze.cpp in Sources */,

@@ -2893,6 +2926,7 @@
C43C81DF2518944F00A0FF84 /* WinogradHelper.cpp in Sources */,
92FF025E23AA0B5A00AC97F6 /* CPUROIPooling.cpp in Sources */,
92FF044A23AA0B7100AC97F6 /* ShapeConvolution.cpp in Sources */,
11A01A0D258785FB00745FA7 /* MNNVectorTop1Int32.S in Sources */,
92FF02FA23AA0B5A00AC97F6 /* MNNGemmFloatUnit_4.S in Sources */,
92FF02E923AA0B5A00AC97F6 /* MNNDepthWiseInt8AddBiasScaleUnit.S in Sources */,
92FF026A23AA0B5A00AC97F6 /* CPUNonMaxSuppressionV2.cpp in Sources */,

@@ -2960,6 +2994,7 @@
92FF02DC23AA0B5A00AC97F6 /* MNNReluInt8.S in Sources */,
92FF041A23AA0B7100AC97F6 /* ShapeFill.cpp in Sources */,
EB45C776244D7C6600E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S in Sources */,
11A01A07258785EA00745FA7 /* MNNVectorTop1Float.S in Sources */,
92FF035323AA0B5A00AC97F6 /* CPUScatterNd.cpp in Sources */,
48747D6E245D9E33000B9709 /* GeometrySlice.cpp in Sources */,
92FF041923AA0B7100AC97F6 /* ShapeQuantizedMaxPool.cpp in Sources */,

@@ -3004,6 +3039,7 @@
48FB9DCA24A848D0008E1A2D /* MNNAxByClampBroadcastC4.S in Sources */,
489D7A832550FDC900AD896A /* MetalMatMul.mm in Sources */,
92FF04BA23AA0BFB00AC97F6 /* WrapExecution.cpp in Sources */,
11A01A06258785EA00745FA7 /* MNNVectorTop1Int32.S in Sources */,
48FB9DC124A8445A008E1A2D /* MNNAxByClampBroadcastC4.S in Sources */,
EBECA38F24643D320062C7A3 /* Arm82Convolution.cpp in Sources */,
EBD4842F2485FF660083CE95 /* Arm82Interp.cpp in Sources */,

@@ -3430,6 +3466,7 @@
"MNN_METAL_ENABLED=1",
"MNN_SUPPORT_TFLITE_QUAN=1",
"ENABLE_ARMV82=1",
"MNN_WITH_PLUGIN=1",
);
GCC_SYMBOLS_PRIVATE_EXTERN = YES;
GCC_WARN_SHADOW = NO;

@@ -3489,6 +3526,7 @@
"MNN_METAL_ENABLED=1",
"MNN_SUPPORT_TFLITE_QUAN=1",
"ENABLE_ARMV82=1",
"MNN_WITH_PLUGIN=1",
);
GCC_SYMBOLS_PRIVATE_EXTERN = YES;
GCC_WARN_SHADOW = YES;
@@ -1,23 +1,33 @@
# TODO: avoid importing everything from _mnncengine._nn for visibility control
from _mnncengine._nn import *

import _mnncengine._expr as _F
import _mnncengine._nn as _nn

# old call: load_module_from_file(file_name, for_training)
# new call: load_module_from_file(file_name, dynamic=False, shape_mutable=True)
# support both call styles via args and kwargs
def load_module_from_file(file_name, *args, **kwargs):
old_call = len(args) > 0 #for_training

def load_module_from_file(file_name, for_training):
m = _F.load_as_dict(file_name)
inputs_outputs = _F.get_inputs_and_outputs(m)

inputs = []
for key in inputs_outputs[0].keys():
inputs.append(inputs_outputs[0][key])
inputs.append(inputs_outputs[0][key] if old_call else key)

outputs = []
for key in inputs_outputs[1].keys():
outputs.append(inputs_outputs[1][key])
outputs.append(inputs_outputs[1][key] if old_call else key)

if old_call:
for_training = args[0]
module = _nn.load_module(inputs, outputs, for_training)

else:
dynamic = kwargs.get('dynamic', False)
shape_mutable = kwargs.get('shape_mutable', True)
module = _nn.load_module_from_file(inputs, outputs, file_name, dynamic, shape_mutable)
return module

@@ -53,17 +63,3 @@ class Module(_nn._Module):
else:
self._vars[name] = value
self._add_parameter(value)

class FixModule(object):
def __init__(self, module):
super(FixModule, self).__init__()
self.module = module

def forward(self, x):
self.module.train(False)
return self.module.forward(x)

def __call__(self, x):
self.module.train(False)
return self.module(x)
@@ -6,6 +6,8 @@ import argparse
parser = argparse.ArgumentParser(description='build pymnn wheel')
parser.add_argument('--x86', dest='x86', action='store_true', default=False,
help='build wheel for 32bit arch, only usable on windows')
parser.add_argument('--version', dest='version', type=str, required=True,
help='MNN dist version')
args = parser.parse_args()

import os

@@ -18,11 +20,11 @@ if __name__ == '__main__':
os.system("pip install -U numpy")
if os.path.exists('build'):
shutil.rmtree('build')
if IS_DARWIN:
os.system('python setup.py bdist_wheel')
comm_args = '--version ' + args.version
if IS_LINUX:
os.system('python setup.py bdist_wheel --plat-name=manylinux1_x86_64')
comm_args += ' --plat-name=manylinux1_x86_64'
if IS_WINDOWS:
os.putenv('DISTUTILS_USE_SDK', '1')
os.putenv('MSSdk', '1')
os.system('python setup.py bdist_wheel %s' % ('--x86' if args.x86 else ''))
comm_args += ' --x86' if args.x86 else ''
os.system('python setup.py bdist_wheel %s' % comm_args)
@@ -2,8 +2,18 @@
# Created by ruhuan on 2019.08.31
""" setup tool """
from __future__ import print_function
import os

import sys
import argparse
parser = argparse.ArgumentParser(description='build pymnn wheel')
parser.add_argument('--x86', dest='x86', action='store_true', default=False,
help='build wheel for 32bit arch, only usable on windows')
parser.add_argument('--version', dest='version', type=str, required=True,
help='MNN dist version')
args, unknown = parser.parse_known_args()
sys.argv = [sys.argv[0]] + unknown

import os
import platform
try:
import numpy as np

@@ -19,9 +29,8 @@ IS_LINUX = (platform.system() == 'Linux')
BUILD_DIR = 'pymnn_build'
BUILD_TYPE = 'RELEASE'
BUILD_ARCH = 'x64'
if '--x86' in sys.argv:
if args.x86:
BUILD_ARCH = ''
sys.argv.remove('--x86')

def check_env_flag(name, default=''):
""" check whether a env is set to Yes """

@@ -46,7 +55,7 @@ if os.path.isdir('../../schema/private'):

print ('Building with python wheel with package name ', package_name)

version = '1.1.0'
version = args.version
depend_pip_packages = ['flatbuffers', 'numpy']
if package_name == 'MNN':
README = os.path.join(os.getcwd(), "README.md")

@@ -106,9 +115,9 @@ def configure_extension_build():
]
if check_env_flag('WERROR'):
extra_compile_args.append('-Werror')
extra_compile_args += ['-DUSE_V3_API']
extra_compile_args += ['-DPYMNN_EXPR_API', '-DPYMNN_NUMPY_USABLE']
root_dir = os.getenv('PROJECT_ROOT', os.path.dirname(os.path.dirname(os.getcwd())))
engine_compile_args = ['-DBUILD_OPTYPE', '-DBUILD_TRAIN']
engine_compile_args = ['-DBUILD_OPTYPE', '-DPYMNN_TRAIN_API']
engine_libraries = []
engine_library_dirs = [os.path.join(root_dir, BUILD_DIR)]
engine_library_dirs += [os.path.join(root_dir, BUILD_DIR, "tools", "train")]

@@ -121,6 +130,7 @@ def configure_extension_build():
engine_sources = [os.path.join(root_dir, "pymnn", "src", "MNN.cc")]
engine_include_dirs = [os.path.join(root_dir, "include")]
engine_include_dirs += [os.path.join(root_dir, "express")]
engine_include_dirs += [os.path.join(root_dir, "express", "module")]
engine_include_dirs += [os.path.join(root_dir, "source")]
engine_include_dirs += [os.path.join(root_dir, "tools", "train", "source", "grad")]
engine_include_dirs += [os.path.join(root_dir, "tools", "train", "source", "module")]
pymnn/src/MNN.cc (746 changed lines): file diff suppressed because it is too large.
@@ -161,8 +161,7 @@ static PyMethodDef module_methods[] = {
#else
#define MOD_INIT(name) PyMODINIT_FUNC init##name(void)
#endif
MOD_INIT(_tools)
{
MOD_INIT(_tools) {
#if PY_MAJOR_VERSION >= 3
PyObject *m = PyModule_Create(&moduledef);
// module import failed!
@@ -0,0 +1,33 @@
#pragma once

#ifndef PYMNN_USE_ALINNPYTHON
#ifndef PYMNN_EXPR_API
#error PYMNN_EXPR_API macro should be defined on official python (PYMNN_USE_ALINNPYTHON=OFF)
#endif
#ifndef PYMNN_NUMPY_USABLE
#error PYMNN_NUMPY_USABLE macro should be defined on official python (PYMNN_USE_ALINNPYTHON=OFF)
#endif
#endif

#if defined(ANDROID) || defined(__ANDROID__)
#undef _FILE_OFFSET_BITS
#endif
#include <fstream>

#ifdef PYMNN_USE_ALINNPYTHON
#include <AliNNPython/Python.h>
#include <AliNNPython/frameobject.h>
#include <AliNNPython/pythread.h>
#include "renameForAliNNPython.h"

#ifdef PYMNN_NUMPY_USABLE
#include <numpy/ndarrayobject.h>
#include <numpy/ndarraytypes.h>
#endif

#else
#define PyType_FindTLSType
#include <Python.h>
#include "structmember.h"
#include "numpy/arrayobject.h"
#endif
@@ -0,0 +1,190 @@
#pragma once

#define PyObject WeObject
#define PyImport_Import WeImport_Import
#define PyObject_GetAttrString WeObject_GetAttrString
#define PyObject_HEAD WeObject_HEAD
#define PyTypeObject WeTypeObject
#define PyObject_HEAD_INIT WeObject_HEAD_INIT
#define PyString_AsString WeString_AsString
#define PyErr_SetString WeErr_SetString
#define PyTuple_GetItem WeTuple_GetItem
#define PyTuple_Size WeTuple_Size
#define PyDict_New WeDict_New
#define PyDict_SetItem WeDict_SetItem
#define PyDict_GetItemString WeDict_GetItemString
#define PyCallable_Check WeCallable_Check
#define PyArg_ParseTuple WeArg_ParseTuple
#define PyLong_AsLong WeLong_AsLong
#define PyObject_Call WeObject_Call
#define PyType_Ready WeType_Ready
#define PyCapsule_New WeCapsule_New
#define PyLong_FromLong WeLong_FromLong
#define PyModule_AddObject WeModule_AddObject
#define PyTuple_SetItem WeTuple_SetItem
#define PyFloat_FromDouble WeFloat_FromDouble
#define PyFloat_AsDouble WeFloat_AsDouble
#define PyTuple_New WeTuple_New
#define PyString_FromString WeString_FromString
#define PyCapsule_GetPointer WeCapsule_GetPointer
#define PyObject_TypeCheck WeObject_TypeCheck
#define PyObject_IsInstance WeObject_IsInstance
#define PySequence_Tuple WeSequence_Tuple

#define PyExc_Exception (WeObject *)WeType_FindTLSType((WeTypeObject *)WeExc_Exception)
#define PyExc_StopIteration (WeObject *)WeType_FindTLSType((WeTypeObject *)WeExc_StopIteration)
#define PyExc_MemoryError (WeObject *)WeType_FindTLSType((WeTypeObject *)WeExc_MemoryError)
#define PyExc_ImportError ((WeObject *)WeType_FindTLSType((WeTypeObject *)WeExc_ImportError))
#define PyExc_IndexError (WeObject *)WeType_FindTLSType((WeTypeObject *)WeExc_IndexError)
#define PyExc_KeyError (WeObject *)WeType_FindTLSType((WeTypeObject *)WeExc_KeyError)
#define PyExc_ValueError (WeObject *)WeType_FindTLSType((WeTypeObject *)WeExc_ValueError)
#define PyExc_TypeError (WeObject *)WeType_FindTLSType((WeTypeObject *)WeExc_TypeError)
#define PyExc_BufferError (WeObject *)WeType_FindTLSType((WeTypeObject *)WeExc_BufferError)
#define PyExc_RuntimeError (WeObject *)WeType_FindTLSType((WeTypeObject *)WeExc_RuntimeError)
#define PyExc_SystemError (WeObject *)WeType_FindTLSType((WeTypeObject *)WeExc_SystemError)
#define PyExc_FutureWarning (WeObject *)WeType_FindTLSType((WeTypeObject *)WeExc_FutureWarning)
#define PyExc_AttributeError ((WeObject *)WeType_FindTLSType((WeTypeObject *)WeExc_AttributeError))

#define PyErr_ExceptionMatches WeErr_ExceptionMatches
#define PyErr_Fetch WeErr_Fetch
#define PyErr_Restore WeErr_Restore
#define PyBuffer_Release WeBuffer_Release
#define PyObject_HasAttr WeObject_HasAttr
#define PyObject_HasAttrString WeObject_HasAttrString
#define PyObject_DelAttr WeObject_DelAttr
#define PyObject_DelAttrString WeObject_DelAttrString
#define PyObject_GetAttr WeObject_GetAttr
#define PyErr_GivenExceptionMatches WeErr_GivenExceptionMatches
#define PyErr_Clear WeErr_Clear
#define PyObject_SetAttr WeObject_SetAttr
#define PyObject_SetAttrString WeObject_SetAttrString
#define PyObject_Hash WeObject_Hash
#define PyObject_GetItem WeObject_GetItem
#define PyObject_SetItem WeObject_SetItem
#define PySequence_GetItem WeSequence_GetItem
#define PySequence_SetItem WeSequence_SetItem
#define PyList_GetItem WeList_GetItem
#define PyList_SetItem WeList_SetItem
#define PySequence_Fast_ITEMS WeSequence_Fast_ITEMS
#define PyDict_Next WeDict_Next
#define PyObject_GetIter WeObject_GetIter
#define PyStaticMethod_Type WeStaticMethod_Type
#define PyIter_Next WeIter_Next
#define PyErr_Occurred WeErr_Occurred
#define PyObject_Str WeObject_Str
#define PyString_AsStringAndSize WeString_AsStringAndSize
#define PyString_FromStringAndSize WeString_FromStringAndSize
#define PyObject_IsTrue WeObject_IsTrue
#define PyLong_AsUnsignedLong WeLong_AsUnsignedLong
#define PyLong_FromUnsignedLong WeLong_FromUnsignedLong
#define PyLong_AsLongLong WeLong_AsLongLong
#define PyLong_FromLongLong WeLong_FromLongLong
#define PyLong_AsLong WeLong_AsLong
#define PyLong_AsUnsignedLongLong WeLong_AsUnsignedLongLong
#define PyLong_FromUnsignedLongLong WeLong_FromUnsignedLongLong
#define PyNumber_Long WeNumber_Long
#define PyNumber_Float WeNumber_Float
#define PySequence_Check WeSequence_Check
#define PySequence_Size WeSequence_Size
#define PySequence_List WeSequence_List
#define PySlice_New WeSlice_New
#define PySlice_GetIndicesEx WeSlice_GetIndicesEx
#define PySlice_GetIndicesEx WeSlice_GetIndicesEx
#define PyCapsule_GetContext WeCapsule_GetContext
#define PyCapsule_SetContext WeCapsule_SetContext
#define PyCapsule_GetName WeCapsule_GetName
#define PyDict_Size WeDict_Size
#define PyDict_Clear WeDict_Clear
#define PyObject_CallFunctionObjArgs WeObject_CallFunctionObjArgs
#define PySet_New WeSet_New
#define PySet_Size WeSet_Size
#define PySet_Clear WeSet_Clear
#define PyStaticMethod_New WeStaticMethod_New
#define PyObject_CheckBuffer WeObject_CheckBuffer
#define PyObject_GetBuffer WeObject_GetBuffer
#define PyWeakref_NewRef WeWeakref_NewRef
#define PyDict_Type WeDict_Type
#define PyList_New WeList_New
#define PyList_Size WeList_Size
#define PyMemoryView_FromBuffer WeMemoryView_FromBuffer
#define PyObject_Length WeObject_Length
#define PyObject_Repr WeObject_Repr
#define PyThread_create_key WeThread_create_key
#define PyGILState_Ensure WeGILState_Ensure
#define PyGILState_Release WeGILState_Release
#define PyEval_InitThreads WeEval_InitThreads
#define PyThreadState WeThreadState
#define PyThreadState_Get WeThreadState_Get
#define PyThread_create_key WeThread_create_key
#define PyThread_set_key_value WeThread_set_key_value
#define PyMemoryView_FromObject WeMemoryView_FromObject
#define PyEval_GetBuiltins WeEval_GetBuiltins
#define PyList_Append WeList_Append
#define PyMem_Free WeMem_Free
#define PyErr_NormalizeException WeErr_NormalizeException
#define PyFrame_GetLineNumber WeFrame_GetLineNumber
#define PyType_IsSubtype WeType_IsSubtype
#define PyNumber_Check WeNumber_Check
#define PyInt_FromSsize_t WeInt_FromSsize_t
#define PyString_Size WeString_Size
#define _PyThreadState_Current _WeThreadState_Current
#define PyProperty_Type WeProperty_Type
#define PyType_Type WeType_Type
#define _PyType_Lookup _WeType_Lookup
#define PyBaseObject_Type WeBaseObject_Type
#define _PyObject_GetDictPtr _WeObject_GetDictPtr
#define PyInt_FromSize_t WeInt_FromSize_t
#define PyObject_ClearWeakRefs WeObject_ClearWeakRefs
#define PyErr_Format WeErr_Format
#define PyObject_MALLOC WeObject_MALLOC
#define PyCFunction_NewEx WeCFunction_NewEx
#define PyMethod_New WeMethod_New
#define PyDict_DelItemString WeDict_DelItemString
#define PyModule_GetName WeModule_GetName
#define PyImport_AddModule WeImport_AddModule
#define PyImport_ImportModule WeImport_ImportModule
#define PyImport_ReloadModule WeImport_ReloadModule
#define PyEval_GetGlobals WeEval_GetGlobals
#define PyErr_NewException WeErr_NewException
#define PyThread_get_key_value WeThread_get_key_value
#define PyGILState_GetThisThreadState WeGILState_GetThisThreadState
#define PyThreadState_New WeThreadState_New
#define PyEval_AcquireThread WeEval_AcquireThread
#define PyErr_WarnEx WeErr_WarnEx
#define PyThread_delete_key_value WeThread_delete_key_value
#define PyThreadState_Clear WeThreadState_Clear
#define PyThreadState_DeleteCurrent WeThreadState_DeleteCurrent
#define PyEval_SaveThread WeEval_SaveThread
#define PyEval_RestoreThread WeEval_RestoreThread
#define PyFrame_FastToLocals WeFrame_FastToLocals
#define PyDict_GetItem WeDict_GetItem
#define PyObject_CallObject WeObject_CallObject
#define PyObject_RichCompareBool WeObject_RichCompareBool
#define PyNumber_Invert WeNumber_Invert
#define PyNumber_Negative WeNumber_Negative
#define PyNumber_Add WeNumber_Add
#define PyNumber_InPlaceAdd WeNumber_InPlaceAdd
#define PyNumber_Subtract WeNumber_Subtract
#define PyNumber_InPlaceSubtract WeNumber_InPlaceSubtract
#define PyNumber_Multiply WeNumber_Multiply
#define PyNumber_InPlaceMultiply WeNumber_InPlaceMultiply
#define PyNumber_TrueDivide WeNumber_TrueDivide
#define PyNumber_InPlaceTrueDivide WeNumber_InPlaceTrueDivide
#define PyNumber_Or WeNumber_Or
#define PyNumber_InPlaceOr WeNumber_InPlaceOr
#define PyNumber_And WeNumber_And
#define PyNumber_InPlaceAnd WeNumber_InPlaceAnd
#define PyNumber_Xor WeNumber_Xor
#define PyNumber_InPlaceXor WeNumber_InPlaceXor
#define PyNumber_Lshift WeNumber_Lshift
#define PyNumber_InPlaceLshift WeNumber_InPlaceLshift
#define PyNumber_Rshift WeNumber_Rshift
#define PyNumber_InPlaceRshift WeNumber_InPlaceRshift
#define PyDict_Contains WeDict_Contains
#define PyLong_AsLongLongAndOverflow WeLong_AsLongLongAndOverflow
#define PySequence_Length WeSequence_Length
#define PySequence_Fast WeSequence_Fast
#define PySequence_Fast_GET_SIZE WeSequence_Fast_GET_SIZE
#define PyCFunction_Type WeCFunction_Type
#define PyType_FindTLSType WeType_FindTLSType
#define PyInterpreterState_Get WeInterpreterState_Get
@ -1,20 +1,10 @@
|
|||
#pragma once
|
||||
#include <string>
|
||||
#include <MNN/expr/Expr.hpp>
|
||||
#include <MNN/expr/ExprCreator.hpp>
|
||||
#ifdef USE_PRIVATE
|
||||
#include "private_define.h"
|
||||
#else
|
||||
#include "pybind11/pybind11.h"
|
||||
#include "pybind11/stl.h"
|
||||
#include "pybind11/operators.h"
|
||||
#include "numpy/arrayobject.h"
|
||||
#include <Python.h>
|
||||
#include "structmember.h"
|
||||
#endif
|
||||
using namespace MNN;
|
||||
using namespace MNN::Express;
|
||||
#include <vector>
|
||||
#include "common.h"
|
||||
|
||||
using namespace std;
|
||||
typedef vector<int> INTS;
|
||||
// Returns true if obj is a bytes/str or unicode object
|
||||
inline bool checkString(PyObject* obj) {
|
||||
return PyBytes_Check(obj) || PyUnicode_Check(obj);
|
||||
|
@ -176,9 +166,8 @@ halide_type_t dtype2htype(DType dtype) {
|
|||
CONVERT(DType_INT8, halide_type_of<int8_t>(), dtype);
|
||||
return halide_type_of<float>();
|
||||
}
|
||||
#ifndef USE_PRIVATE
|
||||
inline int getitemsize(int dtype, int npy_type)
|
||||
{
|
||||
#ifdef PYMNN_NUMPY_USABLE
|
||||
inline int getitemsize(int dtype, int npy_type) {
|
||||
switch(dtype) {
|
||||
case DType_FLOAT:
|
||||
if(npy_type != NPY_FLOAT) {
|
||||
|
@ -210,8 +199,7 @@ inline int getitemsize(int dtype, int npy_type)
|
|||
}
|
||||
}
|
||||
#endif
|
||||
inline int getitemsize(int dtype)
|
||||
{
|
||||
inline int getitemsize(int dtype) {
|
||||
switch(dtype) {
|
||||
case DType_FLOAT:
|
||||
return 4;
|
||||
|
@ -229,3 +217,4 @@ inline int getitemsize(int dtype)
|
|||
throw std::runtime_error("does not support this dtype");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,45 @@
|
|||
set -e
|
||||
|
||||
usage() {
|
||||
echo "Usage: $0 -p python_version [-t]"
|
||||
echo -e "\t-p python versions in pyenv"
|
||||
echo -e "\t-t include train API wrapper"
|
||||
exit 1
|
||||
}
|
||||
|
||||
while getopts "p:t" opt; do
|
||||
case "$opt" in
|
||||
p ) py_version=$OPTARG ;;
|
||||
t ) train_api=true ;;
|
||||
* ) usage ;;
|
||||
esac
|
||||
done
|
||||
|
||||
rm -rf /tmp/mnn_py && mkdir -p /tmp/mnn_py
|
||||
cp -r pip_package/MNN /tmp/mnn_py
|
||||
pushd /tmp/mnn_py/MNN
|
||||
|
||||
rm -rf tools
|
||||
cat __init__.py | sed '/from . import tools/d' > __init__.py.tmp
|
||||
mv __init__.py.tmp __init__.py
|
||||
|
||||
if [ -z $train_api ]; then
|
||||
rm -rf data optim
|
||||
cat __init__.py | sed '/from . import data/d' | sed '/from . import optim/d' > __init__.py.tmp
|
||||
mv __init__.py.tmp __init__.py
|
||||
fi
|
||||
|
||||
find . -name __pycache__ | xargs rm -rf
|
||||
pyenv global $py_version
|
||||
python -c "import compileall; compileall.compile_dir('/tmp/mnn_py/MNN', force=True)"
|
||||
find . -name *.py | xargs rm -rf
|
||||
cd ..
|
||||
zip -r MNN.zip MNN
|
||||
popd
|
||||
|
||||
rm -f android/src/main/assets/MNN.zip
|
||||
rm -rf iOS/MNNPyBridge/lib/MNN
|
||||
cp /tmp/mnn_py/MNN.zip android/src/main/assets
|
||||
cp -r /tmp/mnn_py/MNN iOS/MNNPyBridge/lib
|
||||
|
||||
rm -rf /tmp/mnn_py
|
|
@ -0,0 +1,33 @@
|
|||
{
|
||||
"inputs" : [
|
||||
{
|
||||
"id" : 0,
|
||||
"type" : "int",
|
||||
"dims" : [1, 1, 5, 4],
|
||||
"data" : [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"id" : 0,
|
||||
"type" : "int",
|
||||
"dims" : [1, 1, 4, 5],
|
||||
"data" : []
|
||||
}
|
||||
],
|
||||
"regions" : [
|
||||
{
|
||||
"id" : 0,
|
||||
"size" : [1, 4, 5],
|
||||
"src" : {
|
||||
"offset": 0,
|
||||
"stride": [1, 1, 4]
|
||||
},
|
||||
"dst" : {
|
||||
"offset" : 0,
|
||||
"stride" : [1, 5, 1]
|
||||
},
|
||||
"origin": 0
|
||||
}
|
||||
]
|
||||
}
|
|
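For reference, each region in the JSON above is an index map: every coordinate inside "size" reads from src.offset plus the dot product with src.stride and writes to dst.offset plus the dot product with dst.stride, so the strides shown turn the 1x1x5x4 input into its 1x1x4x5 transpose. The sketch below only illustrates that arithmetic; the DemoRegion struct and applyRegion function are invented for the example and are not MNN's Raster implementation.

// Illustrative only: mirrors the "size"/"src"/"dst" fields of the JSON above.
struct DemoRegion {
    int size[3];
    int srcOffset; int srcStride[3];
    int dstOffset; int dstStride[3];
};

// Copy every element addressed by the region from src to dst.
// With size = {1, 4, 5}, src.stride = {1, 1, 4}, dst.stride = {1, 5, 1}
// this writes dst[y][x] = src[x][y], i.e. the 5x4 -> 4x5 transpose.
void applyRegion(const DemoRegion& r, const int* src, int* dst) {
    for (int z = 0; z < r.size[0]; ++z) {
        for (int y = 0; y < r.size[1]; ++y) {
            for (int x = 0; x < r.size[2]; ++x) {
                int s = r.srcOffset + z * r.srcStride[0] + y * r.srcStride[1] + x * r.srcStride[2];
                int d = r.dstOffset + z * r.dstStride[0] + y * r.dstStride[1] + x * r.dstStride[2];
                dst[d] = src[s];
            }
        }
    }
}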
@ -78,7 +78,7 @@ bool Arm82Backend::onAcquireBuffer(const Tensor* nativeTensor, StorageType stora
|
|||
// The default data type of input tensor for arm82 backend is FLOAT32.
|
||||
// However, Arm82Backend default data type is FLOAT16, so check whether data type is FLOAT32,
|
||||
// then divide size by 2
|
||||
auto size = sizeof(int16_t);
|
||||
int size = sizeof(int16_t);
|
||||
const int dimensions = buffer.dimensions;
|
||||
for (int i = 0; i < dimensions; i++) {
|
||||
int currentDimSize = buffer.dim[i].extent;
|
||||
|
@ -87,7 +87,7 @@ bool Arm82Backend::onAcquireBuffer(const Tensor* nativeTensor, StorageType stora
|
|||
}
|
||||
size *= currentDimSize;
|
||||
}
|
||||
auto res = allocBuffer(size, buffer, storageType);
|
||||
auto res = allocBuffer(size, (Tensor*)nativeTensor, storageType);
|
||||
if (!res) {
|
||||
return false;
|
||||
}
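In other words, the element count comes from the tensor's extents while the per-element cost is sizeof(int16_t), because this backend stores halves even when the declared type is FLOAT32. A minimal standalone sketch of that size computation, using a plain extents array instead of MNN's halide_buffer_t (fp16ByteSize is an invented name):

#include <cstddef>
#include <cstdint>

// Byte size of an FP16 buffer given the tensor extents. The public element
// type may be FLOAT32, but the backend stores halves, so each element costs
// sizeof(int16_t) == 2 bytes.
static size_t fp16ByteSize(const int* extents, int dimensions) {
    size_t size = sizeof(int16_t);
    for (int i = 0; i < dimensions; ++i) {
        size *= static_cast<size_t>(extents[i]);
    }
    return size;
}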
|
||||
|
|
|
@ -97,7 +97,7 @@ ErrorCode Arm82Relu::onExecute(const std::vector<Tensor *> &inputs, const std::v
|
|||
|
||||
mThreadNumbers = static_cast<Arm82Backend *>(backend())->numberThread();
|
||||
MNN_CONCURRENCY_BEGIN(tId, mThreadNumbers)
|
||||
for (int b = tId; b < batchAndChannel; b += mThreadNumbers) {
|
||||
for (int b = (int)tId; b < batchAndChannel; b += mThreadNumbers) {
|
||||
_MNNArm82LeakyReluWithChannel(dst + b * plane * ARMV82_CHANNEL_UNIT,
|
||||
src + b * plane * ARMV82_CHANNEL_UNIT,
|
||||
slopeHalf,
|
||||
|
|
|
@ -24,7 +24,7 @@
|
|||
#include "backend/arm82/Arm82Backend.hpp"
|
||||
#endif
|
||||
#define MAX_THREAD_NUMBER 32
|
||||
#define LARGE_MEMORY 1024 * 1024 * 100
|
||||
#define LARGE_MEMORY 1024 * 1024 * 500
|
||||
|
||||
//#define MNN_DUMP_MEMORY_USAGE
|
||||
#define MNN_CPU_CHECK_NAN 1
|
||||
|
@ -35,8 +35,7 @@ struct cpuinfo_arm_isa gCPUInfo;
|
|||
#endif
|
||||
|
||||
CPURuntime::CPURuntime(const Backend::Info& info) {
|
||||
mDynamicAllocator.reset(new BufferAllocator);
|
||||
mStaticAllocator.reset(new BufferAllocator);
|
||||
mStaticAllocator.reset(new BufferAllocator(BufferAllocator::Allocator::createDefault()));
|
||||
mThreadNumber = info.numThread;
|
||||
mThreadNumber = std::max(1, mThreadNumber);
|
||||
mThreadNumber = std::min(mThreadNumber, MAX_THREAD_NUMBER);
|
||||
|
@ -88,9 +87,8 @@ CPURuntime:: ~ CPURuntime() {
|
|||
#endif
|
||||
}
|
||||
float CPURuntime::onGetMemoryInMB() {
|
||||
auto dynamicMemoryInMB = mDynamicAllocator->totalSize() / 1024.0f / 1024.0f;
|
||||
auto staticMemoryInMB = mStaticAllocator->totalSize() / 1024.0f / 1024.0f;
|
||||
return dynamicMemoryInMB + staticMemoryInMB;
|
||||
return staticMemoryInMB;
|
||||
}
|
||||
Backend* CPURuntime::onCreate() const{
|
||||
#if defined(__aarch64__) && ENABLE_ARMV82
|
||||
|
@ -102,9 +100,6 @@ Backend* CPURuntime::onCreate() const{
|
|||
}
|
||||
void CPURuntime::onGabageCollect(int level) {
|
||||
mStaticAllocator->release(false);
|
||||
if (level > 50) {
|
||||
mDynamicAllocator->release(false);
|
||||
}
|
||||
}
|
||||
std::map<OpType, CPUBackend::Creator*>* CPUBackend::gCreator = nullptr;
|
||||
|
||||
|
@ -129,7 +124,8 @@ bool CPUBackend::addCreator(OpType t, Creator* c) {
|
|||
CPUBackend::CPUBackend(const CPURuntime* runtime, MNNForwardType type) : Backend(type) {
|
||||
mRuntime = runtime;
|
||||
mCheckNAN = runtime->mFlags == MNN_CPU_CHECK_NAN;
|
||||
mDynamicAllocator = runtime->mDynamicAllocator;
|
||||
std::shared_ptr<BufferAllocator::Allocator> defaultAlloc(BufferAllocator::Allocator::createRecurse(runtime->mStaticAllocator.get()));
|
||||
mDynamicAllocator.reset(new BufferAllocator(defaultAlloc));
|
||||
mStaticAllocator = runtime->mStaticAllocator;
|
||||
}
|
||||
bool CPUBackend::supportDot() const {
|
||||
|
@ -137,9 +133,7 @@ bool CPUBackend::supportDot() const {
|
|||
}
|
||||
|
||||
CPUBackend::~CPUBackend() {
|
||||
for (auto p : mDynamic) {
|
||||
mDynamicAllocator->free(p);
|
||||
}
|
||||
// Do nothing
|
||||
}
|
||||
|
||||
void CPUBackend::onExecuteBegin() const {
|
||||
|
@ -162,47 +156,45 @@ void CPUBackend::onExecuteEnd() const {
|
|||
#endif
|
||||
}
|
||||
|
||||
bool CPUBackend::allocBuffer(int size, halide_buffer_t& buffer, StorageType storageType) {
|
||||
bool CPUBackend::allocBuffer(int size, Tensor* dest, StorageType storageType) {
|
||||
// MNN_PRINT("Acquire size = %d\n", size);
|
||||
if (size <= 0) {
|
||||
MNN_ASSERT(false);
|
||||
return false;
|
||||
}
|
||||
if (size > LARGE_MEMORY) {
|
||||
MNN_PRINT("Size larger the 100 M :%d\n", size);
|
||||
MNN_PRINT("Size larger than 500 M :%d\n", size);
|
||||
}
|
||||
auto& buffer = dest->buffer();
|
||||
auto des = TensorUtils::getDescribe(dest);
|
||||
std::pair<void*, int> points;
|
||||
switch (storageType) {
|
||||
case STATIC: {
|
||||
#ifdef MNN_DUMP_MEMORY_USAGE
|
||||
buffer.host = (uint8_t*)malloc(size);
|
||||
#else
|
||||
buffer.host = (uint8_t*)(mStaticAllocator->alloc(size, false));
|
||||
#endif
|
||||
points = mStaticAllocator->alloc(size, false);
|
||||
break;
|
||||
}
|
||||
case DYNAMIC: {
|
||||
buffer.host = (uint8_t*)(mDynamicAllocator->alloc(size, false));
|
||||
points = mDynamicAllocator->alloc(size, false);
|
||||
break;
|
||||
}
|
||||
case DYNAMIC_SEPERATE: {
|
||||
buffer.host = (uint8_t*)(mDynamicAllocator->alloc(size, true));
|
||||
points = mDynamicAllocator->alloc(size, true);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
MNN_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
if (nullptr == buffer.host) {
|
||||
if (nullptr == points.first) {
|
||||
MNN_ERROR("Alloc buffer error for cpu backend\n");
|
||||
return false;
|
||||
}
|
||||
if (STATIC == storageType) {
|
||||
// Do nothing
|
||||
} else {
|
||||
mDynamic.insert(buffer.host);
|
||||
}
|
||||
buffer.host = (uint8_t*)points.first + points.second;
|
||||
des->extra.offset = points.second;
|
||||
if (buffer.type.code == halide_type_handle) {
|
||||
// For handle type we needn't recycle the buffer; use extra as handleFreeFunction
|
||||
::memset(buffer.host, 0, size);
|
||||
des->extra.handleFreeFunction = (decltype(des->extra.handleFreeFunction))free;
|
||||
}
|
||||
return true;
|
||||
}
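The allocators now hand back a (base pointer, offset) pair rather than a raw pointer: the usable address is base + offset, and the offset is remembered in the tensor's describe so that onReleaseBuffer can rebuild the exact pair for free(). A small sketch of that bookkeeping in isolation, with invented names (Chunk, Placement) standing in for BufferAllocator and the tensor describe:

#include <cstdint>
#include <utility>

// The allocator returns a base pointer plus an offset into it; the address
// handed to kernel code is base + offset, and freeing requires the same pair.
using Chunk = std::pair<void*, int>;

struct Placement {
    uint8_t* host = nullptr;  // address used by kernel code
    int      offset = 0;      // remembered so the pair can be rebuilt on free
};

// Record an allocation result the way allocBuffer() above does.
inline void bindChunk(const Chunk& points, Placement& p) {
    p.host   = static_cast<uint8_t*>(points.first) + points.second;
    p.offset = points.second;
}

// Rebuild the pair for the allocator's free(), as onReleaseBuffer() does.
inline Chunk unbindChunk(const Placement& p) {
    return { p.host - p.offset, p.offset };
}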
|
||||
|
@ -213,32 +205,29 @@ bool CPUBackend::onAcquireBuffer(const MNN::Tensor* nativeTensorConst, StorageTy
|
|||
}
|
||||
//FUNC_PRINT_ALL(nativeTensorConst, p);
|
||||
auto nativeTensor = (Tensor*)nativeTensorConst;
|
||||
auto& buffer = nativeTensor->buffer();
|
||||
|
||||
auto size = nativeTensor->size();
|
||||
return allocBuffer(size, buffer, storageType);
|
||||
return allocBuffer(size, nativeTensor, storageType);
|
||||
}
|
||||
|
||||
bool CPUBackend::onReleaseBuffer(const MNN::Tensor* nativeTensor, StorageType storageType) {
|
||||
if (DYNAMIC_SEPERATE == storageType) {
|
||||
return true;
|
||||
}
|
||||
if (nativeTensor == nullptr) {
|
||||
return false;
|
||||
}
|
||||
if (nullptr == nativeTensor->buffer().host) {
|
||||
return false;
|
||||
}
|
||||
auto des = TensorUtils::getDescribe(nativeTensor);
|
||||
std::pair<void*, int> pointer;
|
||||
pointer.second = des->extra.offset;
|
||||
pointer.first = (uint8_t*)nativeTensor->buffer().host - des->extra.offset;
|
||||
if (STATIC == storageType) {
|
||||
#ifdef MNN_DUMP_MEMORY_USAGE
|
||||
free(nativeTensor->buffer().host);
|
||||
#else
|
||||
mStaticAllocator->free(nativeTensor->buffer().host);
|
||||
#endif
|
||||
mStaticAllocator->free(pointer);
|
||||
return true;
|
||||
}
|
||||
if (DYNAMIC_SEPERATE == storageType) {
|
||||
return true;
|
||||
}
|
||||
mDynamic.erase(nativeTensor->buffer().host);
|
||||
mDynamicAllocator->free(nativeTensor->buffer().host);
|
||||
mDynamicAllocator->free(pointer);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -338,10 +327,7 @@ Execution* CPUBackend::onCreate(const std::vector<Tensor*>& inputs, const std::v
|
|||
}
|
||||
|
||||
bool CPUBackend::onClearBuffer() {
|
||||
for (auto p : mDynamic) {
|
||||
mDynamicAllocator->free(p);
|
||||
}
|
||||
mDynamic.clear();
|
||||
mDynamicAllocator->release(true);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -34,7 +34,6 @@ public:
|
|||
virtual float onGetMemoryInMB() override;
|
||||
private:
|
||||
std::shared_ptr<BufferAllocator> mStaticAllocator;
|
||||
std::shared_ptr<BufferAllocator> mDynamicAllocator;
|
||||
int mThreadNumber;
|
||||
int mTaskIndex;
|
||||
size_t mFlags;
|
||||
|
@ -97,12 +96,11 @@ public:
|
|||
static void initCreatorMap();
|
||||
|
||||
protected:
|
||||
bool allocBuffer(int size, halide_buffer_t& buffer, StorageType storageType);
|
||||
bool allocBuffer(int size, Tensor* dest, StorageType storageType);
|
||||
private:
|
||||
std::shared_ptr<BufferAllocator> mStaticAllocator;
|
||||
std::shared_ptr<BufferAllocator> mDynamicAllocator;
|
||||
bool mCheckNAN = false;
|
||||
std::set<void*> mDynamic;
|
||||
const CPURuntime* mRuntime;
|
||||
static std::map<OpType, CPUBackend::Creator*>* getCreatorMap();
|
||||
static std::map<OpType, CPUBackend::Creator*>* gCreator;
|
||||
|
|
|
@ -70,17 +70,7 @@ ErrorCode CPUBatchMatMul::onResize(const std::vector<Tensor*>& inputs, const std
|
|||
TensorUtils::setLinearLayout(unit.mMatrixB.get());
|
||||
TensorUtils::setLinearLayout(unit.mMatrixC.get());
|
||||
|
||||
auto res = backend()->onAcquireBuffer(unit.mMatrixA.get(), Backend::DYNAMIC);
|
||||
res = res && backend()->onAcquireBuffer(unit.mMatrixB.get(), Backend::DYNAMIC);
|
||||
res = res && backend()->onAcquireBuffer(unit.mMatrixC.get(), Backend::DYNAMIC);
|
||||
|
||||
if (!res) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
auto code = unit.mMatMul->onResize(unit.mTempInputs, unit.mTempOutputs);
|
||||
backend()->onReleaseBuffer(unit.mMatrixA.get(), Backend::DYNAMIC);
|
||||
backend()->onReleaseBuffer(unit.mMatrixB.get(), Backend::DYNAMIC);
|
||||
backend()->onReleaseBuffer(unit.mMatrixC.get(), Backend::DYNAMIC);
|
||||
}
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
@ -109,10 +99,10 @@ ErrorCode CPUBatchMatMul::onExecute(const std::vector<Tensor*>& inputs, const st
|
|||
MNN_CONCURRENCY_BEGIN(tId, threadNumber) {
|
||||
auto& unit = mUnits[tId];
|
||||
for (int i = (int)tId; i < mBatch; i+=threadNumber) {
|
||||
::memcpy(unit.mMatrixA->host<float>(), input0Ptr + i * input0Stride, input0Stride * sizeof(float));
|
||||
::memcpy(unit.mMatrixB->host<float>(), input1Ptr + i * input1Stride, input1Stride * sizeof(float));
|
||||
unit.mMatrixA->buffer().host = (uint8_t*)(input0Ptr + i * input0Stride);
|
||||
unit.mMatrixB->buffer().host = (uint8_t*)(input1Ptr + i * input1Stride);
|
||||
unit.mMatrixC->buffer().host = (uint8_t*)(outputPtr + i * outputStride);
|
||||
unit.mMatMul->onExecute(unit.mTempInputs, unit.mTempOutputs);
|
||||
::memcpy(outputPtr + i * outputStride, unit.mMatrixC->host<float>(), outputStride * sizeof(float));
|
||||
}
|
||||
}
|
||||
MNN_CONCURRENCY_END();
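The staging copies are gone in this path: each batch iteration now re-points the unit's matrix tensors at the current slices of the inputs and output before running the shared MatMul, which works because the tensors keep their shape metadata and only the storage pointer changes. A sketch of that aliasing step, assuming MNN's public Tensor header (rebindHost is an invented helper; the diff assigns buffer().host directly):

#include <MNN/Tensor.hpp>
#include <cstdint>

// Re-point an already-shaped tensor at external storage instead of copying
// into it; only the storage pointer changes, the shape metadata stays.
static void rebindHost(MNN::Tensor* t, float* data) {
    t->buffer().host = reinterpret_cast<uint8_t*>(data);
}

With such a helper, each loop iteration would alias mMatrixA, mMatrixB and mMatrixC to input0Ptr/input1Ptr/outputPtr plus the per-batch stride and then call unit.mMatMul->onExecute as before.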
|
||||
|
|
|
@ -25,8 +25,7 @@ public:
|
|||
auto srcData = input->host<srcT>();
|
||||
auto dstData = output->host<dstT>();
|
||||
const auto inputDataSize = input->elementSize();
|
||||
const auto outputDataSize = output->elementSize();
|
||||
MNN_ASSERT(inputDataSize == outputDataSize);
|
||||
MNN_ASSERT(inputDataSize == output->elementSize());
|
||||
for (int i = 0; i < inputDataSize; i++) {
|
||||
dstData[i] = static_cast<dstT>(srcData[i]);
|
||||
}
|
||||
|
@ -46,8 +45,7 @@ public:
|
|||
auto srcData = input->host<int>();
|
||||
auto dstData = output->host<int>();
|
||||
const auto inputDataSize = input->elementSize();
|
||||
const auto outputDataSize = output->elementSize();
|
||||
MNN_ASSERT(inputDataSize == outputDataSize);
|
||||
MNN_ASSERT(inputDataSize == output->elementSize());
|
||||
for (int i = 0; i < inputDataSize; i++) {
|
||||
int value = srcData[i] == 0 ? 0 : 1;
|
||||
dstData[i] = value;
|
||||
|
|
|
@ -29,8 +29,7 @@ CPUEltwise::CPUEltwise(Backend *b, EltwiseType type, std::vector<float> coef) :
|
|||
ErrorCode CPUEltwise::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
auto inputTensor = inputs[0];
|
||||
const int size = inputTensor->elementSize();
|
||||
auto outputSize = outputs[0]->elementSize();
|
||||
MNN_ASSERT(outputSize == size);
|
||||
MNN_ASSERT(outputs[0]->elementSize() == size);
|
||||
|
||||
auto outputTensor = outputs[0];
|
||||
auto outputHost = outputTensor->host<float>();
|
||||
|
|
|
@ -34,6 +34,12 @@ CPUInterp::CPUInterp(Backend *backend, int resizeType,
|
|||
}
|
||||
|
||||
CPUInterp::~CPUInterp() {
|
||||
if (mInit && mResizeType == 2) {
|
||||
backend()->onReleaseBuffer(&mWidthPosition, Backend::STATIC);
|
||||
backend()->onReleaseBuffer(&mWidthFactor, Backend::STATIC);
|
||||
backend()->onReleaseBuffer(&mHeightPosition, Backend::STATIC);
|
||||
backend()->onReleaseBuffer(&mHeightFactor, Backend::STATIC);
|
||||
}
|
||||
}
|
||||
|
||||
ErrorCode CPUInterp::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
|
@ -61,6 +67,9 @@ ErrorCode CPUInterp::onExecute(const std::vector<Tensor *> &inputs, const std::v
|
|||
}
|
||||
|
||||
ErrorCode CPUInterp::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
if (mResizeType != 2) {
|
||||
return NO_ERROR;
|
||||
}
|
||||
const int inW = inputs[0]->buffer().dim[3].extent;
|
||||
const int inH = inputs[0]->buffer().dim[2].extent;
|
||||
const int outW = outputs[0]->buffer().dim[3].extent;
|
||||
|
@ -96,9 +105,6 @@ ErrorCode CPUInterp::onResize(const std::vector<Tensor *> &inputs, const std::ve
|
|||
if (!res) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
if (mResizeType != 2) {
|
||||
return NO_ERROR;
|
||||
}
|
||||
mInit = true;
|
||||
|
||||
auto _wPosition = mWidthPosition.host<int>();
|
||||
|
|
|
@ -73,11 +73,12 @@ static void _TransposePackC4MultiThread(const float* BPtr, float* BTempPtr, int
|
|||
}
|
||||
}
|
||||
|
||||
void CPUMatMul::_scheduleForVecE(float* C, const float* A, const float* B, const float* biasPtr, int e, int l, int h) {
|
||||
void CPUMatMul::_scheduleForVecE(float* C, const float* biasPtr, int e, int l, int h) {
|
||||
int numberThread = mSupportMultiThread ? static_cast<CPUBackend*>(backend())->threadNumber() : 1;
|
||||
MNN_ASSERT(e == 1);
|
||||
if (mTransposeB) {
|
||||
mPostFunctions.emplace_back(std::make_pair([C, A, B, h, l, numberThread, biasPtr](int tId) {
|
||||
mPostFunctions.emplace_back(std::make_pair([h, l, numberThread, biasPtr](
|
||||
int tId, const float* A, const float* B, float* C) {
|
||||
auto lC4 = l / 4;
|
||||
auto lR = lC4 * 4;
|
||||
for (int y=tId; y<h; y+=numberThread) {
|
||||
|
@ -97,7 +98,8 @@ void CPUMatMul::_scheduleForVecE(float* C, const float* A, const float* B, const
|
|||
}
|
||||
}, numberThread));
|
||||
} else {
|
||||
mPostFunctions.emplace_back(std::make_pair([C, A, B, h, l, numberThread, biasPtr](int tId) {
|
||||
mPostFunctions.emplace_back(std::make_pair([h, l, numberThread, biasPtr](
|
||||
int tId, const float* A, const float* B, float* C) {
|
||||
auto hC4 = h / 4;
|
||||
auto hR = hC4 * 4;
|
||||
for (int y=tId; y<hC4; y+=numberThread) {
|
||||
|
@ -128,7 +130,7 @@ void CPUMatMul::_scheduleForVecE(float* C, const float* A, const float* B, const
|
|||
}
|
||||
}
|
||||
|
||||
void CPUMatMul::_scheduleForVec(float* C, const float* A, const float* B, const float* biasPtr, int e, int l, int h) {
|
||||
void CPUMatMul::_scheduleForVec(float* C, const float* biasPtr, int e, int l, int h) {
|
||||
int numberThread = mSupportMultiThread ? static_cast<CPUBackend*>(backend())->threadNumber() : 1;
|
||||
// TODO: Support e = 1
|
||||
MNN_ASSERT(h == 1);
|
||||
|
@ -137,7 +139,8 @@ void CPUMatMul::_scheduleForVec(float* C, const float* A, const float* B, const
|
|||
biasValue = *biasPtr;
|
||||
}
|
||||
if (mTransposeA) {
|
||||
mPostFunctions.emplace_back(std::make_pair([C, A, B, e, l, numberThread, biasValue](int tId) {
|
||||
mPostFunctions.emplace_back(std::make_pair([e, l, numberThread, biasValue](
|
||||
int tId, const float* A, const float* B, float* C) {
|
||||
auto eC4 = e / 4;
|
||||
auto eR = eC4 * 4;
|
||||
for (int y=tId; y<eC4; y+=numberThread) {
|
||||
|
@ -160,7 +163,8 @@ void CPUMatMul::_scheduleForVec(float* C, const float* A, const float* B, const
|
|||
}
|
||||
}, numberThread));
|
||||
} else {
|
||||
mPostFunctions.emplace_back(std::make_pair([C, A, B, e, l, numberThread, biasValue](int tId) {
|
||||
mPostFunctions.emplace_back(std::make_pair([e, l, numberThread, biasValue](
|
||||
int tId, const float* A, const float* B, float* C) {
|
||||
auto lC4 = l / 4;
|
||||
auto lR = lC4 * 4;
|
||||
for (int y=tId; y<e; y+=numberThread) {
|
||||
|
@ -182,11 +186,8 @@ void CPUMatMul::_scheduleForVec(float* C, const float* A, const float* B, const
|
|||
ErrorCode CPUMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
|
||||
const Tensor* A = inputs[0];
|
||||
const Tensor* B = inputs[1];
|
||||
auto APtr = A->host<float>();
|
||||
auto BPtr = B->host<float>();
|
||||
Tensor* C = outputs[0];
|
||||
auto CPtr = C->host<float>();
|
||||
MNN_ASSERT(BPtr != nullptr && APtr != nullptr && CPtr != nullptr);
|
||||
|
||||
// Fill output by zero if one of inputs is empty.
|
||||
if (A->elementSize() == 0 || B->elementSize() == 0) {
|
||||
return NO_ERROR;
|
||||
|
@ -209,7 +210,7 @@ ErrorCode CPUMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vec
|
|||
auto bias = inputs[2];
|
||||
biasPtr = bias->host<float>();
|
||||
}
|
||||
_scheduleForVec(C->host<float>(), A->host<float>(), B->host<float>(), biasPtr, e, l, h);
|
||||
_scheduleForVec(C->host<float>(), biasPtr, e, l, h);
|
||||
return NO_ERROR;
|
||||
}
|
||||
if (e == 1) {
|
||||
|
@ -218,7 +219,7 @@ ErrorCode CPUMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vec
|
|||
auto bias = inputs[2];
|
||||
biasPtr = bias->host<float>();
|
||||
}
|
||||
_scheduleForVecE(C->host<float>(), A->host<float>(), B->host<float>(), biasPtr, e, l, h);
|
||||
_scheduleForVecE(C->host<float>(), biasPtr, e, l, h);
|
||||
return NO_ERROR;
|
||||
}
|
||||
int eP, lP, hP;
|
||||
|
@ -235,7 +236,7 @@ ErrorCode CPUMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vec
|
|||
auto hC4 = UP_DIV(h, 4);
|
||||
auto lC4 = UP_DIV(l, 4);
|
||||
int numberThread = mSupportMultiThread ? ((CPUBackend*)backend())->threadNumber() : 1;
|
||||
mPreFunctions.emplace_back(std::make_pair([BPtr, BTempPtr, l, h, this] (int tId) {
|
||||
mPreFunctions.emplace_back(std::make_pair([BTempPtr, l, h, this] (int tId, const float* APtr, const float* BPtr) {
|
||||
MNNPackForMatMul_B(BTempPtr, BPtr, h, l, mTransposeB);
|
||||
} , 1));
|
||||
res = backend()->onAcquireBuffer(AT.get(), Backend::DYNAMIC);
|
||||
|
@ -246,12 +247,13 @@ ErrorCode CPUMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vec
|
|||
auto ATPtr = AT->host<float>();
|
||||
if (mTransposeA) {
|
||||
// l, e -> lC4, e, 4
|
||||
mPreFunctions.emplace_back(std::make_pair([ATPtr, APtr, e, l](int tId) {
|
||||
mPreFunctions.emplace_back(std::make_pair([ATPtr, e, l](int tId, const float* APtr, const float* BPtr) {
|
||||
MNNPackC4(ATPtr, APtr, e, l);
|
||||
}, 1));
|
||||
} else {
|
||||
// e, l -> lC4, e, 4
|
||||
mPreFunctions.emplace_back(std::make_pair([ATPtr, APtr, e, l, lC4, numberThread](int tId) {
|
||||
mPreFunctions.emplace_back(std::make_pair(
|
||||
[ATPtr, e, l, lC4, numberThread](int tId, const float* APtr, const float* BPtr) {
|
||||
_TransposePackC4MultiThread(APtr, ATPtr, tId, lC4, e, l, numberThread);
|
||||
}, numberThread));
|
||||
}
|
||||
|
@ -270,7 +272,8 @@ ErrorCode CPUMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vec
|
|||
}
|
||||
auto borigin = bias->host<float>();
|
||||
auto bdest = biasWrap->host<float>();
|
||||
mPreFunctions.emplace_back(std::make_pair([borigin, biasLength, bdest](int tId) {
|
||||
mPreFunctions.emplace_back(std::make_pair(
|
||||
[borigin, biasLength, bdest](int tId, const float* APtr, const float* BPtr) {
|
||||
::memset(bdest, 0, UP_DIV(biasLength, 4) * 4 * sizeof(float));
|
||||
::memcpy(bdest, borigin, biasLength * sizeof(float));
|
||||
}, 1));
|
||||
|
@ -292,7 +295,8 @@ ErrorCode CPUMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vec
|
|||
auto CTPtr = CT->host<float>();
|
||||
|
||||
// hC4, e, 4 -> e, h
|
||||
mPostFunctions.emplace_back(std::make_pair([CPtr, CTPtr, e, h, hC4, numberThread](int tId) {
|
||||
mPostFunctions.emplace_back(std::make_pair([CTPtr, e, h, hC4, numberThread](
|
||||
int tId, const float* APtr, const float* BPtr, float* CPtr) {
|
||||
_TransposeUnpackC4MultiThread(CPtr, CTPtr, tId, hC4, e, h, numberThread);
|
||||
}, numberThread));
|
||||
backend()->onReleaseBuffer(AT.get(), Backend::DYNAMIC);
|
||||
|
@ -308,16 +312,21 @@ ErrorCode CPUMatMul::onExecute(const std::vector<Tensor*>& inputs, const std::ve
|
|||
::memset(outputs[0]->host<char>(), 0, outputs[0]->size());
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
auto APtr = inputs[0]->host<float>();
|
||||
auto BPtr = inputs[1]->host<float>();
|
||||
auto CPtr = outputs[0]->host<float>();
|
||||
|
||||
for (auto& f : mPreFunctions) {
|
||||
MNN_CONCURRENCY_BEGIN(tId, f.second) {
|
||||
f.first(tId);
|
||||
f.first(tId, APtr, BPtr);
|
||||
}
|
||||
MNN_CONCURRENCY_END();
|
||||
}
|
||||
mComputer->onExecute();
|
||||
for (auto& f : mPostFunctions) {
|
||||
MNN_CONCURRENCY_BEGIN(tId, f.second) {
|
||||
f.first(tId);
|
||||
f.first(tId, APtr, BPtr, CPtr);
|
||||
}
|
||||
MNN_CONCURRENCY_END();
|
||||
}
|
||||
|
|
|
@ -23,13 +23,13 @@ public:
|
|||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
|
||||
private:
|
||||
void _scheduleForVec(float* C, const float* A, const float* B, const float* biasPtr, int e, int l, int h);
|
||||
void _scheduleForVecE(float* C, const float* A, const float* B, const float* biasPtr, int e, int l, int h);
|
||||
void _scheduleForVec(float* C, const float* biasPtr, int e, int l, int h);
|
||||
void _scheduleForVecE(float* C, const float* biasPtr, int e, int l, int h);
|
||||
bool mTransposeA;
|
||||
bool mTransposeB;
|
||||
bool mSupportMultiThread = false;
|
||||
std::vector<std::pair<std::function<void(int)>, int>> mPreFunctions;
|
||||
std::vector<std::pair<std::function<void(int)>, int>> mPostFunctions;
|
||||
std::vector<std::pair<std::function<void(int, const float*, const float*)>, int>> mPreFunctions;
|
||||
std::vector<std::pair<std::function<void(int, const float*, const float*, float*)>, int>> mPostFunctions;
|
||||
std::shared_ptr<StrassenMatrixComputor> mComputer;
|
||||
};
|
||||
} // namespace MNN
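The signature change above is the heart of this refactor: the scheduled lambdas no longer capture host pointers when onResize builds them; they receive A and B (and C for the post passes) as arguments when onExecute runs, so the tensors can be re-pointed between resize and execute. A compact sketch of the pattern with invented names; the real code runs each pass under MNN_CONCURRENCY and invokes the Strassen computor between the two phases, where the serial loops and placeholder comment appear here:

#include <functional>
#include <utility>
#include <vector>

// Pre passes see the inputs, post passes also see the output; the second
// member of each pair is the number of threads the pass is scheduled on.
using PreFunc  = std::function<void(int tId, const float* A, const float* B)>;
using PostFunc = std::function<void(int tId, const float* A, const float* B, float* C)>;

struct DeferredMatMul {
    std::vector<std::pair<PreFunc, int>>  pre;
    std::vector<std::pair<PostFunc, int>> post;

    // Pointers are looked up only here, never captured at schedule time.
    void run(const float* A, const float* B, float* C) {
        for (auto& f : pre) {
            for (int tId = 0; tId < f.second; ++tId) {
                f.first(tId, A, B);
            }
        }
        // ... the packed matmul kernel (mComputer->onExecute() above) runs here ...
        for (auto& f : post) {
            for (int tId = 0; tId < f.second; ++tId) {
                f.first(tId, A, B, C);
            }
        }
    }
};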
|
||||
|
|
|
@ -32,6 +32,7 @@ extern void ___CPUQuantizedSoftmaxCreator__OpType_QuantizedSoftmax__();
|
|||
extern void ___CPUPoolCreator__OpType_Pooling__();
|
||||
extern void ___CPUScatterNdCreator__OpType_ScatterNd__();
|
||||
extern void ___CPUShapeCreator__OpType_Shape__();
|
||||
extern void ___CPUPluginCreator__OpType_Plugin__();
|
||||
extern void ___CPUInt8ToFloatCreator__OpType_Int8ToFloat__();
|
||||
extern void ___CPUROIPoolingCreator__OpType_ROIPooling__();
|
||||
extern void ___CPUTopKV2Creator__OpType_TopKV2__();
|
||||
|
@ -105,6 +106,7 @@ ___CPUQuantizedSoftmaxCreator__OpType_QuantizedSoftmax__();
|
|||
___CPUPoolCreator__OpType_Pooling__();
|
||||
___CPUScatterNdCreator__OpType_ScatterNd__();
|
||||
___CPUShapeCreator__OpType_Shape__();
|
||||
___CPUPluginCreator__OpType_Plugin__();
|
||||
___CPUInt8ToFloatCreator__OpType_Int8ToFloat__();
|
||||
___CPUROIPoolingCreator__OpType_ROIPooling__();
|
||||
___CPUTopKV2Creator__OpType_TopKV2__();
|
||||
|
|
|
@ -52,8 +52,7 @@ ErrorCode CPUOneHot::onExecute(const std::vector<Tensor*>& inputs, const std::ve
|
|||
const auto indicesPtr = indices->host<int>();
|
||||
|
||||
auto dataType = onValueTensor->getType();
|
||||
auto offDataType = offValueTensor->getType();
|
||||
MNN_ASSERT(dataType == offDataType);
|
||||
MNN_ASSERT(offValueTensor->getType() == dataType);
|
||||
|
||||
if (dataType == halide_type_of<float>()) {
|
||||
OneHotImpl<float>(depth, outerSize, innerSize, indicesPtr, onValueTensor, offValueTensor, outputs[0]);
|
||||
|
|
|
@ -6,17 +6,18 @@
|
|||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifdef MNN_WITH_PLUGIN
|
||||
|
||||
#include "backend/cpu/CPUBackend.hpp"
|
||||
#include "core/AutoStorage.h"
|
||||
#include "core/Execution.hpp"
|
||||
|
||||
#ifdef MNN_WITH_PLUGIN
|
||||
#include "MNN/plugin/PluginContext.hpp"
|
||||
#include "MNN/plugin/PluginKernel.hpp"
|
||||
#endif // MNN_WITH_PLUGIN
|
||||
|
||||
namespace MNN {
|
||||
|
||||
#ifdef MNN_WITH_PLUGIN
|
||||
static std::shared_ptr<plugin::CPUComputeKernel> getCPUComputeKernel( // NOLINT
|
||||
const std::string& name) { // NOLINT
|
||||
return std::shared_ptr<plugin::CPUComputeKernel>( // NOLINT
|
||||
|
@ -55,12 +56,14 @@ ErrorCode CPUPlugin::onExecute(const std::vector<Tensor*>& inputs, // NOLINT
|
|||
return INVALID_VALUE;
|
||||
}
|
||||
}
|
||||
#endif // MNN_WITH_PLUGIN
|
||||
|
||||
class CPUPluginCreator : public CPUBackend::Creator {
|
||||
public:
|
||||
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, // NOLINT
|
||||
const std::vector<Tensor*>& outputs, // NOLINT
|
||||
const MNN::Op* op, Backend* backend) const {
|
||||
#ifdef MNN_WITH_PLUGIN
|
||||
MNN_ASSERT(op->type() == OpType_Plugin);
|
||||
// Plugin op should have inputs or outputs, or both of them.
|
||||
MNN_CHECK(inputs.size() > 0 || outputs.size() > 0, // NOLINT
|
||||
|
@ -76,11 +79,13 @@ public:
|
|||
ctx->setAttr(attr->key()->str(), attr);
|
||||
}
|
||||
return new CPUPlugin(std::move(ctx));
|
||||
#else
|
||||
MNN_ERROR("Plugin is not supported. Please recompile with `MNN_WITH_PLUGIN` enabled.");
|
||||
return nullptr;
|
||||
#endif // MNN_WITH_PLUGIN
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_CPU_OP_CREATOR(CPUPluginCreator, OpType_Plugin);
|
||||
|
||||
} // namespace MNN
|
||||
|
||||
#endif // MNN_WITH_PLUGIN
|
||||
|
|
|
@ -101,7 +101,6 @@ static void poolingMax(const float *channelInput, int inputWidth, int inputHeigh
|
|||
channelInput + (padTop * strideHeight - padHeight) * inputStep4 + (padLeft * strideWidth - padWidth) * 4;
|
||||
float *lineOutput = channelOutput + padTop * outputStep4 + padLeft * 4;
|
||||
int wCount = padRight - padLeft;
|
||||
int iwStart = -padWidth + padLeft * strideWidth;
|
||||
int wCountC4 = wCount / 4;
|
||||
int wCountRemain = wCount - wCountC4 * 4;
|
||||
int strideWidthFuse = strideWidth4 * 4;
|
||||
|
|
|
@ -54,11 +54,8 @@ ErrorCode CPUQuantizedAdd::onResize(const std::vector<Tensor *> &inputs, const s
|
|||
mLeftShiftResult1 = (1 << leftShift) * ((1 << leftShift1));
|
||||
mLeftShiftResult2 = (1 << leftShift) * ((1 << leftShift2));
|
||||
|
||||
const int left1 = leftShift + leftShift1;
|
||||
const int left2 = leftShift + leftShift2;
|
||||
|
||||
MNN_ASSERT(left1 == leftShift);
|
||||
MNN_ASSERT(left2 == leftShift);
|
||||
MNN_ASSERT(leftShift + leftShift1 == leftShift);
|
||||
MNN_ASSERT(leftShift + leftShift2 == leftShift);
|
||||
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
|
|
@ -62,7 +62,6 @@ ErrorCode CPUROIPooling::onExecute(const std::vector<Tensor *> &inputs, const st
|
|||
auto ow = output->width(), oh = output->height(), os = ow * oh * 4;
|
||||
auto slice = UP_DIV(input->channel(), 4);
|
||||
auto numROI = inputs[1]->batch();
|
||||
auto batchSize = input->batch();
|
||||
|
||||
for (int n = 0; n < numROI; ++n) {
|
||||
auto batchOutput = output->host<float>() + output->buffer().dim[0].stride * n;
|
||||
|
@ -72,7 +71,7 @@ ErrorCode CPUROIPooling::onExecute(const std::vector<Tensor *> &inputs, const st
|
|||
int y1 = round(roiPtr[2] * mSpatialScale);
|
||||
int x2 = round(roiPtr[3] * mSpatialScale);
|
||||
int y2 = round(roiPtr[4] * mSpatialScale);
|
||||
MNN_ASSERT(roi < batchSize);
|
||||
MNN_ASSERT(roi < input->batch());
|
||||
|
||||
int roiW = max(x2 - x1 + 1, 1);
|
||||
int roiH = max(y2 - y1 + 1, 1);
|
||||
|
|
|
@ -10,11 +10,10 @@
|
|||
namespace MNN {
|
||||
|
||||
ErrorCode CPUSelect::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
auto inSize0 = inputs[0]->elementSize();
|
||||
auto inSize1 = inputs[1]->elementSize();
|
||||
auto inSize2 = inputs[2]->elementSize();
|
||||
auto outSize = outputs[0]->elementSize();
|
||||
MNN_ASSERT(inSize0 == outSize);
|
||||
MNN_ASSERT(inputs[0]->elementSize() == outSize);
|
||||
MNN_ASSERT(inSize1 == 1 || inSize1 == outSize);
|
||||
MNN_ASSERT(inSize2 == 1 || inSize2 == outSize);
|
||||
auto output = outputs[0]->host<float>();
|
||||
|
|
|
@ -9,6 +9,8 @@
|
|||
#include "backend/cpu/CPUTopKV2.hpp"
|
||||
#include "backend/cpu/CPUBackend.hpp"
|
||||
#include "core/Macro.h"
|
||||
#include "core/Concurrency.h"
|
||||
#include "backend/cpu/compute/CommonOptFunction.h"
|
||||
|
||||
namespace MNN {
|
||||
|
||||
|
@ -98,8 +100,60 @@ ErrorCode CPUTopKV2::onExecute(const std::vector<Tensor*>& inputs, const std::ve
|
|||
const int inputDimension = inputTensor->buffer().dimensions;
|
||||
|
||||
const int rowSize = inputTensor->buffer().dim[inputDimension - 1].extent;
|
||||
const int rowC4Blocks = rowSize / 4;
|
||||
const int rowRemain = rowSize % 4;
|
||||
const int rowC4ElementSize = rowC4Blocks * 4;
|
||||
MNN_ASSERT(k <= rowSize);
|
||||
const int numRows = inputTensor->elementSize() / rowSize;
|
||||
|
||||
if (k == 1) {
|
||||
if (halide_type_float == inputTensor->getType().code) {
|
||||
float* inputData = inputTensor->host<float>();
|
||||
float* topkData = outputData->host<float>();
|
||||
int32_t* indicesData = outputIndices->host<int32_t>();
|
||||
|
||||
MNN_CONCURRENCY_BEGIN(i, numRows) {
|
||||
float* inputRowData = inputData + i * rowSize;
|
||||
float* rowTopkData = topkData + i * k;
|
||||
int32_t* rowTopkIndexData = indicesData + i * k;
|
||||
MNNVectorTop1Float(inputRowData, rowTopkData, rowTopkIndexData, rowC4Blocks);
|
||||
for (int j = 0; j < rowRemain; j++) {
|
||||
int index = rowC4ElementSize + j;
|
||||
float value = inputRowData[index];
|
||||
if (value > rowTopkData[0]) {
|
||||
rowTopkData[0] = value;
|
||||
rowTopkIndexData[0] = index;
|
||||
}
|
||||
}
|
||||
}
|
||||
MNN_CONCURRENCY_END();
|
||||
} else if (halide_type_int == inputTensor->getType().code && 32 == inputTensor->getType().bits) {
|
||||
int32_t* inputData = inputTensor->host<int32_t>();
|
||||
int32_t* topkData = outputData->host<int32_t>();
|
||||
int32_t* indicesData = outputIndices->host<int32_t>();
|
||||
MNN_CONCURRENCY_BEGIN(i, numRows) {
|
||||
int32_t* inputRowData = inputData + i * rowSize;
|
||||
int32_t* rowTopkData = topkData + i * k;
|
||||
int32_t* rowTopkIndexData = indicesData + i * k;
|
||||
MNNVectorTop1Int32(inputRowData, rowTopkData, rowTopkIndexData, rowC4Blocks);
|
||||
for (int j = 0; j < rowRemain; j++) {
|
||||
int index = rowC4ElementSize + j;
|
||||
int32_t value = inputRowData[index];
|
||||
if (value > rowTopkData[0]) {
|
||||
rowTopkData[0] = value;
|
||||
rowTopkIndexData[0] = index;
|
||||
}
|
||||
}
|
||||
}
|
||||
MNN_CONCURRENCY_END();
|
||||
} else {
|
||||
MNN_PRINT("TopKV2 data type not supported\n");
|
||||
MNN_ASSERT(false);
|
||||
}
|
||||
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
if (halide_type_float == inputTensor->getType().code) {
|
||||
auto inputData = inputTensor->host<float>();
|
||||
auto topkData = outputData->host<float>();
|
||||
|
|
|
@ -26,8 +26,7 @@ CPUUnary::CPUUnary(Backend *b, UnaryOpOperation type) : MNN::Execution(b), mType
|
|||
|
||||
ErrorCode CPUUnary::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
MNN_ASSERT(1 == outputs.size());
|
||||
auto dtype = inputs[0]->getType();
|
||||
MNN_ASSERT(dtype == halide_type_of<float>() || dtype == halide_type_of<int32_t>());
|
||||
MNN_ASSERT(inputs[0]->getType() == halide_type_of<float>() || inputs[0]->getType() == halide_type_of<int32_t>());
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
|
|
|
@ -13,7 +13,6 @@ namespace MNN {
|
|||
|
||||
ErrorCode CPUWhere::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
|
||||
auto& ib = inputs[0]->buffer();
|
||||
auto& ob = outputs[0]->buffer();
|
||||
int32_t* inputData = inputs[0]->host<int32_t>();
|
||||
auto outputData = outputs[0]->host<int32_t>();
|
||||
auto inputTotal = inputs[0]->elementSize();
|
||||
|
@ -25,7 +24,7 @@ ErrorCode CPUWhere::onExecute(const std::vector<Tensor*>& inputs, const std::vec
|
|||
}
|
||||
}
|
||||
|
||||
MNN_ASSERT(ob.dim[0].extent == trueVec.size());
|
||||
MNN_ASSERT(outputs[0]->batch() == trueVec.size());
|
||||
for (int i = 0; i < trueVec.size(); i++) {
|
||||
int index = trueVec[i];
|
||||
for (int j = 0; j < ib.dimensions; j++) {
|
||||
|
|
|
@ -191,10 +191,7 @@ ThreadPool::ThreadPool(int numberThread) {
|
|||
}
|
||||
|
||||
ThreadPool::~ThreadPool() {
|
||||
{
|
||||
std::lock_guard<std::mutex> _l(mQueueMutex);
|
||||
mStop = true;
|
||||
}
|
||||
mCondition.notify_all();
|
||||
for (auto& worker : mWorkers) {
|
||||
worker.join();
|
||||
|
@ -234,10 +231,8 @@ void ThreadPool::active() {
|
|||
if (nullptr == gInstance) {
|
||||
return;
|
||||
}
|
||||
{
|
||||
std::lock_guard<std::mutex> _l(gInstance->mQueueMutex);
|
||||
gInstance->mActiveCount++;
|
||||
}
|
||||
std::lock_guard<std::mutex> _l(gInstance->mQueueMutex);
|
||||
gInstance->mCondition.notify_all();
|
||||
}
|
||||
void ThreadPool::deactive() {
|
||||
|
|
|
@ -0,0 +1,83 @@
|
|||
//
|
||||
// MNNVectorTop1Float.S
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2020/12/08.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifdef __arm__
|
||||
#ifndef __aarch64__
|
||||
|
||||
#include "MNNAsmGlobal.h"
|
||||
|
||||
.text
|
||||
.align 5
|
||||
|
||||
asm_function MNNVectorTop1Float
|
||||
// void MNNVectorTop1Float(float* input, float* maxValue, int32_t* maxIndex, size_t inputCountUnit);
|
||||
|
||||
push {r4-r11, lr}
|
||||
|
||||
// Auto: r0: input, r1: maxValue, r2: maxIndex, r3: inputCountUnit
|
||||
|
||||
// q15 maxValue
|
||||
vld1.f32 {q15}, [r0]
|
||||
|
||||
// q14 maxIndex
|
||||
mov r11, #0
|
||||
vmov.s32 d28[0], r11
|
||||
mov r11, #1
|
||||
vmov.s32 d28[1], r11
|
||||
mov r11, #2
|
||||
vmov.s32 d29[0], r11
|
||||
mov r11, #3
|
||||
vmov.s32 d29[1], r11
|
||||
|
||||
// q11 current index
|
||||
vmov.s32 q11, q14
|
||||
|
||||
// all 4, increment
|
||||
mov r11, #4
|
||||
vmov.s32 d20[0], r11
|
||||
vmov.s32 d20[1], r11
|
||||
vmov.s32 d21[0], r11
|
||||
vmov.s32 d21[1], r11
|
||||
|
||||
|
||||
cmp r3, #0
|
||||
beq End
|
||||
|
||||
Loop:
|
||||
vld1.f32 {q13}, [r0]!
|
||||
|
||||
vcgt.f32 q12, q13, q15
|
||||
vbit.f32 q15, q13, q12
|
||||
vbit.s32 q14, q11, q12
|
||||
|
||||
vadd.s32 q11, q11, q10
|
||||
subs r3, r3, #1
|
||||
|
||||
bne Loop
|
||||
|
||||
// reduce result to single value and index
|
||||
vcgt.f32 d24, d31, d30
|
||||
vbit.f32 d30, d31, d24
|
||||
vbit.s32 d28, d29, d24
|
||||
|
||||
vtrn.f32 d30, d31
|
||||
vtrn.s32 d28, d29
|
||||
|
||||
vcgt.f32 d24, d31, d30
|
||||
vbit.f32 d30, d31, d24
|
||||
vbit.s32 d28, d29, d24
|
||||
|
||||
vst1.f32 d30[0], [r1]
|
||||
vst1.s32 d28[0], [r2]
|
||||
|
||||
End:
|
||||
pop {r4-r11, pc}
|
||||
|
||||
|
||||
#endif
|
||||
#endif
|
|
@ -0,0 +1,83 @@
|
|||
//
|
||||
// MNNVectorTop1Int32.S
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2020/12/08.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifdef __arm__
|
||||
#ifndef __aarch64__
|
||||
|
||||
#include "MNNAsmGlobal.h"
|
||||
|
||||
.text
|
||||
.align 5
|
||||
|
||||
asm_function MNNVectorTop1Int32
|
||||
// void MNNVectorTop1Int32(int32_t* input, int32_t* maxValue, int32_t* maxIndex, size_t inputCountUnit);
|
||||
|
||||
push {r4-r11, lr}
|
||||
|
||||
// Auto: r0: input, r1: maxValue, r2: maxIndex, r3: inputCountUnit
|
||||
|
||||
// q15 maxValue
|
||||
vld1.s32 {q15}, [r0]
|
||||
|
||||
// q14 maxIndex
|
||||
mov r11, #0
|
||||
vmov.s32 d28[0], r11
|
||||
mov r11, #1
|
||||
vmov.s32 d28[1], r11
|
||||
mov r11, #2
|
||||
vmov.s32 d29[0], r11
|
||||
mov r11, #3
|
||||
vmov.s32 d29[1], r11
|
||||
|
||||
// q11 current index
|
||||
vmov.s32 q11, q14
|
||||
|
||||
// all 4, increment
|
||||
mov r11, #4
|
||||
vmov.s32 d20[0], r11
|
||||
vmov.s32 d20[1], r11
|
||||
vmov.s32 d21[0], r11
|
||||
vmov.s32 d21[1], r11
|
||||
|
||||
|
||||
cmp r3, #0
|
||||
beq End
|
||||
|
||||
Loop:
|
||||
vld1.s32 {q13}, [r0]!
|
||||
|
||||
vcgt.s32 q12, q13, q15
|
||||
vbit.s32 q15, q13, q12
|
||||
vbit.s32 q14, q11, q12
|
||||
|
||||
vadd.s32 q11, q11, q10
|
||||
subs r3, r3, #1
|
||||
|
||||
bne Loop
|
||||
|
||||
// reduce result to single value and index
|
||||
vcgt.s32 d24, d31, d30
|
||||
vbit.s32 d30, d31, d24
|
||||
vbit.s32 d28, d29, d24
|
||||
|
||||
vtrn.s32 d30, d31
|
||||
vtrn.s32 d28, d29
|
||||
|
||||
vcgt.s32 d24, d31, d30
|
||||
vbit.s32 d30, d31, d24
|
||||
vbit.s32 d28, d29, d24
|
||||
|
||||
vst1.s32 d30[0], [r1]
|
||||
vst1.s32 d28[0], [r2]
|
||||
|
||||
End:
|
||||
pop {r4-r11, pc}
|
||||
|
||||
|
||||
#endif
|
||||
#endif
|
|
@ -16,9 +16,11 @@
|
|||
asm_function MNNPackedMatMulRemain
|
||||
//void MNNPackedMatMulRemain(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, float* cache, const float* postParameters, const float* bias);
|
||||
//Auto x0: C, x1:A, x2:B, x3:eSize, x4:parameter, x5: cache, x6:postParameters, x7:bias
|
||||
str x19, [sp, #-8]
|
||||
str x20, [sp, #-16]
|
||||
str x21, [sp, #-24]
|
||||
sub sp, sp, #32
|
||||
str x19, [sp, #0]
|
||||
str x20, [sp, #8]
|
||||
str x21, [sp, #16]
|
||||
add sp, sp, #32
|
||||
ldr x11, [x4, #0] // aStride
|
||||
ldr x9, [x4, #8] // l
|
||||
ldr x10, [x4, #16] // h
|
||||
|
@ -530,9 +532,11 @@ LoopE1:
|
|||
|
||||
|
||||
End:
|
||||
ldr x19, [sp, #-8]
|
||||
ldr x20, [sp, #-16]
|
||||
ldr x21, [sp, #-24]
|
||||
sub sp, sp, #32
|
||||
ldr x19, [sp, #0]
|
||||
ldr x20, [sp, #8]
|
||||
ldr x21, [sp, #16]
|
||||
add sp, sp, #32
|
||||
|
||||
ret
|
||||
|
||||
|
|
|
@ -0,0 +1,83 @@
|
|||
//
|
||||
// MNNVectorTop1Float.S
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2020/12/09.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifdef __aarch64__
|
||||
|
||||
#include "MNNAsmGlobal.h"
|
||||
|
||||
.text
|
||||
.align 5
|
||||
|
||||
asm_function MNNVectorTop1Float
|
||||
// void MNNVectorTop1Float(float* input, float* maxValue, int32_t* maxIndex, size_t inputCountUnit);
|
||||
|
||||
// Auto: x0: input, x1: maxValue, x2: maxIndex, x3: inputCountUnit
|
||||
|
||||
// v30 maxValue
|
||||
ld1 {v30.4s}, [x0]
|
||||
|
||||
// v29 maxIndex
|
||||
mov w11, #0
|
||||
mov v29.s[0], w11
|
||||
mov w11, #1
|
||||
mov v29.s[1], w11
|
||||
mov w11, #2
|
||||
mov v29.s[2], w11
|
||||
mov w11, #3
|
||||
mov v29.s[3], w11
|
||||
|
||||
// v28 current index
|
||||
mov v28.4s, v29.4s
|
||||
|
||||
// v27, all 4, increment
|
||||
mov w11, #4
|
||||
mov v27.s[0], w11
|
||||
mov v27.s[1], w11
|
||||
mov v27.s[2], w11
|
||||
mov v27.s[3], w11
|
||||
|
||||
|
||||
cmp x3, #0
|
||||
beq End
|
||||
|
||||
Loop:
|
||||
ld1 {v26.4s}, [x0], #16
|
||||
|
||||
fcmgt v25.4s, v26.4s, v30.4s
|
||||
bit v30.16b, v26.16b, v25.16b
|
||||
bit v29.16b, v28.16b, v25.16b
|
||||
|
||||
add v28.4s, v28.4s, v27.4s
|
||||
subs x3, x3, #1
|
||||
|
||||
bne Loop
|
||||
|
||||
// reduce result to single value and index
|
||||
mov v20.d[0], v30.d[1]
|
||||
mov v21.d[0], v29.d[1]
|
||||
|
||||
fcmgt v25.2s, v20.2s, v30.2s
|
||||
bit v30.8b, v20.8b, v25.8b
|
||||
bit v29.8b, v21.8b, v25.8b
|
||||
|
||||
mov v20.s[0], v30.s[1]
|
||||
mov v21.s[0], v29.s[1]
|
||||
|
||||
fcmgt v25.2s, v20.2s, v30.2s
|
||||
bit v30.8b, v20.8b, v25.8b
|
||||
bit v29.8b, v21.8b, v25.8b
|
||||
|
||||
st1 {v30.s}[0], [x1]
|
||||
st1 {v29.s}[0], [x2]
|
||||
|
||||
|
||||
End:
|
||||
ret
|
||||
|
||||
|
||||
#endif
|
|
@ -0,0 +1,83 @@
|
|||
//
|
||||
// MNNVectorTop1Int32.S
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2020/12/09.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifdef __aarch64__
|
||||
|
||||
#include "MNNAsmGlobal.h"
|
||||
|
||||
.text
|
||||
.align 5
|
||||
|
||||
asm_function MNNVectorTop1Int32
|
||||
// void MNNVectorTop1Int32(int32_t* input, int32_t* maxValue, int32_t* maxIndex, size_t inputCountUnit);
|
||||
|
||||
// Auto: x0: input, x1: maxValue, x2: maxIndex, x3: inputCountUnit
|
||||
|
||||
// v30 maxValue
|
||||
ld1 {v30.4s}, [x0]
|
||||
|
||||
// v29 maxIndex
|
||||
mov w11, #0
|
||||
mov v29.s[0], w11
|
||||
mov w11, #1
|
||||
mov v29.s[1], w11
|
||||
mov w11, #2
|
||||
mov v29.s[2], w11
|
||||
mov w11, #3
|
||||
mov v29.s[3], w11
|
||||
|
||||
// v28 current index
|
||||
mov v28.4s, v29.4s
|
||||
|
||||
// v27, all 4, increment
|
||||
mov w11, #4
|
||||
mov v27.s[0], w11
|
||||
mov v27.s[1], w11
|
||||
mov v27.s[2], w11
|
||||
mov v27.s[3], w11
|
||||
|
||||
|
||||
cmp x3, #0
|
||||
beq End
|
||||
|
||||
Loop:
|
||||
ld1 {v26.4s}, [x0], #16
|
||||
|
||||
cmgt v25.4s, v26.4s, v30.4s
|
||||
bit v30.16b, v26.16b, v25.16b
|
||||
bit v29.16b, v28.16b, v25.16b
|
||||
|
||||
add v28.4s, v28.4s, v27.4s
|
||||
subs x3, x3, #1
|
||||
|
||||
bne Loop
|
||||
|
||||
// reduce result to single value and index
|
||||
mov v20.d[0], v30.d[1]
|
||||
mov v21.d[0], v29.d[1]
|
||||
|
||||
cmgt v25.2s, v20.2s, v30.2s
|
||||
bit v30.8b, v20.8b, v25.8b
|
||||
bit v29.8b, v21.8b, v25.8b
|
||||
|
||||
mov v20.s[0], v30.s[1]
|
||||
mov v21.s[0], v29.s[1]
|
||||
|
||||
cmgt v25.2s, v20.2s, v30.2s
|
||||
bit v30.8b, v20.8b, v25.8b
|
||||
bit v29.8b, v21.8b, v25.8b
|
||||
|
||||
st1 {v30.s}[0], [x1]
|
||||
st1 {v29.s}[0], [x2]
|
||||
|
||||
|
||||
End:
|
||||
ret
|
||||
|
||||
|
||||
#endif
|
|
@ -865,4 +865,37 @@ void MNNAxByClampBroadcastC4(float* C, const float* A, const float* B, size_t wi
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
void MNNVectorTop1Float(float* input, float* maxValue, int32_t* maxIndex, size_t inputCountUnit) {
|
||||
float maxV = input[0];
|
||||
int maxIdx = 0;
|
||||
for (int i = 0; i < inputCountUnit; i++) {
|
||||
int offset = i * UNIT;
|
||||
for (int j = 0; j < UNIT; j++) {
|
||||
if (input[offset + j] > maxV) {
|
||||
maxV = input[offset + j];
|
||||
maxIdx = offset + j;
|
||||
}
|
||||
}
|
||||
}
|
||||
maxValue[0] = maxV;
|
||||
maxIndex[0] = maxIdx;
|
||||
}
|
||||
|
||||
void MNNVectorTop1Int32(int32_t* input, int32_t* maxValue, int32_t* maxIndex, size_t inputCountUnit) {
|
||||
int32_t maxV = input[0];
|
||||
int maxIdx = 0;
|
||||
for (int i = 0; i < inputCountUnit; i++) {
|
||||
int offset = i * UNIT;
|
||||
for (int j = 0; j < UNIT; j++) {
|
||||
if (input[offset + j] > maxV) {
|
||||
maxV = input[offset + j];
|
||||
maxIdx = offset + j;
|
||||
}
|
||||
}
|
||||
}
|
||||
maxValue[0] = maxV;
|
||||
maxIndex[0] = maxIdx;
|
||||
}
|
||||
|
||||
#endif
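A usage sketch for the reference helpers above, mirroring the k == 1 path in CPUTopKV2::onExecute: the helper scans whole 4-element blocks and the caller merges the at-most-three trailing elements by hand. top1Row is an invented wrapper and assumes rowSize >= 4 so at least one full block is scanned.

#include <cstdint>
#include "backend/cpu/compute/CommonOptFunction.h"

// Find the largest value and its index in a row of rowSize floats.
void top1Row(float* row, int rowSize, float* value, int32_t* index) {
    const int blocks = rowSize / 4;   // whole 4-element units
    const int remain = rowSize % 4;   // 0..3 trailing elements
    MNNVectorTop1Float(row, value, index, blocks);
    for (int j = 0; j < remain; ++j) {
        const int i = blocks * 4 + j;
        if (row[i] > value[0]) {
            value[0] = row[i];
            index[0] = i;
        }
    }
}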
|
||||
|
|
|
@ -91,6 +91,9 @@ void MNNAxByClampBroadcastC4(float* C, const float* A, const float* B, size_t wi
|
|||
|
||||
// dim: 4-element, sizeDW, sizeDH, strideSW, strideDH
|
||||
void MNNTranspose32Bit(int32_t* dstO, const int32_t* srcO, int32_t* dim); // not C4
|
||||
|
||||
void MNNVectorTop1Float(float* input, float* maxValue, int32_t* maxIndex, size_t inputCountUnit);
|
||||
void MNNVectorTop1Int32(int32_t* input, int32_t* maxValue, int32_t* maxIndex, size_t inputCountUnit);
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -354,7 +354,7 @@ ErrorCode ConvInt83x3::onExecute(const std::vector<Tensor *> &inputs, const std:
|
|||
const int ow = output->width(), oh = output->height();
|
||||
const int iw = input->width(), ih = input->height();
|
||||
const int dc_4 = UP_DIV(output->channel(), 4);
|
||||
const int padX = mPadX, padY = mPadY, kernelSize = 9;
|
||||
const int padX = mPadX, padY = mPadY;
|
||||
|
||||
const bool combine1D2D = (mStrategy.unitType == ComputeStrategy::D2_D1);
|
||||
const bool offline = (mStrategy.transPhase == ComputeStrategy::Offline);
|
||||
|
@ -373,7 +373,6 @@ ErrorCode ConvInt83x3::onExecute(const std::vector<Tensor *> &inputs, const std:
|
|||
for (int b = 0; b < input->batch(); ++b) {
|
||||
auto src = input->host<int8_t>() + b * input->stride(0);
|
||||
auto dst = mTempInput->host<int8_t>() + b * mTempInput->stride(0);
|
||||
const int threadNumber = ((CPUBackend*)backend())->threadNumber();
|
||||
const int ic8 = UP_DIV(input->channel(), 8), ic4 = UP_DIV(input->channel(), 4);
|
||||
// C4 to C8
|
||||
MNN_CONCURRENCY_BEGIN(tId, threadNumber) {
|
||||
|
@ -592,7 +591,7 @@ ErrorCode ConvInt83x3::onExecute(const std::vector<Tensor *> &inputs, const std:
|
|||
auto gemmConcurrencyFunc = [=, &gemmFunc](int xC, int gemmNum, const int8_t* srcOrigin, const int8_t* weight, float* dstOrigin) {
|
||||
MNN_CONCURRENCY_BEGIN(tId, threadNumber) {
|
||||
const int step = UP_DIV(gemmNum, threadNumber);
|
||||
gemmFunc(xC, tId * step, ALIMIN((tId + 1) * step, gemmNum), srcOrigin, weight, dstOrigin);
|
||||
gemmFunc(xC, (int)tId * step, ALIMIN((tId + 1) * step, gemmNum), srcOrigin, weight, dstOrigin);
|
||||
}
|
||||
MNN_CONCURRENCY_END()
|
||||
};
|
||||
|
|
|
@ -267,7 +267,6 @@ ErrorCode Convolution1x1Strassen::onExecute(const std::vector<Tensor *> &inputs,
|
|||
MNN_CONCURRENCY_END();
|
||||
|
||||
auto batch = input->batch();
|
||||
auto matrixSizeE = output->height() * output->width() * input->batch();
|
||||
auto outputPlane = output->height() * output->width();
|
||||
auto ocC4 = UP_DIV(output->channel(), 4);
|
||||
MNN_CONCURRENCY_BEGIN(y, ocC4) {
|
||||
|
|
|
@ -15,8 +15,7 @@ namespace MNN {
|
|||
ConvolutionGroup::ConvolutionGroup(Backend *b, const std::vector<std::shared_ptr<Execution>> &subConvolution)
|
||||
: MNN::Execution(b) {
|
||||
mSubConvolution = subConvolution;
|
||||
auto group = subConvolution.size();
|
||||
MNN_ASSERT(group > 1);
|
||||
MNN_ASSERT(subConvolution.size() > 1);
|
||||
|
||||
mInputRaw.reset(new Tensor(4));
|
||||
mInputUnit.reset(new Tensor(4, Tensor::CAFFE_C4));
|
||||
|
|
|
@ -118,7 +118,6 @@ ConvolutionTiledExecutor::~ConvolutionTiledExecutor() {
|
|||
ErrorCode ConvolutionTiledExecutorBasic::onResize(const std::vector<Tensor*>& inputs,
|
||||
const std::vector<Tensor*>& outputs) {
|
||||
CPUConvolution::onResize(inputs, outputs);
|
||||
auto layer = mCommon;
|
||||
auto input = inputs[0];
|
||||
auto weight = inputs[1];
|
||||
Tensor* bias = nullptr;
|
||||
|
|
|
@ -114,7 +114,6 @@ ErrorCode ConvolutionWinograd::onExecute(const std::vector<Tensor *> &inputs, co
|
|||
MNNGetMatMulPackMode(&ePack, &lPack, &hPack);
|
||||
|
||||
auto srcUnit2 = srcUnit * srcUnit;
|
||||
auto dstUnit2 = dstUnit * dstUnit;
|
||||
|
||||
int ow = output->width();
|
||||
int oh = output->height();
|
||||
|
@ -137,7 +136,6 @@ ErrorCode ConvolutionWinograd::onExecute(const std::vector<Tensor *> &inputs, co
|
|||
int tileCount = UP_DIV(totalCount, ePack);
|
||||
int eRemain = totalCount % ePack;
|
||||
threadNumber = std::min(threadNumber, tileCount);
|
||||
auto hDiv = MNNGetC4DivNumber(hPack);
|
||||
std::vector<size_t> parameters(6);
|
||||
parameters[0] = eRemain * sizeof(float);
|
||||
parameters[1] = input->channel();
|
||||
|
@ -277,7 +275,6 @@ ErrorCode ConvolutionWinograd::onExecute(const std::vector<Tensor *> &inputs, co
|
|||
for (int z = 0; z < dc_4; ++z) {
|
||||
auto dstZAddr = dstStart + z * dstZStep;
|
||||
auto srcZ = srcXi + z * srcZStep;
|
||||
auto biasZ = bias + 4 * z;
|
||||
// Transform
|
||||
for (int i = 0; i < srcUnit; ++i) {
|
||||
mDestTransform(srcZ + i * unitStep, midBuffer0 + i * dstUnit * 4,
|
||||
|
@ -324,7 +321,7 @@ ErrorCode ConvolutionWinograd::onExecute(const std::vector<Tensor *> &inputs, co
|
|||
MNN_CONCURRENCY_END();
|
||||
|
||||
MNN_CONCURRENCY_BEGIN(tId, threadNumber) {
|
||||
for (int dy=tId; dy < dc_4; dy += threadNumber) {
|
||||
for (int dy=(int)tId; dy < dc_4; dy += threadNumber) {
|
||||
postFunction(dstOrigin + 4 * ow * oh * dy, bias + 4* dy, ow * oh, 1);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,20 +21,7 @@
|
|||
#endif
|
||||
|
||||
bool MNNReorder4x4ByPlatform(float* dst, size_t number) {
|
||||
for (int i = 0; i < number; ++i) {
|
||||
auto addr = dst + 16 * i;
|
||||
auto s0 = _mm_loadu_ps(addr + 4 * 0);
|
||||
auto s1 = _mm_loadu_ps(addr + 4 * 1);
|
||||
auto s2 = _mm_loadu_ps(addr + 4 * 2);
|
||||
auto s3 = _mm_loadu_ps(addr + 4 * 3);
|
||||
_MM_TRANSPOSE4_PS(s0, s1, s2, s3);
|
||||
|
||||
_mm_storeu_ps(addr + 4 * 0, s0);
|
||||
_mm_storeu_ps(addr + 4 * 1, s1);
|
||||
_mm_storeu_ps(addr + 4 * 2, s2);
|
||||
_mm_storeu_ps(addr + 4 * 3, s3);
|
||||
}
|
||||
return true;
|
||||
return _SSE_MNNReorder4x4ByPlatform(dst, number);
|
||||
}
|
||||
|
||||
struct FunctionGroup {
|
||||
|
@ -60,6 +47,7 @@ struct FunctionGroup {
|
|||
size_t weight_depth_offset) = _SSE_MNNGemmFloatCommon_4;
|
||||
void (*MNNPackC4ForMatMul_A)(float* dest, const float* source, size_t e, size_t l,
|
||||
size_t eReal) = _SSE_MNNPackC4ForMatMul_A;
|
||||
void (*MNNPackForMatMul_B)(float* dest, const float* source, size_t h, size_t l, bool transpose) = _SSE_MNNPackForMatMul_B;
|
||||
void (*MNNPackedMatMul)(float* C, const float* A, const float* B, const size_t* parameter, float* cache,
|
||||
const float* postParameters, const float* bias) = _SSE_MNNPackedMatMul;
|
||||
void (*MNNPackedMatMulRemain)(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter,
|
||||
|
@ -144,167 +132,16 @@ void MNNMatrixSub(float* C, const float* A, const float* B, size_t widthC4, size
|
|||
gFunc.MNNMatrixSub(C, A, B, widthC4, cStride, aStride, bStride, height);
|
||||
}
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
|
||||
void MNNReluWithSlopeChannel(float* dst, const float* src, const float* slope, size_t sizeQuad, size_t depthQuad) {
|
||||
return _SSE_MNNReluWithSlopeChannel(dst, src, slope, sizeQuad, depthQuad);
|
||||
}
|
||||
|
||||
void MNNPackC4(float* dst, const float* src, size_t area, size_t depth) {
|
||||
auto areaC4 = area / 4;
|
||||
auto depthC4 = depth / 4;
|
||||
for (int z = 0; z < depthC4; ++z) {
|
||||
auto dstPlane = dst + z * area * 4;
|
||||
auto srcPlane = src + z * area * 4;
|
||||
for (int x = 0; x < areaC4; ++x) {
|
||||
auto s = srcPlane + 4 * x;
|
||||
auto d = dstPlane + 16 * x;
|
||||
auto s0 = _mm_loadu_ps(s + 0 * area);
|
||||
auto s1 = _mm_loadu_ps(s + 1 * area);
|
||||
auto s2 = _mm_loadu_ps(s + 2 * area);
|
||||
auto s3 = _mm_loadu_ps(s + 3 * area);
|
||||
|
||||
_MM_TRANSPOSE4_PS(s0, s1, s2, s3);
|
||||
|
||||
_mm_storeu_ps(d + 4 * 0, s0);
|
||||
_mm_storeu_ps(d + 4 * 1, s1);
|
||||
_mm_storeu_ps(d + 4 * 2, s2);
|
||||
_mm_storeu_ps(d + 4 * 3, s3);
|
||||
}
|
||||
}
|
||||
auto areaRemain = areaC4 * 4;
|
||||
auto depthRemain = depthC4 * 4;
|
||||
// Down
|
||||
int remain = depth - depthRemain;
|
||||
if (remain > 0) {
|
||||
float* dstPlane = depthC4 * area * 4 + dst;
|
||||
const float* srcPlane = src + depthC4 * area * 4;
|
||||
for (int x = 0; x < area; ++x) {
|
||||
for (int y = 0; y < remain; y++) {
|
||||
dstPlane[4 * x + y] = srcPlane[y * area + x];
|
||||
}
|
||||
for (int y = remain; y < 4; y++) {
|
||||
dstPlane[4 * x + y] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Right
|
||||
for (int z = 0; z < depthC4; ++z) {
|
||||
float* dstPlane = z * area * 4 + dst;
|
||||
const float* srcPlane = src + z * area * 4;
|
||||
for (int x = areaRemain; x < area; ++x) {
|
||||
float s0 = srcPlane[x];
|
||||
float s1 = srcPlane[x + area];
|
||||
float s2 = srcPlane[x + area * 2];
|
||||
float s3 = srcPlane[x + area * 3];
|
||||
_mm_store_ps(dstPlane + 4 * x, _mm_set_ps(s3, s2, s1, s0));
|
||||
}
|
||||
}
|
||||
}
|
||||
void MNNTranspose32Bit(int32_t* dstO, const int32_t* srcO, int32_t* dim) {
|
||||
int w = dim[0];
|
||||
int h = dim[1];
|
||||
int srcStride = dim[2];
|
||||
int dstStride = dim[3];
|
||||
auto wC4 = w / 4;
|
||||
auto hC4 = h / 4;
|
||||
for (int y = 0; y < hC4; ++y) {
|
||||
auto sy = (float*)srcO + 4 * y;
|
||||
auto dy = (float*)dstO + 4 * y * dstStride;
|
||||
for (int x = 0; x < wC4; ++x) {
|
||||
auto sx = sy + x * 4 * srcStride;
|
||||
auto dx = dy + 4 * x;
|
||||
auto s0 = _mm_loadu_ps(sx + srcStride * 0);
|
||||
auto s1 = _mm_loadu_ps(sx + srcStride * 1);
|
||||
auto s2 = _mm_loadu_ps(sx + srcStride * 2);
|
||||
auto s3 = _mm_loadu_ps(sx + srcStride * 3);
|
||||
_MM_TRANSPOSE4_PS(s0, s1, s2, s3);
|
||||
|
||||
_mm_storeu_ps(dx + dstStride * 0, s0);
|
||||
_mm_storeu_ps(dx + dstStride * 1, s1);
|
||||
_mm_storeu_ps(dx + dstStride * 2, s2);
|
||||
_mm_storeu_ps(dx + dstStride * 3, s3);
|
||||
}
|
||||
}
|
||||
// Down
|
||||
for (int i = hC4 * 4; i < h; ++i) {
|
||||
auto si = srcO + i;
|
||||
auto di = dstO + i * dstStride;
|
||||
for (int j = 0; j < w; ++j) {
|
||||
auto sj = si + j * srcStride;
|
||||
auto dj = di + j;
|
||||
*dj = *sj;
|
||||
}
|
||||
}
|
||||
// Right
|
||||
for (int i = 0; i < hC4 * 4; ++i) {
|
||||
auto si = srcO + i;
|
||||
auto di = dstO + i * dstStride;
|
||||
for (int j = wC4 * 4; j < w; ++j) {
|
||||
auto sj = si + j * srcStride;
|
||||
auto dj = di + j;
|
||||
*dj = *sj;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void MNNUnpackC4(float* dst, const float* src, size_t area, size_t depth) {
|
||||
auto areaC4 = area / 4;
|
||||
auto depthC4 = depth / 4;
|
||||
for (int z = 0; z < depthC4; ++z) {
|
||||
auto dstPlane = dst + z * area * 4;
|
||||
auto srcPlane = src + z * area * 4;
|
||||
for (int x = 0; x < areaC4; ++x) {
|
||||
auto s = srcPlane + 16 * x;
|
||||
auto d = dstPlane + 4 * x;
|
||||
auto s0 = _mm_loadu_ps(s + 0 * 4);
|
||||
auto s1 = _mm_loadu_ps(s + 1 * 4);
|
||||
auto s2 = _mm_loadu_ps(s + 2 * 4);
|
||||
auto s3 = _mm_loadu_ps(s + 3 * 4);
|
||||
|
||||
_MM_TRANSPOSE4_PS(s0, s1, s2, s3);
|
||||
|
||||
_mm_storeu_ps(d + 0 * area, s0);
|
||||
_mm_storeu_ps(d + 1 * area, s1);
|
||||
_mm_storeu_ps(d + 2 * area, s2);
|
||||
_mm_storeu_ps(d + 3 * area, s3);
|
||||
}
|
||||
}
|
||||
auto areaRemain = areaC4 * 4;
|
||||
auto depthRemain = depthC4 * 4;
|
||||
// Down
|
||||
int remain = depth - depthRemain;
|
||||
if (remain > 0) {
|
||||
float* dstPlane = depthC4 * area * 4 + dst;
|
||||
const float* srcPlane = src + depthC4 * area * 4;
|
||||
for (int x = 0; x < area; ++x) {
|
||||
for (int y = 0; y < remain; y++) {
|
||||
dstPlane[y * area + x] = srcPlane[4 * x + y];
|
||||
}
|
||||
}
|
||||
}
|
||||
// Right
|
||||
for (int z = 0; z < depthC4; ++z) {
|
||||
const float* srcPlane = z * area * 4 + src;
|
||||
float* dstPlane = dst + z * area * 4;
|
||||
for (int x = areaRemain; x < area; ++x) {
|
||||
for (int y = 0; y < 4; y++) {
|
||||
dstPlane[y * area + x] = srcPlane[4 * x + y];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
void MNNPackC4ForMatMul_A(float* dest, const float* source, size_t e, size_t l, size_t eReal) {
|
||||
return gFunc.MNNPackC4ForMatMul_A(dest, source, e, l, eReal);
|
||||
}
|
||||
|
||||
void MNNPackForMatMul_B(float* dest, const float* source, size_t h, size_t l, bool transpose) {
|
||||
if (!transpose) {
|
||||
MNNUnpackTranspose(dest, source, l, h);
|
||||
return;
|
||||
}
|
||||
MNNPackC4(dest, source, l, h);
|
||||
gFunc.MNNPackForMatMul_B(dest, source, h, l, transpose);
|
||||
}
|
||||
|
||||
void MNNGetMatMulPackMode(int* eP, int* lP, int* hP) {
|
||||
|
|
|
@ -169,6 +169,307 @@ static void _AVX_MNNPackedMatMul_8(float* C, const float* A, const float* B, con
|
|||
TRANPOSE_SAVE(1, 0, z0, z3, z6, z9);
|
||||
}
|
||||
}
|
||||
static void _AVX_MNNPackedMatMul_5(float* C, const float* A, const float* B, const size_t* parameter) {
|
||||
auto aStride = parameter[0] / sizeof(float);
|
||||
auto h = parameter[2];
|
||||
auto l = parameter[1];
|
||||
auto cStride = parameter[3] / sizeof(float);
|
||||
auto bExtraStride = parameter[5] / sizeof(float);
|
||||
auto bStride = bExtraStride + l * 4;
|
||||
auto hC4 = UP_DIV(h, 4);
|
||||
int lC4 = l / 4;
|
||||
int lR = lC4 * 4;
|
||||
const int hC4Unit = 4;
|
||||
int hC16 = hC4 / hC4Unit;
|
||||
int hR = hC16 * hC4Unit;
|
||||
auto src = A;
|
||||
for (int y = 0; y < hC16; ++y) {
|
||||
auto weight0 = B + (hC4Unit * y + 0) * bStride;
|
||||
auto dst0 = C + (hC4Unit * y + 0) * cStride;
|
||||
auto weight1 = B + (hC4Unit * y + 1) * bStride;
|
||||
auto dst1 = C + (hC4Unit * y + 1) * cStride;
|
||||
auto weight2 = B + (hC4Unit * y + 2) * bStride;
|
||||
auto dst2 = C + (hC4Unit * y + 2) * cStride;
|
||||
auto weight3 = B + (hC4Unit * y + 3) * bStride;
|
||||
auto dst3 = C + (hC4Unit * y + 3) * cStride;
|
||||
auto sumAvx00 = _mm256_set1_ps(0.0f);
|
||||
auto sumAvx01 = _mm256_set1_ps(0.0f);
|
||||
|
||||
auto sumAvx10 = _mm256_set1_ps(0.0f);
|
||||
auto sumAvx11 = _mm256_set1_ps(0.0f);
|
||||
|
||||
auto sumAvx20 = _mm256_set1_ps(0.0f);
|
||||
auto sumAvx21 = _mm256_set1_ps(0.0f);
|
||||
|
||||
auto sumAvx30 = _mm256_set1_ps(0.0f);
|
||||
auto sumAvx31 = _mm256_set1_ps(0.0f);
|
||||
|
||||
auto sumAvx40 = _mm256_set1_ps(0.0f);
|
||||
auto sumAvx41 = _mm256_set1_ps(0.0f);
|
||||
|
||||
auto srcUse = src;
|
||||
for (int sy = 0; sy < l; ++sy) {
|
||||
auto S0 = _mm256_broadcast_ss(srcUse + 0);
|
||||
auto S1 = _mm256_broadcast_ss(srcUse + 1);
|
||||
auto S2 = _mm256_broadcast_ss(srcUse + 2);
|
||||
auto S3 = _mm256_broadcast_ss(srcUse + 3);
|
||||
auto S4 = _mm256_broadcast_ss(srcUse + 4);
|
||||
auto W0 = _mm256_castsi256_ps(_mm256_insertf128_si256((_mm256_broadcastsi128_si256(*(__m128i*)(weight0))), *(__m128i*)(weight1), 1));
|
||||
auto W1 = _mm256_castsi256_ps(_mm256_insertf128_si256((_mm256_broadcastsi128_si256(*(__m128i*)(weight2))), *(__m128i*)(weight3), 1));
|
||||
|
||||
sumAvx00 = MNNAVXFMA(S0, W0, sumAvx00);
|
||||
sumAvx01 = MNNAVXFMA(S0, W1, sumAvx01);
|
||||
|
||||
sumAvx10 = MNNAVXFMA(S1, W0, sumAvx10);
|
||||
sumAvx11 = MNNAVXFMA(S1, W1, sumAvx11);
|
||||
|
||||
sumAvx20 = MNNAVXFMA(S2, W0, sumAvx20);
|
||||
sumAvx21 = MNNAVXFMA(S2, W1, sumAvx21);
|
||||
|
||||
sumAvx30 = MNNAVXFMA(S3, W0, sumAvx30);
|
||||
sumAvx31 = MNNAVXFMA(S3, W1, sumAvx31);
|
||||
|
||||
sumAvx40 = MNNAVXFMA(S4, W0, sumAvx40);
|
||||
sumAvx41 = MNNAVXFMA(S4, W1, sumAvx41);
|
||||
|
||||
srcUse += aStride;
|
||||
weight0 += 4;
|
||||
weight1 += 4;
|
||||
weight2 += 4;
|
||||
weight3 += 4;
|
||||
}
|
||||
_mm256_storeu_ps(dst0 + 0, _mm256_permute2f128_ps(sumAvx00, sumAvx10, 32));
|
||||
_mm256_storeu_ps(dst0 + 8, _mm256_permute2f128_ps(sumAvx20, sumAvx30, 32));
|
||||
_mm_storeu_ps(dst0 + 16, _mm256_extractf128_ps(sumAvx40, 0));
|
||||
|
||||
_mm256_storeu_ps(dst1 + 0, _mm256_permute2f128_ps(sumAvx00, sumAvx10, 49));
|
||||
_mm256_storeu_ps(dst1 + 8, _mm256_permute2f128_ps(sumAvx20, sumAvx30, 49));
|
||||
_mm_storeu_ps(dst1 + 16, _mm256_extractf128_ps(sumAvx40, 1));
|
||||
|
||||
_mm256_storeu_ps(dst2 + 0, _mm256_permute2f128_ps(sumAvx01, sumAvx11, 32));
|
||||
_mm256_storeu_ps(dst2 + 8, _mm256_permute2f128_ps(sumAvx21, sumAvx31, 32));
|
||||
_mm_storeu_ps(dst2 + 16, _mm256_extractf128_ps(sumAvx41, 0));
|
||||
|
||||
_mm256_storeu_ps(dst3 + 0, _mm256_permute2f128_ps(sumAvx01, sumAvx11, 49));
|
||||
_mm256_storeu_ps(dst3 + 8, _mm256_permute2f128_ps(sumAvx21, sumAvx31, 49));
|
||||
_mm_storeu_ps(dst3 + 16, _mm256_extractf128_ps(sumAvx41, 1));
|
||||
}
|
||||
for (int y = hR; y < hC4; ++y) {
|
||||
auto weight = B + y * bStride;
|
||||
auto dst = C + y * cStride;
|
||||
auto s0 = _mm_broadcast_ss(A + 0 * aStride + 0);
|
||||
auto s1 = _mm_broadcast_ss(A + 0 * aStride + 1);
|
||||
auto s2 = _mm_broadcast_ss(A + 0 * aStride + 2);
|
||||
auto s3 = _mm_broadcast_ss(A + 0 * aStride + 3);
|
||||
auto s4 = _mm_broadcast_ss(A + 0 * aStride + 4);
|
||||
auto w0 = _mm_loadu_ps(weight + 0 * 4);
|
||||
auto z0 = _mm_mul_ps(s0, w0);
|
||||
auto z1 = _mm_mul_ps(s1, w0);
|
||||
auto z2 = _mm_mul_ps(s2, w0);
|
||||
auto z3 = _mm_mul_ps(s3, w0);
|
||||
auto z4 = _mm_mul_ps(s4, w0);
|
||||
|
||||
for (int sy = 1; sy < l; ++sy) {
|
||||
s0 = _mm_broadcast_ss(A + sy * aStride + 0);
|
||||
s1 = _mm_broadcast_ss(A + sy * aStride + 1);
|
||||
s2 = _mm_broadcast_ss(A + sy * aStride + 2);
|
||||
s3 = _mm_broadcast_ss(A + sy * aStride + 3);
|
||||
s4 = _mm_broadcast_ss(A + sy * aStride + 4);
|
||||
w0 = _mm_loadu_ps(weight + sy * 4);
|
||||
z0 = MNNSSEFMA(s0, w0, z0);
|
||||
z1 = MNNSSEFMA(s1, w0, z1);
|
||||
z2 = MNNSSEFMA(s2, w0, z2);
|
||||
z3 = MNNSSEFMA(s3, w0, z3);
|
||||
z4 = MNNSSEFMA(s4, w0, z4);
|
||||
}
|
||||
_mm_store_ps(dst + 4 * 0, z0);
|
||||
_mm_store_ps(dst + 4 * 1, z1);
|
||||
_mm_store_ps(dst + 4 * 2, z2);
|
||||
_mm_store_ps(dst + 4 * 3, z3);
|
||||
_mm_store_ps(dst + 4 * 4, z4);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void _AVX_MNNPackedMatMul_3(float* C, const float* A, const float* B, const size_t* parameter) {
|
||||
auto aStride = parameter[0] / sizeof(float);
|
||||
auto h = parameter[2];
|
||||
auto l = parameter[1];
|
||||
auto cStride = parameter[3] / sizeof(float);
|
||||
auto bExtraStride = parameter[5] / sizeof(float);
|
||||
auto bStride = bExtraStride + l * 4;
|
||||
auto hC4 = UP_DIV(h, 4);
|
||||
int lC4 = l / 4;
|
||||
int lR = lC4 * 4;
|
||||
const int hC4Unit = 4;
|
||||
int hC16 = hC4 / hC4Unit;
|
||||
int hR = hC16 * hC4Unit;
|
||||
auto src = A;
|
||||
for (int y = 0; y < hC16; ++y) {
|
||||
auto weight0 = B + (hC4Unit * y + 0) * bStride;
|
||||
auto dst0 = C + (hC4Unit * y + 0) * cStride;
|
||||
auto weight1 = B + (hC4Unit * y + 1) * bStride;
|
||||
auto dst1 = C + (hC4Unit * y + 1) * cStride;
|
||||
auto weight2 = B + (hC4Unit * y + 2) * bStride;
|
||||
auto dst2 = C + (hC4Unit * y + 2) * cStride;
|
||||
auto weight3 = B + (hC4Unit * y + 3) * bStride;
|
||||
auto dst3 = C + (hC4Unit * y + 3) * cStride;
|
||||
auto sumAvx00 = _mm256_set1_ps(0.0f);
|
||||
auto sumAvx01 = _mm256_set1_ps(0.0f);
|
||||
|
||||
auto sumAvx10 = _mm256_set1_ps(0.0f);
|
||||
auto sumAvx11 = _mm256_set1_ps(0.0f);
|
||||
|
||||
auto sumAvx20 = _mm256_set1_ps(0.0f);
|
||||
auto sumAvx21 = _mm256_set1_ps(0.0f);
|
||||
|
||||
auto srcUse = src;
|
||||
for (int sy = 0; sy < l; ++sy) {
|
||||
auto S0 = _mm256_broadcast_ss(srcUse + 0);
|
||||
auto S1 = _mm256_broadcast_ss(srcUse + 1);
|
||||
auto S2 = _mm256_broadcast_ss(srcUse + 2);
|
||||
auto W0 = _mm256_castsi256_ps(_mm256_insertf128_si256((_mm256_broadcastsi128_si256(*(__m128i*)(weight0))), *(__m128i*)(weight1), 1));
|
||||
auto W1 = _mm256_castsi256_ps(_mm256_insertf128_si256((_mm256_broadcastsi128_si256(*(__m128i*)(weight2))), *(__m128i*)(weight3), 1));
|
||||
|
||||
sumAvx00 = MNNAVXFMA(S0, W0, sumAvx00);
|
||||
sumAvx01 = MNNAVXFMA(S0, W1, sumAvx01);
|
||||
|
||||
sumAvx10 = MNNAVXFMA(S1, W0, sumAvx10);
|
||||
sumAvx11 = MNNAVXFMA(S1, W1, sumAvx11);
|
||||
|
||||
sumAvx20 = MNNAVXFMA(S2, W0, sumAvx20);
|
||||
sumAvx21 = MNNAVXFMA(S2, W1, sumAvx21);
|
||||
|
||||
srcUse += aStride;
|
||||
weight0 += 4;
|
||||
weight1 += 4;
|
||||
weight2 += 4;
|
||||
weight3 += 4;
|
||||
}
|
||||
_mm_storeu_ps(dst0 + 0, _mm256_extractf128_ps(sumAvx00, 0));
|
||||
_mm_storeu_ps(dst0 + 4, _mm256_extractf128_ps(sumAvx10, 0));
|
||||
_mm_storeu_ps(dst0 + 8, _mm256_extractf128_ps(sumAvx20, 0));
|
||||
|
||||
_mm_storeu_ps(dst1 + 0, _mm256_extractf128_ps(sumAvx00, 1));
|
||||
_mm_storeu_ps(dst1 + 4, _mm256_extractf128_ps(sumAvx10, 1));
|
||||
_mm_storeu_ps(dst1 + 8, _mm256_extractf128_ps(sumAvx20, 1));
|
||||
|
||||
_mm_storeu_ps(dst2 + 0, _mm256_extractf128_ps(sumAvx01, 0));
|
||||
_mm_storeu_ps(dst2 + 4, _mm256_extractf128_ps(sumAvx11, 0));
|
||||
_mm_storeu_ps(dst2 + 8, _mm256_extractf128_ps(sumAvx21, 0));
|
||||
|
||||
_mm_storeu_ps(dst3 + 0, _mm256_extractf128_ps(sumAvx01, 1));
|
||||
_mm_storeu_ps(dst3 + 4, _mm256_extractf128_ps(sumAvx11, 1));
|
||||
_mm_storeu_ps(dst3 + 8, _mm256_extractf128_ps(sumAvx21, 1));
|
||||
|
||||
}
|
||||
for (int y = hR; y < hC4; ++y) {
|
||||
auto weight = B + y * bStride;
|
||||
auto dst = C + y * cStride;
|
||||
auto s0 = _mm_broadcast_ss(A + 0 * aStride + 0);
|
||||
auto s1 = _mm_broadcast_ss(A + 0 * aStride + 1);
|
||||
auto s2 = _mm_broadcast_ss(A + 0 * aStride + 2);
|
||||
auto w0 = _mm_loadu_ps(weight + 0 * 4);
|
||||
auto z0 = _mm_mul_ps(s0, w0);
|
||||
auto z1 = _mm_mul_ps(s1, w0);
|
||||
auto z2 = _mm_mul_ps(s2, w0);
|
||||
|
||||
for (int sy = 1; sy < l; ++sy) {
|
||||
s0 = _mm_broadcast_ss(A + sy * aStride + 0);
|
||||
s1 = _mm_broadcast_ss(A + sy * aStride + 1);
|
||||
s2 = _mm_broadcast_ss(A + sy * aStride + 2);
|
||||
w0 = _mm_loadu_ps(weight + sy * 4);
|
||||
z0 = MNNSSEFMA(s0, w0, z0);
|
||||
z1 = MNNSSEFMA(s1, w0, z1);
|
||||
z2 = MNNSSEFMA(s2, w0, z2);
|
||||
}
|
||||
_mm_store_ps(dst + 4 * 0, z0);
|
||||
_mm_store_ps(dst + 4 * 1, z1);
|
||||
_mm_store_ps(dst + 4 * 2, z2);
|
||||
}
|
||||
}
|
||||
|
||||
static void _AVX_MNNPackedMatMul_2(float* C, const float* A, const float* B, const size_t* parameter) {
|
||||
auto aStride = parameter[0] / sizeof(float);
|
||||
auto h = parameter[2];
|
||||
auto l = parameter[1];
|
||||
auto cStride = parameter[3] / sizeof(float);
|
||||
auto bExtraStride = parameter[5] / sizeof(float);
|
||||
auto bStride = bExtraStride + l * 4;
|
||||
auto hC4 = UP_DIV(h, 4);
|
||||
int lC4 = l / 4;
|
||||
int lR = lC4 * 4;
|
||||
const int hC4Unit = 4;
|
||||
int hC16 = hC4 / hC4Unit;
|
||||
int hR = hC16 * hC4Unit;
|
||||
auto src = A;
|
||||
for (int y = 0; y < hC16; ++y) {
|
||||
auto weight0 = B + (hC4Unit * y + 0) * bStride;
|
||||
auto dst0 = C + (hC4Unit * y + 0) * cStride;
|
||||
auto weight1 = B + (hC4Unit * y + 1) * bStride;
|
||||
auto dst1 = C + (hC4Unit * y + 1) * cStride;
|
||||
auto weight2 = B + (hC4Unit * y + 2) * bStride;
|
||||
auto dst2 = C + (hC4Unit * y + 2) * cStride;
|
||||
auto weight3 = B + (hC4Unit * y + 3) * bStride;
|
||||
auto dst3 = C + (hC4Unit * y + 3) * cStride;
|
||||
auto sumAvx00 = _mm256_set1_ps(0.0f);
|
||||
auto sumAvx01 = _mm256_set1_ps(0.0f);
|
||||
|
||||
auto sumAvx10 = _mm256_set1_ps(0.0f);
|
||||
auto sumAvx11 = _mm256_set1_ps(0.0f);
|
||||
|
||||
auto srcUse = src;
|
||||
for (int sy = 0; sy < l; ++sy) {
|
||||
auto S0 = _mm256_broadcast_ss(srcUse + 0);
|
||||
auto S1 = _mm256_broadcast_ss(srcUse + 1);
|
||||
auto W0 = _mm256_castsi256_ps(_mm256_insertf128_si256((_mm256_broadcastsi128_si256(*(__m128i*)(weight0))), *(__m128i*)(weight1), 1));
|
||||
auto W1 = _mm256_castsi256_ps(_mm256_insertf128_si256((_mm256_broadcastsi128_si256(*(__m128i*)(weight2))), *(__m128i*)(weight3), 1));
|
||||
|
||||
sumAvx00 = MNNAVXFMA(S0, W0, sumAvx00);
|
||||
sumAvx01 = MNNAVXFMA(S0, W1, sumAvx01);
|
||||
|
||||
sumAvx10 = MNNAVXFMA(S1, W0, sumAvx10);
|
||||
sumAvx11 = MNNAVXFMA(S1, W1, sumAvx11);
|
||||
|
||||
srcUse += aStride;
|
||||
weight0 += 4;
|
||||
weight1 += 4;
|
||||
weight2 += 4;
|
||||
weight3 += 4;
|
||||
}
|
||||
_mm_storeu_ps(dst0 + 0, _mm256_extractf128_ps(sumAvx00, 0));
|
||||
_mm_storeu_ps(dst0 + 4, _mm256_extractf128_ps(sumAvx10, 0));
|
||||
|
||||
_mm_storeu_ps(dst1 + 0, _mm256_extractf128_ps(sumAvx00, 1));
|
||||
_mm_storeu_ps(dst1 + 4, _mm256_extractf128_ps(sumAvx10, 1));
|
||||
|
||||
_mm_storeu_ps(dst2 + 0, _mm256_extractf128_ps(sumAvx01, 0));
|
||||
_mm_storeu_ps(dst2 + 4, _mm256_extractf128_ps(sumAvx11, 0));
|
||||
|
||||
_mm_storeu_ps(dst3 + 0, _mm256_extractf128_ps(sumAvx01, 1));
|
||||
_mm_storeu_ps(dst3 + 4, _mm256_extractf128_ps(sumAvx11, 1));
|
||||
|
||||
}
|
||||
for (int y = hR; y < hC4; ++y) {
|
||||
auto weight = B + y * bStride;
|
||||
auto dst = C + y * cStride;
|
||||
auto s0 = _mm_broadcast_ss(A + 0 * aStride + 0);
|
||||
auto s1 = _mm_broadcast_ss(A + 0 * aStride + 1);
|
||||
auto w0 = _mm_loadu_ps(weight + 0 * 4);
|
||||
auto z0 = _mm_mul_ps(s0, w0);
|
||||
auto z1 = _mm_mul_ps(s1, w0);
|
||||
|
||||
for (int sy = 1; sy < l; ++sy) {
|
||||
s0 = _mm_broadcast_ss(A + sy * aStride + 0);
|
||||
s1 = _mm_broadcast_ss(A + sy * aStride + 1);
|
||||
w0 = _mm_loadu_ps(weight + sy * 4);
|
||||
z0 = MNNSSEFMA(s0, w0, z0);
|
||||
z1 = MNNSSEFMA(s1, w0, z1);
|
||||
}
|
||||
_mm_store_ps(dst + 4 * 0, z0);
|
||||
_mm_store_ps(dst + 4 * 1, z1);
|
||||
}
|
||||
}
|
||||
|
||||
static void _AVX_MNNPackedMatMul_4(float* C, const float* A, const float* B, const size_t* parameter) {
|
||||
auto aStride = parameter[0] / sizeof(float);
|
||||
|
@ -303,19 +604,40 @@ static void _AVX_MNNPackednMatMulRemainCommon(float* C, const float* A, const fl
|
|||
C += 8 * 4;
|
||||
A += 8;
|
||||
}
|
||||
if (eSize >= 4) {
|
||||
if (eSize >= 5) {
|
||||
_AVX_MNNPackedMatMul_5(C, A, B, parameter);
|
||||
eSize -= 5;
|
||||
C += 5 * 4;
|
||||
A += 5;
|
||||
}
|
||||
if (eSize == 4) {
|
||||
_AVX_MNNPackedMatMul_4(C, A, B, parameter);
|
||||
eSize -= 4;
|
||||
C += 4 * 4;
|
||||
A += 4;
|
||||
}
|
||||
if (eSize == 3) {
|
||||
_AVX_MNNPackedMatMul_3(C, A, B, parameter);
|
||||
eSize -= 3;
|
||||
C += 3 * 4;
|
||||
A += 3;
|
||||
}
|
||||
if (eSize == 2) {
|
||||
_AVX_MNNPackedMatMul_2(C, A, B, parameter);
|
||||
eSize -= 2;
|
||||
C += 2 * 4;
|
||||
A += 2;
|
||||
}
|
||||
if (eSize == 0) {
|
||||
return;
|
||||
}
|
||||
int lC4 = l / 4;
|
||||
int lR = lC4 * 4;
|
||||
const int hC4Unit = 4;
|
||||
int hC16 = hC4 / hC4Unit;
|
||||
int hR = hC16 * hC4Unit;
|
||||
for (int x = 0; x < eSize; ++x) {
|
||||
auto src = A + x;
|
||||
auto src = A;
|
||||
int x = 0;
|
||||
for (int y = 0; y < hC16; ++y) {
|
||||
auto weight0 = B + (hC4Unit * y + 0) * bStride;
|
||||
auto dst0 = C + (hC4Unit * y + 0) * cStride + x * 4;
|
||||
|
@ -434,4 +756,3 @@ static void _AVX_MNNPackednMatMulRemainCommon(float* C, const float* A, const fl
|
|||
_mm_store_ps(dst, sum);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -75,3 +75,5 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
|
|||
void _SSE_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step,
|
||||
size_t dst_depth_quad, const QuanPostTreatParameters* post);
|
||||
void _SSE_MNNExpC8(float* dest, const float* source, const float* parameters, size_t countC8);
|
||||
void _SSE_MNNPackForMatMul_B(float* dest, const float* source, size_t h, size_t l, bool transpose);
|
||||
bool _SSE_MNNReorder4x4ByPlatform(float* dst, size_t number);
@ -9,6 +9,27 @@
|
|||
#include "GemmCommon.hpp"
|
||||
#include "FunctionSummary.hpp"
|
||||
#include "core/Macro.h"
|
||||
#include "backend/cpu/compute/CommonOptFunction.h"
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
|
||||
bool _SSE_MNNReorder4x4ByPlatform(float* dst, size_t number) {
|
||||
for (int i = 0; i < number; ++i) {
|
||||
auto addr = dst + 16 * i;
|
||||
auto s0 = _mm_loadu_ps(addr + 4 * 0);
|
||||
auto s1 = _mm_loadu_ps(addr + 4 * 1);
|
||||
auto s2 = _mm_loadu_ps(addr + 4 * 2);
|
||||
auto s3 = _mm_loadu_ps(addr + 4 * 3);
|
||||
_MM_TRANSPOSE4_PS(s0, s1, s2, s3);
|
||||
|
||||
_mm_storeu_ps(addr + 4 * 0, s0);
|
||||
_mm_storeu_ps(addr + 4 * 1, s1);
|
||||
_mm_storeu_ps(addr + 4 * 2, s2);
|
||||
_mm_storeu_ps(addr + 4 * 3, s3);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void _SSE_MNNPackC4ForMatMul_A(float* dest, const float* source, size_t e, size_t l, size_t eReal) {
|
||||
const int pack = 12;
|
||||
const int mid = 1; // Deprecate
|
||||
|
@ -279,3 +300,156 @@ E##u = _mm_add_epi32(E##u, _mm_madd_epi16(w##u##v, s3##v));\
|
|||
_mm_storeu_ps((float*)dst_x, _mm_castsi128_ps(d0));
|
||||
}
|
||||
}
|
||||
|
||||
void MNNPackC4(float* dst, const float* src, size_t area, size_t depth) {
|
||||
auto areaC4 = area / 4;
|
||||
auto depthC4 = depth / 4;
|
||||
for (int z = 0; z < depthC4; ++z) {
|
||||
auto dstPlane = dst + z * area * 4;
|
||||
auto srcPlane = src + z * area * 4;
|
||||
for (int x = 0; x < areaC4; ++x) {
|
||||
auto s = srcPlane + 4 * x;
|
||||
auto d = dstPlane + 16 * x;
|
||||
auto s0 = _mm_loadu_ps(s + 0 * area);
|
||||
auto s1 = _mm_loadu_ps(s + 1 * area);
|
||||
auto s2 = _mm_loadu_ps(s + 2 * area);
|
||||
auto s3 = _mm_loadu_ps(s + 3 * area);
|
||||
|
||||
_MM_TRANSPOSE4_PS(s0, s1, s2, s3);
|
||||
|
||||
_mm_storeu_ps(d + 4 * 0, s0);
|
||||
_mm_storeu_ps(d + 4 * 1, s1);
|
||||
_mm_storeu_ps(d + 4 * 2, s2);
|
||||
_mm_storeu_ps(d + 4 * 3, s3);
|
||||
}
|
||||
}
|
||||
auto areaRemain = areaC4 * 4;
|
||||
auto depthRemain = depthC4 * 4;
|
||||
// Down
|
||||
int remain = depth - depthRemain;
|
||||
if (remain > 0) {
|
||||
float* dstPlane = depthC4 * area * 4 + dst;
|
||||
const float* srcPlane = src + depthC4 * area * 4;
|
||||
for (int x = 0; x < area; ++x) {
|
||||
for (int y = 0; y < remain; y++) {
|
||||
dstPlane[4 * x + y] = srcPlane[y * area + x];
|
||||
}
|
||||
for (int y = remain; y < 4; y++) {
|
||||
dstPlane[4 * x + y] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Right
|
||||
for (int z = 0; z < depthC4; ++z) {
|
||||
float* dstPlane = z * area * 4 + dst;
|
||||
const float* srcPlane = src + z * area * 4;
|
||||
for (int x = areaRemain; x < area; ++x) {
|
||||
float s0 = srcPlane[x];
|
||||
float s1 = srcPlane[x + area];
|
||||
float s2 = srcPlane[x + area * 2];
|
||||
float s3 = srcPlane[x + area * 3];
|
||||
_mm_store_ps(dstPlane + 4 * x, _mm_set_ps(s3, s2, s1, s0));
|
||||
}
|
||||
}
|
||||
}
|
||||
void MNNTranspose32Bit(int32_t* dstO, const int32_t* srcO, int32_t* dim) {
|
||||
int w = dim[0];
|
||||
int h = dim[1];
|
||||
int srcStride = dim[2];
|
||||
int dstStride = dim[3];
|
||||
auto wC4 = w / 4;
|
||||
auto hC4 = h / 4;
|
||||
for (int y = 0; y < hC4; ++y) {
|
||||
auto sy = (float*)srcO + 4 * y;
|
||||
auto dy = (float*)dstO + 4 * y * dstStride;
|
||||
for (int x = 0; x < wC4; ++x) {
|
||||
auto sx = sy + x * 4 * srcStride;
|
||||
auto dx = dy + 4 * x;
|
||||
auto s0 = _mm_loadu_ps(sx + srcStride * 0);
|
||||
auto s1 = _mm_loadu_ps(sx + srcStride * 1);
|
||||
auto s2 = _mm_loadu_ps(sx + srcStride * 2);
|
||||
auto s3 = _mm_loadu_ps(sx + srcStride * 3);
|
||||
_MM_TRANSPOSE4_PS(s0, s1, s2, s3);
|
||||
|
||||
_mm_storeu_ps(dx + dstStride * 0, s0);
|
||||
_mm_storeu_ps(dx + dstStride * 1, s1);
|
||||
_mm_storeu_ps(dx + dstStride * 2, s2);
|
||||
_mm_storeu_ps(dx + dstStride * 3, s3);
|
||||
}
|
||||
}
|
||||
// Down
|
||||
for (int i = hC4 * 4; i < h; ++i) {
|
||||
auto si = srcO + i;
|
||||
auto di = dstO + i * dstStride;
|
||||
for (int j = 0; j < w; ++j) {
|
||||
auto sj = si + j * srcStride;
|
||||
auto dj = di + j;
|
||||
*dj = *sj;
|
||||
}
|
||||
}
|
||||
// Right
|
||||
for (int i = 0; i < hC4 * 4; ++i) {
|
||||
auto si = srcO + i;
|
||||
auto di = dstO + i * dstStride;
|
||||
for (int j = wC4 * 4; j < w; ++j) {
|
||||
auto sj = si + j * srcStride;
|
||||
auto dj = di + j;
|
||||
*dj = *sj;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void MNNUnpackC4(float* dst, const float* src, size_t area, size_t depth) {
|
||||
auto areaC4 = area / 4;
|
||||
auto depthC4 = depth / 4;
|
||||
for (int z = 0; z < depthC4; ++z) {
|
||||
auto dstPlane = dst + z * area * 4;
|
||||
auto srcPlane = src + z * area * 4;
|
||||
for (int x = 0; x < areaC4; ++x) {
|
||||
auto s = srcPlane + 16 * x;
|
||||
auto d = dstPlane + 4 * x;
|
||||
auto s0 = _mm_loadu_ps(s + 0 * 4);
|
||||
auto s1 = _mm_loadu_ps(s + 1 * 4);
|
||||
auto s2 = _mm_loadu_ps(s + 2 * 4);
|
||||
auto s3 = _mm_loadu_ps(s + 3 * 4);
|
||||
|
||||
_MM_TRANSPOSE4_PS(s0, s1, s2, s3);
|
||||
|
||||
_mm_storeu_ps(d + 0 * area, s0);
|
||||
_mm_storeu_ps(d + 1 * area, s1);
|
||||
_mm_storeu_ps(d + 2 * area, s2);
|
||||
_mm_storeu_ps(d + 3 * area, s3);
|
||||
}
|
||||
}
|
||||
auto areaRemain = areaC4 * 4;
|
||||
auto depthRemain = depthC4 * 4;
|
||||
// Down
|
||||
int remain = depth - depthRemain;
|
||||
if (remain > 0) {
|
||||
float* dstPlane = depthC4 * area * 4 + dst;
|
||||
const float* srcPlane = src + depthC4 * area * 4;
|
||||
for (int x = 0; x < area; ++x) {
|
||||
for (int y = 0; y < remain; y++) {
|
||||
dstPlane[y * area + x] = srcPlane[4 * x + y];
|
||||
}
|
||||
}
|
||||
}
|
||||
// Right
|
||||
for (int z = 0; z < depthC4; ++z) {
|
||||
const float* srcPlane = z * area * 4 + src;
|
||||
float* dstPlane = dst + z * area * 4;
|
||||
for (int x = areaRemain; x < area; ++x) {
|
||||
for (int y = 0; y < 4; y++) {
|
||||
dstPlane[y * area + x] = srcPlane[4 * x + y];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void _SSE_MNNPackForMatMul_B(float* dest, const float* source, size_t h, size_t l, bool transpose) {
|
||||
if (!transpose) {
|
||||
MNNUnpackTranspose(dest, source, l, h);
|
||||
return;
|
||||
}
|
||||
MNNPackC4(dest, source, l, h);
|
||||
}
|
||||
|
|
|
@ -1,165 +0,0 @@
|
|||
//
|
||||
// BufferPool.cpp
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2018/12/30.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#include "BufferPool.hpp"
|
||||
//#define DUMP_USAGE
|
||||
//#define MNN_DEBUG_MEMORY
|
||||
namespace MNN {
|
||||
namespace CUDA {
|
||||
BufferPool::Node::~Node() {
|
||||
if (nullptr == parent) {
|
||||
runtime->free(pointer);
|
||||
}
|
||||
}
|
||||
void* BufferPool::alloc(size_t size, bool seperate) {
|
||||
#ifdef DUMP_USAGE
|
||||
auto memoryUsed = size / 1024.0f / 1024.0f;
|
||||
MNN_PRINT("Alloc: %f\n", memoryUsed);
|
||||
#endif
|
||||
void* pointer = nullptr;
|
||||
// reuse if possible
|
||||
if (!seperate) {
|
||||
pointer = getFromFreeList(&mFreeList, size);
|
||||
if (nullptr != pointer) {
|
||||
return pointer;
|
||||
}
|
||||
}
|
||||
|
||||
// alloc otherwise
|
||||
pointer = mRuntime->alloc(size);
|
||||
if (nullptr == pointer) {
|
||||
return nullptr;
|
||||
}
|
||||
mTotalSize += size;
|
||||
|
||||
// save node
|
||||
std::shared_ptr<Node> node(new Node);
|
||||
node->size = size;
|
||||
node->pointer = pointer;
|
||||
node->runtime = mRuntime;
|
||||
mUsedList[pointer] = node;
|
||||
|
||||
#ifdef DUMP_USAGE
|
||||
MNN_PRINT("mTotalSize: %f\n", mTotalSize / 1024.0f / 1024.0f);
|
||||
#endif
|
||||
return pointer;
|
||||
}
|
||||
|
||||
void BufferPool::returnMemory(FREELIST* listP, std::shared_ptr<Node> node, bool permitMerge) {
|
||||
auto& list = *listP;
|
||||
list.insert(std::make_pair(node->size, node));
|
||||
// update parent use count
|
||||
if (nullptr != node->parent && permitMerge) {
|
||||
auto parent = node->parent;
|
||||
parent->useCount -= 1;
|
||||
|
||||
// merge if all subnodes were freed
|
||||
auto needMerge = parent->useCount == 0;
|
||||
while (needMerge) {
|
||||
// collect all subnodes
|
||||
for (auto iter = list.begin(); iter != list.end();) {
|
||||
if (iter->second->parent.get() == parent.get()) {
|
||||
iter = list.erase(iter);
|
||||
continue;
|
||||
}
|
||||
iter++;
|
||||
}
|
||||
|
||||
// do merge downside up
|
||||
list.insert(std::make_pair(parent->size, parent));
|
||||
needMerge = false;
|
||||
if (parent->parent.get() != nullptr) {
|
||||
parent = parent->parent;
|
||||
parent->useCount -= 1;
|
||||
needMerge = parent->useCount == 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool BufferPool::free(void* pointer, bool needRelease) {
|
||||
// get node
|
||||
auto x = mUsedList.find(pointer);
|
||||
if (x == mUsedList.end()) {
|
||||
MNN_ASSERT(false);
|
||||
return false;
|
||||
}
|
||||
if (needRelease) {
|
||||
MNN_ASSERT(x->second->parent == nullptr);
|
||||
mTotalSize -= x->second->size;
|
||||
mUsedList.erase(x);
|
||||
return true;
|
||||
}
|
||||
|
||||
// mark as reusable
|
||||
auto node = x->second;
|
||||
mUsedList.erase(x);
|
||||
returnMemory(&mFreeList, node);
|
||||
#ifdef DUMP_USAGE
|
||||
auto memoryUsed = x->second->size / 1024.0f / 1024.0f;
|
||||
MNN_PRINT("Free: %f\n", memoryUsed);
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
||||
void BufferPool::release(bool allRelease) {
|
||||
if (allRelease) {
|
||||
mUsedList.clear();
|
||||
mFreeList.clear();
|
||||
mTotalSize = 0;
|
||||
return;
|
||||
}
|
||||
for (auto f : mFreeList) {
|
||||
mTotalSize -= f.first;
|
||||
}
|
||||
mFreeList.clear();
|
||||
}
|
||||
|
||||
void* BufferPool::getFromFreeList(FREELIST* list, size_t size, bool permiteSplit) {
|
||||
#ifdef MNN_DEBUG_MEMORY
|
||||
return nullptr;
|
||||
#endif
|
||||
|
||||
// get node larger than size
|
||||
auto x = list->lower_bound(size);
|
||||
if (x == list->end()) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// update parent use count
|
||||
void* pointer = x->second->pointer;
|
||||
if (permiteSplit && nullptr != x->second->parent) {
|
||||
x->second->parent->useCount += 1;
|
||||
}
|
||||
|
||||
// uses up all aligned space
|
||||
auto sizeAlign = size;
|
||||
if (sizeAlign >= x->first || (!permiteSplit)) {
|
||||
mUsedList.insert(std::make_pair(pointer, x->second));
|
||||
list->erase(x);
|
||||
return pointer;
|
||||
}
|
||||
|
||||
// split otherwise
|
||||
std::shared_ptr<Node> first(new Node);
|
||||
first->parent = x->second;
|
||||
first->size = sizeAlign;
|
||||
first->pointer = x->second->pointer;
|
||||
mUsedList.insert(std::make_pair(pointer, first));
|
||||
x->second->useCount += 1;
|
||||
|
||||
std::shared_ptr<Node> second(new Node);
|
||||
second->parent = x->second;
|
||||
second->size = x->second->size - sizeAlign;
|
||||
second->pointer = ((uint8_t*)x->second->pointer) + sizeAlign;
|
||||
list->insert(std::make_pair(second->size, second));
|
||||
list->erase(x);
|
||||
return pointer;
|
||||
}
|
||||
} // namespace CUDA
|
||||
} // namespace MNN
|
|
@ -1,94 +0,0 @@
|
|||
//
|
||||
// BufferPool.hpp
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2019/02/28.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifndef BufferPool_hpp
|
||||
#define BufferPool_hpp
|
||||
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include "runtime/CUDARuntime.hpp"
|
||||
namespace MNN {
|
||||
namespace CUDA {
|
||||
/** memory utils wrapper. provides memory reusing with alignment ability. */
|
||||
class BufferPool {
|
||||
public:
|
||||
/**
|
||||
* @brief init buffer allocator with pointer alignment.
|
||||
* @param CUDARuntime given runtime.
|
||||
*/
|
||||
BufferPool(CUDARuntime* runtime) : mRuntime(runtime) {
|
||||
// nothing to do
|
||||
}
|
||||
/**
|
||||
* @brief deinit buffer allocator. frees all allocated memories.
|
||||
*/
|
||||
~BufferPool() {
|
||||
release();
|
||||
}
|
||||
|
||||
public:
|
||||
/**
|
||||
* @brief alloc CHUNK pointer with given size. if any reusable pointer matches size, reuse it.
|
||||
* @param size given size.
|
||||
* @param seperate if true, the memory can't be alloc from free pool
|
||||
* @return allocated or used CHUNK pointer.
|
||||
* @sa free
|
||||
* @sa release
|
||||
*/
|
||||
void* alloc(size_t size, bool seperate = false);
|
||||
|
||||
/**
|
||||
* @brief mark CHUNK pointer as reusable.
|
||||
* @param pointer given CHUNK pointer.
|
||||
* @param release true if need free directly.
|
||||
* @return true if pointer is a CHUNK pointer, false otherwise.
|
||||
* @sa release
|
||||
*/
|
||||
bool free(void* pointer, bool release = false);
|
||||
|
||||
/**
|
||||
* @brief free all allocated memories.
|
||||
* @sa allocSeparate
|
||||
* @sa alloc
|
||||
* if allRelease, clear all memory , otherwise delete freelist
|
||||
*/
|
||||
void release(bool allRelease = true);
|
||||
|
||||
/**
|
||||
* @brief query total size allocated indeed.
|
||||
* @return total size allocated indeed.
|
||||
*/
|
||||
size_t totalSize() const {
|
||||
return mTotalSize;
|
||||
}
|
||||
|
||||
private:
|
||||
class Node {
|
||||
public:
|
||||
~Node();
|
||||
void* pointer;
|
||||
size_t size;
|
||||
std::shared_ptr<Node> parent = nullptr;
|
||||
int useCount = 0;
|
||||
CUDARuntime* runtime;
|
||||
};
|
||||
|
||||
typedef std::multimap<size_t, std::shared_ptr<Node>> FREELIST;
|
||||
|
||||
static void returnMemory(FREELIST* list, std::shared_ptr<Node> node, bool permitMerge = true);
|
||||
void* getFromFreeList(FREELIST* list, size_t size, bool permiteSplit = true);
|
||||
|
||||
std::map<void*, std::shared_ptr<Node>> mUsedList;
|
||||
FREELIST mFreeList;
|
||||
size_t mTotalSize = 0;
|
||||
CUDARuntime* mRuntime;
|
||||
};
|
||||
} // namespace CUDA
|
||||
} // namespace MNN
|
||||
#endif
@@ -24,6 +24,21 @@ std::map<OpType, CUDABackend::Creator*>* gCreator() {
    std::call_once(gOnce, [&]() { creators = new std::map<OpType, CUDABackend::Creator*>; });
    return creators;
};
class CUDARuntimeAllocator : public BufferAllocator::Allocator {
public:
    CUDARuntimeAllocator(CUDARuntime* rt) : mRuntime(rt) {
        // Do nothing
    }
    virtual ~ CUDARuntimeAllocator() = default;
    virtual std::pair<void*, int> onAlloc(int size) override {
        return std::make_pair(mRuntime->alloc(size), 0);
    }
    virtual void onRelease(std::pair<void*, int> ptr) override {
        mRuntime->free(ptr.first);
    }
private:
    CUDARuntime* mRuntime;
};
CUDARuntimeWrapper::CUDARuntimeWrapper(BackendConfig::PrecisionMode precision, BackendConfig::PowerMode power) {
    // Shader precision
    if (precision == BackendConfig::Precision_Low) {
@@ -36,28 +51,25 @@ CUDARuntimeWrapper::CUDARuntimeWrapper(BackendConfig::PrecisionMode precision, B
            mIsCreateError = true;
            return;
        }
        mBufferPool.reset(new BufferPool(mCUDARuntime.get()));
        mStaticBufferPool.reset(new BufferPool(mCUDARuntime.get()));
        std::shared_ptr<BufferAllocator::Allocator> allocator(new CUDARuntimeAllocator(mCUDARuntime.get()));
        mBufferPool.reset(new BufferAllocator(allocator));
    }
}
CUDARuntimeWrapper::~CUDARuntimeWrapper() {
    // Do nothing
}
Backend* CUDARuntimeWrapper::onCreate() const {
    return new CUDABackend(mBufferPool, mStaticBufferPool, mCUDARuntime);
    return new CUDABackend(mBufferPool, mCUDARuntime);
}

void CUDARuntimeWrapper::onGabageCollect(int level) {
    mStaticBufferPool->release(false);
    if (level > 50) {
        mBufferPool->release(false);
    }
}

CUDABackend::CUDABackend(std::shared_ptr<BufferPool> dy, std::shared_ptr<BufferPool> st,
CUDABackend::CUDABackend(std::shared_ptr<BufferAllocator> st,
                         std::shared_ptr<CUDARuntime> rt)
    : Backend(MNN_FORWARD_CUDA) {
    mBufferPool = dy;
    mBufferPool.reset(new BufferAllocator(BufferAllocator::Allocator::createRecurse(st.get())));
    mStaticBufferPool = st;
    mCUDARuntime = rt;
}
@@ -66,12 +78,6 @@ CUDABackend::~CUDABackend() {
#ifdef LOG_VERBOSE
    MNN_PRINT("enter CUDABackend::~CUDABackend \n");
#endif
    for (auto p : mStatic) {
        mStaticBufferPool->free(p);
    }
    for (auto p : mDynamic) {
        mBufferPool->free(p);
    }
}

CUDARuntime* CUDABackend::getCUDARuntime() {
@@ -84,23 +90,22 @@ bool CUDABackend::onAcquireBuffer(const Tensor* nativeTensor, StorageType storag
    MNN_PRINT("Start CUDABackend::onAcquireBuffer !\n");
#endif
    int mallocSize = realSize(nativeTensor) * nativeTensor->getType().bytes();
    std::pair<void*, int> buffer;
    if (storageType == DYNAMIC_SEPERATE) {
        auto buffer = mBufferPool->alloc(mallocSize, true);
        ((Tensor*)nativeTensor)->buffer().device = (uint64_t)buffer;
        buffer = mBufferPool->alloc(mallocSize, true);
    } else if (storageType == DYNAMIC) {
        auto buffer = mBufferPool->alloc(mallocSize, false);
        ((Tensor*)nativeTensor)->buffer().device = (uint64_t)buffer;
        buffer = mBufferPool->alloc(mallocSize, false);
    } else {
        MNN_ASSERT(storageType == STATIC);
        auto buffer = mStaticBufferPool->alloc(mallocSize, false);
        ((Tensor*)nativeTensor)->buffer().device = (uint64_t)buffer;
    }
    MNN_ASSERT(0 != ((Tensor*)nativeTensor)->buffer().device);
    if (STATIC == storageType) {
        mStatic.insert((void*)nativeTensor->buffer().device);
    } else {
        mDynamic.insert((void*)nativeTensor->buffer().device);
        buffer = mStaticBufferPool->alloc(mallocSize, false);
    }
    if(nullptr == buffer.first) {
        return false;
    };
    auto host = (uint8_t*)buffer.first + buffer.second;
    ((Tensor*)nativeTensor)->buffer().device = (uint64_t)host;
    auto des = TensorUtils::getDescribe(nativeTensor);
    des->extra.offset = buffer.second;
    return true;
}
@@ -108,24 +113,22 @@ bool CUDABackend::onReleaseBuffer(const Tensor* nativeTensor, StorageType storag
    if (storageType == DYNAMIC_SEPERATE) {
        return true;
    }
    auto buffer = nativeTensor->deviceId();
    auto buffer = (uint8_t*)nativeTensor->deviceId();
    auto des = TensorUtils::getDescribe(nativeTensor);
    auto pointer = std::make_pair(buffer - des->extra.offset, des->extra.offset);

    if (storageType == DYNAMIC) {
        mDynamic.erase((void*)buffer);
        mBufferPool->free((void*)buffer);
        mBufferPool->free(pointer);
        return true;
    }
    if (storageType == STATIC) {
        mStatic.erase((void*)buffer);
        mStaticBufferPool->free((void*)buffer);
        mStaticBufferPool->free(pointer);
    }
    return true;
}

bool CUDABackend::onClearBuffer() {
    for (auto p : mDynamic) {
        mBufferPool->free(p);
    }
    mDynamic.clear();
    mBufferPool->release(true);
    return true;
}
size_t CUDABackend::realSize(const Tensor* tensor) {
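Editor's note on the (base, offset) bookkeeping above: onAcquireBuffer publishes base + offset as the tensor's device address and stashes the offset in des->extra.offset, and onReleaseBuffer rebuilds the allocator's (base, offset) pair from those two values. A minimal host-side sketch of that round trip follows; PoolStub and the 16-byte offset are illustrative stand-ins, not MNN's BufferAllocator.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <new>
#include <utility>

// Hypothetical stand-in for the pool: hands out (base, offset) pairs.
struct PoolStub {
    std::pair<void*, int> alloc(size_t size) {
        return {::operator new(size + 16), 16};  // pretend the usable region starts 16 bytes in
    }
    void free(std::pair<void*, int> p) {
        ::operator delete(p.first);
    }
};

int main() {
    PoolStub pool;
    int savedOffset = 0;  // plays the role of des->extra.offset

    // Acquire: publish base + offset as the device address, remember the offset.
    auto buffer = pool.alloc(256);
    uint64_t device = (uint64_t)((uint8_t*)buffer.first + buffer.second);
    savedOffset = buffer.second;

    // Release: rebuild the original (base, offset) pair from the device address.
    auto pointer = std::make_pair((void*)((uint8_t*)device - savedOffset), savedOffset);
    assert(pointer.first == buffer.first);
    pool.free(pointer);
    return 0;
}

The only invariant that matters is that the subtraction in the release path exactly undoes the addition in the acquire path.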
@ -172,9 +175,9 @@ Execution* CUDABackend::onCreate(const std::vector<Tensor*>& inputs, const std::
|
|||
auto exe = iter->second->onCreate(inputs, outputs, op, this);
|
||||
if (NULL == exe) {
|
||||
if (nullptr != op->name()) {
|
||||
MNN_PRINT("The Creator Don't support type %d, %s\n", op->type(), op->name()->c_str());
|
||||
MNN_PRINT("The Creator Don't support type %s, %s\n", EnumNameOpType(op->type()), op->name()->c_str());
|
||||
} else {
|
||||
// MNN_PRINT("The Creator Don't support type %s\n", EnumNameOpType(op->type()));
|
||||
MNN_PRINT("The Creator Don't support type %s\n", EnumNameOpType(op->type()));
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
|
|
@ -12,12 +12,11 @@
|
|||
#include <set>
|
||||
#include <vector>
|
||||
#include "MNN_generated.h"
|
||||
#include "backend/cuda/core/BufferPool.hpp"
|
||||
#include "backend/cuda/core/runtime/CUDARuntime.hpp"
|
||||
#include "core/Backend.hpp"
|
||||
#include "core/Macro.h"
|
||||
#include "core/ConvolutionCommon.hpp"
|
||||
|
||||
#include "core/BufferAllocator.hpp"
|
||||
namespace MNN {
|
||||
namespace CUDA {
|
||||
class MNN_PUBLIC CUDARuntimeWrapper : public Runtime {
|
||||
|
@ -31,15 +30,14 @@ public:
|
|||
}
|
||||
|
||||
private:
|
||||
std::shared_ptr<BufferPool> mBufferPool;
|
||||
std::shared_ptr<BufferPool> mStaticBufferPool;
|
||||
std::shared_ptr<BufferAllocator> mBufferPool;
|
||||
std::shared_ptr<CUDARuntime> mCUDARuntime;
|
||||
bool mIsCreateError{false};
|
||||
};
|
||||
|
||||
class CUDABackend final : public Backend {
|
||||
public:
|
||||
CUDABackend(std::shared_ptr<BufferPool> dy, std::shared_ptr<BufferPool> st, std::shared_ptr<CUDARuntime> rt);
|
||||
CUDABackend(std::shared_ptr<BufferAllocator> st, std::shared_ptr<CUDARuntime> rt);
|
||||
~CUDABackend();
|
||||
|
||||
CUDARuntime *getCUDARuntime();
|
||||
|
@ -64,10 +62,10 @@ public:
|
|||
|
||||
static bool addCreator(OpType t, Creator *c);
|
||||
|
||||
BufferPool *getBufferPool() const {
|
||||
BufferAllocator *getBufferPool() const {
|
||||
return mBufferPool.get();
|
||||
}
|
||||
BufferPool *getStaticBufferPool() const {
|
||||
BufferAllocator *getStaticBufferPool() const {
|
||||
return mStaticBufferPool.get();
|
||||
}
|
||||
virtual std::pair<float, bool> onMeasure(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
|
||||
|
@ -75,10 +73,8 @@ public:
|
|||
static size_t realSize(const Tensor *tensor);
|
||||
|
||||
private:
|
||||
std::set<void *> mStatic;
|
||||
std::set<void *> mDynamic;
|
||||
std::shared_ptr<BufferPool> mBufferPool;
|
||||
std::shared_ptr<BufferPool> mStaticBufferPool;
|
||||
std::shared_ptr<BufferAllocator> mBufferPool;
|
||||
std::shared_ptr<BufferAllocator> mStaticBufferPool;
|
||||
std::shared_ptr<CUDARuntime> mCUDARuntime;
|
||||
};
|
||||
|
||||
|
|
|
@ -57,6 +57,7 @@ CUDARuntime::CUDARuntime(bool permitFloat16, int device_id) {
|
|||
// Note that all cublas scalars (alpha, beta) and scalar results such as dot
|
||||
// output resides at device side.
|
||||
cublas_check(cublasSetPointerMode(mCublasHandle, CUBLAS_POINTER_MODE_HOST));
|
||||
cudnn_check(cudnnCreate(&mCudnnHandle));
|
||||
}
|
||||
|
||||
CUDARuntime::~CUDARuntime() {
@@ -64,13 +65,27 @@ CUDARuntime::~CUDARuntime() {
    MNN_PRINT("start ~CUDARuntime !\n");
#endif
    cublas_check(cublasDestroy(mCublasHandle));
    cudnn_check(cudnnDestroy(mCudnnHandle));

#ifdef LOG_VERBOSE
    MNN_PRINT("end ~CUDARuntime !\n");
#endif
}

int CUDARuntime::blocks_num(const int total_threads) const {
    return (total_threads + mProp.maxThreadsPerBlock - 1) / mProp.maxThreadsPerBlock;
int CUDARuntime::blocks_num(const int total_threads) {
    int maxNum = mProp.maxThreadsPerBlock;
    if(total_threads / 32 > maxNum) {
        mThreadPerBlock = maxNum;
    } else if(total_threads / 16 > maxNum) {
        mThreadPerBlock = maxNum / 2;
    } else if(total_threads / 8 > maxNum) {
        mThreadPerBlock = maxNum / 4;
    } else if(total_threads / 4 > maxNum) {
        mThreadPerBlock = maxNum / 8;
    } else {
        mThreadPerBlock = 128;
    }
    return (total_threads + mThreadPerBlock - 1) / mThreadPerBlock;
}

bool CUDARuntime::isSupportedFP16() const {
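Editor's note: the reworked blocks_num now chooses a cached threads-per-block value together with the grid size, and the kernels in this commit consume that pair through grid-stride loops. A small, self-contained CUDA sketch of the launch pattern is shown below; fill and launchFill are illustrative names, and the rounding rule mirrors only the return statement above, not the full heuristic.

#include <cuda_runtime.h>

// Grid-stride loop: the grid may be smaller than `count`; each thread strides over the rest.
__global__ void fill(float* out, float value, int count) {
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x) {
        out[i] = value;
    }
}

// Illustrative launcher: in the backend, `blocks` and `threadsPerBlock` would come from
// runtime->blocks_num(count) and runtime->threads_num(); here they use the same rounding rule.
void launchFill(float* devPtr, float value, int count, int threadsPerBlock = 128) {
    int blocks = (count + threadsPerBlock - 1) / threadsPerBlock;
    fill<<<blocks, threadsPerBlock>>>(devPtr, value, count);
}

Because the loop strides by blockDim.x * gridDim.x, the kernel stays correct even if the chosen grid undershoots count.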
@ -126,6 +141,7 @@ void CUDARuntime::memcpy(void *dst, const void *src, size_t size_in_bytes, MNNMe
|
|||
default:
|
||||
MNN_ERROR("bad cuda memcpy kind\n");
|
||||
}
|
||||
//TODO, support Async Afterwards
|
||||
cuda_check(cudaMemcpy(dst, src, size_in_bytes, cuda_kind));
|
||||
}
|
||||
|
||||
|
@ -137,4 +153,8 @@ cublasHandle_t CUDARuntime::cublas_handle() {
|
|||
return mCublasHandle;
|
||||
}
|
||||
|
||||
cudnnHandle_t CUDARuntime::cudnn_handle() {
|
||||
return mCudnnHandle;
|
||||
}
|
||||
|
||||
} // namespace MNN
|
||||
@@ -106,26 +106,29 @@ public:
    void memcpy(void *dst, const void *src, size_t size_in_bytes, MNNMemcpyKind_t kind, bool sync = false);
    void memset(void *dst, int value, size_t size_in_bytes);
    cublasHandle_t cublas_handle();
    cudnnHandle_t cudnn_handle();

    int threads_num() const {
        return mProp.maxThreadsPerBlock;
    int threads_num() {
        return mThreadPerBlock;
    }
    int major_sm() const {
        return mProp.major;
    }
    int blocks_num(const int total_threads) const;
    int blocks_num(const int total_threads);

private:
    cudaDeviceProp mProp;
    int mDeviceId;

    cublasHandle_t mCublasHandle;
    cudnnHandle_t mCudnnHandle;

    bool mIsSupportedFP16 = false;
    bool mSupportDotInt8 = false;
    bool mSupportDotAccInt8 = false;
    float mFlops = 4.0f;
    bool mIsCreateError{false};
    int mThreadPerBlock = 128;
};

} // namespace MNN
@@ -0,0 +1,80 @@
#include "ArgMaxExecution.hpp"
namespace MNN {
namespace CUDA {

template <typename T>
__global__ void ARGMAX(const int count, const int outside, const int inside, const int dim,
                       const T *input, T *output) {
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
        const int o = i / inside;
        const int n = i % inside;

        T* outPtr = output + inside * o;
        const T* inpPtr = input + inside * dim * o;
        int index = 0;
        T maxValue = inpPtr[0];
        for(int j=1; j<dim; j++) {
            T value = inpPtr[j*inside];
            if(maxValue < value) {
                index = j;
                maxValue = value;
            }
        }
        outPtr[n] = index;
    }
    return;
}
ArgMaxExecution::ArgMaxExecution(const Op* op, Backend *backend) : Execution(backend) {
    mOp = op;
    mAxis = mOp->main_as_ArgMax()->axis();
}

ArgMaxExecution::~ArgMaxExecution(){
    // Do nothing
}

ErrorCode ArgMaxExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    auto input = inputs[0];
    auto output = outputs[0];

    if (mAxis < 0) {
        mAxis = input->dimensions() + mAxis;
    }

    mInside = 1;
    mOutside = 1;
    for (int i=0; i<mAxis; ++i) {
        mOutside *= input->length(i);
    }
    for (int i=mAxis+1; i<input->dimensions(); ++i) {
        mInside *= input->length(i);
    }
    mDim = input->length(mAxis);

    return NO_ERROR;
}

ErrorCode ArgMaxExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    auto runtime = static_cast<CUDABackend *>(backend())->getCUDARuntime();

    auto input = (void *)inputs[0]->deviceId();
    auto output = (void *)outputs[0]->deviceId();

    int count = mOutside * mInside;
    int block_num = runtime->blocks_num(count);
    int thread_num = runtime->threads_num();
    ARGMAX<<<block_num, thread_num>>>(count, mOutside, mInside, mDim, (const float*)input, (float *)output);

    return NO_ERROR;
}
class ArgMaxCreator : public CUDABackend::Creator {
public:
    virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                const MNN::Op* op, Backend* backend) const override {
        return new ArgMaxExecution(op, backend);
    }
};

static CUDACreatorRegister<ArgMaxCreator> __init(OpType_ArgMax);
}
}
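Editor's note: the ARGMAX kernel above works on a flattened (outside, dim, inside) view of the tensor that onResize derives from mAxis. For reference, a plain C++ sketch of argmax over the middle axis of such a view is given here; argmaxMiddleAxis and the sample shape are illustrative only and are not part of the MNN sources.

#include <cstdio>
#include <vector>

// Reference argmax over the middle axis of an (outside, dim, inside) view.
// For each (o, n) pair it scans the dim entries spaced `inside` apart.
std::vector<int> argmaxMiddleAxis(const std::vector<float>& input, int outside, int dim, int inside) {
    std::vector<int> output(outside * inside, 0);
    for (int o = 0; o < outside; ++o) {
        for (int n = 0; n < inside; ++n) {
            const float* inpPtr = input.data() + inside * dim * o + n;
            int index = 0;
            float maxValue = inpPtr[0];
            for (int j = 1; j < dim; ++j) {
                float value = inpPtr[j * inside];
                if (maxValue < value) {
                    index = j;
                    maxValue = value;
                }
            }
            output[inside * o + n] = index;
        }
    }
    return output;
}

int main() {
    // A 1x3x2 example: argmax over the axis of length 3.
    std::vector<float> x = {0.1f, 0.9f, 0.5f, 0.2f, 0.3f, 0.8f};
    auto idx = argmaxMiddleAxis(x, /*outside=*/1, /*dim=*/3, /*inside=*/2);
    printf("%d %d\n", idx[0], idx[1]);  // expected: 1 0
    return 0;
}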
@@ -0,0 +1,33 @@
//
//  ArgMaxExecution.hpp
//  MNN
//
//  Created by MNN on 2020/07/29.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#ifndef ArgMaxExecution_hpp
#define ArgMaxExecution_hpp
#include <vector>
#include "backend/cuda/core/CUDABackend.hpp"
#include "core/Execution.hpp"
namespace MNN {
namespace CUDA {
class ArgMaxExecution : public Execution {
public:
    ArgMaxExecution(const Op* op, Backend *backend);
    virtual ~ArgMaxExecution();
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;

private:
    const Op* mOp;
    int mAxis;
    int mInside;
    int mOutside;
    int mDim;
};
} // namespace CUDA
} // namespace MNN

#endif
@ -0,0 +1,117 @@
|
|||
#include "BatchMatMulExecution.hpp"
|
||||
namespace MNN {
|
||||
namespace CUDA {
|
||||
|
||||
template <typename T>
|
||||
__global__ void transpose_bias(T *input, T *output, const T* bias, int batch, int e, int h) {
|
||||
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < batch * e * h; index += blockDim.x * gridDim.x) {
|
||||
int i = index % (e*h);
|
||||
int b = index / (e*h);
|
||||
int y = i / e;
|
||||
output[index] = input[index] + bias[b * h + y];
|
||||
}
|
||||
return;
|
||||
}
|
||||
BatchMatMulExecution::BatchMatMulExecution(bool transposeA, bool transposeB, Backend *backend) : Execution(backend) {
|
||||
mTransposeA = transposeA;
|
||||
mTransposeB = transposeB;
|
||||
}
|
||||
BatchMatMulExecution::~ BatchMatMulExecution() {
|
||||
// do nothing
|
||||
}
|
||||
|
||||
ErrorCode BatchMatMulExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
auto C = outputs[0];
|
||||
|
||||
auto dimensions = C->dimensions();
|
||||
int batch = 1;
|
||||
for (int i = 0; i < dimensions - 2; ++i) {
|
||||
batch *= C->length(i);
|
||||
}
|
||||
auto e = C->length(dimensions-2);
|
||||
auto h = C->length(dimensions-1);
|
||||
if(inputs.size() > 2) {
|
||||
mTempOutput.reset(Tensor::createDevice<float>({batch*h*e}));
|
||||
auto res = backend()->onAcquireBuffer(mTempOutput.get(), Backend::DYNAMIC);
|
||||
if (!res) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC);
|
||||
}
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
ErrorCode BatchMatMulExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
|
||||
auto blasHandle = runtime->cublas_handle();
|
||||
const Tensor* A = inputs[0];
|
||||
const Tensor* B = inputs[1];
|
||||
|
||||
auto dimensions = A->dimensions();
|
||||
int batch = 1;
|
||||
for (int i = 0; i < dimensions - 2; ++i) {
|
||||
batch *= A->length(i);
|
||||
}
|
||||
|
||||
auto w0 = inputs[0]->length(dimensions-1);
|
||||
auto h0 = inputs[0]->length(dimensions-2);
|
||||
auto C = outputs[0];
|
||||
|
||||
auto e = C->length(dimensions-2);
|
||||
auto h = C->length(dimensions-1);
|
||||
auto l = w0;
|
||||
if (mTransposeA) {
|
||||
l = h0;
|
||||
}
|
||||
auto APtr = (const float*)A->deviceId();
|
||||
auto BPtr = (const float*)B->deviceId();
|
||||
auto CDestPtr = (float*)C->deviceId();
|
||||
|
||||
float alpha = 1.0f;
|
||||
float beta = 0.0f;
|
||||
|
||||
auto tranB = CUBLAS_OP_N;
|
||||
auto ldB = h;
|
||||
if (mTransposeB) {
|
||||
ldB = l;
|
||||
tranB = CUBLAS_OP_T;
|
||||
}
|
||||
auto tranA = CUBLAS_OP_N;
|
||||
auto ldA = l;
|
||||
if (mTransposeA) {
|
||||
ldA = e;
|
||||
tranA = CUBLAS_OP_T;
|
||||
}
|
||||
|
||||
if(inputs.size() == 2) {
|
||||
auto status = cublasSgemmStridedBatched(blasHandle, tranB, tranA, h, e, l, &alpha, BPtr, ldB, l*h, APtr, ldA, e*l, &beta, CDestPtr, h, e*h, batch);
|
||||
cublas_check(status);
|
||||
//cudaThreadSynchronize();
|
||||
|
||||
} else {
|
||||
auto CPtr = (float*)mTempOutput->deviceId();
|
||||
auto status = cublasSgemmStridedBatched(blasHandle, tranB, tranA, h, e, l, &alpha, BPtr, ldB, l*h, APtr, ldA, e*l, &beta, CPtr, h, e*h, batch);
|
||||
cublas_check(status);
|
||||
//cudaThreadSynchronize();
|
||||
// Transpose batch, h, e -> batch, e, h
|
||||
int block_num = runtime->blocks_num(batch*e*h);
|
||||
int threads_num = runtime->threads_num();
|
||||
transpose_bias<<<block_num, threads_num>>>((float*)CPtr, (float*)CDestPtr, (const float*)inputs[2]->deviceId(), batch, e, h);
|
||||
}
|
||||
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
class BatchMatMulCreator : public CUDABackend::Creator {
|
||||
public:
|
||||
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
|
||||
const MNN::Op* op, Backend* backend) const override {
|
||||
auto param = op->main_as_BatchMatMulParam();
|
||||
return new BatchMatMulExecution(param->adjX(), param->adjY(), backend);
|
||||
}
|
||||
};
|
||||
|
||||
static CUDACreatorRegister<BatchMatMulCreator> __init(OpType_BatchMatMul);
|
||||
|
||||
}
|
||||
}
|
|
@ -0,0 +1,31 @@
|
|||
//
|
||||
// BatchMatMulExecution.hpp
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2020/07/30.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifndef BatchMatMulExecution_hpp
|
||||
#define BatchMatMulExecution_hpp
|
||||
#include <vector>
|
||||
#include "backend/cuda/core/CUDABackend.hpp"
|
||||
#include "core/Execution.hpp"
|
||||
namespace MNN {
|
||||
namespace CUDA {
|
||||
class BatchMatMulExecution : public Execution {
|
||||
public:
|
||||
BatchMatMulExecution(bool transposeA, bool transposeB, Backend *backend);
|
||||
virtual ~BatchMatMulExecution();
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
|
||||
private:
|
||||
std::shared_ptr<Tensor> mTempOutput;
|
||||
bool mTransposeA;
|
||||
bool mTransposeB;
|
||||
};
|
||||
} // namespace CUDA
|
||||
} // namespace MNN
|
||||
|
||||
#endif
@@ -48,14 +48,16 @@ __global__ void MUL(const T *input0, const T* input1, T *output, size_t count, s
template <typename T>
__global__ void DIV(const T *input0, const T* input1, T *output, size_t count, size_t s0, size_t s1) {
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
        output[i] = input0[i * s0] / input1[i * s1];
        int sgn = input1[i * s1] > 0 ? 1 : (input1[i * s1] < 0 ? -1 : 0);
        output[i] = sgn * input0[i * s0] / max(abs((float)input1[i * s1]), 0.0000001);
    }
    return;
}
template <typename T>
__global__ void REALDIV(const T *input0, const T* input1, T *output, size_t count, size_t s0, size_t s1) {
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
        output[i] = input0[i * s0] / input1[i * s1];
        int sgn = input1[i * s1] > 0 ? 1 : (input1[i * s1] < 0 ? -1 : 0);
        output[i] = sgn * input0[i * s0] / max(abs((float)input1[i * s1]), 0.0000001);
    }
    return;
}
@@ -123,7 +125,9 @@ __global__ void NOTEQUAL(const T *input0, const T* input1, int *output, size_t c
template <typename T>
__global__ void FLOORDIV(const T *input0, const T* input1, T *output, size_t count, size_t s0, size_t s1) {
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
        output[i] = floor(1.0*(input0[i * s0] / input1[i * s1]));
        int sgn = input1[i * s1] > 0 ? 1 : (input1[i * s1] < 0 ? -1 : 0);
        output[i] = floor(1.0*sgn * input0[i * s0] / max(abs((float)input1[i * s1]), 0.0000001));

    }
    return;
}
@@ -133,7 +137,10 @@ __global__ void FLOORMOD(const T *input0, const T* input1, T *output, size_t cou
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
        T x = input0[i * s0];
        T y = input1[i * s1];
        output[i] = x - floor(1.0*(x / y)) * y;
        int sgn = y > 0 ? 1 : (y < 0 ? -1 : 0);
        T tmp = floor(1.0*sgn * x / max((float)abs(y), 0.0000001));

        output[i] = x - tmp * y;
    }
    return;
}
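Editor's note: the updated DIV/REALDIV/FLOORDIV/FLOORMOD kernels replace a raw a / b with a sign-preserving division whose denominator is clamped away from zero. A minimal host-side sketch of that guard follows; safeDiv is an illustrative name, and the 1e-7 epsilon matches the constant used in the kernels.

#include <algorithm>
#include <cmath>
#include <cstdio>

// Sign-preserving "safe" division: sgn(b) * a / max(|b|, eps).
// With eps = 1e-7f this mirrors the clamping in the kernels above; b == 0 yields 0.
float safeDiv(float a, float b, float eps = 1e-7f) {
    int sgn = b > 0 ? 1 : (b < 0 ? -1 : 0);
    return sgn * a / std::max(std::fabs(b), eps);
}

int main() {
    printf("%f %f %f\n", safeDiv(1.0f, 4.0f), safeDiv(1.0f, -4.0f), safeDiv(1.0f, 0.0f));
    // expected: 0.250000 -0.250000 0.000000
    return 0;
}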
@@ -54,7 +54,7 @@ ErrorCode ConvDepthWiseExecution::onResize(const std::vector<Tensor *> &inputs,
    parameters.activationType = convCommon->relu() ? 1 : (convCommon->relu6() ? 2 : 0);

    auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
    runtime->memcpy(mConstBuffer, &parameters, sizeof(constBuffer), MNNMemcpyHostToDevice);
    runtime->memcpy((uint8_t*)mConstBuffer.first + mConstBuffer.second, &parameters, sizeof(constBuffer), MNNMemcpyHostToDevice);
    mTotalCount = parameters.total;

    if(inputs.size() == 1) {

@@ -149,16 +149,17 @@ ErrorCode ConvDepthWiseExecution::onExecute(const std::vector<Tensor *> &inputs,
    auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
    int block_num = runtime->blocks_num(mTotalCount);
    int threads_num = runtime->threads_num();
    auto constPtr = (uint8_t*)mConstBuffer.first + mConstBuffer.second;
    if (inputs.size() == 1) {
        CONV_DW<<<block_num, threads_num>>>((const float*)inputs[0]->deviceId(), (const float*)mFilter,
            (const float*)mBias, (float*)outputs[0]->deviceId(), (const constBuffer*)mConstBuffer);
            (const float*)mBias, (float*)outputs[0]->deviceId(), (const constBuffer*)(constPtr));
    } else if (inputs.size() == 3) {
        CONV_DW<<<block_num, threads_num>>>((const float*)inputs[0]->deviceId(), (const float*)inputs[1]->deviceId(),
            (const float*)inputs[2]->deviceId(), (float*)outputs[0]->deviceId(), (const constBuffer*)mConstBuffer);
            (const float*)inputs[2]->deviceId(), (float*)outputs[0]->deviceId(), (const constBuffer*)constPtr);
    } else {
        MNN_ASSERT(inputs.size() == 2);
        CONV_DW<<<block_num, threads_num>>>((const float*)inputs[0]->deviceId(), (const float*)inputs[1]->deviceId(),
            nullptr, (float*)outputs[0]->deviceId(), (const constBuffer*)mConstBuffer);
            nullptr, (float*)outputs[0]->deviceId(), (const constBuffer*)constPtr);
    }
    return NO_ERROR;
}

@@ -249,9 +250,10 @@ ErrorCode DeconvDepthWiseExecution::onResize(const std::vector<Tensor *> &inputs
    parameters.outputSize[1] = outputs[0]->height();
    parameters.total = parameters.channel * parameters.outputSize[1] * parameters.outputSize[0];
    parameters.subChannel = inputs[0]->channel();
    auto constPtr = (uint8_t*)mConstBuffer.first + mConstBuffer.second;

    auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
    runtime->memcpy(mConstBuffer, &parameters, sizeof(constBuffer), MNNMemcpyHostToDevice);
    runtime->memcpy(constPtr, &parameters, sizeof(constBuffer), MNNMemcpyHostToDevice);
    mTotalCount = parameters.total;
    return NO_ERROR;
}

@@ -260,12 +262,13 @@ ErrorCode DeconvDepthWiseExecution::onExecute(const std::vector<Tensor *> &input
    auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
    int block_num = runtime->blocks_num(mTotalCount);
    int threads_num = runtime->threads_num();
    auto constPtr = (uint8_t*)mConstBuffer.first + mConstBuffer.second;
    if (inputs.size() > 2) {
        DECONV_DW<<<block_num, threads_num>>>((const float*)inputs[0]->deviceId(), (const float*)inputs[1]->deviceId(),
            (const float*)inputs[2]->deviceId(), (float*)outputs[0]->deviceId(), (const constBuffer*)mConstBuffer);
            (const float*)inputs[2]->deviceId(), (float*)outputs[0]->deviceId(), (const constBuffer*)constPtr);
    } else {
        DECONV_DW<<<block_num, threads_num>>>((const float*)inputs[0]->deviceId(), (const float*)inputs[1]->deviceId(),
            nullptr, (float*)outputs[0]->deviceId(), (const constBuffer*)mConstBuffer);
            nullptr, (float*)outputs[0]->deviceId(), (const constBuffer*)constPtr);
    }
    return NO_ERROR;
}

@@ -22,7 +22,7 @@ public:
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;

protected:
    void *mConstBuffer;
    std::pair<void*, int> mConstBuffer;
    const Op *mOp;
    int mTotalCount;

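In this revision mConstBuffer changes from a raw device pointer to a std::pair<void*, int>, and every use site rebuilds the effective address as (uint8_t*)mConstBuffer.first + mConstBuffer.second. The pair evidently carries a base allocation plus a byte offset into it. A small hedged sketch of that convention (the names here are illustrative, not MNN's actual API):

    #include <cstdint>
    #include <utility>

    // Hypothetical {base pointer, byte offset} buffer handle, mirroring how the
    // patched code dereferences mConstBuffer before memcpy and kernel launches.
    using BufferHandle = std::pair<void*, int>;

    inline uint8_t* devicePtr(const BufferHandle& h) {
        // Same arithmetic as "(uint8_t*)mConstBuffer.first + mConstBuffer.second".
        return static_cast<uint8_t*>(h.first) + h.second;
    }
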
@@ -70,7 +70,8 @@ ConvSingleInputExecution::ConvSingleInputExecution(Backend* backend, const MNN::
    cudnn_data_type_ = CUDNN_DATA_FLOAT;
    cudnn_data_type_len_ = 0;

    cudnn_check(cudnnCreate(&cudnn_handle_));
    auto runtime = static_cast<CUDABackend*>(backend)->getCUDARuntime();
    cudnn_handle_ = runtime->cudnn_handle();
    cudnn_check(cudnnCreateTensorDescriptor(&input_desc_));
    cudnn_check(cudnnCreateTensorDescriptor(&output_desc_));
    cudnn_check(cudnnCreateTensorDescriptor(&padded_desc_));

@@ -111,10 +112,24 @@ ConvSingleInputExecution::ConvSingleInputExecution(Backend* backend, const MNN::
        }
        use_bias_ = true;
    }

    mKernelInfo.kernelN = common->outputCount();
    mKernelInfo.kernelC = weightSize / (mKernelInfo.kernelN * mKernelInfo.kernelY * mKernelInfo.kernelX);
    std::vector<int> filter_shape = {mKernelInfo.kernelN, mKernelInfo.kernelC, mKernelInfo.kernelY, mKernelInfo.kernelX};

    cudnn_check(cudnnSetFilter4dDescriptor(filter_desc_, cudnn_data_type_, CUDNN_TENSOR_NCHW, filter_shape[0],
        filter_shape[1], filter_shape[2], filter_shape[3]));

    cudnn_check(cudnnSetConvolution2dDescriptor(conv_desc_, 0, 0, mKernelInfo.strideY, mKernelInfo.strideX,
        mKernelInfo.dilateY, mKernelInfo.dilateX, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));
    if (cudnn_data_type_ == CUDNN_DATA_HALF) {
        cudnn_check(cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH));
    }
    //set group num
    cudnn_check(cudnnSetConvolutionGroupCount(conv_desc_, mKernelInfo.groups));
}

ConvSingleInputExecution::~ConvSingleInputExecution() {
    cudnn_check(cudnnDestroy(cudnn_handle_));
    cudnn_check(cudnnDestroyConvolutionDescriptor(conv_desc_));
    cudnn_check(cudnnDestroyFilterDescriptor(filter_desc_));
    cudnn_check(cudnnDestroyTensorDescriptor(padded_desc_));

@@ -152,9 +167,32 @@ ErrorCode ConvSingleInputExecution::onResize(const std::vector<Tensor*> &inputs,
    mKernelInfo.kernelN = output->channel();
    mKernelInfo.kernelC = input->channel() / mKernelInfo.groups;

    if(mIOInfo.iw==0) {
        mIOInfo.iw = 1;
    }
    if(mIOInfo.ih==0) {
        mIOInfo.ih = 1;
    }
    if(mIOInfo.ic==0) {
        mIOInfo.ic = 1;
    }
    if(mIOInfo.ib==0) {
        mIOInfo.ib = 1;
    }
    if(mIOInfo.ow==0) {
        mIOInfo.ow = 1;
    }
    if(mIOInfo.oh==0) {
        mIOInfo.oh = 1;
    }
    if(mIOInfo.oc==0) {
        mIOInfo.oc = 1;
    }
    if(mIOInfo.ob==0) {
        mIOInfo.ob = 1;
    }
    std::vector<int> in_shape = {mIOInfo.ib, mIOInfo.ic, mIOInfo.ih, mIOInfo.iw};
    std::vector<int> output_shape = {mIOInfo.ob, mIOInfo.oc, mIOInfo.oh, mIOInfo.ow};
    std::vector<int> filter_shape = {mKernelInfo.kernelN, mKernelInfo.kernelC, mKernelInfo.kernelY, mKernelInfo.kernelX};

    // printf("filter:%d %d %d %d\n", filter_shape[0], filter_shape[1], filter_shape[2], filter_shape[3]);
    // printf("input:%d %d %d %d\n", in_shape[0], in_shape[1], in_shape[2], in_shape[3]);

@@ -162,8 +200,6 @@ ErrorCode ConvSingleInputExecution::onResize(const std::vector<Tensor*> &inputs,
    cudnn_check(cudnnSetTensor4dDescriptor(input_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, in_shape[0],
        in_shape[1], in_shape[2], in_shape[3]));

    cudnn_check(cudnnSetFilter4dDescriptor(filter_desc_, cudnn_data_type_, CUDNN_TENSOR_NCHW, filter_shape[0],
        filter_shape[1], filter_shape[2], filter_shape[3]));
    cudnn_check(cudnnSetTensor4dDescriptor(output_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, output_shape[0],
        output_shape[1], output_shape[2], output_shape[3]));

@@ -205,14 +241,6 @@ ErrorCode ConvSingleInputExecution::onResize(const std::vector<Tensor*> &inputs,
    }
    input_descriptor_real = use_pad_ ? padded_desc_ : input_desc_;

    cudnn_check(cudnnSetConvolution2dDescriptor(conv_desc_, 0, 0, mKernelInfo.strideY, mKernelInfo.strideX,
        mKernelInfo.dilateY, mKernelInfo.dilateX, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));
    if (cudnn_data_type_ == CUDNN_DATA_HALF) {
        cudnn_check(cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH));
    }
    //set group num
    cudnn_check(cudnnSetConvolutionGroupCount(conv_desc_, mKernelInfo.groups));

    // algorithm
    constexpr int requested_algo_count = 1;
    int returned_algo_count;

@@ -246,7 +274,6 @@ ErrorCode ConvSingleInputExecution::onResize(const std::vector<Tensor*> &inputs,

ErrorCode ConvSingleInputExecution::onExecute(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs) {
    //MNN_PRINT("cuda convSingleInput onExecute in, inputsize:%d %d\n", (int)inputs.size(), workspace_size_);

    MNN_ASSERT(inputs.size() == 1);
    MNN_ASSERT(outputs.size() == 1);

@@ -264,7 +291,6 @@ ErrorCode ConvSingleInputExecution::onExecute(const std::vector<Tensor*> &inputs
    const float alpha = 1;
    const float beta = 0;

    if(use_pad_) {
        std::vector<int> in_shape = {mIOInfo.ib, mIOInfo.ic, mIOInfo.ih, mIOInfo.iw};

@@ -289,6 +315,7 @@ ErrorCode ConvSingleInputExecution::onExecute(const std::vector<Tensor*> &inputs
    if(use_relu_ || use_relu6_) {
        cudnn_check(cudnnActivationForward(cudnn_handle_, act_desc_, &alpha, output_desc_, output_addr, &beta, output_desc_, output_addr));
    }

    return NO_ERROR;
}

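The resize path now guards against degenerate shapes by forcing every zero extent in mIOInfo to 1 before the cuDNN descriptors are built. The eight repeated zero checks could be condensed; one possible refactor sketch (not the patch's code, just an illustration of the same behavior):

    #include <initializer_list>

    // Possible condensation of the repeated zero-extent clamps.
    inline void clampToOne(std::initializer_list<int*> dims) {
        for (int* d : dims) {
            if (*d == 0) {
                *d = 1; // cuDNN tensor descriptors reject zero-sized extents.
            }
        }
    }

    // Usage, assuming the mIOInfo fields from the surrounding code:
    // clampToOne({&mIOInfo.iw, &mIOInfo.ih, &mIOInfo.ic, &mIOInfo.ib,
    //             &mIOInfo.ow, &mIOInfo.oh, &mIOInfo.oc, &mIOInfo.ob});
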
@@ -65,7 +65,8 @@ DeconvSingleInputExecution::DeconvSingleInputExecution(Backend* backend, const M
    cudnn_data_type_ = CUDNN_DATA_FLOAT;
    cudnn_data_type_len_ = 0;

    cudnn_check(cudnnCreate(&cudnn_handle_));
    auto runtime = static_cast<CUDABackend*>(backend)->getCUDARuntime();
    cudnn_handle_ = runtime->cudnn_handle();
    cudnn_check(cudnnCreateTensorDescriptor(&input_desc_));
    cudnn_check(cudnnCreateTensorDescriptor(&output_desc_));
    cudnn_check(cudnnCreateTensorDescriptor(&padded_desc_));

@@ -110,7 +111,6 @@ DeconvSingleInputExecution::DeconvSingleInputExecution(Backend* backend, const M
}

DeconvSingleInputExecution::~DeconvSingleInputExecution() {
    cudnn_check(cudnnDestroy(cudnn_handle_));
    cudnn_check(cudnnDestroyConvolutionDescriptor(conv_desc_));
    cudnn_check(cudnnDestroyFilterDescriptor(filter_desc_));
    cudnn_check(cudnnDestroyTensorDescriptor(padded_desc_));

@@ -0,0 +1,101 @@
#include "GatherV2Execution.hpp"
namespace MNN {
namespace CUDA {

template <typename T>
__global__ void GATHERV2(const int count, const int outside, const int inside, const int iNum, const int oNum,
        const T *input, const int* indice, T *output) {
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
        const int o = i / oNum;
        const int n = i % oNum;

        T* outPtr = output + inside * oNum * o;
        const T* inpPtr = input + inside * iNum * o;
        for(int j=0; j<inside; j++) {
            outPtr[n*inside+j] = inpPtr[indice[n]*inside+j];
        }

    }
    return;
}
GatherV2Execution::GatherV2Execution(const Op* op, Backend *backend) : Execution(backend) {
    mOp = op;
}

GatherV2Execution::~GatherV2Execution(){
    // Do nothing
}

ErrorCode GatherV2Execution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    auto params = inputs[0];
    mAxis = 0;
    if (mOp->main_type() == OpParameter_Axis) {
        mAxis = mOp->main_as_Axis()->axis();
    }
    MNN_ASSERT(mAxis > -params->buffer().dimensions && mAxis < params->buffer().dimensions);

    if (mAxis < 0) {
        mAxis = params->buffer().dimensions + mAxis;
    }

    auto indices = inputs[1];
    auto output = outputs[0];
    mOutNum = indices->elementSize();
    mInside = 1;
    mOutside = 1;
    for (int i=0; i<mAxis; ++i) {
        mOutside *= params->length(i);
    }
    for (int i=mAxis+1; i<params->dimensions(); ++i) {
        mInside *= params->length(i);
    }
    mInpNum = params->length(mAxis);

    return NO_ERROR;
}

ErrorCode GatherV2Execution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    auto runtime = static_cast<CUDABackend *>(backend())->getCUDARuntime();

    auto params = (void *)inputs[0]->deviceId();
    auto indices = (void *)inputs[1]->deviceId();
    auto output = (void *)outputs[0]->deviceId();

    if (inputs.size() == 3) {
        cudaMemcpy(&mAxis, (void *)inputs[2]->deviceId(), sizeof(int), cudaMemcpyDeviceToHost);

        auto input0 = inputs[0];
        MNN_ASSERT(mAxis > -input0->dimensions() && mAxis < input0->dimensions());
        if (mAxis < 0) {
            mAxis = input0->dimensions() + mAxis;
        }

        mInside = 1;
        mOutside = 1;
        for (int i=0; i<mAxis; ++i) {
            mOutside *= input0->length(i);
        }
        for (int i=mAxis+1; i<input0->dimensions(); ++i) {
            mInside *= input0->length(i);
        }
        mInpNum = input0->length(mAxis);
    }

    int count = mOutside * mOutNum;
    int block_num = runtime->blocks_num(count);
    int thread_num = runtime->threads_num();
    GATHERV2<<<block_num, thread_num>>>(count, mOutside, mInside, mInpNum, mOutNum, (const float*)params, (int *)indices,
        (float *)output);
    return NO_ERROR;
}
class GatherV2Creator : public CUDABackend::Creator {
public:
    virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                const MNN::Op* op, Backend* backend) const override {
        return new GatherV2Execution(op, backend);
    }
};

static CUDACreatorRegister<GatherV2Creator> __init(OpType_GatherV2);
}
}

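GATHERV2 flattens the tensor around the gather axis into an (outside, axisLen, inside) view and copies one inside-sized slice per selected index. A minimal host-side C++ reference of the same indexing, useful for sanity-checking the kernel (illustrative only, not the backend's code):

    #include <vector>

    // Host reference for gather along an axis, using the same
    // (outside, inside, iNum, oNum) decomposition as the CUDA kernel.
    std::vector<float> gatherRef(const std::vector<float>& input, const std::vector<int>& indices,
                                 int outside, int inside, int iNum) {
        const int oNum = (int)indices.size();
        std::vector<float> output((size_t)outside * oNum * inside);
        for (int o = 0; o < outside; ++o) {
            const float* inpPtr = input.data() + (size_t)o * iNum * inside;
            float* outPtr = output.data() + (size_t)o * oNum * inside;
            for (int n = 0; n < oNum; ++n) {
                for (int j = 0; j < inside; ++j) {
                    outPtr[n * inside + j] = inpPtr[indices[n] * inside + j];
                }
            }
        }
        return output;
    }
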
@@ -0,0 +1,34 @@
//
//  GatherV2Execution.hpp
//  MNN
//
//  Created by MNN on 2020/07/29.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#ifndef GatherV2Execution_hpp
#define GatherV2Execution_hpp
#include <vector>
#include "backend/cuda/core/CUDABackend.hpp"
#include "core/Execution.hpp"
namespace MNN {
namespace CUDA {
class GatherV2Execution : public Execution {
public:
    GatherV2Execution(const Op* op, Backend *backend);
    virtual ~GatherV2Execution();
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;

private:
    const Op* mOp;
    int mAxis;
    int mInside;
    int mOutside;
    int mInpNum;
    int mOutNum;
};
} // namespace CUDA
} // namespace MNN

#endif

@@ -0,0 +1,352 @@
#include "LayerNormExecution.hpp"
namespace MNN {
namespace CUDA {

#define CUDA_KERNEL_LOOP(i, n) for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)

#define FINAL_MASK 0xffffffff

template <typename T>
__inline__ __device__
T warpReduceSum(T val)
{
    for(int mask = 16; mask > 0; mask >>= 1)
        val += __shfl_xor_sync(FINAL_MASK, val, mask, 32);
    return val;
}

template <typename T>
__inline__ __device__
T blockReduceSum(T val)
{
    static __shared__ T shared[32];
    int lane = threadIdx.x & 0x1f;
    int wid = threadIdx.x >> 5;

    val = warpReduceSum<T>(val);

    if(lane == 0)
        shared[wid] = val;
    __syncthreads();

    val = (threadIdx.x < (blockDim.x >> 5 )) ? shared[lane] : (T)0.0f;
    val = warpReduceSum(val);
    return val;
}

template <typename T>
__global__
void input_layernorm(T* out, const T* input, const T* gamma, const T* beta, int m, int n, const float epsilon, int sumPerKnl)
{
    int tid = threadIdx.x;

    __shared__ float s_mean;
    __shared__ float s_variance;
    float mean = 0.0f;
    float variance = 0.0f;

    float local_out = 0.0f;

    for(int idx=0; idx<sumPerKnl && idx*256 + tid < n; idx++) {
        local_out += (float)(input[blockIdx.x * n + idx*256 + tid]);
    }

    mean = blockReduceSum<float>(local_out);
    if(threadIdx.x == 0)
        s_mean = mean / n;
    __syncthreads();

    float var_tmp = 0.0f;
    for(int idx=0; idx<sumPerKnl && idx*256 + tid < n; idx++) {
        var_tmp += ((input[blockIdx.x * n + idx*256 + tid] - s_mean) * (input[blockIdx.x * n + idx*256 + tid] - s_mean));
    }
    variance += blockReduceSum<float>(var_tmp);
    if(threadIdx.x == 0)
        s_variance = variance / n + epsilon;
    __syncthreads();

    for(int idx=0; idx<sumPerKnl && idx*256 + tid < n; idx++) {
        out[blockIdx.x * n + idx*256+tid] =
            (T)(((input[blockIdx.x * n + idx*256 + tid] - s_mean) * rsqrtf(s_variance)) * (float)(__ldg(&gamma[idx*256 + tid])) + (float)(__ldg(&beta[idx*256 + tid])));
    }
}

template <typename T>
__global__
void input_layernorm_2048(T* out, const T* input, const T* gamma, const T* beta, int m, int n, const float epsilon)
{
    int tid = threadIdx.x;

    __shared__ float s_mean;
    __shared__ float s_variance;
    float mean = 0.0f;
    float variance = 0.0f;

    float local_out = 0.0f;

    float value_tmp[8];
    value_tmp[0] = input[blockIdx.x * 2048 + 0*256 + tid];
    value_tmp[1] = input[blockIdx.x * 2048 + 1*256 + tid];
    value_tmp[2] = input[blockIdx.x * 2048 + 2*256 + tid];
    value_tmp[3] = input[blockIdx.x * 2048 + 3*256 + tid];
    value_tmp[4] = input[blockIdx.x * 2048 + 4*256 + tid];
    value_tmp[5] = input[blockIdx.x * 2048 + 5*256 + tid];
    value_tmp[6] = input[blockIdx.x * 2048 + 6*256 + tid];
    value_tmp[7] = input[blockIdx.x * 2048 + 7*256 + tid];

    #pragma unroll(8)
    for(int idx=0; idx<8; idx++) {
        local_out += (float)value_tmp[idx];
    }

    mean = blockReduceSum<float>(local_out);
    if(threadIdx.x == 0)
        s_mean = mean / n;
    __syncthreads();

    float var_tmp = 0.0f;

    #pragma unroll(8)
    for(int idx=0; idx<8; idx++) {
        var_tmp += ((value_tmp[idx] - s_mean) * (value_tmp[idx] - s_mean));
    }
    variance += blockReduceSum<float>(var_tmp);
    if(threadIdx.x == 0)
        s_variance = variance / n + epsilon;
    __syncthreads();

    #pragma unroll(8)
    for(int idx=0; idx<8; idx++) {
        out[blockIdx.x * 2048 + idx*256+tid] =
            (T)(((value_tmp[idx] - s_mean) * rsqrtf(s_variance)) * (float)(__ldg(&gamma[idx*256 + tid])) + (float)(__ldg(&beta[idx*256 + tid])));
    }
}

template <typename T>
__global__
void input_layernorm_1024(T* out, const T* input, const T* gamma, const T* beta, int m, int n, const float epsilon)
{
    int tid = threadIdx.x;

    __shared__ float s_mean;
    __shared__ float s_variance;
    float mean = 0.0f;
    float variance = 0.0f;

    float local_out = 0.0f;

    float value_tmp[4];
    value_tmp[0] = input[blockIdx.x * 1024 + 0*256 + tid];
    value_tmp[1] = input[blockIdx.x * 1024 + 1*256 + tid];
    value_tmp[2] = input[blockIdx.x * 1024 + 2*256 + tid];
    value_tmp[3] = input[blockIdx.x * 1024 + 3*256 + tid];

    #pragma unroll(4)
    for(int idx=0; idx<4; idx++) {
        local_out += (float)value_tmp[idx];
    }

    mean = blockReduceSum<float>(local_out);
    if(threadIdx.x == 0)
        s_mean = mean / n;
    __syncthreads();

    float var_tmp = 0.0f;

    #pragma unroll(4)
    for(int idx=0; idx<4; idx++) {
        var_tmp += ((value_tmp[idx] - s_mean) * (value_tmp[idx] - s_mean));
    }
    variance += blockReduceSum<float>(var_tmp);
    if(threadIdx.x == 0)
        s_variance = variance / n + epsilon;
    __syncthreads();

    #pragma unroll(4)
    for(int idx=0; idx<4; idx++) {
        out[blockIdx.x * 1024 + idx*256+tid] =
            (T)(((value_tmp[idx] - s_mean) * rsqrtf(s_variance)) * (float)(__ldg(&gamma[idx*256 + tid])) + (float)(__ldg(&beta[idx*256 + tid])));
    }
}

template <typename T>
__global__
void input_layernorm_512(T* out, const T* input, const T* gamma, const T* beta, int m, int n, const float epsilon)
{
    int tid = threadIdx.x;

    __shared__ float s_mean;
    __shared__ float s_variance;
    float mean = 0.0f;
    float variance = 0.0f;

    float local_out = 0.0f;

    float value_tmp[2];
    value_tmp[0] = input[blockIdx.x * 512 + 0*256 + tid];
    value_tmp[1] = input[blockIdx.x * 512 + 1*256 + tid];

    local_out += (float)value_tmp[0];
    local_out += (float)value_tmp[1];

    mean = blockReduceSum<float>(local_out);
    if(threadIdx.x == 0)
        s_mean = mean / n;
    __syncthreads();

    float var_tmp = 0.0f;
    var_tmp += ((value_tmp[0] - s_mean) * (value_tmp[0] - s_mean));
    var_tmp += ((value_tmp[1] - s_mean) * (value_tmp[1] - s_mean));

    variance += blockReduceSum<float>(var_tmp);
    if(threadIdx.x == 0)
        s_variance = variance / n + epsilon;
    __syncthreads();

    out[blockIdx.x * 512 + 0*256+tid] =
        (T)(((value_tmp[0] - s_mean) * rsqrtf(s_variance)) * (float)(__ldg(&gamma[0*256 + tid])) + (float)(__ldg(&beta[0*256 + tid])));
    out[blockIdx.x * 512 + 1*256+tid] =
        (T)(((value_tmp[1] - s_mean) * rsqrtf(s_variance)) * (float)(__ldg(&gamma[1*256 + tid])) + (float)(__ldg(&beta[1*256 + tid])));
}

template<typename T>
__global__ void LAYERNORM(const int count, const int outside, const int inside, const float epsilon,
        const T* in, T* out, const T* gamma_data, const T* beta_data) {
    CUDA_KERNEL_LOOP(i, count) {
        const int o = i / inside;
        const int index = i % inside;
        const T* inner_input = in + o * inside;
        T* inner_output = out + o * inside;
        T sum = 0.f;
        for (int j = 0; j < inside; ++j) {
            sum += inner_input[j];
        }
        T mean = sum / inside;
        T square_sum = 0.f;
        for (int j = 0; j < inside; ++j) {
            square_sum += (inner_input[j] - mean) * (inner_input[j] - mean);
        }
        T variable = square_sum / inside;
        variable = 1.f / sqrt(variable + epsilon);

        inner_output[index] = (inner_input[index] - mean) * variable * gamma_data[index] + beta_data[index];
    }
}

LayerNormExecution::LayerNormExecution(const LayerNorm* layer_norm_param, Backend *backend) : Execution(backend) {
    int axis_size = layer_norm_param->axis()->size();
    mAxises.resize(axis_size);
    for (int i = 0; i < axis_size; ++i) {
        mAxises[i] = layer_norm_param->axis()->Get(i);
    }

    mEps = layer_norm_param->epsilon();

    int size = layer_norm_param->gamma()->size();
    mGammaTensor.reset(Tensor::createDevice<float>({size}));
    auto status = backend->onAcquireBuffer(mGammaTensor.get(), Backend::STATIC);
    if (!status) {
        MNN_ERROR("Out of memory when gamma is acquired in CudaLayerNorm.\n");
    }

    mDeviceGamma = (void *)mGammaTensor.get()->buffer().device;
    const float* gamma_data = layer_norm_param->gamma()->data();
    cudaMemcpy(mDeviceGamma, gamma_data, size * sizeof(float), cudaMemcpyHostToDevice);

    if (layer_norm_param->beta()->size() != size) {
        MNN_ERROR("Size of gamma and beta are not match in CudaLayerNorm.\n");
    }
    mBetaTensor.reset(Tensor::createDevice<float>({size}));
    status = backend->onAcquireBuffer(mBetaTensor.get(), Backend::STATIC);
    if (!status) {
        MNN_ERROR("Out of memory when beta is acquired in CudaLayerNorm.\n");
    }

    mDeviceBeta = (void *)mBetaTensor.get()->buffer().device;
    const float* beta_data = layer_norm_param->beta()->data();
    cudaMemcpy(mDeviceBeta, beta_data, size * sizeof(float), cudaMemcpyHostToDevice);

}
LayerNormExecution::~LayerNormExecution() {
    if (nullptr != mGammaTensor) {
        backend()->onReleaseBuffer(mGammaTensor.get(), Backend::STATIC);
    }
    if (nullptr != mBetaTensor) {
        backend()->onReleaseBuffer(mBetaTensor.get(), Backend::STATIC);
    }
}

ErrorCode LayerNormExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    MNN_ASSERT(inputs.size() == 1);
    MNN_ASSERT(outputs.size() == 1);
    auto input = inputs[0];

    mOutside = 1;
    mInside = 1;
    int rank = input->dimensions();
    std::vector<int> axis(mAxises.size());
    for (int i = 0; i < mAxises.size(); ++i) {
        if (mAxises[i] < 0) {
            mAxises[i] += rank;
        }
    }
    std::sort(axis.begin(), axis.end());
    for (int i = 0; i < rank - axis.size(); ++i) {
        mOutside *= input->length(i);
    }
    for (int i = rank - axis.size(); i < rank; ++i) {
        mInside *= input->length(i);
    }

    return NO_ERROR;
}

ErrorCode LayerNormExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();

    int block_num = runtime->blocks_num(mOutside*mInside);
    int threads_num = runtime->threads_num();
    auto input_addr = (void*)inputs[0]->deviceId();
    auto output_addr = (void*)outputs[0]->deviceId();

    if(mInside < 128) {
        LAYERNORM<<<block_num, threads_num>>>(mOutside*mInside, mOutside, mInside, mEps, (const float *)input_addr, (float *)output_addr,
            (const float *)mDeviceGamma, (const float *)mDeviceBeta);
    } else {
        if(mInside == 2048) {
            input_layernorm_2048<<<mOutside, 256>>>((float *)output_addr, (const float *)input_addr, (const float *)mDeviceGamma,
                (const float *)mDeviceBeta, mOutside, mInside, mEps);
        } else if(mInside == 1024) {
            input_layernorm_1024<<<mOutside, 256>>>((float *)output_addr, (const float *)input_addr, (const float *)mDeviceGamma,
                (const float *)mDeviceBeta, mOutside, mInside, mEps);
        } else if(mInside == 512) {
            input_layernorm_512<<<mOutside, 256>>>((float *)output_addr, (const float *)input_addr, (const float *)mDeviceGamma,
                (const float *)mDeviceBeta, mOutside, mInside, mEps);
        } else {
            int sumPerKnl = (mInside+255) / 256;
            input_layernorm<<<mOutside, 256>>>((float *)output_addr, (const float *)input_addr, (const float *)mDeviceGamma,
                (const float *)mDeviceBeta, mOutside, mInside, mEps, sumPerKnl);
        }
    }
    return NO_ERROR;
}

class LayerNormCreator : public CUDABackend::Creator {
public:
    virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                const MNN::Op* op, Backend* backend) const override {
        auto param = op->main_as_LayerNorm();
        return new LayerNormExecution(param, backend);
    }
};

static CUDACreatorRegister<LayerNormCreator> __init(OpType_LayerNorm);

}
}

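The specialized kernels (input_layernorm_512/1024/2048) assign one 256-thread block per normalized row and compute the mean and variance with a warp-shuffle reduction (warpReduceSum feeding blockReduceSum). A minimal host-side C++ reference of the normalization itself, handy for checking the kernels (illustrative only, not MNN's code):

    #include <cmath>

    // Host reference: normalize each "inside"-sized row independently, then
    // apply the per-element gamma/beta affine transform, as the kernels do.
    void layerNormRef(const float* in, float* out, const float* gamma, const float* beta,
                      int outside, int inside, float eps) {
        for (int o = 0; o < outside; ++o) {
            const float* row = in + o * inside;
            float* dst = out + o * inside;
            float mean = 0.f;
            for (int j = 0; j < inside; ++j) mean += row[j];
            mean /= inside;
            float var = 0.f;
            for (int j = 0; j < inside; ++j) var += (row[j] - mean) * (row[j] - mean);
            var = var / inside + eps;
            const float invStd = 1.f / std::sqrt(var);
            for (int j = 0; j < inside; ++j) {
                dst[j] = (row[j] - mean) * invStd * gamma[j] + beta[j];
            }
        }
    }
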
@@ -0,0 +1,49 @@
//
//  LayerNormExecution.hpp
//  MNN
//
//  Created by MNN on 2019/01/31.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#ifndef LayerNormExecution_hpp
#define LayerNormExecution_hpp

#include "core/Execution.hpp"

#include <vector>
#include "backend/cuda/core/CUDABackend.hpp"

namespace MNN {
namespace CUDA {

class LayerNormExecution : public Execution {
public:
    LayerNormExecution(const LayerNorm* layer_norm_param, Backend *backend);
    virtual ~LayerNormExecution();

    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;

private:
    CUDARuntime *mRuntime;
    void *mDeviceGamma = nullptr;
    void *mDeviceBeta = nullptr;

    std::vector<int> mAxises;
    int mInside = 1;
    int mOutside = 1;

    float mEps = 0.001;

    std::unique_ptr<Tensor> mGammaTensor;
    std::unique_ptr<Tensor> mBetaTensor;

    std::shared_ptr<Tensor> LayerNormTensor;
    std::shared_ptr<Tensor> biasTensor;

};

} // namespace CUDA
} // namespace MNN
#endif /* LayerNormExecution_hpp */

@@ -3,20 +3,11 @@ namespace MNN {
namespace CUDA {

template <typename T>
__global__ void transpose(T *input, T *output, size_t e, size_t h) {
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < e * h; i += blockDim.x * gridDim.x) {
__global__ void transpose_bias(T *input, T *output, const T* bias, int e, int h) {
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < e * h; i += blockDim.x * gridDim.x) {
        int y = i % e;
        int x = i / e;
        output[y * h + x] = input[i];
    }
    return;
}
template <typename T>
__global__ void transpose_bias(T *input, T *output, const T* bias, size_t e, size_t h) {
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < e * h; i += blockDim.x * gridDim.x) {
        int y = i % e;
        int x = i / e;
        output[y * h + x] = input[i] + bias[x];
        output[i] = input[i] + bias[x];
    }
    return;
}

@@ -32,12 +23,14 @@ ErrorCode MatMulExecution::onResize(const std::vector<Tensor *> &inputs, const s
    auto C = outputs[0];
    auto e = C->length(0);
    auto h = C->length(1);
    if(inputs.size() > 2) {
        mTempOutput.reset(Tensor::createDevice<float>({e, h}));
        auto res = backend()->onAcquireBuffer(mTempOutput.get(), Backend::DYNAMIC);
        if (!res) {
            return OUT_OF_MEMORY;
        }
        backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC);
    }
    return NO_ERROR;
}

@@ -58,33 +51,40 @@ ErrorCode MatMulExecution::onExecute(const std::vector<Tensor *> &inputs, const
    }
    auto APtr = (const float*)A->deviceId();
    auto BPtr = (const float*)B->deviceId();
    auto CPtr = (float*)mTempOutput->deviceId();
    auto CDestPtr = (float*)C->deviceId();

    float alpha = 1.0f;
    float beta = 0.0f;
    auto tranA = CUBLAS_OP_T;
    auto ldA = l;
    if (mTransposeA) {
        ldA = e;
        tranA = CUBLAS_OP_N;
    }
    auto tranB = CUBLAS_OP_T;

    auto tranB = CUBLAS_OP_N;
    auto ldB = h;
    if (mTransposeB) {
        ldB = l;
        tranB = CUBLAS_OP_N;
        tranB = CUBLAS_OP_T;
    }
    auto tranA = CUBLAS_OP_N;
    auto ldA = l;
    if (mTransposeA) {
        ldA = e;
        tranA = CUBLAS_OP_T;
    }
    auto status = cublasSgemm(blasHandle, tranA, tranB, e, h, l, &alpha, APtr, ldA, BPtr, ldB, &beta, CPtr, e);
    //cudaThreadSynchronize();
    // Transpose h, e -> e, h
    int block_num = runtime->blocks_num(e*h);
    int threads_num = runtime->threads_num();
    auto CDestPtr = (float*)C->deviceId();
    if (inputs.size() > 2) {
        transpose_bias<<<block_num, threads_num>>>((float*)CPtr, (float*)CDestPtr, (const float*)inputs[2]->deviceId(), e, h);

    if(inputs.size() == 2) {
        auto status = cublasSgemm(blasHandle, tranB, tranA, h, e, l, &alpha, BPtr, ldB, APtr, ldA, &beta, CDestPtr, h);
        cublas_check(status);
        //cudaThreadSynchronize();
    } else {
        transpose<<<block_num, threads_num>>>((float*)CPtr, (float*)CDestPtr, e, h);
        auto CPtr = (float*)mTempOutput->deviceId();
        auto status = cublasSgemm(blasHandle, tranB, tranA, h, e, l, &alpha, BPtr, ldB, APtr, ldA, &beta, CPtr, h);
        cublas_check(status);
        //cudaThreadSynchronize();
        // Transpose h, e -> e, h

        transpose_bias<<<block_num, threads_num>>>((float*)CPtr, (float*)CDestPtr, (const float*)inputs[2]->deviceId(), e, h);
    }

    return NO_ERROR;
}

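The rewritten onExecute drops the explicit transpose kernel for the bias-free case by using the standard row-major/column-major identity: cuBLAS sees a row-major E x H buffer as its column-major transpose, so C = A*B in row-major is obtained by asking cuBLAS for C^T = B^T * A^T, passing B first with m = h, n = e, k = l and leading dimension h for C, exactly as the patched call does. A hedged, self-contained sketch of that call shape (function and parameter names are illustrative):

    #include <cublas_v2.h>

    // Row-major C(e x h) = A(e x l) * B(l x h), neither operand pre-transposed.
    // cuBLAS is column-major, so we request C^T = B^T * A^T and store the result
    // with leading dimension h; the buffer then reads back as row-major C.
    cublasStatus_t rowMajorGemm(cublasHandle_t handle,
                                const float* A, const float* B, float* C,
                                int e, int l, int h) {
        const float alpha = 1.0f, beta = 0.0f;
        return cublasSgemm(handle,
                           CUBLAS_OP_N, CUBLAS_OP_N, // both buffers already row-major
                           h, e, l,                  // m, n, k of the transposed product
                           &alpha,
                           B, h,                     // B first, ldb = h
                           A, l,                     // then A, lda = l
                           &beta,
                           C, h);                    // ldc = h
    }

With a transposed A or B input, the op flags and leading dimensions flip as in the patch (mTransposeB selects CUBLAS_OP_T with ldB = l, and likewise for A).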
@@ -0,0 +1,78 @@
#include "SoftmaxExecution.hpp"

namespace MNN {
namespace CUDA {

SoftmaxExecution::SoftmaxExecution(int axis, Backend *backend) : Execution(backend) {
    auto runtime = static_cast<CUDABackend*>(backend)->getCUDARuntime();
    cudnn_handle_ = runtime->cudnn_handle();

    cudnn_check(cudnnCreateTensorDescriptor(&input_desc_));
    cudnn_check(cudnnCreateTensorDescriptor(&output_desc_));

    cudnn_data_type_ = CUDNN_DATA_FLOAT;
    mAxis = axis;
}

SoftmaxExecution::~SoftmaxExecution() {
    cudnnDestroyTensorDescriptor(input_desc_);
    cudnnDestroyTensorDescriptor(output_desc_);
}

ErrorCode SoftmaxExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    inside = 1;
    outside = 1;
    if(mAxis < 0) {
        mAxis += inputs[0]->dimensions();
    }
    axis = inputs[0]->length(mAxis);
    for (int i=0; i<mAxis; ++i) {
        outside *= inputs[0]->length(i);
    }
    for (int i=mAxis+1; i<inputs[0]->dimensions(); ++i) {
        inside *= inputs[0]->length(i);
    }

    std::vector<int> tensor_shape = {outside, axis, inside, 1};
    cudnn_check(cudnnSetTensor4dDescriptor(input_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, tensor_shape[0],
        tensor_shape[1], tensor_shape[2], tensor_shape[3]));

    cudnn_check(cudnnSetTensor4dDescriptor(output_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, tensor_shape[0],
        tensor_shape[1], tensor_shape[2], tensor_shape[3]));

    return NO_ERROR;
}

ErrorCode SoftmaxExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    auto input = (void*)inputs[0]->deviceId();
    auto output = (void*)outputs[0]->deviceId();

    const float alpha = 1;
    const float beta = 0;
    cudnn_check(cudnnSoftmaxForward(cudnn_handle_, CUDNN_SOFTMAX_ACCURATE,
                                    CUDNN_SOFTMAX_MODE_CHANNEL,
                                    &alpha,
                                    input_desc_, input,
                                    &beta,
                                    output_desc_, output));

    return NO_ERROR;
}

class SoftmaxCreator : public CUDABackend::Creator {
public:
    virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                const MNN::Op* op, Backend* backend) const override {
        auto type = inputs[0]->getType();
        if (type.code != halide_type_float) {
            MNN_PRINT("softmax data type:%s not support", type.code);
            return nullptr;
        }
        auto axis = op->main_as_Axis()->axis();
        return new SoftmaxExecution(axis, backend);
    }
};

static CUDACreatorRegister<SoftmaxCreator> __init(OpType_Softmax);
}
}

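onResize folds the input into a 4-D NCHW descriptor of shape (outside, axis, inside, 1), so CUDNN_SOFTMAX_MODE_CHANNEL normalizes exactly over the chosen axis while the surrounding dimensions stay independent. A tiny host-side reference of that reduction layout (illustrative only):

    #include <algorithm>
    #include <cmath>

    // Host reference: softmax over the middle ("channel") dimension of an
    // (outside, axisLen, inside) view, matching the descriptor built in onResize.
    void softmaxAxisRef(const float* in, float* out, int outside, int axisLen, int inside) {
        for (int o = 0; o < outside; ++o) {
            for (int j = 0; j < inside; ++j) {
                const float* src = in + o * axisLen * inside + j;
                float* dst = out + o * axisLen * inside + j;
                float maxV = src[0];
                for (int a = 1; a < axisLen; ++a) maxV = std::max(maxV, src[a * inside]);
                float sum = 0.f;
                for (int a = 0; a < axisLen; ++a) sum += std::exp(src[a * inside] - maxV);
                for (int a = 0; a < axisLen; ++a) dst[a * inside] = std::exp(src[a * inside] - maxV) / sum;
            }
        }
    }
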
@@ -0,0 +1,42 @@
//
//  SoftmaxExecution.hpp
//  MNN
//
//  Created by MNN on 2019/01/31.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#ifndef SoftmaxExecution_hpp
#define SoftmaxExecution_hpp

#include "core/Execution.hpp"

#include <vector>
#include "backend/cuda/core/CUDABackend.hpp"

namespace MNN {
namespace CUDA {

class SoftmaxExecution : public Execution {
public:
    SoftmaxExecution(int axis, Backend *backend);
    virtual ~SoftmaxExecution();

    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;

private:
    cudnnHandle_t cudnn_handle_;
    cudnnTensorDescriptor_t input_desc_;
    cudnnTensorDescriptor_t output_desc_;
    cudnnDataType_t cudnn_data_type_;

    int mAxis;
    int axis;
    int inside;
    int outside;
};

} // namespace CUDA
} // namespace MNN
#endif /* SoftmaxExecution_hpp */

@@ -68,17 +68,13 @@ static id<MTLBuffer> weightInBlock(MNNMetalContext *context, int group, int oc,

    for (int g = 0; g < group; g++) {
        auto g_dst = dst + g * goc_4 * gic_4 * kh * kw * 16; // g
#pragma clang loop vectorize(enable)
        for (int o = 0; o < goc; o++) {
            auto zo = o / 4, ro = o % 4;
            auto o_dst = g_dst + zo * gic_4 * kh * kw * 16 + ro * 4; // o/4 x 4
#pragma clang loop vectorize(enable)
            for (int i = 0; i < gic; i++) {
                auto zi = i / 4, ri = i % 4;
                auto i_dst = o_dst + zi * kh * kw * 16 + ri; // i/4 x 4
#pragma clang loop vectorize(enable)
                for (int h = 0; h < kh; h++) {
#pragma clang loop vectorize(enable) unroll(enable)
                    for (int w = 0; w < kw; w++) {
                        // to   [g][o/4][i/4][h][w][16]
                        // from [g][o][i][h][w]

@@ -92,9 +88,6 @@ static id<MTLBuffer> weightInBlock(MNNMetalContext *context, int group, int oc,
}

void MetalConvolutionCommon::loadWeight(const MNN::Convolution2D *conv) {
    auto backend = static_cast<MetalBackend *>(this->backend());
    auto context = (__bridge MNNMetalContext *)static_cast<MetalBackend *>(backend)->context();

    std::shared_ptr<ConvolutionCommon::Int8Common> qnt = NULL;
    if (conv->quanParameter()) {
        qnt = ConvolutionCommon::load(conv->quanParameter(), true);

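weightInBlock repacks convolution weights from the flat [g][o][i][h][w] layout into the Metal-friendly blocked layout [g][o/4][i/4][h][w][16], where the trailing 16 holds a 4x4 (output-lane, input-lane) tile. The pointer arithmetic visible in the loop corresponds to the index mapping sketched below (host-side, illustrative only; goc_4 and gic_4 are assumed to be the per-group channel counts rounded up to multiples of 4, as in the surrounding code):

    #include <cstddef>

    // Destination offset for weight element (g, o, i, h, w) in the blocked
    // [g][o/4][i/4][kh][kw][16] layout used by the Metal convolution kernels.
    inline size_t blockedWeightOffset(int g, int o, int i, int h, int w,
                                      int goc_4, int gic_4, int kh, int kw) {
        int zo = o / 4, ro = o % 4;   // output-channel block and lane
        int zi = i / 4, ri = i % 4;   // input-channel block and lane
        return (size_t)g  * goc_4 * gic_4 * kh * kw * 16
             + (size_t)zo * gic_4 * kh * kw * 16
             + (size_t)zi * kh * kw * 16
             + ((size_t)h * kw + w) * 16
             + ro * 4 + ri;
    }
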
@@ -88,9 +88,7 @@ static id<MTLBuffer> weightInBlock(MNNMetalContext *context, int group, int kh,
    for (int g = 0; g < group; g++) {
        auto z = g / 4, r = g % 4;
        auto z_dst = dst + z * kh * kw * 4 + r;
#pragma clang loop vectorize(enable)
        for (int h = 0; h < kh; h++) {
#pragma clang loop vectorize(enable) unroll(enable)
            for (int w = 0; w < kw; w++) {
                // to   [g/4][h][w][4]
                // from [g][h][w]

@@ -20,8 +20,6 @@ MetalMatMul::MetalMatMul(Backend *backend, const MatMul *matmul) : Execution(bac
    mTransposeB = matmul->transposeB();
}
ErrorCode MetalMatMul::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    auto backend = static_cast<MetalBackend *>(this->backend());
    auto context = (__bridge MNNMetalContext *)static_cast<MetalBackend *>(backend)->context();
    struct matP {
        int size[4];
        int stride[4];

@@ -6,7 +6,6 @@
//  Copyright © 2018, Alibaba Group Holding Limited
//

#if MNN_METAL_ENABLED
#import "backend/metal/MetalRaster.hpp"
#import "backend/metal/MNNMetalContext.h"
#import "core/Macro.h"

@@ -14,6 +13,7 @@
#include "core/TensorUtils.hpp"
#include "core/OpCommonUtils.hpp"

#if MNN_METAL_ENABLED
namespace MNN {

struct SamplerInfo {

@@ -186,7 +186,7 @@ ErrorCode MetalRaster::onResize(const std::vector<Tensor *> &inputs, const std::
        mTempInputCopy.emplace_back(std::make_tuple((__bridge id<MTLBuffer>)(void*)slice.origin->deviceId(), buffer, local.first, local.second));
    }
    mShapeTemp.clear();
    for (auto& iter : mTempInput) {
    for (int i = 0; i < mTempInput.size(); ++i) {
        id<MTLBuffer> shape = [context newDeviceBuffer:4*sizeof(int) access:CPUWriteOnly];
        mShapeTemp.emplace_back(std::move(shape));
    }

@@ -71,7 +71,6 @@ ErrorCode MetalReduction::onResize(const std::vector<Tensor *> &inputs, const st

ErrorCode MetalReduction::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    auto backend = static_cast<MetalBackend *>(this->backend());
    auto context = (__bridge MNNMetalContext *)backend->context();
    auto &input = inputs[0], &output = outputs[0];
    auto encoder = backend->encoder();
    [encoder setComputePipelineState:mPipeline];

@@ -79,7 +78,6 @@ ErrorCode MetalReduction::onExecute(const std::vector<Tensor *> &inputs, const s
    [encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
    [encoder setBuffer:mConst offset:0 atIndex:2];
    [encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
    MNN_PRINT_ENCODER(context, encoder);
    return NO_ERROR;
}

Some files were not shown because too many files have changed in this diff.