Synchronize internal master to Github

Hui Shu 2020-12-15 14:12:35 +08:00
parent 6b0c16f24f
commit ab711d484c
168 changed files with 4474 additions and 1848 deletions

View File

@ -512,6 +512,8 @@
#if defined(_MSC_VER)
# define CL_HPP_DEFINE_STATIC_MEMBER_ __declspec(selectany)
#elif defined(__MINGW32__)
# define CL_HPP_DEFINE_STATIC_MEMBER_ __attribute__((selectany))
#else
# define CL_HPP_DEFINE_STATIC_MEMBER_ __attribute__((weak))
#endif // !_MSC_VER

View File

@ -368,6 +368,9 @@ list(APPEND MNN_EXPR_PUB_HDRS "${CMAKE_CURRENT_SOURCE_DIR}/include/MNN/expr/Math
list(APPEND MNN_EXPR_PUB_HDRS "${CMAKE_CURRENT_SOURCE_DIR}/include/MNN/expr/NeuralNetWorkOp.hpp")
list(APPEND MNN_EXPR_PUB_HDRS "${CMAKE_CURRENT_SOURCE_DIR}/include/MNN/expr/Optimizer.hpp")
list(APPEND MNN_EXPR_PUB_HDRS "${CMAKE_CURRENT_SOURCE_DIR}/include/MNN/expr/Executor.hpp")
list(APPEND MNN_EXPR_PUB_HDRS "${CMAKE_CURRENT_SOURCE_DIR}/include/MNN/expr/NN.hpp")
list(APPEND MNN_EXPR_PUB_HDRS "${CMAKE_CURRENT_SOURCE_DIR}/include/MNN/expr/Module.hpp")
list(APPEND MNN_EXPR_PUB_HDRS "${CMAKE_CURRENT_SOURCE_DIR}/include/MNN/expr/NeuralNetWorkOp.hpp")
set(MNN_DEPS "")
set(MNN_EXTRA_DEPENDS "")
@ -552,7 +555,7 @@ if(CMAKE_SYSTEM_NAME MATCHES "^Linux")
# https://stackoverflow.com/questions/23250863/difference-between-pthread-and-lpthread-while-compiling
target_link_libraries(MNN PUBLIC -pthread dl)
elseif(CMAKE_SYSTEM_NAME MATCHES "^Android")
target_link_libraries(MNN PUBLIC log android m)
target_link_libraries(MNN PUBLIC log m)
else()
endif()
if (NOT MNN_BUILD_SHARED_LIBS)

View File

@ -1,6 +1,6 @@
Pod::Spec.new do |s|
s.name = "MNN"
s.version = "1.1.0"
s.version = "1.1.1"
s.summary = "MNN"
s.description = <<-DESC

View File

@ -42,6 +42,7 @@ using namespace MNN;
input_2 --> region_2 --/
3. This example reads a JSON file, constructs some Rasters, and computes.
Example input file at $<MNN-ROOT>/resource/exec/rasterDemo_transpose.json
The input JSON file format is as follows:
{
"inputs" : [

Binary file not shown (image changed: 104 KiB → 341 KiB)

View File

@ -126,7 +126,7 @@ void Expr::_addLinkForInputs(EXPRP expr) {
}
}
}
EXPRP Expr::create(Variable::Info&& info, const void* ptr, VARP::InputType type, bool copy) {
EXPRP Expr::create(Variable::Info&& info, const void* ptr, VARP::InputType type, Expr::MemoryType memtype) {
EXPRP expr(new Expr(1));
expr->mOp = nullptr;
auto originPtr = ptr;
@ -144,7 +144,7 @@ EXPRP Expr::create(Variable::Info&& info, const void* ptr, VARP::InputType type,
// VARP::TRAINABLE
TensorUtils::getDescribe(expr->mInside->mOutputTensors[0])->usage = Tensor::InsideDescribe::TRAINABLE;
}
if (dstInfo.size > 0 && copy) {
if (dstInfo.size > 0 && memtype == COPY) {
auto res = Utils::allocMemoryForHostTensor(expr->mInside->mOutputTensors[0]);
if (!res) {
MNN_ASSERT(false);
@ -160,11 +160,13 @@ EXPRP Expr::create(Variable::Info&& info, const void* ptr, VARP::InputType type,
return expr;
}
expr->mInside->mContentDirty = false;
if (copy) {
if (memtype == COPY) {
::memcpy(expr->mInside->mOutputTensors[0]->buffer().host, originPtr, dstInfo.size * dstInfo.type.bytes());
} else {
TensorUtils::getDescribe(expr->mInside->mOutputTensors[0])->memoryType = Tensor::InsideDescribe::MEMORY_OUTSIDE;
expr->mInside->mOutputTensors[0]->buffer().host = (uint8_t*)originPtr;
if (memtype == REF) {
TensorUtils::getDescribe(expr->mInside->mOutputTensors[0])->memoryType = Tensor::InsideDescribe::MEMORY_OUTSIDE;
}
}
return expr;
}
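Note: the hunk above replaces Expr::create's boolean copy flag with an explicit Expr::MemoryType (COPY, MOVE, REF). Below is a minimal sketch of wrapping an externally owned host buffer with the new parameter, mirroring the REF usage that StaticModule::onForward adopts later in this commit; the shape, layout, and helper name are illustrative assumptions, and with REF the buffer must outlive the variable.

#include <MNN/expr/Expr.hpp>

using namespace MNN::Express;

// Wrap an externally owned float buffer as a constant VARP without copying it.
// MemoryType::COPY would duplicate the data into storage owned by the expression;
// MemoryType::MOVE is understood here as handing the pointer's ownership to MNN.
static VARP wrapHostBuffer(float* data /* externally owned, 4 floats assumed */) {
    Variable::Info info;
    info.dim   = {1, 4};                  // tensor shape (assumed for this example)
    info.order = NHWC;                    // data layout
    info.type  = halide_type_of<float>(); // element type
    return Variable::create(
        Expr::create(std::move(info), data, VARP::CONSTANT, Expr::MemoryType::REF), 0);
}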
@ -813,7 +815,6 @@ void Variable::save(const std::vector<VARP>& vars, NetT* dest) {
} else if (info.type.code == halide_type_int && info.type.bits == 8) {
blob->dataType = DataType_DT_INT8;
blob->int8s.resize(info.size);
auto pptr = (int8_t *)ptr;
::memcpy(blob->int8s.data(), ptr, info.size * sizeof(int8_t));
} else if (info.type.code == halide_type_uint && info.type.bits == 8) {
blob->dataType = DataType_DT_UINT8;

View File

@ -115,7 +115,7 @@ VARP _InnerProduct(std::vector<float>&& weight, std::vector<float>&& bias, VARP
if(!bias.empty()) {
ipParam->biasTerm = 1;
}
ipParam->weightSize = weight.size();
ipParam->weightSize = (int)weight.size();
ipParam->weight = std::move(weight);
ipParam->bias = std::move(bias);

View File

@ -118,7 +118,7 @@ void Module::clearCache() {
this->onClearCache();
}
Module* Module::load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const char* fileName, bool dynamic) {
Module* Module::load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const char* fileName, const Module::Config* config) {
AutoStorage<uint8_t> buffer;
{
FileLoader loader(fileName);
@ -135,11 +135,15 @@ Module* Module::load(const std::vector<std::string>& inputs, const std::vector<s
return {};
}
}
return load(inputs, outputs, buffer.get(), buffer.size(), dynamic);
return load(inputs, outputs, buffer.get(), buffer.size(), config);
}
Module* Module::load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, bool dynamic) {
return PipelineModule::load(inputs, outputs, buffer, length, dynamic);
Module* Module::load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, const Module::Config* config) {
return PipelineModule::load(inputs, outputs, buffer, length, config);
}
Module* Module::extract(std::vector<Express::VARP> inputs, std::vector<Express::VARP> outputs, bool fortrain, const std::map<std::string, SubGraph>& subGraph) {
return PipelineModule::extract(inputs, outputs, fortrain, subGraph);
}
EXPRP Module::CloneContext::getOrClone(EXPRP expr) {

View File

@ -396,7 +396,7 @@ void PipelineModule::onClearCache() {
// Do nothing
}
static std::map<std::string, SubGraph> _createSubGraph(const MNN::Net* net, bool dynamic) {
static std::map<std::string, SubGraph> _createSubGraph(const MNN::Net* net, const Module::Config* config) {
std::map<std::string, SubGraph> subGraphMap;
auto subGraphs = net->subgraphs();
if (nullptr == subGraphs) {
@ -426,10 +426,10 @@ static std::map<std::string, SubGraph> _createSubGraph(const MNN::Net* net, bool
flatbuffers::FlatBufferBuilder builder(1024);
auto offset = Net::Pack(builder, _tempNet.get());
builder.Finish(offset);
if (dynamic) {
submodule.reset(PipelineModule::load(subInputs, subOutputs, (const uint8_t*)builder.GetBufferPointer(), builder.GetSize(), dynamic));
if (config->dynamic) {
submodule.reset(PipelineModule::load(subInputs, subOutputs, (const uint8_t*)builder.GetBufferPointer(), builder.GetSize(), config));
} else {
submodule.reset(new StaticModule((const uint8_t*)builder.GetBufferPointer(), builder.GetSize(), subInputs, subOutputs));
submodule.reset(new StaticModule((const uint8_t*)builder.GetBufferPointer(), builder.GetSize(), subInputs, subOutputs, !config->shapeMutable));
}
if (graph->name() != nullptr) {
submodule->setName(graph->name()->str());
@ -569,6 +569,11 @@ static std::vector<SubModuleInfo> _createSubModuleInfo(const MNN::Net* net, cons
break;
}
}
if (!find) {
if (net->tensorName() != nullptr) {
MNN_PRINT("%d tensor [ %s ] is input but not found\n", index, net->tensorName()->GetAsString(index)->c_str());
}
}
MNN_ASSERT(find);
}
}
@ -578,7 +583,7 @@ static std::vector<SubModuleInfo> _createSubModuleInfo(const MNN::Net* net, cons
return submodule;
}
static Module* _createSubModule(const MNN::Net* net, const SubModuleInfo& info, const std::map<std::string, SubGraph>& subs) {
static Module* _createSubModule(const MNN::Net* net, const SubModuleInfo& info, const std::map<std::string, SubGraph>& subs, bool shapeFix) {
if (1 == info.opList.size()) {
auto op = net->oplists()->GetAs<Op>(info.opList[0]);
if (OpType_If == op->type()) {
@ -622,25 +627,29 @@ static Module* _createSubModule(const MNN::Net* net, const SubModuleInfo& info,
auto offset = Net::Pack(builder, _tempNet.get());
builder.Finish(offset);
_tempNet.reset();
return new StaticModule((const uint8_t*)builder.GetBufferPointer(), builder.GetSize(), inputNames, outputNames);
return new StaticModule((const uint8_t*)builder.GetBufferPointer(), builder.GetSize(), inputNames, outputNames, shapeFix);
}
Module* PipelineModule::load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, bool dynamic) {
Module* PipelineModule::load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, const Module::Config* config) {
// Create Subgraph
auto net = GetNet(buffer);
Module::Config defaultConfig;
if (nullptr == config) {
config = &defaultConfig;
}
auto subGraphs = net->subgraphs();
if (nullptr == net->oplists() || nullptr == net->tensorName()) {
MNN_ERROR("Invalid net, for null oplist or tensorName\n");
return nullptr;
}
if (!dynamic) {
if (!config->dynamic) {
if (nullptr == subGraphs) {
// Has no control flow, can just use static module
return new StaticModule(buffer, length, inputs, outputs);
return new StaticModule(buffer, length, inputs, outputs, !config->shapeMutable);
}
}
auto subGraphMap = _createSubGraph(net, dynamic);
if (dynamic) {
auto subGraphMap = _createSubGraph(net, config);
if (config->dynamic) {
// For dynamic mode
auto varMaps = Variable::loadMap(buffer, length);
std::vector<VARP> inputVars(inputs.size());
@ -686,7 +695,7 @@ Module* PipelineModule::load(const std::vector<std::string>& inputs, const std::
auto subModulesInfo = _createSubModuleInfo(net, inputIndexes, outputIndexes);
std::vector<std::shared_ptr<Module>> subModules(subModulesInfo.size());
for (int i=0; i<subModulesInfo.size(); ++i) {
subModules[i].reset(_createSubModule(net, subModulesInfo[i], subGraphMap));
subModules[i].reset(_createSubModule(net, subModulesInfo[i], subGraphMap, !config->shapeMutable));
}
auto result = new PipelineModule;
/**

View File

@ -17,11 +17,8 @@ namespace Express {
class MNN_PUBLIC PipelineModule : public Module {
public:
typedef std::function<std::pair<std::vector<int>, std::shared_ptr<Module>>(Express::EXPRP)> Transformer;
static Module* load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, bool dynamic = false);
static Module* load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, const Module::Config* config = nullptr);
static Module* extract(std::vector<Express::VARP> inputs, std::vector<Express::VARP> outputs, bool fortrain, const std::map<std::string, SubGraph>& subGraph = {});
static Module* extractOrigin(std::vector<Express::VARP> inputs, std::vector<Express::VARP> outputs, bool fortrain) {
return extract(inputs, outputs, fortrain);
}
static bool turnQuantize(Module* module, const int bits = 8, NN::FeatureScaleStatMethod featureScaleStatMethod = NN::PerTensor, NN::ScaleUpdateMethod scaleUpdateMethod = NN::MovingAverage);
void toTrainQuant(const int bits = 8, NN::FeatureScaleStatMethod featureScaleStatMethod = NN::PerTensor,
NN::ScaleUpdateMethod scaleUpdateMethod = NN::MovingAverage);

View File

@ -14,6 +14,9 @@
#include <MNN/expr/Executor.hpp>
#include <MNN/AutoTime.hpp>
#include <MNN/expr/ExecutorScope.hpp>
#include "core/MNNMemoryUtils.h"
#include "Utils.hpp"
namespace MNN {
namespace Express {
StaticModule::StaticModule(const void* buffer, size_t length, const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, bool shapeFix) : mInputs(inputs), mOutputs(outputs) {
@ -53,6 +56,7 @@ StaticModule::StaticModule(const void* buffer, size_t length, const std::vector<
} else {
mNet->setSessionMode(Interpreter::Session_Input_User);
}
auto rt = Express::ExecutorScope::Current()->getRuntime();
// TODO: Add Config
ScheduleConfig config;
@ -71,7 +75,7 @@ StaticModule::StaticModule(const void* buffer, size_t length, const std::vector<
}
StaticModule:: ~ StaticModule() {
// Do nothing
}
}
std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VARP>& inputs) {
AUTOTIME;
std::vector<Express::VARP> outputs(mOutputNumbers);
@ -107,9 +111,16 @@ std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VA
mNet->resizeSession(mSession);
if (mShapeFix) {
for (int i=0; i<inputs.size(); ++i) {
auto srcPtr = inputs[i]->readMap<void>();
// For inputs used only for shape computation, don't allocate memory
if (nullptr != mInputTensors[i]->host<void>()) {
::memcpy(mInputTensors[i]->host<void>(), inputs[i]->readMap<void>(), mInputTensors[i]->size());
if (nullptr != mInputTensors[i]->host<void>() && nullptr != srcPtr) {
::memcpy(mInputTensors[i]->host<void>(), srcPtr, mInputTensors[i]->size());
} else if (mInputTensors[i]->deviceId() != 0) {
// Other backend
// TODO: Non-copy method
auto exprInfo = inputs[i]->expr();
auto inside = exprInfo.first->inside();
mInputTensors[i]->copyFromHostTensor(inside->mOutputTensors[exprInfo.second]);
}
}
}
@ -132,8 +143,9 @@ std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VA
#endif
for (int i=0; i<mOutputTensors.size(); ++i) {
Express::Variable::Info info;
info.dim = mOutputTensors[i]->shape();
info.type = mOutputTensors[i]->getType();
auto currentTensor = mOutputTensors[i];
info.dim = currentTensor->shape();
info.type = currentTensor->getType();
auto format = TensorUtils::getDescribe(mOutputTensors[i])->dimensionFormat;
info.order = Express::NHWC;
if (format == MNN_DATA_FORMAT_NCHW) {
@ -141,8 +153,14 @@ std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VA
} else if (format == MNN_DATA_FORMAT_NC4HW4) {
info.order = Express::NC4HW4;
}
outputs[mOutputFromTensor[i]] = Express::Variable::create(Express::Expr::create(std::move(info), mOutputTensors[i]->host<void>(), Express::VARP::CONSTANT, true), 0);
//::memcpy(outputs[i]->writeMap<void>(), mOutputTensors[i]->host<void>(), mOutputTensors[i]->size());
if (currentTensor->buffer().device != 0) {
std::shared_ptr<Tensor> tmpTensor(new Tensor(currentTensor, Tensor::CAFFE, false));
tmpTensor->buffer().host = (uint8_t*)MNNMemoryAllocAlign(currentTensor->size(), MNN_MEMORY_ALIGN_DEFAULT);
currentTensor->copyToHostTensor(tmpTensor.get());
outputs[mOutputFromTensor[i]] = Express::Variable::create(Express::Expr::create(std::move(info), tmpTensor->host<void>(), Express::VARP::CONSTANT, Expr::MemoryType::MOVE), 0);
} else {
outputs[mOutputFromTensor[i]] = Express::Variable::create(Express::Expr::create(std::move(info), mOutputTensors[i]->host<void>(), Express::VARP::CONSTANT, Expr::MemoryType::REF), 0);
}
}
return outputs;
}

View File

@ -127,7 +127,6 @@ public:
/**
* @brief The API should be called before creating a session.
* @param mode session mode
* @return void
*/
void setSessionMode(SessionMode mode);
@ -137,14 +136,13 @@ public:
* After createSession, try to save cache to file.
* @param cacheFile cache file name
* @param keySize the first `keySize` bytes used as the key to check if the `cacheFile` exists.
* @return void
*/
void setCacheFile(const char* cacheFile, size_t keySize = 128);
public:
/**
* @brief create RuntimeInfo separately with schedule configs.
* @param config session schedule configs.
* @param configs session schedule configs.
*/
static RuntimeInfo createRuntime(const std::vector<ScheduleConfig>& configs);
@ -275,7 +273,7 @@ public:
* @brief get session info
* @param session given session.
* @param code given info code.
* @param void* given info ptr, see SessionInfoCode for detail
* @param ptr given info ptr, see SessionInfoCode for detail
* @return true if support the code, false otherwise.
*/
bool getSessionInfo(const Session* session, SessionInfoCode code, void* ptr);
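Since the hunks above only touch doc comments, here is a hedged usage sketch tying those Interpreter APIs together (setSessionMode, createRuntime, getSessionInfo). The model path is a placeholder, and the two-argument createSession overload plus the SessionInfoCode MEMORY value are assumptions about the surrounding Interpreter API rather than something shown in this diff.

#include <MNN/Interpreter.hpp>
#include <MNN/MNNDefine.h>

#include <memory>

int main() {
    // "model.mnn" is a placeholder path.
    std::shared_ptr<MNN::Interpreter> net(MNN::Interpreter::createFromFile("model.mnn"));
    if (nullptr == net) {
        return 1;
    }
    net->setSessionMode(MNN::Interpreter::Session_Input_User); // must be set before createSession
    MNN::ScheduleConfig config;
    config.numThread = 4;
    auto runtime = MNN::Interpreter::createRuntime({config});  // runtime reusable across sessions
    auto session = net->createSession(config, runtime);        // assumed overload taking RuntimeInfo
    float memoryMB = 0.0f;
    // getSessionInfo returns false when the requested info code is unsupported.
    if (net->getSessionInfo(session, MNN::Interpreter::MEMORY, &memoryMB)) {
        MNN_PRINT("session memory: %.2f MB\n", memoryMB);
    }
    return 0;
}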

View File

@ -14,7 +14,7 @@
namespace MNN {
namespace Express {
struct ExecutorScope final {
struct MNN_PUBLIC ExecutorScope final {
public:
ExecutorScope() = delete;
explicit ExecutorScope(const ExecutorScope&) = delete;

View File

@ -173,7 +173,12 @@ private:
class MNN_PUBLIC Expr {
public:
struct Inside;
static EXPRP create(Variable::Info&& info, const void* ptr, VARP::InputType type, bool copy = true);
enum MemoryType {
COPY,
MOVE,
REF
};
static EXPRP create(Variable::Info&& info, const void* ptr, VARP::InputType type, MemoryType copy = COPY);
static EXPRP create(const OpT* op, std::vector<VARP> inputs, int outputSize = 1);
static EXPRP create(std::pair<std::shared_ptr<char>, int> extra, std::vector<VARP>&& inputs, int outputSize = 1);
static EXPRP create(std::unique_ptr<OpT>&& op, std::vector<VARP> inputs, int outputSize = 1) {
@ -226,14 +231,6 @@ public:
return mValid;
}
void setEntry(const std::vector<VARP>& entries) {
mEntries = entries;
}
const std::vector<VARP>& getEntry() const {
return mEntries;
}
private:
static void _addLinkForInputs(EXPRP expr);
@ -254,9 +251,6 @@ private:
bool mVisited = false;
std::vector<WeakEXPRP> mTo;
// Only the enter input has entries, and it helps to get info for enter
// input expression.
std::vector<VARP> mEntries;
};
} // namespace Express
} // namespace MNN

View File

@ -16,6 +16,7 @@
namespace MNN {
namespace Express {
struct SubGraph;
class MNN_PUBLIC Module {
public:
Module() = default;
@ -45,8 +46,17 @@ public:
void setParameter(Express::VARP parameter, int index);
static Module* createEmpty(const std::vector<Express::VARP>& parameters);
static Module* load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, bool dynamic = false);
static Module* load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const char* fileName, bool dynamic = false);
struct Config {
// Load module in dynamic mode; default is static
bool dynamic = false;
// For static mode: set true if the input shapes are mutable; otherwise set false to avoid calling resizeSession frequently
bool shapeMutable = true;
};
static Module* load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, const Config* config = nullptr);
static Module* load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const char* fileName, const Config* config = nullptr);
static Module* extract(std::vector<Express::VARP> inputs, std::vector<Express::VARP> outputs, bool fortrain, const std::map<std::string, SubGraph>& subGraph = {});
static Module* clone(const Module* module, const bool shareParams = false);
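A minimal sketch of the new Module::Config in use with the file-based load overload declared above; the model path and tensor names are placeholder assumptions.

#include <MNN/expr/Module.hpp>

#include <memory>

int main() {
    MNN::Express::Module::Config config;
    config.dynamic      = false; // keep the static-module path
    config.shapeMutable = false; // input shapes never change, so skip repeated resizing
    // "model.mnn", "input" and "output" are illustrative placeholders.
    std::unique_ptr<MNN::Express::Module> net(
        MNN::Express::Module::load({"input"}, {"output"}, "model.mnn", &config));
    if (nullptr == net) {
        return 1;
    }
    return 0;
}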

View File

@ -14,7 +14,7 @@
#include <MNN/Interpreter.hpp> // Backend
#include <MNN/Tensor.hpp>
#include <Tensor_generated.h>
#include "Tensor_generated.h"
namespace MNN {
namespace plugin {

View File

@ -1,6 +1,3 @@
# ./package_scripts/linux/build_lib.sh -o MNN-CPU/lib
# ./package_scripts/linux/build_lib.sh -o MNN-CPU-OPENCL/lib -b
# MNN
# |--- Debug
# | |--- libMNN.a

View File

@ -1,6 +1,3 @@
# ./package_scripts/linux/build_tools.sh -o MNN-CPU/tools
# ./package_scripts/linux/build_tools.sh -o MNN-CPU-OPENCL/tools -b
set -e
usage() {

View File

@ -1,11 +1,9 @@
# ./package_scripts/linux/build_whl.sh -o MNN-CPU/py_whl
# ./package_scripts/linux/build_whl.sh -o MNN-CPU-OPENCL/py_whl -b
set -e
usage() {
echo "Usage: $0 -o path [-b]"
echo -e "\t-o package files output directory"
echo -e "\t-v MNN dist version"
echo -e "\t-b opencl backend"
exit 1
}
@ -13,6 +11,7 @@ usage() {
while getopts "o:v:hb" opt; do
case "$opt" in
o ) path=$OPTARG ;;
v ) mnn_version=$OPTARG ;;
b ) opencl=true ;;
h|? ) usage ;;
esac
@ -38,7 +37,7 @@ rm -rf wheelhouse && mkdir wheelhouse
#Compile wheels
for PYBIN in /opt/python/*/bin; do
"${PYBIN}/pip" install -U numpy
"${PYBIN}/python" setup.py bdist_wheel
"${PYBIN}/python" setup.py bdist_wheel --version $mnn_version
done
# Bundle external shared libraries into the wheels

View File

@ -1,6 +1,3 @@
# ./package_scripts/mac/build_lib.sh -o MNN-CPU/lib
# ./package_scripts/mac/build_lib.sh -o MNN-CPU-OPENCL/lib -b
# MNN
# |--- Debug
# | |--- Dynamic

View File

@ -1,6 +1,3 @@
# ./package_scripts/mac/build_tools.sh -o MNN-CPU/tools
# ./package_scripts/mac/build_tools.sh -o MNN-CPU-OPENCL/tools -b
set -e
usage() {

View File

@ -1,22 +1,21 @@
# ./package_scripts/mac/build_whl.sh -o MNN-CPU/py_whl -v 2.7.17,3.5.7,3.6.9,3.7.4,3.8.0
# ./package_scripts/mac/build_whl.sh -o MNN-CPU-OPENCL/py_whl -v 2.7.17,3.5.7,3.6.9,3.7.4,3.8.0 -b
set -e
usage() {
echo "Usage: $0 -o path -v python_versions [-b]"
echo -e "\t-o package files output directory"
echo -e "\t-v python versions in pyenv"
echo -e "\t-p python versions in pyenv"
echo -e "\t-v MNN dist version"
echo -e "\t-b opencl backend"
exit 1
}
while getopts "o:v:hb" opt; do
while getopts "o:p:v:b" opt; do
case "$opt" in
o ) path=$OPTARG ;;
v ) IFS="," read -a python_versions <<< $OPTARG ;;
p ) IFS="," read -a python_versions <<< $OPTARG ;;
v ) mnn_version=$OPTARG ;;
b ) opencl=true ;;
h|? ) usage ;;
* ) usage ;;
esac
done
@ -38,7 +37,7 @@ pushd pymnn/pip_package
rm -rf dist && mkdir dist
for env in $python_versions; do
pyenv global $env
python build_wheel.py
python build_wheel.py --version $mnn_version
done
cp dist/* $PACKAGE_PATH

View File

@ -1,8 +1,3 @@
# .\package_scripts\win\build_lib.ps1 -path MNN-CPU/lib/x64
# .\package_scripts\win\build_lib.ps1 -path MNN-CPU/lib/x86
# .\package_scripts\win\build_lib.ps1 -path MNN-CPU-OPENCL/lib/x64 -opencl
# .\package_scripts\win\build_lib.ps1 -path MNN-CPU-OPENCL/lib/x86 -opencl
# MNN
# |-- Debug
# | |--- MD

View File

@ -1,8 +1,3 @@
# .\package_scripts\win\build_tools.ps1 -path MNN-CPU/tools/x64
# .\package_scripts\win\build_tools.ps1 -path MNN-CPU/tools/x86
# .\package_scripts\win\build_tools.ps1 -path MNN-CPU-OPENCL/tools/x64 -opencl
# .\package_scripts\win\build_tools.ps1 -path MNN-CPU-OPENCL/tools/x86 -opencl
Param(
[Parameter(Mandatory=$true)][String]$path,
[Switch]$opencl

View File

@ -1,8 +1,5 @@
# .\package_scripts\win_pymm_package.ps1 -path MNN-CPU/py_whl/x64 -pyenvs "2.7.17,3.5.4,2.6.8,3.7.7,3.8.2"
# .\package_scripts\win_pymm_package.ps1 -x86 -path MNN-CPU/py_whl/x86 -pyenvs "2.7.17-win32,3.5.4-win32,2.6.8-win32,3.7.7-win32,3.8.2-win32"
# .\package_scripts\win_pymm_package.ps1 -path MNN-CPU-OPENCL/py_whl/x64 -pyenvs "2.7.17,3.5.4,2.6.8,3.7.7,3.8.2"
# .\package_scripts\win_pymm_package.ps1 -x86 -path MNN-CPU-OPENCL/py_whl/x86 -pyenvs "2.7.17-win32,3.5.4-win32,2.6.8-win32,3.7.7-win32,3.8.2-win32"
Param(
[Parameter(Mandatory=$true)][String]$version,
[Parameter(Mandatory=$true)][String]$pyenvs,
[Parameter(Mandatory=$true)][String]$path,
[Switch]$x86,
@ -15,9 +12,9 @@ $python_versions = $pyenvs.Split(",")
Remove-Item $path -Recurse -ErrorAction Ignore
mkdir -p $path
$PACKAGE_PATH = $(Resolve-Path $path).Path
$ARGS = ""
$ARGS = "--version $version"
if ($x86) {
$ARGS = "--x86"
$ARGS = " --x86"
}
powershell ./schema/generate.ps1

View File

@ -7,6 +7,10 @@
objects = {
/* Begin PBXBuildFile section */
11A01A06258785EA00745FA7 /* MNNVectorTop1Int32.S in Sources */ = {isa = PBXBuildFile; fileRef = 11A01A04258785EA00745FA7 /* MNNVectorTop1Int32.S */; };
11A01A07258785EA00745FA7 /* MNNVectorTop1Float.S in Sources */ = {isa = PBXBuildFile; fileRef = 11A01A05258785EA00745FA7 /* MNNVectorTop1Float.S */; };
11A01A0C258785FB00745FA7 /* MNNVectorTop1Float.S in Sources */ = {isa = PBXBuildFile; fileRef = 11A01A0A258785FB00745FA7 /* MNNVectorTop1Float.S */; };
11A01A0D258785FB00745FA7 /* MNNVectorTop1Int32.S in Sources */ = {isa = PBXBuildFile; fileRef = 11A01A0B258785FB00745FA7 /* MNNVectorTop1Int32.S */; };
1F501F7F2397BA5B004E8721 /* HalideRuntime.h in Headers */ = {isa = PBXBuildFile; fileRef = 1F501F722397BA5A004E8721 /* HalideRuntime.h */; settings = {ATTRIBUTES = (Public, ); }; };
1F501F802397BA5B004E8721 /* MNNDefine.h in Headers */ = {isa = PBXBuildFile; fileRef = 1F501F732397BA5A004E8721 /* MNNDefine.h */; settings = {ATTRIBUTES = (Public, ); }; };
1F501F812397BA5B004E8721 /* AutoTime.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 1F501F742397BA5A004E8721 /* AutoTime.hpp */; settings = {ATTRIBUTES = (Public, ); }; };
@ -45,6 +49,7 @@
4829A2DE23CC26AE00623BF5 /* ReverseSequenceTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4829A2D323CC26AD00623BF5 /* ReverseSequenceTest.cpp */; };
4829A2DF23CC26AE00623BF5 /* ReplaceTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4829A2D423CC26AD00623BF5 /* ReplaceTest.cpp */; };
4829A2E023CC26AE00623BF5 /* PaddingTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4829A2D523CC26AD00623BF5 /* PaddingTest.cpp */; };
4836CEE5257744120068F6CE /* ShapePlugin.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4836CEE4257744120068F6CE /* ShapePlugin.cpp */; };
48417FF024D13BF50056D9A7 /* GeometryThreshold.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48417FEC24D13BF50056D9A7 /* GeometryThreshold.cpp */; };
48417FF124D13BF50056D9A7 /* GeometryELU.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48417FED24D13BF50056D9A7 /* GeometryELU.cpp */; };
48417FF224D13BF50056D9A7 /* GeometrySelect.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48417FEE24D13BF50056D9A7 /* GeometrySelect.cpp */; };
@ -251,6 +256,8 @@
48FD034A246AA40300456AF5 /* GeometryConvert.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48FD0349246AA40300456AF5 /* GeometryConvert.cpp */; };
48FD12BE2466A88D009E9102 /* GeometryImageOp.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48FD12BC2466A88C009E9102 /* GeometryImageOp.cpp */; };
48FD12BF2466A88D009E9102 /* GeometryConv2DBackPropFilter.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48FD12BD2466A88D009E9102 /* GeometryConv2DBackPropFilter.cpp */; };
6A131E3F25823349002EC3D6 /* PluginShapeInference.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 6A131E3D25823349002EC3D6 /* PluginShapeInference.cpp */; };
6A131E4025823349002EC3D6 /* PluginKernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 6A131E3E25823349002EC3D6 /* PluginKernel.cpp */; };
9200049921EDBDF600BCE892 /* TensorTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 9200045D21EDBDF600BCE892 /* TensorTest.cpp */; };
9200049A21EDBDF600BCE892 /* ImageProcessTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 9200045F21EDBDF600BCE892 /* ImageProcessTest.cpp */; };
9200049B21EDBDF600BCE892 /* MatrixTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 9200046021EDBDF600BCE892 /* MatrixTest.cpp */; };
@ -731,6 +738,10 @@
0F1465B71FA18D1000F9860A /* MNN.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = MNN.framework; sourceTree = BUILT_PRODUCTS_DIR; };
0F1465BB1FA18D1000F9860A /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
0F78AC261FCD495800205A7C /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; };
11A01A04258785EA00745FA7 /* MNNVectorTop1Int32.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNVectorTop1Int32.S; sourceTree = "<group>"; };
11A01A05258785EA00745FA7 /* MNNVectorTop1Float.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNVectorTop1Float.S; sourceTree = "<group>"; };
11A01A0A258785FB00745FA7 /* MNNVectorTop1Float.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNVectorTop1Float.S; sourceTree = "<group>"; };
11A01A0B258785FB00745FA7 /* MNNVectorTop1Int32.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNVectorTop1Int32.S; sourceTree = "<group>"; };
1F501F722397BA5A004E8721 /* HalideRuntime.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = HalideRuntime.h; path = MNN/HalideRuntime.h; sourceTree = "<group>"; };
1F501F732397BA5A004E8721 /* MNNDefine.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = MNNDefine.h; path = MNN/MNNDefine.h; sourceTree = "<group>"; };
1F501F742397BA5A004E8721 /* AutoTime.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = AutoTime.hpp; path = MNN/AutoTime.hpp; sourceTree = "<group>"; };
@ -767,6 +778,7 @@
4829A2D323CC26AD00623BF5 /* ReverseSequenceTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ReverseSequenceTest.cpp; sourceTree = "<group>"; };
4829A2D423CC26AD00623BF5 /* ReplaceTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ReplaceTest.cpp; sourceTree = "<group>"; };
4829A2D523CC26AD00623BF5 /* PaddingTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = PaddingTest.cpp; sourceTree = "<group>"; };
4836CEE4257744120068F6CE /* ShapePlugin.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ShapePlugin.cpp; sourceTree = "<group>"; };
48417FEC24D13BF50056D9A7 /* GeometryThreshold.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryThreshold.cpp; sourceTree = "<group>"; };
48417FED24D13BF50056D9A7 /* GeometryELU.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryELU.cpp; sourceTree = "<group>"; };
48417FEE24D13BF50056D9A7 /* GeometrySelect.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometrySelect.cpp; sourceTree = "<group>"; };
@ -973,6 +985,8 @@
48FD0349246AA40300456AF5 /* GeometryConvert.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryConvert.cpp; sourceTree = "<group>"; };
48FD12BC2466A88C009E9102 /* GeometryImageOp.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryImageOp.cpp; sourceTree = "<group>"; };
48FD12BD2466A88D009E9102 /* GeometryConv2DBackPropFilter.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryConv2DBackPropFilter.cpp; sourceTree = "<group>"; };
6A131E3D25823349002EC3D6 /* PluginShapeInference.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = PluginShapeInference.cpp; sourceTree = "<group>"; };
6A131E3E25823349002EC3D6 /* PluginKernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = PluginKernel.cpp; sourceTree = "<group>"; };
9200045321EDBCF700BCE892 /* MNNTestSuite.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = MNNTestSuite.h; path = ../../../test/MNNTestSuite.h; sourceTree = "<group>"; };
9200045521EDBCF700BCE892 /* TestUtils.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = TestUtils.h; path = ../../../test/TestUtils.h; sourceTree = "<group>"; };
9200045721EDBCF700BCE892 /* TestUtils.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; name = TestUtils.mm; path = ../../../test/TestUtils.mm; sourceTree = "<group>"; };
@ -1619,6 +1633,7 @@
488873A8215B639D0079B12E /* source */ = {
isa = PBXGroup;
children = (
6A131E3C2582331C002EC3D6 /* plugin */,
489D7A152550FDC800AD896A /* metal */,
48C84B9D250F725600EE7666 /* utils */,
48747D51245D9E33000B9709 /* geometry */,
@ -2014,6 +2029,15 @@
path = ../../../test/speed;
sourceTree = "<group>";
};
6A131E3C2582331C002EC3D6 /* plugin */ = {
isa = PBXGroup;
children = (
6A131E3E25823349002EC3D6 /* PluginKernel.cpp */,
6A131E3D25823349002EC3D6 /* PluginShapeInference.cpp */,
);
path = plugin;
sourceTree = "<group>";
};
9200045021EDBCEC00BCE892 /* Tests */ = {
isa = PBXGroup;
children = (
@ -2160,6 +2184,8 @@
92FF013A23AA0B4E00AC97F6 /* arm32 */ = {
isa = PBXGroup;
children = (
11A01A05258785EA00745FA7 /* MNNVectorTop1Float.S */,
11A01A04258785EA00745FA7 /* MNNVectorTop1Int32.S */,
48034562254157CE004738E3 /* MNNNV21ToBGRAUnit.S */,
48BB6EF525220AA80056E195 /* MNNTranspose32Bit4x4.S */,
C43C81EB2518947700A0FF84 /* MNNGemmInt8toFloat32_8x4_Common.S */,
@ -2231,6 +2257,8 @@
92FF017C23AA0B4E00AC97F6 /* arm64 */ = {
isa = PBXGroup;
children = (
11A01A0A258785FB00745FA7 /* MNNVectorTop1Float.S */,
11A01A0B258785FB00745FA7 /* MNNVectorTop1Int32.S */,
48034566254157DF004738E3 /* MNNNV21ToBGRAUnit.S */,
48BB6EEF25220A930056E195 /* MNNTranspose32Bit4x4.S */,
C43C81F02518948800A0FF84 /* MNNGemmint8to32_8x4_Common.S */,
@ -2350,6 +2378,7 @@
EBB38EC621E748B9005F76D7 /* shape */ = {
isa = PBXGroup;
children = (
4836CEE4257744120068F6CE /* ShapePlugin.cpp */,
48C84B6B250F709E00EE7666 /* SizeComputer.cpp */,
48C84B6A250F709E00EE7666 /* SizeComputer.hpp */,
486E1A9B24F507A600C16006 /* ShapeRandomUniform.cpp */,
@ -2828,6 +2857,7 @@
92FF030323AA0B5A00AC97F6 /* MNNLoadU8AndSum.S in Sources */,
92FF02D223AA0B5A00AC97F6 /* MNNNV21ToRGBAUnit.S in Sources */,
48747D66245D9E33000B9709 /* GeometryDepthToSpace.cpp in Sources */,
6A131E3F25823349002EC3D6 /* PluginShapeInference.cpp in Sources */,
92FF025723AA0B5A00AC97F6 /* CPUQuanConvolutionDepthwise.cpp in Sources */,
48034563254157CE004738E3 /* MNNNV21ToBGRAUnit.S in Sources */,
48FA474823AA127B00172C3B /* Expr.cpp in Sources */,
@ -2836,6 +2866,7 @@
92FF042923AA0B7100AC97F6 /* ShapeLinSpace.cpp in Sources */,
92FF03A723AA0B5A00AC97F6 /* ConvolutionIntFactory.cpp in Sources */,
48FB9DC224A8445A008E1A2D /* MNNPackedMatMulRemain.S in Sources */,
4836CEE5257744120068F6CE /* ShapePlugin.cpp in Sources */,
92FF027523AA0B5A00AC97F6 /* CPUConvolution.cpp in Sources */,
48747D61245D9E33000B9709 /* ConvertUtils.cpp in Sources */,
EBECA3A624643D5D0062C7A3 /* MNNLineDepthWiseFp16C8Unit.S in Sources */,
@ -2843,6 +2874,7 @@
48417FF124D13BF50056D9A7 /* GeometryELU.cpp in Sources */,
92FF03A023AA0B5A00AC97F6 /* ConvolutionWinograd.cpp in Sources */,
48C84B9A250F720C00EE7666 /* CPULayerNorm.cpp in Sources */,
6A131E4025823349002EC3D6 /* PluginKernel.cpp in Sources */,
92FF04A823AA0BFB00AC97F6 /* AutoTime.cpp in Sources */,
92FF04AE23AA0BFB00AC97F6 /* Backend.cpp in Sources */,
92FF041E23AA0B7100AC97F6 /* ShapeRange.cpp in Sources */,
@ -2855,6 +2887,7 @@
92FF030823AA0B5A00AC97F6 /* MNNCopyC4WithStride.S in Sources */,
EBECA39B24643D320062C7A3 /* Arm82Backend.cpp in Sources */,
92FF030023AA0B5A00AC97F6 /* MNNSamplerC4NearestOpt.S in Sources */,
11A01A0C258785FB00745FA7 /* MNNVectorTop1Float.S in Sources */,
48FB9DC924A848D0008E1A2D /* MNNPackedMatMulRemain.S in Sources */,
92FF044023AA0B7100AC97F6 /* ShapeSlice.cpp in Sources */,
92FF044723AA0B7100AC97F6 /* ShapeSqueeze.cpp in Sources */,
@ -2893,6 +2926,7 @@
C43C81DF2518944F00A0FF84 /* WinogradHelper.cpp in Sources */,
92FF025E23AA0B5A00AC97F6 /* CPUROIPooling.cpp in Sources */,
92FF044A23AA0B7100AC97F6 /* ShapeConvolution.cpp in Sources */,
11A01A0D258785FB00745FA7 /* MNNVectorTop1Int32.S in Sources */,
92FF02FA23AA0B5A00AC97F6 /* MNNGemmFloatUnit_4.S in Sources */,
92FF02E923AA0B5A00AC97F6 /* MNNDepthWiseInt8AddBiasScaleUnit.S in Sources */,
92FF026A23AA0B5A00AC97F6 /* CPUNonMaxSuppressionV2.cpp in Sources */,
@ -2960,6 +2994,7 @@
92FF02DC23AA0B5A00AC97F6 /* MNNReluInt8.S in Sources */,
92FF041A23AA0B7100AC97F6 /* ShapeFill.cpp in Sources */,
EB45C776244D7C6600E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S in Sources */,
11A01A07258785EA00745FA7 /* MNNVectorTop1Float.S in Sources */,
92FF035323AA0B5A00AC97F6 /* CPUScatterNd.cpp in Sources */,
48747D6E245D9E33000B9709 /* GeometrySlice.cpp in Sources */,
92FF041923AA0B7100AC97F6 /* ShapeQuantizedMaxPool.cpp in Sources */,
@ -3004,6 +3039,7 @@
48FB9DCA24A848D0008E1A2D /* MNNAxByClampBroadcastC4.S in Sources */,
489D7A832550FDC900AD896A /* MetalMatMul.mm in Sources */,
92FF04BA23AA0BFB00AC97F6 /* WrapExecution.cpp in Sources */,
11A01A06258785EA00745FA7 /* MNNVectorTop1Int32.S in Sources */,
48FB9DC124A8445A008E1A2D /* MNNAxByClampBroadcastC4.S in Sources */,
EBECA38F24643D320062C7A3 /* Arm82Convolution.cpp in Sources */,
EBD4842F2485FF660083CE95 /* Arm82Interp.cpp in Sources */,
@ -3430,6 +3466,7 @@
"MNN_METAL_ENABLED=1",
"MNN_SUPPORT_TFLITE_QUAN=1",
"ENABLE_ARMV82=1",
"MNN_WITH_PLUGIN=1",
);
GCC_SYMBOLS_PRIVATE_EXTERN = YES;
GCC_WARN_SHADOW = NO;
@ -3489,6 +3526,7 @@
"MNN_METAL_ENABLED=1",
"MNN_SUPPORT_TFLITE_QUAN=1",
"ENABLE_ARMV82=1",
"MNN_WITH_PLUGIN=1",
);
GCC_SYMBOLS_PRIVATE_EXTERN = YES;
GCC_WARN_SHADOW = YES;

View File

@ -1,23 +1,33 @@
# TODO: avoid importing everything from _mnncengine._nn, for visibility control
from _mnncengine._nn import *
import _mnncengine._expr as _F
import _mnncengine._nn as _nn
def load_module_from_file(file_name, for_training):
# old call: load_module_from_file(file_name, for_training)
# new call: load_module_from_file(file_name, dynamic=False, shape_mutable=True)
# support both call styles via args and kwargs
def load_module_from_file(file_name, *args, **kwargs):
old_call = len(args) > 0 #for_training
m = _F.load_as_dict(file_name)
inputs_outputs = _F.get_inputs_and_outputs(m)
inputs = []
for key in inputs_outputs[0].keys():
inputs.append(inputs_outputs[0][key])
inputs.append(inputs_outputs[0][key] if old_call else key)
outputs = []
for key in inputs_outputs[1].keys():
outputs.append(inputs_outputs[1][key])
module = _nn.load_module(inputs, outputs, for_training)
outputs.append(inputs_outputs[1][key] if old_call else key)
if old_call:
for_training = args[0]
module = _nn.load_module(inputs, outputs, for_training)
else:
dynamic = kwargs.get('dynamic', False)
shape_mutable = kwargs.get('shape_mutable', True)
module = _nn.load_module_from_file(inputs, outputs, file_name, dynamic, shape_mutable)
return module
@ -53,17 +63,3 @@ class Module(_nn._Module):
else:
self._vars[name] = value
self._add_parameter(value)
class FixModule(object):
def __init__(self, module):
super(FixModule, self).__init__()
self.module = module
def forward(self, x):
self.module.train(False)
return self.module.forward(x)
def __call__(self, x):
self.module.train(False)
return self.module(x)

View File

@ -6,6 +6,8 @@ import argparse
parser = argparse.ArgumentParser(description='build pymnn wheel')
parser.add_argument('--x86', dest='x86', action='store_true', default=False,
help='build wheel for 32bit arch, only usable on windows')
parser.add_argument('--version', dest='version', type=str, required=True,
help='MNN dist version')
args = parser.parse_args()
import os
@ -18,11 +20,11 @@ if __name__ == '__main__':
os.system("pip install -U numpy")
if os.path.exists('build'):
shutil.rmtree('build')
if IS_DARWIN:
os.system('python setup.py bdist_wheel')
comm_args = '--version ' + args.version
if IS_LINUX:
os.system('python setup.py bdist_wheel --plat-name=manylinux1_x86_64')
comm_args += ' --plat-name=manylinux1_x86_64'
if IS_WINDOWS:
os.putenv('DISTUTILS_USE_SDK', '1')
os.putenv('MSSdk', '1')
os.system('python setup.py bdist_wheel %s' % ('--x86' if args.x86 else ''))
comm_args += ' --x86' if args.x86 else ''
os.system('python setup.py bdist_wheel %s' % comm_args)

View File

@ -2,8 +2,18 @@
# Created by ruhuan on 2019.08.31
""" setup tool """
from __future__ import print_function
import os
import sys
import argparse
parser = argparse.ArgumentParser(description='build pymnn wheel')
parser.add_argument('--x86', dest='x86', action='store_true', default=False,
help='build wheel for 32bit arch, only usable on windows')
parser.add_argument('--version', dest='version', type=str, required=True,
help='MNN dist version')
args, unknown = parser.parse_known_args()
sys.argv = [sys.argv[0]] + unknown
import os
import platform
try:
import numpy as np
@ -19,9 +29,8 @@ IS_LINUX = (platform.system() == 'Linux')
BUILD_DIR = 'pymnn_build'
BUILD_TYPE = 'RELEASE'
BUILD_ARCH = 'x64'
if '--x86' in sys.argv:
if args.x86:
BUILD_ARCH = ''
sys.argv.remove('--x86')
def check_env_flag(name, default=''):
""" check whether a env is set to Yes """
@ -46,7 +55,7 @@ if os.path.isdir('../../schema/private'):
print ('Building with python wheel with package name ', package_name)
version = '1.1.0'
version = args.version
depend_pip_packages = ['flatbuffers', 'numpy']
if package_name == 'MNN':
README = os.path.join(os.getcwd(), "README.md")
@ -106,9 +115,9 @@ def configure_extension_build():
]
if check_env_flag('WERROR'):
extra_compile_args.append('-Werror')
extra_compile_args += ['-DUSE_V3_API']
extra_compile_args += ['-DPYMNN_EXPR_API', '-DPYMNN_NUMPY_USABLE']
root_dir = os.getenv('PROJECT_ROOT', os.path.dirname(os.path.dirname(os.getcwd())))
engine_compile_args = ['-DBUILD_OPTYPE', '-DBUILD_TRAIN']
engine_compile_args = ['-DBUILD_OPTYPE', '-DPYMNN_TRAIN_API']
engine_libraries = []
engine_library_dirs = [os.path.join(root_dir, BUILD_DIR)]
engine_library_dirs += [os.path.join(root_dir, BUILD_DIR, "tools", "train")]
@ -121,6 +130,7 @@ def configure_extension_build():
engine_sources = [os.path.join(root_dir, "pymnn", "src", "MNN.cc")]
engine_include_dirs = [os.path.join(root_dir, "include")]
engine_include_dirs += [os.path.join(root_dir, "express")]
engine_include_dirs += [os.path.join(root_dir, "express", "module")]
engine_include_dirs += [os.path.join(root_dir, "source")]
engine_include_dirs += [os.path.join(root_dir, "tools", "train", "source", "grad")]
engine_include_dirs += [os.path.join(root_dir, "tools", "train", "source", "module")]

File diff suppressed because it is too large.

View File

@ -161,23 +161,22 @@ static PyMethodDef module_methods[] = {
#else
#define MOD_INIT(name) PyMODINIT_FUNC init##name(void)
#endif
MOD_INIT(_tools)
{
#if PY_MAJOR_VERSION >= 3
PyObject *m = PyModule_Create(&moduledef);
// module import failed!
if (!m) {
printf("import Tools failed");
return NULL;
}
return m;
#else
PyObject *m = Py_InitModule3("_tools", module_methods, "MNNTools Module");
// module import failed!
if (!m) {
printf("import Tools failed");
return;
}
MOD_INIT(_tools) {
#if PY_MAJOR_VERSION >= 3
PyObject *m = PyModule_Create(&moduledef);
// module import failed!
if (!m) {
printf("import Tools failed");
return NULL;
}
return m;
#else
PyObject *m = Py_InitModule3("_tools", module_methods, "MNNTools Module");
// module import failed!
if (!m) {
printf("import Tools failed");
return;
#endif
}
return;
#endif
}

pymnn/src/common.h (new file, 33 lines)
View File

@ -0,0 +1,33 @@
#pragma once
#ifndef PYMNN_USE_ALINNPYTHON
#ifndef PYMNN_EXPR_API
#error PYMNN_EXPR_API macro should be defined on official python (PYMNN_USE_ALINNPYTHON=OFF)
#endif
#ifndef PYMNN_NUMPY_USABLE
#error PYMNN_NUMPY_USABLE macro should be defined on official python (PYMNN_USE_ALINNPYTHON=OFF)
#endif
#endif
#if defined(ANDROID) || defined(__ANDROID__)
#undef _FILE_OFFSET_BITS
#endif
#include <fstream>
#ifdef PYMNN_USE_ALINNPYTHON
#include <AliNNPython/Python.h>
#include <AliNNPython/frameobject.h>
#include <AliNNPython/pythread.h>
#include "renameForAliNNPython.h"
#ifdef PYMNN_NUMPY_USABLE
#include <numpy/ndarrayobject.h>
#include <numpy/ndarraytypes.h>
#endif
#else
#define PyType_FindTLSType
#include <Python.h>
#include "structmember.h"
#include "numpy/arrayobject.h"
#endif

View File

@ -0,0 +1,190 @@
#pragma once
#define PyObject WeObject
#define PyImport_Import WeImport_Import
#define PyObject_GetAttrString WeObject_GetAttrString
#define PyObject_HEAD WeObject_HEAD
#define PyTypeObject WeTypeObject
#define PyObject_HEAD_INIT WeObject_HEAD_INIT
#define PyString_AsString WeString_AsString
#define PyErr_SetString WeErr_SetString
#define PyTuple_GetItem WeTuple_GetItem
#define PyTuple_Size WeTuple_Size
#define PyDict_New WeDict_New
#define PyDict_SetItem WeDict_SetItem
#define PyDict_GetItemString WeDict_GetItemString
#define PyCallable_Check WeCallable_Check
#define PyArg_ParseTuple WeArg_ParseTuple
#define PyLong_AsLong WeLong_AsLong
#define PyObject_Call WeObject_Call
#define PyType_Ready WeType_Ready
#define PyCapsule_New WeCapsule_New
#define PyLong_FromLong WeLong_FromLong
#define PyModule_AddObject WeModule_AddObject
#define PyTuple_SetItem WeTuple_SetItem
#define PyFloat_FromDouble WeFloat_FromDouble
#define PyFloat_AsDouble WeFloat_AsDouble
#define PyTuple_New WeTuple_New
#define PyString_FromString WeString_FromString
#define PyCapsule_GetPointer WeCapsule_GetPointer
#define PyObject_TypeCheck WeObject_TypeCheck
#define PyObject_IsInstance WeObject_IsInstance
#define PySequence_Tuple WeSequence_Tuple
#define PyExc_Exception (WeObject *)WeType_FindTLSType((WeTypeObject *)WeExc_Exception)
#define PyExc_StopIteration (WeObject *)WeType_FindTLSType((WeTypeObject *)WeExc_StopIteration)
#define PyExc_MemoryError (WeObject *)WeType_FindTLSType((WeTypeObject *)WeExc_MemoryError)
#define PyExc_ImportError ((WeObject *)WeType_FindTLSType((WeTypeObject *)WeExc_ImportError))
#define PyExc_IndexError (WeObject *)WeType_FindTLSType((WeTypeObject *)WeExc_IndexError)
#define PyExc_KeyError (WeObject *)WeType_FindTLSType((WeTypeObject *)WeExc_KeyError)
#define PyExc_ValueError (WeObject *)WeType_FindTLSType((WeTypeObject *)WeExc_ValueError)
#define PyExc_TypeError (WeObject *)WeType_FindTLSType((WeTypeObject *)WeExc_TypeError)
#define PyExc_BufferError (WeObject *)WeType_FindTLSType((WeTypeObject *)WeExc_BufferError)
#define PyExc_RuntimeError (WeObject *)WeType_FindTLSType((WeTypeObject *)WeExc_RuntimeError)
#define PyExc_SystemError (WeObject *)WeType_FindTLSType((WeTypeObject *)WeExc_SystemError)
#define PyExc_FutureWarning (WeObject *)WeType_FindTLSType((WeTypeObject *)WeExc_FutureWarning)
#define PyExc_AttributeError ((WeObject *)WeType_FindTLSType((WeTypeObject *)WeExc_AttributeError))
#define PyErr_ExceptionMatches WeErr_ExceptionMatches
#define PyErr_Fetch WeErr_Fetch
#define PyErr_Restore WeErr_Restore
#define PyBuffer_Release WeBuffer_Release
#define PyObject_HasAttr WeObject_HasAttr
#define PyObject_HasAttrString WeObject_HasAttrString
#define PyObject_DelAttr WeObject_DelAttr
#define PyObject_DelAttrString WeObject_DelAttrString
#define PyObject_GetAttr WeObject_GetAttr
#define PyErr_GivenExceptionMatches WeErr_GivenExceptionMatches
#define PyErr_Clear WeErr_Clear
#define PyObject_SetAttr WeObject_SetAttr
#define PyObject_SetAttrString WeObject_SetAttrString
#define PyObject_Hash WeObject_Hash
#define PyObject_GetItem WeObject_GetItem
#define PyObject_SetItem WeObject_SetItem
#define PySequence_GetItem WeSequence_GetItem
#define PySequence_SetItem WeSequence_SetItem
#define PyList_GetItem WeList_GetItem
#define PyList_SetItem WeList_SetItem
#define PySequence_Fast_ITEMS WeSequence_Fast_ITEMS
#define PyDict_Next WeDict_Next
#define PyObject_GetIter WeObject_GetIter
#define PyStaticMethod_Type WeStaticMethod_Type
#define PyIter_Next WeIter_Next
#define PyErr_Occurred WeErr_Occurred
#define PyObject_Str WeObject_Str
#define PyString_AsStringAndSize WeString_AsStringAndSize
#define PyString_FromStringAndSize WeString_FromStringAndSize
#define PyObject_IsTrue WeObject_IsTrue
#define PyLong_AsUnsignedLong WeLong_AsUnsignedLong
#define PyLong_FromUnsignedLong WeLong_FromUnsignedLong
#define PyLong_AsLongLong WeLong_AsLongLong
#define PyLong_FromLongLong WeLong_FromLongLong
#define PyLong_AsLong WeLong_AsLong
#define PyLong_AsUnsignedLongLong WeLong_AsUnsignedLongLong
#define PyLong_FromUnsignedLongLong WeLong_FromUnsignedLongLong
#define PyNumber_Long WeNumber_Long
#define PyNumber_Float WeNumber_Float
#define PySequence_Check WeSequence_Check
#define PySequence_Size WeSequence_Size
#define PySequence_List WeSequence_List
#define PySlice_New WeSlice_New
#define PySlice_GetIndicesEx WeSlice_GetIndicesEx
#define PySlice_GetIndicesEx WeSlice_GetIndicesEx
#define PyCapsule_GetContext WeCapsule_GetContext
#define PyCapsule_SetContext WeCapsule_SetContext
#define PyCapsule_GetName WeCapsule_GetName
#define PyDict_Size WeDict_Size
#define PyDict_Clear WeDict_Clear
#define PyObject_CallFunctionObjArgs WeObject_CallFunctionObjArgs
#define PySet_New WeSet_New
#define PySet_Size WeSet_Size
#define PySet_Clear WeSet_Clear
#define PyStaticMethod_New WeStaticMethod_New
#define PyObject_CheckBuffer WeObject_CheckBuffer
#define PyObject_GetBuffer WeObject_GetBuffer
#define PyWeakref_NewRef WeWeakref_NewRef
#define PyDict_Type WeDict_Type
#define PyList_New WeList_New
#define PyList_Size WeList_Size
#define PyMemoryView_FromBuffer WeMemoryView_FromBuffer
#define PyObject_Length WeObject_Length
#define PyObject_Repr WeObject_Repr
#define PyThread_create_key WeThread_create_key
#define PyGILState_Ensure WeGILState_Ensure
#define PyGILState_Release WeGILState_Release
#define PyEval_InitThreads WeEval_InitThreads
#define PyThreadState WeThreadState
#define PyThreadState_Get WeThreadState_Get
#define PyThread_create_key WeThread_create_key
#define PyThread_set_key_value WeThread_set_key_value
#define PyMemoryView_FromObject WeMemoryView_FromObject
#define PyEval_GetBuiltins WeEval_GetBuiltins
#define PyList_Append WeList_Append
#define PyMem_Free WeMem_Free
#define PyErr_NormalizeException WeErr_NormalizeException
#define PyFrame_GetLineNumber WeFrame_GetLineNumber
#define PyType_IsSubtype WeType_IsSubtype
#define PyNumber_Check WeNumber_Check
#define PyInt_FromSsize_t WeInt_FromSsize_t
#define PyString_Size WeString_Size
#define _PyThreadState_Current _WeThreadState_Current
#define PyProperty_Type WeProperty_Type
#define PyType_Type WeType_Type
#define _PyType_Lookup _WeType_Lookup
#define PyBaseObject_Type WeBaseObject_Type
#define _PyObject_GetDictPtr _WeObject_GetDictPtr
#define PyInt_FromSize_t WeInt_FromSize_t
#define PyObject_ClearWeakRefs WeObject_ClearWeakRefs
#define PyErr_Format WeErr_Format
#define PyObject_MALLOC WeObject_MALLOC
#define PyCFunction_NewEx WeCFunction_NewEx
#define PyMethod_New WeMethod_New
#define PyDict_DelItemString WeDict_DelItemString
#define PyModule_GetName WeModule_GetName
#define PyImport_AddModule WeImport_AddModule
#define PyImport_ImportModule WeImport_ImportModule
#define PyImport_ReloadModule WeImport_ReloadModule
#define PyEval_GetGlobals WeEval_GetGlobals
#define PyErr_NewException WeErr_NewException
#define PyThread_get_key_value WeThread_get_key_value
#define PyGILState_GetThisThreadState WeGILState_GetThisThreadState
#define PyThreadState_New WeThreadState_New
#define PyEval_AcquireThread WeEval_AcquireThread
#define PyErr_WarnEx WeErr_WarnEx
#define PyThread_delete_key_value WeThread_delete_key_value
#define PyThreadState_Clear WeThreadState_Clear
#define PyThreadState_DeleteCurrent WeThreadState_DeleteCurrent
#define PyEval_SaveThread WeEval_SaveThread
#define PyEval_RestoreThread WeEval_RestoreThread
#define PyFrame_FastToLocals WeFrame_FastToLocals
#define PyDict_GetItem WeDict_GetItem
#define PyObject_CallObject WeObject_CallObject
#define PyObject_RichCompareBool WeObject_RichCompareBool
#define PyNumber_Invert WeNumber_Invert
#define PyNumber_Negative WeNumber_Negative
#define PyNumber_Add WeNumber_Add
#define PyNumber_InPlaceAdd WeNumber_InPlaceAdd
#define PyNumber_Subtract WeNumber_Subtract
#define PyNumber_InPlaceSubtract WeNumber_InPlaceSubtract
#define PyNumber_Multiply WeNumber_Multiply
#define PyNumber_InPlaceMultiply WeNumber_InPlaceMultiply
#define PyNumber_TrueDivide WeNumber_TrueDivide
#define PyNumber_InPlaceTrueDivide WeNumber_InPlaceTrueDivide
#define PyNumber_Or WeNumber_Or
#define PyNumber_InPlaceOr WeNumber_InPlaceOr
#define PyNumber_And WeNumber_And
#define PyNumber_InPlaceAnd WeNumber_InPlaceAnd
#define PyNumber_Xor WeNumber_Xor
#define PyNumber_InPlaceXor WeNumber_InPlaceXor
#define PyNumber_Lshift WeNumber_Lshift
#define PyNumber_InPlaceLshift WeNumber_InPlaceLshift
#define PyNumber_Rshift WeNumber_Rshift
#define PyNumber_InPlaceRshift WeNumber_InPlaceRshift
#define PyDict_Contains WeDict_Contains
#define PyLong_AsLongLongAndOverflow WeLong_AsLongLongAndOverflow
#define PySequence_Length WeSequence_Length
#define PySequence_Fast WeSequence_Fast
#define PySequence_Fast_GET_SIZE WeSequence_Fast_GET_SIZE
#define PyCFunction_Type WeCFunction_Type
#define PyType_FindTLSType WeType_FindTLSType
#define PyInterpreterState_Get WeInterpreterState_Get

View File

@ -1,20 +1,10 @@
#pragma once
#include <string>
#include <MNN/expr/Expr.hpp>
#include <MNN/expr/ExprCreator.hpp>
#ifdef USE_PRIVATE
#include "private_define.h"
#else
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
#include "pybind11/operators.h"
#include "numpy/arrayobject.h"
#include <Python.h>
#include "structmember.h"
#endif
using namespace MNN;
using namespace MNN::Express;
#include <vector>
#include "common.h"
using namespace std;
typedef vector<int> INTS;
// Returns true if obj is a bytes/str or unicode object
inline bool checkString(PyObject* obj) {
return PyBytes_Check(obj) || PyUnicode_Check(obj);
@ -176,9 +166,8 @@ halide_type_t dtype2htype(DType dtype) {
CONVERT(DType_INT8, halide_type_of<int8_t>(), dtype);
return halide_type_of<float>();
}
#ifndef USE_PRIVATE
inline int getitemsize(int dtype, int npy_type)
{
#ifdef PYMNN_NUMPY_USABLE
inline int getitemsize(int dtype, int npy_type) {
switch(dtype) {
case DType_FLOAT:
if(npy_type != NPY_FLOAT) {
@ -210,8 +199,7 @@ inline int getitemsize(int dtype, int npy_type)
}
}
#endif
inline int getitemsize(int dtype)
{
inline int getitemsize(int dtype) {
switch(dtype) {
case DType_FLOAT:
return 4;
@ -229,3 +217,4 @@ inline int getitemsize(int dtype)
throw std::runtime_error("does not support this dtype");
}
}

View File

@ -0,0 +1,45 @@
set -e
usage() {
echo "Usage: $0 -p python_version [-t]"
echo -e "\t-p python versions in pyenv"
echo -e "\t-t include train API wrapper"
exit 1
}
while getopts "p:t" opt; do
case "$opt" in
p ) py_version=$OPTARG ;;
t ) train_api=true ;;
* ) usage ;;
esac
done
rm -rf /tmp/mnn_py && mkdir -p /tmp/mnn_py
cp -r pip_package/MNN /tmp/mnn_py
pushd /tmp/mnn_py/MNN
rm -rf tools
cat __init__.py | sed '/from . import tools/d' > __init__.py.tmp
mv __init__.py.tmp __init__.py
if [ -z $train_api ]; then
rm -rf data optim
cat __init__.py | sed '/from . import data/d' | sed '/from . import optim/d' > __init__.py.tmp
mv __init__.py.tmp __init__.py
fi
find . -name __pycache__ | xargs rm -rf
pyenv global $py_version
python -c "import compileall; compileall.compile_dir('/tmp/mnn_py/MNN', force=True)"
find . -name *.py | xargs rm -rf
cd ..
zip -r MNN.zip MNN
popd
rm -f android/src/main/assets/MNN.zip
rm -rf iOS/MNNPyBridge/lib/MNN
cp /tmp/mnn_py/MNN.zip android/src/main/assets
cp -r /tmp/mnn_py/MNN iOS/MNNPyBridge/lib
rm -rf /tmp/mnn_py

View File

@ -0,0 +1,33 @@
{
"inputs" : [
{
"id" : 0,
"type" : "int",
"dims" : [1, 1, 5, 4],
"data" : [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
}
],
"outputs": [
{
"id" : 0,
"type" : "int",
"dims" : [1, 1, 4, 5],
"data" : []
}
],
"regions" : [
{
"id" : 0,
"size" : [1, 4, 5],
"src" : {
"offset": 0,
"stride": [1, 1, 4]
},
"dst" : {
"offset" : 0,
"stride" : [1, 5, 1]
},
"origin": 0
}
]
}
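For reference, a standalone sketch (plain C++, independent of MNN) that applies the size/stride region described by this JSON to transpose the 1x1x5x4 input into the 1x1x4x5 output. The nested-loop interpretation of size = [z, y, x] with per-axis src/dst strides follows the usual raster-region convention; it is an illustration, not MNN's implementation.

#include <cstdio>
#include <vector>

int main() {
    // Mirrors rasterDemo_transpose.json: size [1, 4, 5], src stride [1, 1, 4], dst stride [1, 5, 1].
    const int size[3]      = {1, 4, 5};
    const int srcStride[3] = {1, 1, 4};
    const int dstStride[3] = {1, 5, 1};
    std::vector<int> src(20), dst(20, 0);
    for (int i = 0; i < 20; ++i) {
        src[i] = i;                       // the "data" array from the inputs section
    }
    for (int z = 0; z < size[0]; ++z) {
        for (int y = 0; y < size[1]; ++y) {
            for (int x = 0; x < size[2]; ++x) {
                dst[z * dstStride[0] + y * dstStride[1] + x * dstStride[2]] =
                    src[z * srcStride[0] + y * srcStride[1] + x * srcStride[2]];
            }
        }
    }
    for (int i = 0; i < 20; ++i) {
        printf("%d%c", dst[i], (i % 5 == 4) ? '\n' : ' '); // prints the 4x5 transpose
    }
    return 0;
}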

View File

@ -78,7 +78,7 @@ bool Arm82Backend::onAcquireBuffer(const Tensor* nativeTensor, StorageType stora
// The default data type of input tensor for arm82 backend is FLOAT32.
// However, Arm82Backend default data type is FLOAT16, so check whether data type is FLOAT32,
// then divide size by 2
auto size = sizeof(int16_t);
int size = sizeof(int16_t);
const int dimensions = buffer.dimensions;
for (int i = 0; i < dimensions; i++) {
int currentDimSize = buffer.dim[i].extent;
@ -87,7 +87,7 @@ bool Arm82Backend::onAcquireBuffer(const Tensor* nativeTensor, StorageType stora
}
size *= currentDimSize;
}
auto res = allocBuffer(size, buffer, storageType);
auto res = allocBuffer(size, (Tensor*)nativeTensor, storageType);
if (!res) {
return false;
}

View File

@ -97,7 +97,7 @@ ErrorCode Arm82Relu::onExecute(const std::vector<Tensor *> &inputs, const std::v
mThreadNumbers = static_cast<Arm82Backend *>(backend())->numberThread();
MNN_CONCURRENCY_BEGIN(tId, mThreadNumbers)
for (int b = tId; b < batchAndChannel; b += mThreadNumbers) {
for (int b = (int)tId; b < batchAndChannel; b += mThreadNumbers) {
_MNNArm82LeakyReluWithChannel(dst + b * plane * ARMV82_CHANNEL_UNIT,
src + b * plane * ARMV82_CHANNEL_UNIT,
slopeHalf,

View File

@ -24,7 +24,7 @@
#include "backend/arm82/Arm82Backend.hpp"
#endif
#define MAX_THREAD_NUMBER 32
#define LARGE_MEMORY 1024 * 1024 * 100
#define LARGE_MEMORY 1024 * 1024 * 500
//#define MNN_DUMP_MEMORY_USAGE
#define MNN_CPU_CHECK_NAN 1
@ -35,8 +35,7 @@ struct cpuinfo_arm_isa gCPUInfo;
#endif
CPURuntime::CPURuntime(const Backend::Info& info) {
mDynamicAllocator.reset(new BufferAllocator);
mStaticAllocator.reset(new BufferAllocator);
mStaticAllocator.reset(new BufferAllocator(BufferAllocator::Allocator::createDefault()));
mThreadNumber = info.numThread;
mThreadNumber = std::max(1, mThreadNumber);
mThreadNumber = std::min(mThreadNumber, MAX_THREAD_NUMBER);
@ -88,9 +87,8 @@ CPURuntime:: ~ CPURuntime() {
#endif
}
float CPURuntime::onGetMemoryInMB() {
auto dynamicMemoryInMB = mDynamicAllocator->totalSize() / 1024.0f / 1024.0f;
auto staticMemoryInMB = mStaticAllocator->totalSize() / 1024.0f / 1024.0f;
return dynamicMemoryInMB + staticMemoryInMB;
return staticMemoryInMB;
}
Backend* CPURuntime::onCreate() const{
#if defined(__aarch64__) && ENABLE_ARMV82
@ -102,9 +100,6 @@ Backend* CPURuntime::onCreate() const{
}
void CPURuntime::onGabageCollect(int level) {
mStaticAllocator->release(false);
if (level > 50) {
mDynamicAllocator->release(false);
}
}
std::map<OpType, CPUBackend::Creator*>* CPUBackend::gCreator = nullptr;
@ -129,7 +124,8 @@ bool CPUBackend::addCreator(OpType t, Creator* c) {
CPUBackend::CPUBackend(const CPURuntime* runtime, MNNForwardType type) : Backend(type) {
mRuntime = runtime;
mCheckNAN = runtime->mFlags == MNN_CPU_CHECK_NAN;
mDynamicAllocator = runtime->mDynamicAllocator;
std::shared_ptr<BufferAllocator::Allocator> defaultAlloc(BufferAllocator::Allocator::createRecurse(runtime->mStaticAllocator.get()));
mDynamicAllocator.reset(new BufferAllocator(defaultAlloc));
mStaticAllocator = runtime->mStaticAllocator;
}
bool CPUBackend::supportDot() const {
@ -137,9 +133,7 @@ bool CPUBackend::supportDot() const {
}
CPUBackend::~CPUBackend() {
for (auto p : mDynamic) {
mDynamicAllocator->free(p);
}
// Do nothing
}
void CPUBackend::onExecuteBegin() const {
@ -162,47 +156,45 @@ void CPUBackend::onExecuteEnd() const {
#endif
}
bool CPUBackend::allocBuffer(int size, halide_buffer_t& buffer, StorageType storageType) {
bool CPUBackend::allocBuffer(int size, Tensor* dest, StorageType storageType) {
// MNN_PRINT("Acquire size = %d\n", size);
if (size <= 0) {
MNN_ASSERT(false);
return false;
}
if (size > LARGE_MEMORY) {
MNN_PRINT("Size larger the 100 M :%d\n", size);
MNN_PRINT("Size larger than 500 M :%d\n", size);
}
auto& buffer = dest->buffer();
auto des = TensorUtils::getDescribe(dest);
std::pair<void*, int> points;
switch (storageType) {
case STATIC: {
#ifdef MNN_DUMP_MEMORY_USAGE
buffer.host = (uint8_t*)malloc(size);
#else
buffer.host = (uint8_t*)(mStaticAllocator->alloc(size, false));
#endif
points = mStaticAllocator->alloc(size, false);
break;
}
case DYNAMIC: {
buffer.host = (uint8_t*)(mDynamicAllocator->alloc(size, false));
points = mDynamicAllocator->alloc(size, false);
break;
}
case DYNAMIC_SEPERATE: {
buffer.host = (uint8_t*)(mDynamicAllocator->alloc(size, true));
points = mDynamicAllocator->alloc(size, true);
break;
}
default:
MNN_ASSERT(false);
break;
}
if (nullptr == buffer.host) {
if (nullptr == points.first) {
MNN_ERROR("Alloc buffer error for cpu backend\n");
return false;
}
if (STATIC == storageType) {
// Do nothing
} else {
mDynamic.insert(buffer.host);
}
buffer.host = (uint8_t*)points.first + points.second;
des->extra.offset = points.second;
if (buffer.type.code == halide_type_handle) {
// For handle types we needn't recycle the buffer; use extra as handleFreeFunction
::memset(buffer.host, 0, size);
des->extra.handleFreeFunction = (decltype(des->extra.handleFreeFunction))free;
}
return true;
}
@ -213,32 +205,29 @@ bool CPUBackend::onAcquireBuffer(const MNN::Tensor* nativeTensorConst, StorageTy
}
//FUNC_PRINT_ALL(nativeTensorConst, p);
auto nativeTensor = (Tensor*)nativeTensorConst;
auto& buffer = nativeTensor->buffer();
auto size = nativeTensor->size();
return allocBuffer(size, buffer, storageType);
return allocBuffer(size, nativeTensor, storageType);
}
bool CPUBackend::onReleaseBuffer(const MNN::Tensor* nativeTensor, StorageType storageType) {
if (DYNAMIC_SEPERATE == storageType) {
return true;
}
if (nativeTensor == nullptr) {
return false;
}
if (nullptr == nativeTensor->buffer().host) {
return false;
}
auto des = TensorUtils::getDescribe(nativeTensor);
std::pair<void*, int> pointer;
pointer.second = des->extra.offset;
pointer.first = (uint8_t*)nativeTensor->buffer().host - des->extra.offset;
if (STATIC == storageType) {
#ifdef MNN_DUMP_MEMORY_USAGE
free(nativeTensor->buffer().host);
#else
mStaticAllocator->free(nativeTensor->buffer().host);
#endif
mStaticAllocator->free(pointer);
return true;
}
if (DYNAMIC_SEPERATE == storageType) {
return true;
}
mDynamic.erase(nativeTensor->buffer().host);
mDynamicAllocator->free(nativeTensor->buffer().host);
mDynamicAllocator->free(pointer);
return true;
}
@ -338,10 +327,7 @@ Execution* CPUBackend::onCreate(const std::vector<Tensor*>& inputs, const std::v
}
bool CPUBackend::onClearBuffer() {
for (auto p : mDynamic) {
mDynamicAllocator->free(p);
}
mDynamic.clear();
mDynamicAllocator->release(true);
return true;
}
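
The recurring pattern in this file is that the allocator now hands back a (base pointer, offset) pair into a shared chunk instead of a raw pointer, so each tensor stores host = base + offset plus the offset itself, and reconstructs the pair when the buffer is released. A simplified, self-contained sketch of that bookkeeping (the toy pool and struct names below are illustrative, not MNN's):

#include <cstdint>
#include <cstdlib>
#include <utility>

// Toy stand-in for BufferAllocator: one malloc per request, so the offset is always 0.
// The real allocator sub-allocates from larger chunks, which is why the offset matters.
static std::pair<void*, int> allocFromPool(size_t size) {
    return std::make_pair(std::malloc(size), 0);
}
static void freeToPool(std::pair<void*, int> pointer) {
    std::free(pointer.first);
}

struct TensorExtra {
    int offset = 0;   // plays the role of des->extra.offset in the diff
};

static bool acquire(uint8_t*& host, TensorExtra& extra, size_t size) {
    auto points = allocFromPool(size);
    if (nullptr == points.first) {
        return false;
    }
    host         = (uint8_t*)points.first + points.second;  // the pointer kernels use
    extra.offset = points.second;                           // remembered for release
    return true;
}

static void release(uint8_t* host, const TensorExtra& extra) {
    std::pair<void*, int> pointer;
    pointer.second = extra.offset;
    pointer.first  = host - extra.offset;                   // recover the allocator's base pointer
    freeToPool(pointer);
}

int main() {
    uint8_t* host = nullptr;
    TensorExtra extra;
    if (acquire(host, extra, 64)) {
        release(host, extra);
    }
    return 0;
}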

View File

@ -34,7 +34,6 @@ public:
virtual float onGetMemoryInMB() override;
private:
std::shared_ptr<BufferAllocator> mStaticAllocator;
std::shared_ptr<BufferAllocator> mDynamicAllocator;
int mThreadNumber;
int mTaskIndex;
size_t mFlags;
@ -97,12 +96,11 @@ public:
static void initCreatorMap();
protected:
bool allocBuffer(int size, halide_buffer_t& buffer, StorageType storageType);
bool allocBuffer(int size, Tensor* dest, StorageType storageType);
private:
std::shared_ptr<BufferAllocator> mStaticAllocator;
std::shared_ptr<BufferAllocator> mDynamicAllocator;
bool mCheckNAN = false;
std::set<void*> mDynamic;
const CPURuntime* mRuntime;
static std::map<OpType, CPUBackend::Creator*>* getCreatorMap();
static std::map<OpType, CPUBackend::Creator*>* gCreator;

View File

@ -70,17 +70,7 @@ ErrorCode CPUBatchMatMul::onResize(const std::vector<Tensor*>& inputs, const std
TensorUtils::setLinearLayout(unit.mMatrixB.get());
TensorUtils::setLinearLayout(unit.mMatrixC.get());
auto res = backend()->onAcquireBuffer(unit.mMatrixA.get(), Backend::DYNAMIC);
res = res && backend()->onAcquireBuffer(unit.mMatrixB.get(), Backend::DYNAMIC);
res = res && backend()->onAcquireBuffer(unit.mMatrixC.get(), Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;
}
auto code = unit.mMatMul->onResize(unit.mTempInputs, unit.mTempOutputs);
backend()->onReleaseBuffer(unit.mMatrixA.get(), Backend::DYNAMIC);
backend()->onReleaseBuffer(unit.mMatrixB.get(), Backend::DYNAMIC);
backend()->onReleaseBuffer(unit.mMatrixC.get(), Backend::DYNAMIC);
}
return NO_ERROR;
}
@ -109,10 +99,10 @@ ErrorCode CPUBatchMatMul::onExecute(const std::vector<Tensor*>& inputs, const st
MNN_CONCURRENCY_BEGIN(tId, threadNumber) {
auto& unit = mUnits[tId];
for (int i = (int)tId; i < mBatch; i+=threadNumber) {
::memcpy(unit.mMatrixA->host<float>(), input0Ptr + i * input0Stride, input0Stride * sizeof(float));
::memcpy(unit.mMatrixB->host<float>(), input1Ptr + i * input1Stride, input1Stride * sizeof(float));
unit.mMatrixA->buffer().host = (uint8_t*)(input0Ptr + i * input0Stride);
unit.mMatrixB->buffer().host = (uint8_t*)(input1Ptr + i * input1Stride);
unit.mMatrixC->buffer().host = (uint8_t*)(outputPtr + i * outputStride);
unit.mMatMul->onExecute(unit.mTempInputs, unit.mTempOutputs);
::memcpy(outputPtr + i * outputStride, unit.mMatrixC->host<float>(), outputStride * sizeof(float));
}
}
MNN_CONCURRENCY_END();
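
The onExecute change above replaces the per-batch memcpy into scratch tensors with simply re-pointing the unit's tensors at the current batch slice before running the inner MatMul. A rough sketch of that zero-copy slicing idea with plain pointers (not MNN tensors):

#include <cstdio>
#include <vector>

// Toy "unit" that consumes whatever its views currently point at.
struct Unit {
    const float* A = nullptr;
    const float* B = nullptr;
    float*       C = nullptr;
    void run(int n) {                      // pretend MatMul: C[i] = A[i] + B[i]
        for (int i = 0; i < n; ++i) {
            C[i] = A[i] + B[i];
        }
    }
};

int main() {
    const int batch = 4, stride = 8;       // stride plays the role of input0Stride etc.
    std::vector<float> in0(batch * stride, 1.f);
    std::vector<float> in1(batch * stride, 2.f);
    std::vector<float> out(batch * stride, 0.f);
    Unit unit;
    for (int b = 0; b < batch; ++b) {
        // Re-point the views instead of copying each slice into scratch buffers.
        unit.A = in0.data() + b * stride;
        unit.B = in1.data() + b * stride;
        unit.C = out.data() + b * stride;
        unit.run(stride);
    }
    printf("out[0] = %f\n", out[0]);       // 3.000000
    return 0;
}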

View File

@ -25,8 +25,7 @@ public:
auto srcData = input->host<srcT>();
auto dstData = output->host<dstT>();
const auto inputDataSize = input->elementSize();
const auto outputDataSize = output->elementSize();
MNN_ASSERT(inputDataSize == outputDataSize);
MNN_ASSERT(inputDataSize == output->elementSize());
for (int i = 0; i < inputDataSize; i++) {
dstData[i] = static_cast<dstT>(srcData[i]);
}
@ -46,8 +45,7 @@ public:
auto srcData = input->host<int>();
auto dstData = output->host<int>();
const auto inputDataSize = input->elementSize();
const auto outputDataSize = output->elementSize();
MNN_ASSERT(inputDataSize == outputDataSize);
MNN_ASSERT(inputDataSize == output->elementSize());
for (int i = 0; i < inputDataSize; i++) {
int value = srcData[i] == 0 ? 0 : 1;
dstData[i] = value;

View File

@ -29,8 +29,7 @@ CPUEltwise::CPUEltwise(Backend *b, EltwiseType type, std::vector<float> coef) :
ErrorCode CPUEltwise::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto inputTensor = inputs[0];
const int size = inputTensor->elementSize();
auto outputSize = outputs[0]->elementSize();
MNN_ASSERT(outputSize == size);
MNN_ASSERT(outputs[0]->elementSize() == size);
auto outputTensor = outputs[0];
auto outputHost = outputTensor->host<float>();

View File

@ -34,6 +34,12 @@ CPUInterp::CPUInterp(Backend *backend, int resizeType,
}
CPUInterp::~CPUInterp() {
if (mInit && mResizeType == 2) {
backend()->onReleaseBuffer(&mWidthPosition, Backend::STATIC);
backend()->onReleaseBuffer(&mWidthFactor, Backend::STATIC);
backend()->onReleaseBuffer(&mHeightPosition, Backend::STATIC);
backend()->onReleaseBuffer(&mHeightFactor, Backend::STATIC);
}
}
ErrorCode CPUInterp::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
@ -61,6 +67,9 @@ ErrorCode CPUInterp::onExecute(const std::vector<Tensor *> &inputs, const std::v
}
ErrorCode CPUInterp::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
if (mResizeType != 2) {
return NO_ERROR;
}
const int inW = inputs[0]->buffer().dim[3].extent;
const int inH = inputs[0]->buffer().dim[2].extent;
const int outW = outputs[0]->buffer().dim[3].extent;
@ -96,9 +105,6 @@ ErrorCode CPUInterp::onResize(const std::vector<Tensor *> &inputs, const std::ve
if (!res) {
return OUT_OF_MEMORY;
}
if (mResizeType != 2) {
return NO_ERROR;
}
mInit = true;
auto _wPosition = mWidthPosition.host<int>();

View File

@ -73,11 +73,12 @@ static void _TransposePackC4MultiThread(const float* BPtr, float* BTempPtr, int
}
}
void CPUMatMul::_scheduleForVecE(float* C, const float* A, const float* B, const float* biasPtr, int e, int l, int h) {
void CPUMatMul::_scheduleForVecE(float* C, const float* biasPtr, int e, int l, int h) {
int numberThread = mSupportMultiThread ? static_cast<CPUBackend*>(backend())->threadNumber() : 1;
MNN_ASSERT(e == 1);
if (mTransposeB) {
mPostFunctions.emplace_back(std::make_pair([C, A, B, h, l, numberThread, biasPtr](int tId) {
mPostFunctions.emplace_back(std::make_pair([h, l, numberThread, biasPtr](
int tId, const float* A, const float* B, float* C) {
auto lC4 = l / 4;
auto lR = lC4 * 4;
for (int y=tId; y<h; y+=numberThread) {
@ -97,7 +98,8 @@ void CPUMatMul::_scheduleForVecE(float* C, const float* A, const float* B, const
}
}, numberThread));
} else {
mPostFunctions.emplace_back(std::make_pair([C, A, B, h, l, numberThread, biasPtr](int tId) {
mPostFunctions.emplace_back(std::make_pair([h, l, numberThread, biasPtr](
int tId, const float* A, const float* B, float* C) {
auto hC4 = h / 4;
auto hR = hC4 * 4;
for (int y=tId; y<hC4; y+=numberThread) {
@ -128,7 +130,7 @@ void CPUMatMul::_scheduleForVecE(float* C, const float* A, const float* B, const
}
}
void CPUMatMul::_scheduleForVec(float* C, const float* A, const float* B, const float* biasPtr, int e, int l, int h) {
void CPUMatMul::_scheduleForVec(float* C, const float* biasPtr, int e, int l, int h) {
int numberThread = mSupportMultiThread ? static_cast<CPUBackend*>(backend())->threadNumber() : 1;
// TODO: Support e = 1
MNN_ASSERT(h == 1);
@ -137,7 +139,8 @@ void CPUMatMul::_scheduleForVec(float* C, const float* A, const float* B, const
biasValue = *biasPtr;
}
if (mTransposeA) {
mPostFunctions.emplace_back(std::make_pair([C, A, B, e, l, numberThread, biasValue](int tId) {
mPostFunctions.emplace_back(std::make_pair([e, l, numberThread, biasValue](
int tId, const float* A, const float* B, float* C) {
auto eC4 = e / 4;
auto eR = eC4 * 4;
for (int y=tId; y<eC4; y+=numberThread) {
@ -160,7 +163,8 @@ void CPUMatMul::_scheduleForVec(float* C, const float* A, const float* B, const
}
}, numberThread));
} else {
mPostFunctions.emplace_back(std::make_pair([C, A, B, e, l, numberThread, biasValue](int tId) {
mPostFunctions.emplace_back(std::make_pair([e, l, numberThread, biasValue](
int tId, const float* A, const float* B, float* C) {
auto lC4 = l / 4;
auto lR = lC4 * 4;
for (int y=tId; y<e; y+=numberThread) {
@ -182,11 +186,8 @@ void CPUMatMul::_scheduleForVec(float* C, const float* A, const float* B, const
ErrorCode CPUMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
const Tensor* A = inputs[0];
const Tensor* B = inputs[1];
auto APtr = A->host<float>();
auto BPtr = B->host<float>();
Tensor* C = outputs[0];
auto CPtr = C->host<float>();
MNN_ASSERT(BPtr != nullptr && APtr != nullptr && CPtr != nullptr);
// Fill output by zero if one of inputs is empty.
if (A->elementSize() == 0 || B->elementSize() == 0) {
return NO_ERROR;
@ -209,7 +210,7 @@ ErrorCode CPUMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vec
auto bias = inputs[2];
biasPtr = bias->host<float>();
}
_scheduleForVec(C->host<float>(), A->host<float>(), B->host<float>(), biasPtr, e, l, h);
_scheduleForVec(C->host<float>(), biasPtr, e, l, h);
return NO_ERROR;
}
if (e == 1) {
@ -218,7 +219,7 @@ ErrorCode CPUMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vec
auto bias = inputs[2];
biasPtr = bias->host<float>();
}
_scheduleForVecE(C->host<float>(), A->host<float>(), B->host<float>(), biasPtr, e, l, h);
_scheduleForVecE(C->host<float>(), biasPtr, e, l, h);
return NO_ERROR;
}
int eP, lP, hP;
@ -235,7 +236,7 @@ ErrorCode CPUMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vec
auto hC4 = UP_DIV(h, 4);
auto lC4 = UP_DIV(l, 4);
int numberThread = mSupportMultiThread ? ((CPUBackend*)backend())->threadNumber() : 1;
mPreFunctions.emplace_back(std::make_pair([BPtr, BTempPtr, l, h, this] (int tId) {
mPreFunctions.emplace_back(std::make_pair([BTempPtr, l, h, this] (int tId, const float* APtr, const float* BPtr) {
MNNPackForMatMul_B(BTempPtr, BPtr, h, l, mTransposeB);
} , 1));
res = backend()->onAcquireBuffer(AT.get(), Backend::DYNAMIC);
@ -246,12 +247,13 @@ ErrorCode CPUMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vec
auto ATPtr = AT->host<float>();
if (mTransposeA) {
// l, e -> lC4, e, 4
mPreFunctions.emplace_back(std::make_pair([ATPtr, APtr, e, l](int tId) {
mPreFunctions.emplace_back(std::make_pair([ATPtr, e, l](int tId, const float* APtr, const float* BPtr) {
MNNPackC4(ATPtr, APtr, e, l);
}, 1));
} else {
// e, l -> lC4, e, 4
mPreFunctions.emplace_back(std::make_pair([ATPtr, APtr, e, l, lC4, numberThread](int tId) {
mPreFunctions.emplace_back(std::make_pair(
[ATPtr, e, l, lC4, numberThread](int tId, const float* APtr, const float* BPtr) {
_TransposePackC4MultiThread(APtr, ATPtr, tId, lC4, e, l, numberThread);
}, numberThread));
}
@ -270,7 +272,8 @@ ErrorCode CPUMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vec
}
auto borigin = bias->host<float>();
auto bdest = biasWrap->host<float>();
mPreFunctions.emplace_back(std::make_pair([borigin, biasLength, bdest](int tId) {
mPreFunctions.emplace_back(std::make_pair(
[borigin, biasLength, bdest](int tId, const float* APtr, const float* BPtr) {
::memset(bdest, 0, UP_DIV(biasLength, 4) * 4 * sizeof(float));
::memcpy(bdest, borigin, biasLength * sizeof(float));
}, 1));
@ -292,7 +295,8 @@ ErrorCode CPUMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vec
auto CTPtr = CT->host<float>();
// hC4, e, 4 -> e, h
mPostFunctions.emplace_back(std::make_pair([CPtr, CTPtr, e, h, hC4, numberThread](int tId) {
mPostFunctions.emplace_back(std::make_pair([CTPtr, e, h, hC4, numberThread](
int tId, const float* APtr, const float* BPtr, float* CPtr) {
_TransposeUnpackC4MultiThread(CPtr, CTPtr, tId, hC4, e, h, numberThread);
}, numberThread));
backend()->onReleaseBuffer(AT.get(), Backend::DYNAMIC);
@ -308,16 +312,21 @@ ErrorCode CPUMatMul::onExecute(const std::vector<Tensor*>& inputs, const std::ve
::memset(outputs[0]->host<char>(), 0, outputs[0]->size());
return NO_ERROR;
}
auto APtr = inputs[0]->host<float>();
auto BPtr = inputs[1]->host<float>();
auto CPtr = outputs[0]->host<float>();
for (auto& f : mPreFunctions) {
MNN_CONCURRENCY_BEGIN(tId, f.second) {
f.first(tId);
f.first(tId, APtr, BPtr);
}
MNN_CONCURRENCY_END();
}
mComputer->onExecute();
for (auto& f : mPostFunctions) {
MNN_CONCURRENCY_BEGIN(tId, f.second) {
f.first(tId);
f.first(tId, APtr, BPtr, CPtr);
}
MNN_CONCURRENCY_END();
}
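
The thread running through this refactor is that the pre/post lambdas no longer capture the A/B/C host pointers at resize time; onExecute looks the current pointers up and passes them in, so the recorded schedule stays valid even if the backing memory is re-assigned between resize and execute. A simplified sketch of that deferred-binding pattern (not the real CPUMatMul):

#include <cstdio>
#include <functional>
#include <utility>
#include <vector>

using PreFunc = std::function<void(int, const float*, const float*)>;

int main() {
    // "Resize" time: record work items with shapes baked in, but no data pointers.
    std::vector<std::pair<PreFunc, int>> preFunctions;   // (function, thread count)
    const int e = 4, l = 3;
    preFunctions.emplace_back([e, l](int tId, const float* A, const float* B) {
        // Pack / transpose A here; A and B are whatever the tensors point at right now.
        printf("tid %d packs A (%d x %d), A=%p B=%p\n", tId, e, l, (void*)A, (void*)B);
    }, 1);

    // "Execute" time: fetch the current host pointers and hand them to every stage.
    std::vector<float> bufA(e * l, 1.0f), bufB(l * 4, 2.0f);
    const float* APtr = bufA.data();
    const float* BPtr = bufB.data();
    for (auto& f : preFunctions) {
        for (int tId = 0; tId < f.second; ++tId) {
            f.first(tId, APtr, BPtr);
        }
    }
    return 0;
}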

View File

@ -23,13 +23,13 @@ public:
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
void _scheduleForVec(float* C, const float* A, const float* B, const float* biasPtr, int e, int l, int h);
void _scheduleForVecE(float* C, const float* A, const float* B, const float* biasPtr, int e, int l, int h);
void _scheduleForVec(float* C, const float* biasPtr, int e, int l, int h);
void _scheduleForVecE(float* C, const float* biasPtr, int e, int l, int h);
bool mTransposeA;
bool mTransposeB;
bool mSupportMultiThread = false;
std::vector<std::pair<std::function<void(int)>, int>> mPreFunctions;
std::vector<std::pair<std::function<void(int)>, int>> mPostFunctions;
std::vector<std::pair<std::function<void(int, const float*, const float*)>, int>> mPreFunctions;
std::vector<std::pair<std::function<void(int, const float*, const float*, float*)>, int>> mPostFunctions;
std::shared_ptr<StrassenMatrixComputor> mComputer;
};
} // namespace MNN

View File

@ -32,6 +32,7 @@ extern void ___CPUQuantizedSoftmaxCreator__OpType_QuantizedSoftmax__();
extern void ___CPUPoolCreator__OpType_Pooling__();
extern void ___CPUScatterNdCreator__OpType_ScatterNd__();
extern void ___CPUShapeCreator__OpType_Shape__();
extern void ___CPUPluginCreator__OpType_Plugin__();
extern void ___CPUInt8ToFloatCreator__OpType_Int8ToFloat__();
extern void ___CPUROIPoolingCreator__OpType_ROIPooling__();
extern void ___CPUTopKV2Creator__OpType_TopKV2__();
@ -105,6 +106,7 @@ ___CPUQuantizedSoftmaxCreator__OpType_QuantizedSoftmax__();
___CPUPoolCreator__OpType_Pooling__();
___CPUScatterNdCreator__OpType_ScatterNd__();
___CPUShapeCreator__OpType_Shape__();
___CPUPluginCreator__OpType_Plugin__();
___CPUInt8ToFloatCreator__OpType_Int8ToFloat__();
___CPUROIPoolingCreator__OpType_ROIPooling__();
___CPUTopKV2Creator__OpType_TopKV2__();

View File

@ -52,8 +52,7 @@ ErrorCode CPUOneHot::onExecute(const std::vector<Tensor*>& inputs, const std::ve
const auto indicesPtr = indices->host<int>();
auto dataType = onValueTensor->getType();
auto offDataType = offValueTensor->getType();
MNN_ASSERT(dataType == offDataType);
MNN_ASSERT(offValueTensor->getType() == dataType);
if (dataType == halide_type_of<float>()) {
OneHotImpl<float>(depth, outerSize, innerSize, indicesPtr, onValueTensor, offValueTensor, outputs[0]);

View File

@ -6,17 +6,18 @@
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifdef MNN_WITH_PLUGIN
#include "backend/cpu/CPUBackend.hpp"
#include "core/AutoStorage.h"
#include "core/Execution.hpp"
#ifdef MNN_WITH_PLUGIN
#include "MNN/plugin/PluginContext.hpp"
#include "MNN/plugin/PluginKernel.hpp"
#endif // MNN_WITH_PLUGIN
namespace MNN {
#ifdef MNN_WITH_PLUGIN
static std::shared_ptr<plugin::CPUComputeKernel> getCPUComputeKernel( // NOLINT
const std::string& name) { // NOLINT
return std::shared_ptr<plugin::CPUComputeKernel>( // NOLINT
@ -55,12 +56,14 @@ ErrorCode CPUPlugin::onExecute(const std::vector<Tensor*>& inputs, // NOLINT
return INVALID_VALUE;
}
}
#endif // MNN_WITH_PLUGIN
class CPUPluginCreator : public CPUBackend::Creator {
public:
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, // NOLINT
const std::vector<Tensor*>& outputs, // NOLINT
const MNN::Op* op, Backend* backend) const {
#ifdef MNN_WITH_PLUGIN
MNN_ASSERT(op->type() == OpType_Plugin);
// Plugin op should have inputs or outputs, or both of them.
MNN_CHECK(inputs.size() > 0 || outputs.size() > 0, // NOLINT
@ -76,11 +79,13 @@ public:
ctx->setAttr(attr->key()->str(), attr);
}
return new CPUPlugin(std::move(ctx));
#else
MNN_ERROR("Plugin is not supported. Please recompile with `MNN_WITH_PLUGIN` enabled.");
return nullptr;
#endif // MNN_WITH_PLUGIN
}
};
REGISTER_CPU_OP_CREATOR(CPUPluginCreator, OpType_Plugin);
} // namespace MNN
#endif // MNN_WITH_PLUGIN
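
The restructuring keeps the Plugin creator registered whether or not MNN_WITH_PLUGIN is defined; only the creation step differs, failing with a clear message when the feature is compiled out. A minimal sketch of that compile-time feature-gate pattern (type and macro names here are illustrative):

#include <cstdio>

// Flip this define to see both behaviours.
// #define WITH_PLUGIN

struct Execution {
    virtual ~Execution() = default;
};
struct PluginExecution : Execution {
};

// The creator is always registered; the feature gate only decides what it returns.
static Execution* createPlugin() {
#ifdef WITH_PLUGIN
    return new PluginExecution;
#else
    printf("Plugin is not supported. Please recompile with `WITH_PLUGIN` enabled.\n");
    return nullptr;   // graceful failure instead of an unknown op type at load time
#endif
}

int main() {
    Execution* exe = createPlugin();
    delete exe;       // deleting nullptr is a no-op
    return 0;
}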

View File

@ -101,7 +101,6 @@ static void poolingMax(const float *channelInput, int inputWidth, int inputHeigh
channelInput + (padTop * strideHeight - padHeight) * inputStep4 + (padLeft * strideWidth - padWidth) * 4;
float *lineOutput = channelOutput + padTop * outputStep4 + padLeft * 4;
int wCount = padRight - padLeft;
int iwStart = -padWidth + padLeft * strideWidth;
int wCountC4 = wCount / 4;
int wCountRemain = wCount - wCountC4 * 4;
int strideWidthFuse = strideWidth4 * 4;

View File

@ -54,11 +54,8 @@ ErrorCode CPUQuantizedAdd::onResize(const std::vector<Tensor *> &inputs, const s
mLeftShiftResult1 = (1 << leftShift) * ((1 << leftShift1));
mLeftShiftResult2 = (1 << leftShift) * ((1 << leftShift2));
const int left1 = leftShift + leftShift1;
const int left2 = leftShift + leftShift2;
MNN_ASSERT(left1 == leftShift);
MNN_ASSERT(left2 == leftShift);
MNN_ASSERT(leftShift + leftShift1 == leftShift);
MNN_ASSERT(leftShift + leftShift2 == leftShift);
return NO_ERROR;
}

View File

@ -62,7 +62,6 @@ ErrorCode CPUROIPooling::onExecute(const std::vector<Tensor *> &inputs, const st
auto ow = output->width(), oh = output->height(), os = ow * oh * 4;
auto slice = UP_DIV(input->channel(), 4);
auto numROI = inputs[1]->batch();
auto batchSize = input->batch();
for (int n = 0; n < numROI; ++n) {
auto batchOutput = output->host<float>() + output->buffer().dim[0].stride * n;
@ -72,7 +71,7 @@ ErrorCode CPUROIPooling::onExecute(const std::vector<Tensor *> &inputs, const st
int y1 = round(roiPtr[2] * mSpatialScale);
int x2 = round(roiPtr[3] * mSpatialScale);
int y2 = round(roiPtr[4] * mSpatialScale);
MNN_ASSERT(roi < batchSize);
MNN_ASSERT(roi < input->batch());
int roiW = max(x2 - x1 + 1, 1);
int roiH = max(y2 - y1 + 1, 1);

View File

@ -10,11 +10,10 @@
namespace MNN {
ErrorCode CPUSelect::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto inSize0 = inputs[0]->elementSize();
auto inSize1 = inputs[1]->elementSize();
auto inSize2 = inputs[2]->elementSize();
auto outSize = outputs[0]->elementSize();
MNN_ASSERT(inSize0 == outSize);
MNN_ASSERT(inputs[0]->elementSize() == outSize);
MNN_ASSERT(inSize1 == 1 || inSize1 == outSize);
MNN_ASSERT(inSize2 == 1 || inSize2 == outSize);
auto output = outputs[0]->host<float>();

View File

@ -9,6 +9,8 @@
#include "backend/cpu/CPUTopKV2.hpp"
#include "backend/cpu/CPUBackend.hpp"
#include "core/Macro.h"
#include "core/Concurrency.h"
#include "backend/cpu/compute/CommonOptFunction.h"
namespace MNN {
@ -98,8 +100,60 @@ ErrorCode CPUTopKV2::onExecute(const std::vector<Tensor*>& inputs, const std::ve
const int inputDimension = inputTensor->buffer().dimensions;
const int rowSize = inputTensor->buffer().dim[inputDimension - 1].extent;
const int rowC4Blocks = rowSize / 4;
const int rowRemain = rowSize % 4;
const int rowC4ElementSize = rowC4Blocks * 4;
MNN_ASSERT(k <= rowSize);
const int numRows = inputTensor->elementSize() / rowSize;
if (k == 1) {
if (halide_type_float == inputTensor->getType().code) {
float* inputData = inputTensor->host<float>();
float* topkData = outputData->host<float>();
int32_t* indicesData = outputIndices->host<int32_t>();
MNN_CONCURRENCY_BEGIN(i, numRows) {
float* inputRowData = inputData + i * rowSize;
float* rowTopkData = topkData + i * k;
int32_t* rowTopkIndexData = indicesData + i * k;
MNNVectorTop1Float(inputRowData, rowTopkData, rowTopkIndexData, rowC4Blocks);
for (int j = 0; j < rowRemain; j++) {
int index = rowC4ElementSize + j;
float value = inputRowData[index];
if (value > rowTopkData[0]) {
rowTopkData[0] = value;
rowTopkIndexData[0] = index;
}
}
}
MNN_CONCURRENCY_END();
} else if (halide_type_int == inputTensor->getType().code && 32 == inputTensor->getType().bits) {
int32_t* inputData = inputTensor->host<int32_t>();
int32_t* topkData = outputData->host<int32_t>();
int32_t* indicesData = outputIndices->host<int32_t>();
MNN_CONCURRENCY_BEGIN(i, numRows) {
int32_t* inputRowData = inputData + i * rowSize;
int32_t* rowTopkData = topkData + i * k;
int32_t* rowTopkIndexData = indicesData + i * k;
MNNVectorTop1Int32(inputRowData, rowTopkData, rowTopkIndexData, rowC4Blocks);
for (int j = 0; j < rowRemain; j++) {
int index = rowC4ElementSize + j;
int32_t value = inputRowData[index];
if (value > rowTopkData[0]) {
rowTopkData[0] = value;
rowTopkIndexData[0] = index;
}
}
}
MNN_CONCURRENCY_END();
} else {
MNN_PRINT("TopKV2 data type not supported\n");
MNN_ASSERT(false);
}
return NO_ERROR;
}
if (halide_type_float == inputTensor->getType().code) {
auto inputData = inputTensor->host<float>();
auto topkData = outputData->host<float>();

View File

@ -26,8 +26,7 @@ CPUUnary::CPUUnary(Backend *b, UnaryOpOperation type) : MNN::Execution(b), mType
ErrorCode CPUUnary::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
MNN_ASSERT(1 == outputs.size());
auto dtype = inputs[0]->getType();
MNN_ASSERT(dtype == halide_type_of<float>() || dtype == halide_type_of<int32_t>());
MNN_ASSERT(inputs[0]->getType() == halide_type_of<float>() || inputs[0]->getType() == halide_type_of<int32_t>());
return NO_ERROR;
}

View File

@ -13,7 +13,6 @@ namespace MNN {
ErrorCode CPUWhere::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
auto& ib = inputs[0]->buffer();
auto& ob = outputs[0]->buffer();
int32_t* inputData = inputs[0]->host<int32_t>();
auto outputData = outputs[0]->host<int32_t>();
auto inputTotal = inputs[0]->elementSize();
@ -25,7 +24,7 @@ ErrorCode CPUWhere::onExecute(const std::vector<Tensor*>& inputs, const std::vec
}
}
MNN_ASSERT(ob.dim[0].extent == trueVec.size());
MNN_ASSERT(outputs[0]->batch() == trueVec.size());
for (int i = 0; i < trueVec.size(); i++) {
int index = trueVec[i];
for (int j = 0; j < ib.dimensions; j++) {

View File

@ -191,10 +191,7 @@ ThreadPool::ThreadPool(int numberThread) {
}
ThreadPool::~ThreadPool() {
{
std::lock_guard<std::mutex> _l(mQueueMutex);
mStop = true;
}
mStop = true;
mCondition.notify_all();
for (auto& worker : mWorkers) {
worker.join();
@ -234,10 +231,8 @@ void ThreadPool::active() {
if (nullptr == gInstance) {
return;
}
{
std::lock_guard<std::mutex> _l(gInstance->mQueueMutex);
gInstance->mActiveCount++;
}
gInstance->mActiveCount++;
std::lock_guard<std::mutex> _l(gInstance->mQueueMutex);
gInstance->mCondition.notify_all();
}
void ThreadPool::deactive() {

View File

@ -0,0 +1,83 @@
//
// MNNVectorTop1Float.S
// MNN
//
// Created by MNN on 2020/12/08.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifdef __arm__
#ifndef __aarch64__
#include "MNNAsmGlobal.h"
.text
.align 5
asm_function MNNVectorTop1Float
// void MNNVectorTop1Float(float* input, float* maxValue, int32_t* maxIndex, size_t inputCountUnit);
push {r4-r11, lr}
// Auto: r0: input, r1: maxValue, r2: maxIndex, r3: inputCountUnit
// q15 maxValue
vld1.f32 {q15}, [r0]
// q14 maxIndex
mov r11, #0
vmov.s32 d28[0], r11
mov r11, #1
vmov.s32 d28[1], r11
mov r11, #2
vmov.s32 d29[0], r11
mov r11, #3
vmov.s32 d29[1], r11
// q11 current index
vmov.s32 q11, q14
// all 4, increment
mov r11, #4
vmov.s32 d20[0], r11
vmov.s32 d20[1], r11
vmov.s32 d21[0], r11
vmov.s32 d21[1], r11
cmp r3, #0
beq End
Loop:
vld1.f32 {q13}, [r0]!
vcgt.f32 q12, q13, q15
vbit.f32 q15, q13, q12
vbit.s32 q14, q11, q12
vadd.s32 q11, q11, q10
subs r3, r3, #1
bne Loop
// reduce result to single value and index
vcgt.f32 d24, d31, d30
vbit.f32 d30, d31, d24
vbit.s32 d28, d29, d24
vtrn.f32 d30, d31
vtrn.s32 d28, d29
vcgt.f32 d24, d31, d30
vbit.f32 d30, d31, d24
vbit.s32 d28, d29, d24
vst1.f32 d30[0], [r1]
vst1.s32 d28[0], [r2]
End:
pop {r4-r11, pc}
#endif
#endif

View File

@ -0,0 +1,83 @@
//
// MNNVectorTop1Int32.S
// MNN
//
// Created by MNN on 2020/12/08.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifdef __arm__
#ifndef __aarch64__
#include "MNNAsmGlobal.h"
.text
.align 5
asm_function MNNVectorTop1Int32
// void MNNVectorTop1Int32(int32_t* input, int32_t* maxValue, int32_t* maxIndex, size_t inputCountUnit);
push {r4-r11, lr}
// Auto: r0: input, r1: maxValue, r2: maxIndex, r3: inputCountUnit
// q15 maxValue
vld1.s32 {q15}, [r0]
// q14 maxIndex
mov r11, #0
vmov.s32 d28[0], r11
mov r11, #1
vmov.s32 d28[1], r11
mov r11, #2
vmov.s32 d29[0], r11
mov r11, #3
vmov.s32 d29[1], r11
// q11 current index
vmov.s32 q11, q14
// all 4, increment
mov r11, #4
vmov.s32 d20[0], r11
vmov.s32 d20[1], r11
vmov.s32 d21[0], r11
vmov.s32 d21[1], r11
cmp r3, #0
beq End
Loop:
vld1.s32 {q13}, [r0]!
vcgt.s32 q12, q13, q15
vbit.s32 q15, q13, q12
vbit.s32 q14, q11, q12
vadd.s32 q11, q11, q10
subs r3, r3, #1
bne Loop
// reduce result to single value and index
vcgt.s32 d24, d31, d30
vbit.s32 d30, d31, d24
vbit.s32 d28, d29, d24
vtrn.s32 d30, d31
vtrn.s32 d28, d29
vcgt.s32 d24, d31, d30
vbit.s32 d30, d31, d24
vbit.s32 d28, d29, d24
vst1.s32 d30[0], [r1]
vst1.s32 d28[0], [r2]
End:
pop {r4-r11, pc}
#endif
#endif

View File

@ -16,9 +16,11 @@
asm_function MNNPackedMatMulRemain
//void MNNPackedMatMulRemain(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, float* cache, const float* postParameters, const float* bias);
//Auto x0: C, x1:A, x2:B, x3:eSize, x4:parameter, x5: cache, x6:postParameters, x7:bias
str x19, [sp, #-8]
str x20, [sp, #-16]
str x21, [sp, #-24]
sub sp, sp, #32
str x19, [sp, #0]
str x20, [sp, #8]
str x21, [sp, #16]
add sp, sp, #32
ldr x11, [x4, #0] // aStride
ldr x9, [x4, #8] // l
ldr x10, [x4, #16] // h
@ -530,9 +532,11 @@ LoopE1:
End:
ldr x19, [sp, #-8]
ldr x20, [sp, #-16]
ldr x21, [sp, #-24]
sub sp, sp, #32
ldr x19, [sp, #0]
ldr x20, [sp, #8]
ldr x21, [sp, #16]
add sp, sp, #32
ret

View File

@ -0,0 +1,83 @@
//
// MNNVectorTop1Float.S
// MNN
//
// Created by MNN on 2020/12/09.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifdef __aarch64__
#include "MNNAsmGlobal.h"
.text
.align 5
asm_function MNNVectorTop1Float
// void MNNVectorTop1Float(float* input, float* maxValue, int32_t* maxIndex, size_t inputCountUnit);
// Auto: x0: input, x1: maxValue, x2: maxIndex, x3: inputCountUnit
// v30 maxValue
ld1 {v30.4s}, [x0]
// v29 maxIndex
mov w11, #0
mov v29.s[0], w11
mov w11, #1
mov v29.s[1], w11
mov w11, #2
mov v29.s[2], w11
mov w11, #3
mov v29.s[3], w11
// v28 current index
mov v28.4s, v29.4s
// v27, all 4, increment
mov w11, #4
mov v27.s[0], w11
mov v27.s[1], w11
mov v27.s[2], w11
mov v27.s[3], w11
cmp x3, #0
beq End
Loop:
ld1 {v26.4s}, [x0], #16
fcmgt v25.4s, v26.4s, v30.4s
bit v30.16b, v26.16b, v25.16b
bit v29.16b, v28.16b, v25.16b
add v28.4s, v28.4s, v27.4s
subs x3, x3, #1
bne Loop
// reduce result to single value and index
mov v20.d[0], v30.d[1]
mov v21.d[0], v29.d[1]
fcmgt v25.2s, v20.2s, v30.2s
bit v30.8b, v20.8b, v25.8b
bit v29.8b, v21.8b, v25.8b
mov v20.s[0], v30.s[1]
mov v21.s[0], v29.s[1]
fcmgt v25.2s, v20.2s, v30.2s
bit v30.8b, v20.8b, v25.8b
bit v29.8b, v21.8b, v25.8b
st1 {v30.s}[0], [x1]
st1 {v29.s}[0], [x2]
End:
ret
#endif

View File

@ -0,0 +1,83 @@
//
// MNNVectorTop1Int32.S
// MNN
//
// Created by MNN on 2020/12/09.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifdef __aarch64__
#include "MNNAsmGlobal.h"
.text
.align 5
asm_function MNNVectorTop1Int32
// void MNNVectorTop1Int32(int32_t* input, int32_t* maxValue, int32_t* maxIndex, size_t inputCountUnit);
// Auto: x0: input, x1: maxValue, x2: maxIndex, x3: inputCountUnit
// v30 maxValue
ld1 {v30.4s}, [x0]
// v29 maxIndex
mov w11, #0
mov v29.s[0], w11
mov w11, #1
mov v29.s[1], w11
mov w11, #2
mov v29.s[2], w11
mov w11, #3
mov v29.s[3], w11
// v28 current index
mov v28.4s, v29.4s
// v27, all 4, increment
mov w11, #4
mov v27.s[0], w11
mov v27.s[1], w11
mov v27.s[2], w11
mov v27.s[3], w11
cmp x3, #0
beq End
Loop:
ld1 {v26.4s}, [x0], #16
cmgt v25.4s, v26.4s, v30.4s
bit v30.16b, v26.16b, v25.16b
bit v29.16b, v28.16b, v25.16b
add v28.4s, v28.4s, v27.4s
subs x3, x3, #1
bne Loop
// reduce result to single value and index
mov v20.d[0], v30.d[1]
mov v21.d[0], v29.d[1]
cmgt v25.2s, v20.2s, v30.2s
bit v30.8b, v20.8b, v25.8b
bit v29.8b, v21.8b, v25.8b
mov v20.s[0], v30.s[1]
mov v21.s[0], v29.s[1]
cmgt v25.2s, v20.2s, v30.2s
bit v30.8b, v20.8b, v25.8b
bit v29.8b, v21.8b, v25.8b
st1 {v30.s}[0], [x1]
st1 {v29.s}[0], [x2]
End:
ret
#endif

View File

@ -865,4 +865,37 @@ void MNNAxByClampBroadcastC4(float* C, const float* A, const float* B, size_t wi
}
}
}
void MNNVectorTop1Float(float* input, float* maxValue, int32_t* maxIndex, size_t inputCountUnit) {
float maxV = input[0];
int maxIdx = 0;
for (int i = 0; i < inputCountUnit; i++) {
int offset = i * UNIT;
for (int j = 0; j < UNIT; j++) {
if (input[offset + j] > maxV) {
maxV = input[offset + j];
maxIdx = offset + j;
}
}
}
maxValue[0] = maxV;
maxIndex[0] = maxIdx;
}
void MNNVectorTop1Int32(int32_t* input, int32_t* maxValue, int32_t* maxIndex, size_t inputCountUnit) {
int32_t maxV = input[0];
int maxIdx = 0;
for (int i = 0; i < inputCountUnit; i++) {
int offset = i * UNIT;
for (int j = 0; j < UNIT; j++) {
if (input[offset + j] > maxV) {
maxV = input[offset + j];
maxIdx = offset + j;
}
}
}
maxValue[0] = maxV;
maxIndex[0] = maxIdx;
}
#endif

View File

@ -91,6 +91,9 @@ void MNNAxByClampBroadcastC4(float* C, const float* A, const float* B, size_t wi
// dim: 4-element, sizeDW, sizeDH, strideSW, strideDH
void MNNTranspose32Bit(int32_t* dstO, const int32_t* srcO, int32_t* dim); // not C4
void MNNVectorTop1Float(float* input, float* maxValue, int32_t* maxIndex, size_t inputCountUnit);
void MNNVectorTop1Int32(int32_t* input, int32_t* maxValue, int32_t* maxIndex, size_t inputCountUnit);
#ifdef __cplusplus
}
#endif

View File

@ -354,7 +354,7 @@ ErrorCode ConvInt83x3::onExecute(const std::vector<Tensor *> &inputs, const std:
const int ow = output->width(), oh = output->height();
const int iw = input->width(), ih = input->height();
const int dc_4 = UP_DIV(output->channel(), 4);
const int padX = mPadX, padY = mPadY, kernelSize = 9;
const int padX = mPadX, padY = mPadY;
const bool combine1D2D = (mStrategy.unitType == ComputeStrategy::D2_D1);
const bool offline = (mStrategy.transPhase == ComputeStrategy::Offline);
@ -373,7 +373,6 @@ ErrorCode ConvInt83x3::onExecute(const std::vector<Tensor *> &inputs, const std:
for (int b = 0; b < input->batch(); ++b) {
auto src = input->host<int8_t>() + b * input->stride(0);
auto dst = mTempInput->host<int8_t>() + b * mTempInput->stride(0);
const int threadNumber = ((CPUBackend*)backend())->threadNumber();
const int ic8 = UP_DIV(input->channel(), 8), ic4 = UP_DIV(input->channel(), 4);
// C4 to C8
MNN_CONCURRENCY_BEGIN(tId, threadNumber) {
@ -592,7 +591,7 @@ ErrorCode ConvInt83x3::onExecute(const std::vector<Tensor *> &inputs, const std:
auto gemmConcurrencyFunc = [=, &gemmFunc](int xC, int gemmNum, const int8_t* srcOrigin, const int8_t* weight, float* dstOrigin) {
MNN_CONCURRENCY_BEGIN(tId, threadNumber) {
const int step = UP_DIV(gemmNum, threadNumber);
gemmFunc(xC, tId * step, ALIMIN((tId + 1) * step, gemmNum), srcOrigin, weight, dstOrigin);
gemmFunc(xC, (int)tId * step, ALIMIN((tId + 1) * step, gemmNum), srcOrigin, weight, dstOrigin);
}
MNN_CONCURRENCY_END()
};

View File

@ -267,7 +267,6 @@ ErrorCode Convolution1x1Strassen::onExecute(const std::vector<Tensor *> &inputs,
MNN_CONCURRENCY_END();
auto batch = input->batch();
auto matrixSizeE = output->height() * output->width() * input->batch();
auto outputPlane = output->height() * output->width();
auto ocC4 = UP_DIV(output->channel(), 4);
MNN_CONCURRENCY_BEGIN(y, ocC4) {

View File

@ -15,8 +15,7 @@ namespace MNN {
ConvolutionGroup::ConvolutionGroup(Backend *b, const std::vector<std::shared_ptr<Execution>> &subConvolution)
: MNN::Execution(b) {
mSubConvolution = subConvolution;
auto group = subConvolution.size();
MNN_ASSERT(group > 1);
MNN_ASSERT(subConvolution.size() > 1);
mInputRaw.reset(new Tensor(4));
mInputUnit.reset(new Tensor(4, Tensor::CAFFE_C4));

View File

@ -118,7 +118,6 @@ ConvolutionTiledExecutor::~ConvolutionTiledExecutor() {
ErrorCode ConvolutionTiledExecutorBasic::onResize(const std::vector<Tensor*>& inputs,
const std::vector<Tensor*>& outputs) {
CPUConvolution::onResize(inputs, outputs);
auto layer = mCommon;
auto input = inputs[0];
auto weight = inputs[1];
Tensor* bias = nullptr;

View File

@ -114,7 +114,6 @@ ErrorCode ConvolutionWinograd::onExecute(const std::vector<Tensor *> &inputs, co
MNNGetMatMulPackMode(&ePack, &lPack, &hPack);
auto srcUnit2 = srcUnit * srcUnit;
auto dstUnit2 = dstUnit * dstUnit;
int ow = output->width();
int oh = output->height();
@ -137,7 +136,6 @@ ErrorCode ConvolutionWinograd::onExecute(const std::vector<Tensor *> &inputs, co
int tileCount = UP_DIV(totalCount, ePack);
int eRemain = totalCount % ePack;
threadNumber = std::min(threadNumber, tileCount);
auto hDiv = MNNGetC4DivNumber(hPack);
std::vector<size_t> parameters(6);
parameters[0] = eRemain * sizeof(float);
parameters[1] = input->channel();
@ -277,7 +275,6 @@ ErrorCode ConvolutionWinograd::onExecute(const std::vector<Tensor *> &inputs, co
for (int z = 0; z < dc_4; ++z) {
auto dstZAddr = dstStart + z * dstZStep;
auto srcZ = srcXi + z * srcZStep;
auto biasZ = bias + 4 * z;
// Transform
for (int i = 0; i < srcUnit; ++i) {
mDestTransform(srcZ + i * unitStep, midBuffer0 + i * dstUnit * 4,
@ -324,7 +321,7 @@ ErrorCode ConvolutionWinograd::onExecute(const std::vector<Tensor *> &inputs, co
MNN_CONCURRENCY_END();
MNN_CONCURRENCY_BEGIN(tId, threadNumber) {
for (int dy=tId; dy < dc_4; dy += threadNumber) {
for (int dy=(int)tId; dy < dc_4; dy += threadNumber) {
postFunction(dstOrigin + 4 * ow * oh * dy, bias + 4* dy, ow * oh, 1);
}
}

View File

@ -21,20 +21,7 @@
#endif
bool MNNReorder4x4ByPlatform(float* dst, size_t number) {
for (int i = 0; i < number; ++i) {
auto addr = dst + 16 * i;
auto s0 = _mm_loadu_ps(addr + 4 * 0);
auto s1 = _mm_loadu_ps(addr + 4 * 1);
auto s2 = _mm_loadu_ps(addr + 4 * 2);
auto s3 = _mm_loadu_ps(addr + 4 * 3);
_MM_TRANSPOSE4_PS(s0, s1, s2, s3);
_mm_storeu_ps(addr + 4 * 0, s0);
_mm_storeu_ps(addr + 4 * 1, s1);
_mm_storeu_ps(addr + 4 * 2, s2);
_mm_storeu_ps(addr + 4 * 3, s3);
}
return true;
return _SSE_MNNReorder4x4ByPlatform(dst, number);
}
struct FunctionGroup {
@ -60,6 +47,7 @@ struct FunctionGroup {
size_t weight_depth_offset) = _SSE_MNNGemmFloatCommon_4;
void (*MNNPackC4ForMatMul_A)(float* dest, const float* source, size_t e, size_t l,
size_t eReal) = _SSE_MNNPackC4ForMatMul_A;
void (*MNNPackForMatMul_B)(float* dest, const float* source, size_t h, size_t l, bool transpose) = _SSE_MNNPackForMatMul_B;
void (*MNNPackedMatMul)(float* C, const float* A, const float* B, const size_t* parameter, float* cache,
const float* postParameters, const float* bias) = _SSE_MNNPackedMatMul;
void (*MNNPackedMatMulRemain)(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter,
@ -144,167 +132,16 @@ void MNNMatrixSub(float* C, const float* A, const float* B, size_t widthC4, size
gFunc.MNNMatrixSub(C, A, B, widthC4, cStride, aStride, bStride, height);
}
#include <algorithm>
#include <cmath>
void MNNReluWithSlopeChannel(float* dst, const float* src, const float* slope, size_t sizeQuad, size_t depthQuad) {
return _SSE_MNNReluWithSlopeChannel(dst, src, slope, sizeQuad, depthQuad);
}
void MNNPackC4(float* dst, const float* src, size_t area, size_t depth) {
auto areaC4 = area / 4;
auto depthC4 = depth / 4;
for (int z = 0; z < depthC4; ++z) {
auto dstPlane = dst + z * area * 4;
auto srcPlane = src + z * area * 4;
for (int x = 0; x < areaC4; ++x) {
auto s = srcPlane + 4 * x;
auto d = dstPlane + 16 * x;
auto s0 = _mm_loadu_ps(s + 0 * area);
auto s1 = _mm_loadu_ps(s + 1 * area);
auto s2 = _mm_loadu_ps(s + 2 * area);
auto s3 = _mm_loadu_ps(s + 3 * area);
_MM_TRANSPOSE4_PS(s0, s1, s2, s3);
_mm_storeu_ps(d + 4 * 0, s0);
_mm_storeu_ps(d + 4 * 1, s1);
_mm_storeu_ps(d + 4 * 2, s2);
_mm_storeu_ps(d + 4 * 3, s3);
}
}
auto areaRemain = areaC4 * 4;
auto depthRemain = depthC4 * 4;
// Down
int remain = depth - depthRemain;
if (remain > 0) {
float* dstPlane = depthC4 * area * 4 + dst;
const float* srcPlane = src + depthC4 * area * 4;
for (int x = 0; x < area; ++x) {
for (int y = 0; y < remain; y++) {
dstPlane[4 * x + y] = srcPlane[y * area + x];
}
for (int y = remain; y < 4; y++) {
dstPlane[4 * x + y] = 0;
}
}
}
// Right
for (int z = 0; z < depthC4; ++z) {
float* dstPlane = z * area * 4 + dst;
const float* srcPlane = src + z * area * 4;
for (int x = areaRemain; x < area; ++x) {
float s0 = srcPlane[x];
float s1 = srcPlane[x + area];
float s2 = srcPlane[x + area * 2];
float s3 = srcPlane[x + area * 3];
_mm_store_ps(dstPlane + 4 * x, _mm_set_ps(s3, s2, s1, s0));
}
}
}
void MNNTranspose32Bit(int32_t* dstO, const int32_t* srcO, int32_t* dim) {
int w = dim[0];
int h = dim[1];
int srcStride = dim[2];
int dstStride = dim[3];
auto wC4 = w / 4;
auto hC4 = h / 4;
for (int y = 0; y < hC4; ++y) {
auto sy = (float*)srcO + 4 * y;
auto dy = (float*)dstO + 4 * y * dstStride;
for (int x = 0; x < wC4; ++x) {
auto sx = sy + x * 4 * srcStride;
auto dx = dy + 4 * x;
auto s0 = _mm_loadu_ps(sx + srcStride * 0);
auto s1 = _mm_loadu_ps(sx + srcStride * 1);
auto s2 = _mm_loadu_ps(sx + srcStride * 2);
auto s3 = _mm_loadu_ps(sx + srcStride * 3);
_MM_TRANSPOSE4_PS(s0, s1, s2, s3);
_mm_storeu_ps(dx + dstStride * 0, s0);
_mm_storeu_ps(dx + dstStride * 1, s1);
_mm_storeu_ps(dx + dstStride * 2, s2);
_mm_storeu_ps(dx + dstStride * 3, s3);
}
}
// Down
for (int i = hC4 * 4; i < h; ++i) {
auto si = srcO + i;
auto di = dstO + i * dstStride;
for (int j = 0; j < w; ++j) {
auto sj = si + j * srcStride;
auto dj = di + j;
*dj = *sj;
}
}
// Right
for (int i = 0; i < hC4 * 4; ++i) {
auto si = srcO + i;
auto di = dstO + i * dstStride;
for (int j = wC4 * 4; j < w; ++j) {
auto sj = si + j * srcStride;
auto dj = di + j;
*dj = *sj;
}
}
}
void MNNUnpackC4(float* dst, const float* src, size_t area, size_t depth) {
auto areaC4 = area / 4;
auto depthC4 = depth / 4;
for (int z = 0; z < depthC4; ++z) {
auto dstPlane = dst + z * area * 4;
auto srcPlane = src + z * area * 4;
for (int x = 0; x < areaC4; ++x) {
auto s = srcPlane + 16 * x;
auto d = dstPlane + 4 * x;
auto s0 = _mm_loadu_ps(s + 0 * 4);
auto s1 = _mm_loadu_ps(s + 1 * 4);
auto s2 = _mm_loadu_ps(s + 2 * 4);
auto s3 = _mm_loadu_ps(s + 3 * 4);
_MM_TRANSPOSE4_PS(s0, s1, s2, s3);
_mm_storeu_ps(d + 0 * area, s0);
_mm_storeu_ps(d + 1 * area, s1);
_mm_storeu_ps(d + 2 * area, s2);
_mm_storeu_ps(d + 3 * area, s3);
}
}
auto areaRemain = areaC4 * 4;
auto depthRemain = depthC4 * 4;
// Down
int remain = depth - depthRemain;
if (remain > 0) {
float* dstPlane = depthC4 * area * 4 + dst;
const float* srcPlane = src + depthC4 * area * 4;
for (int x = 0; x < area; ++x) {
for (int y = 0; y < remain; y++) {
dstPlane[y * area + x] = srcPlane[4 * x + y];
}
}
}
// Right
for (int z = 0; z < depthC4; ++z) {
const float* srcPlane = z * area * 4 + src;
float* dstPlane = dst + z * area * 4;
for (int x = areaRemain; x < area; ++x) {
for (int y = 0; y < 4; y++) {
dstPlane[y * area + x] = srcPlane[4 * x + y];
}
}
}
}
void MNNPackC4ForMatMul_A(float* dest, const float* source, size_t e, size_t l, size_t eReal) {
return gFunc.MNNPackC4ForMatMul_A(dest, source, e, l, eReal);
}
void MNNPackForMatMul_B(float* dest, const float* source, size_t h, size_t l, bool transpose) {
if (!transpose) {
MNNUnpackTranspose(dest, source, l, h);
return;
}
MNNPackC4(dest, source, l, h);
gFunc.MNNPackForMatMul_B(dest, source, h, l, transpose);
}
void MNNGetMatMulPackMode(int* eP, int* lP, int* hP) {
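
For context, the exported symbols in this file forward through gFunc, a table of function pointers that defaults to the SSE kernels and can be re-pointed at AVX/FMA kernels once CPU features are known; the diff simply routes MNNPackForMatMul_B through the same table. A trimmed sketch of that dispatch-table pattern (signatures shortened; the stub kernels and init flag are illustrative, not MNN's):

#include <cstdio>

static void packB_SSE(float*, const float*, int, int) { printf("SSE pack\n"); }
static void packB_AVX(float*, const float*, int, int) { printf("AVX pack\n"); }

// Every kernel is a function pointer that defaults to the SSE implementation.
struct FunctionGroup {
    void (*MNNPackForMatMul_B)(float*, const float*, int, int) = packB_SSE;
};
static FunctionGroup gFunc;

// Would normally inspect cpuid; a plain flag keeps the sketch self-contained.
static void functionInit(bool hasAVX) {
    if (hasAVX) {
        gFunc.MNNPackForMatMul_B = packB_AVX;
    }
}

// Public entry points only forward to whatever the table currently holds.
static void packForMatMul_B(float* dest, const float* source, int h, int l) {
    gFunc.MNNPackForMatMul_B(dest, source, h, l);
}

int main() {
    functionInit(true);
    packForMatMul_B(nullptr, nullptr, 0, 0);   // prints "AVX pack"
    return 0;
}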

View File

@ -169,6 +169,307 @@ static void _AVX_MNNPackedMatMul_8(float* C, const float* A, const float* B, con
TRANPOSE_SAVE(1, 0, z0, z3, z6, z9);
}
}
static void _AVX_MNNPackedMatMul_5(float* C, const float* A, const float* B, const size_t* parameter) {
auto aStride = parameter[0] / sizeof(float);
auto h = parameter[2];
auto l = parameter[1];
auto cStride = parameter[3] / sizeof(float);
auto bExtraStride = parameter[5] / sizeof(float);
auto bStride = bExtraStride + l * 4;
auto hC4 = UP_DIV(h, 4);
int lC4 = l / 4;
int lR = lC4 * 4;
const int hC4Unit = 4;
int hC16 = hC4 / hC4Unit;
int hR = hC16 * hC4Unit;
auto src = A;
for (int y = 0; y < hC16; ++y) {
auto weight0 = B + (hC4Unit * y + 0) * bStride;
auto dst0 = C + (hC4Unit * y + 0) * cStride;
auto weight1 = B + (hC4Unit * y + 1) * bStride;
auto dst1 = C + (hC4Unit * y + 1) * cStride;
auto weight2 = B + (hC4Unit * y + 2) * bStride;
auto dst2 = C + (hC4Unit * y + 2) * cStride;
auto weight3 = B + (hC4Unit * y + 3) * bStride;
auto dst3 = C + (hC4Unit * y + 3) * cStride;
auto sumAvx00 = _mm256_set1_ps(0.0f);
auto sumAvx01 = _mm256_set1_ps(0.0f);
auto sumAvx10 = _mm256_set1_ps(0.0f);
auto sumAvx11 = _mm256_set1_ps(0.0f);
auto sumAvx20 = _mm256_set1_ps(0.0f);
auto sumAvx21 = _mm256_set1_ps(0.0f);
auto sumAvx30 = _mm256_set1_ps(0.0f);
auto sumAvx31 = _mm256_set1_ps(0.0f);
auto sumAvx40 = _mm256_set1_ps(0.0f);
auto sumAvx41 = _mm256_set1_ps(0.0f);
auto srcUse = src;
for (int sy = 0; sy < l; ++sy) {
auto S0 = _mm256_broadcast_ss(srcUse + 0);
auto S1 = _mm256_broadcast_ss(srcUse + 1);
auto S2 = _mm256_broadcast_ss(srcUse + 2);
auto S3 = _mm256_broadcast_ss(srcUse + 3);
auto S4 = _mm256_broadcast_ss(srcUse + 4);
auto W0 = _mm256_castsi256_ps(_mm256_insertf128_si256((_mm256_broadcastsi128_si256(*(__m128i*)(weight0))), *(__m128i*)(weight1), 1));
auto W1 = _mm256_castsi256_ps(_mm256_insertf128_si256((_mm256_broadcastsi128_si256(*(__m128i*)(weight2))), *(__m128i*)(weight3), 1));
sumAvx00 = MNNAVXFMA(S0, W0, sumAvx00);
sumAvx01 = MNNAVXFMA(S0, W1, sumAvx01);
sumAvx10 = MNNAVXFMA(S1, W0, sumAvx10);
sumAvx11 = MNNAVXFMA(S1, W1, sumAvx11);
sumAvx20 = MNNAVXFMA(S2, W0, sumAvx20);
sumAvx21 = MNNAVXFMA(S2, W1, sumAvx21);
sumAvx30 = MNNAVXFMA(S3, W0, sumAvx30);
sumAvx31 = MNNAVXFMA(S3, W1, sumAvx31);
sumAvx40 = MNNAVXFMA(S4, W0, sumAvx40);
sumAvx41 = MNNAVXFMA(S4, W1, sumAvx41);
srcUse += aStride;
weight0 += 4;
weight1 += 4;
weight2 += 4;
weight3 += 4;
}
_mm256_storeu_ps(dst0 + 0, _mm256_permute2f128_ps(sumAvx00, sumAvx10, 32));
_mm256_storeu_ps(dst0 + 8, _mm256_permute2f128_ps(sumAvx20, sumAvx30, 32));
_mm_storeu_ps(dst0 + 16, _mm256_extractf128_ps(sumAvx40, 0));
_mm256_storeu_ps(dst1 + 0, _mm256_permute2f128_ps(sumAvx00, sumAvx10, 49));
_mm256_storeu_ps(dst1 + 8, _mm256_permute2f128_ps(sumAvx20, sumAvx30, 49));
_mm_storeu_ps(dst1 + 16, _mm256_extractf128_ps(sumAvx40, 1));
_mm256_storeu_ps(dst2 + 0, _mm256_permute2f128_ps(sumAvx01, sumAvx11, 32));
_mm256_storeu_ps(dst2 + 8, _mm256_permute2f128_ps(sumAvx21, sumAvx31, 32));
_mm_storeu_ps(dst2 + 16, _mm256_extractf128_ps(sumAvx41, 0));
_mm256_storeu_ps(dst3 + 0, _mm256_permute2f128_ps(sumAvx01, sumAvx11, 49));
_mm256_storeu_ps(dst3 + 8, _mm256_permute2f128_ps(sumAvx21, sumAvx31, 49));
_mm_storeu_ps(dst3 + 16, _mm256_extractf128_ps(sumAvx41, 1));
}
for (int y = hR; y < hC4; ++y) {
auto weight = B + y * bStride;
auto dst = C + y * cStride;
auto s0 = _mm_broadcast_ss(A + 0 * aStride + 0);
auto s1 = _mm_broadcast_ss(A + 0 * aStride + 1);
auto s2 = _mm_broadcast_ss(A + 0 * aStride + 2);
auto s3 = _mm_broadcast_ss(A + 0 * aStride + 3);
auto s4 = _mm_broadcast_ss(A + 0 * aStride + 4);
auto w0 = _mm_loadu_ps(weight + 0 * 4);
auto z0 = _mm_mul_ps(s0, w0);
auto z1 = _mm_mul_ps(s1, w0);
auto z2 = _mm_mul_ps(s2, w0);
auto z3 = _mm_mul_ps(s3, w0);
auto z4 = _mm_mul_ps(s4, w0);
for (int sy = 1; sy < l; ++sy) {
s0 = _mm_broadcast_ss(A + sy * aStride + 0);
s1 = _mm_broadcast_ss(A + sy * aStride + 1);
s2 = _mm_broadcast_ss(A + sy * aStride + 2);
s3 = _mm_broadcast_ss(A + sy * aStride + 3);
s4 = _mm_broadcast_ss(A + sy * aStride + 4);
w0 = _mm_loadu_ps(weight + sy * 4);
z0 = MNNSSEFMA(s0, w0, z0);
z1 = MNNSSEFMA(s1, w0, z1);
z2 = MNNSSEFMA(s2, w0, z2);
z3 = MNNSSEFMA(s3, w0, z3);
z4 = MNNSSEFMA(s4, w0, z4);
}
_mm_store_ps(dst + 4 * 0, z0);
_mm_store_ps(dst + 4 * 1, z1);
_mm_store_ps(dst + 4 * 2, z2);
_mm_store_ps(dst + 4 * 3, z3);
_mm_store_ps(dst + 4 * 4, z4);
}
}
static void _AVX_MNNPackedMatMul_3(float* C, const float* A, const float* B, const size_t* parameter) {
auto aStride = parameter[0] / sizeof(float);
auto h = parameter[2];
auto l = parameter[1];
auto cStride = parameter[3] / sizeof(float);
auto bExtraStride = parameter[5] / sizeof(float);
auto bStride = bExtraStride + l * 4;
auto hC4 = UP_DIV(h, 4);
int lC4 = l / 4;
int lR = lC4 * 4;
const int hC4Unit = 4;
int hC16 = hC4 / hC4Unit;
int hR = hC16 * hC4Unit;
auto src = A;
for (int y = 0; y < hC16; ++y) {
auto weight0 = B + (hC4Unit * y + 0) * bStride;
auto dst0 = C + (hC4Unit * y + 0) * cStride;
auto weight1 = B + (hC4Unit * y + 1) * bStride;
auto dst1 = C + (hC4Unit * y + 1) * cStride;
auto weight2 = B + (hC4Unit * y + 2) * bStride;
auto dst2 = C + (hC4Unit * y + 2) * cStride;
auto weight3 = B + (hC4Unit * y + 3) * bStride;
auto dst3 = C + (hC4Unit * y + 3) * cStride;
auto sumAvx00 = _mm256_set1_ps(0.0f);
auto sumAvx01 = _mm256_set1_ps(0.0f);
auto sumAvx10 = _mm256_set1_ps(0.0f);
auto sumAvx11 = _mm256_set1_ps(0.0f);
auto sumAvx20 = _mm256_set1_ps(0.0f);
auto sumAvx21 = _mm256_set1_ps(0.0f);
auto srcUse = src;
for (int sy = 0; sy < l; ++sy) {
auto S0 = _mm256_broadcast_ss(srcUse + 0);
auto S1 = _mm256_broadcast_ss(srcUse + 1);
auto S2 = _mm256_broadcast_ss(srcUse + 2);
auto W0 = _mm256_castsi256_ps(_mm256_insertf128_si256((_mm256_broadcastsi128_si256(*(__m128i*)(weight0))), *(__m128i*)(weight1), 1));
auto W1 = _mm256_castsi256_ps(_mm256_insertf128_si256((_mm256_broadcastsi128_si256(*(__m128i*)(weight2))), *(__m128i*)(weight3), 1));
sumAvx00 = MNNAVXFMA(S0, W0, sumAvx00);
sumAvx01 = MNNAVXFMA(S0, W1, sumAvx01);
sumAvx10 = MNNAVXFMA(S1, W0, sumAvx10);
sumAvx11 = MNNAVXFMA(S1, W1, sumAvx11);
sumAvx20 = MNNAVXFMA(S2, W0, sumAvx20);
sumAvx21 = MNNAVXFMA(S2, W1, sumAvx21);
srcUse += aStride;
weight0 += 4;
weight1 += 4;
weight2 += 4;
weight3 += 4;
}
_mm_storeu_ps(dst0 + 0, _mm256_extractf128_ps(sumAvx00, 0));
_mm_storeu_ps(dst0 + 4, _mm256_extractf128_ps(sumAvx10, 0));
_mm_storeu_ps(dst0 + 8, _mm256_extractf128_ps(sumAvx20, 0));
_mm_storeu_ps(dst1 + 0, _mm256_extractf128_ps(sumAvx00, 1));
_mm_storeu_ps(dst1 + 4, _mm256_extractf128_ps(sumAvx10, 1));
_mm_storeu_ps(dst1 + 8, _mm256_extractf128_ps(sumAvx20, 1));
_mm_storeu_ps(dst2 + 0, _mm256_extractf128_ps(sumAvx01, 0));
_mm_storeu_ps(dst2 + 4, _mm256_extractf128_ps(sumAvx11, 0));
_mm_storeu_ps(dst2 + 8, _mm256_extractf128_ps(sumAvx21, 0));
_mm_storeu_ps(dst3 + 0, _mm256_extractf128_ps(sumAvx01, 1));
_mm_storeu_ps(dst3 + 4, _mm256_extractf128_ps(sumAvx11, 1));
_mm_storeu_ps(dst3 + 8, _mm256_extractf128_ps(sumAvx21, 1));
}
for (int y = hR; y < hC4; ++y) {
auto weight = B + y * bStride;
auto dst = C + y * cStride;
auto s0 = _mm_broadcast_ss(A + 0 * aStride + 0);
auto s1 = _mm_broadcast_ss(A + 0 * aStride + 1);
auto s2 = _mm_broadcast_ss(A + 0 * aStride + 2);
auto w0 = _mm_loadu_ps(weight + 0 * 4);
auto z0 = _mm_mul_ps(s0, w0);
auto z1 = _mm_mul_ps(s1, w0);
auto z2 = _mm_mul_ps(s2, w0);
for (int sy = 1; sy < l; ++sy) {
s0 = _mm_broadcast_ss(A + sy * aStride + 0);
s1 = _mm_broadcast_ss(A + sy * aStride + 1);
s2 = _mm_broadcast_ss(A + sy * aStride + 2);
w0 = _mm_loadu_ps(weight + sy * 4);
z0 = MNNSSEFMA(s0, w0, z0);
z1 = MNNSSEFMA(s1, w0, z1);
z2 = MNNSSEFMA(s2, w0, z2);
}
_mm_store_ps(dst + 4 * 0, z0);
_mm_store_ps(dst + 4 * 1, z1);
_mm_store_ps(dst + 4 * 2, z2);
}
}
static void _AVX_MNNPackedMatMul_2(float* C, const float* A, const float* B, const size_t* parameter) {
auto aStride = parameter[0] / sizeof(float);
auto h = parameter[2];
auto l = parameter[1];
auto cStride = parameter[3] / sizeof(float);
auto bExtraStride = parameter[5] / sizeof(float);
auto bStride = bExtraStride + l * 4;
auto hC4 = UP_DIV(h, 4);
int lC4 = l / 4;
int lR = lC4 * 4;
const int hC4Unit = 4;
int hC16 = hC4 / hC4Unit;
int hR = hC16 * hC4Unit;
auto src = A;
for (int y = 0; y < hC16; ++y) {
auto weight0 = B + (hC4Unit * y + 0) * bStride;
auto dst0 = C + (hC4Unit * y + 0) * cStride;
auto weight1 = B + (hC4Unit * y + 1) * bStride;
auto dst1 = C + (hC4Unit * y + 1) * cStride;
auto weight2 = B + (hC4Unit * y + 2) * bStride;
auto dst2 = C + (hC4Unit * y + 2) * cStride;
auto weight3 = B + (hC4Unit * y + 3) * bStride;
auto dst3 = C + (hC4Unit * y + 3) * cStride;
auto sumAvx00 = _mm256_set1_ps(0.0f);
auto sumAvx01 = _mm256_set1_ps(0.0f);
auto sumAvx10 = _mm256_set1_ps(0.0f);
auto sumAvx11 = _mm256_set1_ps(0.0f);
auto srcUse = src;
for (int sy = 0; sy < l; ++sy) {
auto S0 = _mm256_broadcast_ss(srcUse + 0);
auto S1 = _mm256_broadcast_ss(srcUse + 1);
auto W0 = _mm256_castsi256_ps(_mm256_insertf128_si256((_mm256_broadcastsi128_si256(*(__m128i*)(weight0))), *(__m128i*)(weight1), 1));
auto W1 = _mm256_castsi256_ps(_mm256_insertf128_si256((_mm256_broadcastsi128_si256(*(__m128i*)(weight2))), *(__m128i*)(weight3), 1));
sumAvx00 = MNNAVXFMA(S0, W0, sumAvx00);
sumAvx01 = MNNAVXFMA(S0, W1, sumAvx01);
sumAvx10 = MNNAVXFMA(S1, W0, sumAvx10);
sumAvx11 = MNNAVXFMA(S1, W1, sumAvx11);
srcUse += aStride;
weight0 += 4;
weight1 += 4;
weight2 += 4;
weight3 += 4;
}
_mm_storeu_ps(dst0 + 0, _mm256_extractf128_ps(sumAvx00, 0));
_mm_storeu_ps(dst0 + 4, _mm256_extractf128_ps(sumAvx10, 0));
_mm_storeu_ps(dst1 + 0, _mm256_extractf128_ps(sumAvx00, 1));
_mm_storeu_ps(dst1 + 4, _mm256_extractf128_ps(sumAvx10, 1));
_mm_storeu_ps(dst2 + 0, _mm256_extractf128_ps(sumAvx01, 0));
_mm_storeu_ps(dst2 + 4, _mm256_extractf128_ps(sumAvx11, 0));
_mm_storeu_ps(dst3 + 0, _mm256_extractf128_ps(sumAvx01, 1));
_mm_storeu_ps(dst3 + 4, _mm256_extractf128_ps(sumAvx11, 1));
}
for (int y = hR; y < hC4; ++y) {
auto weight = B + y * bStride;
auto dst = C + y * cStride;
auto s0 = _mm_broadcast_ss(A + 0 * aStride + 0);
auto s1 = _mm_broadcast_ss(A + 0 * aStride + 1);
auto w0 = _mm_loadu_ps(weight + 0 * 4);
auto z0 = _mm_mul_ps(s0, w0);
auto z1 = _mm_mul_ps(s1, w0);
for (int sy = 1; sy < l; ++sy) {
s0 = _mm_broadcast_ss(A + sy * aStride + 0);
s1 = _mm_broadcast_ss(A + sy * aStride + 1);
w0 = _mm_loadu_ps(weight + sy * 4);
z0 = MNNSSEFMA(s0, w0, z0);
z1 = MNNSSEFMA(s1, w0, z1);
}
_mm_store_ps(dst + 4 * 0, z0);
_mm_store_ps(dst + 4 * 1, z1);
}
}
static void _AVX_MNNPackedMatMul_4(float* C, const float* A, const float* B, const size_t* parameter) {
auto aStride = parameter[0] / sizeof(float);
@ -303,135 +604,155 @@ static void _AVX_MNNPackednMatMulRemainCommon(float* C, const float* A, const fl
C += 8 * 4;
A += 8;
}
if (eSize >= 4) {
if (eSize >= 5) {
_AVX_MNNPackedMatMul_5(C, A, B, parameter);
eSize -= 5;
C += 5 * 4;
A += 5;
}
if (eSize == 4) {
_AVX_MNNPackedMatMul_4(C, A, B, parameter);
eSize -= 4;
C += 4 * 4;
A += 4;
}
if (eSize == 3) {
_AVX_MNNPackedMatMul_3(C, A, B, parameter);
eSize -= 3;
C += 3 * 4;
A += 3;
}
if (eSize == 2) {
_AVX_MNNPackedMatMul_2(C, A, B, parameter);
eSize -= 2;
C += 2 * 4;
A += 2;
}
if (eSize == 0) {
return;
}
int lC4 = l / 4;
int lR = lC4 * 4;
const int hC4Unit = 4;
int hC16 = hC4 / hC4Unit;
int hR = hC16 * hC4Unit;
for (int x = 0; x < eSize; ++x) {
auto src = A + x;
for (int y = 0; y < hC16; ++y) {
auto weight0 = B + (hC4Unit * y + 0) * bStride;
auto dst0 = C + (hC4Unit * y + 0) * cStride + x * 4;
auto weight1 = B + (hC4Unit * y + 1) * bStride;
auto dst1 = C + (hC4Unit * y + 1) * cStride + x * 4;
auto weight2 = B + (hC4Unit * y + 2) * bStride;
auto dst2 = C + (hC4Unit * y + 2) * cStride + x * 4;
auto weight3 = B + (hC4Unit * y + 3) * bStride;
auto dst3 = C + (hC4Unit * y + 3) * cStride + x * 4;
auto sumAvx00 = _mm256_set1_ps(0.0f);
auto sumAvx01 = _mm256_set1_ps(0.0f);
auto src = A;
int x = 0;
for (int y = 0; y < hC16; ++y) {
auto weight0 = B + (hC4Unit * y + 0) * bStride;
auto dst0 = C + (hC4Unit * y + 0) * cStride + x * 4;
auto weight1 = B + (hC4Unit * y + 1) * bStride;
auto dst1 = C + (hC4Unit * y + 1) * cStride + x * 4;
auto weight2 = B + (hC4Unit * y + 2) * bStride;
auto dst2 = C + (hC4Unit * y + 2) * cStride + x * 4;
auto weight3 = B + (hC4Unit * y + 3) * bStride;
auto dst3 = C + (hC4Unit * y + 3) * cStride + x * 4;
auto sumAvx00 = _mm256_set1_ps(0.0f);
auto sumAvx01 = _mm256_set1_ps(0.0f);
auto sumAvx10 = _mm256_set1_ps(0.0f);
auto sumAvx11 = _mm256_set1_ps(0.0f);
auto sumAvx10 = _mm256_set1_ps(0.0f);
auto sumAvx11 = _mm256_set1_ps(0.0f);
auto sumAvx20 = _mm256_set1_ps(0.0f);
auto sumAvx21 = _mm256_set1_ps(0.0f);
auto sumAvx20 = _mm256_set1_ps(0.0f);
auto sumAvx21 = _mm256_set1_ps(0.0f);
auto sumAvx30 = _mm256_set1_ps(0.0f);
auto sumAvx31 = _mm256_set1_ps(0.0f);
auto sumAvx30 = _mm256_set1_ps(0.0f);
auto sumAvx31 = _mm256_set1_ps(0.0f);
auto srcUse = src;
for (int sy = 0; sy < lC4; ++sy) {
auto s0 = _mm256_castps_si256(_mm256_broadcast_ss(srcUse + (0) * aStride));
auto s1 = _mm_castps_si128(_mm_broadcast_ss(srcUse + (1) * aStride));
auto S0 = _mm256_castsi256_ps(_mm256_insertf128_si256(s0, s1, 1));
auto d0 = _mm256_castps_si256(_mm256_broadcast_ss(srcUse + (2) * aStride));
auto d1 = _mm_castps_si128(_mm_broadcast_ss(srcUse + (3) * aStride));
auto S1 = _mm256_castsi256_ps(_mm256_insertf128_si256(d0, d1, 1));
auto W00 = _mm256_loadu_ps(weight0 + 16 * sy + 0);
auto W01 = _mm256_loadu_ps(weight0 + 16 * sy + 8);
auto W10 = _mm256_loadu_ps(weight1 + 16 * sy + 0);
auto W11 = _mm256_loadu_ps(weight1 + 16 * sy + 8);
auto srcUse = src;
for (int sy = 0; sy < lC4; ++sy) {
auto s0 = _mm256_castps_si256(_mm256_broadcast_ss(srcUse + (0) * aStride));
auto s1 = _mm_castps_si128(_mm_broadcast_ss(srcUse + (1) * aStride));
auto S0 = _mm256_castsi256_ps(_mm256_insertf128_si256(s0, s1, 1));
auto d0 = _mm256_castps_si256(_mm256_broadcast_ss(srcUse + (2) * aStride));
auto d1 = _mm_castps_si128(_mm_broadcast_ss(srcUse + (3) * aStride));
auto S1 = _mm256_castsi256_ps(_mm256_insertf128_si256(d0, d1, 1));
auto W00 = _mm256_loadu_ps(weight0 + 16 * sy + 0);
auto W01 = _mm256_loadu_ps(weight0 + 16 * sy + 8);
auto W10 = _mm256_loadu_ps(weight1 + 16 * sy + 0);
auto W11 = _mm256_loadu_ps(weight1 + 16 * sy + 8);
auto W20 = _mm256_loadu_ps(weight2 + 16 * sy + 0);
auto W21 = _mm256_loadu_ps(weight2 + 16 * sy + 8);
auto W30 = _mm256_loadu_ps(weight3 + 16 * sy + 0);
auto W31 = _mm256_loadu_ps(weight3 + 16 * sy + 8);
auto W20 = _mm256_loadu_ps(weight2 + 16 * sy + 0);
auto W21 = _mm256_loadu_ps(weight2 + 16 * sy + 8);
auto W30 = _mm256_loadu_ps(weight3 + 16 * sy + 0);
auto W31 = _mm256_loadu_ps(weight3 + 16 * sy + 8);
sumAvx00 = MNNAVXFMA(S0, W00, sumAvx00);
sumAvx01 = MNNAVXFMA(S1, W01, sumAvx01);
sumAvx00 = MNNAVXFMA(S0, W00, sumAvx00);
sumAvx01 = MNNAVXFMA(S1, W01, sumAvx01);
sumAvx10 = MNNAVXFMA(S0, W10, sumAvx10);
sumAvx11 = MNNAVXFMA(S1, W11, sumAvx11);
sumAvx10 = MNNAVXFMA(S0, W10, sumAvx10);
sumAvx11 = MNNAVXFMA(S1, W11, sumAvx11);
sumAvx20 = MNNAVXFMA(S0, W20, sumAvx20);
sumAvx21 = MNNAVXFMA(S1, W21, sumAvx21);
sumAvx20 = MNNAVXFMA(S0, W20, sumAvx20);
sumAvx21 = MNNAVXFMA(S1, W21, sumAvx21);
sumAvx30 = MNNAVXFMA(S0, W30, sumAvx30);
sumAvx31 = MNNAVXFMA(S1, W31, sumAvx31);
srcUse += 4 * aStride;
}
sumAvx00 = _mm256_add_ps(sumAvx00, sumAvx01);
sumAvx10 = _mm256_add_ps(sumAvx10, sumAvx11);
sumAvx20 = _mm256_add_ps(sumAvx20, sumAvx21);
sumAvx30 = _mm256_add_ps(sumAvx30, sumAvx31);
auto sum00 = _mm256_extractf128_ps(sumAvx00, 0);
auto sum01 = _mm256_extractf128_ps(sumAvx00, 1);
auto sum0 = _mm_add_ps(sum00, sum01);
auto sum10 = _mm256_extractf128_ps(sumAvx10, 0);
auto sum11 = _mm256_extractf128_ps(sumAvx10, 1);
auto sum1 = _mm_add_ps(sum10, sum11);
auto sum20 = _mm256_extractf128_ps(sumAvx20, 0);
auto sum21 = _mm256_extractf128_ps(sumAvx20, 1);
auto sum2 = _mm_add_ps(sum20, sum21);
auto sum30 = _mm256_extractf128_ps(sumAvx30, 0);
auto sum31 = _mm256_extractf128_ps(sumAvx30, 1);
auto sum3 = _mm_add_ps(sum30, sum31);
for (int sy = lR; sy < l; ++sy) {
auto s = _mm_broadcast_ss(srcUse);
auto w0 = _mm_loadu_ps(weight0 + 4 * sy);
auto w1 = _mm_loadu_ps(weight1 + 4 * sy);
auto w2 = _mm_loadu_ps(weight2 + 4 * sy);
auto w3 = _mm_loadu_ps(weight3 + 4 * sy);
sum0 = MNNSSEFMA(s, w0, sum0);
sum1 = MNNSSEFMA(s, w1, sum1);
sum2 = MNNSSEFMA(s, w2, sum2);
sum3 = MNNSSEFMA(s, w3, sum3);
srcUse += aStride;
}
_mm_store_ps(dst0, sum0);
_mm_store_ps(dst1, sum1);
_mm_store_ps(dst2, sum2);
_mm_store_ps(dst3, sum3);
sumAvx30 = MNNAVXFMA(S0, W30, sumAvx30);
sumAvx31 = MNNAVXFMA(S1, W31, sumAvx31);
srcUse += 4 * aStride;
}
for (int y = hR; y < hC4; ++y) {
auto weight = B + y * bStride;
auto dst = C + y * cStride + x * 4;
auto sumAvx0 = _mm256_set1_ps(0.0f);
auto sumAvx1 = _mm256_set1_ps(0.0f);
auto srcUse = src;
for (int sy = 0; sy < lC4; ++sy) {
auto s0 = _mm256_castps_si256(_mm256_broadcast_ss(srcUse + (0) * aStride));
auto s1 = _mm_castps_si128(_mm_broadcast_ss(srcUse + (1) * aStride));
auto S0 = _mm256_castsi256_ps(_mm256_insertf128_si256(s0, s1, 1));
auto d0 = _mm256_castps_si256(_mm256_broadcast_ss(srcUse + (2) * aStride));
auto d1 = _mm_castps_si128(_mm_broadcast_ss(srcUse + (3) * aStride));
auto S1 = _mm256_castsi256_ps(_mm256_insertf128_si256(d0, d1, 1));
auto W0 = _mm256_loadu_ps(weight + 16 * sy + 0);
auto W1 = _mm256_loadu_ps(weight + 16 * sy + 8);
sumAvx0 = MNNAVXFMA(S0, W0, sumAvx0);
sumAvx1 = MNNAVXFMA(S1, W1, sumAvx1);
srcUse += 4 * aStride;
}
sumAvx0 = _mm256_add_ps(sumAvx0, sumAvx1);
auto sum0 = _mm256_extractf128_ps(sumAvx0, 0);
auto sum1 = _mm256_extractf128_ps(sumAvx0, 1);
auto sum = _mm_add_ps(sum0, sum1);
for (int sy = lR; sy < l; ++sy) {
auto s = _mm_broadcast_ss(srcUse);
auto w = _mm_loadu_ps(weight + 4 * sy);
sum = MNNSSEFMA(s, w, sum);
srcUse += aStride;
}
_mm_store_ps(dst, sum);
sumAvx00 = _mm256_add_ps(sumAvx00, sumAvx01);
sumAvx10 = _mm256_add_ps(sumAvx10, sumAvx11);
sumAvx20 = _mm256_add_ps(sumAvx20, sumAvx21);
sumAvx30 = _mm256_add_ps(sumAvx30, sumAvx31);
auto sum00 = _mm256_extractf128_ps(sumAvx00, 0);
auto sum01 = _mm256_extractf128_ps(sumAvx00, 1);
auto sum0 = _mm_add_ps(sum00, sum01);
auto sum10 = _mm256_extractf128_ps(sumAvx10, 0);
auto sum11 = _mm256_extractf128_ps(sumAvx10, 1);
auto sum1 = _mm_add_ps(sum10, sum11);
auto sum20 = _mm256_extractf128_ps(sumAvx20, 0);
auto sum21 = _mm256_extractf128_ps(sumAvx20, 1);
auto sum2 = _mm_add_ps(sum20, sum21);
auto sum30 = _mm256_extractf128_ps(sumAvx30, 0);
auto sum31 = _mm256_extractf128_ps(sumAvx30, 1);
auto sum3 = _mm_add_ps(sum30, sum31);
for (int sy = lR; sy < l; ++sy) {
auto s = _mm_broadcast_ss(srcUse);
auto w0 = _mm_loadu_ps(weight0 + 4 * sy);
auto w1 = _mm_loadu_ps(weight1 + 4 * sy);
auto w2 = _mm_loadu_ps(weight2 + 4 * sy);
auto w3 = _mm_loadu_ps(weight3 + 4 * sy);
sum0 = MNNSSEFMA(s, w0, sum0);
sum1 = MNNSSEFMA(s, w1, sum1);
sum2 = MNNSSEFMA(s, w2, sum2);
sum3 = MNNSSEFMA(s, w3, sum3);
srcUse += aStride;
}
_mm_store_ps(dst0, sum0);
_mm_store_ps(dst1, sum1);
_mm_store_ps(dst2, sum2);
_mm_store_ps(dst3, sum3);
}
for (int y = hR; y < hC4; ++y) {
auto weight = B + y * bStride;
auto dst = C + y * cStride + x * 4;
auto sumAvx0 = _mm256_set1_ps(0.0f);
auto sumAvx1 = _mm256_set1_ps(0.0f);
auto srcUse = src;
for (int sy = 0; sy < lC4; ++sy) {
auto s0 = _mm256_castps_si256(_mm256_broadcast_ss(srcUse + (0) * aStride));
auto s1 = _mm_castps_si128(_mm_broadcast_ss(srcUse + (1) * aStride));
auto S0 = _mm256_castsi256_ps(_mm256_insertf128_si256(s0, s1, 1));
auto d0 = _mm256_castps_si256(_mm256_broadcast_ss(srcUse + (2) * aStride));
auto d1 = _mm_castps_si128(_mm_broadcast_ss(srcUse + (3) * aStride));
auto S1 = _mm256_castsi256_ps(_mm256_insertf128_si256(d0, d1, 1));
auto W0 = _mm256_loadu_ps(weight + 16 * sy + 0);
auto W1 = _mm256_loadu_ps(weight + 16 * sy + 8);
sumAvx0 = MNNAVXFMA(S0, W0, sumAvx0);
sumAvx1 = MNNAVXFMA(S1, W1, sumAvx1);
srcUse += 4 * aStride;
}
sumAvx0 = _mm256_add_ps(sumAvx0, sumAvx1);
auto sum0 = _mm256_extractf128_ps(sumAvx0, 0);
auto sum1 = _mm256_extractf128_ps(sumAvx0, 1);
auto sum = _mm_add_ps(sum0, sum1);
for (int sy = lR; sy < l; ++sy) {
auto s = _mm_broadcast_ss(srcUse);
auto w = _mm_loadu_ps(weight + 4 * sy);
sum = MNNSSEFMA(s, w, sum);
srcUse += aStride;
}
_mm_store_ps(dst, sum);
}
}
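The remainder path above handles the last source column once the 8/5/4/3/2-column kernels have run: it accumulates four depth steps per AVX iteration and finishes the l % 4 tail with SSE. A minimal scalar sketch (not part of this patch), assuming the layout implied by the code — A strided by aStride per depth step, B stored as hC4 blocks of l x 4 weights spaced bStride apart, C stored as hC4 blocks of 4 floats spaced cStride apart — can serve as a correctness reference for the vectorized path:

#include <cstddef>

// Scalar reference for the single-column remainder of the packed matmul,
// under the layout assumptions listed above.
static void PackedMatMulRemainRef(float* C, const float* A, const float* B,
                                  size_t l, size_t hC4,
                                  size_t aStride, size_t bStride, size_t cStride) {
    for (size_t y = 0; y < hC4; ++y) {
        const float* weight = B + y * bStride;
        float* dst = C + y * cStride;
        float sum[4] = {0.f, 0.f, 0.f, 0.f};
        for (size_t sy = 0; sy < l; ++sy) {
            float s = A[sy * aStride];
            for (int k = 0; k < 4; ++k) {
                sum[k] += s * weight[4 * sy + k];
            }
        }
        for (int k = 0; k < 4; ++k) {
            dst[k] = sum[k];
        }
    }
}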

View File

@ -75,3 +75,5 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
void _SSE_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step,
size_t dst_depth_quad, const QuanPostTreatParameters* post);
void _SSE_MNNExpC8(float* dest, const float* source, const float* parameters, size_t countC8);
void _SSE_MNNPackForMatMul_B(float* dest, const float* source, size_t h, size_t l, bool transpose);
bool _SSE_MNNReorder4x4ByPlatform(float* dst, size_t number);

View File

@ -9,6 +9,27 @@
#include "GemmCommon.hpp"
#include "FunctionSummary.hpp"
#include "core/Macro.h"
#include "backend/cpu/compute/CommonOptFunction.h"
#include <algorithm>
#include <cmath>
bool _SSE_MNNReorder4x4ByPlatform(float* dst, size_t number) {
for (int i = 0; i < number; ++i) {
auto addr = dst + 16 * i;
auto s0 = _mm_loadu_ps(addr + 4 * 0);
auto s1 = _mm_loadu_ps(addr + 4 * 1);
auto s2 = _mm_loadu_ps(addr + 4 * 2);
auto s3 = _mm_loadu_ps(addr + 4 * 3);
_MM_TRANSPOSE4_PS(s0, s1, s2, s3);
_mm_storeu_ps(addr + 4 * 0, s0);
_mm_storeu_ps(addr + 4 * 1, s1);
_mm_storeu_ps(addr + 4 * 2, s2);
_mm_storeu_ps(addr + 4 * 3, s3);
}
return true;
}
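_SSE_MNNReorder4x4ByPlatform transposes each 16-float block in place via _MM_TRANSPOSE4_PS. A scalar equivalent (a testing sketch, not part of the patch) is simply an in-place 4x4 transpose per block:

#include <cstddef>
#include <utility>

// Scalar equivalent of the SSE 4x4 in-place block transpose above.
static bool Reorder4x4Ref(float* dst, size_t number) {
    for (size_t i = 0; i < number; ++i) {
        float* block = dst + 16 * i;
        for (int r = 0; r < 4; ++r) {
            for (int c = r + 1; c < 4; ++c) {
                std::swap(block[4 * r + c], block[4 * c + r]);
            }
        }
    }
    return true;
}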
void _SSE_MNNPackC4ForMatMul_A(float* dest, const float* source, size_t e, size_t l, size_t eReal) {
const int pack = 12;
const int mid = 1; // Deprecate
@ -279,3 +300,156 @@ E##u = _mm_add_epi32(E##u, _mm_madd_epi16(w##u##v, s3##v));\
_mm_storeu_ps((float*)dst_x, _mm_castsi128_ps(d0));
}
}
void MNNPackC4(float* dst, const float* src, size_t area, size_t depth) {
auto areaC4 = area / 4;
auto depthC4 = depth / 4;
for (int z = 0; z < depthC4; ++z) {
auto dstPlane = dst + z * area * 4;
auto srcPlane = src + z * area * 4;
for (int x = 0; x < areaC4; ++x) {
auto s = srcPlane + 4 * x;
auto d = dstPlane + 16 * x;
auto s0 = _mm_loadu_ps(s + 0 * area);
auto s1 = _mm_loadu_ps(s + 1 * area);
auto s2 = _mm_loadu_ps(s + 2 * area);
auto s3 = _mm_loadu_ps(s + 3 * area);
_MM_TRANSPOSE4_PS(s0, s1, s2, s3);
_mm_storeu_ps(d + 4 * 0, s0);
_mm_storeu_ps(d + 4 * 1, s1);
_mm_storeu_ps(d + 4 * 2, s2);
_mm_storeu_ps(d + 4 * 3, s3);
}
}
auto areaRemain = areaC4 * 4;
auto depthRemain = depthC4 * 4;
// Down
int remain = depth - depthRemain;
if (remain > 0) {
float* dstPlane = depthC4 * area * 4 + dst;
const float* srcPlane = src + depthC4 * area * 4;
for (int x = 0; x < area; ++x) {
for (int y = 0; y < remain; y++) {
dstPlane[4 * x + y] = srcPlane[y * area + x];
}
for (int y = remain; y < 4; y++) {
dstPlane[4 * x + y] = 0;
}
}
}
// Right
for (int z = 0; z < depthC4; ++z) {
float* dstPlane = z * area * 4 + dst;
const float* srcPlane = src + z * area * 4;
for (int x = areaRemain; x < area; ++x) {
float s0 = srcPlane[x];
float s1 = srcPlane[x + area];
float s2 = srcPlane[x + area * 2];
float s3 = srcPlane[x + area * 3];
_mm_store_ps(dstPlane + 4 * x, _mm_set_ps(s3, s2, s1, s0));
}
}
}
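MNNPackC4 repacks a channel-major [depth, area] buffer into NC4HW4, zero-padding the channel tail so every group holds exactly four values; the SSE body handles whole 4x4 tiles and the "Down"/"Right" loops cover the remainders. A scalar sketch of the same mapping (an illustration, not part of the patch):

#include <cstddef>

// Scalar reference for MNNPackC4: dst[(c / 4) * area * 4 + x * 4 + (c % 4)] = src[c * area + x],
// with zeros written for the padded channels.
static void PackC4Ref(float* dst, const float* src, size_t area, size_t depth) {
    size_t depthC4 = (depth + 3) / 4;
    for (size_t x = 0; x < area; ++x) {
        for (size_t c = 0; c < depthC4 * 4; ++c) {
            float v = (c < depth) ? src[c * area + x] : 0.0f;
            dst[(c / 4) * area * 4 + x * 4 + (c % 4)] = v;
        }
    }
}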
void MNNTranspose32Bit(int32_t* dstO, const int32_t* srcO, int32_t* dim) {
int w = dim[0];
int h = dim[1];
int srcStride = dim[2];
int dstStride = dim[3];
auto wC4 = w / 4;
auto hC4 = h / 4;
for (int y = 0; y < hC4; ++y) {
auto sy = (float*)srcO + 4 * y;
auto dy = (float*)dstO + 4 * y * dstStride;
for (int x = 0; x < wC4; ++x) {
auto sx = sy + x * 4 * srcStride;
auto dx = dy + 4 * x;
auto s0 = _mm_loadu_ps(sx + srcStride * 0);
auto s1 = _mm_loadu_ps(sx + srcStride * 1);
auto s2 = _mm_loadu_ps(sx + srcStride * 2);
auto s3 = _mm_loadu_ps(sx + srcStride * 3);
_MM_TRANSPOSE4_PS(s0, s1, s2, s3);
_mm_storeu_ps(dx + dstStride * 0, s0);
_mm_storeu_ps(dx + dstStride * 1, s1);
_mm_storeu_ps(dx + dstStride * 2, s2);
_mm_storeu_ps(dx + dstStride * 3, s3);
}
}
// Down
for (int i = hC4 * 4; i < h; ++i) {
auto si = srcO + i;
auto di = dstO + i * dstStride;
for (int j = 0; j < w; ++j) {
auto sj = si + j * srcStride;
auto dj = di + j;
*dj = *sj;
}
}
// Right
for (int i = 0; i < hC4 * 4; ++i) {
auto si = srcO + i;
auto di = dstO + i * dstStride;
for (int j = wC4 * 4; j < w; ++j) {
auto sj = si + j * srcStride;
auto dj = di + j;
*dj = *sj;
}
}
}
void MNNUnpackC4(float* dst, const float* src, size_t area, size_t depth) {
auto areaC4 = area / 4;
auto depthC4 = depth / 4;
for (int z = 0; z < depthC4; ++z) {
auto dstPlane = dst + z * area * 4;
auto srcPlane = src + z * area * 4;
for (int x = 0; x < areaC4; ++x) {
auto s = srcPlane + 16 * x;
auto d = dstPlane + 4 * x;
auto s0 = _mm_loadu_ps(s + 0 * 4);
auto s1 = _mm_loadu_ps(s + 1 * 4);
auto s2 = _mm_loadu_ps(s + 2 * 4);
auto s3 = _mm_loadu_ps(s + 3 * 4);
_MM_TRANSPOSE4_PS(s0, s1, s2, s3);
_mm_storeu_ps(d + 0 * area, s0);
_mm_storeu_ps(d + 1 * area, s1);
_mm_storeu_ps(d + 2 * area, s2);
_mm_storeu_ps(d + 3 * area, s3);
}
}
auto areaRemain = areaC4 * 4;
auto depthRemain = depthC4 * 4;
// Down
int remain = depth - depthRemain;
if (remain > 0) {
float* dstPlane = depthC4 * area * 4 + dst;
const float* srcPlane = src + depthC4 * area * 4;
for (int x = 0; x < area; ++x) {
for (int y = 0; y < remain; y++) {
dstPlane[y * area + x] = srcPlane[4 * x + y];
}
}
}
// Right
for (int z = 0; z < depthC4; ++z) {
const float* srcPlane = z * area * 4 + src;
float* dstPlane = dst + z * area * 4;
for (int x = areaRemain; x < area; ++x) {
for (int y = 0; y < 4; y++) {
dstPlane[y * area + x] = srcPlane[4 * x + y];
}
}
}
}
void _SSE_MNNPackForMatMul_B(float* dest, const float* source, size_t h, size_t l, bool transpose) {
if (!transpose) {
MNNUnpackTranspose(dest, source, l, h);
return;
}
MNNPackC4(dest, source, l, h);
}
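MNNPackC4 and MNNUnpackC4 are inverses whenever depth is a multiple of 4 (no zero padding is introduced), which gives a cheap sanity check for these SSE paths. A sketch of such a round-trip test, assuming depth % 4 == 0 and using the functions defined above:

#include <cstddef>
#include <cstdio>
#include <vector>

// Round-trip check: unpacking a packed buffer should reproduce the source
// when depth % 4 == 0, so no channel padding is involved.
static bool CheckPackUnpackRoundTrip(size_t area, size_t depth) {
    std::vector<float> src(area * depth), packed(area * depth), back(area * depth);
    for (size_t i = 0; i < src.size(); ++i) {
        src[i] = (float)i;
    }
    MNNPackC4(packed.data(), src.data(), area, depth);
    MNNUnpackC4(back.data(), packed.data(), area, depth);
    for (size_t i = 0; i < src.size(); ++i) {
        if (back[i] != src[i]) {
            printf("mismatch at %zu\n", i);
            return false;
        }
    }
    return true;
}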

View File

@ -1,165 +0,0 @@
//
// BufferPool.cpp
// MNN
//
// Created by MNN on 2018/12/30.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "BufferPool.hpp"
//#define DUMP_USAGE
//#define MNN_DEBUG_MEMORY
namespace MNN {
namespace CUDA {
BufferPool::Node::~Node() {
if (nullptr == parent) {
runtime->free(pointer);
}
}
void* BufferPool::alloc(size_t size, bool seperate) {
#ifdef DUMP_USAGE
auto memoryUsed = size / 1024.0f / 1024.0f;
MNN_PRINT("Alloc: %f\n", memoryUsed);
#endif
void* pointer = nullptr;
// reuse if possible
if (!seperate) {
pointer = getFromFreeList(&mFreeList, size);
if (nullptr != pointer) {
return pointer;
}
}
// alloc otherwise
pointer = mRuntime->alloc(size);
if (nullptr == pointer) {
return nullptr;
}
mTotalSize += size;
// save node
std::shared_ptr<Node> node(new Node);
node->size = size;
node->pointer = pointer;
node->runtime = mRuntime;
mUsedList[pointer] = node;
#ifdef DUMP_USAGE
MNN_PRINT("mTotalSize: %f\n", mTotalSize / 1024.0f / 1024.0f);
#endif
return pointer;
}
void BufferPool::returnMemory(FREELIST* listP, std::shared_ptr<Node> node, bool permitMerge) {
auto& list = *listP;
list.insert(std::make_pair(node->size, node));
// update parent use count
if (nullptr != node->parent && permitMerge) {
auto parent = node->parent;
parent->useCount -= 1;
// merge if all subnodes were freed
auto needMerge = parent->useCount == 0;
while (needMerge) {
// collect all subnodes
for (auto iter = list.begin(); iter != list.end();) {
if (iter->second->parent.get() == parent.get()) {
iter = list.erase(iter);
continue;
}
iter++;
}
// do merge downside up
list.insert(std::make_pair(parent->size, parent));
needMerge = false;
if (parent->parent.get() != nullptr) {
parent = parent->parent;
parent->useCount -= 1;
needMerge = parent->useCount == 0;
}
}
}
}
bool BufferPool::free(void* pointer, bool needRelease) {
// get node
auto x = mUsedList.find(pointer);
if (x == mUsedList.end()) {
MNN_ASSERT(false);
return false;
}
if (needRelease) {
MNN_ASSERT(x->second->parent == nullptr);
mTotalSize -= x->second->size;
mUsedList.erase(x);
return true;
}
// mark as reusable
auto node = x->second;
mUsedList.erase(x);
returnMemory(&mFreeList, node);
#ifdef DUMP_USAGE
auto memoryUsed = x->second->size / 1024.0f / 1024.0f;
MNN_PRINT("Free: %f\n", memoryUsed);
#endif
return true;
}
void BufferPool::release(bool allRelease) {
if (allRelease) {
mUsedList.clear();
mFreeList.clear();
mTotalSize = 0;
return;
}
for (auto f : mFreeList) {
mTotalSize -= f.first;
}
mFreeList.clear();
}
void* BufferPool::getFromFreeList(FREELIST* list, size_t size, bool permiteSplit) {
#ifdef MNN_DEBUG_MEMORY
return nullptr;
#endif
// get node larger than size
auto x = list->lower_bound(size);
if (x == list->end()) {
return nullptr;
}
// update parent use count
void* pointer = x->second->pointer;
if (permiteSplit && nullptr != x->second->parent) {
x->second->parent->useCount += 1;
}
// uses up all aligned space
auto sizeAlign = size;
if (sizeAlign >= x->first || (!permiteSplit)) {
mUsedList.insert(std::make_pair(pointer, x->second));
list->erase(x);
return pointer;
}
// split otherwise
std::shared_ptr<Node> first(new Node);
first->parent = x->second;
first->size = sizeAlign;
first->pointer = x->second->pointer;
mUsedList.insert(std::make_pair(pointer, first));
x->second->useCount += 1;
std::shared_ptr<Node> second(new Node);
second->parent = x->second;
second->size = x->second->size - sizeAlign;
second->pointer = ((uint8_t*)x->second->pointer) + sizeAlign;
list->insert(std::make_pair(second->size, second));
list->erase(x);
return pointer;
}
} // namespace CUDA
} // namespace MNN
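The removed CUDA BufferPool implemented a best-fit free list: lower_bound on a multimap keyed by size finds the smallest chunk that still fits, and any surplus is split off and returned to the list, with parent/useCount bookkeeping so sibling splits can merge back. The replacement is the shared core BufferAllocator; the sketch below (a simplification, not the BufferAllocator code) only illustrates the best-fit-with-split lookup idea:

#include <cstddef>
#include <cstdint>
#include <map>

struct FreeChunk { uint8_t* base; size_t size; };

// Best-fit lookup with splitting, omitting the merge-back bookkeeping.
static void* AllocBestFit(std::multimap<size_t, FreeChunk>& freeList, size_t size) {
    auto it = freeList.lower_bound(size);
    if (it == freeList.end()) {
        return nullptr; // caller falls back to a real device allocation
    }
    FreeChunk chunk = it->second;
    freeList.erase(it);
    if (chunk.size > size) {
        // Split: keep the tail in the free list for later requests.
        FreeChunk rest{chunk.base + size, chunk.size - size};
        freeList.insert({rest.size, rest});
    }
    return chunk.base;
}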

View File

@ -1,94 +0,0 @@
//
// BufferPool.hpp
// MNN
//
// Created by MNN on 2019/02/28.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef BufferPool_hpp
#define BufferPool_hpp
#include <map>
#include <memory>
#include <vector>
#include "runtime/CUDARuntime.hpp"
namespace MNN {
namespace CUDA {
/** memory utils wrapper. provides memory reusing with alignment ability. */
class BufferPool {
public:
/**
* @brief init buffer allocator with pointer alignment.
* @param CUDARuntime given runtime.
*/
BufferPool(CUDARuntime* runtime) : mRuntime(runtime) {
// nothing to do
}
/**
* @brief deinit buffer allocator. frees all allocated memories.
*/
~BufferPool() {
release();
}
public:
/**
* @brief alloc CHUNK pointer with given size. if any reusable pointer matches size, reuse it.
* @param size given size.
* @param seperate if true, the memory can't be alloc from free pool
* @return allocated or used CHUNK pointer.
* @sa free
* @sa release
*/
void* alloc(size_t size, bool seperate = false);
/**
* @brief mark CHUNK pointer as reusable.
* @param pointer given CHUNK pointer.
* @param release true if need free directly.
* @return true if pointer is a CHUNK pointer, false otherwise.
* @sa release
*/
bool free(void* pointer, bool release = false);
/**
* @brief free all allocated memories.
* @sa allocSeparate
* @sa alloc
* if allRelease, clear all memory , otherwise delete freelist
*/
void release(bool allRelease = true);
/**
* @brief query total size allocated indeed.
* @return total size allocated indeed.
*/
size_t totalSize() const {
return mTotalSize;
}
private:
class Node {
public:
~Node();
void* pointer;
size_t size;
std::shared_ptr<Node> parent = nullptr;
int useCount = 0;
CUDARuntime* runtime;
};
typedef std::multimap<size_t, std::shared_ptr<Node>> FREELIST;
static void returnMemory(FREELIST* list, std::shared_ptr<Node> node, bool permitMerge = true);
void* getFromFreeList(FREELIST* list, size_t size, bool permiteSplit = true);
std::map<void*, std::shared_ptr<Node>> mUsedList;
FREELIST mFreeList;
size_t mTotalSize = 0;
CUDARuntime* mRuntime;
};
} // namespace CUDA
} // namespace MNN
#endif

View File

@ -24,6 +24,21 @@ std::map<OpType, CUDABackend::Creator*>* gCreator() {
std::call_once(gOnce, [&]() { creators = new std::map<OpType, CUDABackend::Creator*>; });
return creators;
};
class CUDARuntimeAllocator : public BufferAllocator::Allocator {
public:
CUDARuntimeAllocator(CUDARuntime* rt) : mRuntime(rt) {
// Do nothing
}
virtual ~ CUDARuntimeAllocator() = default;
virtual std::pair<void*, int> onAlloc(int size) override {
return std::make_pair(mRuntime->alloc(size), 0);
}
virtual void onRelease(std::pair<void*, int> ptr) override {
mRuntime->free(ptr.first);
}
private:
CUDARuntime* mRuntime;
};
CUDARuntimeWrapper::CUDARuntimeWrapper(BackendConfig::PrecisionMode precision, BackendConfig::PowerMode power) {
// Shader precision
if (precision == BackendConfig::Precision_Low) {
@ -36,28 +51,25 @@ CUDARuntimeWrapper::CUDARuntimeWrapper(BackendConfig::PrecisionMode precision, B
mIsCreateError = true;
return;
}
mBufferPool.reset(new BufferPool(mCUDARuntime.get()));
mStaticBufferPool.reset(new BufferPool(mCUDARuntime.get()));
std::shared_ptr<BufferAllocator::Allocator> allocator(new CUDARuntimeAllocator(mCUDARuntime.get()));
mBufferPool.reset(new BufferAllocator(allocator));
}
}
CUDARuntimeWrapper::~CUDARuntimeWrapper() {
// Do nothing
}
Backend* CUDARuntimeWrapper::onCreate() const {
return new CUDABackend(mBufferPool, mStaticBufferPool, mCUDARuntime);
return new CUDABackend(mBufferPool, mCUDARuntime);
}
void CUDARuntimeWrapper::onGabageCollect(int level) {
mStaticBufferPool->release(false);
if (level > 50) {
mBufferPool->release(false);
}
mBufferPool->release(false);
}
CUDABackend::CUDABackend(std::shared_ptr<BufferPool> dy, std::shared_ptr<BufferPool> st,
CUDABackend::CUDABackend(std::shared_ptr<BufferAllocator> st,
std::shared_ptr<CUDARuntime> rt)
: Backend(MNN_FORWARD_CUDA) {
mBufferPool = dy;
mBufferPool.reset(new BufferAllocator(BufferAllocator::Allocator::createRecurse(st.get())));
mStaticBufferPool = st;
mCUDARuntime = rt;
}
@ -66,12 +78,6 @@ CUDABackend::~CUDABackend() {
#ifdef LOG_VERBOSE
MNN_PRINT("enter CUDABackend::~CUDABackend \n");
#endif
for (auto p : mStatic) {
mStaticBufferPool->free(p);
}
for (auto p : mDynamic) {
mBufferPool->free(p);
}
}
CUDARuntime* CUDABackend::getCUDARuntime() {
@ -84,23 +90,22 @@ bool CUDABackend::onAcquireBuffer(const Tensor* nativeTensor, StorageType storag
MNN_PRINT("Start CUDABackend::onAcquireBuffer !\n");
#endif
int mallocSize = realSize(nativeTensor) * nativeTensor->getType().bytes();
std::pair<void*, int> buffer;
if (storageType == DYNAMIC_SEPERATE) {
auto buffer = mBufferPool->alloc(mallocSize, true);
((Tensor*)nativeTensor)->buffer().device = (uint64_t)buffer;
buffer = mBufferPool->alloc(mallocSize, true);
} else if (storageType == DYNAMIC) {
auto buffer = mBufferPool->alloc(mallocSize, false);
((Tensor*)nativeTensor)->buffer().device = (uint64_t)buffer;
buffer = mBufferPool->alloc(mallocSize, false);
} else {
MNN_ASSERT(storageType == STATIC);
auto buffer = mStaticBufferPool->alloc(mallocSize, false);
((Tensor*)nativeTensor)->buffer().device = (uint64_t)buffer;
}
MNN_ASSERT(0 != ((Tensor*)nativeTensor)->buffer().device);
if (STATIC == storageType) {
mStatic.insert((void*)nativeTensor->buffer().device);
} else {
mDynamic.insert((void*)nativeTensor->buffer().device);
buffer = mStaticBufferPool->alloc(mallocSize, false);
}
if(nullptr == buffer.first) {
return false;
};
auto host = (uint8_t*)buffer.first + buffer.second;
((Tensor*)nativeTensor)->buffer().device = (uint64_t)host;
auto des = TensorUtils::getDescribe(nativeTensor);
des->extra.offset = buffer.second;
return true;
}
@ -108,24 +113,22 @@ bool CUDABackend::onReleaseBuffer(const Tensor* nativeTensor, StorageType storag
if (storageType == DYNAMIC_SEPERATE) {
return true;
}
auto buffer = nativeTensor->deviceId();
auto buffer = (uint8_t*)nativeTensor->deviceId();
auto des = TensorUtils::getDescribe(nativeTensor);
auto pointer = std::make_pair(buffer - des->extra.offset, des->extra.offset);
if (storageType == DYNAMIC) {
mDynamic.erase((void*)buffer);
mBufferPool->free((void*)buffer);
mBufferPool->free(pointer);
return true;
}
if (storageType == STATIC) {
mStatic.erase((void*)buffer);
mStaticBufferPool->free((void*)buffer);
mStaticBufferPool->free(pointer);
}
return true;
}
bool CUDABackend::onClearBuffer() {
for (auto p : mDynamic) {
mBufferPool->free(p);
}
mDynamic.clear();
mBufferPool->release(true);
return true;
}
size_t CUDABackend::realSize(const Tensor* tensor) {
@ -172,9 +175,9 @@ Execution* CUDABackend::onCreate(const std::vector<Tensor*>& inputs, const std::
auto exe = iter->second->onCreate(inputs, outputs, op, this);
if (NULL == exe) {
if (nullptr != op->name()) {
MNN_PRINT("The Creator Don't support type %d, %s\n", op->type(), op->name()->c_str());
MNN_PRINT("The Creator Don't support type %s, %s\n", EnumNameOpType(op->type()), op->name()->c_str());
} else {
// MNN_PRINT("The Creator Don't support type %s\n", EnumNameOpType(op->type()));
MNN_PRINT("The Creator Don't support type %s\n", EnumNameOpType(op->type()));
}
return NULL;
}
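With the switch to BufferAllocator, an allocation is a (base pointer, byte offset) pair rather than a raw pointer: the tensor's device address is base + offset, and the offset is stashed in the tensor describe so onReleaseBuffer can hand the same pair back to the allocator. A small sketch of that convention (hypothetical helper names, assuming alloc/free mirror the calls above):

#include <cstdint>
#include <utility>

// Turn an allocator result into the address stored in the tensor, remembering the offset.
static uint64_t AttachDeviceAddress(std::pair<void*, int> buffer, int* savedOffset) {
    auto host = (uint8_t*)buffer.first + buffer.second; // address the kernels will see
    *savedOffset = buffer.second;                       // kept as des->extra.offset
    return (uint64_t)host;
}

// Invert the attach step so the allocator receives the original (base, offset) pair.
static std::pair<void*, int> RecoverAllocation(uint64_t deviceId, int savedOffset) {
    return std::make_pair((void*)((uint8_t*)deviceId - savedOffset), savedOffset);
}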

View File

@ -12,12 +12,11 @@
#include <set>
#include <vector>
#include "MNN_generated.h"
#include "backend/cuda/core/BufferPool.hpp"
#include "backend/cuda/core/runtime/CUDARuntime.hpp"
#include "core/Backend.hpp"
#include "core/Macro.h"
#include "core/ConvolutionCommon.hpp"
#include "core/BufferAllocator.hpp"
namespace MNN {
namespace CUDA {
class MNN_PUBLIC CUDARuntimeWrapper : public Runtime {
@ -31,15 +30,14 @@ public:
}
private:
std::shared_ptr<BufferPool> mBufferPool;
std::shared_ptr<BufferPool> mStaticBufferPool;
std::shared_ptr<BufferAllocator> mBufferPool;
std::shared_ptr<CUDARuntime> mCUDARuntime;
bool mIsCreateError{false};
};
class CUDABackend final : public Backend {
public:
CUDABackend(std::shared_ptr<BufferPool> dy, std::shared_ptr<BufferPool> st, std::shared_ptr<CUDARuntime> rt);
CUDABackend(std::shared_ptr<BufferAllocator> st, std::shared_ptr<CUDARuntime> rt);
~CUDABackend();
CUDARuntime *getCUDARuntime();
@ -64,10 +62,10 @@ public:
static bool addCreator(OpType t, Creator *c);
BufferPool *getBufferPool() const {
BufferAllocator *getBufferPool() const {
return mBufferPool.get();
}
BufferPool *getStaticBufferPool() const {
BufferAllocator *getStaticBufferPool() const {
return mStaticBufferPool.get();
}
virtual std::pair<float, bool> onMeasure(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
@ -75,10 +73,8 @@ public:
static size_t realSize(const Tensor *tensor);
private:
std::set<void *> mStatic;
std::set<void *> mDynamic;
std::shared_ptr<BufferPool> mBufferPool;
std::shared_ptr<BufferPool> mStaticBufferPool;
std::shared_ptr<BufferAllocator> mBufferPool;
std::shared_ptr<BufferAllocator> mStaticBufferPool;
std::shared_ptr<CUDARuntime> mCUDARuntime;
};

View File

@ -57,6 +57,7 @@ CUDARuntime::CUDARuntime(bool permitFloat16, int device_id) {
// Note that all cublas scalars (alpha, beta) and scalar results such as dot
// output resides at device side.
cublas_check(cublasSetPointerMode(mCublasHandle, CUBLAS_POINTER_MODE_HOST));
cudnn_check(cudnnCreate(&mCudnnHandle));
}
CUDARuntime::~CUDARuntime() {
@ -64,13 +65,27 @@ CUDARuntime::~CUDARuntime() {
MNN_PRINT("start ~CUDARuntime !\n");
#endif
cublas_check(cublasDestroy(mCublasHandle));
cudnn_check(cudnnDestroy(mCudnnHandle));
#ifdef LOG_VERBOSE
MNN_PRINT("end ~CUDARuntime !\n");
#endif
}
int CUDARuntime::blocks_num(const int total_threads) const {
return (total_threads + mProp.maxThreadsPerBlock - 1) / mProp.maxThreadsPerBlock;
int CUDARuntime::blocks_num(const int total_threads) {
int maxNum = mProp.maxThreadsPerBlock;
if(total_threads / 32 > maxNum) {
mThreadPerBlock = maxNum;
} else if(total_threads / 16 > maxNum) {
mThreadPerBlock = maxNum / 2;
} else if(total_threads / 8 > maxNum) {
mThreadPerBlock = maxNum / 4;
} else if(total_threads / 4 > maxNum) {
mThreadPerBlock = maxNum / 8;
} else {
mThreadPerBlock = 128;
}
return (total_threads + mThreadPerBlock - 1) / mThreadPerBlock;
}
bool CUDARuntime::isSupportedFP16() const {
@ -126,6 +141,7 @@ void CUDARuntime::memcpy(void *dst, const void *src, size_t size_in_bytes, MNNMe
default:
MNN_ERROR("bad cuda memcpy kind\n");
}
// TODO: support async memcpy later
cuda_check(cudaMemcpy(dst, src, size_in_bytes, cuda_kind));
}
@ -137,4 +153,8 @@ cublasHandle_t CUDARuntime::cublas_handle() {
return mCublasHandle;
}
cudnnHandle_t CUDARuntime::cudnn_handle() {
return mCudnnHandle;
}
} // namespace MNN
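blocks_num now also chooses the block size: heavy workloads keep the device maximum, lighter ones drop toward 128 threads per block, and the grid size is derived from whichever block size was picked. A standalone sketch of the same heuristic (not part of the patch):

// Pick (threads per block, blocks) the way the updated blocks_num does.
static void LaunchConfig(int totalThreads, int maxThreadsPerBlock,
                         int* threadsPerBlock, int* blocks) {
    int maxNum = maxThreadsPerBlock;
    if (totalThreads / 32 > maxNum) {
        *threadsPerBlock = maxNum;
    } else if (totalThreads / 16 > maxNum) {
        *threadsPerBlock = maxNum / 2;
    } else if (totalThreads / 8 > maxNum) {
        *threadsPerBlock = maxNum / 4;
    } else if (totalThreads / 4 > maxNum) {
        *threadsPerBlock = maxNum / 8;
    } else {
        *threadsPerBlock = 128;
    }
    *blocks = (totalThreads + *threadsPerBlock - 1) / *threadsPerBlock;
}

// Example: with maxThreadsPerBlock = 1024, 1,000,000 elements give 1024 threads
// per block and 977 blocks, while 2,000 elements fall back to 128 threads and 16 blocks.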

View File

@ -106,26 +106,29 @@ public:
void memcpy(void *dst, const void *src, size_t size_in_bytes, MNNMemcpyKind_t kind, bool sync = false);
void memset(void *dst, int value, size_t size_in_bytes);
cublasHandle_t cublas_handle();
cudnnHandle_t cudnn_handle();
int threads_num() const {
return mProp.maxThreadsPerBlock;
int threads_num() {
return mThreadPerBlock;
}
int major_sm() const {
return mProp.major;
}
int blocks_num(const int total_threads) const;
int blocks_num(const int total_threads);
private:
cudaDeviceProp mProp;
int mDeviceId;
cublasHandle_t mCublasHandle;
cudnnHandle_t mCudnnHandle;
bool mIsSupportedFP16 = false;
bool mSupportDotInt8 = false;
bool mSupportDotAccInt8 = false;
float mFlops = 4.0f;
bool mIsCreateError{false};
int mThreadPerBlock = 128;
};
} // namespace MNN

View File

@ -0,0 +1,80 @@
#include "ArgMaxExecution.hpp"
namespace MNN {
namespace CUDA {
template <typename T>
__global__ void ARGMAX(const int count, const int outside, const int inside, const int dim,
const T *input, T *output) {
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
const int o = i / inside;
const int n = i % inside;
T* outPtr = output + inside * o;
const T* inpPtr = input + inside * dim * o;
int index = 0;
T maxValue = inpPtr[0];
for(int j=1; j<dim; j++) {
T value = inpPtr[j*inside];
if(maxValue < value) {
index = j;
maxValue = value;
}
}
outPtr[n] = index;
}
return;
}
ArgMaxExecution::ArgMaxExecution(const Op* op, Backend *backend) : Execution(backend) {
mOp = op;
mAxis = mOp->main_as_ArgMax()->axis();
}
ArgMaxExecution::~ArgMaxExecution(){
// Do nothing
}
ErrorCode ArgMaxExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto input = inputs[0];
auto output = outputs[0];
if (mAxis < 0) {
mAxis = input->dimensions() + mAxis;
}
mInside = 1;
mOutside = 1;
for (int i=0; i<mAxis; ++i) {
mOutside *= input->length(i);
}
for (int i=mAxis+1; i<input->dimensions(); ++i) {
mInside *= input->length(i);
}
mDim = input->length(mAxis);
return NO_ERROR;
}
ErrorCode ArgMaxExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto runtime = static_cast<CUDABackend *>(backend())->getCUDARuntime();
auto input = (void *)inputs[0]->deviceId();
auto output = (void *)outputs[0]->deviceId();
int count = mOutside * mInside;
int block_num = runtime->blocks_num(count);
int thread_num = runtime->threads_num();
ARGMAX<<<block_num, thread_num>>>(count, mOutside, mInside, mDim, (const float*)input,(float *)output);
return NO_ERROR;
}
class ArgMaxCreator : public CUDABackend::Creator {
public:
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op, Backend* backend) const override {
return new ArgMaxExecution(op, backend);
}
};
static CUDACreatorRegister<ArgMaxCreator> __init(OpType_ArgMax);
}
}
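The ARGMAX kernel flattens the tensor into (outside, dim, inside) and, for each (outside, inside) position, scans the reduced axis with stride inside. A CPU reference of the same reduction (a verification sketch, not part of the patch):

// CPU reference for ARGMAX: write the index of the maximum along the reduced
// axis of length dim; elements along that axis are inside floats apart.
static void ArgMaxRef(const float* input, float* output,
                      int outside, int inside, int dim) {
    for (int o = 0; o < outside; ++o) {
        for (int n = 0; n < inside; ++n) {
            const float* inpPtr = input + o * inside * dim + n;
            int index = 0;
            float maxValue = inpPtr[0];
            for (int j = 1; j < dim; ++j) {
                if (inpPtr[j * inside] > maxValue) {
                    maxValue = inpPtr[j * inside];
                    index = j;
                }
            }
            output[o * inside + n] = (float)index;
        }
    }
}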

View File

@ -0,0 +1,33 @@
//
// ArgMaxExecution.hpp
// MNN
//
// Created by MNN on 2020/07/29.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef ArgMaxExecution_hpp
#define ArgMaxExecution_hpp
#include <vector>
#include "backend/cuda/core/CUDABackend.hpp"
#include "core/Execution.hpp"
namespace MNN {
namespace CUDA {
class ArgMaxExecution : public Execution {
public:
ArgMaxExecution(const Op* op, Backend *backend);
virtual ~ArgMaxExecution();
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
const Op* mOp;
int mAxis;
int mInside;
int mOutside;
int mDim;
};
} // namespace CUDA
} // namespace MNN
#endif

View File

@ -0,0 +1,117 @@
#include "BatchMatMulExecution.hpp"
namespace MNN {
namespace CUDA {
template <typename T>
__global__ void transpose_bias(T *input, T *output, const T* bias, int batch, int e, int h) {
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < batch * e * h; index += blockDim.x * gridDim.x) {
int i = index % (e*h);
int b = index / (e*h);
int y = i / e;
output[index] = input[index] + bias[b * h + y];
}
return;
}
BatchMatMulExecution::BatchMatMulExecution(bool transposeA, bool transposeB, Backend *backend) : Execution(backend) {
mTransposeA = transposeA;
mTransposeB = transposeB;
}
BatchMatMulExecution::~ BatchMatMulExecution() {
// do nothing
}
ErrorCode BatchMatMulExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto C = outputs[0];
auto dimensions = C->dimensions();
int batch = 1;
for (int i = 0; i < dimensions - 2; ++i) {
batch *= C->length(i);
}
auto e = C->length(dimensions-2);
auto h = C->length(dimensions-1);
if(inputs.size() > 2) {
mTempOutput.reset(Tensor::createDevice<float>({batch*h*e}));
auto res = backend()->onAcquireBuffer(mTempOutput.get(), Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;
}
backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC);
}
return NO_ERROR;
}
ErrorCode BatchMatMulExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
auto blasHandle = runtime->cublas_handle();
const Tensor* A = inputs[0];
const Tensor* B = inputs[1];
auto dimensions = A->dimensions();
int batch = 1;
for (int i = 0; i < dimensions - 2; ++i) {
batch *= A->length(i);
}
auto w0 = inputs[0]->length(dimensions-1);
auto h0 = inputs[0]->length(dimensions-2);
auto C = outputs[0];
auto e = C->length(dimensions-2);
auto h = C->length(dimensions-1);
auto l = w0;
if (mTransposeA) {
l = h0;
}
auto APtr = (const float*)A->deviceId();
auto BPtr = (const float*)B->deviceId();
auto CDestPtr = (float*)C->deviceId();
float alpha = 1.0f;
float beta = 0.0f;
auto tranB = CUBLAS_OP_N;
auto ldB = h;
if (mTransposeB) {
ldB = l;
tranB = CUBLAS_OP_T;
}
auto tranA = CUBLAS_OP_N;
auto ldA = l;
if (mTransposeA) {
ldA = e;
tranA = CUBLAS_OP_T;
}
if(inputs.size() == 2) {
auto status = cublasSgemmStridedBatched(blasHandle, tranB, tranA, h, e, l, &alpha, BPtr, ldB, l*h, APtr, ldA, e*l, &beta, CDestPtr, h, e*h, batch);
cublas_check(status);
//cudaThreadSynchronize();
} else {
auto CPtr = (float*)mTempOutput->deviceId();
auto status = cublasSgemmStridedBatched(blasHandle, tranB, tranA, h, e, l, &alpha, BPtr, ldB, l*h, APtr, ldA, e*l, &beta, CPtr, h, e*h, batch);
cublas_check(status);
//cudaThreadSynchronize();
// Transpose batch, h, e -> batch, e, h
int block_num = runtime->blocks_num(batch*e*h);
int threads_num = runtime->threads_num();
transpose_bias<<<block_num, threads_num>>>((float*)CPtr, (float*)CDestPtr, (const float*)inputs[2]->deviceId(), batch, e, h);
}
return NO_ERROR;
}
class BatchMatMulCreator : public CUDABackend::Creator {
public:
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op, Backend* backend) const override {
auto param = op->main_as_BatchMatMulParam();
return new BatchMatMulExecution(param->adjX(), param->adjY(), backend);
}
};
static CUDACreatorRegister<BatchMatMulCreator> __init(OpType_BatchMatMul);
}
}
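cuBLAS is column-major, so the row-major C = op(A) * op(B) is obtained by computing C^T = op(B)^T * op(A)^T, which is why the operand order, transpose flags, and the (h, e, l) dimensions are swapped in the cublasSgemmStridedBatched call above. A row-major CPU reference for one batch (bias handling omitted for simplicity; this is a sketch, not the execution's code path):

// Row-major reference for one batch of C[e, h] = op(A)[e, l] * op(B)[l, h].
static void BatchMatMulRef(const float* A, const float* B, float* C,
                           int e, int l, int h, bool transposeA, bool transposeB) {
    for (int y = 0; y < e; ++y) {
        for (int x = 0; x < h; ++x) {
            float sum = 0.0f;
            for (int k = 0; k < l; ++k) {
                float a = transposeA ? A[k * e + y] : A[y * l + k];
                float b = transposeB ? B[x * l + k] : B[k * h + x];
                sum += a * b;
            }
            C[y * h + x] = sum;
        }
    }
}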

View File

@ -0,0 +1,31 @@
//
// BatchMatMulExecution.hpp
// MNN
//
// Created by MNN on 2020/07/30.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef BatchMatMulExecution_hpp
#define BatchMatMulExecution_hpp
#include <vector>
#include "backend/cuda/core/CUDABackend.hpp"
#include "core/Execution.hpp"
namespace MNN {
namespace CUDA {
class BatchMatMulExecution : public Execution {
public:
BatchMatMulExecution(bool transposeA, bool transposeB, Backend *backend);
virtual ~BatchMatMulExecution();
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
std::shared_ptr<Tensor> mTempOutput;
bool mTransposeA;
bool mTransposeB;
};
} // namespace CUDA
} // namespace MNN
#endif

View File

@ -48,14 +48,16 @@ __global__ void MUL(const T *input0, const T* input1, T *output, size_t count, s
template <typename T>
__global__ void DIV(const T *input0, const T* input1, T *output, size_t count, size_t s0, size_t s1) {
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
output[i] = input0[i * s0] / input1[i * s1];
int sgn = input1[i * s1] > 0 ? 1 : (input1[i * s1] < 0 ? -1 : 0);
output[i] = sgn * input0[i * s0] / max(abs((float)input1[i * s1]), 0.0000001);
}
return;
}
template <typename T>
__global__ void REALDIV(const T *input0, const T* input1, T *output, size_t count, size_t s0, size_t s1) {
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
output[i] = input0[i * s0] / input1[i * s1];
int sgn = input1[i * s1] > 0 ? 1 : (input1[i * s1] < 0 ? -1 : 0);
output[i] = sgn * input0[i * s0] / max(abs((float)input1[i * s1]), 0.0000001);
}
return;
}
@ -123,7 +125,9 @@ __global__ void NOTEQUAL(const T *input0, const T* input1, int *output, size_t c
template <typename T>
__global__ void FLOORDIV(const T *input0, const T* input1, T *output, size_t count, size_t s0, size_t s1) {
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
output[i] = floor(1.0*(input0[i * s0] / input1[i * s1]));
int sgn = input1[i * s1] > 0 ? 1 : (input1[i * s1] < 0 ? -1 : 0);
output[i] = floor(1.0*sgn * input0[i * s0] / max(abs((float)input1[i * s1]), 0.0000001));
}
return;
}
@ -133,7 +137,10 @@ __global__ void FLOORMOD(const T *input0, const T* input1, T *output, size_t cou
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
T x = input0[i * s0];
T y = input1[i * s1];
output[i] = x - floor(1.0*(x / y)) * y;
int sgn = y > 0 ? 1 : (y < 0 ? -1 : 0);
T tmp = floor(1.0*sgn * x / max((float)abs(y), 0.0000001));
output[i] = x - tmp * y;
}
return;
}
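The DIV/REALDIV/FLOORDIV/FLOORMOD kernels are changed here to guard against division by zero: the divisor's sign is taken separately and its magnitude is clamped to 1e-7, so a zero divisor yields 0 (sign factor 0) rather than inf or NaN. A scalar sketch of the same guard:

#include <algorithm>
#include <cmath>

// Zero-safe division used by the updated kernels: sgn(y) * x / max(|y|, 1e-7).
static float SafeDiv(float x, float y) {
    int sgn = (y > 0.f) ? 1 : ((y < 0.f) ? -1 : 0);
    return sgn * x / std::max(std::fabs(y), 1e-7f);
}

// Matches the updated FLOORMOD: x - floor(safe_div(x, y)) * y.
static float SafeFloorMod(float x, float y) {
    int sgn = (y > 0.f) ? 1 : ((y < 0.f) ? -1 : 0);
    float q = std::floor(sgn * x / std::max(std::fabs(y), 1e-7f));
    return x - q * y;
}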

View File

@ -54,7 +54,7 @@ ErrorCode ConvDepthWiseExecution::onResize(const std::vector<Tensor *> &inputs,
parameters.activationType = convCommon->relu() ? 1 : (convCommon->relu6() ? 2 : 0);
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
runtime->memcpy(mConstBuffer, &parameters, sizeof(constBuffer), MNNMemcpyHostToDevice);
runtime->memcpy((uint8_t*)mConstBuffer.first + mConstBuffer.second, &parameters, sizeof(constBuffer), MNNMemcpyHostToDevice);
mTotalCount = parameters.total;
if(inputs.size() == 1) {
@ -149,16 +149,17 @@ ErrorCode ConvDepthWiseExecution::onExecute(const std::vector<Tensor *> &inputs,
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
int block_num = runtime->blocks_num(mTotalCount);
int threads_num = runtime->threads_num();
auto constPtr = (uint8_t*)mConstBuffer.first + mConstBuffer.second;
if (inputs.size() == 1) {
CONV_DW<<<block_num, threads_num>>>((const float*)inputs[0]->deviceId(), (const float*)mFilter,
(const float*)mBias, (float*)outputs[0]->deviceId(), (const constBuffer*)mConstBuffer);
(const float*)mBias, (float*)outputs[0]->deviceId(), (const constBuffer*)(constPtr));
} else if (inputs.size() == 3) {
CONV_DW<<<block_num, threads_num>>>((const float*)inputs[0]->deviceId(), (const float*)inputs[1]->deviceId(),
(const float*)inputs[2]->deviceId(), (float*)outputs[0]->deviceId(), (const constBuffer*)mConstBuffer);
(const float*)inputs[2]->deviceId(), (float*)outputs[0]->deviceId(), (const constBuffer*)constPtr);
} else {
MNN_ASSERT(inputs.size() == 2);
CONV_DW<<<block_num, threads_num>>>((const float*)inputs[0]->deviceId(), (const float*)inputs[1]->deviceId(),
nullptr, (float*)outputs[0]->deviceId(), (const constBuffer*)mConstBuffer);
nullptr, (float*)outputs[0]->deviceId(), (const constBuffer*)constPtr);
}
return NO_ERROR;
}
@ -249,9 +250,10 @@ ErrorCode DeconvDepthWiseExecution::onResize(const std::vector<Tensor *> &inputs
parameters.outputSize[1] = outputs[0]->height();
parameters.total = parameters.channel * parameters.outputSize[1] * parameters.outputSize[0];
parameters.subChannel = inputs[0]->channel();
auto constPtr = (uint8_t*)mConstBuffer.first + mConstBuffer.second;
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
runtime->memcpy(mConstBuffer, &parameters, sizeof(constBuffer), MNNMemcpyHostToDevice);
runtime->memcpy(constPtr, &parameters, sizeof(constBuffer), MNNMemcpyHostToDevice);
mTotalCount = parameters.total;
return NO_ERROR;
}
@ -260,12 +262,13 @@ ErrorCode DeconvDepthWiseExecution::onExecute(const std::vector<Tensor *> &input
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
int block_num = runtime->blocks_num(mTotalCount);
int threads_num = runtime->threads_num();
auto constPtr = (uint8_t*)mConstBuffer.first + mConstBuffer.second;
if (inputs.size() > 2) {
DECONV_DW<<<block_num, threads_num>>>((const float*)inputs[0]->deviceId(), (const float*)inputs[1]->deviceId(),
(const float*)inputs[2]->deviceId(), (float*)outputs[0]->deviceId(), (const constBuffer*)mConstBuffer);
(const float*)inputs[2]->deviceId(), (float*)outputs[0]->deviceId(), (const constBuffer*)constPtr);
} else {
DECONV_DW<<<block_num, threads_num>>>((const float*)inputs[0]->deviceId(), (const float*)inputs[1]->deviceId(),
nullptr, (float*)outputs[0]->deviceId(), (const constBuffer*)mConstBuffer);
nullptr, (float*)outputs[0]->deviceId(), (const constBuffer*)constPtr);
}
return NO_ERROR;
}

View File

@ -22,7 +22,7 @@ public:
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
protected:
void *mConstBuffer;
std::pair<void*, int> mConstBuffer;
const Op *mOp;
int mTotalCount;

View File

@ -70,7 +70,8 @@ ConvSingleInputExecution::ConvSingleInputExecution(Backend* backend, const MNN::
cudnn_data_type_ = CUDNN_DATA_FLOAT;
cudnn_data_type_len_ = 0;
cudnn_check(cudnnCreate(&cudnn_handle_));
auto runtime = static_cast<CUDABackend*>(backend)->getCUDARuntime();
cudnn_handle_ = runtime->cudnn_handle();
cudnn_check(cudnnCreateTensorDescriptor(&input_desc_));
cudnn_check(cudnnCreateTensorDescriptor(&output_desc_));
cudnn_check(cudnnCreateTensorDescriptor(&padded_desc_));
@ -79,7 +80,7 @@ ConvSingleInputExecution::ConvSingleInputExecution(Backend* backend, const MNN::
cudnn_check(cudnnCreateConvolutionDescriptor(&conv_desc_));
cudnn_check(cudnnCreateActivationDescriptor(&act_desc_));
//weight host->device
const float* filterDataPtr = nullptr;
int weightSize = 0;
@ -111,10 +112,24 @@ ConvSingleInputExecution::ConvSingleInputExecution(Backend* backend, const MNN::
}
use_bias_ = true;
}
mKernelInfo.kernelN = common->outputCount();
mKernelInfo.kernelC = weightSize / (mKernelInfo.kernelN * mKernelInfo.kernelY * mKernelInfo.kernelX);
std::vector<int> filter_shape = {mKernelInfo.kernelN, mKernelInfo.kernelC, mKernelInfo.kernelY, mKernelInfo.kernelX};
cudnn_check(cudnnSetFilter4dDescriptor(filter_desc_, cudnn_data_type_, CUDNN_TENSOR_NCHW, filter_shape[0],
filter_shape[1], filter_shape[2], filter_shape[3]));
cudnn_check(cudnnSetConvolution2dDescriptor(conv_desc_, 0, 0, mKernelInfo.strideY, mKernelInfo.strideX,
mKernelInfo.dilateY, mKernelInfo.dilateX, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));
if (cudnn_data_type_ == CUDNN_DATA_HALF) {
cudnn_check(cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH));
}
//set group num
cudnn_check(cudnnSetConvolutionGroupCount(conv_desc_, mKernelInfo.groups));
}
ConvSingleInputExecution::~ConvSingleInputExecution() {
cudnn_check(cudnnDestroy(cudnn_handle_));
cudnn_check(cudnnDestroyConvolutionDescriptor(conv_desc_));
cudnn_check(cudnnDestroyFilterDescriptor(filter_desc_));
cudnn_check(cudnnDestroyTensorDescriptor(padded_desc_));
@ -152,9 +167,32 @@ ErrorCode ConvSingleInputExecution::onResize(const std::vector<Tensor*> &inputs,
mKernelInfo.kernelN = output->channel();
mKernelInfo.kernelC = input->channel() / mKernelInfo.groups;
if(mIOInfo.iw==0) {
mIOInfo.iw = 1;
}
if(mIOInfo.ih==0) {
mIOInfo.ih = 1;
}
if(mIOInfo.ic==0) {
mIOInfo.ic = 1;
}
if(mIOInfo.ib==0) {
mIOInfo.ib = 1;
}
if(mIOInfo.ow==0) {
mIOInfo.ow = 1;
}
if(mIOInfo.oh==0) {
mIOInfo.oh = 1;
}
if(mIOInfo.oc==0) {
mIOInfo.oc = 1;
}
if(mIOInfo.ob==0) {
mIOInfo.ob = 1;
}
std::vector<int> in_shape = {mIOInfo.ib, mIOInfo.ic, mIOInfo.ih, mIOInfo.iw};
std::vector<int> output_shape = {mIOInfo.ob, mIOInfo.oc, mIOInfo.oh, mIOInfo.ow};
std::vector<int> filter_shape = {mKernelInfo.kernelN, mKernelInfo.kernelC, mKernelInfo.kernelY, mKernelInfo.kernelX};
// printf("filter:%d %d %d %d\n", filter_shape[0], filter_shape[1], filter_shape[2], filter_shape[3]);
// printf("input:%d %d %d %d\n", in_shape[0], in_shape[1], in_shape[2], in_shape[3]);
@ -162,8 +200,6 @@ ErrorCode ConvSingleInputExecution::onResize(const std::vector<Tensor*> &inputs,
cudnn_check(cudnnSetTensor4dDescriptor(input_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, in_shape[0],
in_shape[1], in_shape[2], in_shape[3]));
cudnn_check(cudnnSetFilter4dDescriptor(filter_desc_, cudnn_data_type_, CUDNN_TENSOR_NCHW, filter_shape[0],
filter_shape[1], filter_shape[2], filter_shape[3]));
cudnn_check(cudnnSetTensor4dDescriptor(output_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, output_shape[0],
output_shape[1], output_shape[2], output_shape[3]));
@ -205,14 +241,6 @@ ErrorCode ConvSingleInputExecution::onResize(const std::vector<Tensor*> &inputs,
}
input_descriptor_real = use_pad_ ? padded_desc_ : input_desc_;
cudnn_check(cudnnSetConvolution2dDescriptor(conv_desc_, 0, 0, mKernelInfo.strideY, mKernelInfo.strideX,
mKernelInfo.dilateY, mKernelInfo.dilateX, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));
if (cudnn_data_type_ == CUDNN_DATA_HALF) {
cudnn_check(cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH));
}
//set group num
cudnn_check(cudnnSetConvolutionGroupCount(conv_desc_, mKernelInfo.groups));
// algorithm
constexpr int requested_algo_count = 1;
int returned_algo_count;
@ -246,7 +274,6 @@ ErrorCode ConvSingleInputExecution::onResize(const std::vector<Tensor*> &inputs,
ErrorCode ConvSingleInputExecution::onExecute(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs) {
//MNN_PRINT("cuda convSingleInput onExecute in, inputsize:%d %d\n", (int)inputs.size(), workspace_size_);
MNN_ASSERT(inputs.size() == 1);
MNN_ASSERT(outputs.size() == 1);
@ -264,7 +291,6 @@ ErrorCode ConvSingleInputExecution::onExecute(const std::vector<Tensor*> &inputs
const float alpha = 1;
const float beta = 0;
if(use_pad_) {
std::vector<int> in_shape = {mIOInfo.ib, mIOInfo.ic, mIOInfo.ih, mIOInfo.iw};
@ -289,6 +315,7 @@ ErrorCode ConvSingleInputExecution::onExecute(const std::vector<Tensor*> &inputs
if(use_relu_ || use_relu6_) {
cudnn_check(cudnnActivationForward(cudnn_handle_, act_desc_, &alpha, output_desc_, output_addr, &beta, output_desc_, output_addr));
}
return NO_ERROR;
}

View File

@ -65,7 +65,8 @@ DeconvSingleInputExecution::DeconvSingleInputExecution(Backend* backend, const M
cudnn_data_type_ = CUDNN_DATA_FLOAT;
cudnn_data_type_len_ = 0;
cudnn_check(cudnnCreate(&cudnn_handle_));
auto runtime = static_cast<CUDABackend*>(backend)->getCUDARuntime();
cudnn_handle_ = runtime->cudnn_handle();
cudnn_check(cudnnCreateTensorDescriptor(&input_desc_));
cudnn_check(cudnnCreateTensorDescriptor(&output_desc_));
cudnn_check(cudnnCreateTensorDescriptor(&padded_desc_));
@ -110,7 +111,6 @@ DeconvSingleInputExecution::DeconvSingleInputExecution(Backend* backend, const M
}
DeconvSingleInputExecution::~DeconvSingleInputExecution() {
cudnn_check(cudnnDestroy(cudnn_handle_));
cudnn_check(cudnnDestroyConvolutionDescriptor(conv_desc_));
cudnn_check(cudnnDestroyFilterDescriptor(filter_desc_));
cudnn_check(cudnnDestroyTensorDescriptor(padded_desc_));

View File

@ -0,0 +1,101 @@
#include "GatherV2Execution.hpp"
namespace MNN {
namespace CUDA {
template <typename T>
__global__ void GATHERV2(const int count, const int outside, const int inside, const int iNum, const int oNum,
const T *input, const int* indice, T *output) {
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
const int o = i / oNum;
const int n = i % oNum;
T* outPtr = output + inside * oNum * o;
const T* inpPtr = input + inside * iNum * o;
for(int j=0; j<inside; j++) {
outPtr[n*inside+j] = inpPtr[indice[n]*inside+j];
}
}
return;
}
GatherV2Execution::GatherV2Execution(const Op* op, Backend *backend) : Execution(backend) {
mOp = op;
}
GatherV2Execution::~GatherV2Execution(){
// Do nothing
}
ErrorCode GatherV2Execution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto params = inputs[0];
mAxis = 0;
if (mOp->main_type() == OpParameter_Axis) {
mAxis = mOp->main_as_Axis()->axis();
}
MNN_ASSERT(mAxis > -params->buffer().dimensions && mAxis < params->buffer().dimensions);
if (mAxis < 0) {
mAxis = params->buffer().dimensions + mAxis;
}
auto indices = inputs[1];
auto output = outputs[0];
mOutNum = indices->elementSize();
mInside = 1;
mOutside = 1;
for (int i=0; i<mAxis; ++i) {
mOutside *= params->length(i);
}
for (int i=mAxis+1; i<params->dimensions(); ++i) {
mInside *= params->length(i);
}
mInpNum = params->length(mAxis);
return NO_ERROR;
}
ErrorCode GatherV2Execution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto runtime = static_cast<CUDABackend *>(backend())->getCUDARuntime();
auto params = (void *)inputs[0]->deviceId();
auto indices = (void *)inputs[1]->deviceId();
auto output = (void *)outputs[0]->deviceId();
if (inputs.size() == 3) {
cudaMemcpy(&mAxis, (void *)inputs[2]->deviceId(), sizeof(int), cudaMemcpyDeviceToHost);
auto input0 = inputs[0];
MNN_ASSERT(mAxis > -input0->dimensions() && mAxis < input0->dimensions());
if (mAxis < 0) {
mAxis = input0->dimensions() + mAxis;
}
mInside = 1;
mOutside = 1;
for (int i=0; i<mAxis; ++i) {
mOutside *= input0->length(i);
}
for (int i=mAxis+1; i<input0->dimensions(); ++i) {
mInside *= input0->length(i);
}
mInpNum = input0->length(mAxis);
}
int count = mOutside * mOutNum;
int block_num = runtime->blocks_num(count);
int thread_num = runtime->threads_num();
GATHERV2<<<block_num, thread_num>>>(count, mOutside, mInside, mInpNum, mOutNum, (const float*)params, (int *)indices,
(float *)output);
return NO_ERROR;
}
class GatherV2Creator : public CUDABackend::Creator {
public:
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op, Backend* backend) const override {
return new GatherV2Execution(op, backend);
}
};
static CUDACreatorRegister<GatherV2Creator> __init(OpType_GatherV2);
}
}
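The GATHERV2 kernel copies, for every outside slice, the inside-sized rows selected by the indices from an input of iNum rows into an output of oNum rows. A CPU reference of the same copy (a verification sketch, not part of the patch):

// CPU reference for GATHERV2: out[o][n][:] = in[o][indice[n]][:] with row size "inside".
static void GatherV2Ref(const float* input, const int* indice, float* output,
                        int outside, int inside, int iNum, int oNum) {
    for (int o = 0; o < outside; ++o) {
        const float* inpPtr = input + o * inside * iNum;
        float* outPtr = output + o * inside * oNum;
        for (int n = 0; n < oNum; ++n) {
            for (int j = 0; j < inside; ++j) {
                outPtr[n * inside + j] = inpPtr[indice[n] * inside + j];
            }
        }
    }
}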

View File

@ -0,0 +1,34 @@
//
// GatherV2Execution.hpp
// MNN
//
// Created by MNN on 2020/07/29.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef GatherV2Execution_hpp
#define GatherV2Execution_hpp
#include <vector>
#include "backend/cuda/core/CUDABackend.hpp"
#include "core/Execution.hpp"
namespace MNN {
namespace CUDA {
class GatherV2Execution : public Execution {
public:
GatherV2Execution(const Op* op, Backend *backend);
virtual ~GatherV2Execution();
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
const Op* mOp;
int mAxis;
int mInside;
int mOutside;
int mInpNum;
int mOutNum;
};
} // namespace CUDA
} // namespace MNN
#endif

View File

@ -0,0 +1,352 @@
#include "LayerNormExecution.hpp"
namespace MNN {
namespace CUDA {
#define CUDA_KERNEL_LOOP(i, n) for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
#define FINAL_MASK 0xffffffff
template <typename T>
__inline__ __device__
T warpReduceSum(T val)
{
for(int mask = 16; mask > 0; mask >>= 1)
val += __shfl_xor_sync(FINAL_MASK, val, mask, 32);
return val;
}
template <typename T>
__inline__ __device__
T blockReduceSum(T val)
{
static __shared__ T shared[32];
int lane = threadIdx.x & 0x1f;
int wid = threadIdx.x >> 5;
val = warpReduceSum<T>(val);
if(lane == 0)
shared[wid] = val;
__syncthreads();
val = (threadIdx.x < (blockDim.x >> 5 )) ? shared[lane] : (T)0.0f;
val = warpReduceSum(val);
return val;
}
template <typename T>
__global__
void input_layernorm(T* out, const T* input, const T* gamma, const T* beta, int m, int n, const float epsilon, int sumPerKnl)
{
int tid = threadIdx.x;
__shared__ float s_mean;
__shared__ float s_variance;
float mean = 0.0f;
float variance = 0.0f;
float local_out = 0.0f;
for(int idx=0; idx<sumPerKnl && idx*256 + tid < n; idx++) {
local_out += (float)(input[blockIdx.x * n + idx*256 + tid]);
}
mean = blockReduceSum<float>(local_out);
if(threadIdx.x == 0)
s_mean = mean / n;
__syncthreads();
float var_tmp = 0.0f;
for(int idx=0; idx<sumPerKnl && idx*256 + tid < n; idx++) {
var_tmp += ((input[blockIdx.x * n + idx*256 + tid] - s_mean) * (input[blockIdx.x * n + idx*256 + tid] - s_mean));
}
variance += blockReduceSum<float>(var_tmp);
if(threadIdx.x == 0)
s_variance = variance / n + epsilon;
__syncthreads();
for(int idx=0; idx<sumPerKnl && idx*256 + tid < n; idx++) {
out[blockIdx.x * n + idx*256+tid] =
(T)(((input[blockIdx.x * n + idx*256 + tid] - s_mean) * rsqrtf(s_variance)) * (float)(__ldg(&gamma[idx*256 + tid])) + (float)(__ldg(&beta[idx*256 + tid])));
}
}
template <typename T>
__global__
void input_layernorm_2048(T* out, const T* input, const T* gamma, const T* beta, int m, int n, const float epsilon)
{
int tid = threadIdx.x;
__shared__ float s_mean;
__shared__ float s_variance;
float mean = 0.0f;
float variance = 0.0f;
float local_out = 0.0f;
float value_tmp[8];
value_tmp[0] = input[blockIdx.x * 2048 + 0*256 + tid];
value_tmp[1] = input[blockIdx.x * 2048 + 1*256 + tid];
value_tmp[2] = input[blockIdx.x * 2048 + 2*256 + tid];
value_tmp[3] = input[blockIdx.x * 2048 + 3*256 + tid];
value_tmp[4] = input[blockIdx.x * 2048 + 4*256 + tid];
value_tmp[5] = input[blockIdx.x * 2048 + 5*256 + tid];
value_tmp[6] = input[blockIdx.x * 2048 + 6*256 + tid];
value_tmp[7] = input[blockIdx.x * 2048 + 7*256 + tid];
#pragma unroll(8)
for(int idx=0; idx<8; idx++) {
local_out += (float)value_tmp[idx];
}
mean = blockReduceSum<float>(local_out);
if(threadIdx.x == 0)
s_mean = mean / n;
__syncthreads();
float var_tmp = 0.0f;
#pragma unroll(8)
for(int idx=0; idx<8; idx++) {
var_tmp += ((value_tmp[idx] - s_mean) * (value_tmp[idx] - s_mean));
}
variance += blockReduceSum<float>(var_tmp);
if(threadIdx.x == 0)
s_variance = variance / n + epsilon;
__syncthreads();
#pragma unroll(8)
for(int idx=0; idx<8; idx++) {
out[blockIdx.x * 2048 + idx*256+tid] =
(T)(((value_tmp[idx] - s_mean) * rsqrtf(s_variance)) * (float)(__ldg(&gamma[idx*256 + tid])) + (float)(__ldg(&beta[idx*256 + tid])));
}
}
template <typename T>
__global__
void input_layernorm_1024(T* out, const T* input, const T* gamma, const T* beta, int m, int n, const float epsilon)
{
int tid = threadIdx.x;
__shared__ float s_mean;
__shared__ float s_variance;
float mean = 0.0f;
float variance = 0.0f;
float local_out = 0.0f;
float value_tmp[4];
value_tmp[0] = input[blockIdx.x * 1024 + 0*256 + tid];
value_tmp[1] = input[blockIdx.x * 1024 + 1*256 + tid];
value_tmp[2] = input[blockIdx.x * 1024 + 2*256 + tid];
value_tmp[3] = input[blockIdx.x * 1024 + 3*256 + tid];
#pragma unroll(4)
for(int idx=0; idx<4; idx++) {
local_out += (float)value_tmp[idx];
}
mean = blockReduceSum<float>(local_out);
if(threadIdx.x == 0)
s_mean = mean / n;
__syncthreads();
float var_tmp = 0.0f;
#pragma unroll(4)
for(int idx=0; idx<4; idx++) {
var_tmp += ((value_tmp[idx] - s_mean) * (value_tmp[idx] - s_mean));
}
variance += blockReduceSum<float>(var_tmp);
if(threadIdx.x == 0)
s_variance = variance / n + epsilon;
__syncthreads();
#pragma unroll(4)
for(int idx=0; idx<4; idx++) {
out[blockIdx.x * 1024 + idx*256+tid] =
(T)(((value_tmp[idx] - s_mean) * rsqrtf(s_variance)) * (float)(__ldg(&gamma[idx*256 + tid])) + (float)(__ldg(&beta[idx*256 + tid])));
}
}
template <typename T>
__global__
void input_layernorm_512(T* out, const T* input, const T* gamma, const T* beta, int m, int n, const float epsilon)
{
int tid = threadIdx.x;
__shared__ float s_mean;
__shared__ float s_variance;
float mean = 0.0f;
float variance = 0.0f;
float local_out = 0.0f;
float value_tmp[2];
value_tmp[0] = input[blockIdx.x * 512 + 0*256 + tid];
value_tmp[1] = input[blockIdx.x * 512 + 1*256 + tid];
local_out += (float)value_tmp[0];
local_out += (float)value_tmp[1];
mean = blockReduceSum<float>(local_out);
if(threadIdx.x == 0)
s_mean = mean / n;
__syncthreads();
float var_tmp = 0.0f;
var_tmp += ((value_tmp[0] - s_mean) * (value_tmp[0] - s_mean));
var_tmp += ((value_tmp[1] - s_mean) * (value_tmp[1] - s_mean));
variance += blockReduceSum<float>(var_tmp);
if(threadIdx.x == 0)
s_variance = variance / n + epsilon;
__syncthreads();
out[blockIdx.x * 512 + 0*256+tid] =
(T)(((value_tmp[0] - s_mean) * rsqrtf(s_variance)) * (float)(__ldg(&gamma[0*256 + tid])) + (float)(__ldg(&beta[0*256 + tid])));
out[blockIdx.x * 512 + 1*256+tid] =
(T)(((value_tmp[1] - s_mean) * rsqrtf(s_variance)) * (float)(__ldg(&gamma[1*256 + tid])) + (float)(__ldg(&beta[1*256 + tid])));
}
template<typename T>
__global__ void LAYERNORM(const int count, const int outside, const int inside, const float epsilon,
const T* in, T* out, const T* gamma_data, const T* beta_data) {
CUDA_KERNEL_LOOP(i, count) {
const int o = i / inside;
const int index = i % inside;
const T* inner_input = in + o * inside;
T* inner_output = out + o * inside;
T sum = 0.f;
for (int j = 0; j < inside; ++j) {
sum += inner_input[j];
}
T mean = sum / inside;
T square_sum = 0.f;
for (int j = 0; j < inside; ++j) {
square_sum += (inner_input[j] - mean) * (inner_input[j] - mean);
}
T variable = square_sum / inside;
variable = 1.f / sqrt(variable + epsilon);
inner_output[index] = (inner_input[index] - mean) * variable * gamma_data[index] + beta_data[index];
}
}
LayerNormExecution::LayerNormExecution(const LayerNorm* layer_norm_param, Backend *backend) : Execution(backend) {
int axis_size = layer_norm_param->axis()->size();
mAxises.resize(axis_size);
for (int i = 0; i < axis_size; ++i) {
mAxises[i] = layer_norm_param->axis()->Get(i);
}
mEps = layer_norm_param->epsilon();
int size = layer_norm_param->gamma()->size();
mGammaTensor.reset(Tensor::createDevice<float>({size}));
auto status = backend->onAcquireBuffer(mGammaTensor.get(), Backend::STATIC);
if (!status) {
MNN_ERROR("Out of memory when gamma is acquired in CudaLayerNorm.\n");
}
mDeviceGamma = (void *)mGammaTensor.get()->buffer().device;
const float* gamma_data = layer_norm_param->gamma()->data();
cudaMemcpy(mDeviceGamma, gamma_data, size * sizeof(float), cudaMemcpyHostToDevice);
if (layer_norm_param->beta()->size() != size) {
MNN_ERROR("Size of gamma and beta are not match in CudaLayerNorm.\n");
}
mBetaTensor.reset(Tensor::createDevice<float>({size}));
status = backend->onAcquireBuffer(mBetaTensor.get(), Backend::STATIC);
if (!status) {
MNN_ERROR("Out of memory when beta is acquired in CudaLayerNorm.\n");
}
mDeviceBeta = (void *)mBetaTensor.get()->buffer().device;
const float* beta_data = layer_norm_param->beta()->data();
cudaMemcpy(mDeviceBeta, beta_data, size * sizeof(float), cudaMemcpyHostToDevice);
}
LayerNormExecution::~LayerNormExecution() {
if (nullptr != mGammaTensor) {
backend()->onReleaseBuffer(mGammaTensor.get(), Backend::STATIC);
}
if (nullptr != mBetaTensor) {
backend()->onReleaseBuffer(mBetaTensor.get(), Backend::STATIC);
}
}
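// onResize collapses the input shape into outside x inside, where inside is the product of the
// trailing dimensions covered by the (normalized) axes and outside covers everything before them.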
ErrorCode LayerNormExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
MNN_ASSERT(inputs.size() == 1);
MNN_ASSERT(outputs.size() == 1);
auto input = inputs[0];
mOutside = 1;
mInside = 1;
int rank = input->dimensions();
std::vector<int> axis(mAxises.size());
for (size_t i = 0; i < mAxises.size(); ++i) {
if (mAxises[i] < 0) {
mAxises[i] += rank;
}
axis[i] = mAxises[i]; // keep the normalized axes; the original left this vector zero-filled
}
std::sort(axis.begin(), axis.end());
for (int i = 0; i < rank - axis.size(); ++i) {
mOutside *= input->length(i);
}
for (int i = rank - axis.size(); i < rank; ++i) {
mInside *= input->length(i);
}
return NO_ERROR;
}
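// Kernel dispatch (see onExecute below): small rows (inside < 128) use the generic per-element
// kernel; rows of exactly 512/1024/2048 use the register-blocked kernels with 256 threads;
// anything else falls back to input_layernorm (not shown in this hunk), with
// sumPerKnl = ceil(inside / 256) values handled per thread.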
ErrorCode LayerNormExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
int block_num = runtime->blocks_num(mOutside*mInside);
int threads_num = runtime->threads_num();
auto input_addr = (void*)inputs[0]->deviceId();
auto output_addr = (void*)outputs[0]->deviceId();
if(mInside < 128) {
LAYERNORM<<<block_num, threads_num>>>(mOutside*mInside, mOutside, mInside, mEps, (const float *)input_addr, (float *)output_addr,
(const float *)mDeviceGamma, (const float *)mDeviceBeta);
} else {
if(mInside == 2048) {
input_layernorm_2048<<<mOutside, 256>>>((float *)output_addr, (const float *)input_addr, (const float *)mDeviceGamma,
(const float *)mDeviceBeta, mOutside, mInside, mEps);
} else if(mInside == 1024) {
input_layernorm_1024<<<mOutside, 256>>>((float *)output_addr, (const float *)input_addr, (const float *)mDeviceGamma,
(const float *)mDeviceBeta, mOutside, mInside, mEps);
} else if(mInside == 512) {
input_layernorm_512<<<mOutside, 256>>>((float *)output_addr, (const float *)input_addr, (const float *)mDeviceGamma,
(const float *)mDeviceBeta, mOutside, mInside, mEps);
} else {
int sumPerKnl = (mInside+255) / 256;
input_layernorm<<<mOutside, 256>>>((float *)output_addr, (const float *)input_addr, (const float *)mDeviceGamma,
(const float *)mDeviceBeta, mOutside, mInside, mEps, sumPerKnl);
}
}
return NO_ERROR;
}
class LayerNormCreator : public CUDABackend::Creator {
public:
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op, Backend* backend) const override {
auto param = op->main_as_LayerNorm();
return new LayerNormExecution(param, backend);
}
};
static CUDACreatorRegister<LayerNormCreator> __init(OpType_LayerNorm);
} // namespace CUDA
} // namespace MNN

View File

@ -0,0 +1,49 @@
//
// LayerNormExecution.hpp
// MNN
//
// Created by MNN on 2019/01/31.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef LayerNormExecution_hpp
#define LayerNormExecution_hpp
#include "core/Execution.hpp"
#include <vector>
#include "backend/cuda/core/CUDABackend.hpp"
namespace MNN {
namespace CUDA {
class LayerNormExecution : public Execution {
public:
LayerNormExecution(const LayerNorm* layer_norm_param, Backend *backend);
virtual ~LayerNormExecution();
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
CUDARuntime *mRuntime;
void *mDeviceGamma = nullptr;
void *mDeviceBeta = nullptr;
std::vector<int> mAxises;
int mInside = 1;
int mOutside = 1;
float mEps = 0.001;
std::unique_ptr<Tensor> mGammaTensor;
std::unique_ptr<Tensor> mBetaTensor;
std::shared_ptr<Tensor> LayerNormTensor;
std::shared_ptr<Tensor> biasTensor;
};
} // namespace CUDA
} // namespace MNN
#endif /* LayerNormExecution_hpp */

View File

@ -3,20 +3,11 @@ namespace MNN {
namespace CUDA {
template <typename T>
__global__ void transpose(T *input, T *output, size_t e, size_t h) {
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < e * h; i += blockDim.x * gridDim.x) {
__global__ void transpose_bias(T *input, T *output, const T* bias, int e, int h) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < e * h; i += blockDim.x * gridDim.x) {
int y = i % e;
int x = i / e;
output[y * h + x] = input[i];
}
return;
}
template <typename T>
__global__ void transpose_bias(T *input, T *output, const T* bias, size_t e, size_t h) {
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < e * h; i += blockDim.x * gridDim.x) {
int y = i % e;
int x = i / e;
output[y * h + x] = input[i] + bias[x];
output[i] = input[i] + bias[x];
}
return;
}
@ -32,12 +23,14 @@ ErrorCode MatMulExecution::onResize(const std::vector<Tensor *> &inputs, const s
auto C = outputs[0];
auto e = C->length(0);
auto h = C->length(1);
mTempOutput.reset(Tensor::createDevice<float>({e, h}));
auto res = backend()->onAcquireBuffer(mTempOutput.get(), Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;
if(inputs.size() > 2) {
mTempOutput.reset(Tensor::createDevice<float>({e, h}));
auto res = backend()->onAcquireBuffer(mTempOutput.get(), Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;
}
backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC);
}
backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC);
return NO_ERROR;
}
@ -58,33 +51,40 @@ ErrorCode MatMulExecution::onExecute(const std::vector<Tensor *> &inputs, const
}
auto APtr = (const float*)A->deviceId();
auto BPtr = (const float*)B->deviceId();
auto CPtr = (float*)mTempOutput->deviceId();
auto CDestPtr = (float*)C->deviceId();
float alpha = 1.0f;
float beta = 0.0f;
auto tranA = CUBLAS_OP_T;
auto ldA = l;
if (mTransposeA) {
ldA = e;
tranA = CUBLAS_OP_N;
}
auto tranB = CUBLAS_OP_T;
auto tranB = CUBLAS_OP_N;
auto ldB = h;
if (mTransposeB) {
ldB = l;
tranB = CUBLAS_OP_N;
tranB = CUBLAS_OP_T;
}
auto tranA = CUBLAS_OP_N;
auto ldA = l;
if (mTransposeA) {
ldA = e;
tranA = CUBLAS_OP_T;
}
auto status = cublasSgemm(blasHandle, tranA, tranB, e, h, l, &alpha, APtr, ldA, BPtr, ldB, &beta, CPtr, e);
//cudaThreadSynchronize();
// Transpose h, e -> e, h
int block_num = runtime->blocks_num(e*h);
int threads_num = runtime->threads_num();
auto CDestPtr = (float*)C->deviceId();
if (inputs.size() > 2) {
transpose_bias<<<block_num, threads_num>>>((float*)CPtr, (float*)CDestPtr, (const float*)inputs[2]->deviceId(), e, h);
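// cuBLAS is column-major, so the row-major product C(e,h) = op(A)(e,l) * op(B)(l,h) is issued as
// C^T = op(B)^T * op(A)^T with m=h, n=e, k=l. Without a bias this writes straight into the output,
// so the extra transpose kernel is only needed on the bias path.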
if(inputs.size() == 2) {
auto status = cublasSgemm(blasHandle, tranB, tranA, h, e, l, &alpha, BPtr, ldB, APtr, ldA, &beta, CDestPtr, h);
cublas_check(status);
//cudaThreadSynchronize();
} else {
transpose<<<block_num, threads_num>>>((float*)CPtr, (float*)CDestPtr, e, h);
auto CPtr = (float*)mTempOutput->deviceId();
auto status = cublasSgemm(blasHandle, tranB, tranA, h, e, l, &alpha, BPtr, ldB, APtr, ldA, &beta, CPtr, h);
cublas_check(status);
//cudaThreadSynchronize();
// Transpose h, e -> e, h
transpose_bias<<<block_num, threads_num>>>((float*)CPtr, (float*)CDestPtr, (const float*)inputs[2]->deviceId(), e, h);
}
return NO_ERROR;
}

View File

@ -0,0 +1,78 @@
#include "SoftmaxExecution.hpp"
namespace MNN {
namespace CUDA {
SoftmaxExecution::SoftmaxExecution(int axis, Backend *backend) : Execution(backend) {
auto runtime = static_cast<CUDABackend*>(backend)->getCUDARuntime();
cudnn_handle_ = runtime->cudnn_handle();
cudnn_check(cudnnCreateTensorDescriptor(&input_desc_));
cudnn_check(cudnnCreateTensorDescriptor(&output_desc_));
cudnn_data_type_ = CUDNN_DATA_FLOAT;
mAxis = axis;
}
SoftmaxExecution::~SoftmaxExecution() {
cudnnDestroyTensorDescriptor(input_desc_);
cudnnDestroyTensorDescriptor(output_desc_);
}
ErrorCode SoftmaxExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
inside = 1;
outside = 1;
if(mAxis < 0) {
mAxis += inputs[0]->dimensions();
}
axis = inputs[0]->length(mAxis);
for (int i=0; i<mAxis; ++i) {
outside *= inputs[0]->length(i);
}
for (int i=mAxis+1; i<inputs[0]->dimensions(); ++i) {
inside *= inputs[0]->length(i);
}
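// Fold the shape into (outside, axis, inside, 1) NCHW so cuDNN's CUDNN_SOFTMAX_MODE_CHANNEL
// normalizes along the requested axis.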
std::vector<int> tensor_shape = {outside, axis, inside, 1};
cudnn_check(cudnnSetTensor4dDescriptor(input_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, tensor_shape[0],
tensor_shape[1], tensor_shape[2], tensor_shape[3]));
cudnn_check(cudnnSetTensor4dDescriptor(output_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, tensor_shape[0],
tensor_shape[1], tensor_shape[2], tensor_shape[3]));
return NO_ERROR;
}
ErrorCode SoftmaxExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto input = (void*)inputs[0]->deviceId();
auto output = (void*)outputs[0]->deviceId();
const float alpha = 1;
const float beta = 0;
cudnn_check(cudnnSoftmaxForward(cudnn_handle_, CUDNN_SOFTMAX_ACCURATE,
CUDNN_SOFTMAX_MODE_CHANNEL,
&alpha,
input_desc_, input,
&beta,
output_desc_, output));
return NO_ERROR;
}
class SoftmaxCreator : public CUDABackend::Creator {
public:
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op, Backend* backend) const override {
auto type = inputs[0]->getType();
if (type.code != halide_type_float) {
MNN_PRINT("softmax data type:%s not support", type.code);
return nullptr;
}
auto axis = op->main_as_Axis()->axis();
return new SoftmaxExecution(axis, backend);
}
};
static CUDACreatorRegister<SoftmaxCreator> __init(OpType_Softmax);
} // namespace CUDA
} // namespace MNN

View File

@ -0,0 +1,42 @@
//
// SoftmaxExecution.hpp
// MNN
//
// Created by MNN on 2019/01/31.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef SoftmaxExecution_hpp
#define SoftmaxExecution_hpp
#include "core/Execution.hpp"
#include <vector>
#include "backend/cuda/core/CUDABackend.hpp"
namespace MNN {
namespace CUDA {
class SoftmaxExecution : public Execution {
public:
SoftmaxExecution(int axis, Backend *backend);
virtual ~SoftmaxExecution();
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
cudnnHandle_t cudnn_handle_;
cudnnTensorDescriptor_t input_desc_;
cudnnTensorDescriptor_t output_desc_;
cudnnDataType_t cudnn_data_type_;
int mAxis;
int axis;
int inside;
int outside;
};
} // namespace CUDA
} // namespace MNN
#endif /* SoftmaxExecution_hpp */

View File

@ -68,17 +68,13 @@ static id<MTLBuffer> weightInBlock(MNNMetalContext *context, int group, int oc,
for (int g = 0; g < group; g++) {
auto g_dst = dst + g * goc_4 * gic_4 * kh * kw * 16; // g
#pragma clang loop vectorize(enable)
for (int o = 0; o < goc; o++) {
auto zo = o / 4, ro = o % 4;
auto o_dst = g_dst + zo * gic_4 * kh * kw * 16 + ro * 4; // o/4 x 4
#pragma clang loop vectorize(enable)
for (int i = 0; i < gic; i++) {
auto zi = i / 4, ri = i % 4;
auto i_dst = o_dst + zi * kh * kw * 16 + ri; // i/4 x 4
#pragma clang loop vectorize(enable)
for (int h = 0; h < kh; h++) {
#pragma clang loop vectorize(enable) unroll(enable)
for (int w = 0; w < kw; w++) {
// to [g][o/4][i/4][h][w][16]
// from [g][o][i][h][w]
@ -92,9 +88,6 @@ static id<MTLBuffer> weightInBlock(MNNMetalContext *context, int group, int oc,
}
void MetalConvolutionCommon::loadWeight(const MNN::Convolution2D *conv) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)static_cast<MetalBackend *>(backend)->context();
std::shared_ptr<ConvolutionCommon::Int8Common> qnt = NULL;
if (conv->quanParameter()) {
qnt = ConvolutionCommon::load(conv->quanParameter(), true);

View File

@ -88,9 +88,7 @@ static id<MTLBuffer> weightInBlock(MNNMetalContext *context, int group, int kh,
for (int g = 0; g < group; g++) {
auto z = g / 4, r = g % 4;
auto z_dst = dst + z * kh * kw * 4 + r;
#pragma clang loop vectorize(enable)
for (int h = 0; h < kh; h++) {
#pragma clang loop vectorize(enable) unroll(enable)
for (int w = 0; w < kw; w++) {
// to [g/4][h][w][4]
// from [g][h][w]

View File

@ -20,8 +20,6 @@ MetalMatMul::MetalMatMul(Backend *backend, const MatMul *matmul) : Execution(bac
mTransposeB = matmul->transposeB();
}
ErrorCode MetalMatMul::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)static_cast<MetalBackend *>(backend)->context();
struct matP {
int size[4];
int stride[4];

View File

@ -6,7 +6,6 @@
// Copyright © 2018, Alibaba Group Holding Limited
//
#if MNN_METAL_ENABLED
#import "backend/metal/MetalRaster.hpp"
#import "backend/metal/MNNMetalContext.h"
#import "core/Macro.h"
@ -14,6 +13,7 @@
#include "core/TensorUtils.hpp"
#include "core/OpCommonUtils.hpp"
#if MNN_METAL_ENABLED
namespace MNN {
struct SamplerInfo {
@ -186,7 +186,7 @@ ErrorCode MetalRaster::onResize(const std::vector<Tensor *> &inputs, const std::
mTempInputCopy.emplace_back(std::make_tuple((__bridge id<MTLBuffer>)(void*)slice.origin->deviceId(), buffer, local.first, local.second));
}
mShapeTemp.clear();
for (auto& iter : mTempInput) {
for (int i = 0; i < mTempInput.size(); ++i) {
id<MTLBuffer> shape = [context newDeviceBuffer:4*sizeof(int) access:CPUWriteOnly];
mShapeTemp.emplace_back(std::move(shape));
}

View File

@ -71,7 +71,6 @@ ErrorCode MetalReduction::onResize(const std::vector<Tensor *> &inputs, const st
ErrorCode MetalReduction::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
auto &input = inputs[0], &output = outputs[0];
auto encoder = backend->encoder();
[encoder setComputePipelineState:mPipeline];
@ -79,7 +78,6 @@ ErrorCode MetalReduction::onExecute(const std::vector<Tensor *> &inputs, const s
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
[encoder setBuffer:mConst offset:0 atIndex:2];
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
MNN_PRINT_ENCODER(context, encoder);
return NO_ERROR;
}

Some files were not shown because too many files have changed in this diff.