[MNN:Sync] Sync Internal Gitlab: 2.5.1

xiaying 2023-05-18 19:11:50 +08:00
parent d7d1efe03b
commit c70ecef660
98 changed files with 3853 additions and 1168 deletions

View File

@ -74,6 +74,6 @@ Pod::Spec.new do |s|
end
s.compiler_flags = '-arch arm64 -march=armv8.2-a+simd+fp16'
s.pod_target_xcconfig = {'METAL_LIBRARY_FILE_BASE' => 'mnn', 'HEADER_SEARCH_PATHS' => '"$(PODS_TARGET_SRCROOT)/include" "$(PODS_TARGET_SRCROOT)/3rd_party/flatbuffers/include" "$(PODS_TARGET_SRCROOT)/source" "$(PODS_TARGET_SRCROOT)/3rd_party/half" "$(PODS_TARGET_SRCROOT)/source/backend/coreml/mlmodel/include" "$(PODS_TARGET_SRCROOT)/tools/cv/include"', 'GCC_PREPROCESSOR_DEFINITIONS' => '$(inherited) MNN_CODEGEN_REGISTER=1 MNN_SUPPORT_TFLITE_QUAN=1 MNN_METAL_ENABLED=1 MNN_SUPPORT_BF16=1 MNN_COREML_ENABLED=1 USE_LZ4_FLAG=1 MNN_INTERNAL_ENABLED=1'}
s.pod_target_xcconfig = {'METAL_LIBRARY_FILE_BASE' => 'mnn', 'HEADER_SEARCH_PATHS' => '"$(PODS_TARGET_SRCROOT)/include" "$(PODS_TARGET_SRCROOT)/3rd_party/flatbuffers/include" "$(PODS_TARGET_SRCROOT)/source" "$(PODS_TARGET_SRCROOT)/3rd_party/half" "$(PODS_TARGET_SRCROOT)/source/backend/coreml/mlmodel/include" "$(PODS_TARGET_SRCROOT)/tools/cv/include"', 'GCC_PREPROCESSOR_DEFINITIONS' => '$(inherited) MNN_CODEGEN_REGISTER=1 MNN_SUPPORT_TFLITE_QUAN=1 MNN_METAL_ENABLED=1 MNN_SUPPORT_BF16=1 MNN_COREML_ENABLED=1 USE_LZ4_FLAG=1 MNN_INTERNAL_ENABLED=1 MNN_USE_SPARSE_COMPUTE=1'}
s.user_target_xcconfig = { 'OTHER_LDFLAGS' => '-force_load $(BUILD_DIR)/$(CONFIGURATION)$(EFFECTIVE_PLATFORM_NAME)/MNN/libMNN.a', 'HEADER_SEARCH_PATHS' => '"$(PODS_TARGET_SRCROOT)/include"' }
end

View File

@ -2,13 +2,17 @@
## Linux / macOS / Ubuntu
[Build from source](../compile/tools.html#benchmark), then run the following command:
```bash
./benchmark.out models_folder loop_count warm_up_count forwardtype
./benchmark.out models_folder loop_count warm_up_count forwardtype numberThread precision weightSparsity weightSparseBlockNumber
```
The parameters are as follows (a sample invocation is shown after this list):
- models_folder: folder containing the [benchmark models](https://github.com/alibaba/MNN/tree/master/benchmark/models).
- loop_count: optional, defaults to 10.
- warm_up_count: number of warm-up runs.
- forwardtype: optional, defaults to 0 (CPU); supported values: 0 -> CPU, 1 -> Metal, 3 -> OpenCL, 6 -> OpenGL, 7 -> Vulkan.
- numberThread: optional, defaults to 4; the CPU thread count, or the run mode for GPU backends.
- precision: optional, defaults to 2 (precision_low).
- weightSparsity: optional, defaults to 0.0; sparse computation is enabled when weightSparsity > 0.5 and the backend supports it.
- weightSparseBlockNumber: optional, defaults to 1; only takes effect when weightSparsity > 0.5. It is the block size used for sparse computation: larger blocks benefit sparse acceleration more, and typical values are 1, 4, 8, or 16.
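For example, a run using the bundled benchmark models with the CPU backend, 4 threads, low precision, and sparse compute enabled might look like this (the model path and parameter values are purely illustrative):
```bash
# models_folder loop_count warm_up forwardtype(0=CPU) threads precision(2=low) weightSparsity blockNumber
./benchmark.out ../benchmark/models 10 3 0 4 2 0.6 4
```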
## Android
Run the script `bench_android.sh` in the [benchmark directory](https://github.com/alibaba/MNN/tree/master/benchmark/android). By default it builds for armv7; pass `-64` to build for armv8, and `-p` to push the [benchmark models](https://github.com/alibaba/MNN/tree/master/benchmark/models) to the device, as in the example below.
After the script finishes, the results are written to `benchmark.txt` in the same [benchmark directory](https://github.com/alibaba/MNN/tree/master/benchmark/android).
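For instance, an illustrative invocation that builds for armv8 and pushes the bundled models to a connected device (assuming the repository root as the working directory and a device reachable via adb):
```bash
cd benchmark/android
# -64: build for armv8; -p: push the benchmark models to the device before running
./bench_android.sh -64 -p
```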

View File

@ -107,6 +107,7 @@ void Executor::setGlobalExecutorConfig(MNNForwardType type, const BackendConfig&
std::shared_ptr<Runtime> bn(creator->onCreate(info));
mRuntimes[mAttr->firstType] = bn;
}
_refreshRuntime();
}
int Executor::getCurrentRuntimeStatus(RuntimeStatus statusEnum) {
@ -139,6 +140,7 @@ Executor::Executor(std::shared_ptr<Runtime> backend, MNNForwardType type, int nu
defaultConfig.flags = 4;
std::shared_ptr<Backend> defaultBackend(mRuntimes[DEFAULT_BACKUP_RUNTIME_KEY]->onCreate(&defaultConfig));
mAttr->constantBackend = defaultBackend;
_refreshRuntime();
}
Executor::~Executor(){
// Do nothing
@ -205,15 +207,38 @@ std::shared_ptr<Executor> Executor::newExecutor(MNNForwardType type,
auto executor = new Executor(runtime, type, numberThread);
return std::shared_ptr<Executor>(executor);
}
void Executor::_refreshRuntime() {
mRuntimeInfo.first.clear();
mRuntimeInfo.second = mRuntimes[DEFAULT_BACKUP_RUNTIME_KEY];
auto firstIter = mRuntimes.find(getAttr()->firstType);
if (firstIter != mRuntimes.end()) {
mRuntimeInfo.first.insert(std::make_pair(firstIter->first.first, firstIter->second));
} else {
MNN_ASSERT(false);
}
for (auto& iter : mRuntimes) {
if (iter.first.first != getAttr()->firstType.first) {
mRuntimeInfo.first.insert(std::make_pair(iter.first.first, iter.second));
}
}
}
RuntimeInfo Executor::getRuntime() {
RuntimeInfo info;
auto glo = ExecutorScope::Current();
info.second = glo->mRuntimes[DEFAULT_BACKUP_RUNTIME_KEY];
for (auto& iter : glo->mRuntimes) {
info.first.insert(std::make_pair(iter.first.first, iter.second));
return glo->mRuntimeInfo;
}
bool Executor::getComputeInfo(EXPRP expr, Interpreter::SessionInfoCode code, void* ptr) {
if (nullptr == expr) {
return false;
}
return info;
if (nullptr == expr->inside()->mCache.get()) {
return false;
}
auto session = expr->inside()->mCache->getSession();
if (nullptr == session) {
return false;
}
return session->getInfo(code, ptr);
}
static bool loadCache(std::shared_ptr<Runtime> &rt, const void* buffer, size_t size) {
@ -352,6 +377,7 @@ Executor::RuntimeManager* Executor::RuntimeManager::createRuntimeManager(const S
} else {
res->mInside->mUserConfig = false;
}
glo->_refreshRuntime();
return res;
}
ExecutorAttr* Executor::getAttr() const {
@ -603,6 +629,7 @@ void Executor::_makeCache(const std::vector<EXPRP>& expr, bool forceCPU) {
scheduleInfo.pipelineInfo[0].first.info.type = MNN_FORWARD_CPU;
} else {
scheduleInfo.pipelineInfo[0].first.info.type = current->getAttr()->firstType.first;
scheduleInfo.pipelineInfo[0].first.info.numThread = current->getAttr()->firstType.second;
}
scheduleInfo.pipelineInfo[0].first.needComputeShape = false;
scheduleInfo.pipelineInfo[0].first.needComputeGeometry = mLazyMode != LAZY_CONTENT;

View File

@ -343,6 +343,9 @@ public:
/** Resize Info, int*, 0: ready to execute, 1: need malloc, 2: need resize */
RESIZE_STATUS = 3,
/** Mode / NumberThread, int* */
THREAD_NUMBER = 4,
ALL
};

View File

@ -69,6 +69,6 @@ MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \
#define STR(x) STR_IMP(x)
#define MNN_VERSION_MAJOR 2
#define MNN_VERSION_MINOR 5
#define MNN_VERSION_PATCH 0
#define MNN_VERSION_PATCH 1
#define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." STR(MNN_VERSION_PATCH)
#endif /* MNNDefine_h */

View File

@ -133,11 +133,15 @@ public:
friend class StaticModule;
RuntimeManager();
};
static bool getComputeInfo(EXPRP expr, Interpreter::SessionInfoCode code, void* ptr);
private:
void _refreshRuntime();
Executor(std::shared_ptr<Runtime> backend, MNNForwardType type, int numberThread);
void _makeCache(const std::vector<EXPRP>& outputs, bool forceCPU);
// TODO: Remove mRuntimes, only use mRuntimeInfo
std::map<std::pair<MNNForwardType, int>, std::shared_ptr<Runtime>> mRuntimes;
RuntimeInfo mRuntimeInfo;
std::shared_ptr<DebugTools> mDebug;
std::map<std::string, std::shared_ptr<SubGraph>> mSubGraph;
LazyMode mLazyMode = LAZY_FULL;

View File

@ -3953,7 +3953,7 @@
CODE_SIGN_STYLE = Automatic;
DEAD_CODE_STRIPPING = YES;
DEFINES_MODULE = YES;
DEVELOPMENT_TEAM = Q48UX93J22;
DEVELOPMENT_TEAM = 6G7464HHUS;
DYLIB_COMPATIBILITY_VERSION = 1;
DYLIB_CURRENT_VERSION = 1;
DYLIB_INSTALL_NAME_BASE = "@rpath";
@ -3971,6 +3971,7 @@
"ENABLE_ARMV82=1",
"MNN_COREML_ENABLED=1",
"USE_LZ4_FLAG=1",
"MNN_USE_SPARSE_COMPUTE=1",
);
GCC_SYMBOLS_PRIVATE_EXTERN = YES;
GCC_WARN_SHADOW = NO;
@ -3995,7 +3996,7 @@
METAL_LIBRARY_FILE_BASE = mnn;
ONLY_ACTIVE_ARCH = YES;
OTHER_CFLAGS = "";
PRODUCT_BUNDLE_IDENTIFIER = jiuqi.bbbbb.test;
PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.playground.abcd;
PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)";
PROVISIONING_PROFILE_SPECIFIER = "";
"PROVISIONING_PROFILE_SPECIFIER[sdk=macosx*]" = "";
@ -4016,7 +4017,7 @@
CODE_SIGN_STYLE = Automatic;
DEAD_CODE_STRIPPING = YES;
DEFINES_MODULE = YES;
DEVELOPMENT_TEAM = Q48UX93J22;
DEVELOPMENT_TEAM = 6G7464HHUS;
DYLIB_COMPATIBILITY_VERSION = 1;
DYLIB_CURRENT_VERSION = 1;
DYLIB_INSTALL_NAME_BASE = "@rpath";
@ -4033,6 +4034,7 @@
"ENABLE_ARMV82=1",
"MNN_COREML_ENABLED=1",
"USE_LZ4_FLAG=1",
"MNN_USE_SPARSE_COMPUTE=1",
);
GCC_SYMBOLS_PRIVATE_EXTERN = YES;
GCC_WARN_SHADOW = YES;
@ -4056,7 +4058,7 @@
MACH_O_TYPE = staticlib;
METAL_LIBRARY_FILE_BASE = mnn;
OTHER_CFLAGS = "";
PRODUCT_BUNDLE_IDENTIFIER = jiuqi.bbbbb.test;
PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.playground.abcd;
PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)";
PROVISIONING_PROFILE_SPECIFIER = "";
"PROVISIONING_PROFILE_SPECIFIER[sdk=macosx*]" = "";
@ -4075,7 +4077,7 @@
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
ASSETCATALOG_COMPILER_LAUNCHIMAGE_NAME = LaunchImage;
CODE_SIGN_STYLE = Automatic;
DEVELOPMENT_TEAM = Q48UX93J22;
DEVELOPMENT_TEAM = 6G7464HHUS;
GCC_ENABLE_CPP_EXCEPTIONS = NO;
GCC_ENABLE_CPP_RTTI = NO;
HEADER_SEARCH_PATHS = (
@ -4088,7 +4090,7 @@
IPHONEOS_DEPLOYMENT_TARGET = 9.0;
LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
OTHER_CPLUSPLUSFLAGS = "$(OTHER_CFLAGS)";
PRODUCT_BUNDLE_IDENTIFIER = jiuqi.bbbbb.test;
PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.playground.abcd;
PRODUCT_NAME = "$(TARGET_NAME)";
TARGETED_DEVICE_FAMILY = "1,2";
};
@ -4100,7 +4102,7 @@
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
ASSETCATALOG_COMPILER_LAUNCHIMAGE_NAME = LaunchImage;
CODE_SIGN_STYLE = Automatic;
DEVELOPMENT_TEAM = Q48UX93J22;
DEVELOPMENT_TEAM = 6G7464HHUS;
GCC_ENABLE_CPP_EXCEPTIONS = NO;
GCC_ENABLE_CPP_RTTI = NO;
HEADER_SEARCH_PATHS = (
@ -4113,7 +4115,7 @@
IPHONEOS_DEPLOYMENT_TARGET = 9.0;
LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
OTHER_CPLUSPLUSFLAGS = "$(OTHER_CFLAGS)";
PRODUCT_BUNDLE_IDENTIFIER = jiuqi.bbbbb.test;
PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.playground.abcd;
PRODUCT_NAME = "$(TARGET_NAME)";
TARGETED_DEVICE_FAMILY = "1,2";
};

View File

@ -23,6 +23,10 @@ USE_TRT=False
if len(sys.argv) > 1 and sys.argv[1] == '-trt':
USE_TRT=True
USE_CUDA=False
if len(sys.argv) > 1 and sys.argv[1] == '-cuda':
USE_CUDA=True
def build_deps():
if os.path.isdir('../../schema/private'):
IS_INTERNAL_BUILD = args.internal
@ -49,6 +53,7 @@ def build_deps():
-DCMAKE_LIBRARY_PATH=/usr/local/cuda/lib64/stubs/ ' if USE_TRT else ' '
extra_opts += ' -DMNN_INTERNAL=ON ' if IS_INTERNAL_BUILD else ' '
extra_opts += ' -DMNN_BUILD_TORCH=ON ' if IS_BUILD_TORCH else ' '
extra_opts += ' -DMNN_CUDA=ON ' if USE_CUDA else ' '
os.system('cmake ' + extra_opts +
'-DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release \
-DMNN_BUILD_SHARED_LIBS=OFF -DMNN_AAPL_FMWK=OFF -DMNN_SEP_BUILD=OFF -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON \

View File

@ -9,6 +9,10 @@
set -e
echo "clean build cache:"
echo ">>> rm -rf build dist *.egg-info wheelhouse/*"
rm -rf build dist *.egg-info wheelhouse/*
PROJECT_ROOT=$(cd `dirname $0`;cd ../../;pwd)
echo $PROJECT_ROOT
export PROJECT_ROOT
@ -17,6 +21,8 @@ for PYBIN in /opt/python/*/bin; do
"${PYBIN}/pip" install -U numpy
if [ "$1" == "-trt" ]; then
USE_TRT=true "${PYBIN}/python" setup.py bdist_wheel
elif [ "$1" == "-cuda" ]; then
USE_CUDA=true "${PYBIN}/python" setup.py bdist_wheel
else
"${PYBIN}/python" setup.py bdist_wheel
fi
@ -26,6 +32,8 @@ done
for whl in dist/*.whl; do
if [ "$1" == "-trt" ]; then
LD_LIBRARY_PATH=${PROJECT_ROOT}/pymnn_build/source/backend/tensorrt:$LD_LIBRARY_PATH auditwheel repair "$whl" --plat manylinux2014_x86_64 -w wheelhouse/
elif [ "$1" == "-cuda" ]; then
LD_LIBRARY_PATH=${PROJECT_ROOT}/pymnn_build/source/backend/cuda:$LD_LIBRARY_PATH auditwheel repair "$whl" --plat manylinux2014_x86_64 -w wheelhouse/
else
auditwheel repair "$whl" --plat manylinux2014_x86_64 -w wheelhouse/
fi

View File

@ -59,9 +59,11 @@ def report(*args):
package_name = 'MNN'
USE_TRT=check_env_flag('USE_TRT')
USE_CUDA = check_env_flag("USE_CUDA")
IS_INTERNAL_BUILD = False
print ("USE_TRT ", USE_TRT)
print("USE_CUDA:", USE_CUDA)
if os.path.isdir('../../schema/private'):
IS_INTERNAL_BUILD = args.serving
@ -149,7 +151,8 @@ def configure_extension_build():
engine_library_dirs += [os.path.join(root_dir, BUILD_DIR, "tools", "train")]
engine_library_dirs += [os.path.join(root_dir, BUILD_DIR, "tools", "cv")]
engine_library_dirs += [os.path.join(root_dir, BUILD_DIR, "source", "backend", "tensorrt")]
if USE_TRT:
engine_library_dirs += [os.path.join(root_dir, BUILD_DIR, "source", "backend", "cuda")]
if USE_TRT or USE_CUDA:
# Note: TensorRT-5.1.5.0/lib should be set in $LIBRARY_PATH of the build system.
engine_library_dirs += ['/usr/local/cuda/lib64/']
@ -187,6 +190,7 @@ def configure_extension_build():
engine_include_dirs += [np.get_include()]
trt_depend = ['-lTRT_CUDA_PLUGIN', '-lnvinfer', '-lnvparsers', '-lnvinfer_plugin', '-lcudart']
cuda_depend = ['-lMNN_Cuda_Main']
engine_depend = ['-lMNN']
# enable logging & model authentication on linux.
@ -196,12 +200,16 @@ def configure_extension_build():
if USE_TRT:
engine_depend += trt_depend
if USE_CUDA:
engine_depend += cuda_depend
tools_compile_args = []
tools_libraries = []
tools_depend = ['-lMNN', '-lMNNConvertDeps', '-lprotobuf']
tools_library_dirs = [os.path.join(root_dir, BUILD_DIR)]
tools_library_dirs += [os.path.join(root_dir, BUILD_DIR, "tools", "converter")]
tools_library_dirs += [os.path.join(root_dir, BUILD_DIR, "source", "backend", "tensorrt")]
tools_library_dirs += [os.path.join(root_dir, BUILD_DIR, "source", "backend", "cuda")]
tools_library_dirs += [os.path.join(root_dir, BUILD_DIR, "3rd_party", "protobuf", "cmake")]
# add libTorch dependency
@ -227,7 +235,7 @@ def configure_extension_build():
os.path.join(torch_lib, 'libc10.dylib')]),
('.dylibs', [os.path.join(torch_path, '.dylibs', 'libiomp5.dylib')])]
'''
if USE_TRT:
if USE_TRT or USE_CUDA:
# Note: TensorRT-5.1.5.0/lib should be set in $LIBRARY_PATH of the build system.
tools_library_dirs += ['/usr/local/cuda/lib64/']
@ -269,6 +277,9 @@ def configure_extension_build():
if USE_TRT:
tools_depend += trt_depend
if USE_CUDA:
tools_depend += cuda_depend
if IS_DARWIN:
engine_link_args += ['-stdlib=libc++']
engine_link_args += ['-Wl,-all_load']

View File

@ -942,6 +942,9 @@ struct IDSTQuanT : public flatbuffers::NativeTable {
int32_t aMin;
int32_t readType;
bool has_scaleInt;
bool shapeInt32;
uint32_t weightSize;
std::vector<uint32_t> index;
IDSTQuanT()
: type(0),
useInt32(false),
@ -951,7 +954,9 @@ struct IDSTQuanT : public flatbuffers::NativeTable {
aMax(0),
aMin(0),
readType(0),
has_scaleInt(false) {
has_scaleInt(false),
shapeInt32(false),
weightSize(0) {
}
};
@ -993,6 +998,15 @@ struct IDSTQuan FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
bool has_scaleInt() const {
return GetField<uint8_t>(24, 0) != 0;
}
bool shapeInt32() const {
return GetField<uint8_t>(26, 0) != 0;
}
uint32_t weightSize() const {
return GetField<uint32_t>(28, 0);
}
const flatbuffers::Vector<uint32_t> *index() const {
return GetPointer<const flatbuffers::Vector<uint32_t> *>(30);
}
bool Verify(flatbuffers::Verifier &verifier) const {
return VerifyTableStart(verifier) &&
VerifyOffset(verifier, 4) &&
@ -1008,6 +1022,10 @@ struct IDSTQuan FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
VerifyField<int32_t>(verifier, 20) &&
VerifyField<int32_t>(verifier, 22) &&
VerifyField<uint8_t>(verifier, 24) &&
VerifyField<uint8_t>(verifier, 26) &&
VerifyField<uint32_t>(verifier, 28) &&
VerifyOffset(verifier, 30) &&
verifier.VerifyVector(index()) &&
verifier.EndTable();
}
IDSTQuanT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
@ -1051,6 +1069,15 @@ struct IDSTQuanBuilder {
void add_has_scaleInt(bool has_scaleInt) {
fbb_.AddElement<uint8_t>(24, static_cast<uint8_t>(has_scaleInt), 0);
}
void add_shapeInt32(bool shapeInt32) {
fbb_.AddElement<uint8_t>(26, static_cast<uint8_t>(shapeInt32), 0);
}
void add_weightSize(uint32_t weightSize) {
fbb_.AddElement<uint32_t>(28, weightSize, 0);
}
void add_index(flatbuffers::Offset<flatbuffers::Vector<uint32_t>> index) {
fbb_.AddOffset(30, index);
}
explicit IDSTQuanBuilder(flatbuffers::FlatBufferBuilder &_fbb)
: fbb_(_fbb) {
start_ = fbb_.StartTable();
@ -1075,8 +1102,13 @@ inline flatbuffers::Offset<IDSTQuan> CreateIDSTQuan(
int32_t aMax = 0,
int32_t aMin = 0,
int32_t readType = 0,
bool has_scaleInt = false) {
bool has_scaleInt = false,
bool shapeInt32 = false,
uint32_t weightSize = 0,
flatbuffers::Offset<flatbuffers::Vector<uint32_t>> index = 0) {
IDSTQuanBuilder builder_(_fbb);
builder_.add_index(index);
builder_.add_weightSize(weightSize);
builder_.add_readType(readType);
builder_.add_aMin(aMin);
builder_.add_aMax(aMax);
@ -1086,6 +1118,7 @@ inline flatbuffers::Offset<IDSTQuan> CreateIDSTQuan(
builder_.add_type(type);
builder_.add_alpha(alpha);
builder_.add_buffer(buffer);
builder_.add_shapeInt32(shapeInt32);
builder_.add_has_scaleInt(has_scaleInt);
builder_.add_useInt32(useInt32);
return builder_.Finish();
@ -4390,6 +4423,9 @@ inline void IDSTQuan::UnPackTo(IDSTQuanT *_o, const flatbuffers::resolver_functi
{ auto _e = aMin(); _o->aMin = _e; };
{ auto _e = readType(); _o->readType = _e; };
{ auto _e = has_scaleInt(); _o->has_scaleInt = _e; };
{ auto _e = shapeInt32(); _o->shapeInt32 = _e; };
{ auto _e = weightSize(); _o->weightSize = _e; };
{ auto _e = index(); if (_e) { _o->index.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->index[_i] = _e->Get(_i); } } };
}
inline flatbuffers::Offset<IDSTQuan> IDSTQuan::Pack(flatbuffers::FlatBufferBuilder &_fbb, const IDSTQuanT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
@ -4411,6 +4447,9 @@ inline flatbuffers::Offset<IDSTQuan> CreateIDSTQuan(flatbuffers::FlatBufferBuild
auto _aMin = _o->aMin;
auto _readType = _o->readType;
auto _has_scaleInt = _o->has_scaleInt;
auto _shapeInt32 = _o->shapeInt32;
auto _weightSize = _o->weightSize;
auto _index = _o->index.size() ? _fbb.CreateVector(_o->index) : 0;
return MNN::CreateIDSTQuan(
_fbb,
_buffer,
@ -4423,7 +4462,10 @@ inline flatbuffers::Offset<IDSTQuan> CreateIDSTQuan(flatbuffers::FlatBufferBuild
_aMax,
_aMin,
_readType,
_has_scaleInt);
_has_scaleInt,
_shapeInt32,
_weightSize,
_index);
}
inline QuantizedFloatParamT *QuantizedFloatParam::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
@ -5908,7 +5950,10 @@ inline const flatbuffers::TypeTable *IDSTQuanTypeTable() {
{ flatbuffers::ET_INT, 0, -1 },
{ flatbuffers::ET_INT, 0, -1 },
{ flatbuffers::ET_INT, 0, -1 },
{ flatbuffers::ET_BOOL, 0, -1 }
{ flatbuffers::ET_BOOL, 0, -1 },
{ flatbuffers::ET_BOOL, 0, -1 },
{ flatbuffers::ET_UINT, 0, -1 },
{ flatbuffers::ET_UINT, 1, -1 }
};
static const char * const names[] = {
"buffer",
@ -5921,10 +5966,13 @@ inline const flatbuffers::TypeTable *IDSTQuanTypeTable() {
"aMax",
"aMin",
"readType",
"has_scaleInt"
"has_scaleInt",
"shapeInt32",
"weightSize",
"index"
};
static const flatbuffers::TypeTable tt = {
flatbuffers::ST_TABLE, 11, type_codes, nullptr, nullptr, names
flatbuffers::ST_TABLE, 14, type_codes, nullptr, nullptr, names
};
return &tt;
}

View File

@ -65,6 +65,10 @@ table IDSTQuan {
aMin:int;
readType:int;
has_scaleInt:bool;
shapeInt32:bool = false;
// For sparse
weightSize:uint32;
index:[uint32];
}
enum QuantizeAlgo : byte {

View File

@ -263,100 +263,6 @@ void executeVec(void* outputRaw, const void* inputRaw0, const void* inputRaw1, i
}
}
template<typename Func, typename V, int pack>
void executeVecInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast) {
Func compute;
int sizeDivUnit = elementSize / pack;
int remainCount = elementSize - sizeDivUnit * pack;
#ifdef MNN_USE_NEON
sizeDivUnit = (elementSize * 4) / pack;
remainCount = (elementSize * 4) - sizeDivUnit * pack;
#endif
auto src0 = inputRaw0;
auto src1 = inputRaw1;
auto dst = (int8_t*)outputRaw;
#ifdef MNN_USE_SSE
V zeroPointV((uint8_t)(128));
#else
V zeroPointV((uint8_t)(0));
#endif
if (-1 == needBroadcast) {
if (sizeDivUnit > 0) {
for (int i = 0; i < sizeDivUnit; ++i) {
V a = V::load(src0);
a -= zeroPointV;
V b = V::load(src1);
b -= zeroPointV;
V::save(dst, compute(a, b) + zeroPointV);
src0 += pack;
src1 += pack;
dst += pack;
}
}
if (remainCount > 0) {
int8_t tempSrc0[pack];
int8_t tempSrc1[pack];
int8_t tempDst[pack];
::memcpy(tempSrc0, src0, remainCount * sizeof(int8_t));
::memcpy(tempSrc1, src1, remainCount * sizeof(int8_t));
V a = V::load(tempSrc0);
a -= zeroPointV;
V b = V::load(tempSrc1);
b -= zeroPointV;
V::save(tempDst, compute(a, b) + zeroPointV);
::memcpy(dst, tempDst, remainCount * sizeof(int8_t));
}
} else if (0 == needBroadcast) {
const int8_t srcValue0 = src0[0];
V a = V(srcValue0);
a -= zeroPointV;
if (sizeDivUnit > 0) {
for (int i = 0; i < sizeDivUnit; ++i) {
const auto src1Ptr = src1;
auto dstPtr = dst;
V b = V::load(src1Ptr);
b -= zeroPointV;
V::save(dstPtr, compute(a, b) + zeroPointV);
src1 += pack;
dst += pack;
}
}
if (remainCount > 0) {
int8_t tempSrc1[pack];
int8_t tempDst[pack];
::memcpy(tempSrc1, src1, remainCount * sizeof(int8_t));
V b = V::load(tempSrc1);
b -= zeroPointV;
V::save(tempDst, compute(a, b) + zeroPointV);
::memcpy(dst, tempDst, remainCount * sizeof(int8_t));
}
} else {
const int8_t srcValue1 = src1[0];
V b = V(srcValue1);
b -= zeroPointV;
if (sizeDivUnit > 0) {
for (int i = 0; i < sizeDivUnit; ++i) {
const auto src0Ptr = src0;
auto dstPtr = dst;
V a = V::load(src0Ptr);
a -= zeroPointV;
V::save(dstPtr, compute(a, b) + zeroPointV);
src0 += pack;
dst += pack;
}
}
if (remainCount > 0) {
int8_t tempSrc0[pack];
int8_t tempDst[pack];
::memcpy(tempSrc0, src0, remainCount * sizeof(int8_t));
V a = V::load(tempSrc0);
a -= zeroPointV;
V::save(tempDst, compute(a, b) +zeroPointV);
::memcpy(dst, tempDst, remainCount * sizeof(int8_t));
}
}
}
template<typename Vec>
struct VecBinaryAdd {
Vec operator()(Vec& x, Vec& y) const {
@ -426,43 +332,49 @@ void execute(void* outputRaw, const void* inputRaw0, const void* inputRaw1, int
template<typename Tin, typename Tout, typename Func>
void executeInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast) {
Func f;
int input0DataCount = elementSize;
int input1DataCount = elementSize;
int size = elementSize;
#ifdef MNN_USE_NEON
input0DataCount = elementSize * 4;
input1DataCount = elementSize * 4;
size *= 4;
#endif
const Tin* input0Data = (const Tin*)inputRaw0;
const Tin* input1Data = (const Tin*)inputRaw1;
Tout* outputData = (Tout*)outputRaw;
float inp0 = 0, inp1 = 0, output = 0;
#ifdef MNN_USE_SSE
const uint8_t zeroPoint = 128;
const int zeroPoint = 128;
const int maxValue = 255;
const int minValue = 0;
const uint8_t* inputData0 = (uint8_t*)inputRaw0;
const uint8_t* inputData1 = (uint8_t*)inputRaw1;
uint8_t* outputData = (uint8_t*)outputRaw;
#else
const uint8_t zeroPoint = 0;
const int zeroPoint = 0;
const int maxValue = 127;
const int minValue = -128;
const int8_t* inputData0 = (int8_t*)inputRaw0;
const int8_t* inputData1 = (int8_t*)inputRaw1;
int8_t* outputData = (int8_t*)outputRaw;
#endif
if (needBroadcast == 0) { // data count == 1, not only mean scalar input, maybe of shape (1, 1, 1, ...,1)
for (int i = 0; i < input1DataCount; i++) {
inp0 = static_cast<float>((int8_t)(inputRaw0[0] - zeroPoint)) * inputScale0[i];
inp1 = static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i];
for (int i = 0; i < size; ++i) {
if (needBroadcast == 0) {
inp0 = (inputData0[0]- zeroPoint) * inputScale0[i];
inp1 = (inputData1[i]- zeroPoint) * inputScale1[i];
output = f(inp0, inp1);
outputData[i] = (Tout)(output * outputScale[i] + zeroPoint);
}
} else if (needBroadcast == 1) {
for (int i = 0; i < input0DataCount; i++) {
inp0 = static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i];
inp1 = static_cast<float>((int8_t)(inputRaw1[0] - zeroPoint)) * inputScale1[i];
} else if (needBroadcast == 1) {
inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
inp1 = (inputData1[0] - zeroPoint) * inputScale1[i];
output = f(inp0, inp1);
outputData[i] = (Tout)(output * outputScale[i] + zeroPoint);
}
} else { // both inputs contain more than one element, which means no scalar input
for (int i = 0; i < input0DataCount; i++) {
inp0 = static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i];
inp1 = static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i];
} else {
inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
output = f(inp0, inp1);
outputData[i] = (Tout)(output * outputScale[i] + zeroPoint);
}
int value = (int)roundf(output * outputScale[i]) + zeroPoint;
if (value > maxValue) {
value = maxValue;
}
if (value < minValue) {
value = minValue;
}
outputData[i] = value;
}
}

View File

@ -104,9 +104,16 @@ float CPURuntime::onGetMemoryInMB() {
auto staticMemoryInMB = mStaticAllocator->totalSize() / 1024.0f / 1024.0f;
return staticMemoryInMB;
}
bool CPURuntime::onCheckInfo(Backend::Info& info) const {
#ifdef MNN_USE_THREAD_POOL
int threadNumber = mThreadNumber;
if (mTaskIndex < 0) {
threadNumber = 1;
}
info.numThread = threadNumber;
#endif
return true;
}
Backend* CPURuntime::onCreate(const BackendConfig* config) const {
auto precision = mPrecision;

View File

@ -31,6 +31,8 @@ public:
}
void onConcurrencyBegin() const;
void onConcurrencyEnd() const;
virtual bool onCheckInfo(Backend::Info& info) const override;
private:
std::shared_ptr<BufferAllocator> mStaticAllocator;

View File

@ -35,13 +35,12 @@ ErrorCode CPUBinaryInt8::onResize(const std::vector<Tensor*>& inputs, const std:
}
MNN_ASSERT(mTotalSize == ((CPUBackend*)backend())->getTensorSize(outputs[0]));
std::vector<float> scale0(mTotalSize), scale1(mTotalSize), outputScale(mTotalSize);
std::fill(scale0.begin(), scale0.end(), TensorUtils::getDescribe(inputs[0])->quantAttr->scale);
std::fill(scale1.begin(), scale1.end(), TensorUtils::getDescribe(inputs[1])->quantAttr->scale);
std::fill(outputScale.begin(), outputScale.end(), 1 / TensorUtils::getDescribe(outputs[0])->quantAttr->scale);
mInputQuant0 = scale0;
mInputQuant1 = scale1;
mOutputQuant = outputScale;
mInputQuant0.resize(mTotalSize);
mInputQuant1.resize(mTotalSize);
mOutputQuant.resize(mTotalSize);
std::fill(mInputQuant0.begin(), mInputQuant0.end(), TensorUtils::getDescribe(inputs[0])->quantAttr->scale);
std::fill(mInputQuant1.begin(), mInputQuant1.end(), TensorUtils::getDescribe(inputs[1])->quantAttr->scale);
std::fill(mOutputQuant.begin(), mOutputQuant.end(), 1 / TensorUtils::getDescribe(outputs[0])->quantAttr->scale);
if(mActivationType == 1 && outputs[0]->getType().code == halide_type_float) {
mActivationExe.reset(new CPURelu(backend(), 0.0));
@ -56,15 +55,10 @@ ErrorCode CPUBinaryInt8::onExecute(const std::vector<Tensor*>& inputs, const std
auto output = outputs[0];
auto schedule = ((CPUBackend*)backend())->multiThreadDivide(mTotalSize);
#ifdef MNN_USE_SSE
auto input0Ptr = input->host<uint8_t>();
auto input1Ptr = input1->host<uint8_t>();
auto outputPtr = outputs[0]->host<uint8_t>();
#else
auto input0Ptr = input->host<int8_t>();
auto input1Ptr = input1->host<int8_t>();
auto outputPtr = outputs[0]->host<int8_t>();
#endif
int inpBytes = 1;
int outBytes = 1;
@ -90,7 +84,7 @@ ErrorCode CPUBinaryInt8::onExecute(const std::vector<Tensor*>& inputs, const std
#ifdef MNN_USE_NEON
mProc(out, inp0, inp1, scale0, scale1, scaleDst, realSize / 4, mNeedBroadcastIndex);
#else
mProc((int8_t*)out, (int8_t*)inp0, (int8_t*)inp1, scale0, scale1, scaleDst, realSize, mNeedBroadcastIndex);
mProc(out, inp0, inp1, scale0, scale1, scaleDst, realSize, mNeedBroadcastIndex);
#endif
}
}

View File

@ -40,19 +40,21 @@ public:
};
class CPUConvolution : public Execution {
public:
struct ResourceDequantizeInfo {
int bits = 32;
std::shared_ptr<Tensor> mScaleBias;
std::vector<int8_t> mLowBitWeightMap;
};
struct Resource {
std::shared_ptr<Tensor> mWeight;
std::shared_ptr<Tensor> mBias;
ResourceDequantizeInfo mDequantize;
Backend* backend;
bool copyBiasAlign(const float* bias, int outputCount);
~ Resource() {
if (nullptr != mBias) {
backend->onReleaseBuffer(mBias.get(), Backend::STATIC);
}
if (nullptr != mWeight) {
backend->onReleaseBuffer(mWeight.get(), Backend::STATIC);
}
}
int hU;
int lU;
int lP;
int hP;
};
struct ResourceInt8 {
std::vector<int> mInt8WeightKernelSum;

View File

@ -19,7 +19,6 @@
#include <vector>
#include "../CPURuntime.hpp"
#include "common/MemoryFormater.h"
#include "common/CommonCompute.hpp"
// TODO: Find better way to optimize it
#include "../CPUBinary.hpp"
#include "../CPUUnary.hpp"
@ -174,107 +173,6 @@ void MNNUnpackC2Common(T* dst, const T* src, size_t area, size_t depth, int* are
}
}
/*
source: source matrix is h x l
transpose: if false, export compressed matrix as h x l, other export as l x h.
*/
void MNNPackForSparseMatMul_B(float* dest, unsigned int* NNZMap, int* dataOffsetMap, int sparseBlockOC, const float* source, size_t h, size_t l, const int eP, bool transpose) {
// 1. in convolution, source B layout is OC x (KH * KW * IC),
// the dest layout of weight is BCSC(block compressed sparse colum) format, which is OC(!=0) x (KH*KW*IC!=0), as a canceled result, just do BCSR, transpose should be false.
// 2. in ordinary sparse MatMul, transpose is corresponding to BCSR or BCSC
// BCSR
if (transpose) {
int rowOffset = 0;
for (int i = 0; i < l; i += 1) {
*NNZMap = 0;
for(int j = 0; j < h; j += sparseBlockOC) {
if(!MNN::CommonCompute::checkAllZeros(source + j * l + i, l, sparseBlockOC, 1)) {
*dest = *(source + j * l + l);
dest++;
*NNZMap = *NNZMap + 1;
*dataOffsetMap = rowOffset;
dataOffsetMap++;
rowOffset = 0;
}
rowOffset += eP;
}
NNZMap++;
rowOffset -= h * eP;
}
} else { // BCSC
int columOffset = 0;
int i = 0;
for (; i + sparseBlockOC <= h; i += sparseBlockOC) {
*NNZMap = 0;
for(int j = 0; j < l; j += 1) {
if (!MNN::CommonCompute::checkAllZeros(source, l, sparseBlockOC, 1)) {
for (int ioc = 0; ioc < sparseBlockOC; ioc++) {
*dest = *(source + ioc * l);
dest++;
}
*NNZMap = *NNZMap + 1;
*dataOffsetMap = columOffset;
dataOffsetMap++;
columOffset = 0;
}
columOffset += eP;
source++;
}
NNZMap++;
source += l * (sparseBlockOC - 1);
columOffset -= l * eP;
}
for (; i < h; i++) {
*NNZMap = 0;
for(int j = 0; j < l; j++) {
if (*source != 0.0f) {
*dest = *source;
dest++;
*NNZMap = *NNZMap + 1;
*dataOffsetMap = columOffset;
dataOffsetMap++;
columOffset = 0;
}
columOffset += eP;
source++;
}
NNZMap++;
columOffset -= l * eP;
}
*dataOffsetMap = columOffset; //
}
return;
}
void MNNGetOptimalBlockShape(size_t& weightNNZElement, size_t& weightBlockNumber, const float* source, int sparseBlockOC, size_t h, size_t l) {
size_t nnzBlock = 0;
size_t nnzTail = 0;
int ocEven = (h / sparseBlockOC) * sparseBlockOC;
size_t ioc = 0;
for (; ioc < ocEven; ioc += sparseBlockOC) {
for (size_t i = 0; i < l; i++) {
bool isZero = MNN::CommonCompute::checkAllZeros(source, l, sparseBlockOC, 1);
nnzBlock += !isZero;
source++;
}
source += (sparseBlockOC - 1) * l;
}
for (; ioc < h; ioc++) {
for (size_t i = 0; i < l; i++) {
bool isZero = (*source) == 0.0f;
nnzTail += !isZero;
source++;
}
}
weightNNZElement = nnzBlock * sparseBlockOC + nnzTail;
weightBlockNumber = nnzBlock + nnzTail;
return;
}
#ifndef MNN_USE_NEON
void MNNGetMatMulPackMode(int* eP, int *lP, int* hP) {
@ -2875,8 +2773,6 @@ void MNNCoreFunctionInit() {
gCoreFunction->MNNPackedMatMulRemain = MNNPackedMatMulRemain;
gCoreFunction->MNNGetSparseMatMulPackMode = MNNGetSparseMatMulPackMode;
gCoreFunction->MNNPackForSparseMatMul_B = MNNPackForSparseMatMul_B; // sparse packing B
gCoreFunction->MNNGetOptimalBlockShape = MNNGetOptimalBlockShape;
gCoreFunction->MNNAdjustOptimalSparseKernel = _MNNAdjustOptimalSparseKernel;
gCoreFunction->MNNComputeMatMulForE_1 = MNNComputeMatMulForE_1;
@ -2995,4 +2891,4 @@ void MNNPackC2Origin(double* dst, const double* src, size_t area, size_t depth,
areaOffset,
};
MNNPackC2(dst, src, area, depth, offset);
}
}

View File

@ -198,10 +198,6 @@ struct CoreFunctions {
MNNBinaryExecute(*MNNSelectBinaryFunctionForFloat)(int opType);
MNNUnaryExecute(*MNNSelectUnaryFunctionForFloat)(int opType, int precisionMode);
// sparse matrix multiply
void(*MNNPackForSparseMatMul_B)(float* dest, unsigned int* NNZMap, int* dataOffsetMap, int sparseBlockOC, const float* source, size_t h, size_t l, const int eP, bool transpose);
void(*MNNGetOptimalBlockShape)(size_t& weightNNZElement, size_t& weightBlockNumber, const float* source, int sparseBlockOC, size_t h, size_t l);
// B matrix is sparsed
typedef void(*MNNPackedSparseMatMul)(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, unsigned int* NNZMap, int* dataOffsetMap);
void(*MNNAdjustOptimalSparseKernel)(int& sparseBlockOC, MNNPackedSparseMatMul& packedSparseMatMul);

View File

@ -26,29 +26,25 @@ namespace MNN {
static Execution* _createUnit(const Tensor* input, const Tensor* output, Backend* backend,
const Convolution2D* conv2d, const float* originWeight, size_t originWeightSize,
const float* bias, size_t biasSize) {
const float* bias, size_t biasSize, std::shared_ptr<ConvolutionCommon::Int8Common> weightQuantInfo, bool supportSparse) {
auto cpuBackend = (CPUBackend*)backend;
bool lowMemory = cpuBackend->memoryMode() == BackendConfig::Memory_Low;
auto common = conv2d->common();
#ifdef MNN_USE_ONEDNN
return OneDNN::createConvolution(common, backend, originWeight, originWeightSize, bias, biasSize);
#endif
#ifdef MNN_USE_SPARSE_COMPUTE
auto core = static_cast<CPUBackend*>(backend)->functions();
int bytes = core->bytes;
#ifdef MNN_USE_SSE
const bool onlySSENotAVX = core->pack == 4; // no backend of only sse without avx2 or avx512
#else
const bool onlySSENotAVX = false;
#endif
if (!onlySSENotAVX && bytes == 4 && conv2d->sparseParameter()) {
if (SparseConvolutionTiledExecutor::shouldUseSparseConvolution(originWeightSize, conv2d->sparseParameter())) {
return new SparseConvolutionTiledExecutor(common, backend, originWeight, originWeightSize,
if (conv2d->sparseParameter() && nullptr != weightQuantInfo.get()) {
if (supportSparse) {
return new SparseConvolutionTiledExecutor(common, backend, weightQuantInfo->quan,
conv2d->sparseParameter(), bias, biasSize);
}
}
#endif
if (lowMemory || originWeightSize == 0) {
return new DenseConvolutionTiledExecutor(common, backend, originWeight, originWeightSize, bias, biasSize, weightQuantInfo);
}
bool fastWay = common->kernelY() == 1 && common->kernelX() == 1
&& output->width() == input->width() && output->height() == input->height()
&& common->strideX() == 1 && common->strideY() == 1;
@ -56,16 +52,12 @@ static Execution* _createUnit(const Tensor* input, const Tensor* output, Backend
return new Convolution1x1Strassen(common, backend, originWeight, originWeightSize, bias, biasSize);
}
if (!ConvolutionWinogradBridge::canUseWinograd(common)) {
return new DenseConvolutionTiledExecutor(common, backend, originWeight, originWeightSize, bias, biasSize);
}
auto cpuBackend = (CPUBackend*)backend;
if (cpuBackend->memoryMode() == BackendConfig::Memory_Low) {
return new DenseConvolutionTiledExecutor(common, backend, originWeight, originWeightSize, bias, biasSize);
return new DenseConvolutionTiledExecutor(common, backend, originWeight, originWeightSize, bias, biasSize, nullptr);
}
PerfConfig convPerfconfig = DenseConvolutionTiledExecutor::bestTileConvolutionConfig(common, input, output, cpuBackend->threadNumber(), backend);
auto winogradConfig = ConvolutionWinogradBridge::bestWinogradUnit(common, input, output, cpuBackend->threadNumber(), backend, convPerfconfig);
if (winogradConfig.unit <= 1) {
return new DenseConvolutionTiledExecutor(common, backend, originWeight, originWeightSize, bias, biasSize);
return new DenseConvolutionTiledExecutor(common, backend, originWeight, originWeightSize, bias, biasSize, nullptr);
}
return ConvolutionWinogradBridge::createWinogradImpl(common, input, output, backend, originWeight, originWeightSize, bias, biasSize,
winogradConfig);
@ -78,22 +70,39 @@ Execution* ConvolutionFloatFactory::create(const std::vector<Tensor*>& inputs, c
// Multi Input
return new ConvolutionTiledExecutorMultiInput(conv2d->common(), backend);
}
bool lowMemory = static_cast<CPUBackend*>(backend)->memoryMode() == BackendConfig::Memory_Low && static_cast<CPUBackend*>(backend)->functions()->bytes == 4;
const float* originWeight = nullptr;
const float* originBias = nullptr;
int originWeightSize = 0;
int originBiasSize = 0;
std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
std::unique_ptr<Tensor> externalWeightTensor, externalBiasTensor;
bool supportSparse = false;
#ifdef MNN_USE_SPARSE_COMPUTE
auto core = static_cast<CPUBackend*>(backend)->functions();
int bytes = core->bytes;
#ifdef MNN_USE_SSE
const bool onlySSENotAVX = core->pack == 4; // no backend of only sse without avx2 or avx512
#else
const bool onlySSENotAVX = false;
#endif
supportSparse = !onlySSENotAVX && bytes == 4;
#endif
if (nullptr != conv2d->quanParameter()) {
quanCommon = ConvolutionCommon::load(conv2d->quanParameter());
bool forceFloat = false;
if (!supportSparse && conv2d->quanParameter()->index() != nullptr) {
// The weight is stored as sparse float, but the backend doesn't support sparse compute, so expand it
forceFloat = true;
}
quanCommon = ConvolutionCommon::load(conv2d->quanParameter(), forceFloat, lowMemory);
if (nullptr == quanCommon) {
MNN_ERROR("Memory not Enough, can't extract IDST Convolution: %s \n", op->name()->c_str());
return nullptr;
}
if (quanCommon->weightFloat.get() == nullptr) {
if (conv2d->quanParameter()->has_scaleInt()) {
if (backend->type() != MNN_FORWARD_CPU) {
// From BF16
// From BF16 / FP16
return nullptr;
}
return ConvolutionIntFactory::create(inputs[0], outputs[0], op, backend, quanCommon.get());
@ -114,7 +123,7 @@ Execution* ConvolutionFloatFactory::create(const std::vector<Tensor*>& inputs, c
return nullptr;
}
auto common = conv2d->common();
if (nullptr == originWeight) {
if (nullptr == originWeight && nullptr != op->main_as_Convolution2D()->weight()) {
originWeight = op->main_as_Convolution2D()->weight()->data();
originWeightSize = op->main_as_Convolution2D()->weight()->size();
}
@ -130,7 +139,7 @@ Execution* ConvolutionFloatFactory::create(const std::vector<Tensor*>& inputs, c
MNN_ASSERT(group > 0);
if (1 == group) {
return _createUnit(inputs[0], outputs[0], backend, conv2d, originWeight, originWeightSize,
originBias, originBiasSize);
originBias, originBiasSize, quanCommon, supportSparse);
}
// TODO: Use Geometry to split
// Split
@ -144,7 +153,7 @@ Execution* ConvolutionFloatFactory::create(const std::vector<Tensor*>& inputs, c
for (int i = 0; i < group; ++i) {
auto newConvolution =
_createUnit(emptyInput.get(), emptyOutput.get(), backend, conv2d, originWeight + groupWeightSize * i,
groupWeightSize, conv2d->bias()->data() + groupOutputCount * i, groupOutputCount);
groupWeightSize, conv2d->bias()->data() + groupOutputCount * i, groupOutputCount, quanCommon, supportSparse);
subConvolution.push_back(std::shared_ptr<Execution>(newConvolution));
}
return new ConvolutionGroup(backend, subConvolution);

View File

@ -5,7 +5,7 @@
// Created by MNN on 2018/07/16.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include <math.h>
#include "DenseConvolutionTiledExecutor.hpp"
#include <MNN/AutoTime.hpp>
#include "backend/cpu/CPUBackend.hpp"
@ -19,6 +19,7 @@
#include "common/MemoryFormater.h"
#define PARAMETERSIZE 6
#define MNN_ALLOC_MEMORY_INDIRECTLY
using Vec4 = MNN::Math::Vec<float, 4>;
namespace MNN {
@ -27,10 +28,86 @@ void DenseConvolutionTiledExecutor::initWeight(float *dest, const float *source,
function->MNNPackForMatMul_B(dest, cache, outputCount, kernelSize * depth, true);
}
static bool _initQuantizeResource(std::shared_ptr<ConvolutionCommon::Int8Common> int8Info, std::shared_ptr<CPUConvolution::Resource> resource, int hU, int hP, int lU, int lP, int outputCount, int srcChannel, int kernelSize) {
int weightLength = hU * lU * hP * lP;
resource->mWeight.reset(Tensor::createDevice<uint8_t>(
{weightLength}));
auto res = resource->backend->onAcquireBuffer(resource->mWeight.get(), Backend::STATIC);
if (!res) {
return false;
}
resource->mDequantize.bits = 8;
resource->lU = lU;
resource->hU = hU;
resource->lP = lP;
resource->hP = hP;
// Reorder weight
MNN_ASSERT(lP == 1);
auto dstWInt8 = resource->mWeight->host<int8_t>();
auto srcWInt8 = int8Info->weight.get();
for (int y=0; y<outputCount; ++y) {
int yo = y / hP;
int yi = y % hP;
auto srcY = srcWInt8 + y * srcChannel * kernelSize;
auto dstY = dstWInt8 + yo * lP * hP * lU + yi;
for (int iz=0; iz<srcChannel; ++iz) {
for (int k=0; k<kernelSize; ++k) {
int sx = iz * kernelSize + k;
int dx = iz + k * srcChannel;
dstY[dx * hP] = srcY[sx];
}
}
}
// Save scale bias
resource->mDequantize.mScaleBias.reset(MNN::Tensor::createDevice<float>({hU * hP * 2}));
res = resource->backend->onAcquireBuffer(resource->mDequantize.mScaleBias.get(), Backend::STATIC);
if (!res) {
return false;
}
auto alphaPtr = resource->mDequantize.mScaleBias->host<float>();
auto biasPtr = resource->mDequantize.mScaleBias->host<float>() + hU * hP;
::memset(alphaPtr, 0, 2 * hU * hP * sizeof(float));
int h = int8Info->alpha.size();
if (int8Info->asymmetric) {
h = h / 2;
for (int i=0; i<h; ++i) {
alphaPtr[i] = int8Info->alpha.get()[2 * i + 1];
biasPtr[i] = int8Info->alpha.get()[2 * i];
}
} else {
for (int i=0; i<h; ++i) {
alphaPtr[i] = int8Info->alpha.get()[i];
}
}
if (int8Info->canUseInt4) {
MNN_ASSERT(weightLength % 2 == 0);
weightLength = UP_DIV(weightLength, 2);
resource->mDequantize.bits = 4;
resource->mDequantize.mLowBitWeightMap = int8Info->weightMap;
std::shared_ptr<MNN::Tensor> weightLow(Tensor::createDevice<uint8_t>(
{weightLength}));
auto res = resource->backend->onAcquireBuffer(weightLow.get(), Backend::STATIC);
if (!res) {
return false;
}
auto srcPtr = resource->mWeight->host<int8_t>();
auto dstPtr = weightLow->host<uint8_t>();
for (int i=0; i<weightLength; ++i) {
int s0 = srcPtr[2 * i + 0];
int s1 = srcPtr[2 * i + 1];
s0 = int8Info->weightReverseMap[(int)s0 + 128];
s1 = int8Info->weightReverseMap[(int)s1 + 128];
int d = s0 * 16 + s1;
dstPtr[i] = d;
}
resource->mWeight = weightLow;
}
return true;
}
DenseConvolutionTiledExecutor::DenseConvolutionTiledExecutor(const Convolution2DCommon* common, Backend* b,
const float* originWeight, size_t originWeightSize,
const float* bias, size_t biasSize)
const float* bias, size_t biasSize, std::shared_ptr<ConvolutionCommon::Int8Common> int8Info)
: ConvolutionTiledExecutor(b, bias, biasSize) {
auto outputCount = (int)biasSize;
@ -38,22 +115,40 @@ DenseConvolutionTiledExecutor::DenseConvolutionTiledExecutor(const Convolution2D
auto core = static_cast<CPUBackend*>(b)->functions();
int bytes = core->bytes;
core->MNNGetMatMulPackMode(&eP, &lP, &hP);
bool useInt8Weight = 0 == originWeightSize;
if (useInt8Weight) {
MNN_ASSERT(nullptr != int8Info.get());
originWeightSize = int8Info->weight.size();
}
// Don't use common->inputCount for old model common->inputCount is zero
auto srcCount = (int)originWeightSize / outputCount / common->kernelX() / common->kernelY();
auto lSize = srcCount * common->kernelX() * common->kernelY();
mResource->mWeight.reset(Tensor::createDevice<uint8_t>(
{UP_DIV(outputCount, hP) * UP_DIV(lSize, lP) * hP * lP * bytes}));
std::shared_ptr<Tensor> cache(Tensor::createDevice<uint8_t>({outputCount * srcCount * common->kernelX() * common->kernelY() * (int)sizeof(float)})); // cache must be float
mValid = mValid && backend()->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC);
mValid = mValid && backend()->onAcquireBuffer(cache.get(), Backend::STATIC);
if (!mValid) {
return;
auto hU = UP_DIV(outputCount, hP);
auto lU = UP_DIV(lSize, lP);
if (useInt8Weight) {
// Quantize weight to int8
auto allocSuccess = _initQuantizeResource(int8Info, mResource, hU, hP, lU, lP, outputCount, srcCount, common->kernelX() * common->kernelY());
if (!allocSuccess) {
mValid = false;
return;
}
} else {
mResource->mWeight.reset(Tensor::createDevice<uint8_t>(
{hU * lU * hP * lP * bytes}));
mValid = mValid && backend()->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC);
if (!mValid) {
return;
}
std::shared_ptr<Tensor> cache(Tensor::createDevice<uint8_t>({outputCount * srcCount * common->kernelX() * common->kernelY() * (int)sizeof(float)})); // cache must be float
mValid = mValid && backend()->onAcquireBuffer(cache.get(), Backend::STATIC);
if (!mValid) {
return;
}
initWeight(mResource->mWeight->host<float>(), originWeight, cache->host<float>(), srcCount, outputCount, common->kernelX() * common->kernelY(), core);
// MNN_PRINT("srcCount:%d, outputCount:%d, dense weight matrix tile:", srcCount, outputCount);
// formatMatrix(mResource->mWeight->host<float>(), {UP_DIV(outputCount, hP), lSize, hP});
backend()->onReleaseBuffer(cache.get(), Backend::STATIC);
}
initWeight(mResource->mWeight->host<float>(), originWeight, cache->host<float>(), srcCount, outputCount, common->kernelX() * common->kernelY(), core);
// MNN_PRINT("srcCount:%d, outputCount:%d, dense weight matrix tile:", srcCount, outputCount);
// formatMatrix(mResource->mWeight->host<float>(), {UP_DIV(outputCount, hP), lSize, hP});
backend()->onReleaseBuffer(cache.get(), Backend::STATIC);
mProxy.reset(new DenseConvolutionTiledImpl(common, b));
}
@ -77,6 +172,121 @@ bool DenseConvolutionTiledExecutor::onClone(Backend* bn, const Op* op, Execution
return true;
}
ErrorCode DenseConvolutionTiledExecutor::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
bool needDequantize = mResource->mDequantize.bits <= 8;
if (needDequantize) {
#ifndef MNN_ALLOC_MEMORY_INDIRECTLY
auto res = backend()->onAcquireBuffer(mWeightCache.weight.get(), Backend::STATIC);
if (!res) {
return OUT_OF_MEMORY;
}
if (nullptr != mWeightCache.weightInt8) {
res = backend()->onAcquireBuffer(mWeightCache.weightInt8.get(), Backend::STATIC);
if (!res) {
return OUT_OF_MEMORY;
}
}
#endif
auto hU = mResource->hU;
auto hP = mResource->hP;
auto mid = mResource->lU * mResource->lP;
auto srcInt8 = mResource->mWeight->host<int8_t>();
if (mResource->mDequantize.bits == 4) {
int weightLength = hU * hP * mid;
weightLength = UP_DIV(weightLength, 2);
auto srcPtr = mResource->mWeight->host<uint8_t>();
auto dstPtr = mWeightCache.weightInt8->host<int8_t>();
for (int i=0; i<weightLength; ++i) {
int d = srcPtr[i];
int s0 = d / 16;
int s1 = d % 16;
s0 = mResource->mDequantize.mLowBitWeightMap[s0];
s1 = mResource->mDequantize.mLowBitWeightMap[s1];
dstPtr[2 * i + 0] = s0;
dstPtr[2 * i + 1] = s1;
}
srcInt8 = mWeightCache.weightInt8->host<int8_t>();
}
auto alpha = mResource->mDequantize.mScaleBias->host<float>();
auto bias = mResource->mDequantize.mScaleBias->host<float>() + hU * hP;
auto dstFloat = mWeightCache.weight->host<float>();
for (int yo=0; yo<hU; ++yo) {
auto dstY = dstFloat + yo * mid * hP;
auto srcY = srcInt8 + yo * mid * hP;
auto k = alpha + yo * hP;
auto b = bias + yo * hP;
for (int x=0; x<mid; ++x) {
auto dstX = dstY + x * hP;
auto srcX = srcY + x * hP;
for (int yi=0; yi<hP; ++yi) {
dstX[yi] = srcX[yi] * k[yi] + b[yi];
}
}
}
#ifndef MNN_ALLOC_MEMORY_INDIRECTLY
if (mWeightCache.weightInt8 != nullptr) {
backend()->onReleaseBuffer(mWeightCache.weightInt8.get(), Backend::STATIC);
}
#endif
}
auto code = mProxy->onExecute(mInputs, outputs);
#ifndef MNN_ALLOC_MEMORY_INDIRECTLY
if (needDequantize) {
backend()->onReleaseBuffer(mWeightCache.weight.get(), Backend::STATIC);
}
((Runtime*)(static_cast<CPUBackend*>(backend())->getRuntime()))->onGabageCollect(0);
#endif
return code;
}
ErrorCode DenseConvolutionTiledExecutor::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
mInputs = {inputs[0], mResource->mWeight.get(), mResource->mBias.get()};
bool needDequantize = mResource->mDequantize.bits <= 8;
if (needDequantize) {
if (mWeightCache.weight == nullptr) {
int weightLength = mResource->hU * mResource->lU * mResource->hP * mResource->lP;
mWeightCache.weight.reset(new Tensor);
mWeightCache.weight->buffer().type = halide_type_of<float>();
TensorUtils::getDescribe(mWeightCache.weight.get())->dimensionFormat = MNN_DATA_FORMAT_NCHW;
mWeightCache.weight->buffer().dimensions = 1;
mWeightCache.weight->setLength(0, weightLength);
if (mWeightCache.weightInt8 == nullptr && mResource->mDequantize.bits == 4) {
mWeightCache.weightInt8.reset(new Tensor);
mWeightCache.weightInt8->buffer().type = halide_type_of<int8_t>();
mWeightCache.weightInt8->buffer().dimensions = 1;
mWeightCache.weightInt8->setLength(0, weightLength);
TensorUtils::getDescribe(mWeightCache.weightInt8.get())->dimensionFormat = MNN_DATA_FORMAT_NCHW;
}
}
mInputs[1] = mWeightCache.weight.get();
#ifdef MNN_ALLOC_MEMORY_INDIRECTLY
bool res = false;
if (nullptr != mWeightCache.weightInt8) {
res = backend()->onAcquireBuffer(mWeightCache.weightInt8.get(), Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;
}
}
res = backend()->onAcquireBuffer(mWeightCache.weight.get(), Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;
}
if (nullptr != mWeightCache.weightInt8) {
backend()->onReleaseBuffer(mWeightCache.weightInt8.get(), Backend::DYNAMIC);
}
#endif
}
auto code = mProxy->onResize(mInputs, outputs);
if (NO_ERROR != code) {
return code;
}
if (needDequantize) {
#ifdef MNN_ALLOC_MEMORY_INDIRECTLY
backend()->onReleaseBuffer(mWeightCache.weight.get(), Backend::DYNAMIC);
#endif
}
return NO_ERROR;
}
ErrorCode ConvolutionTiledExecutorMultiInput::onExecute(const std::vector<Tensor*>& inputs,
const std::vector<Tensor*>& outputs) {
int depth = inputs[1]->channel();

View File

@ -34,25 +34,25 @@ protected:
class DenseConvolutionTiledExecutor : public ConvolutionTiledExecutor {
public:
DenseConvolutionTiledExecutor(const Convolution2DCommon *common, Backend *b, const float *originWeight,
size_t originWeightSize, const float *bias, size_t biasSize);
size_t originWeightSize, const float *bias, size_t biasSize, std::shared_ptr<ConvolutionCommon::Int8Common>);
DenseConvolutionTiledExecutor(std::shared_ptr<CPUConvolution::Resource> res, const Convolution2DCommon *common, Backend* b);
virtual ~DenseConvolutionTiledExecutor();
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override {
return mProxy->onExecute(inputs, outputs);
}
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override {
mInputs = {inputs[0], mResource->mWeight.get(), mResource->mBias.get()};
return mProxy->onResize(mInputs, outputs);
}
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override;
void initWeight(float *dest, const float *source, float* cache, int depth, int outputCount, int kernelSize, const CoreFunctions* function);
static PerfConfig bestTileConvolutionConfig(const Convolution2DCommon *common, const Tensor *inputTensor,
const Tensor *outputTensor, int threadNumber, Backend* b) {
return DenseConvolutionTiledImpl::bestTileConvolutionConfig(common, inputTensor, outputTensor, threadNumber, b);
}
struct DequantizeCache {
std::shared_ptr<MNN::Tensor> weight;
std::shared_ptr<MNN::Tensor> weightInt8;
};
protected:
DequantizeCache mWeightCache;
std::shared_ptr<DenseConvolutionTiledImpl> mProxy;
};

View File

@ -1577,130 +1577,255 @@ void MNNMaxPoolInt8(int8_t* dst, int8_t* src, size_t outputWidth, size_t inputWi
void MNNBinaryAddInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast) {
float sum = 0;
#ifdef MNN_USE_SSE
const uint8_t zeroPoint = 128;
const int zeroPoint = 128;
const int maxValue = 255;
const int minValue = 0;
const uint8_t* inputData0 = (uint8_t*)inputRaw0;
const uint8_t* inputData1 = (uint8_t*)inputRaw1;
uint8_t* outputData = (uint8_t*)outputRaw;
#else
const uint8_t zeroPoint = 0;
const int zeroPoint = 0;
const int maxValue = 127;
const int minValue = -128;
const int8_t* inputData0 = inputRaw0;
const int8_t* inputData1 = inputRaw1;
int8_t* outputData = outputRaw;
#endif
for (int i = 0; i < elementSize; ++i) {
if (needBroadcast == 0) {
sum = static_cast<float>((int8_t)(inputRaw0[0] - zeroPoint)) * inputScale0[i] + static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i];
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
sum = inp0 + inp1;
} else if (needBroadcast == 1) {
sum = static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i] + static_cast<float>((int8_t)(inputRaw1[0] - zeroPoint)) * inputScale1[i];
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i];
sum = inp0 + inp1;
} else {
sum = static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i] + static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i];
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
sum = inp0 + inp1;
}
float value = sum * outputScale[i];
outputRaw[i] = static_cast<uint8_t>(std::max(std::min(value, 127.0f), -127.0f)) + zeroPoint;
int value = (int)roundf(sum * outputScale[i]) + zeroPoint;
if (value > maxValue) {
value = maxValue;
}
if (value < minValue) {
value = minValue;
}
outputData[i] = value;
}
}
void MNNBinarySubInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast) {
float res = 0;
#ifdef MNN_USE_SSE
const uint8_t zeroPoint = 128;
const int zeroPoint = 128;
const int maxValue = 255;
const int minValue = 0;
const uint8_t* inputData0 = (uint8_t*)inputRaw0;
const uint8_t* inputData1 = (uint8_t*)inputRaw1;
uint8_t* outputData = (uint8_t*)outputRaw;
#else
const uint8_t zeroPoint = 0;
const int zeroPoint = 0;
const int maxValue = 127;
const int minValue = -128;
const int8_t* inputData0 = inputRaw0;
const int8_t* inputData1 = inputRaw1;
int8_t* outputData = outputRaw;
#endif
for (int i = 0; i < elementSize; ++i) {
if (needBroadcast == 0) {
res = static_cast<float>((int8_t)(inputRaw0[0] - zeroPoint)) * inputScale0[i] - static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i];
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
res = inp0 - inp1;
} else if (needBroadcast == 1) {
res = static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i] - static_cast<float>((int8_t)(inputRaw1[0] - zeroPoint)) * inputScale1[i];
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i];
res = inp0 - inp1;
} else {
res = static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i] - static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i];
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
res = inp0 - inp1;
}
float value = res * outputScale[i];
outputRaw[i] = static_cast<uint8_t>(std::max(std::min(value, 127.0f), -127.0f)) + zeroPoint;
int value = (int)roundf(res * outputScale[i]) + zeroPoint;
if (value > maxValue) {
value = maxValue;
}
if (value < minValue) {
value = minValue;
}
outputData[i] = value;
}
}
void MNNBinaryMulInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast) {
float res = 0;
#ifdef MNN_USE_SSE
const uint8_t zeroPoint = 128;
const int zeroPoint = 128;
const int maxValue = 255;
const int minValue = 0;
const uint8_t* inputData0 = (uint8_t*)inputRaw0;
const uint8_t* inputData1 = (uint8_t*)inputRaw1;
uint8_t* outputData = (uint8_t*)outputRaw;
#else
const uint8_t zeroPoint = 0;
const int zeroPoint = 0;
const int maxValue = 127;
const int minValue = -128;
const int8_t* inputData0 = inputRaw0;
const int8_t* inputData1 = inputRaw1;
int8_t* outputData = outputRaw;
#endif
for (int i = 0; i < elementSize; ++i) {
if (needBroadcast == 0) {
res = static_cast<float>((int8_t)(inputRaw0[0] - zeroPoint)) * inputScale0[i] * static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i];
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
res = inp0 * inp1;
} else if (needBroadcast == 1) {
res = static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i] * static_cast<float>((int8_t)(inputRaw1[0] - zeroPoint)) * inputScale1[i];
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i];
res = inp0 * inp1;
} else {
res = static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i] * static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i];
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
res = inp0 * inp1;
}
float value = res * outputScale[i];
outputRaw[i] = static_cast<uint8_t>(std::max(std::min(value, 127.0f), -127.0f)) + zeroPoint;
int value = (int)roundf(res * outputScale[i]) + zeroPoint;
if (value > maxValue) {
value = maxValue;
}
if (value < minValue) {
value = minValue;
}
outputData[i] = value;
}
}
void MNNBinaryMinInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast) {
float res = 0;
#ifdef MNN_USE_SSE
const uint8_t zeroPoint = 128;
const int zeroPoint = 128;
const int maxValue = 255;
const int minValue = 0;
const uint8_t* inputData0 = (uint8_t*)inputRaw0;
const uint8_t* inputData1 = (uint8_t*)inputRaw1;
uint8_t* outputData = (uint8_t*)outputRaw;
#else
const uint8_t zeroPoint = 0;
const int zeroPoint = 0;
const int maxValue = 127;
const int minValue = -128;
const int8_t* inputData0 = inputRaw0;
const int8_t* inputData1 = inputRaw1;
int8_t* outputData = outputRaw;
#endif
for (int i = 0; i < elementSize; ++i) {
if (needBroadcast == 0) {
res = std::min(static_cast<float>((int8_t)(inputRaw0[0] - zeroPoint)) * inputScale0[i], static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i]);
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
res = std::min(inp0, inp1);
} else if (needBroadcast == 1) {
res = std::min(static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i], static_cast<float>((int8_t)(inputRaw1[0] - zeroPoint)) * inputScale1[i]);
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i];
res = std::min(inp0, inp1);
} else {
res = std::min(static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i], static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i]);
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
res = std::min(inp0, inp1);
}
float value = res * outputScale[i];
outputRaw[i] = static_cast<uint8_t>(std::max(std::min(value, 127.0f), -127.0f)) + zeroPoint;
int value = (int)roundf(res * outputScale[i]) + zeroPoint;
if (value > maxValue) {
value = maxValue;
}
if (value < minValue) {
value = minValue;
}
outputData[i] = value;
}
}
void MNNBinaryMaxInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast) {
float res = 0;
#ifdef MNN_USE_SSE
const uint8_t zeroPoint = 128;
const int zeroPoint = 128;
const int maxValue = 255;
const int minValue = 0;
const uint8_t* inputData0 = (uint8_t*)inputRaw0;
const uint8_t* inputData1 = (uint8_t*)inputRaw1;
uint8_t* outputData = (uint8_t*)outputRaw;
#else
const uint8_t zeroPoint = 0;
const int zeroPoint = 0;
const int maxValue = 127;
const int minValue = -128;
const int8_t* inputData0 = inputRaw0;
const int8_t* inputData1 = inputRaw1;
int8_t* outputData = outputRaw;
#endif
for (int i = 0; i < elementSize; ++i) {
if (needBroadcast == 0) {
res = std::max(static_cast<float>((int8_t)(inputRaw0[0] - zeroPoint)) * inputScale0[i], static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i]);
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
res = std::max(inp0, inp1);
} else if (needBroadcast == 1) {
res = std::max(static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i], static_cast<float>((int8_t)(inputRaw1[0] - zeroPoint)) * inputScale1[i]);
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i];
res = std::max(inp0, inp1);
} else {
res = std::max(static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i], static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i]);
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
res = std::max(inp0, inp1);
}
float value = res * outputScale[i];
outputRaw[i] = static_cast<uint8_t>(std::max(std::min(value, 127.0f), -127.0f)) + zeroPoint;
int value = (int)roundf(res * outputScale[i]) + zeroPoint;
if (value > maxValue) {
value = maxValue;
}
if (value < minValue) {
value = minValue;
}
outputData[i] = value;
}
}
void MNNBinarySqdInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast) {
float res = 0, inp0 = 0, inp1 = 0;
float res = 0;
#ifdef MNN_USE_SSE
const uint8_t zeroPoint = 128;
const int zeroPoint = 128;
const int maxValue = 255;
const int minValue = 0;
const uint8_t* inputData0 = (uint8_t*)inputRaw0;
const uint8_t* inputData1 = (uint8_t*)inputRaw1;
uint8_t* outputData = (uint8_t*)outputRaw;
#else
const uint8_t zeroPoint = 0;
const int zeroPoint = 0;
const int maxValue = 127;
const int minValue = -128;
const int8_t* inputData0 = inputRaw0;
const int8_t* inputData1 = inputRaw1;
int8_t* outputData = outputRaw;
#endif
for (int i = 0; i < elementSize; ++i) {
if (needBroadcast == 0) {
inp0 = static_cast<float>((int8_t)(inputRaw0[0] - zeroPoint)) * inputScale0[i];
inp1 = static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i];
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
res = (inp0 - inp1) * (inp0 - inp1);
} else if (needBroadcast == 1) {
inp0 = static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i];
inp1 = static_cast<float>((int8_t)(inputRaw1[0] - zeroPoint)) * inputScale1[i];
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i];
res = (inp0 - inp1) * (inp0 - inp1);
} else {
inp0 = static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i];
inp1 = static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i];
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
res = (inp0 - inp1) * (inp0 - inp1);
}
float value = res * outputScale[i];
outputRaw[i] = static_cast<uint8_t>(std::max(std::min(value, 127.0f), -127.0f)) + zeroPoint;
int value = (int)roundf(res * outputScale[i]) + zeroPoint;
if (value > maxValue) {
value = maxValue;
}
if (value < minValue) {
value = minValue;
}
outputData[i] = value;
}
}
#endif // #ifndef MNN_USE_NEON
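These reference kernels all follow the same dequantize → float op → requantize pattern: each input is shifted by the storage zero point (128 when MNN_USE_SSE keeps the data as uint8, 0 for plain int8), scaled to float with a per-element scale, combined, then rescaled, rounded and clamped back to the storage range. The snippet below is a minimal standalone sketch of that pattern for a single subtract in the plain int8 case; the function name and scalar signature are invented for the example and are not MNN code.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Hypothetical single-element version of the pattern used by the kernels above
// (zeroPoint = 0, clamp to [-128, 127]).
int8_t quantizedSub(int8_t a, int8_t b, float scaleA, float scaleB, float outScale) {
    const int zeroPoint = 0;
    float fa = (a - zeroPoint) * scaleA;                   // dequantize input 0
    float fb = (b - zeroPoint) * scaleB;                   // dequantize input 1
    float res = fa - fb;                                   // do the op in float
    int v = (int)std::roundf(res * outScale) + zeroPoint;  // requantize
    return (int8_t)std::min(127, std::max(-128, v));       // clamp to storage range
}

int main() {
    // 100 * 0.1 - 50 * 0.1 = 5.0; with an output scale of 10 this requantizes to 50.
    std::printf("%d\n", quantizedSub(100, 50, 0.1f, 0.1f, 10.f));
    return 0;
}
```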
#ifndef MNN_USE_SSE

View File

@ -17,16 +17,155 @@
#include "math/Vec.hpp"
#include "core/BufferAllocator.hpp"
#include "common/MemoryFormater.h"
#include "common/CommonCompute.hpp"
using Vec4 = MNN::Math::Vec<float, 4>;
namespace MNN {
/*
source: source matrix is h x l
transpose: if false, export the compressed matrix as h x l; otherwise export it as l x h.
*/
static int _fillIndex(int32_t* targetIndexes, uint32_t begin, uint32_t end, const uint32_t* indexes, uint32_t indexSize, int indexStart) {
int mid = -1;
int current = -1;
for (int i=indexStart; i<indexSize; ++i) {
if (indexes[i] >= begin) {
mid = i;
current = indexes[i];
break;
}
}
uint32_t number = end - begin;
for (uint32_t i=0; i<number; ++i) {
targetIndexes[i] = -1;
}
auto offset = current - begin;
do {
if (current < begin || current >= end) {
break;
}
targetIndexes[current - begin] = mid;
mid++;
if (mid >= indexSize) {
break;
}
current = indexes[mid];
} while (true);
return mid;
}
static void MNNGetOptimalBlockShape(size_t& weightNNZElement, size_t& weightBlockNumber, const uint32_t* indexes, uint32_t indexSize, int sparseBlockOC, size_t h, size_t l) {
size_t nnzBlock = 0;
size_t nnzTail = 0;
int ocEven = (h / sparseBlockOC) * sparseBlockOC;
std::vector<int32_t> tempIndexes(sparseBlockOC * l);
size_t ioc = 0;
int offset = 0;
for (; ioc < ocEven; ioc += sparseBlockOC) {
offset = _fillIndex(tempIndexes.data(), ioc * l, (ioc+sparseBlockOC) * l, indexes, indexSize, offset);
for (size_t i = 0; i < l; i++) {
bool allZero = true;
for (int u=0; u<sparseBlockOC; ++u) {
if (tempIndexes[u*l + i] >= 0) {
allZero = false;
break;
}
}
if (!allZero) {
nnzBlock++;
}
}
}
for (; ioc < h; ioc++) {
offset = _fillIndex(tempIndexes.data(), ioc * l, (ioc+1) * l, indexes, indexSize, offset);
for (size_t i = 0; i < l; i++) {
if (tempIndexes[i] >= 0) {
nnzTail++;
}
}
}
weightNNZElement = nnzBlock * sparseBlockOC + nnzTail;
weightBlockNumber = nnzBlock + nnzTail;
return;
}
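In this representation the sparse weight is described by a sorted list of flat (row-major) indexes of its nonzero elements plus the matching values; `_fillIndex` scatters the indexes falling into one (block-)row range into a dense per-position map, and `MNNGetOptimalBlockShape` then counts how many column positions hold at least one nonzero per block of `sparseBlockOC` rows. A minimal standalone sketch of that counting on a hand-sized 4x3 example (not MNN code; h is divisible by sparseBlockOC here, so the tail-row path is not exercised):

```cpp
#include <cstdio>
#include <vector>

int main() {
    // 4 x 3 dense weight, row-major; '1' marks a nonzero element.
    // row0: . x .   row1: . x x   row2: . . .   row3: x . .
    std::vector<int> dense = {0,1,0,  0,1,1,  0,0,0,  1,0,0};
    const int h = 4, l = 3, sparseBlockOC = 2;

    size_t nnzElement = 0, blockNumber = 0;
    for (int oc = 0; oc < h; oc += sparseBlockOC) {
        for (int col = 0; col < l; ++col) {
            bool anyNonZero = false;
            for (int u = 0; u < sparseBlockOC; ++u) {
                anyNonZero = anyNonZero || dense[(oc + u) * l + col] != 0;
            }
            if (anyNonZero) {
                nnzElement += sparseBlockOC;  // whole block column is stored, zeros included
                blockNumber += 1;
            }
        }
    }
    // Block rows {0,1}: columns 1 and 2 are kept; block rows {2,3}: column 0 is kept.
    std::printf("weightNNZElement=%zu weightBlockNumber=%zu\n", nnzElement, blockNumber); // 6 and 3
    return 0;
}
```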
static void MNNPackForSparseMatMul_B(float* dest, unsigned int* NNZMap, int* dataOffsetMap, int sparseBlockOC, const float* source, const uint32_t* indexes, uint32_t indexSize, size_t h, size_t ic, size_t kernelSize, const int eP) {
// 1. In convolution, the source B layout is OC x (KH * KW * IC);
// the dest weight layout is BCSC (block compressed sparse column), i.e. OC(!=0) x (KH*KW*IC != 0); as the transposes cancel out, just do BCSR, and transpose should be false.
// 2. In ordinary sparse MatMul, transpose corresponds to choosing BCSR or BCSC.
auto l = ic * kernelSize;
int columOffset = 0;
int i = 0;
std::vector<int32_t> tempIndexes(sparseBlockOC * l);
int offset = 0;
for (; i + sparseBlockOC <= h; i += sparseBlockOC) {
*NNZMap = 0;
offset = _fillIndex(tempIndexes.data(), i * l, (i+sparseBlockOC) * l, indexes, indexSize, offset);
// Original weight layout is oc, ic, kernelSize; the new weight order is oc, kernelSize, ic
for (int x=0; x<kernelSize; ++x) {
for (int y=0; y<ic; ++y) {
auto j = y * kernelSize + x;
bool allZero = true;
for (int u=0; u<sparseBlockOC; ++u) {
if (tempIndexes[u*l + j] >= 0) {
allZero = false;
break;
}
}
if (!allZero) {
for (int ioc = 0; ioc < sparseBlockOC; ioc++) {
auto index = tempIndexes[ioc*l + j];
if (index >= 0) {
*dest = source[index];
} else {
*dest = 0.0f;
}
dest++;
}
*NNZMap = *NNZMap + 1;
*dataOffsetMap = columOffset;
dataOffsetMap++;
columOffset = 0;
}
columOffset += eP;
}
}
NNZMap++;
columOffset -= l * eP;
}
for (; i < h; i++) {
*NNZMap = 0;
offset = _fillIndex(tempIndexes.data(), i * l, (i+1) * l, indexes, indexSize, offset);
for (int x=0; x<kernelSize; ++x) {
for (int y=0; y<ic; ++y) {
auto j = y * kernelSize + x;
auto index = tempIndexes[j];
if (index >= 0) {
*dest = source[index];
dest++;
*NNZMap = *NNZMap + 1;
*dataOffsetMap = columOffset;
dataOffsetMap++;
columOffset = 0;
}
columOffset += eP;
}
}
NNZMap++;
columOffset -= l * eP;
}
*dataOffsetMap = columOffset; // trailing offset after the last stored element
return;
}
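The packed result consists of three arrays: `dest` holds the kept weight values in visit order, `NNZMap` holds the number of stored (block-)columns per block row, and `dataOffsetMap` holds eP-scaled offsets between consecutive stored columns (the `columOffset -= l * eP` at the end of each row rewinds toward the start of the packed input for the next output row, and the extra trailing entry keeps the leftover offset; the consuming kernel is not shown in this hunk). A minimal sketch of the non-block case (sparseBlockOC == 1, kernelSize == 1) on a 2x4 weight with eP = 4, not MNN code:

```cpp
#include <cstdio>
#include <vector>

int main() {
    const int h = 2, l = 4, eP = 4;
    // row0: 0 a 0 b   row1: c 0 0 0   (a=2, b=3, c=5 are arbitrary nonzero values)
    std::vector<float> weight = {0, 2.f, 0, 3.f,   5.f, 0, 0, 0};

    std::vector<float> dest;
    std::vector<unsigned> nnzMap;
    std::vector<int> dataOffsetMap;
    int columOffset = 0;
    for (int i = 0; i < h; ++i) {
        unsigned nnz = 0;
        for (int j = 0; j < l; ++j) {
            if (weight[i * l + j] != 0) {
                dest.push_back(weight[i * l + j]);
                dataOffsetMap.push_back(columOffset);  // jump from the previously stored column
                columOffset = 0;
                ++nnz;
            }
            columOffset += eP;
        }
        nnzMap.push_back(nnz);
        columOffset -= l * eP;  // rewind for the next output row
    }
    dataOffsetMap.push_back(columOffset);  // trailing entry

    // Expected: dest = {2, 3, 5}, nnzMap = {2, 1}, dataOffsetMap = {4, 8, -12, 0}
    std::printf("nnzMap: %u %u\n", nnzMap[0], nnzMap[1]);
    for (int v : dataOffsetMap) std::printf("offset %d\n", v);
    return 0;
}
```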
void SparseConvolutionTiledExecutor::initWeight(float* dest, unsigned int* NNZMap, int* dataOffsetMap,
int sparseBlockOC, const float* source, float* cache, int depth,
int sparseBlockOC, const float* source, const uint32_t* indexes, uint32_t indexSize, int depth,
int outputCount, int kernelSize, int eP, size_t weightNNZElement,
size_t weightBlockNumber, const CoreFunctions* function) {
ConvolutionTiledExecutor::initWeight(source, cache, depth, outputCount, kernelSize, function);
function->MNNPackForSparseMatMul_B(dest, NNZMap, dataOffsetMap, sparseBlockOC, cache, outputCount, kernelSize * depth, eP, false);
MNNPackForSparseMatMul_B(dest, NNZMap, dataOffsetMap, sparseBlockOC, source, indexes, indexSize, outputCount, depth, kernelSize, eP);
// MNN_PRINT("\nBCSR origin weight:");
// formatMatrix(source, {outputCount, kernelSize * depth});
@ -40,13 +179,13 @@ void SparseConvolutionTiledExecutor::initWeight(float* dest, unsigned int* NNZMa
SparseConvolutionTiledExecutor::SparseConvolutionTiledExecutor(const Convolution2DCommon *common, Backend* b,
const float* originWeight, size_t originWeightSize, const SparseCommon* sparseCommon,
const IDSTQuan* weight, const SparseCommon* sparseCommon,
const float* bias, size_t biasSize)
: ConvolutionTiledExecutor(b, bias, biasSize) {
auto outputCount = (int)biasSize;
// Don't use common->inputCount; for old models, common->inputCount is zero
auto lSize = originWeightSize / outputCount;
auto lSize = weight->weightSize() / outputCount;
auto srcCount = lSize / (common->kernelX() * common->kernelY());
int eP, lP, hP;
@ -64,7 +203,7 @@ SparseConvolutionTiledExecutor::SparseConvolutionTiledExecutor(const Convolution
if (optimalSparseBlockOC != sparseBlockOC) {
size_t optimalWeightNNZElement = weightNNZElement;
size_t optimalWeightBlockNumber = weightBlockNumber;
core->MNNGetOptimalBlockShape(optimalWeightNNZElement, optimalWeightBlockNumber, originWeight, optimalSparseBlockOC, outputCount, lSize);
MNNGetOptimalBlockShape(optimalWeightNNZElement, optimalWeightBlockNumber, weight->index()->data(), weight->index()->size(), optimalSparseBlockOC, outputCount, lSize);
MNN_ASSERT(sparseBlockOC == 1 || sparseBlockOC == 2 || sparseBlockOC == 4 || sparseBlockOC == 8);
// MNN_PRINT("caution: sparsity changed!!!\nsparseBlockOC:%d -> %d weightNNZElement:%zu -> %zu, weightBlockNumber:%zu -> %zu, outputCount:%d, divide:%d, tail:%d\n",
// sparseBlockOC, optimalSparseBlockOC, weightNNZElement, optimalWeightNNZElement, weightBlockNumber, optimalWeightBlockNumber, outputCount, outputCount / optimalSparseBlockOC, outputCount % optimalSparseBlockOC);
@ -72,26 +211,25 @@ SparseConvolutionTiledExecutor::SparseConvolutionTiledExecutor(const Convolution
weightNNZElement = optimalWeightNNZElement;
weightBlockNumber = optimalWeightBlockNumber;
}
MNN_ASSERT(weightNNZElement > 0);
MNN_ASSERT(weightBlockNumber > 0);
mSparseIndexData.reset(new SparseIndexData(sparseBlockOC, weightNNZElement, weightBlockNumber, backend()));
mResource->mWeight.reset(Tensor::createDevice<uint8_t>(
{ static_cast<int>(weightNNZElement + 1) * bytes })); // one more element in case the weights are all zeros
std::shared_ptr<Tensor> cache(Tensor::createDevice<uint8_t>({static_cast<int>(outputCount * lSize * sizeof(float))})); // cache must be float
mSparseIndexData->mNNZMap.reset(Tensor::createDevice<unsigned int>({outputCount / sparseBlockOC + outputCount % sparseBlockOC}));
mSparseIndexData->mDataOffsetMap.reset(Tensor::createDevice<int>({static_cast<int>(weightBlockNumber + 1)}));
mValid = backend()->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC);
mValid = mValid && backend()->onAcquireBuffer(cache.get(), Backend::STATIC);
mValid = mValid && backend()->onAcquireBuffer(mSparseIndexData->mNNZMap.get(), Backend::STATIC);
mValid = mValid && backend()->onAcquireBuffer(mSparseIndexData->mDataOffsetMap.get(), Backend::STATIC);
if (!mValid) {
return;
}
initWeight(mResource->mWeight->host<float>(), mSparseIndexData->mNNZMap->host<unsigned int>(), mSparseIndexData->mDataOffsetMap->host<int>(), sparseBlockOC, originWeight, cache->host<float>(), srcCount, outputCount, common->kernelX() * common->kernelY(), eP, weightNNZElement, weightBlockNumber, core);
backend()->onReleaseBuffer(cache.get(), Backend::STATIC);
initWeight(mResource->mWeight->host<float>(), mSparseIndexData->mNNZMap->host<unsigned int>(), mSparseIndexData->mDataOffsetMap->host<int>(), sparseBlockOC, weight->alpha()->data(), weight->index()->data(), weight->index()->size(), srcCount, outputCount, common->kernelX() * common->kernelY(), eP, weightNNZElement, weightBlockNumber, core);
mProxy.reset(new SparseConvolutionTiledImpl(common, packedSparseMatmul, sparseBlockOC, b));
}
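The NNZMap tensor is sized `outputCount / sparseBlockOC + outputCount % sparseBlockOC`: one entry per full block of output channels plus one per leftover tail channel, matching the two loops of the packing routine, while the weight buffer gets one extra element so an all-zero weight still has valid storage. A quick check of that sizing formula (an illustrative snippet, not MNN code):

```cpp
#include <cstdio>

int main() {
    // e.g. 10 output channels with a 4-wide sparse block:
    // 2 full blocks ({0..3}, {4..7}) + 2 tail rows (8, 9) -> 4 NNZMap entries.
    const int outputCount = 10, sparseBlockOC = 4;
    const int nnzMapSize = outputCount / sparseBlockOC + outputCount % sparseBlockOC;
    std::printf("NNZMap entries: %d\n", nnzMapSize);  // prints 4
    return 0;
}
```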

View File

@ -67,8 +67,7 @@ public:
class SparseConvolutionTiledExecutor : public ConvolutionTiledExecutor {
public:
SparseConvolutionTiledExecutor(const Convolution2DCommon *common, Backend *b, const float *originWeight,
size_t originWeightSize, const SparseCommon* sparseCommon, const float *bias, size_t biasSize);
SparseConvolutionTiledExecutor(const Convolution2DCommon *common, Backend *b, const IDSTQuan* weight, const SparseCommon* sparseCommon, const float *bias, size_t biasSize);
SparseConvolutionTiledExecutor(std::shared_ptr<CPUConvolution::Resource> res, std::shared_ptr<SparseIndexData> mSparseIndexData,
const Convolution2DCommon *common, MNNPackedSparseMatMul packedSparseMatmul, int sparseBlockOC, Backend *b);
@ -84,24 +83,9 @@ public:
virtual bool onClone(Backend *bn, const Op *op, Execution **dst) override;
void initWeight(float *dest, unsigned int *NNZMap, int *dataOffsetMap, int sparseBlockOC, const float *source,
float *cache, int depth, int outputCount, int kernelSize, int eP, size_t weightNNZElement,
const uint32_t* indexes, uint32_t indexSize, int depth, int outputCount, int kernelSize, int eP, size_t weightNNZElement,
size_t weightBlockNumber, const CoreFunctions *function);
static bool shouldUseSparseConvolution(size_t originWeightSize, const SparseCommon* sparseCommon) {
auto sparseBlockOC = sparseCommon->args()->LookupByKey("sparseBlockOC")->i();
size_t weightNNZElement = sparseCommon->args()->LookupByKey("NNZElement")->i();
return shouldUseSparseConvolution((originWeightSize - weightNNZElement) / ((double)originWeightSize), sparseBlockOC);
}
static bool inline shouldUseSparseConvolution(float sparsity, int sparseBlockOC) {
std::vector<float> thresholds = getSparsityThreshold();
return sparsity > thresholds[std::min(std::max(sparseBlockOC, 0), (int)thresholds.size() - 1)];
}
static inline std::vector<float> getSparsityThreshold() {
// sparsity threshold values, when the sparse block size is
// {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
return {1.f, 0.6f, 0.5f, 0.4f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f};
}
protected:
std::shared_ptr<SparseConvolutionTiledImpl> mProxy;
std::shared_ptr<SparseIndexData> mSparseIndexData;
@ -110,4 +94,4 @@ protected:
#undef RELEASE_BUFFER_HINT
} // namespace MNN
#endif /* SparseConvolutionTiledExecutor_hpp */
#endif /* SparseConvolutionTiledExecutor_hpp */

View File

@ -1,4 +1,4 @@
set(CUDA_MIN_VERSION "7.0")
set(CUDA_MIN_VERSION "8.0")
find_package(CUDA ${CUDA_MIN_VERSION})
set (EXTRA_LIBS "")
@ -21,6 +21,16 @@ if(CUDA_FOUND)
include(${CMAKE_CURRENT_SOURCE_DIR}/SelectCudaComputeArch.cmake)
CUDA_SELECT_NVCC_ARCH_FLAGS(CUDA_ARCH_FLAGS ${CUDA_ARCHS})
list(LENGTH CUDA_ARCH_FLAGS_readable_code arch_count)
# Current Supported Arch List
IF (${arch_count} EQUAL 1)
set(support_archs 60 61 62 70 72 75 80 86)
list(FIND support_archs ${CUDA_ARCH_FLAGS_readable_code} list_index)
IF (${list_index} EQUAL -1)
message(FATAL_ERROR "Please add your own sm arch ${CUDA_ARCH_FLAGS_readable_code} to CmakeLists.txt!")
ENDIF()
ENDIF()
IF ((CUDA_VERSION VERSION_GREATER "8.0") OR (CUDA_VERSION VERSION_EQUAL "8.0"))
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61")
@ -41,6 +51,27 @@ if(CUDA_FOUND)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_86,code=sm_86")
ENDIF()
# Limit minimum cuda version for each archs
IF (${arch_count} EQUAL 1)
IF ((CUDA_ARCH_FLAGS_readable_code VERSION_GREATER "80") OR (CUDA_ARCH_FLAGS_readable_code VERSION_EQUAL "80"))
IF (CUDA_VERSION VERSION_LESS "11.2")
message(FATAL_ERROR "Please update cuda version to 11.2 or higher!")
ENDIF()
ENDIF()
IF ((CUDA_ARCH_FLAGS_readable_code VERSION_GREATER "75") OR (CUDA_ARCH_FLAGS_readable_code VERSION_EQUAL "75"))
IF (CUDA_VERSION VERSION_LESS "10.2")
message(FATAL_ERROR "Please update cuda version to 10.2 or higher!")
ENDIF()
ENDIF()
IF ((CUDA_ARCH_FLAGS_readable_code VERSION_GREATER "70") OR (CUDA_ARCH_FLAGS_readable_code VERSION_EQUAL "70"))
IF (CUDA_VERSION VERSION_LESS "10.1")
message(FATAL_ERROR "Please update cuda version to 10.1 or higher!")
ENDIF()
ENDIF()
ENDIF()
message(STATUS "Enabling CUDA support (version: ${CUDA_VERSION_STRING},"
" archs: ${CUDA_ARCH_FLAGS_readable})")
else()

View File

@ -36,9 +36,9 @@
# - "Auto" detects local machine GPU compute arch at runtime.
# - "Common" and "All" cover common and entire subsets of architectures
# ARCH_AND_PTX : NAME | NUM.NUM | NUM.NUM(NUM.NUM) | NUM.NUM+PTX
# NAME: Fermi Kepler Maxwell Kepler+Tegra Kepler+Tesla Maxwell+Tegra Pascal Volta Turing
# NAME: Kepler Maxwell Kepler+Tegra Kepler+Tesla Maxwell+Tegra Pascal Volta Turing Ampere
# NUM: Any number. Only those pairs are currently accepted by NVCC though:
# 2.0 2.1 3.0 3.2 3.5 3.7 5.0 5.2 5.3 6.0 6.2 7.0 7.2 7.5
# 3.5 3.7 5.0 5.2 5.3 6.0 6.2 7.0 7.2 7.5 8.0
# Returns LIST of flags to be added to CUDA_NVCC_FLAGS in ${out_variable}
# Additionally, sets ${out_variable}_readable to the resulting numeric list
# Example:
@ -58,39 +58,19 @@ endif()
# See: https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#gpu-feature-list
# This list will be used for CUDA_ARCH_NAME = All option
set(CUDA_KNOWN_GPU_ARCHITECTURES "")
# CUDA 9.X and later do not support the Fermi architecture anymore.
if(CUDA_VERSION VERSION_LESS "9.0")
list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Fermi")
endif()
list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Kepler" "Maxwell")
set(CUDA_KNOWN_GPU_ARCHITECTURES "Kepler" "Maxwell")
# This list will be used for CUDA_ARCH_NAME = Common option (enabled by default)
set(CUDA_COMMON_GPU_ARCHITECTURES "3.0" "3.5" "5.0")
if(CUDA_VERSION VERSION_LESS "7.0")
set(CUDA_LIMIT_GPU_ARCHITECTURE "5.2")
endif()
set(CUDA_COMMON_GPU_ARCHITECTURES "3.5" "5.0")
# This list is used to filter CUDA archs when autodetecting
set(CUDA_ALL_GPU_ARCHITECTURES "3.0" "3.2" "3.5" "5.0")
if(CUDA_VERSION VERSION_EQUAL "7.0" OR CUDA_VERSION VERSION_GREATER "7.0")
list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Kepler+Tegra" "Kepler+Tesla" "Maxwell+Tegra")
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "5.2")
if(CUDA_VERSION VERSION_LESS "8.0")
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "5.2+PTX")
set(CUDA_LIMIT_GPU_ARCHITECTURE "6.0")
endif()
endif()
set(CUDA_ALL_GPU_ARCHITECTURES "3.5" "5.0")
if(CUDA_VERSION VERSION_EQUAL "8.0" OR CUDA_VERSION VERSION_GREATER "8.0")
list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Pascal")
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "6.0" "6.1")
list(APPEND CUDA_ALL_GPU_ARCHITECTURES "6.0" "6.1" "6.2")
if(CUDA_VERSION VERSION_LESS "9.0")
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "6.1+PTX")
set(CUDA_LIMIT_GPU_ARCHITECTURE "7.0")
@ -101,22 +81,58 @@ if(CUDA_VERSION VERSION_EQUAL "9.0" OR CUDA_VERSION VERSION_GREATER "9.0")
list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Volta")
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "7.0" "7.0+PTX")
list(APPEND CUDA_ALL_GPU_ARCHITECTURES "7.0" "7.0+PTX" "7.2" "7.2+PTX")
if(CUDA_VERSION VERSION_LESS "10.0")
set(CUDA_LIMIT_GPU_ARCHITECTURE "8.0")
endif()
endif()
if(CUDA_VERSION VERSION_GREATER "10.5")
list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Ampere")
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.0")
list(APPEND CUDA_ALL_GPU_ARCHITECTURES "8.0")
if(CUDA_VERSION VERSION_LESS "11.1")
set(CUDA_LIMIT_GPU_ARCHITECTURE "8.0")
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.0+PTX")
endif()
endif()
if(CUDA_VERSION VERSION_EQUAL "10.0" OR CUDA_VERSION VERSION_GREATER "10.0")
list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Turing")
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "7.5" "7.5+PTX")
list(APPEND CUDA_ALL_GPU_ARCHITECTURES "7.5" "7.5+PTX")
if(CUDA_VERSION VERSION_LESS "11.0")
set(CUDA_LIMIT_GPU_ARCHITECTURE "9.0")
endif()
endif()
if(NOT CUDA_VERSION VERSION_LESS "11.1")
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.6")
list(APPEND CUDA_ALL_GPU_ARCHITECTURES "8.6")
set(CUDA_LIMIT_GPU_ARCHITECUTRE "8.6")
if(CUDA_VERSION VERSION_LESS "11.8")
set(CUDA_LIMIT_GPU_ARCHITECTURE "8.9")
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.6+PTX")
endif()
endif()
if(NOT CUDA_VERSION VERSION_LESS "11.8")
list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Ada")
list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Hopper")
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.9")
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "9.0")
list(APPEND CUDA_ALL_GPU_ARCHITECTURES "8.9")
list(APPEND CUDA_ALL_GPU_ARCHITECTURES "9.0")
if(CUDA_VERSION VERSION_LESS "12.0")
set(CUDA_LIMIT_GPU_ARCHITECTURE "9.0")
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.9+PTX")
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "9.0+PTX")
endif()
endif()
################################################################################################
# A function for automatic detection of GPUs installed (if autodetection is enabled)
# Usage:
@ -175,7 +191,8 @@ function(CUDA_DETECT_INSTALLED_GPUS OUT_VARIABLE)
set(CUDA_GPU_DETECT_OUTPUT_FILTERED "")
separate_arguments(CUDA_GPU_DETECT_OUTPUT)
foreach(ITEM IN ITEMS ${CUDA_GPU_DETECT_OUTPUT})
if(CUDA_LIMIT_GPU_ARCHITECTURE AND (ITEM VERSION_EQUAL CUDA_LIMIT_GPU_ARCHITECTURE OR ITEM VERSION_GREATER CUDA_LIMIT_GPU_ARCHITECTURE))
if(CUDA_LIMIT_GPU_ARCHITECTURE AND (ITEM VERSION_GREATER CUDA_LIMIT_GPU_ARCHITECTURE OR
ITEM VERSION_EQUAL CUDA_LIMIT_GPU_ARCHITECTURE))
list(GET CUDA_COMMON_GPU_ARCHITECTURES -1 NEWITEM)
string(APPEND CUDA_GPU_DETECT_OUTPUT_FILTERED " ${NEWITEM}")
else()
@ -228,14 +245,10 @@ function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable)
set(arch_ptx ${arch_bin})
else()
# Look for it in our list of known architectures
if(${arch_name} STREQUAL "Fermi")
set(arch_bin 2.0 "2.1(2.0)")
elseif(${arch_name} STREQUAL "Kepler+Tegra")
set(arch_bin 3.2)
elseif(${arch_name} STREQUAL "Kepler+Tesla")
if(${arch_name} STREQUAL "Kepler+Tesla")
set(arch_bin 3.7)
elseif(${arch_name} STREQUAL "Kepler")
set(arch_bin 3.0 3.5)
set(arch_bin 3.5)
set(arch_ptx 3.5)
elseif(${arch_name} STREQUAL "Maxwell+Tegra")
set(arch_bin 5.3)
@ -245,12 +258,25 @@ function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable)
elseif(${arch_name} STREQUAL "Pascal")
set(arch_bin 6.0 6.1)
set(arch_ptx 6.1)
elseif(${arch_name} STREQUAL "Volta+Tegra")
set(arch_bin 7.2)
elseif(${arch_name} STREQUAL "Volta")
set(arch_bin 7.0 7.0)
set(arch_ptx 7.0)
elseif(${arch_name} STREQUAL "Turing")
set(arch_bin 7.5)
set(arch_ptx 7.5)
elseif(${arch_name} STREQUAL "Ampere+Tegra")
set(arch_bin 8.7)
elseif(${arch_name} STREQUAL "Ampere")
set(arch_bin 8.0 8.6)
set(arch_ptx 8.0 8.6)
elseif(${arch_name} STREQUAL "Ada")
set(arch_bin 8.9)
set(arch_ptx 8.9)
elseif(${arch_name} STREQUAL "Hopper")
set(arch_bin 9.0)
set(arch_ptx 9.0)
else()
message(SEND_ERROR "Unknown CUDA Architecture Name ${arch_name} in CUDA_SELECT_NVCC_ARCH_FLAGS")
endif()
@ -282,17 +308,20 @@ function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable)
set(nvcc_flags "")
set(nvcc_archs_readable "")
set(nvcc_archs_code "")
# Tell NVCC to add binaries for the specified GPUs
foreach(arch ${cuda_arch_bin})
if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)")
# User explicitly specified ARCH for the concrete CODE
list(APPEND nvcc_flags " -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1}")
list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1})
list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1})
list(APPEND nvcc_archs_code ${CMAKE_MATCH_1})
else()
# User didn't explicitly specify ARCH for the concrete CODE, we assume ARCH=CODE
list(APPEND nvcc_flags " -gencode arch=compute_${arch},code=sm_${arch}")
list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch})
list(APPEND nvcc_archs_readable sm_${arch})
list(APPEND nvcc_archs_code ${arch})
endif()
endforeach()
@ -305,4 +334,5 @@ function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable)
string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}")
set(${out_variable} ${nvcc_flags} PARENT_SCOPE)
set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE)
endfunction()
set(${out_variable}_readable_code ${nvcc_archs_code} PARENT_SCOPE)
endfunction()

View File

@ -215,7 +215,8 @@ using GemmTensor_F16_F16_Linear_AlignCuda_Sm75 = cutlass::gemm::device::Gemm<
cutlass::gemm::GemmShape<16, 8, 8>,
EpilogueCudaOp_F16_Linear,
SwizzleThreadBlock,
NumStages>;
NumStages,
128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;
using GemmTensor_F16_F16_Linear_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
cutlass::half_t,
@ -232,7 +233,8 @@ using GemmTensor_F16_F16_Linear_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
cutlass::gemm::GemmShape<16, 8, 8>,
EpilogueTensorOp_F16_Linear,
SwizzleThreadBlock,
NumStages>;
NumStages,
128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;
using GemmTensor_F16_F32_Linear_AlignCuda_Sm75 = cutlass::gemm::device::Gemm<
cutlass::half_t,
@ -249,7 +251,8 @@ using GemmTensor_F16_F32_Linear_AlignCuda_Sm75 = cutlass::gemm::device::Gemm<
cutlass::gemm::GemmShape<16, 8, 8>,
EpilogueCudaOp_F32_Linear,
SwizzleThreadBlock,
NumStages>;
NumStages,
128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;
using GemmTensor_F16_F32_Linear_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
cutlass::half_t,
@ -266,7 +269,8 @@ using GemmTensor_F16_F32_Linear_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
cutlass::gemm::GemmShape<16, 8, 8>,
EpilogueTensorOp_F32_Linear,
SwizzleThreadBlock,
NumStages>;
NumStages,
128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;
using GemmCuda_F32_F32_Linear_AlignCuda = cutlass::gemm::device::Gemm<
float,
@ -334,7 +338,8 @@ using GemmTensor_F32_F32_Linear_AlignCuda_Sm75 = cutlass::gemm::device::Gemm<
cutlass::gemm::GemmShape<16, 8, 8>,
EpilogueCudaOp_F32_Linear,
SwizzleThreadBlock,
NumStages>;
NumStages,
128 / cutlass::sizeof_bits<float>::value, 128 / cutlass::sizeof_bits<float>::value, true>;
using GemmTensor_F32_F32_Linear_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
float,
@ -351,7 +356,8 @@ using GemmTensor_F32_F32_Linear_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
cutlass::gemm::GemmShape<16, 8, 8>,
EpilogueTensorOp_F32_Linear,
SwizzleThreadBlock,
NumStages>;
NumStages,
128 / cutlass::sizeof_bits<float>::value, 128 / cutlass::sizeof_bits<float>::value, true>;
using GemmCuda_F16_F16_Relu_AlignCuda = cutlass::gemm::device::Gemm<
cutlass::half_t,
@ -470,7 +476,8 @@ using GemmTensor_F16_F16_Relu_AlignCuda_Sm75 = cutlass::gemm::device::Gemm<
cutlass::gemm::GemmShape<16, 8, 8>,
EpilogueCudaOp_F16_Relu,
SwizzleThreadBlock,
NumStages>;
NumStages,
128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;
using GemmTensor_F16_F16_Relu_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
cutlass::half_t,
@ -487,7 +494,8 @@ using GemmTensor_F16_F16_Relu_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
cutlass::gemm::GemmShape<16, 8, 8>,
EpilogueTensorOp_F16_Relu,
SwizzleThreadBlock,
NumStages>;
NumStages,
128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;
using GemmTensor_F16_F32_Relu_AlignCuda_Sm75 = cutlass::gemm::device::Gemm<
cutlass::half_t,
@ -504,7 +512,8 @@ using GemmTensor_F16_F32_Relu_AlignCuda_Sm75 = cutlass::gemm::device::Gemm<
cutlass::gemm::GemmShape<16, 8, 8>,
EpilogueCudaOp_F32_Relu,
SwizzleThreadBlock,
NumStages>;
NumStages,
128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;
using GemmTensor_F16_F32_Relu_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
cutlass::half_t,
@ -521,7 +530,8 @@ using GemmTensor_F16_F32_Relu_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
cutlass::gemm::GemmShape<16, 8, 8>,
EpilogueTensorOp_F32_Relu,
SwizzleThreadBlock,
NumStages>;
NumStages,
128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;
using GemmCuda_F32_F32_Relu_AlignCuda = cutlass::gemm::device::Gemm<
float,
@ -589,7 +599,8 @@ using GemmTensor_F32_F32_Relu_AlignCuda_Sm75 = cutlass::gemm::device::Gemm<
cutlass::gemm::GemmShape<16, 8, 8>,
EpilogueCudaOp_F32_Relu,
SwizzleThreadBlock,
NumStages>;
NumStages,
128 / cutlass::sizeof_bits<float>::value, 128 / cutlass::sizeof_bits<float>::value, true>;
using GemmTensor_F32_F32_Relu_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
float,
@ -606,7 +617,8 @@ using GemmTensor_F32_F32_Relu_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
cutlass::gemm::GemmShape<16, 8, 8>,
EpilogueTensorOp_F32_Relu,
SwizzleThreadBlock,
NumStages>;
NumStages,
128 / cutlass::sizeof_bits<float>::value, 128 / cutlass::sizeof_bits<float>::value, true>;
using GemmCuda_F16_F16_Relu6_AlignCuda = cutlass::gemm::device::Gemm<
cutlass::half_t,
@ -725,7 +737,8 @@ using GemmTensor_F16_F16_Relu6_AlignCuda_Sm75 = cutlass::gemm::device::Gemm<
cutlass::gemm::GemmShape<16, 8, 8>,
EpilogueCudaOp_F16_Relu6,
SwizzleThreadBlock,
NumStages>;
NumStages,
128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;
using GemmTensor_F16_F16_Relu6_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
cutlass::half_t,
@ -742,7 +755,8 @@ using GemmTensor_F16_F16_Relu6_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
cutlass::gemm::GemmShape<16, 8, 8>,
EpilogueTensorOp_F16_Relu6,
SwizzleThreadBlock,
NumStages>;
NumStages,
128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;
using GemmTensor_F16_F32_Relu6_AlignCuda_Sm75 = cutlass::gemm::device::Gemm<
cutlass::half_t,
@ -759,7 +773,8 @@ using GemmTensor_F16_F32_Relu6_AlignCuda_Sm75 = cutlass::gemm::device::Gemm<
cutlass::gemm::GemmShape<16, 8, 8>,
EpilogueCudaOp_F32_Relu6,
SwizzleThreadBlock,
NumStages>;
NumStages,
128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;
using GemmTensor_F16_F32_Relu6_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
cutlass::half_t,
@ -776,7 +791,8 @@ using GemmTensor_F16_F32_Relu6_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
cutlass::gemm::GemmShape<16, 8, 8>,
EpilogueTensorOp_F32_Relu6,
SwizzleThreadBlock,
NumStages>;
NumStages,
128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;
using GemmCuda_F32_F32_Relu6_AlignCuda = cutlass::gemm::device::Gemm<
float,
@ -844,7 +860,8 @@ using GemmTensor_F32_F32_Relu6_AlignCuda_Sm75 = cutlass::gemm::device::Gemm<
cutlass::gemm::GemmShape<16, 8, 8>,
EpilogueCudaOp_F32_Relu6,
SwizzleThreadBlock,
NumStages>;
NumStages,
128 / cutlass::sizeof_bits<float>::value, 128 / cutlass::sizeof_bits<float>::value, true>;
using GemmTensor_F32_F32_Relu6_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
float,
@ -861,8 +878,9 @@ using GemmTensor_F32_F32_Relu6_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
cutlass::gemm::GemmShape<16, 8, 8>,
EpilogueTensorOp_F32_Relu6,
SwizzleThreadBlock,
NumStages>;
NumStages,
128 / cutlass::sizeof_bits<float>::value, 128 / cutlass::sizeof_bits<float>::value, true>;
}
}
#endif
#endif
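The trailing template arguments added to these Gemm typedefs spell out the A/B operand alignment as one 128-bit vector per access, and, assuming the final `true` is the `SplitKSerial` flag of `cutlass::gemm::device::Gemm` as in CUTLASS 2.x, enable the serial split-K reduction used by the split-K path in `MatMulExecution`. A trivial standalone check of the alignment arithmetic (assumes 16-bit halves and 32-bit floats; not CUTLASS code):

```cpp
#include <cstdio>

int main() {
    // 128 / cutlass::sizeof_bits<T>::value boils down to:
    constexpr int kAlignHalf  = 128 / 16;  // 8 half elements per 128-bit access
    constexpr int kAlignFloat = 128 / 32;  // 4 float elements per 128-bit access
    static_assert(kAlignHalf == 8 && kAlignFloat == 4, "128-bit vectorized alignment");
    std::printf("half align %d, float align %d\n", kAlignHalf, kAlignFloat);
    return 0;
}
```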

View File

@ -4,36 +4,6 @@ namespace CUDA {
#define CUDA_KERNEL_LOOP(i, n) for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
#define FINAL_MASK 0xffffffff
template <typename T>
__inline__ __device__
T warpReduceSum(T val)
{
for(int mask = 16; mask > 0; mask >>= 1)
val += __shfl_xor_sync(FINAL_MASK, val, mask, 32);
return val;
}
template <typename T>
__inline__ __device__
T blockReduceSum(T val)
{
static __shared__ T shared[32];
int lane = threadIdx.x & 0x1f;
int wid = threadIdx.x >> 5;
val = warpReduceSum<T>(val);
if(lane == 0)
shared[wid] = val;
__syncthreads();
val = (threadIdx.x < (blockDim.x >> 5 )) ? shared[lane] : (T)0.0f;
val = warpReduceSum(val);
return val;
}
template <typename T>
__global__
void input_layernorm(T* out, const T* input, const float* gamma, const float* beta, int m, int n, const float epsilon, int sumPerKnl)

View File

@ -10,7 +10,7 @@
#define LayerNormExecution_hpp
#include "core/Execution.hpp"
#include "MNNCUDAFunction.cuh"
#include <vector>
#include "backend/cuda/core/CUDABackend.hpp"

View File

@ -1,6 +1,8 @@
#ifndef MNNCUDAFunction_cuh
#define MNNCUDAFunction_cuh
#include <stdint.h>
struct DivModFast {
DivModFast(int d = 1)
{
@ -35,4 +37,68 @@ struct DivModFast {
uint32_t l_; // ceil(log2(d_))
uint32_t m_; // m' in the paper
};
#define FINAL_MASK 0xffffffff
template <typename T>
__inline__ __device__
T warpReduceSum(T val)
{
for(int mask = 16; mask > 0; mask >>= 1) {
val += __shfl_xor_sync(FINAL_MASK, val, mask, 32);
}
return val;
}
template <typename T>
__inline__ __device__
T blockReduceSum(T val)
{
static __shared__ T shared[32];
int lane = threadIdx.x & 0x1f;
int wid = threadIdx.x >> 5;
val = warpReduceSum<T>(val);
if(lane == 0) {
shared[wid] = val;
}
__syncthreads();
val = (threadIdx.x < (blockDim.x >> 5 )) ? shared[lane] : (T)0.0f;
val = warpReduceSum(val);
return val;
}
template <typename T>
__inline__ __device__
T warpReduceMax(T val)
{
for(int mask = 16; mask > 0; mask >>= 1) {
val = max(val, __shfl_xor_sync(FINAL_MASK, val, mask, 32));
}
return val;
}
template <typename T>
__inline__ __device__
T blockReduceMax(T val)
{
static __shared__ T shared[32];
int lane = threadIdx.x & 0x1f;
int wid = threadIdx.x >> 5;
val = warpReduceMax<T>(val);
if(lane == 0) {
shared[wid] = val;
}
__syncthreads();
val = (threadIdx.x < (blockDim.x >> 5 )) ? shared[lane] : (T)0.0f;
val = warpReduceMax(val);
return val;
}
#endif
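`warpReduceSum` uses the XOR-butterfly schedule: after exchanging with lanes at distance 16, 8, 4, 2 and 1, every lane of the warp holds the full 32-lane sum, and `blockReduceSum` then repeats the same reduction over the (at most 32) per-warp partials staged in shared memory. A host-side sketch of the butterfly schedule on 32 plain values (an illustration only, not device code):

```cpp
#include <array>
#include <cstdio>

int main() {
    std::array<int, 32> lane{};
    for (int i = 0; i < 32; ++i) lane[i] = i + 1;             // lane values 1..32, sum = 528
    for (int mask = 16; mask > 0; mask >>= 1) {               // same schedule as __shfl_xor_sync
        std::array<int, 32> next = lane;
        for (int i = 0; i < 32; ++i) next[i] = lane[i] + lane[i ^ mask];
        lane = next;
    }
    std::printf("%d %d\n", lane[0], lane[31]);                // every lane ends up with 528
    return 0;
}
```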

View File

@ -425,59 +425,109 @@ void MatMulExecution::setArguments(const std::vector<Tensor *> &inputs, const st
cutlass_check(status);
} else {
if(hAlignment) {
typename GemmBatchedTensor_F16_F16_Linear_AlignTensor_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
{(ElementInput_F16 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]* mAs), // batch_stride_A
{(ElementInput_F16 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elhPad[1] * mGemmInfo.elh[2]* mBs), // batch_stride_B
{(ElementOutput_F16 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
(int64_t)(0), // batch_stride_bias
{(ElementOutput_F16 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elh[2]), // batch_stride_C
{alpha, beta}, // <- tuple of alpha and beta
mBatch}; // batch_count
if(mConvertGemmSplitK) {
int split_k_slices = 16;
typename GemmTensor_F16_F16_Linear_AlignTensor_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
{(ElementInput_F16 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
{(ElementInput_F16 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
{(ElementOutput_F16 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
{(ElementOutput_F16 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
{alpha, beta}, // <- tuple of alpha and beta
split_k_slices}; // <- k-dimension split factor
size_t workspace_size = GemmTensor_F16_F16_Linear_AlignTensor_Sm75::get_workspace_size(arguments);
size_t workspace_size = GemmBatchedTensor_F16_F16_Linear_AlignTensor_Row_Column_Sm75::get_workspace_size(arguments);
if(workspace_size != 0) {
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
}
if(workspace_size != 0) {
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
cutlass::Status status = mGemmF16F16LnAlign8Sm75.can_implement(arguments);
cutlass_check(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmF16F16LnAlign8Sm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
} else {
typename GemmBatchedTensor_F16_F16_Linear_AlignTensor_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
{(ElementInput_F16 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]* mAs), // batch_stride_A
{(ElementInput_F16 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elhPad[1] * mGemmInfo.elh[2]* mBs), // batch_stride_B
{(ElementOutput_F16 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
(int64_t)(0), // batch_stride_bias
{(ElementOutput_F16 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elh[2]), // batch_stride_C
{alpha, beta}, // <- tuple of alpha and beta
mBatch}; // batch_count
size_t workspace_size = GemmBatchedTensor_F16_F16_Linear_AlignTensor_Row_Column_Sm75::get_workspace_size(arguments);
if(workspace_size != 0) {
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
}
// Check the problem size is supported or not
cutlass::Status status = mGemmBatchedF16F16LnAlign8RCSm75.can_implement(arguments);
cutlass_check(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmBatchedF16F16LnAlign8RCSm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
}
// Check the problem size is supported or not
cutlass::Status status = mGemmBatchedF16F16LnAlign8RCSm75.can_implement(arguments);
cutlass_check(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmBatchedF16F16LnAlign8RCSm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
} else {
typename GemmBatchedTensor_F16_F16_Linear_AlignCuda_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
{(ElementInput_F16 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]* mAs), // batch_stride_A
{(ElementInput_F16 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elhPad[1] * mGemmInfo.elh[2]* mBs), // batch_stride_B
{(ElementOutput_F16 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
(int64_t)(0), // batch_stride_bias
{(ElementOutput_F16 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elh[2]), // batch_stride_C
{alpha, beta}, // <- tuple of alpha and beta
mBatch}; // batch_count
if(mConvertGemmSplitK) {
int split_k_slices = 16;
typename GemmTensor_F16_F16_Linear_AlignCuda_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
{(ElementInput_F16 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
{(ElementInput_F16 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
{(ElementOutput_F16 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
{(ElementOutput_F16 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
{alpha, beta}, // <- tuple of alpha and beta
split_k_slices}; // <- k-dimension split factor
size_t workspace_size = GemmTensor_F16_F16_Linear_AlignCuda_Sm75::get_workspace_size(arguments);
size_t workspace_size = GemmBatchedTensor_F16_F16_Linear_AlignCuda_Row_Column_Sm75::get_workspace_size(arguments);
if(workspace_size != 0) {
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
}
if(workspace_size != 0) {
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
cutlass::Status status = mGemmF16F16LnAlign1Sm75.can_implement(arguments);
cutlass_check(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmF16F16LnAlign1Sm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
} else {
typename GemmBatchedTensor_F16_F16_Linear_AlignCuda_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
{(ElementInput_F16 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]* mAs), // batch_stride_A
{(ElementInput_F16 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elhPad[1] * mGemmInfo.elh[2]* mBs), // batch_stride_B
{(ElementOutput_F16 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
(int64_t)(0), // batch_stride_bias
{(ElementOutput_F16 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elh[2]), // batch_stride_C
{alpha, beta}, // <- tuple of alpha and beta
mBatch}; // batch_count
size_t workspace_size = GemmBatchedTensor_F16_F16_Linear_AlignCuda_Row_Column_Sm75::get_workspace_size(arguments);
if(workspace_size != 0) {
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
}
// Check the problem size is supported or not
cutlass::Status status = mGemmBatchedF16F16LnAlign1RCSm75.can_implement(arguments);
cutlass_check(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmBatchedF16F16LnAlign1RCSm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
}
// Check the problem size is supported or not
cutlass::Status status = mGemmBatchedF16F16LnAlign1RCSm75.can_implement(arguments);
cutlass_check(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmBatchedF16F16LnAlign1RCSm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
}
}
@ -541,63 +591,31 @@ void MatMulExecution::setArguments(const std::vector<Tensor *> &inputs, const st
} else {
if(hAlignment) {
if(mNeedConvertMatAB) {
typename GemmBatchedTensor_F16_F32_Linear_AlignTensor_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
{(ElementInput_F16 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]* mAs), // batch_stride_A
{(ElementInput_F16 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elhPad[1] * mGemmInfo.elh[2]* mBs), // batch_stride_B
{(ElementOutput_F32 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
(int64_t)(0), // batch_stride_bias
{(ElementOutput_F32 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elh[2]), // batch_stride_C
{alpha, beta}, // <- tuple of alpha and beta
mBatch}; // batch_count
size_t workspace_size = GemmBatchedTensor_F16_F32_Linear_AlignTensor_Row_Column_Sm75::get_workspace_size(arguments);
if(workspace_size != 0) {
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
}
// Check the problem size is supported or not
cutlass::Status status = mGemmBatchedF16F32LnAlign8RCSm75.can_implement(arguments);
cutlass_check(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmBatchedF16F32LnAlign8RCSm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
} else {
typename GemmBatchedTensor_F32_F32_Linear_AlignTensor_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
{(ElementInput_F32 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]* mAs), // batch_stride_A
{(ElementInput_F32 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elhPad[1] * mGemmInfo.elh[2]* mBs), // batch_stride_B
{(ElementOutput_F32 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
(int64_t)(0), // batch_stride_bias
{(ElementOutput_F32 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elh[2]), // batch_stride_C
{alpha, beta}, // <- tuple of alpha and beta
mBatch}; // batch_count
size_t workspace_size = GemmBatchedTensor_F32_F32_Linear_AlignTensor_Row_Column_Sm75::get_workspace_size(arguments);
if(workspace_size != 0) {
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
}
// Check the problem size is supported or not
cutlass::Status status = mGemmBatchedF32F32LnAlign8RCSm75.can_implement(arguments);
cutlass_check(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmBatchedF32F32LnAlign8RCSm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
}
} else {
if(mNeedConvertMatAB) {
typename GemmBatchedTensor_F16_F32_Linear_AlignCuda_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
if(mConvertGemmSplitK) {
int split_k_slices = 16;
typename GemmTensor_F16_F32_Linear_AlignTensor_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
{(ElementInput_F16 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
{(ElementInput_F16 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
{(ElementOutput_F32 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
{(ElementOutput_F32 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
{alpha, beta}, // <- tuple of alpha and beta
split_k_slices}; // <- k-dimension split factor
size_t workspace_size = GemmTensor_F16_F32_Linear_AlignTensor_Sm75::get_workspace_size(arguments);
if(workspace_size != 0) {
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
}
cutlass::Status status = mGemmF16F32LnAlign8Sm75.can_implement(arguments);
cutlass_check(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmF16F32LnAlign8Sm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
} else {
typename GemmBatchedTensor_F16_F32_Linear_AlignTensor_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
{(ElementInput_F16 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]* mAs), // batch_stride_A
{(ElementInput_F16 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
@ -609,47 +627,179 @@ void MatMulExecution::setArguments(const std::vector<Tensor *> &inputs, const st
{alpha, beta}, // <- tuple of alpha and beta
mBatch}; // batch_count
size_t workspace_size = GemmBatchedTensor_F16_F32_Linear_AlignCuda_Row_Column_Sm75::get_workspace_size(arguments);
size_t workspace_size = GemmBatchedTensor_F16_F32_Linear_AlignTensor_Row_Column_Sm75::get_workspace_size(arguments);
if(workspace_size != 0) {
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
if(workspace_size != 0) {
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
}
// Check the problem size is supported or not
cutlass::Status status = mGemmBatchedF16F32LnAlign8RCSm75.can_implement(arguments);
cutlass_check(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmBatchedF16F32LnAlign8RCSm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
}
// Check the problem size is supported or not
cutlass::Status status = mGemmBatchedF16F32LnAlign1RCSm75.can_implement(arguments);
cutlass_check(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmBatchedF16F32LnAlign1RCSm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
} else {
typename GemmBatchedTensor_F32_F32_Linear_AlignCuda_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
{(ElementInput_F32 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]* mAs), // batch_stride_A
{(ElementInput_F32 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elhPad[1] * mGemmInfo.elh[2]* mBs), // batch_stride_B
{(ElementOutput_F32 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
(int64_t)(0), // batch_stride_bias
{(ElementOutput_F32 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elh[2]), // batch_stride_C
{alpha, beta}, // <- tuple of alpha and beta
mBatch}; // batch_count
if(mConvertGemmSplitK) {
int split_k_slices = 16;
typename GemmTensor_F32_F32_Linear_AlignTensor_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
{(ElementInput_F32 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
{(ElementInput_F32 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
{(ElementOutput_F32 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
{(ElementOutput_F32 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
{alpha, beta}, // <- tuple of alpha and beta
split_k_slices}; // <- k-dimension split factor
size_t workspace_size = GemmTensor_F32_F32_Linear_AlignTensor_Sm75::get_workspace_size(arguments);
if(workspace_size != 0) {
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
}
cutlass::Status status = mGemmF32F32LnAlign8Sm75.can_implement(arguments);
cutlass_check(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmF32F32LnAlign8Sm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
} else {
typename GemmBatchedTensor_F32_F32_Linear_AlignTensor_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
{(ElementInput_F32 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]* mAs), // batch_stride_A
{(ElementInput_F32 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elhPad[1] * mGemmInfo.elh[2]* mBs), // batch_stride_B
{(ElementOutput_F32 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
(int64_t)(0), // batch_stride_bias
{(ElementOutput_F32 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elh[2]), // batch_stride_C
{alpha, beta}, // <- tuple of alpha and beta
mBatch}; // batch_count
size_t workspace_size = GemmBatchedTensor_F32_F32_Linear_AlignCuda_Row_Column_Sm75::get_workspace_size(arguments);
size_t workspace_size = GemmBatchedTensor_F32_F32_Linear_AlignTensor_Row_Column_Sm75::get_workspace_size(arguments);
if(workspace_size != 0) {
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
if(workspace_size != 0) {
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
}
// Check the problem size is supported or not
cutlass::Status status = mGemmBatchedF32F32LnAlign8RCSm75.can_implement(arguments);
cutlass_check(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmBatchedF32F32LnAlign8RCSm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
}
// Check the problem size is supported or not
cutlass::Status status = mGemmBatchedF32F32LnAlign1RCSm75.can_implement(arguments);
cutlass_check(status);
}
} else {
if(mNeedConvertMatAB) {
if(mConvertGemmSplitK) {
int split_k_slices = 16;
typename GemmTensor_F16_F32_Linear_AlignCuda_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
{(ElementInput_F16 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
{(ElementInput_F16 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
{(ElementOutput_F32 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
{(ElementOutput_F32 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
{alpha, beta}, // <- tuple of alpha and beta
split_k_slices}; // <- k-dimension split factor
size_t workspace_size = GemmTensor_F16_F32_Linear_AlignCuda_Sm75::get_workspace_size(arguments);
if(workspace_size != 0) {
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
}
cutlass::Status status = mGemmF16F32LnAlign1Sm75.can_implement(arguments);
cutlass_check(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmF16F32LnAlign1Sm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
} else {
typename GemmBatchedTensor_F16_F32_Linear_AlignCuda_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
{(ElementInput_F16 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]* mAs), // batch_stride_A
{(ElementInput_F16 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elhPad[1] * mGemmInfo.elh[2]* mBs), // batch_stride_B
{(ElementOutput_F32 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
(int64_t)(0), // batch_stride_bias
{(ElementOutput_F32 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elh[2]), // batch_stride_C
{alpha, beta}, // <- tuple of alpha and beta
mBatch}; // batch_count
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmBatchedF32F32LnAlign1RCSm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
size_t workspace_size = GemmBatchedTensor_F16_F32_Linear_AlignCuda_Row_Column_Sm75::get_workspace_size(arguments);
if(workspace_size != 0) {
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
}
// Check the problem size is supported or not
cutlass::Status status = mGemmBatchedF16F32LnAlign1RCSm75.can_implement(arguments);
cutlass_check(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmBatchedF16F32LnAlign1RCSm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
}
} else {
if(mConvertGemmSplitK) {
int split_k_slices = 16;
typename GemmTensor_F32_F32_Linear_AlignCuda_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
{(ElementInput_F32 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
{(ElementInput_F32 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
{(ElementOutput_F32 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
{(ElementOutput_F32 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
{alpha, beta}, // <- tuple of alpha and beta
split_k_slices}; // <- k-dimension split factor
size_t workspace_size = GemmTensor_F32_F32_Linear_AlignCuda_Sm75::get_workspace_size(arguments);
if(workspace_size != 0) {
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
}
cutlass::Status status = mGemmF32F32LnAlign1Sm75.can_implement(arguments);
cutlass_check(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmF32F32LnAlign1Sm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
} else {
typename GemmBatchedTensor_F32_F32_Linear_AlignCuda_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
{(ElementInput_F32 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]* mAs), // batch_stride_A
{(ElementInput_F32 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elhPad[1] * mGemmInfo.elh[2]* mBs), // batch_stride_B
{(ElementOutput_F32 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
(int64_t)(0), // batch_stride_bias
{(ElementOutput_F32 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elh[2]), // batch_stride_C
{alpha, beta}, // <- tuple of alpha and beta
mBatch}; // batch_count
size_t workspace_size = GemmBatchedTensor_F32_F32_Linear_AlignCuda_Row_Column_Sm75::get_workspace_size(arguments);
if(workspace_size != 0) {
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
}
// Check the problem size is supported or not
cutlass::Status status = mGemmBatchedF32F32LnAlign1RCSm75.can_implement(arguments);
cutlass_check(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmBatchedF32F32LnAlign1RCSm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
}
}
}
}
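Context for the `mConvertGemmSplitK` branches in this hunk: when the batch is 1 and the padded reduction dimension `mGemmInfo.elhPad[1]` is very large, the execution switches from a batched GEMM to a single split-K GEMM with `split_k_slices = 16`, so CUTLASS can partition K across slices and reduce the partial results through the workspace queried above. The following is a minimal host-side sketch of the split-K idea in plain C++; the function and parameter names are illustrative and not part of MNN or CUTLASS.

```cpp
#include <algorithm>
#include <vector>

// Reference split-K GEMM for row-major A (M x K) and B (K x N):
// each slice reduces a contiguous chunk of K into its own partial C,
// and the partials are summed at the end, which is what a split-K
// kernel does across thread blocks using the acquired workspace.
static void splitKGemm(const float* A, const float* B, float* C,
                       int M, int N, int K, int kSlices) {
    std::vector<std::vector<float>> partial(kSlices, std::vector<float>(M * N, 0.0f));
    const int chunk = (K + kSlices - 1) / kSlices;
    for (int s = 0; s < kSlices; ++s) {
        const int k0 = s * chunk;
        const int k1 = std::min(K, k0 + chunk);
        for (int m = 0; m < M; ++m) {
            for (int n = 0; n < N; ++n) {
                float acc = 0.0f;
                for (int k = k0; k < k1; ++k) {
                    acc += A[m * K + k] * B[k * N + n];
                }
                partial[s][m * N + n] = acc;
            }
        }
    }
    for (int i = 0; i < M * N; ++i) {
        float sum = 0.0f;
        for (int s = 0; s < kSlices; ++s) sum += partial[s][i];
        C[i] = sum;
    }
}
```

Splitting K only pays off when K dwarfs the output size (here K >= 16384 with batch == 1), since the extra parallelism is bought with a final reduction over the slices.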
@ -695,7 +845,7 @@ ErrorCode MatMulExecution::onResize(const std::vector<Tensor *> &inputs, const s
mNeedBTempBuffer = (needBTranspose || !lAlignment) || mFp16Fp32MixInfer;
mNeedConvertMatAB = (mNeedATempBuffer || mNeedBTempBuffer);
//MNN_PRINT("trAtrB:%d-%d, tmpAB:%d-%d inps:%d, bwlh:%d-%d-%d-%d\n", mTransposeA, mTransposeB, mNeedATempBuffer, mNeedBTempBuffer, inputs.size(), mBatch, mGemmInfo.elh[0], mGemmInfo.elh[1], mGemmInfo.elh[2]);
// MNN_PRINT("trAtrB:%d-%d, tmpAB:%d-%d inps:%d, bwlh:%d-%d-%d-%d\n", mTransposeA, mTransposeB, mNeedATempBuffer, mNeedBTempBuffer, inputs.size(), mBatch, mGemmInfo.elh[0], mGemmInfo.elh[1], mGemmInfo.elh[2]);
auto pool = static_cast<CUDABackend*>(backend())->getBufferPool();
std::pair<void*, size_t> bufferAData, bufferBData;
@ -730,6 +880,7 @@ ErrorCode MatMulExecution::onResize(const std::vector<Tensor *> &inputs, const s
}
//printf("MatMulAB:%p-%p-%p-%p\n", A->host<void*>(), A->deviceId(), B->host<void*>(), B->deviceId());
mConvertGemmSplitK = ((mBatch == 1) && (mGemmInfo.elhPad[1] >= 16384));
// Set Cutlass Param Arguments
mResizeSetArgument = (mTempMatA != nullptr && mTempMatB != nullptr && C->deviceId() != 0);
if(mResizeSetArgument) {
@ -855,19 +1006,39 @@ ErrorCode MatMulExecution::onExecute(const std::vector<Tensor *> &inputs, const
} else {
if(hAlignment) {
if(mNeedConvertMatAB) {
cutlass::Status status = mGemmBatchedF16F32LnAlign8RCSm75();
cutlass_check(status);
if(mConvertGemmSplitK) {
cutlass::Status status = mGemmF16F32LnAlign8Sm75();
cutlass_check(status);
} else {
cutlass::Status status = mGemmBatchedF16F32LnAlign8RCSm75();
cutlass_check(status);
}
} else {
cutlass::Status status = mGemmBatchedF32F32LnAlign8RCSm75();
cutlass_check(status);
if(mConvertGemmSplitK) {
cutlass::Status status = mGemmF32F32LnAlign8Sm75();
cutlass_check(status);
} else {
cutlass::Status status = mGemmBatchedF32F32LnAlign8RCSm75();
cutlass_check(status);
}
}
} else {
if(mNeedConvertMatAB) {
cutlass::Status status = mGemmBatchedF16F32LnAlign1RCSm75();
cutlass_check(status);
if(mConvertGemmSplitK) {
cutlass::Status status = mGemmF16F32LnAlign1Sm75();
cutlass_check(status);
} else {
cutlass::Status status = mGemmBatchedF16F32LnAlign1RCSm75();
cutlass_check(status);
}
} else {
cutlass::Status status = mGemmBatchedF32F32LnAlign1RCSm75();
cutlass_check(status);
if(mConvertGemmSplitK) {
cutlass::Status status = mGemmF32F32LnAlign1Sm75();
cutlass_check(status);
} else {
cutlass::Status status = mGemmBatchedF32F32LnAlign1RCSm75();
cutlass_check(status);
}
}
}
}
@ -878,15 +1049,25 @@ ErrorCode MatMulExecution::onExecute(const std::vector<Tensor *> &inputs, const
cutlass_check(status);
} else {
if(hAlignment) {
cutlass::Status status = mGemmBatchedF16F16LnAlign8RCSm75();
cutlass_check(status);
if(mConvertGemmSplitK) {
cutlass::Status status = mGemmF16F16LnAlign8Sm75();
cutlass_check(status);
} else {
cutlass::Status status = mGemmBatchedF16F16LnAlign8RCSm75();
cutlass_check(status);
}
} else {
cutlass::Status status = mGemmBatchedF16F16LnAlign1RCSm75();
cutlass_check(status);
if(mConvertGemmSplitK) {
cutlass::Status status = mGemmF16F16LnAlign1Sm75();
cutlass_check(status);
} else {
cutlass::Status status = mGemmBatchedF16F16LnAlign1RCSm75();
cutlass_check(status);
}
}
}
}
// printf("normal:%d rrlayout:%d convertab:%d halign:%d\n", mFp16Fp32MixInfer, mUseRRLayout, mNeedConvertMatAB, hAlignment);
return NO_ERROR;
}
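The `onExecute` hunks above route each configuration to either a plain split-K functor (`mGemm*Sm75`) or its batched counterpart (`mGemmBatched*RCSm75`), keyed on `mConvertGemmSplitK`, the precision mode and whether the inner dimension satisfies 8-element alignment. A compressed, illustrative view of how the functor names in these branches are formed (plain C++, names hypothetical; the fp32-only branch lies outside the visible hunks and is omitted):

```cpp
#include <string>

// Condensed naming scheme behind the nested dispatch: input/output precision pair,
// alignment of the inner dimension, and split-K versus batched execution.
static std::string chooseGemmVariant(bool fp16Fp32Mix, bool convertAB,
                                     bool hAlign8, bool splitK) {
    std::string name = fp16Fp32Mix ? (convertAB ? "F16_F32" : "F32_F32") : "F16_F16";
    name += hAlign8 ? "_Align8" : "_Align1";
    name += splitK ? "_SplitK" : "_BatchedRC";
    return name;  // e.g. "F16_F32_Align1_SplitK" corresponds to mGemmF16F32LnAlign1Sm75
}
```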

View File

@ -12,6 +12,7 @@
#include "backend/cuda/core/CUDABackend.hpp"
#include "MNNCUDADefine.hpp"
#include "CutlassGemmBatchedParam.hpp"
#include "CutlassGemmParam.hpp"
#include "MNNCUDAFunction.cuh"
namespace MNN {
@ -34,12 +35,18 @@ private:
std::shared_ptr<Tensor> mBiasTensor;
GemmBatchedTensor_F16_F16_Linear_AlignCuda_Row_Column_Sm75 mGemmBatchedF16F16LnAlign1RCSm75;
GemmTensor_F16_F16_Linear_AlignCuda_Sm75 mGemmF16F16LnAlign1Sm75;
GemmBatchedTensor_F32_F32_Linear_AlignCuda_Row_Column_Sm75 mGemmBatchedF32F32LnAlign1RCSm75;
GemmTensor_F32_F32_Linear_AlignCuda_Sm75 mGemmF32F32LnAlign1Sm75;
GemmBatchedTensor_F16_F32_Linear_AlignCuda_Row_Column_Sm75 mGemmBatchedF16F32LnAlign1RCSm75;
GemmTensor_F16_F32_Linear_AlignCuda_Sm75 mGemmF16F32LnAlign1Sm75;
GemmBatchedTensor_F16_F16_Linear_AlignTensor_Row_Column_Sm75 mGemmBatchedF16F16LnAlign8RCSm75;
GemmTensor_F16_F16_Linear_AlignTensor_Sm75 mGemmF16F16LnAlign8Sm75;
GemmBatchedTensor_F32_F32_Linear_AlignTensor_Row_Column_Sm75 mGemmBatchedF32F32LnAlign8RCSm75;
GemmTensor_F32_F32_Linear_AlignTensor_Sm75 mGemmF32F32LnAlign8Sm75;
GemmBatchedTensor_F16_F32_Linear_AlignTensor_Row_Column_Sm75 mGemmBatchedF16F32LnAlign8RCSm75;
GemmTensor_F16_F32_Linear_AlignTensor_Sm75 mGemmF16F32LnAlign8Sm75;
GemmBatchedTensor_F16_F16_Linear_AlignTensor_Row_Row_Sm75 mGemmBatchedF16F16LnAlign8RRSm75;
GemmBatchedTensor_F32_F32_Linear_AlignTensor_Row_Row_Sm75 mGemmBatchedF32F32LnAlign8RRSm75;
@ -69,6 +76,7 @@ private:
bool mFp16Infer = false;
bool mFp32Infer = false;
bool mFp16Fp32MixInfer = false;
bool mConvertGemmSplitK = false;
};
} // namespace CUDA
} // namespace MNN

View File

@ -190,7 +190,7 @@ void RasterBlit(uint8_t* output, const uint8_t* input, const int32_t* size, cons
DivModFast sy(size[1]);
DivModFast sx(size[2]);
//printf("%d-%d-%d, %d-%d-%d,-%d-%d-%d\n", size[0], size[1], size[2], srcStride[0], srcStride[1], srcStride[2], dstStride[0], dstStride[1], dstStride[2]);
// MNN_PRINT("blit info size:%d-%d-%d, srcStride:%d-%d-%d, dstStride:%d-%d-%d\n", size[0], size[1], size[2], srcStride[0], srcStride[1], srcStride[2], dstStride[0], dstStride[1], dstStride[2]);
if(bytes == 4 && count > 16384 && size[2] % 2 == 0 && srcStride[2] == 1 && dstStride[2] == 1) {
//printf("%d-%d-%d, %d-%d-%d,-%d-%d-%d\n\n", size[0], size[1], size[2], srcStride[0], srcStride[1], srcStride[2], dstStride[0], dstStride[1], dstStride[2]);
count /= 2;

View File

@ -168,7 +168,18 @@ static bool _equalSizeStride(const Tensor::InsideDescribe::Region& slice0, const
return true;
}
static bool _directBlitC4(const Tensor::InsideDescribe::Region& slice0, const Tensor::InsideDescribe::Region& slice1) {
static bool _directBlitC4(const Tensor::InsideDescribe::Region& slice0, const Tensor::InsideDescribe::Region& slice1, Tensor* tensor) {
if(tensor->dimensions() < 2) {
return false;
}
if(slice0.src.stride[1] == tensor->width() && slice0.src.stride[0] == tensor->width() * tensor->height()) {
// area pack for fast blit only
return false;
}
if(slice1.src.stride[1] == tensor->width() && slice1.src.stride[0] == tensor->width() * tensor->height()) {
// area pack for fast blit only
return false;
}
if(slice0.size[1] % PACK_NUMBER != 0 || slice0.size[0] != 1) {
return false;
}
@ -242,7 +253,7 @@ ErrorCode RasterExecution::onResize(const std::vector<Tensor *> &____inputs, con
mFast = false;
break;
}
if(!_directBlitC4(slice0, slice)) {
if(!_directBlitC4(slice0, slice, output)) {
mFast = false;
break;
}

View File

@ -2,15 +2,86 @@
namespace MNN {
namespace CUDA {
template<typename T>
static void callSumFunc(const T* input, T* output, ReduceParam* param, CUDARuntime* runtime) {
int inside = param->inside;
int outside = param->outside;
int axis = param->axis;
int count = outside * inside;
if(axis % 256 == 0 || axis >= 768) {
int calc_multi_num = (axis + 255) / 256;
SUM_REDUCE_AXIS<<<count, 256>>>(input, output, outside, axis, inside, 256, calc_multi_num);
checkKernelErrors;
} else if(axis >= 32) {
int calc_multi_num = (axis + 63) / 64;
SUM_REDUCE_AXIS<<<count, 64>>>(input, output, outside, axis, inside, 64, calc_multi_num);
checkKernelErrors;
} else {
int block_num = runtime->blocks_num(count);
int threads_num = runtime->threads_num();
SUM_NAIVE<<<block_num, threads_num>>>(input, output, outside, axis, inside);
checkKernelErrors;
}
}
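`callSumFunc` picks the launch shape from the reduce length: one 256-thread block per output element when `axis` is a multiple of 256 or at least 768, a 64-thread block when `axis >= 32`, and the naive grid-stride `SUM_NAIVE` kernel otherwise. All variants share the same flattened `outside x axis x inside` layout and reduce the middle dimension. A minimal CPU reference of that layout, assuming the input is contiguous in this order (the function name is illustrative only):

```cpp
#include <vector>

// CPU reference for the outside x axis x inside reduction layout used by these kernels:
// output[o * inside + i] = sum over a of input[(o * axis + a) * inside + i].
static std::vector<float> referenceSum(const std::vector<float>& input,
                                       int outside, int axis, int inside) {
    std::vector<float> output(static_cast<size_t>(outside) * inside, 0.0f);
    for (int o = 0; o < outside; ++o) {
        for (int a = 0; a < axis; ++a) {
            for (int i = 0; i < inside; ++i) {
                output[o * inside + i] += input[(o * axis + a) * inside + i];
            }
        }
    }
    return output;
}
```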
template<typename T>
static void callMeanFunc(const T* input, T* output, ReduceParam* param, CUDARuntime* runtime) {
int inside = param->inside;
int outside = param->outside;
int axis = param->axis;
int count = outside * inside;
int block_num = runtime->blocks_num(count);
int threads_num = runtime->threads_num();
MEAN<<<block_num, threads_num>>>(input, output, outside, axis, inside);
checkKernelErrors;
}
template<typename T>
static void callMaxFunc(const T* input, T* output, ReduceParam* param, CUDARuntime* runtime) {
int inside = param->inside;
int outside = param->outside;
int axis = param->axis;
int count = outside * inside;
int block_num = runtime->blocks_num(count);
int threads_num = runtime->threads_num();
MAXIMUM<<<block_num, threads_num>>>(input, output, outside, axis, inside);
checkKernelErrors;
}
template<typename T>
static void callMinFunc(const T* input, T* output, ReduceParam* param, CUDARuntime* runtime) {
int inside = param->inside;
int outside = param->outside;
int axis = param->axis;
int count = outside * inside;
int block_num = runtime->blocks_num(count);
int threads_num = runtime->threads_num();
MINIMUM<<<block_num, threads_num>>>(input, output, outside, axis, inside);
checkKernelErrors;
}
template<typename T>
static void callProdFunc(const T* input, T* output, ReduceParam* param, CUDARuntime* runtime) {
int inside = param->inside;
int outside = param->outside;
int axis = param->axis;
int count = outside * inside;
int block_num = runtime->blocks_num(count);
int threads_num = runtime->threads_num();
PROD<<<block_num, threads_num>>>(input, output, outside, axis, inside);
checkKernelErrors;
}
ReductionExecution::ReductionExecution(ReductionType opType, int axis, Backend *backend) : Execution(backend) {
mType = opType;
mAxis = axis;
auto staticPool = static_cast<CUDABackend*>(backend)->getStaticBufferPool();
mParam = staticPool->alloc(sizeof(ReduceParam));
}
ReductionExecution::~ ReductionExecution() {
auto staticPool = static_cast<CUDABackend*>(backend())->getStaticBufferPool();
staticPool->free(mParam);
}
ErrorCode ReductionExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
@ -27,9 +98,7 @@ ErrorCode ReductionExecution::onResize(const std::vector<Tensor *> &inputs, cons
mCpuParam.inside = inside;
mCpuParam.outside = outside;
mCpuParam.axis = axis;
cuda_check(cudaMemcpy((uint8_t*)mParam.first + mParam.second, &mCpuParam, sizeof(ReduceParam), cudaMemcpyHostToDevice));
//MNN_PRINT("Reduction axis_idx:%d, outside:%d, axis:%d, inside:%d\n", mAxis, outside, axis, inside);
// MNN_PRINT("Reduction axis_idx:%d, outside:%d, axis:%d, inside:%d\n", mAxis, outside, axis, inside);
return NO_ERROR;
}
@ -37,47 +106,46 @@ ErrorCode ReductionExecution::onExecute(const std::vector<Tensor *> &inputs, con
auto input = (void*)inputs[0]->deviceId();
auto output = (void*)outputs[0]->deviceId();
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
int inside = mCpuParam.inside;;
int inside = mCpuParam.inside;
int outside = mCpuParam.outside;
int count = inside * outside;
int block_num = runtime->blocks_num(count);
int threads_num = runtime->threads_num();
auto param = (ReduceParam*)((uint8_t*)mParam.first + mParam.second);
if (inputs[0]->getType() == halide_type_of<float>()) {
if (static_cast<CUDABackend*>(backend())->useFp16()) {
switch (mType) {
case ReductionType_MEAN:
MEAN<<<block_num, threads_num>>>((const half*)input, (half*)output, param);
callMeanFunc((const half*)input, (half*)output, &mCpuParam, runtime);
return NO_ERROR;
case ReductionType_SUM:
SUM<<<block_num, threads_num>>>((const half*)input, (half*)output, param);
callSumFunc((const half*)input, (half*)output, &mCpuParam, runtime);
return NO_ERROR;
case ReductionType_MINIMUM:
MINIMUM<<<block_num, threads_num>>>((const half*)input, (half*)output, param);
callMinFunc((const half*)input, (half*)output, &mCpuParam, runtime);
return NO_ERROR;
case ReductionType_MAXIMUM:
MAXIMUM<<<block_num, threads_num>>>((const half*)input, (half*)output, param);
callMaxFunc((const half*)input, (half*)output, &mCpuParam, runtime);
return NO_ERROR;
case ReductionType_PROD:
PROD<<<block_num, threads_num>>>((const half*)input, (half*)output, param);
callProdFunc((const half*)input, (half*)output, &mCpuParam, runtime);
return NO_ERROR;
}
} else {
switch (mType) {
case ReductionType_MEAN:
MEAN<<<block_num, threads_num>>>((const float*)input, (float*)output, param);
callMeanFunc((const float*)input, (float*)output, &mCpuParam, runtime);
return NO_ERROR;
case ReductionType_SUM:
SUM<<<block_num, threads_num>>>((const float*)input, (float*)output, param);
callSumFunc((const float*)input, (float*)output, &mCpuParam, runtime);
return NO_ERROR;
case ReductionType_MINIMUM:
MINIMUM<<<block_num, threads_num>>>((const float*)input, (float*)output, param);
callMinFunc((const float*)input, (float*)output, &mCpuParam, runtime);
return NO_ERROR;
case ReductionType_MAXIMUM:
MAXIMUM<<<block_num, threads_num>>>((const float*)input, (float*)output, param);
callMaxFunc((const float*)input, (float*)output, &mCpuParam, runtime);
return NO_ERROR;
case ReductionType_PROD:
PROD<<<block_num, threads_num>>>((const float*)input, (float*)output, param);
callProdFunc((const float*)input, (float*)output, &mCpuParam, runtime);
return NO_ERROR;
}
}
@ -88,25 +156,26 @@ ErrorCode ReductionExecution::onExecute(const std::vector<Tensor *> &inputs, con
MNN_ASSERT(inputs[0]->getType() == halide_type_of<int32_t>());
switch (mType) {
case ReductionType_MEAN:
MEAN<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
callMeanFunc((const int32_t*)input, (int32_t*)output, &mCpuParam, runtime);
return NO_ERROR;
case ReductionType_SUM:
SUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
callSumFunc((const int32_t*)input, (int32_t*)output, &mCpuParam, runtime);
// SUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
return NO_ERROR;
case ReductionType_MINIMUM:
MINIMUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
callMinFunc((const int32_t*)input, (int32_t*)output, &mCpuParam, runtime);
return NO_ERROR;
case ReductionType_MAXIMUM:
MAXIMUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
callMaxFunc((const int32_t*)input, (int32_t*)output, &mCpuParam, runtime);
return NO_ERROR;
case ReductionType_PROD:
PROD<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
callProdFunc((const int32_t*)input, (int32_t*)output, &mCpuParam, runtime);
return NO_ERROR;
case ReductionType_ANY:
MAXIMUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
callMaxFunc((const int32_t*)input, (int32_t*)output, &mCpuParam, runtime);
return NO_ERROR;
case ReductionType_ALL:
MINIMUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
callMinFunc((const int32_t*)input, (int32_t*)output, &mCpuParam, runtime);
return NO_ERROR;
}
MNN_ASSERT(false);

View File

@ -25,7 +25,6 @@ private:
ReductionType mType;
int mAxis;
ReduceParam mCpuParam;
std::pair<void*, int> mParam;
};
} // namespace CUDA
} // namespace MNN

View File

@ -1,91 +1,143 @@
#ifndef ReductionTemplate_cuh
#define ReductionTemplate_cuh
#include "MNNCUDAFunction.cuh"
struct ReduceParam {
int inside;
int axis;
int outside;
};
template <typename T>
__global__ void SUM(const T *input, T *output, const ReduceParam* param) {
int count = param->inside * param->outside;
__global__ void SUM_NAIVE(const T *input, T *output,
const int outside,
const int axis,
const int inside
) {
int count = inside * outside;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
int y = i / param->inside;
int x = i % param->inside;
int y = i / inside;
int x = i % inside;
float sumValue = 0.0;
int axis = param->axis;
const T* basicInput = input + y * param->axis * param->inside + x;
const T* basicInput = input + y * axis * inside + x;
for (int v=0; v<axis; ++v) {
sumValue += (float)basicInput[v * param->inside];
sumValue += (float)basicInput[v * inside];
}
output[y * param->inside + x] = (T)sumValue;
output[y * inside + x] = (T)sumValue;
}
return;
}
template <typename T>
__global__ void MEAN(const T *input, T *output, const ReduceParam* param) {
int count = param->inside * param->outside;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
int y = i / param->inside;
int x = i % param->inside;
float sumValue = 0.0;
int axis = param->axis;
const T* basicInput = input + y * param->axis * param->inside + x;
for (int v=0; v<axis; ++v) {
sumValue += (float)basicInput[v * param->inside];
__global__ void SUM_REDUCE_AXIS(const T *input, T *output,
const int outside,
const int axis,
const int inside,
const int per_block_size,
const int calc_multi_num
) {
int idx_outside = blockIdx.x / inside;
int idx_inside = blockIdx.x - idx_outside * inside;
const T* src = input + idx_outside * axis * inside + idx_inside;
int tid = threadIdx.x;
float local_src = 0.0;
__shared__ float sumValue;
for(int i=0; i<calc_multi_num; i++) {
if(tid + i * per_block_size < axis) {
local_src += (float)(src[(tid + i * per_block_size) * inside]);
}
output[y * param->inside + x] = (T)(sumValue / (float)param->axis);
}
float maxRes = blockReduceSum<float>(local_src);
if(tid == 0)
sumValue = maxRes;
__syncthreads();
output[idx_outside * inside + idx_inside] = (T)sumValue;
return;
}
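`SUM_REDUCE_AXIS` assigns one thread block per output element: each of the `per_block_size` threads accumulates every `per_block_size`-th element along the axis (at most `calc_multi_num` of them), and `blockReduceSum` then combines the per-thread partials. A plain C++ sketch of that two-level pattern, with the warp-shuffle reduction replaced by an ordinary loop over the partials (names illustrative):

```cpp
#include <vector>

// Two-level reduction over one axis slice: stage 1, each "thread" t accumulates
// elements t, t + blockSize, t + 2 * blockSize, ...; stage 2, the per-thread
// partials are summed, which is blockReduceSum's role on the GPU.
static float blockStyleSum(const float* src, int axis, int inside, int blockSize) {
    std::vector<float> partial(blockSize, 0.0f);
    for (int t = 0; t < blockSize; ++t) {
        for (int idx = t; idx < axis; idx += blockSize) {
            partial[t] += src[idx * inside];  // same stride-`inside` access as the kernel
        }
    }
    float sum = 0.0f;
    for (float p : partial) sum += p;
    return sum;
}
```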
template <typename T>
__global__ void MEAN(const T *input, T *output,
const int outside,
const int axis,
const int inside
) {
int count = inside * outside;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
int y = i / inside;
int x = i % inside;
float sumValue = 0.0;
const T* basicInput = input + y * axis * inside + x;
for (int v=0; v<axis; ++v) {
sumValue += (float)basicInput[v * inside];
}
output[y * inside + x] = (T)(sumValue / (float)axis);
}
return;
}
template <typename T>
__global__ void MINIMUM(const T *input, T *output, const ReduceParam* param) {
int count = param->inside * param->outside;
__global__ void MINIMUM(const T *input, T *output,
const int outside,
const int axis,
const int inside
) {
int count = inside * outside;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
int y = i / param->inside;
int x = i % param->inside;
int axis = param->axis;
const T* basicInput = input + y * param->axis * param->inside + x;
int y = i / inside;
int x = i % inside;
const T* basicInput = input + y * axis * inside + x;
float res = (float)basicInput[0];
for (int v=1; v<axis; ++v) {
res = min((float)basicInput[v * param->inside], res);
res = min((float)basicInput[v * inside], res);
}
output[y * param->inside + x] = (T)res;
output[y * inside + x] = (T)res;
}
return;
}
template <typename T>
__global__ void MAXIMUM(const T *input, T *output, const ReduceParam* param) {
int count = param->inside * param->outside;
__global__ void MAXIMUM(const T *input, T *output,
const int outside,
const int axis,
const int inside
) {
int count = inside * outside;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
int y = i / param->inside;
int x = i % param->inside;
const T* basicInput = input + y * param->axis * param->inside + x;
int axis = param->axis;
int y = i / inside;
int x = i % inside;
const T* basicInput = input + y * axis * inside + x;
float res = (float)basicInput[0];
for (int v=1; v<axis; ++v) {
res = max((float)basicInput[v * param->inside], res);
res = max((float)basicInput[v * inside], res);
}
output[y * param->inside + x] = (T)res;
output[y * inside + x] = (T)res;
}
return;
}
template <typename T>
__global__ void PROD(const T *input, T *output, const ReduceParam* param) {
int count = param->inside * param->outside;
__global__ void PROD(const T *input, T *output,
const int outside,
const int axis,
const int inside
) {
int count = inside * outside;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
int y = i / param->inside;
int x = i % param->inside;
int axis = param->axis;
int y = i / inside;
int x = i % inside;
float sumValue = 1.0;
const T* basicInput = input + y * param->axis * param->inside + x;
const T* basicInput = input + y * axis * inside + x;
for (int v=0; v<axis; ++v) {
sumValue *= (float)basicInput[v * param->inside];
sumValue *= (float)basicInput[v * inside];
}
output[y * param->inside + x] = (T)sumValue;
output[y * inside + x] = (T)sumValue;
}
return;
}

View File

@ -30,62 +30,6 @@ __global__ void SOFTMAX(const T *input, T *output,
}
}
template <typename T>
__inline__ __device__
T warpReduceSum(T val)
{
for(int mask = 16; mask > 0; mask >>= 1)
val += __shfl_xor_sync(0xffffffff, val, mask, 32);
return val;
}
template <typename T>
__inline__ __device__
T warpReduceMax(T val)
{
for(int mask = 16; mask > 0; mask >>= 1)
val = max(val, __shfl_xor_sync(0xffffffff, val, mask, 32));
return val;
}
template <typename T>
__inline__ __device__
T blockReduceSum(T val)
{
static __shared__ T shared[32];
int lane = threadIdx.x & 0x1f;
int wid = threadIdx.x >> 5;
val = warpReduceSum<T>(val);
if(lane == 0)
shared[wid] = val;
__syncthreads();
val = (threadIdx.x < (blockDim.x >> 5 )) ? shared[lane] : (T)0.0f;
val = warpReduceSum(val);
return val;
}
template <typename T>
__inline__ __device__
T blockReduceMax(T val)
{
static __shared__ T shared[32];
int lane = threadIdx.x & 0x1f;
int wid = threadIdx.x >> 5;
val = warpReduceMax<T>(val);
if(lane == 0)
shared[wid] = val;
__syncthreads();
val = (threadIdx.x < (blockDim.x >> 5 )) ? shared[lane] : (T)0.0f;
val = warpReduceMax(val);
return val;
}
template <typename T>
__global__ void SOFTMAX_WARP_32(const T *input, T *output,
const int inside,

View File

@ -11,6 +11,7 @@
#include <vector>
#include "ReductionTemplate.cuh"
#include "MNNCUDAFunction.cuh"
#include "backend/cuda/core/CUDABackend.hpp"
#include <float.h>

View File

@ -143,7 +143,11 @@ def generateGemmFile(headfile):
hpp += out_align + out_precision_name + epilogue_name + ",\n "
hpp += "SwizzleThreadBlock,\n "
hpp += "NumStages>;\n\n"
hpp += "NumStages"
if sm_name == "_Sm75":
hpp += ",\n 128 / cutlass::sizeof_bits<" + element_input_precision + ">::value, 128 / cutlass::sizeof_bits<" + element_input_precision + ">::value, true>;\n\n"
else :
hpp += ">;\n\n"
hpp += "}\n}\n#endif"
with open(headfile, "w") as f:

View File

@ -428,20 +428,6 @@ Execution* OpenCLBackend::onCreate(const std::vector<Tensor*>& inputs, const std
valid = false;
break;
}
//input in raster not used, origin instead
auto des = TensorUtils::getDescribe(t)->regions;
for(auto region : des)
{
auto tensor = region.origin;
auto tensorShape = OpenCL::tensorShapeFormat(tensor);
int originHeight = tensorShape[0] * tensorShape[1];
int originWidth = tensorShape[2] * UP_DIV(tensorShape[3], 4);
if (originHeight > maxImageSize.at(0) || originWidth > maxImageSize.at(1)) {
valid = false;
break;
}
}
}
for (auto t : outputs) {
auto tensorShape = OpenCL::tensorShapeFormat(t);

View File

@ -123,15 +123,8 @@ OpenCLRuntime::OpenCLRuntime(const BackendConfig::PrecisionMode precision, const
isSetWorkGroupAttribute = true;
} else if (deviceVendor.find("Intel") != std::string::npos) {
mGpuType = INTEL;
std::string opencl_c_version = mFirstGPUDevicePtr->getInfo<CL_DEVICE_OPENCL_C_VERSION>();
int version = 0;
for (auto s : opencl_c_version) {
if (s >= '0' && s <= '9') {
version += (s - '0');
version *= 10;
}
}
if (version >= 120) {
const std::string extensions = mFirstGPUDevicePtr->getInfo<CL_DEVICE_EXTENSIONS>();
if (extensions.find("cl_intel_subgroups") != std::string::npos) {
mSupportedIntelSubgroup = true;
uint32_t execution_units_count = mFirstGPUDevicePtr->getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
uint32_t num_threads_per_eu = mFirstGPUDevicePtr->getInfo<CL_DEVICE_NUM_THREADS_PER_EU_INTEL>();

View File

@ -16,10 +16,8 @@ namespace MNN {
namespace OpenCL {
BinaryBufExecution::BinaryBufExecution(const std::vector<Tensor *> &inputs, const std::string &compute, const MNN::Op *op, Backend *backend)
: CommonExecution(backend), mCompute(compute) {
: CommonExecution(backend, op), mCompute(compute) {
mBuildOptions.emplace("-DOPERATOR=" + compute);
mOp = op;
mOpType = op->type();
}
uint32_t BinaryBufExecution::realSize(const Tensor* tensor) {

View File

@ -0,0 +1,351 @@
//
// LoopBufExecution.cpp
// MNN
//
// Created by MNN on 2019/02/28.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef MNN_OPENCL_BUFFER_CLOSED
#include "backend/opencl/execution/buffer/LoopBufExecution.hpp"
#include "core/Macro.h"
#include "core/TensorUtils.hpp"
namespace MNN {
namespace OpenCL {
static void _TileOrPackTensor(Tensor *input, Tensor *output, cl::Kernel& kernel, cl::NDRange &globalWorkSize,
cl::NDRange &localWorkSize, const int Width, const int Height, const int Channel,
const int Batch, OpenCLRuntime *runTime, const std::string &KernelName, const std::set<std::string> &buildOptions) {
kernel = runTime->buildKernel("loop_buf", KernelName, buildOptions);
uint32_t mMaxWorkGroupSize = static_cast<uint32_t>(runTime->getMaxWorkGroupSize(kernel));
std::vector<uint32_t> mGlobalWorkSize = {(uint32_t)(Width * Height), (uint32_t)(UP_DIV(Channel, 4)), (uint32_t)(Batch)};
uint32_t index = 0;
kernel.setArg(index++, mGlobalWorkSize[0]);
kernel.setArg(index++, mGlobalWorkSize[1]);
kernel.setArg(index++, mGlobalWorkSize[2]);
kernel.setArg(index++, openCLBuffer(input));
kernel.setArg(index++, openCLBuffer(output));
kernel.setArg(index++, Width);
kernel.setArg(index++, Height);
kernel.setArg(index++, Channel);
std::vector<uint32_t> mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, runTime, KernelName, kernel).first;
globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]};
localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]};
}
static void _setTensorStack(std::vector<Tensor *> &result, const std::vector<Tensor *> &inputs,
const std::vector<Tensor *> &outputs, const LoopParam *loop) {
if (loop->inputIndexes() != nullptr) {
for (int i = 0; i < loop->inputIndexes()->size(); ++i) {
result[loop->inputIndexes()->data()[i]] = inputs[i];
}
}
for (int i = 0; i < loop->outputIndexes()->size(); ++i) {
result[loop->outputIndexes()->data()[i]] = outputs[i];
}
}
LoopGatherBufExecution::LoopGatherBufExecution(const LoopParam *loop, const MNN::Op *op, Backend *bn)
: CommonExecution(bn, op) {
mLoop = loop;
mTensors.resize(mLoop->tensorNumber());
auto cmd = loop->commands()->GetAs<RegionCommand>(0);
}
ErrorCode LoopGatherBufExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
OpenCLBackend *mOpenCLBackend = (OpenCLBackend *)backend();
auto runTime = mOpenCLBackend->getOpenCLRuntime();
_setTensorStack(mTensors, inputs, outputs, mLoop);
mUnits.clear();
mOffsetTensors.clear();
mTmpTensors.resize(2);
int x = cmd->size()->data()[0];
int y = cmd->size()->data()[1];
int z = cmd->size()->data()[2];
int n = mLoop->loopNumber();
auto srcStride = cmd->view()->GetAs<View>(1)->stride()->data();
auto dstStride = cmd->view()->GetAs<View>(0)->stride()->data();
for (int i = 0; i < 3; ++i) {
mStride_src[i] = srcStride[i];
mStride_dst[i] = dstStride[i];
}
mStride_src[3] = cmd->view()->GetAs<View>(1)->offset();
mStride_dst[3] = cmd->view()->GetAs<View>(0)->offset();
::memcpy(mStep, cmd->steps()->data(), cmd->steps()->size() * sizeof(int));
::memcpy(mIter, cmd->iterIndexes()->data(), cmd->iterIndexes()->size() * sizeof(int));
// tile input
{
auto input = mTensors[cmd->indexes()->data()[1]];
std::vector<int> Shape = tensorShapeFormat(input);
const int Channel = Shape.at(3);
const int Width = Shape.at(2);
const int Height = Shape.at(1);
const int Batch = Shape.at(0);
mTmpTensors[1] = std::make_shared<Tensor>(Tensor::createDevice<float>(std::vector<int>{Batch, Channel, Height, Width}));
mOpenCLBackend->onAcquireBuffer(mTmpTensors[1].get(), Backend::DYNAMIC);
Unit unit;
_TileOrPackTensor(mTensors[cmd->indexes()->data()[1]], mTmpTensors[1].get(), unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height,Channel, Batch, runTime, "tile_buf", mBuildOptions);
mUnits.emplace_back(unit);
}
for(int i = 0; i < cmd->iterIndexes()->size(); ++i){
if (mIter[i] >= 0) {
auto input = mTensors[cmd->iterIndexes()->data()[i]];
std::vector<int> Shape = tensorShapeFormat(input);
const int Channel = Shape.at(3);
const int Width = Shape.at(2);
const int Height = Shape.at(1);
const int Batch = Shape.at(0);
mOffsetTensors.emplace_back(std::make_shared<Tensor>(Tensor::createDevice<float>(std::vector<int>{Batch, Channel, Height, Width})));
mOpenCLBackend->onAcquireBuffer(mOffsetTensors.back().get(), Backend::DYNAMIC);
Unit unit;
_TileOrPackTensor(input, mOffsetTensors.back().get(), unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height, Channel, Batch, runTime, "tile_buf", mBuildOptions);
mUnits.emplace_back(unit);
}
}
// gather
{
mTmpTensors[0] = std::make_shared<Tensor>(Tensor::createDevice<float>(std::vector<int>{n, z, y, x}));
mOpenCLBackend->onAcquireBuffer(mTmpTensors[0].get(), Backend::DYNAMIC);
int offset_index = 0;
Unit unit;
std::string KernelName = "batch_gather_buf";
unit.kernel = runTime->buildKernel("loop_buf", KernelName, mBuildOptions);
uint32_t mMaxWorkGroupSize = static_cast<uint32_t>(runTime->getMaxWorkGroupSize(unit.kernel));
std::vector<uint32_t> mGlobalWorkSize = {(uint32_t)(x * y), (uint32_t)(z), (uint32_t)(n)};
uint32_t index = 0;
unit.kernel.setArg(index++, mGlobalWorkSize[0]);
unit.kernel.setArg(index++, mGlobalWorkSize[1]);
unit.kernel.setArg(index++, mGlobalWorkSize[2]);
unit.kernel.setArg(index++, openCLBuffer(mTmpTensors[0].get()));
unit.kernel.setArg(index++, openCLBuffer(mTmpTensors[1].get()));
for (int i = 0; i < cmd->iterIndexes()->size(); ++i) {
if (mIter[i] >= 0) {
unit.kernel.setArg(index++, openCLBuffer(mOffsetTensors[offset_index++].get()));
} else {
unit.kernel.setArg(index++, openCLBuffer(mTensors[cmd->indexes()->data()[1]]));
}
}
unit.kernel.setArg(index++, x);
unit.kernel.setArg(index++, sizeof(mStride_src), mStride_src);
unit.kernel.setArg(index++, sizeof(mStride_dst), mStride_dst);
unit.kernel.setArg(index++, sizeof(mStep), mStep);
unit.kernel.setArg(index++, sizeof(mIter), mIter);
std::vector<uint32_t> mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, runTime, KernelName, unit.kernel).first;
unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]};
unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]};
mUnits.emplace_back(unit);
}
//pack output
{
auto output = mTensors[cmd->indexes()->data()[0]];
std::vector<int> Shape = tensorShapeFormat(output);
const int Channel = Shape.at(3);
const int Width = Shape.at(2);
const int Height = Shape.at(1);
const int Batch = Shape.at(0);
Unit unit;
_TileOrPackTensor(mTmpTensors[0].get(), mTensors[cmd->indexes()->data()[0]], unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height, Channel, Batch, runTime, "pack_buf", mBuildOptions);
mUnits.emplace_back(unit);
}
for (int i = 0; i < mTmpTensors.size(); ++i) {
mOpenCLBackend->onReleaseBuffer(mTmpTensors[i].get(), Backend::DYNAMIC);
}
for (int i = 0; i < mOffsetTensors.size(); ++i) {
mOpenCLBackend->onReleaseBuffer(mOffsetTensors[i].get(), Backend::DYNAMIC);
}
return NO_ERROR;
}
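`LoopGatherBufExecution::onResize` builds a three-stage pipeline: `tile_buf` unpacks each input from the channel-packed buffer layout into a plain linear buffer, `batch_gather_buf` copies an x*y*z block for each of the n loop iterations using the view strides, per-loop `steps` and optional iteration-offset tensors, and `pack_buf` repacks the result. The copy itself is strided addressing; a hedged CPU sketch of one such gather, with the offset-tensor indirection omitted for brevity (all names here are illustrative, not the OpenCL kernel's):

```cpp
#include <vector>

// CPU model of batch_gather_buf: copy a (sizeZ x sizeY x sizeX) block per loop iteration.
// strideSrc / strideDst hold {x, y, z, viewOffset}; stepSrc / stepDst scale the iteration
// index (or an index read from an offset tensor when the corresponding iter >= 0).
static void gatherLoop(std::vector<float>& dst, const std::vector<float>& src,
                       int loopNumber, int sizeX, int sizeY, int sizeZ,
                       const int strideSrc[4], const int strideDst[4],
                       int stepSrc, int stepDst) {
    for (int i = 0; i < loopNumber; ++i) {
        const int dstBase = i * stepDst + strideDst[3];
        const int srcBase = i * stepSrc + strideSrc[3];
        for (int z = 0; z < sizeZ; ++z) {
            for (int y = 0; y < sizeY; ++y) {
                for (int x = 0; x < sizeX; ++x) {
                    dst[dstBase + x * strideDst[0] + y * strideDst[1] + z * strideDst[2]] =
                        src[srcBase + x * strideSrc[0] + y * strideSrc[1] + z * strideSrc[2]];
                }
            }
        }
    }
}
```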
LoopBatchMatMulBufExecution::LoopBatchMatMulBufExecution(const LoopParam *loop, const MNN::Op *op, Backend *bn)
: CommonExecution(bn, op) {
mLoop = loop;
mTensors.resize(mLoop->tensorNumber());
auto cmd = loop->commands()->GetAs<RegionCommand>(0);
mHasBias = cmd->indexes()->size() > 3;
mTransposeA = cmd->op()->main_as_MatMul()->transposeA();
mTransposeB = cmd->op()->main_as_MatMul()->transposeB();
}
ErrorCode LoopBatchMatMulBufExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
OpenCLBackend *mOpenCLBackend = (OpenCLBackend *)backend();
auto runTime = mOpenCLBackend->getOpenCLRuntime();
_setTensorStack(mTensors, inputs, outputs, mLoop);
mOffset[0] = cmd->view()->GetAs<View>(0)->offset();
mOffset[1] = cmd->view()->GetAs<View>(1)->offset();
mOffset[2] = cmd->view()->GetAs<View>(2)->offset();
mUnits.clear();
mOffsetTensors.clear();
mTmpTensors.resize(3);
if (mHasBias) {
mTmpTensors.resize(4);
mOffset[3] = cmd->view()->GetAs<View>(3)->offset();
}
::memcpy(mStep, cmd->steps()->data(), cmd->steps()->size() * sizeof(int));
::memcpy(mIter, cmd->iterIndexes()->data(), cmd->iterIndexes()->size() * sizeof(int));
int e = cmd->size()->data()[0];
int l = cmd->size()->data()[1];
int h = cmd->size()->data()[2];
int n = mLoop->loopNumber();
// tile input
for (int i = 1; i < cmd->indexes()->size(); ++i) {
auto input = mTensors[cmd->indexes()->data()[i]];
std::vector<int> Shape = tensorShapeFormat(input);
const int Channel = Shape.at(3);
const int Width = Shape.at(2);
const int Height = Shape.at(1);
const int Batch = Shape.at(0);
mTmpTensors[i] = std::make_shared<Tensor>(Tensor::createDevice<float>(std::vector<int>{Batch, Channel, Height, Width}));
mOpenCLBackend->onAcquireBuffer(mTmpTensors[i].get(), Backend::DYNAMIC);
Unit unit;
_TileOrPackTensor(input, mTmpTensors[i].get(), unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height, Channel, Batch, runTime, "tile_buf", mBuildOptions);
mUnits.emplace_back(unit);
}
for(int i = 0; i < cmd->iterIndexes()->size(); ++i){
if (mIter[i] >= 0) {
auto input = mTensors[cmd->iterIndexes()->data()[i]];
std::vector<int> Shape = tensorShapeFormat(input);
const int Channel = Shape.at(3);
const int Width = Shape.at(2);
const int Height = Shape.at(1);
const int Batch = Shape.at(0);
mOffsetTensors.emplace_back(std::make_shared<Tensor>(Tensor::createDevice<float>(std::vector<int>{Batch, Channel, Height, Width})));
mOpenCLBackend->onAcquireBuffer(mOffsetTensors.back().get(), Backend::DYNAMIC);
Unit unit;
_TileOrPackTensor(input, mOffsetTensors.back().get(), unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height, Channel, Batch, runTime, "tile_buf", mBuildOptions);
mUnits.emplace_back(unit);
}
}
// matmul
{
mTmpTensors[0] = std::make_shared<Tensor>(Tensor::createDevice<float>(std::vector<int>{1, n, e, h}));
mOpenCLBackend->onAcquireBuffer(mTmpTensors[0].get(), Backend::DYNAMIC);
int offset_index = 0;
Unit unit;
std::string KernelName = "batch_matmul_buf";
if (mHasBias) {
mBuildOptions.emplace("-DBIAS");
}
if (mTransposeA) {
mBuildOptions.emplace("-DTRANSPOSE_A");
}
if (mTransposeB) {
mBuildOptions.emplace("-DTRANSPOSE_B");
}
unit.kernel = runTime->buildKernel("loop_buf", KernelName, mBuildOptions);
uint32_t mMaxWorkGroupSize = static_cast<uint32_t>(runTime->getMaxWorkGroupSize(unit.kernel));
std::vector<uint32_t> mGlobalWorkSize = {(uint32_t)(h), (uint32_t)(e),(uint32_t)(n)};
uint32_t index = 0;
unit.kernel.setArg(index++, mGlobalWorkSize[0]);
unit.kernel.setArg(index++, mGlobalWorkSize[1]);
unit.kernel.setArg(index++, mGlobalWorkSize[2]);
unit.kernel.setArg(index++, openCLBuffer(mTmpTensors[0].get()));
unit.kernel.setArg(index++, openCLBuffer(mTmpTensors[1].get()));
unit.kernel.setArg(index++, openCLBuffer(mTmpTensors[2].get()));
if (mHasBias) {
unit.kernel.setArg(index++, openCLBuffer(mTmpTensors[3].get()));
}
for (int i = 0; i < cmd->iterIndexes()->size(); ++i) {
if (mIter[i] >= 0) {
unit.kernel.setArg(index++, openCLBuffer(mOffsetTensors[offset_index++].get()));
} else {
unit.kernel.setArg(index++, openCLBuffer(mTensors[cmd->indexes()->data()[1]]));
}
}
unit.kernel.setArg(index++, e);
unit.kernel.setArg(index++, l);
unit.kernel.setArg(index++, h);
unit.kernel.setArg(index++, sizeof(mOffset), mOffset);
unit.kernel.setArg(index++, sizeof(mIter), mIter);
unit.kernel.setArg(index++, sizeof(mStep), mStep);
std::vector<uint32_t> mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, runTime, KernelName, unit.kernel).first;
unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]};
unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]};
mUnits.emplace_back(unit);
}
//pack output
{
auto output = mTensors[cmd->indexes()->data()[0]];
std::vector<int> Shape = tensorShapeFormat(output);
const int Channel = Shape.at(3);
const int Width = Shape.at(2);
const int Height = Shape.at(1);
const int Batch = Shape.at(0);
Unit unit;
_TileOrPackTensor(mTmpTensors[0].get(), output, unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height, Channel, Batch, runTime, "pack_buf", mBuildOptions);
mUnits.emplace_back(unit);
}
for (int i = 0; i < cmd->indexes()->size(); ++i) {
mOpenCLBackend->onReleaseBuffer(mTmpTensors[i].get(), Backend::DYNAMIC);
}
for (int i = 0; i < mOffsetTensors.size(); ++i) {
mOpenCLBackend->onReleaseBuffer(mOffsetTensors[i].get(), Backend::DYNAMIC);
}
return NO_ERROR;
}
class LoopBufCreator : public OpenCLBackend::Creator {
public:
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
const MNN::Op *op, Backend *backend) const override {
auto loop = op->main_as_LoopParam();
if (nullptr == loop || loop->commands() == nullptr) {
return nullptr;
}
if (nullptr != loop->initCommand()) {
return nullptr;
}
// Make Tensor Stack
if (1 == loop->commands()->size()) {
auto cmd = loop->commands()->GetAs<RegionCommand>(0);
auto subop = cmd->op();
if (OpType_UnaryOp == subop->type() && nullptr == subop->main() && cmd->fuse() < 0) {
return new LoopGatherBufExecution(loop, op, backend);
}
if (OpType_MatMul == subop->type() && loop->parallel()) {
return new LoopBatchMatMulBufExecution(loop, op, backend);
}
}
return nullptr;
}
};
OpenCLCreatorRegister<LoopBufCreator> __LoopBuf_op(OpType_While, BUFFER);
} // namespace OpenCL
} // namespace MNN
#endif /* MNN_OPENCL_BUFFER_CLOSED */

View File

@ -0,0 +1,60 @@
//
// LoopBufExecution.hpp
// MNN
//
// Created by MNN on 2023/04/23.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef MNN_OPENCL_BUFFER_CLOSED
#ifndef LoopBufExecution_hpp
#define LoopBufExecution_hpp
#include "backend/opencl/execution/image/CommonExecution.hpp"
namespace MNN {
namespace OpenCL {
class LoopGatherBufExecution : public CommonExecution {
public:
LoopGatherBufExecution(const LoopParam *loop, const MNN::Op *op, Backend *bn);
virtual ~LoopGatherBufExecution() = default;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
const LoopParam *mLoop;
std::vector<Tensor *> mTensors;
std::vector<std::shared_ptr<Tensor>> mTmpTensors;
std::vector<std::shared_ptr<Tensor>> mOffsetTensors;
int mStride_src[4];
int mStride_dst[4];
int mStep[2];
int mIter[2];
std::set<std::string> mBuildOptions;
};
class LoopBatchMatMulBufExecution : public CommonExecution {
public:
LoopBatchMatMulBufExecution(const LoopParam *loop, const MNN::Op *op, Backend *bn);
virtual ~LoopBatchMatMulBufExecution() = default;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
const LoopParam *mLoop;
std::vector<Tensor *> mTensors;
std::vector<std::shared_ptr<Tensor>> mTmpTensors;
std::vector<std::shared_ptr<Tensor>> mOffsetTensors;
int mOffset[4];
int mStep[4];
int mIter[4];
bool mHasBias = false;
bool mTransposeA = false;
bool mTransposeB = false;
std::set<std::string> mBuildOptions;
};
} // namespace OpenCL
} // namespace MNN
#endif /* LoopBufExecution_hpp */
#endif /* MNN_OPENCL_BUFFER_CLOSED */

View File

@ -18,10 +18,8 @@ namespace MNN {
namespace OpenCL {
RasterBufExecution::RasterBufExecution(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend)
: CommonExecution(backend) {
: CommonExecution(backend, op) {
mOpenCLBackend = (OpenCLBackend *)backend;
mOp = op;
mOpType = op->type();
//nothing to do
}

View File

@ -15,7 +15,7 @@
namespace MNN {
namespace OpenCL {
ReductionBufExecution::ReductionBufExecution(const MNN::Op* op, Backend* backend) : CommonExecution(backend) {
ReductionBufExecution::ReductionBufExecution(const MNN::Op* op, Backend* backend) : CommonExecution(backend, op) {
#ifdef LOG_VERBOSE
MNN_PRINT("start ReductionBufExecution init !\n");
#endif
@ -46,7 +46,6 @@ ReductionBufExecution::ReductionBufExecution(const MNN::Op* op, Backend* backend
MNN_ASSERT(false);
break;
}
mOp = op;
#ifdef LOG_VERBOSE
MNN_PRINT("end ReductionBufExecution init !\n");
#endif
@ -70,20 +69,20 @@ ErrorCode ReductionBufExecution::onResize(const std::vector<Tensor *> &inputs, c
std::set<std::string> buildOption;
switch (mReductType) {
case 0:
buildOption.emplace("-DOPERATE=num+in");
buildOption.emplace("-DOPERATE(a,b)=(a+b)");
buildOption.emplace("-DGET_AVG");
break;
case 1:
buildOption.emplace("-DOPERATE=max(num,in)");
buildOption.emplace("-DOPERATE(a,b)=max(a,b)");
break;
case 2:
buildOption.emplace("-DOPERATE=min(num,in)");
buildOption.emplace("-DOPERATE(a,b)=min(a,b)");
break;
case 3:
buildOption.emplace("-DOPERATE=num*in");
buildOption.emplace("-DOPERATE(a,b)=(a*b)");
break;
case 4:
buildOption.emplace("-DOPERATE=num+in");
buildOption.emplace("-DOPERATE(a,b)=(a+b)");
break;
default:
MNN_ASSERT(false);
@ -103,6 +102,7 @@ ErrorCode ReductionBufExecution::onResize(const std::vector<Tensor *> &inputs, c
mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[0]));
mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[1]));
mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[2]));
mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[3]));
return NO_ERROR;
}

View File

@ -15,7 +15,7 @@ namespace MNN {
namespace OpenCL {
ReluBufExecution::ReluBufExecution(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend)
: CommonExecution(backend) {
: CommonExecution(backend, op) {
mOpenCLBackend = static_cast<OpenCLBackend *>(backend);
auto mPreluParamPtr = op->main_as_PRelu();
int preluSize = mPreluParamPtr->slopeCount();
@ -50,9 +50,6 @@ ReluBufExecution::ReluBufExecution(const std::vector<Tensor *> &inputs, const MN
MNN_ERROR("Map error preluDataPtrCL == nullptr \n");
}
mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(preluBuffer, preluDataPtrCL);
mOp = op;
mOpType = op->type();
}
ReluBufExecution::~ReluBufExecution() {

View File

@ -0,0 +1,160 @@
#ifdef MNN_SUPPORT_FP16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
__constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void batch_matmul(__private int global_dim0, __private int global_dim1, __private int global_dim2,
__global FLOAT* output, __global FLOAT* input_A, __global FLOAT* input_B,
#ifdef BIAS
__global FLOAT* input_C,
#endif
__global FLOAT* offset_O, __global FLOAT* offset_A, __global FLOAT* offset_B,
#ifdef BIAS
__global FLOAT* offset_C,
#endif
__private const int e,
__private const int l,
__private const int h,
__private const int4 offsets,
__private const int4 iters,
__private const int4 steps) {
int3 pos = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));
if (pos.x < global_dim0 && pos.y < global_dim1 && pos.z < global_dim2) {
int4 index = (int4)(pos.z);
if (iters.x >= 0) {
index.x = (int)(offset_O[pos.z]);
}
if (iters.y >= 0) {
index.y = (int)(offset_A[pos.z]);
}
if (iters.z >= 0) {
index.z = (int)(offset_B[pos.z]);
}
#ifdef BIAS
if (iters.w >= 0) {
index.w = (int)(offset_C[pos.z]);
}
#endif
int4 offset = index * steps + offsets;
#if TRANSPOSE_A
__global FLOAT* A_ptr = input_A + offset.y + pos.y;
#else
__global FLOAT* A_ptr = input_A + offset.y + pos.y * l;
#endif
#if TRANSPOSE_B
__global FLOAT* B_ptr = input_B + offset.z + pos.x * l;
#else
__global FLOAT* B_ptr = input_B + offset.z + pos.x;
#endif
#ifdef BIAS
FLOAT value = input_C[offset.w + pos.x];
#else
FLOAT value = 0;
#endif
for(int i = 0; i < l; ++i){
#if TRANSPOSE_A
FLOAT value_a = A_ptr[i * e];
#else
FLOAT value_a = A_ptr[i];
#endif
#if TRANSPOSE_B
FLOAT value_b = B_ptr[i];
#else
FLOAT value_b = B_ptr[i * h];
#endif
value = mad(value_a, value_b, value);
}
output[offset.x + pos.y * h + pos.x] = value;
}
}
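`batch_matmul` computes one output element per work-item: `pos.x` walks the `h` dimension, `pos.y` the `e` dimension and `pos.z` the loop iteration, while `iters`, `offsets` and `steps` resolve the per-iteration base offsets of A, B, the output and the optional bias. Stripped of that offset machinery, each work-item is the dot product below; a plain C++ reference for one iteration, assuming row-major A (`e x l`) and B (`l x h`) with the TRANSPOSE_A / TRANSPOSE_B paths omitted:

```cpp
// Reference for one batch_matmul iteration: out[y][x] = bias[x] + sum_i A[y][i] * B[i][x].
static void matmulOneIteration(const float* A, const float* B, const float* bias,
                               float* out, int e, int l, int h) {
    for (int y = 0; y < e; ++y) {
        for (int x = 0; x < h; ++x) {
            float value = bias ? bias[x] : 0.0f;       // the #ifdef BIAS path
            for (int i = 0; i < l; ++i) {
                value += A[y * l + i] * B[i * h + x];  // mad(value_a, value_b, value)
            }
            out[y * h + x] = value;
        }
    }
}
```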
__kernel void tile(__private int global_dim0, __private int global_dim1, __private int global_dim2,
__read_only image2d_t input,
__global FLOAT* output,
__private const int width,
__private const int height,
__private const int channel){
int3 pos = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));
if (pos.x < global_dim0 && pos.y < global_dim1 && pos.z < global_dim2) {
const int w = pos.x % width;
const int h = pos.x / width;
const int c = pos.y << 2;
const int x_dst_pitch = 1;
const int y_dst_pitch = x_dst_pitch * width;
const int c_dst_pitch = y_dst_pitch * height;
const int b_dst_pitch = c_dst_pitch * channel;
__global FLOAT* dst_ptr = output + pos.z * b_dst_pitch + c * c_dst_pitch + h * y_dst_pitch + w * x_dst_pitch;
FLOAT4 value = RI_F(input, SAMPLER, (int2)(pos.y * width + w, pos.z * height + h));
dst_ptr[0] = value.x;
if(c + 1 >= channel)return;
dst_ptr[c_dst_pitch] = value.y;
if(c + 2 >= channel)return;
dst_ptr[2 * c_dst_pitch] = value.z;
if(c + 3 >= channel)return;
dst_ptr[3 * c_dst_pitch] = value.w;
}
}
__kernel void pack(__private int global_dim0, __private int global_dim1, __private int global_dim2,
__global FLOAT* input,
__write_only image2d_t output,
__private const int width,
__private const int height,
__private const int channel){
int3 pos = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));
if (pos.x < global_dim0 && pos.y < global_dim1 && pos.z < global_dim2) {
const int w = pos.x % width;
const int h = pos.x / width;
const int c = pos.y << 2;
const int x_src_pitch = 1;
const int y_src_pitch = x_src_pitch * width;
const int c_src_pitch = y_src_pitch * height;
const int b_src_pitch = c_src_pitch * channel;
__global FLOAT* src_ptr = input + pos.z * b_src_pitch + c * c_src_pitch + h * y_src_pitch + w * x_src_pitch;
FLOAT4 value = (FLOAT4)0;
FLOAT *value_ptr = (FLOAT*)&value;
for(int i = 0; i < 4 && (i + c < channel); ++i){
value_ptr[i] = src_ptr[i * c_src_pitch];
}
WI_F(output, (int2)(pos.y * width + w, pos.z * height + h), value);
}
}
__kernel void batch_gather(__private int global_dim0, __private int global_dim1, __private int global_dim2,
__global FLOAT* output, __global FLOAT* input,
__global FLOAT* offset_dst, __global FLOAT* offset_src,
__private const int x_size,
__private const int4 stride_src,
__private const int4 stride_dst,
__private const int2 steps,
__private const int2 iters) {
int3 pos = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));
if (pos.x < global_dim0 && pos.y < global_dim1 && pos.z < global_dim2) {
int x = pos.x % x_size;
int y = pos.x / x_size;
int2 index = (int2)(pos.z, pos.z);
if (iters.x >= 0) {
index.x = (int)(offset_dst[pos.z]);
}
if (iters.y >= 0) {
index.y = (int)(offset_src[pos.z]);
}
int2 offset = index * steps;
output[offset.x + stride_dst.w + x * stride_dst.x + y * stride_dst.y + pos.y * stride_dst.z] = input[offset.y + stride_src.w + x * stride_src.x + y * stride_src.y + pos.y * stride_src.z];
}
}

View File

@ -0,0 +1,164 @@
#ifdef MNN_SUPPORT_FP16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
__kernel void batch_matmul_buf(__private int global_dim0, __private int global_dim1, __private int global_dim2,
__global FLOAT* output, __global FLOAT* input_A, __global FLOAT* input_B,
#ifdef BIAS
__global FLOAT* input_C,
#endif
__global FLOAT* offset_O, __global FLOAT* offset_A, __global FLOAT* offset_B,
#ifdef BIAS
__global FLOAT* offset_C,
#endif
__private const int e,
__private const int l,
__private const int h,
__private const int4 offsets,
__private const int4 iters,
__private const int4 steps) {
int3 pos = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));
if (pos.x < global_dim0 && pos.y < global_dim1 && pos.z < global_dim2) {
int4 index = (int4)(pos.z);
if (iters.x >= 0) {
index.x = (int)(offset_O[pos.z]);
}
if (iters.y >= 0) {
index.y = (int)(offset_A[pos.z]);
}
if (iters.z >= 0) {
index.z = (int)(offset_B[pos.z]);
}
#ifdef BIAS
if (iters.w >= 0) {
index.w = (int)(offset_C[pos.z]);
}
#endif
int4 offset = index * steps + offsets;
#if TRANSPOSE_A
__global FLOAT* A_ptr = input_A + offset.y + pos.y;
#else
__global FLOAT* A_ptr = input_A + offset.y + pos.y * l;
#endif
#if TRANSPOSE_B
__global FLOAT* B_ptr = input_B + offset.z + pos.x * l;
#else
__global FLOAT* B_ptr = input_B + offset.z + pos.x;
#endif
#ifdef BIAS
FLOAT value = input_C[offset.w + pos.x];
#else
FLOAT value = 0;
#endif
for(int i = 0; i < l; ++i){
#if TRANSPOSE_A
FLOAT value_a = A_ptr[i * e];
#else
FLOAT value_a = A_ptr[i];
#endif
#if TRANSPOSE_B
FLOAT value_b = B_ptr[i];
#else
FLOAT value_b = B_ptr[i * h];
#endif
value = mad(value_a, value_b, value);
}
output[offset.x + pos.y * h + pos.x] = value;
}
}
__kernel void tile_buf(__private int global_dim0, __private int global_dim1, __private int global_dim2,
__global FLOAT* input, __global FLOAT* output,
__private const int width,
__private const int height,
__private const int channel){
int3 pos = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));
if (pos.x < global_dim0 && pos.y < global_dim1 && pos.z < global_dim2) {
const int w = pos.x % width;
const int h = pos.x / width;
const int c = pos.y << 2;
const int x_src_pitch = 4;
const int y_src_pitch = x_src_pitch * width;
const int c_src_pitch = y_src_pitch * height;
const int b_src_pitch = c_src_pitch * ((channel + 3) / 4);
const int x_dst_pitch = 1;
const int y_dst_pitch = x_dst_pitch * width;
const int c_dst_pitch = y_dst_pitch * height;
const int b_dst_pitch = c_dst_pitch * channel;
__global FLOAT* dst_ptr = output + pos.z * b_dst_pitch + c * c_dst_pitch + h * y_dst_pitch + w * x_dst_pitch;
FLOAT4 value = vload4(0, input + pos.z * b_src_pitch + pos.y * c_src_pitch + h * y_src_pitch + w * x_src_pitch);
dst_ptr[0] = value.x;
if(c + 1 >= channel)return;
dst_ptr[c_dst_pitch] = value.y;
if(c + 2 >= channel)return;
dst_ptr[2 * c_dst_pitch] = value.z;
if(c + 3 >= channel)return;
dst_ptr[3 * c_dst_pitch] = value.w;
}
}
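`tile_buf` converts from the channel-packed buffer layout (four channels interleaved at each spatial position, with pitches 4, 4W, 4WH and 4WH*ceil(C/4)) into a plain NCHW buffer, and `pack_buf` below is its inverse. A CPU sketch of the same index math (function name illustrative), useful for checking the pitch constants:

```cpp
#include <cstddef>
#include <vector>

// Unpack a C4-packed buffer (N x ceil(C/4) x H x W x 4) into plain NCHW, mirroring tile_buf.
static void unpackC4ToNCHW(const std::vector<float>& src, std::vector<float>& dst,
                           int batch, int channel, int height, int width) {
    const int c4 = (channel + 3) / 4;
    for (int b = 0; b < batch; ++b) {
        for (int c = 0; c < channel; ++c) {
            for (int h = 0; h < height; ++h) {
                for (int w = 0; w < width; ++w) {
                    const size_t srcIdx =
                        (((static_cast<size_t>(b) * c4 + c / 4) * height + h) * width + w) * 4 + c % 4;
                    const size_t dstIdx =
                        ((static_cast<size_t>(b) * channel + c) * height + h) * width + w;
                    dst[dstIdx] = src[srcIdx];
                }
            }
        }
    }
}
```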
__kernel void pack_buf(__private int global_dim0, __private int global_dim1, __private int global_dim2,
__global FLOAT* input, __global FLOAT* output,
__private const int width,
__private const int height,
__private const int channel){
int3 pos = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));
if (pos.x < global_dim0 && pos.y < global_dim1 && pos.z < global_dim2) {
const int w = pos.x % width;
const int h = pos.x / width;
const int c = pos.y << 2;
const int x_dst_pitch = 4;
const int y_dst_pitch = x_dst_pitch * width;
const int c_dst_pitch = y_dst_pitch * height;
const int b_dst_pitch = c_dst_pitch * ((channel + 3) / 4);
const int x_src_pitch = 1;
const int y_src_pitch = x_src_pitch * width;
const int c_src_pitch = y_src_pitch * height;
const int b_src_pitch = c_src_pitch * channel;
__global FLOAT* src_ptr = input + pos.z * b_src_pitch + c * c_src_pitch + h * y_src_pitch + w * x_src_pitch;
FLOAT4 value = (FLOAT4)0;
FLOAT *value_ptr = (FLOAT*)&value;
for(int i = 0; i < 4 && (i + c < channel); ++i){
value_ptr[i] = src_ptr[i * c_src_pitch];
}
vstore4(value, 0, output + pos.z * b_dst_pitch + pos.y * c_dst_pitch + h * y_dst_pitch + w * x_dst_pitch);
}
}
__kernel void batch_gather_buf(__private int global_dim0, __private int global_dim1, __private int global_dim2,
__global FLOAT* output, __global FLOAT* input,
__global FLOAT* offset_dst, __global FLOAT* offset_src,
__private const int x_size,
__private const int4 stride_src,
__private const int4 stride_dst,
__private const int2 steps,
__private const int2 iters) {
int3 pos = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));
if (pos.x < global_dim0 && pos.y < global_dim1 && pos.z < global_dim2) {
int x = pos.x % x_size;
int y = pos.x / x_size;
int2 index = (int2)(pos.z, pos.z);
if (iters.x >= 0) {
index.x = (int)(offset_dst[pos.z]);
}
if (iters.y >= 0) {
index.y = (int)(offset_src[pos.z]);
}
int2 offset = index * steps;
output[offset.x + stride_dst.w + x * stride_dst.x + y * stride_dst.y + pos.y * stride_dst.z] = input[offset.y + stride_src.w + x * stride_src.x + y * stride_src.y + pos.y * stride_src.z];
}
}

File diff suppressed because one or more lines are too long

View File

@ -19,34 +19,44 @@ __kernel void reduct_general_mean(GLOBAL_SIZE_2_DIMS
__write_only image2d_t output,
__private const int batch,
__private const int height,
__private const int width
__private const int width,
__private const int channel
) {
const int batch_idx = get_global_id(0);
const int width_idx = get_global_id(1);
FLOAT sum = 0;
FLOAT4 sum = 0;
for (int h = 0; h < height; h++) {
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
sum = sum + in.x;
sum = sum + in;
}
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum/height, 0.0, 0.0, 0.0));
FLOAT* sum_ptr = (FLOAT*)&sum;
for(int i = 1; i < channel; ++i){
sum.x += sum_ptr[i];
}
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum.x/(height*channel), 0.0, 0.0, 0.0));
}
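With the new `channel` argument, `reduct_general_mean` first accumulates whole `FLOAT4` vectors over the height axis and then folds the packed channel lanes into `.x` before dividing by `height * channel`, so the reduction now covers the channel dimension as well; the lane loop reads at most the four vector components, so this path appears to assume `channel <= 4`. A scalar C++ sketch of the same two-step fold, with that assumption made explicit:

```cpp
#include <cassert>

// Two-step reduce mirroring reduct_general_mean: sum per lane over `height`,
// then fold the packed channel lanes and divide. Assumes channel <= 4
// (a single packed group), which is an assumption, not something the kernel checks.
static float reduceMeanC4(const float (*rows)[4], int height, int channel) {
    assert(channel >= 1 && channel <= 4);
    float lane[4] = {0.0f, 0.0f, 0.0f, 0.0f};
    for (int h = 0; h < height; ++h) {
        for (int c = 0; c < 4; ++c) lane[c] += rows[h][c];
    }
    float sum = 0.0f;
    for (int c = 0; c < channel; ++c) sum += lane[c];
    return sum / static_cast<float>(height * channel);
}
```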
__kernel void reduct_general_sum(GLOBAL_SIZE_2_DIMS
__read_only image2d_t input,
__write_only image2d_t output,
__private const int batch,
__private const int height,
__private const int width
__private const int width,
__private const int channel
) {
const int batch_idx = get_global_id(0);
const int width_idx = get_global_id(1);
FLOAT sum = 0;
FLOAT4 sum = 0;
for (int h = 0; h < height; h++) {
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
sum = sum + in.x;
sum = sum + in;
}
FLOAT* sum_ptr = (FLOAT*)&sum;
for(int i = 1; i < channel; ++i){
sum.x += sum_ptr[i];
}
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum, 0.0, 0.0, 0.0));
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum.x, 0.0, 0.0, 0.0));
}
__kernel void reduct_general_max(GLOBAL_SIZE_2_DIMS
@ -54,17 +64,22 @@ __kernel void reduct_general_max(GLOBAL_SIZE_2_DIMS
__write_only image2d_t output,
__private const int batch,
__private const int height,
__private const int width
__private const int width,
__private const int channel
) {
const int batch_idx = get_global_id(0);
const int width_idx = get_global_id(1);
FLOAT sum = -MAXFLOAT;
FLOAT4 sum = (FLOAT4)-MAXFLOAT;
for (int h = 0; h < height; h++) {
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
sum = max(sum, in.x);
sum = max(sum, in);
}
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum, 0.0, 0.0, 0.0));
FLOAT* sum_ptr = (FLOAT*)&sum;
for(int i = 1; i < channel; ++i){
sum.x = max(sum.x, sum_ptr[i]);
}
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum.x, 0.0, 0.0, 0.0));
}
__kernel void reduct_general_min(GLOBAL_SIZE_2_DIMS
@ -72,17 +87,22 @@ __kernel void reduct_general_min(GLOBAL_SIZE_2_DIMS
__write_only image2d_t output,
__private const int batch,
__private const int height,
__private const int width
__private const int width,
__private const int channel
) {
const int batch_idx = get_global_id(0);
const int width_idx = get_global_id(1);
FLOAT sum = MAXFLOAT;
FLOAT4 sum = (FLOAT4)MAXFLOAT;
for (int h = 0; h < height; h++) {
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
sum = min(sum, in.x);
sum = min(sum, in);
}
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum, 0.0, 0.0, 0.0));
FLOAT* sum_ptr = (FLOAT*)&sum;
for(int i = 1; i < channel; ++i){
sum.x = min(sum.x, sum_ptr[i]);
}
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum.x, 0.0, 0.0, 0.0));
}
__kernel void reduct_general_mul(GLOBAL_SIZE_2_DIMS
@ -90,17 +110,22 @@ __kernel void reduct_general_mul(GLOBAL_SIZE_2_DIMS
__write_only image2d_t output,
__private const int batch,
__private const int height,
__private const int width
__private const int width,
__private const int channel
) {
const int batch_idx = get_global_id(0);
const int width_idx = get_global_id(1);
FLOAT sum = 1.0;
FLOAT4 sum = (FLOAT4)1.0;
for (int h = 0; h < height; h++) {
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
sum = sum * in.x;
sum = sum * in;
}
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum, 0.0, 0.0, 0.0));
FLOAT* sum_ptr = (FLOAT*)&sum;
for(int i = 1; i < channel; ++i){
sum.x *= sum_ptr[i];
}
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum.x, 0.0, 0.0, 0.0));
}
__kernel void reduct_general_mean_local(GLOBAL_SIZE_2_DIMS
@ -108,21 +133,27 @@ __kernel void reduct_general_mean_local(GLOBAL_SIZE_2_DIMS
__write_only image2d_t output,
__private const int batch,
__private const int height,
__private const int width
__private const int width,
__private const int channel
) {
const int batch_idx = get_global_id(1);
const int width_idx = get_global_id(2);
const int idx = get_local_id(0);
FLOAT local sum[256];
sum[idx] = 0.0;
FLOAT4 out = (FLOAT4)0.0;
const int reduce_num = get_local_size(0);
for (int h = idx; h < height; h+=reduce_num) {
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
sum[idx] = sum[idx] + in.x;
out = out + in;
}
FLOAT* out_ptr = (FLOAT*)&out;
for(int i = 1; i < channel; ++i){
out.x += out_ptr[i];
}
sum[idx] = out.x;
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = reduce_num/2; i > 0; i /= 2){
if (idx < i)
@ -130,7 +161,8 @@ __kernel void reduct_general_mean_local(GLOBAL_SIZE_2_DIMS
barrier(CLK_LOCAL_MEM_FENCE);
}
if (idx == 0) {
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum[0]/height, 0.0, 0.0, 0.0));
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum[0]/(height*channel), 0.0, 0.0, 0.0));
}
}
__kernel void reduct_general_sum_local(GLOBAL_SIZE_2_DIMS
@ -138,22 +170,27 @@ __kernel void reduct_general_sum_local(GLOBAL_SIZE_2_DIMS
__write_only image2d_t output,
__private const int batch,
__private const int height,
__private const int width
__private const int width,
__private const int channel
) {
const int batch_idx = get_global_id(1);
const int width_idx = get_global_id(2);
const int idx = get_local_id(0);
FLOAT local sum[256];
sum[idx] = 0.0;
FLOAT4 out = (FLOAT4)0.0;
const int reduce_num = get_local_size(0);
for (int h = idx; h < height; h+=reduce_num) {
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
sum[idx] = sum[idx] + in.x;
out = out + in;
}
FLOAT* out_ptr = (FLOAT*)&out;
for(int i = 1; i < channel; ++i){
out.x += out_ptr[i];
}
sum[idx] = out.x;
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = reduce_num/2; i > 0; i /= 2){
if (idx < i)
@ -170,20 +207,26 @@ __kernel void reduct_general_max_local(GLOBAL_SIZE_2_DIMS
__write_only image2d_t output,
__private const int batch,
__private const int height,
__private const int width
__private const int width,
__private const int channel
) {
const int batch_idx = get_global_id(1);
const int width_idx = get_global_id(2);
const int idx = get_local_id(0);
FLOAT local sum[256];
sum[idx] = -MAXFLOAT;
FLOAT4 out = (FLOAT4)(-MAXFLOAT);
const int reduce_num = get_local_size(0);
for (int h = idx; h < height; h+=reduce_num) {
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
sum[idx] = max(sum[idx], in.x);
out = max(out, in);
}
FLOAT* out_ptr = (FLOAT*)&out;
for(int i = 1; i < channel; ++i){
out.x = max(out.x, out_ptr[i]);
}
sum[idx] = out.x;
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = reduce_num/2; i > 0; i /= 2){
@ -202,22 +245,28 @@ __kernel void reduct_general_min_local(GLOBAL_SIZE_2_DIMS
__write_only image2d_t output,
__private const int batch,
__private const int height,
__private const int width
__private const int width,
__private const int channel
) {
const int batch_idx = get_global_id(1);
const int width_idx = get_global_id(2);
const int idx = get_local_id(0);
FLOAT local sum[256];
sum[idx] = MAXFLOAT;
FLOAT4 out = (FLOAT4)(MAXFLOAT);
const int reduce_num = get_local_size(0);
for (int h = idx; h < height; h+=reduce_num) {
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
sum[idx] = min(sum[idx], in.x);
out = min(out, in);
}
FLOAT* out_ptr = (FLOAT*)&out;
for(int i = 1; i < channel; ++i){
out.x = min(out.x, out_ptr[i]);
}
sum[idx] = out.x;
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = reduce_num/2; i > 0; i /= 2){
if (idx < i)
@ -234,21 +283,27 @@ __kernel void reduct_general_mul_local(GLOBAL_SIZE_2_DIMS
__write_only image2d_t output,
__private const int batch,
__private const int height,
__private const int width
__private const int width,
__private const int channel
) {
const int batch_idx = get_global_id(1);
const int width_idx = get_global_id(2);
const int idx = get_local_id(0);
FLOAT local sum[256];
sum[idx] = 1.0;
FLOAT4 out = (FLOAT4)1.0;
const int reduce_num = get_local_size(0);
for (int h = idx; h < height; h+=reduce_num) {
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
sum[idx] = sum[idx] * in.x;
out = out * in;
}
FLOAT* out_ptr = (FLOAT*)&out;
for(int i = 1; i < channel; ++i){
out.x *= out_ptr[i];
}
sum[idx] = out.x;
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = reduce_num/2; i > 0; i /= 2){
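Across all of the reduction kernels above, the pattern of the change is the same: accumulate a full `FLOAT4` per pixel instead of only lane `.x`, then fold the first `channel` lanes into `.x` before writing, so the channels packed into the vector lanes are reduced as well; the mean variants therefore divide by `height * channel`. A scalar C++ sketch of that lane fold, as an illustration rather than the kernel code itself:

```cpp
#include <array>

// Fold the first `channel` lanes of a 4-lane accumulator into lane 0,
// matching the "sum.x += sum_ptr[i]" loops added above. The max/min/mul
// variants use max / min / * in place of +.
float foldLanesSum(const std::array<float, 4>& acc, int channel) {
    float r = acc[0];
    for (int i = 1; i < channel && i < 4; ++i) r += acc[i];
    return r;
}

// Mean reduction over `height` rows of packed channels: the divisor becomes
// height * channel because each lane already holds a per-channel partial sum.
float reduceMean(const std::array<float, 4>* rows, int height, int channel) {
    std::array<float, 4> acc{0.f, 0.f, 0.f, 0.f};
    for (int h = 0; h < height; ++h)
        for (int i = 0; i < 4; ++i) acc[i] += rows[h][i];
    return foldLanesSum(acc, channel) / float(height * channel);
}
```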

View File

@ -14,21 +14,26 @@ __kernel void reduct_buf(GLOBAL_SIZE_2_DIMS
__global FLOAT* output,
__private const int batch,
__private const int height,
__private const int width
__private const int width,
__private const int channel
) {
const int batch_idx = get_global_id(0);
const int width_idx = get_global_id(1);
const int inp_offset = ((batch_idx * height + 0) * width + width_idx)*4;
FLOAT num = input[inp_offset];
FLOAT4 out = vload4(0, input + inp_offset);
for (int h = 1; h < height; h++) {
FLOAT in = input[inp_offset + h*width*4];
num = OPERATE;
FLOAT4 in = vload4(0, input + inp_offset + h*width*4);
out = OPERATE(out, in);
}
FLOAT* out_ptr = (FLOAT*)&out;
for(int c = 1; c < channel; ++c){
out.x = OPERATE(out.x, out_ptr[c]);
}
#ifdef GET_AVG
num = num / height;
out.x = out.x / (height * channel);
#endif
const int out_offset = batch_idx * width + width_idx;
vstore4((FLOAT4)(num, 0.0, 0.0, 0.0), out_offset, output);
vstore4((FLOAT4)(out.x, 0.0, 0.0, 0.0), out_offset, output);
}

View File

@ -10,7 +10,9 @@
namespace MNN {
namespace OpenCL {
CommonExecution::CommonExecution(Backend *backend) : Execution(backend) {
CommonExecution::CommonExecution(Backend *backend, const MNN::Op *Op)
: Execution(backend), mOp(Op) {
mOpType = Op->type();
}
ErrorCode CommonExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto runtime = ((OpenCLBackend *)backend())->getOpenCLRuntime();

View File

@ -15,7 +15,7 @@ namespace OpenCL {
class CommonExecution : public Execution {
public:
CommonExecution(Backend *backend);
CommonExecution(Backend *backend, const MNN::Op *Op);
virtual ~CommonExecution() = default;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;

View File

@ -15,7 +15,7 @@
namespace MNN {
namespace OpenCL {
Conv2DBackPropFilter::Conv2DBackPropFilter(const MNN::Op *op, Backend *backend) : CommonExecution(backend) {
Conv2DBackPropFilter::Conv2DBackPropFilter(const MNN::Op *op, Backend *backend) : CommonExecution(backend, op) {
auto common = op->main_as_Convolution2D()->common();
mStrides = {common->strideY(), common->strideX()};
mDilations = {common->dilateY(), common->dilateX()};
@ -25,8 +25,6 @@ Conv2DBackPropFilter::Conv2DBackPropFilter(const MNN::Op *op, Backend *backend)
if (common->padMode() == PadMode_VALID) {
mPaddings[0] = mPaddings[1] = 0;
}
mOp = op;
mOpType = op->type();
}
Conv2DBackPropFilter::~Conv2DBackPropFilter() {

View File

@ -28,10 +28,8 @@ static string swapComputeIn0In1(const string& computeOrigin) {
}
EltwiseExecution::EltwiseExecution(const std::vector<Tensor *> &inputs, const std::string &compute, const MNN::Op *op, Backend *backend)
: CommonExecution(backend), mCompute(compute) {
: CommonExecution(backend, op), mCompute(compute) {
mBuildOptions.emplace("-DOPERATOR=" + compute);
mOp = op;
mOpType = op->type();
}
uint32_t EltwiseExecution::realSize(const Tensor* tensor) {

View File

@ -0,0 +1,370 @@
//
// LoopExecution.cpp
// MNN
//
// Created by MNN on 2019/02/28.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "backend/opencl/execution/image/LoopExecution.hpp"
#include "core/Macro.h"
#include "core/TensorUtils.hpp"
namespace MNN {
namespace OpenCL {
static void _TileTensor(Tensor *input, cl::Buffer *output, cl::Kernel& kernel, cl::NDRange &globalWorkSize,
cl::NDRange &localWorkSize, const int Width, const int Height, const int Channel,
const int Batch, OpenCLRuntime *runTime, const std::set<std::string> &buildOptions) {
kernel = runTime->buildKernel("loop", "tile", buildOptions);
uint32_t mMaxWorkGroupSize = static_cast<uint32_t>(runTime->getMaxWorkGroupSize(kernel));
std::vector<uint32_t> mGlobalWorkSize = {(uint32_t)(Width * Height), (uint32_t)(UP_DIV(Channel, 4)), (uint32_t)(Batch)};
uint32_t index = 0;
kernel.setArg(index++, mGlobalWorkSize[0]);
kernel.setArg(index++, mGlobalWorkSize[1]);
kernel.setArg(index++, mGlobalWorkSize[2]);
kernel.setArg(index++, openCLImage(input));
kernel.setArg(index++, *output);
kernel.setArg(index++, Width);
kernel.setArg(index++, Height);
kernel.setArg(index++, Channel);
std::vector<uint32_t> mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, runTime, "tile", kernel).first;
globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]};
localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]};
}
static void _PackTensor(cl::Buffer *input, Tensor *output, cl::Kernel& kernel, cl::NDRange &globalWorkSize,
cl::NDRange &localWorkSize, const int Width, const int Height, const int Channel,
const int Batch, OpenCLRuntime *runTime, const std::set<std::string> &buildOptions) {
kernel = runTime->buildKernel("loop", "pack", buildOptions);
uint32_t mMaxWorkGroupSize = static_cast<uint32_t>(runTime->getMaxWorkGroupSize(kernel));
std::vector<uint32_t> mGlobalWorkSize = {(uint32_t)(Width * Height), (uint32_t)(UP_DIV(Channel, 4)), (uint32_t)(Batch)};
uint32_t index = 0;
kernel.setArg(index++, mGlobalWorkSize[0]);
kernel.setArg(index++, mGlobalWorkSize[1]);
kernel.setArg(index++, mGlobalWorkSize[2]);
kernel.setArg(index++, *input);
kernel.setArg(index++, openCLImage(output));
kernel.setArg(index++, Width);
kernel.setArg(index++, Height);
kernel.setArg(index++, Channel);
std::vector<uint32_t> mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, runTime, "pack", kernel).first;
globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]};
localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]};
}
static void _setTensorStack(std::vector<Tensor *> &result, const std::vector<Tensor *> &inputs,
const std::vector<Tensor *> &outputs, const LoopParam *loop) {
if (loop->inputIndexes() != nullptr) {
for (int i = 0; i < loop->inputIndexes()->size(); ++i) {
result[loop->inputIndexes()->data()[i]] = inputs[i];
}
}
for (int i = 0; i < loop->outputIndexes()->size(); ++i) {
result[loop->outputIndexes()->data()[i]] = outputs[i];
}
}
LoopGatherExecution::LoopGatherExecution(const LoopParam *loop, const MNN::Op *op, Backend *bn)
: CommonExecution(bn, op) {
mLoop = loop;
mTensors.resize(mLoop->tensorNumber());
auto cmd = loop->commands()->GetAs<RegionCommand>(0);
mOpType = op->type();
}
ErrorCode LoopGatherExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
OpenCLBackend *mOpenCLBackend = (OpenCLBackend *)backend();
auto runTime = mOpenCLBackend->getOpenCLRuntime();
auto bufferPool = mOpenCLBackend->getBufferPool();
auto bufferUnitSize = runTime->isSupportedFP16() ? sizeof(half_float::half) : sizeof(float);
_setTensorStack(mTensors, inputs, outputs, mLoop);
mUnits.clear();
mOffsetBuffers.clear();
mTmpBuffers.resize(2);
int x = cmd->size()->data()[0];
int y = cmd->size()->data()[1];
int z = cmd->size()->data()[2];
int n = mLoop->loopNumber();
auto srcStride = cmd->view()->GetAs<View>(1)->stride()->data();
auto dstStride = cmd->view()->GetAs<View>(0)->stride()->data();
for (int i = 0; i < 3; ++i) {
mStride_src[i] = srcStride[i];
mStride_dst[i] = dstStride[i];
}
mStride_src[3] = cmd->view()->GetAs<View>(1)->offset();
mStride_dst[3] = cmd->view()->GetAs<View>(0)->offset();
::memcpy(mStep, cmd->steps()->data(), cmd->steps()->size() * sizeof(int));
::memcpy(mIter, cmd->iterIndexes()->data(), cmd->iterIndexes()->size() * sizeof(int));
// tile input
{
auto input = mTensors[cmd->indexes()->data()[1]];
std::vector<int> Shape = tensorShapeFormat(input);
const int Channel = Shape.at(3);
const int Width = Shape.at(2);
const int Height = Shape.at(1);
const int Batch = Shape.at(0);
mTmpBuffers[1] = bufferPool->alloc(input->elementSize() * bufferUnitSize);
Unit unit;
_TileTensor(mTensors[cmd->indexes()->data()[1]], mTmpBuffers[1], unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height,Channel, Batch, runTime, mBuildOptions);
mUnits.emplace_back(unit);
}
for(int i = 0; i < cmd->iterIndexes()->size(); ++i){
if (mIter[i] >= 0) {
auto input = mTensors[cmd->iterIndexes()->data()[i]];
std::vector<int> Shape = tensorShapeFormat(input);
const int Channel = Shape.at(3);
const int Width = Shape.at(2);
const int Height = Shape.at(1);
const int Batch = Shape.at(0);
mOffsetBuffers.emplace_back(bufferPool->alloc(input->elementSize() * bufferUnitSize));
Unit unit;
_TileTensor(input, mOffsetBuffers.back(), unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height, Channel, Batch, runTime, mBuildOptions);
mUnits.emplace_back(unit);
}
}
// gather
{
mTmpBuffers[0] = bufferPool->alloc(n * z * y * x * bufferUnitSize);
int offset_index = 0;
Unit unit;
std::string KernelName = "batch_gather";
unit.kernel = runTime->buildKernel("loop", KernelName, mBuildOptions);
uint32_t mMaxWorkGroupSize = static_cast<uint32_t>(runTime->getMaxWorkGroupSize(unit.kernel));
std::vector<uint32_t> mGlobalWorkSize = {(uint32_t)(x * y), (uint32_t)(z), (uint32_t)(n)};
uint32_t index = 0;
unit.kernel.setArg(index++, mGlobalWorkSize[0]);
unit.kernel.setArg(index++, mGlobalWorkSize[1]);
unit.kernel.setArg(index++, mGlobalWorkSize[2]);
unit.kernel.setArg(index++, *mTmpBuffers[0]);
unit.kernel.setArg(index++, *mTmpBuffers[1]);
for (int i = 0; i < cmd->iterIndexes()->size(); ++i) {
if (mIter[i] >= 0) {
unit.kernel.setArg(index++, *mOffsetBuffers[offset_index++]);
} else {
unit.kernel.setArg(index++, *mTmpBuffers[0]);
}
}
unit.kernel.setArg(index++, x);
unit.kernel.setArg(index++, sizeof(mStride_src), mStride_src);
unit.kernel.setArg(index++, sizeof(mStride_dst), mStride_dst);
unit.kernel.setArg(index++, sizeof(mStep), mStep);
unit.kernel.setArg(index++, sizeof(mIter), mIter);
std::vector<uint32_t> mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, runTime, KernelName, unit.kernel).first;
unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]};
unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]};
mUnits.emplace_back(unit);
}
//pack output
{
auto output = mTensors[cmd->indexes()->data()[0]];
std::vector<int> Shape = tensorShapeFormat(output);
const int Channel = Shape.at(3);
const int Width = Shape.at(2);
const int Height = Shape.at(1);
const int Batch = Shape.at(0);
Unit unit;
_PackTensor(mTmpBuffers[0], mTensors[cmd->indexes()->data()[0]], unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height, Channel, Batch, runTime, mBuildOptions);
mUnits.emplace_back(unit);
}
for (int i = 0; i < mTmpBuffers.size(); ++i) {
bufferPool->recycle(mTmpBuffers[i]);
}
for (int i = 0; i < mOffsetBuffers.size(); ++i) {
bufferPool->recycle(mOffsetBuffers[i]);
}
return NO_ERROR;
}
LoopBatchMatMulExecution::LoopBatchMatMulExecution(const LoopParam *loop, const MNN::Op *op, Backend *bn)
: CommonExecution(bn, op) {
mLoop = loop;
mTensors.resize(mLoop->tensorNumber());
auto cmd = loop->commands()->GetAs<RegionCommand>(0);
mHasBias = cmd->indexes()->size() > 3;
mTransposeA = cmd->op()->main_as_MatMul()->transposeA();
mTransposeB = cmd->op()->main_as_MatMul()->transposeB();
}
ErrorCode LoopBatchMatMulExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
OpenCLBackend *mOpenCLBackend = (OpenCLBackend *)backend();
auto runTime = mOpenCLBackend->getOpenCLRuntime();
auto bufferPool = mOpenCLBackend->getBufferPool();
auto bufferUnitSize = runTime->isSupportedFP16() ? sizeof(half_float::half) : sizeof(float);
_setTensorStack(mTensors, inputs, outputs, mLoop);
mOffset[0] = cmd->view()->GetAs<View>(0)->offset();
mOffset[1] = cmd->view()->GetAs<View>(1)->offset();
mOffset[2] = cmd->view()->GetAs<View>(2)->offset();
mUnits.clear();
mOffsetBuffers.clear();
mTmpBuffers.resize(3);
if (mHasBias) {
mTmpBuffers.resize(4);
mOffset[3] = cmd->view()->GetAs<View>(3)->offset();
}
::memcpy(mStep, cmd->steps()->data(), cmd->steps()->size() * sizeof(int));
::memcpy(mIter, cmd->iterIndexes()->data(), cmd->iterIndexes()->size() * sizeof(int));
int e = cmd->size()->data()[0];
int l = cmd->size()->data()[1];
int h = cmd->size()->data()[2];
int n = mLoop->loopNumber();
// tile input
for (int i = 1; i < cmd->indexes()->size(); ++i) {
auto input = mTensors[cmd->indexes()->data()[i]];
std::vector<int> Shape = tensorShapeFormat(input);
const int Channel = Shape.at(3);
const int Width = Shape.at(2);
const int Height = Shape.at(1);
const int Batch = Shape.at(0);
mTmpBuffers[i] = bufferPool->alloc(input->elementSize() * bufferUnitSize);
Unit unit;
_TileTensor(input, mTmpBuffers[i], unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height, Channel, Batch, runTime, mBuildOptions);
mUnits.emplace_back(unit);
}
for(int i = 0; i < cmd->iterIndexes()->size(); ++i){
if (mIter[i] >= 0) {
auto input = mTensors[cmd->iterIndexes()->data()[i]];
std::vector<int> Shape = tensorShapeFormat(input);
const int Channel = Shape.at(3);
const int Width = Shape.at(2);
const int Height = Shape.at(1);
const int Batch = Shape.at(0);
mOffsetBuffers.emplace_back(bufferPool->alloc(input->elementSize() * bufferUnitSize));
Unit unit;
_TileTensor(input, mOffsetBuffers.back(), unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height, Channel, Batch, runTime, mBuildOptions);
mUnits.emplace_back(unit);
}
}
// matmul
{
mTmpBuffers[0] = bufferPool->alloc(n * e * h * bufferUnitSize);
int offset_index = 0;
Unit unit;
std::string KernelName = "batch_matmul";
if (mHasBias) {
mBuildOptions.emplace("-DBIAS");
}
if (mTransposeA) {
mBuildOptions.emplace("-DTRANSPOSE_A");
}
if (mTransposeB) {
mBuildOptions.emplace("-DTRANSPOSE_B");
}
unit.kernel = runTime->buildKernel("loop", KernelName, mBuildOptions);
uint32_t mMaxWorkGroupSize = static_cast<uint32_t>(runTime->getMaxWorkGroupSize(unit.kernel));
std::vector<uint32_t> mGlobalWorkSize = {(uint32_t)(h), (uint32_t)(e),(uint32_t)(n)};
uint32_t index = 0;
unit.kernel.setArg(index++, mGlobalWorkSize[0]);
unit.kernel.setArg(index++, mGlobalWorkSize[1]);
unit.kernel.setArg(index++, mGlobalWorkSize[2]);
unit.kernel.setArg(index++, *mTmpBuffers[0]);
unit.kernel.setArg(index++, *mTmpBuffers[1]);
unit.kernel.setArg(index++, *mTmpBuffers[2]);
if (mHasBias) {
unit.kernel.setArg(index++, *mTmpBuffers[3]);
}
for (int i = 0; i < cmd->iterIndexes()->size(); ++i) {
if (mIter[i] >= 0) {
unit.kernel.setArg(index++, *mOffsetBuffers[offset_index++]);
} else {
unit.kernel.setArg(index++, *mTmpBuffers[0]);
}
}
unit.kernel.setArg(index++, e);
unit.kernel.setArg(index++, l);
unit.kernel.setArg(index++, h);
unit.kernel.setArg(index++, sizeof(mOffset), mOffset);
unit.kernel.setArg(index++, sizeof(mIter), mIter);
unit.kernel.setArg(index++, sizeof(mStep), mStep);
std::vector<uint32_t> mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, runTime, KernelName, unit.kernel).first;
unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]};
unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]};
mUnits.emplace_back(unit);
}
//pack output
{
auto output = mTensors[cmd->indexes()->data()[0]];
std::vector<int> Shape = tensorShapeFormat(output);
const int Channel = Shape.at(3);
const int Width = Shape.at(2);
const int Height = Shape.at(1);
const int Batch = Shape.at(0);
Unit unit;
_PackTensor(mTmpBuffers[0], output, unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height, Channel, Batch, runTime, mBuildOptions);
mUnits.emplace_back(unit);
}
for (int i = 0; i < mTmpBuffers.size(); ++i) {
bufferPool->recycle(mTmpBuffers[i]);
}
for (int i = 0; i < mOffsetBuffers.size(); ++i) {
bufferPool->recycle(mOffsetBuffers[i]);
}
return NO_ERROR;
}
class LoopCreator : public OpenCLBackend::Creator {
public:
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
const MNN::Op *op, Backend *backend) const override {
auto loop = op->main_as_LoopParam();
if (nullptr == loop || loop->commands() == nullptr) {
return nullptr;
}
if (nullptr != loop->initCommand()) {
return nullptr;
}
// Make Tensor Stack
if (1 == loop->commands()->size()) {
auto cmd = loop->commands()->GetAs<RegionCommand>(0);
auto subop = cmd->op();
if (OpType_UnaryOp == subop->type() && nullptr == subop->main() && cmd->fuse() < 0) {
return new LoopGatherExecution(loop, op, backend);
}
if (OpType_MatMul == subop->type() && loop->parallel()) {
return new LoopBatchMatMulExecution(loop, op, backend);
}
}
return nullptr;
}
};
OpenCLCreatorRegister<LoopCreator> __Loop_op(OpType_While, IMAGE);
} // namespace OpenCL
} // namespace MNN
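`LoopGatherExecution::onResize` and `LoopBatchMatMulExecution::onResize` both record the same three-stage plan: tile each image-layout input (and any iteration-index tensor) into a flat temporary buffer, run the `batch_gather` or `batch_matmul` kernel over those buffers, then pack the flat result back into the output image; the temporaries are recycled to the buffer pool once the units are recorded. A rough outline of that schedule, with purely illustrative names rather than the MNN unit types:

```cpp
#include <string>
#include <vector>

// Illustrative summary of the unit list the two Loop executions build on resize.
std::vector<std::string> buildLoopSchedule(int tensorInputs, int iterOffsetInputs, bool isMatMul) {
    std::vector<std::string> units;
    for (int i = 0; i < tensorInputs; ++i)
        units.push_back("tile");                       // image -> flat buffer per data input
    for (int i = 0; i < iterOffsetInputs; ++i)
        units.push_back("tile");                       // also tile the iteration-index tensors
    units.push_back(isMatMul ? "batch_matmul" : "batch_gather");
    units.push_back("pack");                           // flat result buffer -> output image
    return units;
}
```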

View File

@ -0,0 +1,58 @@
//
// LoopExecution.hpp
// MNN
//
// Created by MNN on 2023/05/04.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef LoopExecution_hpp
#define LoopExecution_hpp
#include "backend/opencl/execution/image/CommonExecution.hpp"
namespace MNN {
namespace OpenCL {
class LoopGatherExecution : public CommonExecution {
public:
LoopGatherExecution(const LoopParam *loop, const MNN::Op *op, Backend *bn);
virtual ~LoopGatherExecution() = default;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
const LoopParam *mLoop;
std::vector<Tensor *> mTensors;
std::vector<cl::Buffer *> mTmpBuffers;
std::vector<cl::Buffer *> mOffsetBuffers;
int mStride_src[4];
int mStride_dst[4];
int mStep[2];
int mIter[2];
std::set<std::string> mBuildOptions;
};
class LoopBatchMatMulExecution : public CommonExecution {
public:
LoopBatchMatMulExecution(const LoopParam *loop, const MNN::Op *op, Backend *bn);
virtual ~LoopBatchMatMulExecution() = default;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
const LoopParam *mLoop;
std::vector<Tensor *> mTensors;
std::vector<cl::Buffer*> mTmpBuffers;
std::vector<cl::Buffer*> mOffsetBuffers;
int mOffset[4];
int mStep[4];
int mIter[4];
bool mHasBias = false;
bool mTransposeA = false;
bool mTransposeB = false;
std::set<std::string> mBuildOptions;
};
} // namespace OpenCL
} // namespace MNN
#endif /* LoopExecution_hpp */

View File

@ -15,7 +15,7 @@
namespace MNN {
namespace OpenCL {
MultiInputDWConvExecution::MultiInputDWConvExecution(const MNN::Op *op, Backend *backend) : CommonExecution(backend) {
MultiInputDWConvExecution::MultiInputDWConvExecution(const MNN::Op *op, Backend *backend) : CommonExecution(backend, op) {
auto common = op->main_as_Convolution2D()->common();
mPadMode = common->padMode();
mStrides = {common->strideY(), common->strideX()};
@ -25,8 +25,6 @@ MultiInputDWConvExecution::MultiInputDWConvExecution(const MNN::Op *op, Backend
}
isRelu = common->relu();
isRelu6 = common->relu6();
mOp = op;
mOpType = op->type();
}
MultiInputDWConvExecution::~MultiInputDWConvExecution() {

View File

@ -13,7 +13,7 @@
namespace MNN {
namespace OpenCL {
MultiInputDWDeconvExecution::MultiInputDWDeconvExecution(const MNN::Op *op, Backend *backend) : CommonExecution(backend) {
MultiInputDWDeconvExecution::MultiInputDWDeconvExecution(const MNN::Op *op, Backend *backend) : CommonExecution(backend, op) {
auto common = op->main_as_Convolution2D()->common();
mStrides = {common->strideY(), common->strideX()};
@ -30,8 +30,6 @@ MultiInputDWDeconvExecution::MultiInputDWDeconvExecution(const MNN::Op *op, Back
isRelu = common->relu();
isRelu6 = common->relu6();
mOp = op;
mOpType = op->type();
}
MultiInputDWDeconvExecution::~MultiInputDWDeconvExecution() {

View File

@ -17,10 +17,8 @@ namespace OpenCL {
RasterExecution::RasterExecution(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend)
: CommonExecution(backend) {
: CommonExecution(backend, op) {
mOpenCLBackend = (OpenCLBackend *)backend;
mOp = op;
mOpType = op->type();
//nothing to do
}

View File

@ -13,7 +13,7 @@
namespace MNN {
namespace OpenCL {
ReductionExecution::ReductionExecution(const MNN::Op* op, Backend* backend) : CommonExecution(backend) {
ReductionExecution::ReductionExecution(const MNN::Op* op, Backend* backend) : CommonExecution(backend, op) {
#ifdef LOG_VERBOSE
MNN_PRINT("start ReductionExecution init !\n");
#endif
@ -44,7 +44,6 @@ ReductionExecution::ReductionExecution(const MNN::Op* op, Backend* backend) : Co
MNN_ASSERT(false);
break;
}
mOp = op;
#ifdef LOG_VERBOSE
MNN_PRINT("end ReductionExecution init !\n");
#endif
@ -89,7 +88,7 @@ ErrorCode ReductionExecution::onResize(const std::vector<Tensor *> &inputs, cons
break;
}
} else { //useLocal
uint32_t global_x;
uint32_t global_x = 8;
int size = inputShape[1];
if (size >= 1024) {
global_x = 256;
@ -144,6 +143,7 @@ ErrorCode ReductionExecution::onResize(const std::vector<Tensor *> &inputs, cons
mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[0]));
mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[1]));
mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[2]));
mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[3]));
return NO_ERROR;
}

View File

@ -14,7 +14,7 @@ namespace MNN {
namespace OpenCL {
ReluExecution::ReluExecution(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend)
: CommonExecution(backend) {
: CommonExecution(backend, op) {
auto mOpenCLBackend = static_cast<OpenCLBackend *>(backend);
auto mPreluParamPtr = op->main_as_PRelu();
int preluSize = mPreluParamPtr->slopeCount();
@ -50,8 +50,6 @@ ReluExecution::ReluExecution(const std::vector<Tensor *> &inputs, const MNN::Op
mOpenCLBackend->onAcquireBuffer(mPreluParam.get(), Backend::STATIC);
copyBufferToImage(mOpenCLBackend->getOpenCLRuntime(), preluBuffer, openCLImage(mPreluParam.get()),
UP_DIV(preluSize, 4), 1);
mOp = op;
mOpType = op->type();
}
ReluExecution::~ReluExecution() {
backend()->onReleaseBuffer(mPreluParam.get(), Backend::STATIC);

View File

@ -13,8 +13,7 @@
namespace MNN {
namespace OpenCL {
TrainableParamExecution::TrainableParamExecution(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend) : CommonExecution(backend), mOp(op), mInitialized(false) {
mOp = op;
TrainableParamExecution::TrainableParamExecution(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend) : CommonExecution(backend, op), mInitialized(false) {
}
TrainableParamExecution::~TrainableParamExecution() {

View File

@ -84,7 +84,41 @@ public:
}
return true;
}
static bool compressFloatWeightToSparse(MNN::OpT* op) {
auto opType = op->type;
auto param = op->main.AsConvolution2D();
if (param->sparseParameter.get() == nullptr) {
return false;
}
// Encode for sparse float weight
size_t weightSize = param->weight.size();
if (weightSize > std::numeric_limits<uint32_t>().max()) {
MNN_ERROR("The weightSize exceed uint32_t, can't compress the sparse weight\n");
return false;
}
param->quanParameter.reset(new IDSTQuanT);
size_t validSize = 0;
std::vector<uint32_t> indexes;
std::vector<float> newWeights;
for (size_t i=0; i<weightSize; ++i) {
if (param->weight[i] != 0.0f) {
indexes.emplace_back(i);
newWeights.emplace_back(param->weight[i]);
}
}
// If empty, add a single weight to avoid errors; the runtime can't extract a fully sparse convolution
if (indexes.empty()) {
indexes.emplace_back(0);
newWeights.emplace_back(0.0f);
}
param->weight.clear();
param->quanParameter->alpha = std::move(newWeights);
param->quanParameter->weightSize = (uint32_t)weightSize;
param->quanParameter->index = std::move(indexes);
return true;
}
};
} // namespace MNN
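`compressFloatWeightToSparse` keeps only the non-zero weights: their values go into `quanParameter->alpha`, their flat positions into `quanParameter->index`, and `weightSize` records the dense length so the tensor can be rebuilt (the expansion lives in `ConvolutionCommon::load` further below). A minimal round-trip sketch of that encoding, using plain structs instead of the flatbuffer types:

```cpp
#include <cstdint>
#include <vector>

// Plain-struct stand-in for the fields used above:
// alpha = non-zero values, index = their flat positions, weightSize = dense length.
struct SparseWeights {
    std::vector<float> alpha;
    std::vector<uint32_t> index;
    uint32_t weightSize = 0;
};

SparseWeights encode(const std::vector<float>& dense) {
    SparseWeights s;
    s.weightSize = (uint32_t)dense.size();
    for (uint32_t i = 0; i < dense.size(); ++i) {
        if (dense[i] != 0.0f) {
            s.index.push_back(i);
            s.alpha.push_back(dense[i]);
        }
    }
    // Keep at least one entry so a fully sparse convolution still decodes.
    if (s.index.empty()) {
        s.index.push_back(0);
        s.alpha.push_back(0.0f);
    }
    return s;
}

std::vector<float> decode(const SparseWeights& s) {
    std::vector<float> dense(s.weightSize, 0.0f);   // zeros everywhere ...
    for (size_t i = 0; i < s.index.size(); ++i)
        dense[s.index[i]] = s.alpha[i];             // ... except the recorded positions
    return dense;
}
```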

View File

@ -256,6 +256,10 @@ public:
virtual int onGetRuntimeStatus(RuntimeStatus statusEnum) const {
return 0;
}
// If the info the user set can't be matched by the runtime, return false and set the real info
virtual bool onCheckInfo(Backend::Info& info) const {
return true;
}
struct OpInfo {
bool initCostLong;
float exeutionCost; // In ms

View File

@ -8,12 +8,13 @@
#include "ConvolutionCommon.hpp"
#include <math.h>
#include "backend/cpu/compute/CommonOptFunction.h"
#include "half.hpp"
namespace MNN {
static inline void *MNNMemoryAllocAlignZeroAlign(size_t size) {
return MNNMemoryCallocAlign(size, MNN_MEMORY_ALIGN_DEFAULT);
}
static int ReadBlobDim(unsigned char *&myfile, unsigned short *shape, int shapeBufCnt) {
static int ReadBlobDim(unsigned char *&myfile, unsigned int* shape, int shapeBufCnt, bool useInt32) {
int uSize = myfile[0];
myfile++;
if (uSize > 4) {
@ -24,8 +25,16 @@ static int ReadBlobDim(unsigned char *&myfile, unsigned short *shape, int shapeB
if (copyLength > shapeBufCnt) {
copyLength = shapeBufCnt;
}
::memcpy(shape, myfile, sizeof(unsigned short) * copyLength);
myfile += copyLength * sizeof(unsigned short);
if (useInt32) {
::memcpy(shape, myfile, sizeof(unsigned int) * copyLength);
myfile += copyLength * sizeof(unsigned int);
} else {
auto myfileint16 = (uint16_t*)myfile;
for (int i=0; i<copyLength; ++i) {
shape[i] = myfileint16[i];
}
myfile += copyLength * sizeof(unsigned short);
}
return copyLength;
}
@ -176,18 +185,17 @@ static void StreamSizeRead(void *dst, int unit, size_t count, unsigned char *&fi
file += (unit * count);
}
static int8_t *ReadQuanData_c(unsigned char *&s, uint32_t *len) {
static int8_t *ReadQuanData_c(unsigned char *&s, size_t* len, ConvolutionCommon::Int8Common* result, bool shapeInt32) {
int8_t *blob = nullptr;
int8_t *samples = nullptr;
uint8_t *idxBuf = nullptr;
uint8_t *idxBytes = nullptr;
uint32_t dataCnt = 1;
do {
// blob shape
unsigned short shape[64] = {0};
uint32_t shapeDim = (uint32_t)ReadBlobDim(s, shape, 64);
if (shapeDim == 0 || shapeDim > 64)
unsigned int shape[32] = {0};
uint32_t shapeDim = (uint32_t)ReadBlobDim(s, shape, 32, shapeInt32);
if (shapeDim == 0 || shapeDim > 32)
break;
for (uint32_t i = 0; i < shapeDim; i++)
dataCnt *= shape[i];
@ -198,7 +206,8 @@ static int8_t *ReadQuanData_c(unsigned char *&s, uint32_t *len) {
if (0 == sampleCnt) {
sampleCnt = 256;
}
samples = (int8_t *)MNNMemoryAllocAlignZeroAlign(sampleCnt);
result->weightMap.resize(sampleCnt);
auto samples = result->weightMap.data();
if (samples == nullptr)
break;
StreamSizeRead(samples, 1, sampleCnt, s);
@ -238,8 +247,6 @@ static int8_t *ReadQuanData_c(unsigned char *&s, uint32_t *len) {
}
} while (0);
if (samples != nullptr)
MNNMemoryFreeAlign(samples);
if (idxBuf != nullptr)
MNNMemoryFreeAlign(idxBuf);
if (idxBytes != nullptr)
@ -249,9 +256,9 @@ static int8_t *ReadQuanData_c(unsigned char *&s, uint32_t *len) {
return blob;
}
static int8_t *ReadSparseQuanData_c(unsigned char *&myfile, uint32_t *len, const flatbuffers::Vector<float> *alpha) {
static int8_t *ReadSparseQuanData_c(unsigned char *&myfile, size_t* len, const flatbuffers::Vector<float> *alpha, ConvolutionCommon::Int8Common* result, bool useInt32) {
// MNN_ERROR("sparse:%d\n", 1);
unsigned short shape[64] = {0};
unsigned int shape[32];
uint32_t ucMapSize = 0;
PSIMPLE_SET setWeight = CreateSimpleSet(256);
if (setWeight == nullptr) {
@ -262,8 +269,8 @@ static int8_t *ReadSparseQuanData_c(unsigned char *&myfile, uint32_t *len, const
unsigned char iIdxNeedBits;
int8_t *blob = nullptr;
// 1. weights blob shape(unsigned int32)
int ShapeDim = ReadBlobDim(myfile, shape, 64);
int Size = sizeof(int8_t);
int ShapeDim = ReadBlobDim(myfile, shape, 32, useInt32);
size_t Size = sizeof(int8_t);
for (int i = 0; i < ShapeDim; i++)
Size *= shape[i];
blob = (int8_t *)MNNMemoryAllocAlignZeroAlign((size_t)Size);
@ -295,11 +302,13 @@ static int8_t *ReadSparseQuanData_c(unsigned char *&myfile, uint32_t *len, const
if (0 == ucMapSize) {
ucMapSize = 256;
}
result->weightMap.resize(ucMapSize);
// 6. valueset(signed char * valueset_size)
for (int i = 0; i < ucMapSize; i++) {
int8_t tmp;
StreamSizeRead(&tmp, 1, 1, myfile);
InsertSimpleSet(setWeight, tmp);
result->weightMap[i] = tmp;
}
SimpleRank(setWeight->UniSet, setWeight->CurUniCnt, 1);
// map<unsigned char, signed char> mapWeight;
@ -367,14 +376,61 @@ static int8_t *ReadSparseQuanData_c(unsigned char *&myfile, uint32_t *len, const
}
std::shared_ptr<ConvolutionCommon::Int8Common> ConvolutionCommon::load(const IDSTQuan *quan, bool forceFloat, bool forceInt8) {
auto result = std::make_shared<Int8Common>();
uint32_t weightLength = 0;
result->quan = quan;
if (quan->index() != nullptr) {
if (forceFloat) {
// Expand sparse to dense
result->weightFloat.reset(quan->weightSize());
if (nullptr == result->weightFloat.get()) {
return nullptr;
}
::memset(result->weightFloat.get(), 0, quan->weightSize() * sizeof(float));
auto index = quan->index()->data();
auto indexSize = quan->index()->size();
if (nullptr == quan->alpha() || quan->alpha()->size() != indexSize) {
MNN_ERROR("The model is error, don't has alpha but has index\n");
return nullptr;
}
auto weightRaw = quan->alpha()->data();
for (uint32_t i=0; i<indexSize; ++i) {
result->weightFloat.get()[index[i]] = weightRaw[i];
}
} // Otherwise no treatment is needed, just return the result with the quan info
return result;
}
size_t weightLength = 0;
int8_t *buffer = nullptr;
auto originBuffer = (unsigned char *)quan->buffer()->data();
if (1 == quan->type()) {
buffer = ReadQuanData_c(originBuffer, &weightLength);
buffer = ReadQuanData_c(originBuffer, &weightLength, result.get(), quan->shapeInt32());
}
if (2 == quan->type()) {
buffer = ReadSparseQuanData_c(originBuffer, &weightLength, quan->alpha());
buffer = ReadSparseQuanData_c(originBuffer, &weightLength, quan->alpha(), result.get(), quan->shapeInt32());
}
if (result->weightMap.size() > 0 && result->weightMap.size() <= 16) {
// Compute Remap for int4
result->canUseInt4 = true;
result->weightReverseMap.resize(256);
::memset(result->weightReverseMap.data(), 0, 256 * sizeof(int8_t));
for (int i=0; i<result->weightMap.size(); ++i) {
int value = result->weightMap[i];
value = value + 128;
result->weightReverseMap[value] = i;
}
#ifdef MNN_TEST_REMAPQUANT
// Test reverse
std::vector<int8_t> originBuffer(weightLength);
for (int i=0; i<weightLength; ++i) {
originBuffer[i] = buffer[i];
buffer[i] = result->weightReverseMap[(int)buffer[i] + 128];
}
for (int i=0; i<weightLength; ++i) {
buffer[i] = result->weightMap[buffer[i]];
}
for (int i=0; i<weightLength; ++i) {
MNN_ASSERT(buffer[i] == originBuffer[i]);
}
#endif
}
// read fp16 data
if (3 == quan->type()) {
@ -406,13 +462,41 @@ std::shared_ptr<ConvolutionCommon::Int8Common> ConvolutionCommon::load(const IDS
}
result->weight.set(buffer, weightLength);
}
result->quan = quan;
result->alpha.reset(quan->alpha()->size());
if (nullptr == result->alpha.get()) {
MNN_PRINT("Alloc memory error for extract idst int8\n");
return nullptr;
}
::memcpy(result->alpha.get(), quan->alpha()->data(), quan->alpha()->size() * sizeof(float));
{
int outputCount = 0;
bool oldType4 = (quan->type() == 4 && quan->aMin() == 0 && std::abs(quan->quantScale()) < 1e-6);
if (quan->readType() != 0 || oldType4) {
result->asymmetric = true;
outputCount = result->alpha.size() / 2;
} else {
result->asymmetric = false;
outputCount = result->alpha.size(); // backward compatibility with previous symmetric quantization
}
if (result->asymmetric) {
// clampMin is minVal in asymmetric quant, clampMin = -(2^(bit))
// and old version clampMin is -128
float clampMin = quan->aMin() == 0 ? -128 : quan->aMin();
for (int o = 0; o < outputCount; ++o) {
result->alpha.get()[2 * o] = result->alpha.get()[2 * o] - clampMin * result->alpha.get()[2 * o + 1];
}
}
if (!quan->has_scaleInt()) {
float extraFactor = quan->quantScale();
// for old type 4 models, quan->quantScale is 0, which would introduce a bug here
if (oldType4) {
extraFactor = 1.0f;
}
for (int o=0; o<result->alpha.size(); ++o) {
result->alpha.get()[o] *= extraFactor;
}
}
}
if (forceInt8) {
return result;
}
@ -424,42 +508,30 @@ std::shared_ptr<ConvolutionCommon::Int8Common> ConvolutionCommon::load(const IDS
return nullptr;
}
int outputCount = 0;
bool oldType4 = (quan->type() == 4 && quan->aMin() == 0 && std::abs(quan->quantScale()) < 1e-6);
if (quan->readType() != 0 || oldType4) {
outputCount = result->alpha.size() / 2;
if (result->asymmetric) {
outputCount = result->alpha.size() / 2;
} else {
outputCount = result->alpha.size(); // backward compability with previous symmetric quantization
outputCount = result->alpha.size();
}
int partWeightSize = weightLength / outputCount;
for (int o = 0; o < outputCount; ++o) {
float min = 0.0f;
float alpha = 0.0f;
if (result->asymmetric) {
min = result->alpha.get()[2*o];
alpha = result->alpha.get()[2*o+1];
} else {
alpha = result->alpha.get()[o];
}
auto dstW = result->weightFloat.get() + o * partWeightSize;
auto srcW = result->weight.get() + o * partWeightSize;
float extraFactor = quan->quantScale();
// for old type 4 models, their quan->quantScale is 0. which will introduce a bug here
if (oldType4) {
extraFactor = 1.0f;
}
if (result->alpha.size() == 2 * outputCount) {
float min = result->alpha.get()[2*o];
float alpha = result->alpha.get()[2*o+1];
// clampMin is minVal in asymmetric quant, clampMin = -(2^(bit))
// and old version clampMin is -128
float clampMin = quan->aMin() == 0 ? -128 : quan->aMin();
for (int j = 0; j < partWeightSize; ++j) {
dstW[j] = (( (float)srcW[j] - clampMin ) * alpha + min) * extraFactor;
}
} else {
float alpha = result->alpha.get()[o];
for (int j = 0; j < partWeightSize; ++j) {
dstW[j] = ((float)srcW[j]) * alpha * extraFactor;
}
for (int v=0; v < partWeightSize; ++v) {
dstW[v] = (float)srcW[v] * alpha + min;
}
}
result->weight.release();
result->alpha.release();
}
return result;
}
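In the asymmetric path above, `alpha` stores `{min, scale}` pairs per output channel; the loader now folds `-clampMin * scale` into `min` up front, and, when the scale is not already applied (`has_scaleInt()` is false), multiplies every `alpha` entry by `quantScale`, so the per-weight dequantization at the end reduces to `w_float = w_int8 * scale + min`. A small C++ sketch of those two steps under the same assumptions (clampMin defaults to -128 for old models):

```cpp
#include <cstdint>
#include <vector>

// Fold the asymmetric offset and the global quantScale into alpha once,
// mirroring the block added inside ConvolutionCommon::load above.
void foldAlpha(std::vector<float>& alpha, bool asymmetric, float clampMin, float quantScale) {
    if (asymmetric) {
        // alpha = {min0, scale0, min1, scale1, ...}; min' = min - clampMin * scale.
        for (size_t o = 0; o + 1 < alpha.size(); o += 2)
            alpha[o] -= clampMin * alpha[o + 1];
    }
    for (float& a : alpha) a *= quantScale; // skipped in the real code when has_scaleInt()
}

// After the fold, dequantization is a single fused multiply-add per weight.
float dequant(int8_t w, float scale, float min) {
    return (float)w * scale + min; // matches dstW[v] = (float)srcW[v] * alpha + min
}
```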

View File

@ -19,6 +19,10 @@ public:
AutoStorage<float> alpha;
AutoStorage<float> weightFloat;
const IDSTQuan* quan;
bool asymmetric;
std::vector<int8_t> weightMap;
std::vector<uint8_t> weightReverseMap;
bool canUseInt4 = false;
};
static std::shared_ptr<Int8Common> load(const IDSTQuan* quan, bool forceFloat = false, bool forceInt8 = false);
static void getConvParameters(std::shared_ptr<ConvolutionCommon::Int8Common> *quanCommon, const MNN::Convolution2D *conv2d, const float** originWeight, int* originWeightSize);

View File

@ -189,6 +189,7 @@ Pipeline::Pipeline(Schedule::PipelineInfo&& info, bool allocInput, bool outputSt
#else
{
#endif
rt->onCheckInfo(info.first.info);
mRuntime = rt;
mCpuRuntime = cpuRt;
mTuneAttr = tune;

View File

@ -266,7 +266,16 @@ bool Session::getInfo(Interpreter::SessionInfoCode code, void* ptr) const {
} else {
*dst = 0;
}
return true;
} break;
case Interpreter::THREAD_NUMBER: {
auto dst = (int*)ptr;
if (mPipelines.empty()) {
break;
}
*dst = mPipelines[0]->getPipelineInfo().first.info.numThread;
return true;
}
// TODO: Support other debug info
default:
break;

View File

@ -399,17 +399,21 @@ bool TensorUtils::isDepthToSpaceRegions(const Tensor* output) {
}
// compute offset through region
static inline int offsetCompute(Tensor::InsideDescribe::Region reg, int offset, bool backward) {
static inline int offsetCompute(const Tensor::InsideDescribe::Region& reg, int offset, bool backward) {
Tensor::InsideDescribe::View src;
Tensor::InsideDescribe::View dst;
if (backward) {
auto tmp = reg.src;
reg.src = reg.dst;
reg.dst = tmp;
src = reg.dst;
dst = reg.src;
} else {
src = reg.src;
dst = reg.dst;
}
int res = 0;
for (int i = 0; i < 3; i++) {
if (reg.size[i] > 1) {
res += offset / reg.src.stride[i] * reg.dst.stride[i];
offset %= reg.src.stride[i];
res += offset / src.stride[i] * dst.stride[i];
offset %= src.stride[i];
}
}
return res;
@ -461,6 +465,23 @@ bool TensorUtils::refTensorContent(Tensor* dst, const Tensor* src) {
return needMalloc;
}
static bool _RegionValid(int* stride, int offset, int* size, int sizeNum, size_t limitSize) {
int maxOffset = offset;
int minOffset = offset;
// Check start and end
for (int i=0; i<sizeNum; ++i) {
if (stride[i] > 0) {
maxOffset += (stride[i] * (size[i] - 1));
} else {
minOffset += (stride[i] * (size[i] - 1));
}
}
if (minOffset < 0 || maxOffset >= limitSize) {
return false;
}
return true;
}
// fuse srcRegion and dstRegion to dstRegion if return true
bool TensorUtils::fuseRegion(Tensor::InsideDescribe::Region& srcReg, Tensor::InsideDescribe::Region& dstReg) {
// src data is not the full data of dst
@ -573,6 +594,14 @@ bool TensorUtils::fuseRegion(Tensor::InsideDescribe::Region& srcReg, Tensor::Ins
}
// set final size and set expandIdx if expand val is 1
int expandIdx = -1;
int newSrcOffset = offsetCompute(srcReg, dstReg.src.offset - srcReg.dst.offset, true) + srcReg.src.offset;
if (nullptr != srcReg.origin) {
bool valid = _RegionValid(newSrc, newSrcOffset, dstSize, dstNum, TensorUtils::getRawSize(srcReg.origin));
if (!valid) {
// Exceed src range
return false;
}
}
if (dstNum > sizeNum) {
for (int i = 2; i >= 0; i--) {
if (i < dstNum) {
@ -654,7 +683,7 @@ bool TensorUtils::fuseRegion(Tensor::InsideDescribe::Region& srcReg, Tensor::Ins
}
}
dstReg.origin = srcReg.origin;
dstReg.src.offset = offsetCompute(srcReg, dstReg.src.offset - srcReg.dst.offset, true) + srcReg.src.offset;
dstReg.src.offset = newSrcOffset;
return true;
}
void TensorUtils::adjustTensorForCompability(Tensor* newTensor) {
@ -680,70 +709,6 @@ Tensor::DimensionType TensorUtils::getDimType(const Tensor* t) {
return Tensor::TENSORFLOW;
}
halide_type_t TensorUtils::DataTypeToHalideType(DataType t) {
switch (t) {
case DataType_DT_DOUBLE:
case DataType_DT_FLOAT:
return halide_type_of<float>();
case DataType_DT_BFLOAT16:
return halide_type_t(halide_type_float, 16);
case DataType_DT_QINT32:
case DataType_DT_INT32:
case DataType_DT_BOOL:
case DataType_DT_INT64:
return halide_type_of<int32_t>();
case DataType_DT_QINT8:
case DataType_DT_INT8:
return halide_type_of<int8_t>();
case DataType_DT_QUINT8:
case DataType_DT_UINT8:
return halide_type_of<uint8_t>();
case DataType_DT_QUINT16:
case DataType_DT_UINT16:
return halide_type_of<uint16_t>();
case DataType_DT_QINT16:
case DataType_DT_INT16:
return halide_type_of<int16_t>();
case DataType_DT_STRING:
default:
MNN_PRINT("Unsupported data type!");
MNN_ASSERT(false);
return halide_type_of<float>();
}
}
DataType TensorUtils::HaildeTypeToDataType(halide_type_t t) {
if (t == halide_type_of<int8_t>()) {
return DataType_DT_INT8;
}
if (t == halide_type_of<int16_t>()) {
return DataType_DT_INT16;
}
if (t == halide_type_of<int32_t>()) {
return DataType_DT_INT32;
}
if (t == halide_type_of<int64_t>()) {
return DataType_DT_INT64;
}
if (t == halide_type_of<uint8_t>()) {
return DataType_DT_UINT8;
}
if (t == halide_type_of<uint16_t>()) {
return DataType_DT_UINT16;
}
if (t == halide_type_t(halide_type_float, 16)) {
return DataType_DT_BFLOAT16;
}
if (t == halide_type_of<float>()) {
return DataType_DT_FLOAT;
}
if (t == halide_type_of<double>()) {
return DataType_DT_DOUBLE;
}
MNN_PRINT("Unsupported data type!");
MNN_ASSERT(false);
return DataType_DT_INVALID;
}
std::vector<float> TensorUtils::getQuantInfo(const Tensor* t) {
float scale = getDescribe(t)->quantAttr ? getDescribe(t)->quantAttr->scale : 0.0f;
float zero = getDescribe(t)->quantAttr ? getDescribe(t)->quantAttr->zero : 0.0f;
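The new `_RegionValid` check in `fuseRegion` walks each dimension of the candidate source view and accumulates the most negative and most positive reachable offsets; the fusion is rejected if that range falls outside the origin tensor's raw size. A minimal sketch of the same bound computation:

```cpp
#include <cstddef>

// Reject a strided view whose reachable offsets leave [0, limit).
bool regionInRange(const int* stride, int offset, const int* size, int dims, size_t limit) {
    int maxOff = offset, minOff = offset;
    for (int i = 0; i < dims; ++i) {
        int span = stride[i] * (size[i] - 1);   // farthest step along this dimension
        if (span > 0) maxOff += span; else minOff += span;
    }
    return minOff >= 0 && (size_t)maxOff < limit;
}
```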

View File

@ -163,8 +163,6 @@ public:
static bool fuseRegion(Tensor::InsideDescribe::Region& srcReg, Tensor::InsideDescribe::Region& dstReg);
static void adjustTensorForCompability(Tensor* t);
static Tensor::DimensionType getDimType(const Tensor* t);
static halide_type_t DataTypeToHalideType(DataType t);
static DataType HaildeTypeToDataType(halide_type_t t);
static std::vector<float> getQuantInfo(const Tensor* t);
static size_t getRawSize(const Tensor* t);

View File

@ -6,6 +6,7 @@
// Copyright © 2018, Alibaba Group Holding Limited
//
#include <algorithm>
#include "geometry/GeometryComputer.hpp"
#include "core/TensorUtils.hpp"
namespace MNN {
@ -21,7 +22,6 @@ public:
MNN_ASSERT(input->dimensions() >= 1);
MNN_ASSERT(output->dimensions() == input->dimensions());
auto originTensor = input;
int basicOffset = 0;
int shape[MNN_MAX_TENSOR_DIM];
if (op->type() == OpType_Permute) {
auto shapeValue = op->main_as_Permute()->dims();
@ -53,6 +53,7 @@ public:
continue;
}
if (axis - preAxis == 1) {
// Fuse dimension if possible
inputShape[inputShapeSize - 1] *= len;
} else {
if (preAxis >= 0) {
@ -89,7 +90,18 @@ public:
stride *= inputShape[i];
}
}
int basicStride = 1;
// Sort inputShape from small to large
if (inputShapeSize > 3) {
for (int i=0; i<inputShapeSize; ++i) {
for (int j=i+1; j<inputShapeSize; ++j) {
if (inputShape[i] > inputShape[j]) {
std::swap(inputShape[i], inputShape[j]);
std::swap(inputStrides[i], inputStrides[j]);
std::swap(outputStrides[i], outputStrides[j]);
}
}
}
}
// Compute inside, outside, axis
int inside = 1;
int insideStride = 0;
@ -99,18 +111,24 @@ public:
int axisStride = 0;
int breakAxis = -1;
int remainSize = 1;
int outputInsideStride = 0;
int outputAxisStride = 0;
int outputOutsideStride = 0;
{
if (inputShapeSize >= 1) {
inside = inputShape[inputShapeSize-1];
insideStride = inputStrides[inputShapeSize-1];
outputInsideStride = outputStrides[inputShapeSize-1];
}
if (inputShapeSize >= 2) {
axis = inputShape[inputShapeSize-2];
axisStride = inputStrides[inputShapeSize-2];
outputAxisStride = outputStrides[inputShapeSize-2];
}
if (inputShapeSize >= 3) {
outside = inputShape[inputShapeSize-3];
outsideStride = inputStrides[inputShapeSize-3];
outputOutsideStride = outputStrides[inputShapeSize-3];
breakAxis = inputShapeSize - 3;
for (int i = 0; i < inputShapeSize - 3; ++i) {
remainSize *= inputShape[i];
@ -130,24 +148,26 @@ public:
for (int indice = 0; indice < remainSize; ++indice) {
int value = indice;
int inputOffset = 0;
int outputOffset = 0;
for (int i = 0; i < breakAxis; ++i) {
auto coordinate = value / mod[i];
inputOffset += coordinate * inputStrides[i];
outputOffset += coordinate * outputStrides[i];
value = value % mod[i];
}
Tensor::InsideDescribe::Region& slice = outputDes->regions[indice];
slice.src.offset = inputOffset + basicOffset;
slice.src.stride[0] = outsideStride * basicStride;
slice.src.offset = inputOffset;
slice.src.stride[0] = outsideStride;
slice.size[0] = outside;
slice.src.stride[1] = axisStride * basicStride;
slice.src.stride[1] = axisStride;
slice.size[1] = axis;
slice.src.stride[2] = insideStride * basicStride;
slice.src.stride[2] = insideStride;
slice.size[2] = inside;
slice.origin = originTensor;
slice.dst.offset = indice * outside * axis * inside;
slice.dst.stride[0] = axis * inside;
slice.dst.stride[1] = inside;
slice.dst.stride[2] = 1;
slice.dst.offset = outputOffset;
slice.dst.stride[0] = outputOutsideStride;
slice.dst.stride[1] = outputAxisStride;
slice.dst.stride[2] = outputInsideStride;
}
return true;
}
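The permute geometry now writes both source and destination strides into each region (after sorting the dimensions so the three kept as outside/axis/inside are the largest), so a transpose becomes a plain strided copy over at most three nested loops per region. An illustrative C++ version of what applying one such region means, not MNN's actual `Region` struct:

```cpp
// dst[dstOff + i*ds0 + j*ds1 + k*ds2] = src[srcOff + i*ss0 + j*ss1 + k*ss2]
// for one region with sizes {size0, size1, size2}.
void applyRegion(float* dst, const float* src,
                 int size0, int size1, int size2,
                 int srcOff, int ss0, int ss1, int ss2,
                 int dstOff, int ds0, int ds1, int ds2) {
    for (int i = 0; i < size0; ++i)
        for (int j = 0; j < size1; ++j)
            for (int k = 0; k < size2; ++k)
                dst[dstOff + i * ds0 + j * ds1 + k * ds2] =
                    src[srcOff + i * ss0 + j * ss1 + k * ss2];
}
```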

View File

@ -67,6 +67,8 @@ public:
};
for (int i = 0; i < N; i++) {
Region src, dst;
src.origin = nullptr;
dst.origin = nullptr;
::memcpy(&src, data[3 * i], 44);
::memcpy(&dst, data[3 * i + 1], 44);
bool fused = TensorUtils::fuseRegion(src, dst);

View File

@ -68,6 +68,7 @@ public:
}
virtual bool run(int precision) {
int numberThread = 0;
MNN::BackendConfig bnConfig;
auto exe = Executor::newExecutor(MNN_FORWARD_CPU, bnConfig, 1);
ExecutorScope scope(exe);
@ -77,10 +78,31 @@ public:
auto y = _ReduceSum(_Multiply(x, x), {});
::memset(x->writeMap<float>(), 0, x->getInfo()->size * sizeof(float));
y->readMap<float>();
auto res = Executor::getComputeInfo(y->expr().first, MNN::Interpreter::THREAD_NUMBER, &numberThread);
if (numberThread != 4 || res == false) {
FUNC_PRINT(1);
return false;
}
exe->setGlobalExecutorConfig(MNN_FORWARD_CPU, bnConfig, 4);
::memset(x->writeMap<float>(), 0, x->getInfo()->size * sizeof(float));
y->readMap<float>();
res = Executor::getComputeInfo(y->expr().first, MNN::Interpreter::THREAD_NUMBER, &numberThread);
if (numberThread != 4 || res == false) {
FUNC_PRINT(1);
return false;
}
exe->setGlobalExecutorConfig(MNN_FORWARD_CPU, bnConfig, 1);
// Reset x, y
x = _Input({1, 3, 224, 224}, NC4HW4);
y = _ReduceSum(_Multiply(x, x), {});
::memset(x->writeMap<float>(), 0, x->getInfo()->size * sizeof(float));
y->readMap<float>();
res = Executor::getComputeInfo(y->expr().first, MNN::Interpreter::THREAD_NUMBER, &numberThread);
if (numberThread != 1 || res == false) {
FUNC_PRINT(1);
return false;
}
return true;
}
};

View File

@ -689,9 +689,18 @@ public:
auto bufferOutput = builderOutput.GetBufferPointer();
std::shared_ptr<Interpreter> net(Interpreter::createFromBuffer((void*)bufferOutput, sizeOutput), Interpreter::destroy);
ScheduleConfig config;
config.numThread = 1;
int runTime = 5;
auto s0 = net->createSession(config);
{
AUTOTIME;
for (int t = 0; t < runTime; ++t) {
net->runSession(s0);
}
}
net->releaseSession(s0);
config.numThread = 4;
auto s1 = net->createSession(config);
int runTime = 10;
{
AUTOTIME;
for (int t = 0; t < runTime; ++t) {
@ -699,7 +708,6 @@ public:
}
}
net->releaseSession(s1);
net = nullptr;
std::vector<std::thread> allThreads;
for (int i = 0; i < 4; ++i) {
allThreads.emplace_back(std::thread([runTime, i, bufferOutput, sizeOutput] {
@ -722,6 +730,31 @@ public:
for (auto& t : allThreads) {
t.join();
}
for (int i=0; i<3; ++i) {
auto rt = Interpreter::createRuntime({config});
auto s0 = net->createSession(config, rt);
auto s1 = net->createSession(config, rt);
int numberThread = 0;
net->getSessionInfo(s0, MNN::Interpreter::THREAD_NUMBER, &numberThread);
if (numberThread != 4) {
FUNC_PRINT(i);
return false;
}
net->getSessionInfo(s1, MNN::Interpreter::THREAD_NUMBER, &numberThread);
if (numberThread != 4) {
FUNC_PRINT(i);
return false;
}
{
AUTOTIME;
for (int t = 0; t < runTime; ++t) {
net->runSession(s0);
}
}
net->releaseSession(s0);
net->releaseSession(s1);
}
return true;
}
virtual bool run(int precision) {

View File

@ -42,6 +42,7 @@ int main(int argc, char* argv[]) {
MNN::BackendConfig config;
config.precision = (MNN::BackendConfig::PrecisionMode)precision;
MNN::Express::Executor::getGlobalExecutor()->setGlobalExecutorConfig(type, config, thread);
FUNC_PRINT(thread);
precisionInTestUtil = getTestPrecision(type, config.precision, MNN::Express::Executor::getGlobalExecutor()->getCurrentRuntimeStatus(MNN::STATUS_SUPPORT_FP16));
MNN_PRINT("After update, precision in TestUtil:%d\n", precisionInTestUtil);
}

View File

@ -19,7 +19,8 @@ static void fillFloat(float* dst, int h, int w, ConvertFP32 functor, float offse
for (int y = 0; y < h; ++y) {
auto dstY = dst + w * y;
for (int x = 0; x < w; ++x) {
dstY[x] = functor((float)x * 0.1f + (float)y + offset);
int temp = (x + y) % 31;
dstY[x] = functor(((float)temp + offset) * 0.01f);
}
}
}
@ -38,7 +39,7 @@ static bool checkMatMul(const float* C, const float* A, const float* B, int e, i
}
expected = functor(expected);
auto diff = fabsf(expected - computed);
if (diff > 0.1f) {
if (diff / fabsf(expected) > 0.005f) {
MNN_PRINT("%f -> %f\n", expected, computed);
res = false;
}
@ -270,6 +271,50 @@ public:
}
}
}
// BatchMatMul batch = 1 with large K
{
std::vector<std::vector<int>> values = {
{16, 262144, 15},
{3, 262144, 16}
};
for(auto value : values) {
e = value[0];
l = value[1];
h = value[2];
std::unique_ptr<MNN::OpT> op(new MNN::OpT);
op->type = MNN::OpType_BatchMatMul;
op->main.type = MNN::OpParameter_BatchMatMulParam;
op->main.value = new MNN::BatchMatMulParamT;
auto param = op->main.AsBatchMatMulParam();
param->adjX = false;
param->adjY = true;
int batch = 1;
auto x0 = _Input({}, NHWC, halide_type_of<float>());
auto x1 = _Input({}, NHWC, halide_type_of<float>());
x0->resize({batch, h, l});
x1->resize({batch, l, e});
auto x0Ptr = x0->writeMap<float>();
auto x1Ptr = x1->writeMap<float>();
for (int b = 0; b < batch; ++b) {
fillFloat(x0Ptr + b * h * l, h, l, FP32Converter[precision], (float)b * 10);
fillFloat(x1Ptr + b * e * l, l, e, FP32Converter[precision], (float)b * 10);
}
auto tranposeB = _Transpose(x1, {0, 2, 1});
auto y = Variable::create(Expr::create(op.get(), {x0, tranposeB}));
auto yPtr = y->readMap<float>();
for (int b = 0; b < batch; ++b) {
auto res = checkMatMul(yPtr + b * e * h, x0Ptr + b * h * l, x1Ptr + b * e * l, e, l, h, FP32Converter[precision]);
if (!res) {
FUNC_PRINT(1);
return false;
}
}
}
}
return true;
}
};

View File

@ -71,6 +71,7 @@ protected:
for (int i = 0; i < size_out; ++i) {
auto error = (int32_t)data_out[i] - (int32_t)gotOutput[i];
if (error * error > 1) {
MNN_PRINT("Error case = %d:\n", i);
MNN_PRINT("%s Test error: compute result=%d, right value=%d\n", name.c_str(), (int32_t)gotOutput[i], (int32_t)data_out[i]);
return false;
}
@ -88,7 +89,7 @@ class AddTest : public BinaryTestCommon {
public:
virtual ~AddTest() = default;
virtual bool run(int precision) {
return test<float, float>(_Add, "AddTest", 0.01,
return test<float, float>(MNN::Express::_Add, "AddTest", 0.01,
{-1.0, -2.0, -3.0, -4.0}, {1.0, 2.0, 3.0, 4.0}, {0.0, 0.0, 0.0, 0.0},
{4}, {4}, {4});
}
@ -101,7 +102,7 @@ class AddInt8Test : public BinaryTestCommon {
vector<float> inp2 = {1.1, 2.2, 3.3, 4.6}, inp1 = {2};
vector<float> rightResult = {3.1, 4.2, 5.3, 6.6};
return test<float, float>(_Add, "AddInt8Test", 0.01, inp1, inp2, rightResult, {1}, {4}, {4}, {0.4, 0.4, 0.4},
return test<float, float>(MNN::Express::_Add, "AddInt8Test", 0.01, inp1, inp2, rightResult, {1}, {4}, {4}, {0.4, 0.4, 0.4},
{0., 0., 0.});
}
};
@ -110,7 +111,7 @@ class SubtractTest : public BinaryTestCommon {
public:
virtual ~SubtractTest() = default;
virtual bool run(int precision) {
return test<float, float>(_Subtract, "SubtractTest", 0.01,
return test<float, float>(MNN::Express::_Subtract, "SubtractTest", 0.01,
{-1.0, -2.0, -3.0, -4.0}, {1.0, 2.0, 3.0, 4.0}, {-2.0, -4.0, -6.0, -8.0},
{4}, {4}, {4});
}
@ -119,11 +120,11 @@ class SubtractInt8Test : public BinaryTestCommon {
public:
virtual ~SubtractInt8Test() = default;
virtual bool run(int precision) {
vector<float> inp1 = {1.1, 2.2, 3.3, 4.6, 1.1, 2.2, 3.3, 4.6,1.1, 2.2, 3.3, 4.6,1.1, 2.2, 3.3, 4.6}, inp2 = {5.7};
vector<float> rightResult = {-4.6, -3.5, -2.4, -1.1, -4.6, -3.5, -2.4, -1.1, -4.6, -3.5, -2.4,
vector<float> inp1 = {7.0, 28.2, 3.3, 4.6, 1.1, 2.2, 3.3, 4.6,1.1, 2.2, 3.3, 4.6,1.1, 2.2, 3.3, 4.6}, inp2 = {5.7};
vector<float> rightResult = {1.3, 22.5, -2.4, -1.1, -4.6, -3.5, -2.4, -1.1, -4.6, -3.5, -2.4,
-1.1, -4.6, -3.5, -2.4, -1.1};
return test<float, float>(_Subtract, "SubtractInt8Test", 0.01, inp1, inp2, rightResult,
return test<float, float>(MNN::Express::_Subtract, "SubtractInt8Test", 0.01, inp1, inp2, rightResult,
{4, 4}, {1}, {4, 4}, {0.4, 0.4, 0.4}, {0., 0., 0.});
}
};
@ -132,7 +133,7 @@ class MultiplyTest : public BinaryTestCommon {
public:
virtual ~MultiplyTest() = default;
virtual bool run(int precision) {
return test<float, float>(_Multiply, "MultiplyTest", 0.01,
return test<float, float>(MNN::Express::_Multiply, "MultiplyTest", 0.01,
{-1.0, -2.0, -3.0, -4.0}, {1.0, 2.0, 3.0, 4.0}, {-1.0, -4.0, -9.0, -16.0},
{4}, {4}, {4});
}
@ -143,7 +144,7 @@ public:
virtual bool run(int precision) {
vector<float> inp1 = {1.1, 2.2, 3.3, 4.6}, inp2 = {5.7, 2.5, 0.25, 0.43};
vector<float> rightResult = {6.27 , 5.5 , 0.825, 1.978};
return test<float, float>(_Multiply, "MultiplyInt8Test", 0.01, inp1, inp2, rightResult,
return test<float, float>(MNN::Express::_Multiply, "MultiplyInt8Test", 0.01, inp1, inp2, rightResult,
{4}, {4}, {4}, {0.4, 0.4, 0.16}, {0., 0., 0.});
}
};
@ -152,7 +153,7 @@ class DivideTest : public BinaryTestCommon {
public:
virtual ~DivideTest() = default;
virtual bool run(int precision) {
return test<float, float>(_Divide, "DivideTest", 0.01,
return test<float, float>(MNN::Express::_Divide, "DivideTest", 0.01,
{-1.0, -2.0, -3.0, -4.0}, {2.0, 4.0, 6.0, 8.0}, {-0.5, -0.5, -0.5, -0.5},
{4}, {4}, {4});
}
@ -163,7 +164,7 @@ public:
virtual bool run(int precision) {
vector<float> inp1 = {1.1, 2.2, 3.3, 4.6}, inp2 = {5.7, 2.5, 2.6, 1.88};
vector<float> rightResult = {0.19298, 0.88, 1.269, 2.4468};
return test<float, float>(_Divide, "DivideInt8Test", 0.01, inp1, inp2, rightResult,
return test<float, float>(MNN::Express::_Divide, "DivideInt8Test", 0.01, inp1, inp2, rightResult,
{4}, {4}, {4}, {0.4, 0.4, 1.0}, {0., 0., 0.});
}
};
@ -173,7 +174,7 @@ public:
virtual ~PowTest() = default;
virtual bool run(int precision) {
float errorScale = precision <= MNN::BackendConfig::Precision_High ? 1 : 10;
return test<float, float>(_Pow, "PowTest", 0.01 * errorScale,
return test<float, float>(MNN::Express::_Pow, "PowTest", 0.01 * errorScale,
{-1.0, -2.0, -3.0, -4.0}, {2.0, 4.0, 6.0, 4.0}, {1.0, 16.0, 729.0, 256.0},
{4}, {4}, {4});
}
@ -182,10 +183,10 @@ class PowInt8Test : public BinaryTestCommon {
public:
virtual ~PowInt8Test() = default;
virtual bool run(int precision) {
vector<float> inp1 = {-1.0, -2.0, -3.0, -4.0}, inp2 = {2.0, 4.0, 2, 4.0};
vector<float> rightResult = {1, 16, 8, 0};
return test<float, float>(_Pow, "PowInt8Test", 0.01, inp1, inp2, rightResult,
{4}, {4}, {4}, {1.0, 1.0, 1.0}, {0., 0., 0.});
vector<float> inp1 = {-1.0, -2.0, -3.0, -4.0}, inp2 = {2.0, 4.0, 3, 4.0};
vector<float> rightResult = {1, 16, -27.0, 256};
return test<float, float>(MNN::Express::_Pow, "PowInt8Test", 0.01, inp1, inp2, rightResult,
{4}, {4}, {4}, {1.0, 1.0, 3.0}, {0., 0., 0.});
}
};
@ -193,7 +194,7 @@ class MinimumTest : public BinaryTestCommon {
public:
virtual ~MinimumTest() = default;
virtual bool run(int precision) {
return test<float, float>(_Minimum, "MinimumTest", 0.01,
return test<float, float>(MNN::Express::_Minimum, "MinimumTest", 0.01,
{-1.0, -2.0, -3.0, -4.0}, {1.0, 2.0, 3.0, 4.0}, {-1.0, -2.0, -3.0, -4.0},
{4}, {4}, {4});
}
@ -204,7 +205,7 @@ public:
virtual bool run(int precision) {
vector<float> inp1 = {-1.2, -5.0, 8, 10}, inp2 = {9.3, 3.1, 11.0, 2.9};
vector<float> rightResult = {-1.2, -5.0, 8, 2.9};
return test<float, float>(_Minimum, "MinimumInt8Test", 0.01, inp1, inp2, rightResult,
return test<float, float>(MNN::Express::_Minimum, "MinimumInt8Test", 0.01, inp1, inp2, rightResult,
{4}, {4}, {4}, {0.4, 0.4, 0.4}, {0., 0., 0.});
}
};
@ -224,7 +225,7 @@ public:
virtual bool run(int precision) {
vector<float> inp1 = {-1, -5, 8, 10}, inp2 = {9};
vector<float> rightResult = {9, 9, 9, 10};
return test<float, float>(_Maximum, "MaximumInt8Test", 0.01, inp1, inp2, rightResult,
return test<float, float>(MNN::Express::_Maximum, "MaximumInt8Test", 0.01, inp1, inp2, rightResult,
{4}, {1}, {4}, {0.4, 0.4, 0.4}, {0., 0., 0.});
}
};
@ -233,7 +234,7 @@ class BiasAddTest : public BinaryTestCommon {
public:
virtual ~BiasAddTest() = default;
virtual bool run(int precision) {
return test<float, float>(_BiasAdd, "BiasAddTest", 0.01,
return test<float, float>(MNN::Express::_BiasAdd, "BiasAddTest", 0.01,
{-1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0},
{1.0, 2.0},
{0.0, 0.0, -2.0, -2.0, -4.0, -4.0, -6.0, -6.0},
@ -244,7 +245,7 @@ class GreaterTest : public BinaryTestCommon {
public:
virtual ~GreaterTest() = default;
virtual bool run(int precision) {
return test<float, int>(_Greater, "GreaterTest", 0,
return test<float, int>(MNN::Express::_Greater, "GreaterTest", 0,
{1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0},
{3.0, 4.0},
{0, 0, 0, 0, 1, 1, 1, 1},
@ -255,7 +256,7 @@ class GreaterEqualTest : public BinaryTestCommon {
public:
virtual ~GreaterEqualTest() = default;
virtual bool run(int precision) {
return test<float, int>(_GreaterEqual, "GreaterEqualTest", 0,
return test<float, int>(MNN::Express::_GreaterEqual, "GreaterEqualTest", 0,
{1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0},
{3.0, 4.0},
{0, 0, 1, 1, 1, 1, 1, 1},
@ -266,7 +267,7 @@ class LessTest : public BinaryTestCommon {
public:
virtual ~LessTest() = default;
virtual bool run(int precision) {
return test<float, int>(_Less, "LessTest", 0,
return test<float, int>(MNN::Express::_Less, "LessTest", 0,
{1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0},
{3.0, 4.0},
{1, 1, 0, 0, 0, 0, 0, 0},
@ -277,7 +278,7 @@ class FloorDivTest : public BinaryTestCommon {
public:
virtual ~FloorDivTest() = default;
virtual bool run(int precision) {
return test<float, float>(_FloorDiv, "FloorDivTest", 0.01,
return test<float, float>(MNN::Express::_FloorDiv, "FloorDivTest", 0.01,
{-1.0, -2.0, -3.0, -4.0, 5.0, 6.0, 7.0, 8.1},
{3.0, 4.0},
{-1.0, -1.0, -1.0, -1.0, 1.0, 1.0, 2.0, 2.0},
@ -290,7 +291,7 @@ public:
virtual bool run(int precision) {
vector<float> inp1 = {-3.98, 17.5, 25.4, 6.7}, inp2 = {3};
vector<float> rightResult = {-2, 5, 8, 2};
return test<float, float>(_FloorDiv, "FloorDivInt8Test", 0.01, inp1, inp2, rightResult,
return test<float, float>(MNN::Express::_FloorDiv, "FloorDivInt8Test", 0.01, inp1, inp2, rightResult,
{4}, {1}, {4}, {0.4, 0.4, 1}, {0., 0., 0.});
}
};
@ -327,7 +328,7 @@ public:
z[i + j * 2] = FP32Converter[precision](fmodf(FP32Converter[precision](x[i+j*2]), FP32Converter[precision](y[i])));
}
}
return test<float, float>(_Mod, "ModTestFloat", 0,
return test<float, float>(MNN::Express::_Mod, "ModTestFloat", 0,
x,y,z,
{4, 2}, {2}, {4, 2});
}
@ -336,7 +337,7 @@ class SquaredDifferenceTest : public BinaryTestCommon {
public:
virtual ~SquaredDifferenceTest() = default;
virtual bool run(int precision) {
return test<float, float>(_SquaredDifference, "SquaredDifferenceTest", 0.01,
return test<float, float>(MNN::Express::_SquaredDifference, "SquaredDifferenceTest", 0.01,
{-1.0, -2.0, -3.0, -4.0, 5.0, 6.0, 7.0, 8.001},
{3.0, 4.0},
{16.0, 36.0, 36.0, 64.0, 4.0, 4.0, 16.0, 16.0},
@ -349,7 +350,7 @@ public:
virtual bool run(int precision) {
vector<float> inp1 = {-1, -2, -3, -4, 5, 6, 7, 8, -1, -2, -3, -4, 5, 6, 7, 8, -1, -2, -3, -4, 5, 6, 7, 8, -1, -2, -3, -4, 5, 6, 7, 8}, inp2 = {3};
vector<float> rightResult = {16, 25, 36, 49, 4, 9, 16, 25, 16, 25, 36, 49, 4, 9, 16, 25, 16, 25, 36, 49, 4, 9, 16, 25, 16, 25, 36, 49, 4, 9, 16, 25};
return test<float, float>(_SquaredDifference, "SquaredDifferenceInt8Test", 0.01, inp1, inp2, rightResult,
return test<float, float>(MNN::Express::_SquaredDifference, "SquaredDifferenceInt8Test", 0.01, inp1, inp2, rightResult,
{8, 4}, {1}, {8, 4}, {1, 1, 1}, {0., 0., 0.});
}
};
@ -358,7 +359,7 @@ class EqualTest : public BinaryTestCommon {
public:
virtual ~EqualTest() = default;
virtual bool run(int precision) {
return test<float, int>(_Equal, "EqualTest", 0,
return test<float, int>(MNN::Express::_Equal, "EqualTest", 0,
{1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0},
{3.0, 4.0},
{0, 0, 1, 1, 0, 0, 0, 0},
@ -380,7 +381,7 @@ class FloorModTest : public BinaryTestCommon {
public:
virtual ~FloorModTest() = default;
virtual bool run(int precision) {
return test<float, float>(_FloorMod, "FloorModTest", 0.01,
return test<float, float>(MNN::Express::_FloorMod, "FloorModTest", 0.01,
{-1.0f, -2.0f, -3.0f, -4.0f, 5.0f, 6.0f, 7.0f, 8.1f},
{3.0f, 4.0f},
{2.0f, 2.0f, 0.0f, 0.0f, 2.0f, 2.0f, 1.0f, 0.1f},
@ -391,7 +392,7 @@ class FloorModInt8Test : public BinaryTestCommon {
public:
virtual ~FloorModInt8Test() = default;
virtual bool run(int precision) {
return test<float, float>(_FloorMod, "FloorModInt8Test", 0.01,
return test<float, float>(MNN::Express::_FloorMod, "FloorModInt8Test", 0.01,
{-1, -3, 5, 7},
{3.0f}, {2, 0, 2, 1},
{4}, {1}, {4}, {0.3, 0.3, 0.3}, {0., 0., 0.});
@ -401,7 +402,7 @@ class Atan2Test : public BinaryTestCommon {
public:
virtual ~Atan2Test() = default;
virtual bool run(int precision) {
return test<float, float>(_Atan2, "Atan2Test", 0.01,
return test<float, float>(MNN::Express::_Atan2, "Atan2Test", 0.01,
{-1.0, -2.0, -3.0, -4.0, 5.0, 6.0, 7.0, 8.0},
{3.0, -4.0},
{-0.32175055, -2.67794504, -0.7853982, -2.35619449, 1.0303768, 2.15879893, 1.1659045, 2.03444394},
@ -412,7 +413,7 @@ class Atan2Int8Test : public BinaryTestCommon {
public:
virtual ~Atan2Int8Test() = default;
virtual bool run(int precision) {
return test<float, float>(_Atan2, "Atan2Int8Test", 0.01,
return test<float, float>(MNN::Express::_Atan2, "Atan2Int8Test", 0.01,
{-1, -3, 5, 7},
{3}, {-1, 0, 2, 1},
{4}, {1}, {4}, {1, 1, 1}, {0., 0., 0.});
@ -523,7 +524,7 @@ public:
virtual bool run(int precision) {
vector<int> data_x(8, 1), data_y(8, 1), data_out(64, 2);
vector<int> shape_x = {4, 1, 2, 1}, shape_y = {2, 1, 4}, shape_out = {4, 2, 2, 4};
return test<int, int>(_Add, "BinaryBroadcastShapeTest", 0,
return test<int, int>(MNN::Express::_Add, "BinaryBroadcastShapeTest", 0,
data_x, data_y, data_out, shape_x, shape_y, shape_out);
}
};
@ -546,7 +547,7 @@ public:
data_out[j + i * 560] = func(data_x[j] - data_y[j + i * 560]);
}
}
return test<float, float>(_Subtract, "SubtractBroastTest", 0.01,
return test<float, float>(MNN::Express::_Subtract, "SubtractBroastTest", 0.01,
data_x, data_y, data_out, shape_x, shape_y, shape_out);
}
};

View File

@ -212,9 +212,13 @@ VARP _Conv(std::vector<float>&& weight, std::vector<float>&& bias, VARP x, INTS
conv2D->common->kernelY = kernelSize[1];
conv2D->common->relu6 = relu6;
conv2D->common->relu = relu;
MNN_ASSERT(weight.size() == channel[1] * (channel[0] / group) * kernelSize[0] * kernelSize[1]);
conv2D->weight = std::move(weight);
MNN_ASSERT(bias.size() == channel[1]);
conv2D->bias = std::move(bias);
if (sparese) {
size_t weightNNZElement, weightBlockNumber = 0;
CommonCompute::statisticWeightSparsity(weightNNZElement, weightBlockNumber, weight.data(), bias.size(), weight.size() / bias.size(), sparseBlockOC);
CommonCompute::statisticWeightSparsity(weightNNZElement, weightBlockNumber, conv2D->weight.data(), conv2D->bias.size(), conv2D->weight.size() / conv2D->bias.size(), sparseBlockOC);
std::unique_ptr<MNN::AttributeT> arg1(new MNN::AttributeT);
arg1->key = "sparseBlockOC";
@ -250,11 +254,8 @@ VARP _Conv(std::vector<float>&& weight, std::vector<float>&& bias, VARP x, INTS
auto sparseComPtr = flatbuffers::GetRoot<MNN::SparseCommon>(builder.GetBufferPointer())->UnPack();
conv2D->sparseParameter.reset(sparseComPtr);
CommonCompute::compressFloatWeightToSparse(convOp.get());
}
MNN_ASSERT(weight.size() == channel[1] * (channel[0] / group) * kernelSize[0] * kernelSize[1]);
conv2D->weight = std::move(weight);
MNN_ASSERT(bias.size() == channel[1]);
conv2D->bias = std::move(bias);
return (Variable::create(Expr::create(convOp.get(), {x})));
}

View File

@ -6,12 +6,22 @@
// Copyright © 2018, Alibaba Group Holding Limited
//
#include <algorithm>
#include "CommonUtils.hpp"
#include "common/CommonCompute.hpp"
#include "backend/cpu/compute/SparseConvolutionTiledExecutor.hpp"
using namespace MNN;
static inline std::vector<float> getSparsityThreshold() {
// sparsity threshold values when the sparse block size is
// {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
return {1.f, 0.6f, 0.5f, 0.4f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f};
}
static bool inline shouldUseSparseConvolution(float sparsity, int sparseBlockOC) {
std::vector<float> thresholds = getSparsityThreshold();
return sparsity > thresholds[std::min(std::max(sparseBlockOC, 0), (int)thresholds.size() - 1)];
}
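// Illustrative example (not from the original source): with sparseBlockOC = 1 the threshold
// above is 0.6, so a 4 x 27 weight where statisticWeightSparsity reports weightNNZElement = 20
// has sparsity = 1 - 20/108 ≈ 0.81 > 0.6 and would take the sparse path.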
void AddSparseInfo(std::unique_ptr<MNN::OpT>& op, Compression::Pipeline proto) {
auto prune_algo_type = MNN::SparseAlgo_RANDOM;
int sparseBlockOC = 1;
@ -41,10 +51,10 @@ void AddSparseInfo(std::unique_ptr<MNN::OpT>& op, Compression::Pipeline proto) {
size_t weightNNZElement, weightBlockNumber = 0;
CommonCompute::statisticWeightSparsity(weightNNZElement, weightBlockNumber, param->weight.data(), biasSize, weightSize / biasSize, sparseBlockOC);
float sparsity = 1. - double(weightNNZElement) / weightSize;
// MNN_PRINT(" opname [%s] sparsity is:%f\n", op->name.c_str(), sparsity);
if (!SparseConvolutionTiledExecutor::shouldUseSparseConvolution(sparsity, sparseBlockOC)) {
if (!shouldUseSparseConvolution(sparsity, sparseBlockOC)) {
return;
}
// MNN_PRINT(" opname [%s] sparsity is:%f, use sparse\n", op->name.c_str(), sparsity);
MNN::AttributeT* arg1(new MNN::AttributeT);
arg1->key = "sparseBlockOC";
@ -74,6 +84,7 @@ void AddSparseInfo(std::unique_ptr<MNN::OpT>& op, Compression::Pipeline proto) {
argsVector.emplace_back(sparseArg3);
argsVector.emplace_back(sparseArg4);
// sparseArgs needs a sorted table, so the object API can't be used here
auto sparseArgs = builder.CreateVectorOfSortedTables<MNN::Attribute>(&argsVector);
auto sparseCom = MNN::CreateSparseCommon(builder, prune_algo_type, sparseArgs);
builder.Finish(sparseCom);
@ -81,6 +92,10 @@ void AddSparseInfo(std::unique_ptr<MNN::OpT>& op, Compression::Pipeline proto) {
param->sparseParameter.reset(sparseComPtr);
delete arg1;
delete arg2;
delete arg3;
delete arg4;
break;
}
default:

View File

@ -0,0 +1,367 @@
//
// ChannelPruneConvert.cpp
// MNNConverter
//
// Created by MNN on 2023/05/05.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "CommonUtils.hpp"
#include "MNN/expr/ExprCreator.hpp"
#include <vector>
#include <map>
#include <set>
#include <algorithm>
using namespace MNN;
using namespace MNN::Express;
using namespace std;
// TODO: add more unsafe ops
static std::vector<MNN::OpType> unSafeOpTypes = {
OpType_BroadcastTo, OpType_BatchToSpaceND, OpType_Concat, OpType_LSTM, OpType_LSTMBlockCell, OpType_Reshape, OpType_Resize,
OpType_RNN, OpType_RNNSequenceGRU, OpType_ScatterNd, OpType_Slice, OpType_SliceTf, OpType_SpaceToBatchND, OpType_Raster,
};
struct TensorMaskInfo {
std::vector<int> mask; // per-channel 1 or 0
std::string oriConvName;
};
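// Collect every op (from the given subgraph if present, otherwise from the main graph)
// that consumes the tensor at outputIndex.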
std::vector<MNN::OpT*> findUserOps(int outputIndex, std::unique_ptr<MNN::NetT>& netT, SubGraphProtoT* subgraph) {
std::vector<MNN::OpT*> userOps;
if (subgraph) {
for (auto& subOp : subgraph->nodes) {
for (int inputIndex : subOp->inputIndexes) {
if (inputIndex == outputIndex) {
userOps.push_back(subOp.get());
}
}
}
} else {
for (auto& netOp : netT->oplists) {
for (int inputIndex : netOp->inputIndexes) {
if (inputIndex == outputIndex) {
userOps.push_back(netOp.get());
}
}
}
}
return userOps;
}
// do the actual channel prune on weights and bias
void channelPrune(std::unique_ptr<MNN::OpT>& op, std::unique_ptr<MNN::NetT>& netT, SubGraphProtoT* subgraph, std::map<std::string, TensorMaskInfo>& tensorMaskInfo) {
auto opType = op->type;
if (opType != OpType_Convolution && opType != OpType_ConvolutionDepthwise && opType != OpType_Deconvolution && opType != OpType_DeconvolutionDepthwise && opType != OpType_BatchNorm) {
return;
}
if (op->inputIndexes.size() != 1) {
return;
}
int inputIndex = op->inputIndexes[0];
int outputIndex = op->outputIndexes[0];
std::string inputTensorName = subgraph ? subgraph->tensors[inputIndex] : netT->tensorName[inputIndex];
std::string outputTensorName = subgraph ? subgraph->tensors[outputIndex] : netT->tensorName[outputIndex];
std::vector<int> inputMask = tensorMaskInfo[inputTensorName].mask;
int inputMaskSum = 0;
for (int i = 0; i < inputMask.size(); i++) {
inputMaskSum += inputMask[i];
}
if (opType == OpType_BatchNorm) {
if (!(inputMaskSum < inputMask.size())) {
return;
}
auto bnParams = op->main.AsBatchNorm();
auto slopFloat = bnParams->slopeData;
auto biasFloat = bnParams->biasData;
auto meanFloat = bnParams->meanData;
auto varianceFloat = bnParams->varData;
bnParams->slopeData.clear();
bnParams->biasData.clear();
bnParams->meanData.clear();
bnParams->varData.clear();
for (int i = 0; i < varianceFloat.size(); i++) {
if (inputMask[i] == 1) {
bnParams->slopeData.push_back(slopFloat[i]);
bnParams->biasData.push_back(biasFloat[i]);
bnParams->meanData.push_back(meanFloat[i]);
bnParams->varData.push_back(varianceFloat[i]);
}
}
bnParams->channels = inputMaskSum;
return;
}
auto convParams = op->main.AsConvolution2D();
auto weightFloat = convParams->weight;
auto biasFloat = convParams->bias;
auto& common = convParams->common;
int ko = common->outputCount;
int ki = common->inputCount / common->group;
int kh = common->kernelY;
int kw = common->kernelX;
std::vector<int> opMask;
for (auto info : tensorMaskInfo) {
if (op->name == info.second.oriConvName) {
opMask = info.second.mask;
break;
}
}
int opMaskSum = 0;
for (int i = 0; i < opMask.size(); i++) {
opMaskSum += opMask[i];
}
if (opMaskSum < opMask.size()) {
convParams->weight.clear();
convParams->bias.clear();
for (int i = 0; i < ko; i++) {
int offset = i * ki * kh * kw;
if (opMask[i] == 1) {
for (int j = 0; j < ki * kh * kw; j++) {
convParams->weight.emplace_back(weightFloat[offset + j]);
}
convParams->bias.emplace_back(biasFloat[i]);
}
}
common->outputCount = opMaskSum;
}
if (inputMaskSum < inputMask.size()) {
auto weightFloat = convParams->weight;
convParams->weight.clear();
int ko = common->outputCount;
int ki = common->inputCount / common->group;
int kh = common->kernelY;
int kw = common->kernelX;
for (int i = 0; i < ko; i++) {
for (int j = 0; j < ki; j++) {
int offset = i * ki * kh * kw + j * kh * kw;
if (inputMask[j] == 1) {
for (int k = 0; k < kh * kw; k++) {
convParams->weight.emplace_back(weightFloat[offset + k]);
}
}
}
}
common->inputCount = inputMaskSum;
// depthwise convolutions are not pruned directly; their channel pruning follows the pruning of their input tensor
if (opType == OpType_ConvolutionDepthwise || opType == OpType_DeconvolutionDepthwise) {
common->outputCount = inputMaskSum;
}
}
}
// propagate and analyze prune mask info in model
void analyzePruneInfo(std::unique_ptr<MNN::OpT>& op, std::unique_ptr<MNN::NetT>& netT, SubGraphProtoT* subgraph, std::map<std::string, TensorMaskInfo>& tensorMaskInfo, std::set<std::string>& notSafeConvNames) {
auto opType = op->type;
auto inputIndices = op->inputIndexes;
if (inputIndices.size() == 0) {
return;
}
auto outputIndices = op->outputIndexes;
std::vector<std::string> inputTensorNames;
for (int i = 0; i < inputIndices.size(); i++) {
inputTensorNames.push_back(subgraph ? subgraph->tensors[inputIndices[i]] : netT->tensorName[inputIndices[i]]);
}
std::vector<std::string> outputTensorNames;
for (int i = 0; i < outputIndices.size(); i++) {
outputTensorNames.push_back(subgraph ? subgraph->tensors[outputIndices[i]] : netT->tensorName[outputIndices[i]]);
}
if (opType == OpType_Convolution || opType == OpType_Deconvolution) {
if (inputIndices.size() == 1) {
auto convParams = op->main.AsConvolution2D();
auto weightFloat = convParams->weight;
auto biasFloat = convParams->bias;
auto& common = convParams->common;
const int ko = common->outputCount;
const int ki = common->inputCount / common->group;
const int kh = common->kernelY;
const int kw = common->kernelX;
VARP weightVar = _Const(weightFloat.data(), {ko, ki, kh, kw}, NCHW);
VARP weightMask = _Greater(_ReduceSum(_Abs(weightVar), {1, 2, 3}), _Scalar<float>(1e-6));
VARP maskSum = _ReduceSum(weightMask);
auto maskInfo = weightMask->getInfo();
auto maskPtr = weightMask->readMap<int>();
if (maskSum->readMap<int>()[0] == maskInfo->size) {
return;
}
// this conv has been pruned; propagate its mask downstream
tensorMaskInfo[outputTensorNames[0]].oriConvName = op->name;
for (int i = 0; i < maskInfo->size; i++) {
tensorMaskInfo[outputTensorNames[0]].mask.push_back(maskPtr[i]);
}
}
return;
}
std::vector<MNN::OpType>::iterator iter;
iter = std::find(unSafeOpTypes.begin(), unSafeOpTypes.end(), opType);
// ops in the unsafe list, and ops with more than one output, are not safe
if ((iter != unSafeOpTypes.end()) || (outputTensorNames.size() > 1)) {
for (auto name : inputTensorNames) {
if (!tensorMaskInfo[name].oriConvName.empty()) {
// mark the conv that originated this input tensor's mask as not safe
notSafeConvNames.insert(tensorMaskInfo[name].oriConvName);
}
}
return;
}
// if a mask propagates all the way to a graph output, the convs that originated it are not safe
std::vector<MNN::OpT*> userOps = findUserOps(outputIndices[0], netT, subgraph);
if (userOps.size() == 0) {
for (auto name : inputTensorNames) {
if (!tensorMaskInfo[name].oriConvName.empty()) {
notSafeConvNames.insert(tensorMaskInfo[name].oriConvName);
}
}
return;
}
// if the op has more than one input (including const inputs),
// all of its input tensors' masks must come from the same originating conv op
if (inputIndices.size() > 1) {
std::string oriConvName;
std::string oriTensorName;
for (auto name : inputTensorNames) {
if (!tensorMaskInfo[name].oriConvName.empty()) {
oriConvName = tensorMaskInfo[name].oriConvName;
oriTensorName = name;
}
}
if (oriConvName.empty()) {
return;
}
// oriConvName is not empty
bool unsafe = false;
for (auto name : inputTensorNames) {
auto tOriName = tensorMaskInfo[name].oriConvName;
if ((tOriName != oriConvName) && (!tOriName.empty())) {
unsafe = true;
}
}
// if unsafe, mark the originating convs of all its input tensors' masks as not safe
if (unsafe) {
for (auto name : inputTensorNames) {
auto tOriName = tensorMaskInfo[name].oriConvName;
if (!tOriName.empty()) {
notSafeConvNames.insert(tOriName);
}
}
return;
}
// if safe, propagate mask down
tensorMaskInfo[outputTensorNames[0]].oriConvName = oriConvName;
tensorMaskInfo[outputTensorNames[0]].mask = tensorMaskInfo[oriTensorName].mask;
return;
}
// for 1 input and 1 output safe op, propagate mask down
tensorMaskInfo[outputTensorNames[0]].oriConvName = tensorMaskInfo[inputTensorNames[0]].oriConvName;
tensorMaskInfo[outputTensorNames[0]].mask = tensorMaskInfo[inputTensorNames[0]].mask;
}
void channelPruneConvert(std::unique_ptr<MNN::NetT>& netT, MNN::Compression::Pipeline proto) {
bool filterPruned = false;
for (const auto& algo : proto.algo()) {
if (algo.type() == Compression::CompressionAlgo::PRUNE) {
auto prune_type = algo.prune_params().type();
auto prune_algo_type = MNN::SparseAlgo(prune_type);
if (prune_type == Compression::PruneParams_PruneType_FILTER) {
filterPruned = true;
break;
}
}
}
if (!filterPruned) {
return;
}
std::map<std::string, TensorMaskInfo> netMaskInfo;
for (auto tensorName : netT->tensorName) {
netMaskInfo[tensorName] = TensorMaskInfo();
}
std::set<std::string> notSafeConvNames;
for (auto& op : netT->oplists) {
analyzePruneInfo(op, netT, nullptr, netMaskInfo, notSafeConvNames);
}
std::set<std::string>::iterator iter;
if (!notSafeConvNames.empty()) {
for (auto& info : netMaskInfo) {
iter = std::find(notSafeConvNames.begin(), notSafeConvNames.end(), info.second.oriConvName);
if (iter != notSafeConvNames.end()) {
for (int i = 0; i < info.second.mask.size(); i++) {
if (info.second.mask[i] == 0) {
info.second.mask[i] = 1;
}
}
}
}
}
for (auto& op : netT->oplists) {
channelPrune(op, netT, nullptr, netMaskInfo);
}
for (auto& subgraph : netT->subgraphs) {
std::map<std::string, TensorMaskInfo> subgraphMaskInfo;
for (auto tensorName : subgraph->tensors) {
subgraphMaskInfo[tensorName] = TensorMaskInfo();
}
std::set<std::string> notSafeConvNames;
for (auto& op : subgraph->nodes) {
analyzePruneInfo(op, netT, subgraph.get(), subgraphMaskInfo, notSafeConvNames);
}
std::set<std::string>::iterator iter;
if (!notSafeConvNames.empty()) {
for (auto& info : subgraphMaskInfo) {
iter = std::find(notSafeConvNames.begin(), notSafeConvNames.end(), info.second.oriConvName);
if (iter != notSafeConvNames.end()) {
for (int i = 0; i < info.second.mask.size(); i++) {
if (info.second.mask[i] == 0) {
info.second.mask[i] = 1;
}
}
}
}
}
for (auto& op : subgraph->nodes) {
channelPrune(op, netT, subgraph.get(), subgraphMaskInfo);
}
}
}

View File

@ -24,5 +24,6 @@ void addSparseInfo(std::unique_ptr<MNN::NetT>& netT, MNN::Compression::Pipeline
void fullQuantAndCoding(std::unique_ptr<MNN::NetT>& netT, MNN::Compression::Pipeline proto);
void weightQuantAndCoding(std::unique_ptr<MNN::NetT>& netT, const modelConfig& config);
void addUUID(std::unique_ptr<MNN::NetT>& netT, MNN::Compression::Pipeline proto);
void channelPruneConvert(std::unique_ptr<MNN::NetT>& netT, MNN::Compression::Pipeline proto);
#endif // COMMMON_UTILS_HPP

View File

@ -7,6 +7,7 @@
//
#include "CommonUtils.hpp"
#include "common/CommonCompute.hpp"
#include "cpp/IDSTEncoder.hpp"
static float findAbsMax(const float *weights, const int count) {
@ -42,17 +43,26 @@ void WeightQuantAndCoding(std::unique_ptr<MNN::OpT>& op, const modelConfig& conf
const auto opType = op->type;
// config.weightQuantBits only controls weight quantization for float convolutions
// by default, do coding for ConvInt8 and DepthwiseConvInt8, if there are any
if ((config.weightQuantBits == 0) && (
opType != MNN::OpType_ConvInt8 && opType != MNN::OpType_DepthwiseConvInt8)) {
return;
}
if (opType != MNN::OpType_Convolution && opType != MNN::OpType_ConvolutionDepthwise &&
opType != MNN::OpType_Deconvolution && opType != MNN::OpType_DeconvolutionDepthwise &&
opType != MNN::OpType_ConvInt8 && opType != MNN::OpType_DepthwiseConvInt8) {
return;
}
auto param = op->main.AsConvolution2D();
auto& common = param->common;
if (param->quanParameter.get() != nullptr) {
return;
}
if (config.weightQuantBits == 0) {
if (opType == MNN::OpType_ConvInt8 || opType == MNN::OpType_DepthwiseConvInt8) {
// Do nothing
} else {
CommonCompute::compressFloatWeightToSparse(op.get());
return;
}
}
int bits = 8;
if ((config.weightQuantBits > 0) && (
opType != MNN::OpType_ConvInt8 && opType != MNN::OpType_DepthwiseConvInt8)) {
@ -62,12 +72,6 @@ void WeightQuantAndCoding(std::unique_ptr<MNN::OpT>& op, const modelConfig& conf
bits = std::max(bits, 2);
bits = std::min(bits, 8);
auto param = op->main.AsConvolution2D();
auto& common = param->common;
if (param->quanParameter.get() != nullptr) {
return;
}
int weightSize = param->weight.size();
// shared weights or something else.
if (weightSize == 0) {

View File

@ -48,7 +48,9 @@ int writeFb(std::unique_ptr<MNN::NetT>& netT, const std::string& MNNModelFile, c
if (config.benchmarkModel) {
removeParams(netT);
}
if (config.compressionParamsFile != "") {
channelPruneConvert(netT, proto);
}
if (config.saveHalfFloat) {
castParamsToHalf(netT);
}

View File

@ -43,7 +43,7 @@ message LayerQuantizeParams {
optional int32 clamp_min = 4 [default = -128];
optional int32 clamp_max = 5 [default = 127];
}
message WinogradParams {
required int32 version = 1 [default = 0];
// units_attr: {kyStart, kxStart, subKy, subKx, unitY, unitX} x N
@ -80,6 +80,7 @@ message PruneParams {
enum PruneType {
RANDOM = 0;
SIMD_OC = 1;
FILTER = 2;
}
optional PruneType type = 1 [default = RANDOM];
optional LevelPrunerParams level_pruner_params = 2;

View File

@ -359,25 +359,26 @@ const char descriptor_table_protodef_MNN_5fcompression_2eproto[] PROTOBUF_SECTIO
"\030\003 \003(\t\"o\n\022SIMDOCPrunerParams\022\033\n\023weight_t"
"ensor_names\030\001 \003(\t\022\024\n\014prune_ratios\030\002 \003(\002\022"
"\023\n\013layer_names\030\003 \003(\t\022\021\n\toc_blocks\030\004 \003(\005\""
"\366\001\n\013PruneParams\022<\n\004type\030\001 \001(\0162&.MNN.Comp"
"\202\002\n\013PruneParams\022<\n\004type\030\001 \001(\0162&.MNN.Comp"
"ression.PruneParams.PruneType:\006RANDOM\022\?\n"
"\023level_pruner_params\030\002 \001(\0132\".MNN.Compres"
"sion.LevelPrunerParams\022B\n\025simd_oc_pruner"
"_params\030\003 \001(\0132#.MNN.Compression.SIMDOCPr"
"unerParams\"$\n\tPruneType\022\n\n\006RANDOM\020\000\022\013\n\007S"
"IMD_OC\020\001\"\362\001\n\017CompressionAlgo\022H\n\004type\030\001 \001"
"(\01620.MNN.Compression.CompressionAlgo.Com"
"pressionType:\010QUANTIZE\0225\n\014quant_params\030\002"
" \001(\0132\037.MNN.Compression.QuantizeParams\0222\n"
"\014prune_params\030\003 \001(\0132\034.MNN.Compression.Pr"
"uneParams\"*\n\017CompressionType\022\014\n\010QUANTIZE"
"\020\000\022\t\n\005PRUNE\020\001\"d\n\010Pipeline\022\026\n\007version\030\001 \002"
"(\t:\0050.0.0\022.\n\004algo\030\002 \003(\0132 .MNN.Compressio"
"n.CompressionAlgo\022\020\n\010mnn_uuid\030\003 \001(\t"
"unerParams\"0\n\tPruneType\022\n\n\006RANDOM\020\000\022\013\n\007S"
"IMD_OC\020\001\022\n\n\006FILTER\020\002\"\362\001\n\017CompressionAlgo"
"\022H\n\004type\030\001 \001(\01620.MNN.Compression.Compres"
"sionAlgo.CompressionType:\010QUANTIZE\0225\n\014qu"
"ant_params\030\002 \001(\0132\037.MNN.Compression.Quant"
"izeParams\0222\n\014prune_params\030\003 \001(\0132\034.MNN.Co"
"mpression.PruneParams\"*\n\017CompressionType"
"\022\014\n\010QUANTIZE\020\000\022\t\n\005PRUNE\020\001\"d\n\010Pipeline\022\026\n"
"\007version\030\001 \002(\t:\0050.0.0\022.\n\004algo\030\002 \003(\0132 .MN"
"N.Compression.CompressionAlgo\022\020\n\010mnn_uui"
"d\030\003 \001(\t"
;
static ::PROTOBUF_NAMESPACE_ID::internal::once_flag descriptor_table_MNN_5fcompression_2eproto_once;
const ::PROTOBUF_NAMESPACE_ID::internal::DescriptorTable descriptor_table_MNN_5fcompression_2eproto = {
false, false, 1835, descriptor_table_protodef_MNN_5fcompression_2eproto, "MNN_compression.proto",
false, false, 1847, descriptor_table_protodef_MNN_5fcompression_2eproto, "MNN_compression.proto",
&descriptor_table_MNN_5fcompression_2eproto_once, nullptr, 0, 10,
schemas, file_default_instances, TableStruct_MNN_5fcompression_2eproto::offsets,
file_level_metadata_MNN_5fcompression_2eproto, file_level_enum_descriptors_MNN_5fcompression_2eproto, file_level_service_descriptors_MNN_5fcompression_2eproto,
@ -444,6 +445,7 @@ bool PruneParams_PruneType_IsValid(int value) {
switch (value) {
case 0:
case 1:
case 2:
return true;
default:
return false;
@ -453,6 +455,7 @@ bool PruneParams_PruneType_IsValid(int value) {
#if (__cplusplus < 201703) && (!defined(_MSC_VER) || (_MSC_VER >= 1900 && _MSC_VER < 1912))
constexpr PruneParams_PruneType PruneParams::RANDOM;
constexpr PruneParams_PruneType PruneParams::SIMD_OC;
constexpr PruneParams_PruneType PruneParams::FILTER;
constexpr PruneParams_PruneType PruneParams::PruneType_MIN;
constexpr PruneParams_PruneType PruneParams::PruneType_MAX;
constexpr int PruneParams::PruneType_ARRAYSIZE;

View File

@ -153,11 +153,12 @@ inline bool LayerQuantizeParams_QuantMethod_Parse(
}
enum PruneParams_PruneType : int {
PruneParams_PruneType_RANDOM = 0,
PruneParams_PruneType_SIMD_OC = 1
PruneParams_PruneType_SIMD_OC = 1,
PruneParams_PruneType_FILTER = 2
};
bool PruneParams_PruneType_IsValid(int value);
constexpr PruneParams_PruneType PruneParams_PruneType_PruneType_MIN = PruneParams_PruneType_RANDOM;
constexpr PruneParams_PruneType PruneParams_PruneType_PruneType_MAX = PruneParams_PruneType_SIMD_OC;
constexpr PruneParams_PruneType PruneParams_PruneType_PruneType_MAX = PruneParams_PruneType_FILTER;
constexpr int PruneParams_PruneType_PruneType_ARRAYSIZE = PruneParams_PruneType_PruneType_MAX + 1;
const ::PROTOBUF_NAMESPACE_ID::EnumDescriptor* PruneParams_PruneType_descriptor();
@ -1935,6 +1936,8 @@ class PruneParams final :
PruneParams_PruneType_RANDOM;
static constexpr PruneType SIMD_OC =
PruneParams_PruneType_SIMD_OC;
static constexpr PruneType FILTER =
PruneParams_PruneType_FILTER;
static inline bool PruneType_IsValid(int value) {
return PruneParams_PruneType_IsValid(value);
}

View File

@ -18,16 +18,30 @@ using namespace MNN;
namespace IDSTEncoder {
static void WriteBlobDim(std::ostream &out, std::vector<int> dims)
static bool WriteBlobDim(std::ostream &out, std::vector<int> dims)
{
char tmp[4];
bool useInt32 = false;
((unsigned char *)tmp)[0] = (unsigned char)dims.size();
out.write(tmp, 1);
for (int i = 0; i < dims.size(); i++)
{
unsigned short tmpShort = (unsigned short)dims[i];
out.write((const char*)(&tmpShort), 2);
for (int i = 0; i < dims.size(); i++) {
if (dims[i] > ((1<<16)-1)) {
useInt32 = true;
break;
}
}
if (useInt32) {
for (int i = 0; i < dims.size(); i++) {
unsigned int tmpShort = (unsigned int)dims[i];
out.write((const char*)(&tmpShort), 4);
}
} else {
for (int i = 0; i < dims.size(); i++) {
unsigned short tmpShort = (unsigned short)dims[i];
out.write((const char*)(&tmpShort), 2);
}
}
return useInt32;
}
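// Example (illustrative): dims such as {512, 4608} still fit in unsigned short, so the 2-byte
// path is used; any dimension above 65535 switches the whole shape to 4-byte writes, and the
// returned flag lets the caller record shapeInt32 in the IDSTQuanT.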
static void FillBuffer(char *buf, unsigned int buf_len, const char *arr, unsigned int arr_len, unsigned char iNeedBits)
@ -174,7 +188,7 @@ static unsigned int GetBestMaxStep(const float* weightData, int weightSize, unsi
return best_nnz;
}
static void WriteCQBlobs(std::ostream &out, const float* weightData, const float* alphaData, int area, int channel, bool asymmetricQuantFlag)
static void WriteCQBlobs(std::ostream &out, const float* weightData, const float* alphaData, int area, int channel, bool asymmetricQuantFlag, bool& shapeUseInt32)
{
//push values into buffer
//Find int values in all blobs and check;
@ -239,7 +253,7 @@ static void WriteCQBlobs(std::ostream &out, const float* weightData, const float
{
char tmp[100];
//1. weights blob shape(unsigned int32)
WriteBlobDim(out, {channel, area});
shapeUseInt32 = WriteBlobDim(out, {channel, area});
// 2. Avalable values Count(unsigned char)
tmp[0] = (unsigned char)iCount;
out.write(tmp, 1);
@ -256,7 +270,7 @@ static void WriteCQBlobs(std::ostream &out, const float* weightData, const float
delete[] buf;
}
static void WriteSparseQuanBlobs(std::ostream &out, const float* weightData, const float* alphaData, int area, int channel, bool asymmetricQuantFlag)
static void WriteSparseQuanBlobs(std::ostream &out, const float* weightData, const float* alphaData, int area, int channel, bool asymmetricQuantFlag, bool& shapeUseInt32)
{
std::set<int> setWeight;
GetWeightSet(setWeight, weightData, alphaData, area, channel, asymmetricQuantFlag);
@ -358,7 +372,7 @@ static void WriteSparseQuanBlobs(std::ostream &out, const float* weightData, con
{ //write
char tmp[100];
// 1.weights blob shape(unsigned int32)
WriteBlobDim(out, {channel, area});
shapeUseInt32 = WriteBlobDim(out, {channel, area});
// 2. nnz
out.write((const char*) &nnz, 4);
// 3. max_step use # bits () (unsigned char)
@ -384,12 +398,14 @@ static void WriteSparseQuanBlobs(std::ostream &out, const float* weightData, con
static std::unique_ptr<IDSTQuanT> encode(const std::vector<float>& weight, const std::vector<float>& scale, int kernelSize, int kernelNum,
bool asymmetricQuantFlag, const int8_t* quantWeightPtr, const int clampMin) {
std::ostringstream outputStringStreamCQ, outputStringStreamSQ;
WriteCQBlobs(outputStringStreamCQ, weight.data(), scale.data(), kernelSize, kernelNum, asymmetricQuantFlag);
WriteSparseQuanBlobs(outputStringStreamSQ, weight.data(), scale.data(), kernelSize, kernelNum, asymmetricQuantFlag);
bool shapeUseInt32 = false;
WriteCQBlobs(outputStringStreamCQ, weight.data(), scale.data(), kernelSize, kernelNum, asymmetricQuantFlag, shapeUseInt32);
WriteSparseQuanBlobs(outputStringStreamSQ, weight.data(), scale.data(), kernelSize, kernelNum, asymmetricQuantFlag, shapeUseInt32);
std::unique_ptr<IDSTQuanT> idst(new IDSTQuanT);
auto cqStr = outputStringStreamCQ.str();
auto sqStr = outputStringStreamSQ.str();
int int8Size = kernelNum * kernelSize;
idst->shapeInt32 = shapeUseInt32;
if (quantWeightPtr && (int8Size <= cqStr.size() && int8Size <= sqStr.size())) {
idst->type = 4;
idst->aMax = kernelNum;

View File

@ -59,6 +59,7 @@ void Revert::packMNNNet() {
void Revert::initialize(float spasity, int sparseBlockOC, bool rewrite) {
if (mMNNNet->bizCode == "benchmark" || rewrite) {
randStart();
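// only rewrite convolution weights as sparse when the requested sparsity exceeds 0.5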
bool useSparse = spasity > 0.5f;
for (auto& op : mMNNNet->oplists) {
const auto opType = op->type;
switch (opType) {
@ -71,51 +72,53 @@ void Revert::initialize(float spasity, int sparseBlockOC, bool rewrite) {
const int oc = convCommon->outputCount / convCommon->group;
param->weight.resize(oc * weightReduceStride);
::memset(param->weight.data(), 0, param->weight.size() * sizeof(float));
size_t weightNNZElement, weightBlockNumber = 0;
MNN::CommonCompute::fillRandValueAsSparsity(weightNNZElement, weightBlockNumber, param->weight.data(), oc, weightReduceStride, spasity, sparseBlockOC);
MNN::AttributeT* arg1(new MNN::AttributeT);
arg1->key = "sparseBlockOC";
arg1->i = sparseBlockOC;
MNN::AttributeT* arg2(new MNN::AttributeT);
arg2->key = "sparseBlockKernel";
arg2->i = 1;
MNN::AttributeT* arg3(new MNN::AttributeT);
arg3->key = "NNZElement";
arg3->i = weightNNZElement;
MNN::AttributeT* arg4(new MNN::AttributeT);
arg4->key = "blockNumber";
arg4->i = weightBlockNumber;
flatbuffers::FlatBufferBuilder builder;
std::vector<flatbuffers::Offset<MNN::Attribute>> argsVector;
auto sparseArg1 = MNN::CreateAttribute(builder, arg1);
auto sparseArg2 = MNN::CreateAttribute(builder, arg2);
auto sparseArg3 = MNN::CreateAttribute(builder, arg3);
auto sparseArg4 = MNN::CreateAttribute(builder, arg4);
argsVector.emplace_back(sparseArg1);
argsVector.emplace_back(sparseArg2);
argsVector.emplace_back(sparseArg3);
argsVector.emplace_back(sparseArg4);
auto sparseArgs = builder.CreateVectorOfSortedTables<MNN::Attribute>(&argsVector);
MNN::SparseAlgo prune_algo_type;
if (sparseBlockOC == 4) {
prune_algo_type = MNN::SparseAlgo_SIMD_OC;
} else {
prune_algo_type = MNN::SparseAlgo_RANDOM;
}
auto sparseCom = MNN::CreateSparseCommon(builder, prune_algo_type, sparseArgs);
builder.Finish(sparseCom);
auto sparseComPtr = flatbuffers::GetRoot<MNN::SparseCommon>(builder.GetBufferPointer())->UnPack();
param->sparseParameter.reset(sparseComPtr);
param->bias.resize(convCommon->outputCount);
::memset(param->bias.data(), 0, param->bias.size() * sizeof(float));
if (useSparse) {
size_t weightNNZElement, weightBlockNumber = 0;
MNN::CommonCompute::fillRandValueAsSparsity(weightNNZElement, weightBlockNumber, param->weight.data(), oc, weightReduceStride, spasity, sparseBlockOC);
MNN::AttributeT* arg1(new MNN::AttributeT);
arg1->key = "sparseBlockOC";
arg1->i = sparseBlockOC;
MNN::AttributeT* arg2(new MNN::AttributeT);
arg2->key = "sparseBlockKernel";
arg2->i = 1;
MNN::AttributeT* arg3(new MNN::AttributeT);
arg3->key = "NNZElement";
arg3->i = weightNNZElement;
MNN::AttributeT* arg4(new MNN::AttributeT);
arg4->key = "blockNumber";
arg4->i = weightBlockNumber;
flatbuffers::FlatBufferBuilder builder;
std::vector<flatbuffers::Offset<MNN::Attribute>> argsVector;
auto sparseArg1 = MNN::CreateAttribute(builder, arg1);
auto sparseArg2 = MNN::CreateAttribute(builder, arg2);
auto sparseArg3 = MNN::CreateAttribute(builder, arg3);
auto sparseArg4 = MNN::CreateAttribute(builder, arg4);
argsVector.emplace_back(sparseArg1);
argsVector.emplace_back(sparseArg2);
argsVector.emplace_back(sparseArg3);
argsVector.emplace_back(sparseArg4);
auto sparseArgs = builder.CreateVectorOfSortedTables<MNN::Attribute>(&argsVector);
MNN::SparseAlgo prune_algo_type;
if (sparseBlockOC == 4) {
prune_algo_type = MNN::SparseAlgo_SIMD_OC;
} else {
prune_algo_type = MNN::SparseAlgo_RANDOM;
}
auto sparseCom = MNN::CreateSparseCommon(builder, prune_algo_type, sparseArgs);
builder.Finish(sparseCom);
auto sparseComPtr = flatbuffers::GetRoot<MNN::SparseCommon>(builder.GetBufferPointer())->UnPack();
param->sparseParameter.reset(sparseComPtr);
MNN::CommonCompute::compressFloatWeightToSparse(op.get());
}
break;
}
case MNN::OpType_Scale: {

View File

@ -270,9 +270,12 @@ if __name__ == '__main__':
t = TestModel(modelName)
if len(sys.argv) > 2:
if sys.argv[2] == 'DEBUG':
debugMode = len(sys.argv) > 2
print('Debug Mode: ', debugMode)
t.Debug()
message = t.Test()
print(message)
if message.find("TEST_SUCCESS") < 0:
debugMode = len(sys.argv) > 2
print('Debug Mode: ', debugMode)
t.Debug()
else:
specifyOpName = sys.argv[2]
t.TestName(specifyOpName)

View File

@ -2,7 +2,7 @@
## Compilation
### Building and installing MNN
- Enable the MNN_SUPPORT_TRAIN option when building MNN: cmake .. -DMNN_SUPPORT_TRAIN=true
- Enable the MNN_BUILD_TRAIN option when building MNN: cmake .. -DMNN_BUILD_TRAIN=true
### Build outputs
- transformer.out
@ -11,6 +11,7 @@
- train.out
- backendTest.out
- backwardTest.out
- runTrainDemo.out
## Usage
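A minimal invocation sketch (the file names are placeholders; the expected arguments come from transformer.out's own usage message):
```bash
# build with the training tools enabled
cmake .. -DMNN_BUILD_TRAIN=true && make -j4

# rewrite a model for training: source model, output model, training config
./transformer.out temp.bin dst.bin config.json
```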

View File

@ -29,6 +29,35 @@ using namespace MNN::Express;
using namespace MNN::Train;
using namespace std;
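// Look up the learning-rate input configured for this parameter via ParameterOptConfig;
// parameters not listed in any group fall back to the global "LearningRate" input.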
VARP getLocalLearningRate(std::string pName, std::vector<std::vector<std::string>> weightNameGroups, std::vector<std::string> lrNames,
std::map<std::string, VARP> &lrMap, std::map<std::string, std::string> &extraInputs) {
bool hasLocalOptConf = false;
std::string localLrName;
for (int ii = 0; ii < weightNameGroups.size(); ii++) {
if (std::find(weightNameGroups[ii].begin(), weightNameGroups[ii].end(), pName) != weightNameGroups[ii].end()) {
hasLocalOptConf = true;
localLrName = lrNames[ii];
break;
}
}
if (!hasLocalOptConf) {
localLrName = "LearningRate";
}
VARP localLearningRate;
if (lrMap.find(localLrName) != lrMap.end()) {
localLearningRate = lrMap[localLrName];
} else {
auto newLr = _Input({}, NCHW);
newLr->setName(localLrName);
lrMap[localLrName] = newLr;
localLearningRate = newLr;
}
extraInputs[localLrName] = "float";
return localLearningRate;
}
int main(int argc, const char* argv[]) {
if (argc < 4) {
MNN_PRINT("Usage: ./transformer.out temp.bin dst.bin config.json\n");
@ -54,34 +83,59 @@ int main(int argc, const char* argv[]) {
std::vector<std::string> onlyUpdateOps;
std::vector<std::string> stopBackPropOps;
std::string optimizerType = "SGD";
if (configObject.HasMember("Optimizor")) {
auto optimizor = configObject["Optimizor"].GetObject();
if (optimizor.HasMember("OnlyUpdateOps")) {
auto limitArray = optimizor["OnlyUpdateOps"].GetArray();
std::vector<std::string> fixAsConstOps;
std::vector<std::vector<std::string>> weightNameGroups;
std::vector<std::string> lrNames;
if (configObject.HasMember("Optimizer")) {
auto optimizer = configObject["Optimizer"].GetObject();
if (optimizer.HasMember("OnlyUpdateOps")) {
auto limitArray = optimizer["OnlyUpdateOps"].GetArray();
for (auto vIter = limitArray.begin(); vIter != limitArray.end(); vIter++) {
onlyUpdateOps.emplace_back(vIter->GetString());
MNN_PRINT("will only update: %s \n", vIter->GetString());
}
}
if (optimizor.HasMember("NoUpdateOps")) {
auto limitArray = optimizor["NoUpdateOps"].GetArray();
if (optimizer.HasMember("NoUpdateOps")) {
auto limitArray = optimizer["NoUpdateOps"].GetArray();
for (auto vIter = limitArray.begin(); vIter != limitArray.end(); vIter++) {
noUpdateOps.emplace_back(vIter->GetString());
if (onlyUpdateOps.empty())
MNN_PRINT("will not update: %s \n", vIter->GetString());
}
}
if (optimizor.HasMember("StopBackPropOps")) {
auto limitArray = optimizor["StopBackPropOps"].GetArray();
if (optimizer.HasMember("StopBackPropOps")) {
auto limitArray = optimizer["StopBackPropOps"].GetArray();
for (auto vIter = limitArray.begin(); vIter != limitArray.end(); vIter++) {
stopBackPropOps.emplace_back(vIter->GetString());
MNN_PRINT("will stop back prop from (also not update this op): %s \n", vIter->GetString());
}
}
if (optimizor.HasMember("type")) {
optimizerType = std::string(optimizor["type"].GetString());
if (optimizer.HasMember("type")) {
optimizerType = std::string(optimizer["type"].GetString());
MNN_PRINT("optimizer type: %s\n", optimizerType.c_str());
}
if (optimizer.HasMember("FixAsConstOps")) {
auto limitArray = optimizer["FixAsConstOps"].GetArray();
for (auto vIter = limitArray.begin(); vIter != limitArray.end(); vIter++) {
fixAsConstOps.emplace_back(vIter->GetString());
MNN_PRINT("this op will be fixed as Const, and maybe turn to Trainable later: %s \n", vIter->GetString());
}
}
if (optimizer.HasMember("ParameterOptConfig")) {
auto pConf = optimizer["ParameterOptConfig"].GetArray();
for (auto vIter = pConf.begin(); vIter != pConf.end(); vIter++) {
auto conf = vIter->GetObject();
if (conf.HasMember("WeightNames") && conf.HasMember("LrName")) {
auto wn = conf["WeightNames"].GetArray();
std::vector<std::string> wNames;
for (auto wIter = wn.begin(); wIter != wn.end(); wIter++) {
wNames.push_back(wIter->GetString());
}
weightNameGroups.push_back(wNames);
lrNames.push_back(conf["LrName"].GetString());
}
}
}
}
auto bnMomentum = new MNN::AttributeT;
bnMomentum->f = 0.99;
@ -100,6 +154,17 @@ int main(int argc, const char* argv[]) {
inputVars = inputsOutputs.first;
outputVars = inputsOutputs.second;
}
for (auto& varIter : inputVars) {
auto var = varIter.second;
auto varInfo = var->getInfo();
auto vDims = varInfo->dim;
if (!fixAsConstOps.empty()) {
if (std::find(fixAsConstOps.begin(), fixAsConstOps.end(), var->name()) != fixAsConstOps.end()) {
var.fix(VARP::CONSTANT);
}
}
}
Transformer::TrainConfig trainConfig;
trainConfig.noUpdateOps = std::move(noUpdateOps);
trainConfig.onlyUpdateOps = std::move(onlyUpdateOps);
@ -185,15 +250,19 @@ int main(int argc, const char* argv[]) {
}
}
}
auto lossInfo = loss->getInfo();
MNN_ASSERT(nullptr != loss);
auto gradMap = OpGrad::grad(loss, parameters, stopBackPropOps);
// Make Update
std::map<VARP, VARP> varUpdateMap;
auto learningRate = _Input();
auto learningRate = _Input({}, NCHW);
learningRate->setName("LearningRate");
auto weightDecay = _Input();
auto weightDecay = _Input({}, NCHW);
weightDecay->setName("WeightDecay");
std::map<std::string, VARP> lrMap;
lrMap["LearningRate"] = learningRate;
auto step = _Scalar<float>(1.0f);
step->setName("optimize_step");
step.fix(VARP::TRAINABLE);
@ -209,12 +278,13 @@ int main(int argc, const char* argv[]) {
}
if (optimizerType == "SGD") {
auto momentum = _Input();
auto momentum = _Input({}, NCHW);
momentum->setName("Momentum");
extraInputs["Momentum"] = "float";
for (auto iter : gradMap) {
auto p = iter.first;
MNN_PRINT("optimize variable: %s\n", p->name().c_str());
p.fix(VARP::TRAINABLE);
auto grad = iter.second;
grad->setName(p->name()+"_grad");
@ -251,7 +321,9 @@ int main(int argc, const char* argv[]) {
auto newHistory = gradWithDecay + momentum * history;
newHistory->setName("update_" + history->name());
auto finalGrad = learningRate * history;
VARP localLearningRate = getLocalLearningRate(p->name(), weightNameGroups, lrNames, lrMap, extraInputs);
MNN_PRINT("variable: %s, lr name: %s\n", p->name().c_str(), localLearningRate->name().c_str());
VARP finalGrad = localLearningRate * history;
finalGrad->setName(p->name() + "_final_grad");
auto updateValue = _Subtract(p, finalGrad);
@ -260,11 +332,11 @@ int main(int argc, const char* argv[]) {
varUpdateMap[history] = newHistory;
}
} else if (optimizerType == "ADAM") {
auto beta1 = _Input();
auto beta1 = _Input({}, NCHW);
beta1->setName("Beta1");
auto beta2 = _Input();
auto beta2 = _Input({}, NCHW);
beta2->setName("Beta2");
auto eps = _Input();
auto eps = _Input({}, NCHW);
eps->setName("Eps");
extraInputs["Beta1"] = "float";
@ -276,6 +348,7 @@ int main(int argc, const char* argv[]) {
for (auto iter : gradMap) {
auto p = iter.first;
MNN_PRINT("optimize variable: %s\n", p->name().c_str());
p.fix(VARP::TRAINABLE);
auto grad = iter.second;
grad->setName(p->name()+"_grad");
@ -317,7 +390,9 @@ int main(int argc, const char* argv[]) {
auto newHistory2 = beta2 * history2 + (_Scalar(1.0f) - beta2) * _Square(gradWithDecay);
newHistory2->setName("update_" + history2->name());
auto finalGrad = learningRate * correction * (history1 / (_Sqrt(history2 + _Scalar<float>(1e-8)) + eps));
VARP localLearningRate = getLocalLearningRate(p->name(), weightNameGroups, lrNames, lrMap, extraInputs);
MNN_PRINT("variable: %s, lr name: %s\n", p->name().c_str(), localLearningRate->name().c_str());
auto finalGrad = localLearningRate * correction * (history1 / (_Sqrt(history2 + _Scalar<float>(1e-8)) + eps));
finalGrad->setName(p->name() + "_final_grad");
auto updateValue = _Subtract(p, finalGrad);

View File

@ -79,6 +79,11 @@ public:
for (int i = 0; i < expr->outputSize(); ++i) {
output[i] = Variable::create(expr, i);
}
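// If the BinaryOp has a fused relu (activationType == 1), zero the incoming gradient
// wherever the forward output was not positive before differentiating the binary op itself.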
int activateType = op->main_as_BinaryOp()->activationType();
if (activateType == 1) { // relu
auto mask = _Cast<float>(_Greater(output[0], _Scalar(0.0f)));
outputDiff = mask * backwardOutput[0];
}
switch (op->main_as_BinaryOp()->opType()) {
case BinaryOpOperation_ADD: {
res[0] = outputDiff;

View File

@ -1,20 +1,28 @@
{
"Train": true,
"Loss": {
"op": "output"
"op": "loss"
},
"Optimizor": {
"Optimizer": {
"OnlyUpdateOps":[],
"NoUpdateOps":[],
"StopBackPropOps":[],
"type": "SGD"
"type": "SGD",
"ParameterOptConfig":[
{
"WeightNames":["example_Weight1", "example_Weight2"],
"LrName":"LearningRate2"
},
{
"WeightNames":["example_Weight3"],
"LrName":"LearningRate3"
}
],
"FixAsConstOps":[]
},
"BatchNorm": {
"momentum":0.99
},
"Debug": {
"L2Norm": []
},
"Shape": {
"input": [1, 3, 224, 224]
}

View File

@ -4,8 +4,5 @@
"OnlyUpdateOps":[],
"NoUpdateOps":[],
"type": "SGD"
},
"Debug": {
"L2Norm": []
}
}