mirror of https://github.com/alibaba/MNN.git

[MNN:Sync] Sync Internal Gitlab: 2.5.1

parent d7d1efe03b
commit c70ecef660
@@ -74,6 +74,6 @@ Pod::Spec.new do |s|
end

s.compiler_flags = '-arch arm64 -march=armv8.2-a+simd+fp16'
s.pod_target_xcconfig = {'METAL_LIBRARY_FILE_BASE' => 'mnn', 'HEADER_SEARCH_PATHS' => '"$(PODS_TARGET_SRCROOT)/include" "$(PODS_TARGET_SRCROOT)/3rd_party/flatbuffers/include" "$(PODS_TARGET_SRCROOT)/source" "$(PODS_TARGET_SRCROOT)/3rd_party/half" "$(PODS_TARGET_SRCROOT)/source/backend/coreml/mlmodel/include" "$(PODS_TARGET_SRCROOT)/tools/cv/include"', 'GCC_PREPROCESSOR_DEFINITIONS' => '$(inherited) MNN_CODEGEN_REGISTER=1 MNN_SUPPORT_TFLITE_QUAN=1 MNN_METAL_ENABLED=1 MNN_SUPPORT_BF16=1 MNN_COREML_ENABLED=1 USE_LZ4_FLAG=1 MNN_INTERNAL_ENABLED=1'}
s.pod_target_xcconfig = {'METAL_LIBRARY_FILE_BASE' => 'mnn', 'HEADER_SEARCH_PATHS' => '"$(PODS_TARGET_SRCROOT)/include" "$(PODS_TARGET_SRCROOT)/3rd_party/flatbuffers/include" "$(PODS_TARGET_SRCROOT)/source" "$(PODS_TARGET_SRCROOT)/3rd_party/half" "$(PODS_TARGET_SRCROOT)/source/backend/coreml/mlmodel/include" "$(PODS_TARGET_SRCROOT)/tools/cv/include"', 'GCC_PREPROCESSOR_DEFINITIONS' => '$(inherited) MNN_CODEGEN_REGISTER=1 MNN_SUPPORT_TFLITE_QUAN=1 MNN_METAL_ENABLED=1 MNN_SUPPORT_BF16=1 MNN_COREML_ENABLED=1 USE_LZ4_FLAG=1 MNN_INTERNAL_ENABLED=1 MNN_USE_SPARSE_COMPUTE=1'}
s.user_target_xcconfig = { 'OTHER_LDFLAGS' => '-force_load $(BUILD_DIR)/$(CONFIGURATION)$(EFFECTIVE_PLATFORM_NAME)/MNN/libMNN.a', 'HEADER_SEARCH_PATHS' => '"$(PODS_TARGET_SRCROOT)/include"' }
end
@@ -2,13 +2,17 @@

## Linux / macOS / Ubuntu
[Build from source](../compile/tools.html#benchmark), then run the following command:
```bash
./benchmark.out models_folder loop_count warm_up_count forwardtype
./benchmark.out models_folder loop_count warm_up_count forwardtype numberThread precision weightSparsity weightSparseBlockNumber
```
The parameters are:
- models_folder: folder of benchmark models, see [benchmark models](https://github.com/alibaba/MNN/tree/master/benchmark/models).
- loop_count: optional, defaults to 10
- warm_up_count: number of warm-up runs
- forwardtype: optional, defaults to 0 (CPU); available types are 0->CPU, 1->Metal, 3->OpenCL, 6->OpenGL, 7->Vulkan
- numberThread: optional, defaults to 4; the number of CPU threads, or the run mode for GPU backends
- precision: optional, defaults to 2 (precision_low)
- weightSparsity: optional, defaults to 0.0; sparse computation is enabled when weightSparsity > 0.5 and the backend supports it
- weightSparseBlockNumber: optional, defaults to 1; only takes effect when weightSparsity > 0.5. It is the block size for sparse computation: larger values give a bigger sparse speedup, and 1, 4, 8, or 16 are the usual choices (a concrete example invocation is given after the Android section below).
## Android
Run the script `bench_android.sh` directly in the [benchmark directory](https://github.com/alibaba/MNN/tree/master/benchmark/android). It builds for armv7 by default; pass -64 to build for armv8, and -p to push the [benchmarkModels](https://github.com/alibaba/MNN/tree/master/benchmark/models) to the device.
When the script finishes, the results are written to `benchmark.txt` in the [benchmark directory](https://github.com/alibaba/MNN/tree/master/benchmark/android).
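For illustration, a hypothetical invocation of the full eight-argument command line described above (the folder name and all values below are made up for the example, not taken from this change):

```bash
# 10 timed loops, 3 warm-up runs, forwardtype 0 (CPU), 4 threads,
# precision 2 (precision_low), 60% weight sparsity, sparse block size 4
./benchmark.out ./models 10 3 0 4 2 0.6 4
```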
@@ -107,6 +107,7 @@ void Executor::setGlobalExecutorConfig(MNNForwardType type, const BackendConfig&
std::shared_ptr<Runtime> bn(creator->onCreate(info));
mRuntimes[mAttr->firstType] = bn;
}
_refreshRuntime();
}

int Executor::getCurrentRuntimeStatus(RuntimeStatus statusEnum) {
@@ -139,6 +140,7 @@ Executor::Executor(std::shared_ptr<Runtime> backend, MNNForwardType type, int nu
defaultConfig.flags = 4;
std::shared_ptr<Backend> defaultBackend(mRuntimes[DEFAULT_BACKUP_RUNTIME_KEY]->onCreate(&defaultConfig));
mAttr->constantBackend = defaultBackend;
_refreshRuntime();
}
Executor::~Executor(){
// Do nothing
@@ -205,15 +207,38 @@ std::shared_ptr<Executor> Executor::newExecutor(MNNForwardType type,
auto executor = new Executor(runtime, type, numberThread);
return std::shared_ptr<Executor>(executor);
}
void Executor::_refreshRuntime() {
mRuntimeInfo.first.clear();
mRuntimeInfo.second = mRuntimes[DEFAULT_BACKUP_RUNTIME_KEY];
auto firstIter = mRuntimes.find(getAttr()->firstType);
if (firstIter != mRuntimes.end()) {
mRuntimeInfo.first.insert(std::make_pair(firstIter->first.first, firstIter->second));
} else {
MNN_ASSERT(false);
}
for (auto& iter : mRuntimes) {
if (iter.first.first != getAttr()->firstType.first) {
mRuntimeInfo.first.insert(std::make_pair(iter.first.first, iter.second));
}
}
}

RuntimeInfo Executor::getRuntime() {
RuntimeInfo info;
auto glo = ExecutorScope::Current();
info.second = glo->mRuntimes[DEFAULT_BACKUP_RUNTIME_KEY];
for (auto& iter : glo->mRuntimes) {
info.first.insert(std::make_pair(iter.first.first, iter.second));
return glo->mRuntimeInfo;
}
bool Executor::getComputeInfo(EXPRP expr, Interpreter::SessionInfoCode code, void* ptr) {
if (nullptr == expr) {
return false;
}
return info;
if (nullptr == expr->inside()->mCache.get()) {
return false;
}
auto session = expr->inside()->mCache->getSession();
if (nullptr == session) {
return false;
}
return session->getInfo(code, ptr);
}

static bool loadCache(std::shared_ptr<Runtime> &rt, const void* buffer, size_t size) {
@@ -352,6 +377,7 @@ Executor::RuntimeManager* Executor::RuntimeManager::createRuntimeManager(const S
} else {
res->mInside->mUserConfig = false;
}
glo->_refreshRuntime();
return res;
}
ExecutorAttr* Executor::getAttr() const {
@@ -603,6 +629,7 @@ void Executor::_makeCache(const std::vector<EXPRP>& expr, bool forceCPU) {
scheduleInfo.pipelineInfo[0].first.info.type = MNN_FORWARD_CPU;
} else {
scheduleInfo.pipelineInfo[0].first.info.type = current->getAttr()->firstType.first;
scheduleInfo.pipelineInfo[0].first.info.numThread = current->getAttr()->firstType.second;
}
scheduleInfo.pipelineInfo[0].first.needComputeShape = false;
scheduleInfo.pipelineInfo[0].first.needComputeGeometry = mLazyMode != LAZY_CONTENT;
@@ -343,6 +343,9 @@ public:

/** Resize Info, int*, 0: ready to execute, 1: need malloc, 2: need resize */
RESIZE_STATUS = 3,

/** Mode / NumberThread, int* */
THREAD_NUMBER = 4,

ALL
};
@@ -69,6 +69,6 @@ MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \
#define STR(x) STR_IMP(x)
#define MNN_VERSION_MAJOR 2
#define MNN_VERSION_MINOR 5
#define MNN_VERSION_PATCH 0
#define MNN_VERSION_PATCH 1
#define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." STR(MNN_VERSION_PATCH)
#endif /* MNNDefine_h */
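As a brief aside on the version macros in this hunk: `STR(x)` stringizes after macro expansion, so with the bumped patch number `MNN_VERSION` evaluates to the string "2.5.1". A minimal sketch of the pattern (the definition of `STR_IMP` is not shown in the hunk; the usual two-level stringize macro is assumed here):

```cpp
#include <cstdio>

#define STR_IMP(x) #x   // assumed definition: plain stringize
#define STR(x) STR_IMP(x)
#define MNN_VERSION_MAJOR 2
#define MNN_VERSION_MINOR 5
#define MNN_VERSION_PATCH 1
#define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." STR(MNN_VERSION_PATCH)

int main() {
    std::printf("%s\n", MNN_VERSION); // prints: 2.5.1
    return 0;
}
```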
@@ -133,11 +133,15 @@ public:
friend class StaticModule;
RuntimeManager();
};
static bool getComputeInfo(EXPRP expr, Interpreter::SessionInfoCode code, void* ptr);
private:
void _refreshRuntime();
Executor(std::shared_ptr<Runtime> backend, MNNForwardType type, int numberThread);
void _makeCache(const std::vector<EXPRP>& outputs, bool forceCPU);

// TODO: Remove mRuntimes, only use mRuntimeInfo
std::map<std::pair<MNNForwardType, int>, std::shared_ptr<Runtime>> mRuntimes;
RuntimeInfo mRuntimeInfo;
std::shared_ptr<DebugTools> mDebug;
std::map<std::string, std::shared_ptr<SubGraph>> mSubGraph;
LazyMode mLazyMode = LAZY_FULL;
@ -3953,7 +3953,7 @@
|
|||
CODE_SIGN_STYLE = Automatic;
|
||||
DEAD_CODE_STRIPPING = YES;
|
||||
DEFINES_MODULE = YES;
|
||||
DEVELOPMENT_TEAM = Q48UX93J22;
|
||||
DEVELOPMENT_TEAM = 6G7464HHUS;
|
||||
DYLIB_COMPATIBILITY_VERSION = 1;
|
||||
DYLIB_CURRENT_VERSION = 1;
|
||||
DYLIB_INSTALL_NAME_BASE = "@rpath";
|
||||
|
@ -3971,6 +3971,7 @@
|
|||
"ENABLE_ARMV82=1",
|
||||
"MNN_COREML_ENABLED=1",
|
||||
"USE_LZ4_FLAG=1",
|
||||
"MNN_USE_SPARSE_COMPUTE=1",
|
||||
);
|
||||
GCC_SYMBOLS_PRIVATE_EXTERN = YES;
|
||||
GCC_WARN_SHADOW = NO;
|
||||
|
@ -3995,7 +3996,7 @@
|
|||
METAL_LIBRARY_FILE_BASE = mnn;
|
||||
ONLY_ACTIVE_ARCH = YES;
|
||||
OTHER_CFLAGS = "";
|
||||
PRODUCT_BUNDLE_IDENTIFIER = jiuqi.bbbbb.test;
|
||||
PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.playground.abcd;
|
||||
PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)";
|
||||
PROVISIONING_PROFILE_SPECIFIER = "";
|
||||
"PROVISIONING_PROFILE_SPECIFIER[sdk=macosx*]" = "";
|
||||
|
@ -4016,7 +4017,7 @@
|
|||
CODE_SIGN_STYLE = Automatic;
|
||||
DEAD_CODE_STRIPPING = YES;
|
||||
DEFINES_MODULE = YES;
|
||||
DEVELOPMENT_TEAM = Q48UX93J22;
|
||||
DEVELOPMENT_TEAM = 6G7464HHUS;
|
||||
DYLIB_COMPATIBILITY_VERSION = 1;
|
||||
DYLIB_CURRENT_VERSION = 1;
|
||||
DYLIB_INSTALL_NAME_BASE = "@rpath";
|
||||
|
@ -4033,6 +4034,7 @@
|
|||
"ENABLE_ARMV82=1",
|
||||
"MNN_COREML_ENABLED=1",
|
||||
"USE_LZ4_FLAG=1",
|
||||
"MNN_USE_SPARSE_COMPUTE=1",
|
||||
);
|
||||
GCC_SYMBOLS_PRIVATE_EXTERN = YES;
|
||||
GCC_WARN_SHADOW = YES;
|
||||
|
@ -4056,7 +4058,7 @@
|
|||
MACH_O_TYPE = staticlib;
|
||||
METAL_LIBRARY_FILE_BASE = mnn;
|
||||
OTHER_CFLAGS = "";
|
||||
PRODUCT_BUNDLE_IDENTIFIER = jiuqi.bbbbb.test;
|
||||
PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.playground.abcd;
|
||||
PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)";
|
||||
PROVISIONING_PROFILE_SPECIFIER = "";
|
||||
"PROVISIONING_PROFILE_SPECIFIER[sdk=macosx*]" = "";
|
||||
|
@ -4075,7 +4077,7 @@
|
|||
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
|
||||
ASSETCATALOG_COMPILER_LAUNCHIMAGE_NAME = LaunchImage;
|
||||
CODE_SIGN_STYLE = Automatic;
|
||||
DEVELOPMENT_TEAM = Q48UX93J22;
|
||||
DEVELOPMENT_TEAM = 6G7464HHUS;
|
||||
GCC_ENABLE_CPP_EXCEPTIONS = NO;
|
||||
GCC_ENABLE_CPP_RTTI = NO;
|
||||
HEADER_SEARCH_PATHS = (
|
||||
|
@ -4088,7 +4090,7 @@
|
|||
IPHONEOS_DEPLOYMENT_TARGET = 9.0;
|
||||
LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
|
||||
OTHER_CPLUSPLUSFLAGS = "$(OTHER_CFLAGS)";
|
||||
PRODUCT_BUNDLE_IDENTIFIER = jiuqi.bbbbb.test;
|
||||
PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.playground.abcd;
|
||||
PRODUCT_NAME = "$(TARGET_NAME)";
|
||||
TARGETED_DEVICE_FAMILY = "1,2";
|
||||
};
|
||||
|
@ -4100,7 +4102,7 @@
|
|||
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
|
||||
ASSETCATALOG_COMPILER_LAUNCHIMAGE_NAME = LaunchImage;
|
||||
CODE_SIGN_STYLE = Automatic;
|
||||
DEVELOPMENT_TEAM = Q48UX93J22;
|
||||
DEVELOPMENT_TEAM = 6G7464HHUS;
|
||||
GCC_ENABLE_CPP_EXCEPTIONS = NO;
|
||||
GCC_ENABLE_CPP_RTTI = NO;
|
||||
HEADER_SEARCH_PATHS = (
|
||||
|
@ -4113,7 +4115,7 @@
|
|||
IPHONEOS_DEPLOYMENT_TARGET = 9.0;
|
||||
LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
|
||||
OTHER_CPLUSPLUSFLAGS = "$(OTHER_CFLAGS)";
|
||||
PRODUCT_BUNDLE_IDENTIFIER = jiuqi.bbbbb.test;
|
||||
PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.playground.abcd;
|
||||
PRODUCT_NAME = "$(TARGET_NAME)";
|
||||
TARGETED_DEVICE_FAMILY = "1,2";
|
||||
};
|
||||
|
|
|
@ -23,6 +23,10 @@ USE_TRT=False
|
|||
if len(sys.argv) > 1 and sys.argv[1] == '-trt':
|
||||
USE_TRT=True
|
||||
|
||||
USE_CUDA=False
|
||||
if len(sys.argv) > 1 and sys.argv[1] == '-cuda':
|
||||
USE_CUDA=True
|
||||
|
||||
def build_deps():
|
||||
if os.path.isdir('../../schema/private'):
|
||||
IS_INTERNAL_BUILD = args.internal
|
||||
|
@ -49,6 +53,7 @@ def build_deps():
|
|||
-DCMAKE_LIBRARY_PATH=/usr/local/cuda/lib64/stubs/ ' if USE_TRT else ' '
|
||||
extra_opts += ' -DMNN_INTERNAL=ON ' if IS_INTERNAL_BUILD else ' '
|
||||
extra_opts += ' -DMNN_BUILD_TORCH=ON ' if IS_BUILD_TORCH else ' '
|
||||
extra_opts += ' -DMNN_CUDA=ON ' if USE_CUDA else ' '
|
||||
os.system('cmake ' + extra_opts +
|
||||
'-DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release \
|
||||
-DMNN_BUILD_SHARED_LIBS=OFF -DMNN_AAPL_FMWK=OFF -DMNN_SEP_BUILD=OFF -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON \
|
||||
|
|
|
@ -9,6 +9,10 @@
|
|||
|
||||
set -e
|
||||
|
||||
echo "clean build cache:"
|
||||
echo ">>> rm -rf build dist *.egg-info wheelhouse/*"
|
||||
rm -rf build dist *.egg-info wheelhouse/*
|
||||
|
||||
PROJECT_ROOT=$(cd `dirname $0`;cd ../../;pwd)
|
||||
echo $PROJECT_ROOT
|
||||
export PROJECT_ROOT
|
||||
|
@ -17,6 +21,8 @@ for PYBIN in /opt/python/*/bin; do
|
|||
"${PYBIN}/pip" install -U numpy
|
||||
if [ "$1" == "-trt" ]; then
|
||||
USE_TRT=true "${PYBIN}/python" setup.py bdist_wheel
|
||||
elif [ "$1" == "-cuda" ]; then
|
||||
USE_CUDA=true "${PYBIN}/python" setup.py bdist_wheel
|
||||
else
|
||||
"${PYBIN}/python" setup.py bdist_wheel
|
||||
fi
|
||||
|
@ -26,6 +32,8 @@ done
|
|||
for whl in dist/*.whl; do
|
||||
if [ "$1" == "-trt" ]; then
|
||||
LD_LIBRARY_PATH=${PROJECT_ROOT}/pymnn_build/source/backend/tensorrt:$LD_LIBRARY_PATH auditwheel repair "$whl" --plat manylinux2014_x86_64 -w wheelhouse/
|
||||
elif [ "$1" == "-cuda" ]; then
|
||||
LD_LIBRARY_PATH=${PROJECT_ROOT}/pymnn_build/source/backend/cuda:$LD_LIBRARY_PATH auditwheel repair "$whl" --plat manylinux2014_x86_64 -w wheelhouse/
|
||||
else
|
||||
auditwheel repair "$whl" --plat manylinux2014_x86_64 -w wheelhouse/
|
||||
fi
|
||||
|
|
|
@ -59,9 +59,11 @@ def report(*args):
|
|||
|
||||
package_name = 'MNN'
|
||||
USE_TRT=check_env_flag('USE_TRT')
|
||||
USE_CUDA = check_env_flag("USE_CUDA")
|
||||
IS_INTERNAL_BUILD = False
|
||||
|
||||
print ("USE_TRT ", USE_TRT)
|
||||
print("USE_CUDA:", USE_CUDA)
|
||||
|
||||
if os.path.isdir('../../schema/private'):
|
||||
IS_INTERNAL_BUILD = args.serving
|
||||
|
@ -149,7 +151,8 @@ def configure_extension_build():
|
|||
engine_library_dirs += [os.path.join(root_dir, BUILD_DIR, "tools", "train")]
|
||||
engine_library_dirs += [os.path.join(root_dir, BUILD_DIR, "tools", "cv")]
|
||||
engine_library_dirs += [os.path.join(root_dir, BUILD_DIR, "source", "backend", "tensorrt")]
|
||||
if USE_TRT:
|
||||
engine_library_dirs += [os.path.join(root_dir, BUILD_DIR, "source", "backend", "cuda")]
|
||||
if USE_TRT or USE_CUDA:
|
||||
# Note: TensorRT-5.1.5.0/lib should be set in $LIBRARY_PATH of the build system.
|
||||
engine_library_dirs += ['/usr/local/cuda/lib64/']
|
||||
|
||||
|
@ -187,6 +190,7 @@ def configure_extension_build():
|
|||
engine_include_dirs += [np.get_include()]
|
||||
|
||||
trt_depend = ['-lTRT_CUDA_PLUGIN', '-lnvinfer', '-lnvparsers', '-lnvinfer_plugin', '-lcudart']
|
||||
cuda_depend = ['-lMNN_Cuda_Main']
|
||||
engine_depend = ['-lMNN']
|
||||
|
||||
# enable logging & model authentication on linux.
|
||||
|
@ -196,12 +200,16 @@ def configure_extension_build():
|
|||
if USE_TRT:
|
||||
engine_depend += trt_depend
|
||||
|
||||
if USE_CUDA:
|
||||
engine_depend += cuda_depend
|
||||
|
||||
tools_compile_args = []
|
||||
tools_libraries = []
|
||||
tools_depend = ['-lMNN', '-lMNNConvertDeps', '-lprotobuf']
|
||||
tools_library_dirs = [os.path.join(root_dir, BUILD_DIR)]
|
||||
tools_library_dirs += [os.path.join(root_dir, BUILD_DIR, "tools", "converter")]
|
||||
tools_library_dirs += [os.path.join(root_dir, BUILD_DIR, "source", "backend", "tensorrt")]
|
||||
tools_library_dirs += [os.path.join(root_dir, BUILD_DIR, "source", "backend", "cuda")]
|
||||
tools_library_dirs += [os.path.join(root_dir, BUILD_DIR, "3rd_party", "protobuf", "cmake")]
|
||||
|
||||
# add libTorch dependency
|
||||
|
@ -227,7 +235,7 @@ def configure_extension_build():
|
|||
os.path.join(torch_lib, 'libc10.dylib')]),
|
||||
('.dylibs', [os.path.join(torch_path, '.dylibs', 'libiomp5.dylib')])]
|
||||
'''
|
||||
if USE_TRT:
|
||||
if USE_TRT or USE_CUDA:
|
||||
# Note: TensorRT-5.1.5.0/lib should be set in $LIBRARY_PATH of the build system.
|
||||
tools_library_dirs += ['/usr/local/cuda/lib64/']
|
||||
|
||||
|
@ -269,6 +277,9 @@ def configure_extension_build():
|
|||
if USE_TRT:
|
||||
tools_depend += trt_depend
|
||||
|
||||
if USE_CUDA:
|
||||
tools_depend += cuda_depend
|
||||
|
||||
if IS_DARWIN:
|
||||
engine_link_args += ['-stdlib=libc++']
|
||||
engine_link_args += ['-Wl,-all_load']
|
||||
|
|
|
@ -942,6 +942,9 @@ struct IDSTQuanT : public flatbuffers::NativeTable {
|
|||
int32_t aMin;
|
||||
int32_t readType;
|
||||
bool has_scaleInt;
|
||||
bool shapeInt32;
|
||||
uint32_t weightSize;
|
||||
std::vector<uint32_t> index;
|
||||
IDSTQuanT()
|
||||
: type(0),
|
||||
useInt32(false),
|
||||
|
@ -951,7 +954,9 @@ struct IDSTQuanT : public flatbuffers::NativeTable {
|
|||
aMax(0),
|
||||
aMin(0),
|
||||
readType(0),
|
||||
has_scaleInt(false) {
|
||||
has_scaleInt(false),
|
||||
shapeInt32(false),
|
||||
weightSize(0) {
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -993,6 +998,15 @@ struct IDSTQuan FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
|
|||
bool has_scaleInt() const {
|
||||
return GetField<uint8_t>(24, 0) != 0;
|
||||
}
|
||||
bool shapeInt32() const {
|
||||
return GetField<uint8_t>(26, 0) != 0;
|
||||
}
|
||||
uint32_t weightSize() const {
|
||||
return GetField<uint32_t>(28, 0);
|
||||
}
|
||||
const flatbuffers::Vector<uint32_t> *index() const {
|
||||
return GetPointer<const flatbuffers::Vector<uint32_t> *>(30);
|
||||
}
|
||||
bool Verify(flatbuffers::Verifier &verifier) const {
|
||||
return VerifyTableStart(verifier) &&
|
||||
VerifyOffset(verifier, 4) &&
|
||||
|
@ -1008,6 +1022,10 @@ struct IDSTQuan FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
|
|||
VerifyField<int32_t>(verifier, 20) &&
|
||||
VerifyField<int32_t>(verifier, 22) &&
|
||||
VerifyField<uint8_t>(verifier, 24) &&
|
||||
VerifyField<uint8_t>(verifier, 26) &&
|
||||
VerifyField<uint32_t>(verifier, 28) &&
|
||||
VerifyOffset(verifier, 30) &&
|
||||
verifier.VerifyVector(index()) &&
|
||||
verifier.EndTable();
|
||||
}
|
||||
IDSTQuanT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
|
||||
|
@ -1051,6 +1069,15 @@ struct IDSTQuanBuilder {
|
|||
void add_has_scaleInt(bool has_scaleInt) {
|
||||
fbb_.AddElement<uint8_t>(24, static_cast<uint8_t>(has_scaleInt), 0);
|
||||
}
|
||||
void add_shapeInt32(bool shapeInt32) {
|
||||
fbb_.AddElement<uint8_t>(26, static_cast<uint8_t>(shapeInt32), 0);
|
||||
}
|
||||
void add_weightSize(uint32_t weightSize) {
|
||||
fbb_.AddElement<uint32_t>(28, weightSize, 0);
|
||||
}
|
||||
void add_index(flatbuffers::Offset<flatbuffers::Vector<uint32_t>> index) {
|
||||
fbb_.AddOffset(30, index);
|
||||
}
|
||||
explicit IDSTQuanBuilder(flatbuffers::FlatBufferBuilder &_fbb)
|
||||
: fbb_(_fbb) {
|
||||
start_ = fbb_.StartTable();
|
||||
|
@ -1075,8 +1102,13 @@ inline flatbuffers::Offset<IDSTQuan> CreateIDSTQuan(
|
|||
int32_t aMax = 0,
|
||||
int32_t aMin = 0,
|
||||
int32_t readType = 0,
|
||||
bool has_scaleInt = false) {
|
||||
bool has_scaleInt = false,
|
||||
bool shapeInt32 = false,
|
||||
uint32_t weightSize = 0,
|
||||
flatbuffers::Offset<flatbuffers::Vector<uint32_t>> index = 0) {
|
||||
IDSTQuanBuilder builder_(_fbb);
|
||||
builder_.add_index(index);
|
||||
builder_.add_weightSize(weightSize);
|
||||
builder_.add_readType(readType);
|
||||
builder_.add_aMin(aMin);
|
||||
builder_.add_aMax(aMax);
|
||||
|
@ -1086,6 +1118,7 @@ inline flatbuffers::Offset<IDSTQuan> CreateIDSTQuan(
|
|||
builder_.add_type(type);
|
||||
builder_.add_alpha(alpha);
|
||||
builder_.add_buffer(buffer);
|
||||
builder_.add_shapeInt32(shapeInt32);
|
||||
builder_.add_has_scaleInt(has_scaleInt);
|
||||
builder_.add_useInt32(useInt32);
|
||||
return builder_.Finish();
|
||||
|
@ -4390,6 +4423,9 @@ inline void IDSTQuan::UnPackTo(IDSTQuanT *_o, const flatbuffers::resolver_functi
|
|||
{ auto _e = aMin(); _o->aMin = _e; };
|
||||
{ auto _e = readType(); _o->readType = _e; };
|
||||
{ auto _e = has_scaleInt(); _o->has_scaleInt = _e; };
|
||||
{ auto _e = shapeInt32(); _o->shapeInt32 = _e; };
|
||||
{ auto _e = weightSize(); _o->weightSize = _e; };
|
||||
{ auto _e = index(); if (_e) { _o->index.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->index[_i] = _e->Get(_i); } } };
|
||||
}
|
||||
|
||||
inline flatbuffers::Offset<IDSTQuan> IDSTQuan::Pack(flatbuffers::FlatBufferBuilder &_fbb, const IDSTQuanT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
|
||||
|
@ -4411,6 +4447,9 @@ inline flatbuffers::Offset<IDSTQuan> CreateIDSTQuan(flatbuffers::FlatBufferBuild
|
|||
auto _aMin = _o->aMin;
|
||||
auto _readType = _o->readType;
|
||||
auto _has_scaleInt = _o->has_scaleInt;
|
||||
auto _shapeInt32 = _o->shapeInt32;
|
||||
auto _weightSize = _o->weightSize;
|
||||
auto _index = _o->index.size() ? _fbb.CreateVector(_o->index) : 0;
|
||||
return MNN::CreateIDSTQuan(
|
||||
_fbb,
|
||||
_buffer,
|
||||
|
@ -4423,7 +4462,10 @@ inline flatbuffers::Offset<IDSTQuan> CreateIDSTQuan(flatbuffers::FlatBufferBuild
|
|||
_aMax,
|
||||
_aMin,
|
||||
_readType,
|
||||
_has_scaleInt);
|
||||
_has_scaleInt,
|
||||
_shapeInt32,
|
||||
_weightSize,
|
||||
_index);
|
||||
}
|
||||
|
||||
inline QuantizedFloatParamT *QuantizedFloatParam::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
|
||||
|
@ -5908,7 +5950,10 @@ inline const flatbuffers::TypeTable *IDSTQuanTypeTable() {
|
|||
{ flatbuffers::ET_INT, 0, -1 },
|
||||
{ flatbuffers::ET_INT, 0, -1 },
|
||||
{ flatbuffers::ET_INT, 0, -1 },
|
||||
{ flatbuffers::ET_BOOL, 0, -1 }
|
||||
{ flatbuffers::ET_BOOL, 0, -1 },
|
||||
{ flatbuffers::ET_BOOL, 0, -1 },
|
||||
{ flatbuffers::ET_UINT, 0, -1 },
|
||||
{ flatbuffers::ET_UINT, 1, -1 }
|
||||
};
|
||||
static const char * const names[] = {
|
||||
"buffer",
|
||||
|
@ -5921,10 +5966,13 @@ inline const flatbuffers::TypeTable *IDSTQuanTypeTable() {
|
|||
"aMax",
|
||||
"aMin",
|
||||
"readType",
|
||||
"has_scaleInt"
|
||||
"has_scaleInt",
|
||||
"shapeInt32",
|
||||
"weightSize",
|
||||
"index"
|
||||
};
|
||||
static const flatbuffers::TypeTable tt = {
|
||||
flatbuffers::ST_TABLE, 11, type_codes, nullptr, nullptr, names
|
||||
flatbuffers::ST_TABLE, 14, type_codes, nullptr, nullptr, names
|
||||
};
|
||||
return &tt;
|
||||
}
|
||||
|
|
|
@@ -65,6 +65,10 @@ table IDSTQuan {
aMin:int;
readType:int;
has_scaleInt:bool;
shapeInt32:bool = false;
// For sparse
weightSize:uint32;
index:[uint32];
}

enum QuantizeAlgo : byte {
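One plausible reading of the new `weightSize` and `index` fields (an interpretation of the "For sparse" comment, not something stated in this hunk): a sparse weight keeps only its non-zero values in the quantized buffer, `index` holds their positions in the flattened dense tensor, and `weightSize` is the dense element count. A sketch of expanding such a pair back to a dense buffer under that assumption:

```cpp
#include <cstdint>
#include <vector>

// Hypothetical expansion of a sparse (values, index) pair into a dense buffer.
// denseSize plays the role of the schema's weightSize field; the semantics are
// assumed for illustration, not taken from the MNN implementation.
std::vector<int8_t> expandSparseWeight(const std::vector<int8_t>& values,
                                       const std::vector<uint32_t>& index,
                                       uint32_t denseSize) {
    std::vector<int8_t> dense(denseSize, 0);          // absent entries stay zero
    for (size_t i = 0; i < index.size() && i < values.size(); ++i) {
        dense[index[i]] = values[i];                  // scatter non-zero entries
    }
    return dense;
}
```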
@ -263,100 +263,6 @@ void executeVec(void* outputRaw, const void* inputRaw0, const void* inputRaw1, i
|
|||
}
|
||||
}
|
||||
|
||||
template<typename Func, typename V, int pack>
|
||||
void executeVecInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast) {
|
||||
Func compute;
|
||||
int sizeDivUnit = elementSize / pack;
|
||||
int remainCount = elementSize - sizeDivUnit * pack;
|
||||
#ifdef MNN_USE_NEON
|
||||
sizeDivUnit = (elementSize * 4) / pack;
|
||||
remainCount = (elementSize * 4) - sizeDivUnit * pack;
|
||||
#endif
|
||||
auto src0 = inputRaw0;
|
||||
auto src1 = inputRaw1;
|
||||
auto dst = (int8_t*)outputRaw;
|
||||
#ifdef MNN_USE_SSE
|
||||
V zeroPointV((uint8_t)(128));
|
||||
#else
|
||||
V zeroPointV((uint8_t)(0));
|
||||
#endif
|
||||
if (-1 == needBroadcast) {
|
||||
if (sizeDivUnit > 0) {
|
||||
for (int i = 0; i < sizeDivUnit; ++i) {
|
||||
V a = V::load(src0);
|
||||
a -= zeroPointV;
|
||||
V b = V::load(src1);
|
||||
b -= zeroPointV;
|
||||
V::save(dst, compute(a, b) + zeroPointV);
|
||||
src0 += pack;
|
||||
src1 += pack;
|
||||
dst += pack;
|
||||
}
|
||||
}
|
||||
if (remainCount > 0) {
|
||||
int8_t tempSrc0[pack];
|
||||
int8_t tempSrc1[pack];
|
||||
int8_t tempDst[pack];
|
||||
::memcpy(tempSrc0, src0, remainCount * sizeof(int8_t));
|
||||
::memcpy(tempSrc1, src1, remainCount * sizeof(int8_t));
|
||||
V a = V::load(tempSrc0);
|
||||
a -= zeroPointV;
|
||||
V b = V::load(tempSrc1);
|
||||
b -= zeroPointV;
|
||||
V::save(tempDst, compute(a, b) + zeroPointV);
|
||||
::memcpy(dst, tempDst, remainCount * sizeof(int8_t));
|
||||
}
|
||||
} else if (0 == needBroadcast) {
|
||||
const int8_t srcValue0 = src0[0];
|
||||
V a = V(srcValue0);
|
||||
a -= zeroPointV;
|
||||
if (sizeDivUnit > 0) {
|
||||
for (int i = 0; i < sizeDivUnit; ++i) {
|
||||
const auto src1Ptr = src1;
|
||||
auto dstPtr = dst;
|
||||
V b = V::load(src1Ptr);
|
||||
b -= zeroPointV;
|
||||
V::save(dstPtr, compute(a, b) + zeroPointV);
|
||||
src1 += pack;
|
||||
dst += pack;
|
||||
}
|
||||
}
|
||||
if (remainCount > 0) {
|
||||
int8_t tempSrc1[pack];
|
||||
int8_t tempDst[pack];
|
||||
::memcpy(tempSrc1, src1, remainCount * sizeof(int8_t));
|
||||
V b = V::load(tempSrc1);
|
||||
b -= zeroPointV;
|
||||
V::save(tempDst, compute(a, b) + zeroPointV);
|
||||
::memcpy(dst, tempDst, remainCount * sizeof(int8_t));
|
||||
}
|
||||
} else {
|
||||
const int8_t srcValue1 = src1[0];
|
||||
V b = V(srcValue1);
|
||||
b -= zeroPointV;
|
||||
if (sizeDivUnit > 0) {
|
||||
for (int i = 0; i < sizeDivUnit; ++i) {
|
||||
const auto src0Ptr = src0;
|
||||
auto dstPtr = dst;
|
||||
V a = V::load(src0Ptr);
|
||||
a -= zeroPointV;
|
||||
V::save(dstPtr, compute(a, b) + zeroPointV);
|
||||
src0 += pack;
|
||||
dst += pack;
|
||||
}
|
||||
}
|
||||
if (remainCount > 0) {
|
||||
int8_t tempSrc0[pack];
|
||||
int8_t tempDst[pack];
|
||||
::memcpy(tempSrc0, src0, remainCount * sizeof(int8_t));
|
||||
V a = V::load(tempSrc0);
|
||||
a -= zeroPointV;
|
||||
V::save(tempDst, compute(a, b) +zeroPointV);
|
||||
::memcpy(dst, tempDst, remainCount * sizeof(int8_t));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Vec>
|
||||
struct VecBinaryAdd {
|
||||
Vec operator()(Vec& x, Vec& y) const {
|
||||
|
@ -426,43 +332,49 @@ void execute(void* outputRaw, const void* inputRaw0, const void* inputRaw1, int
|
|||
template<typename Tin, typename Tout, typename Func>
|
||||
void executeInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast) {
|
||||
Func f;
|
||||
int input0DataCount = elementSize;
|
||||
int input1DataCount = elementSize;
|
||||
int size = elementSize;
|
||||
#ifdef MNN_USE_NEON
|
||||
input0DataCount = elementSize * 4;
|
||||
input1DataCount = elementSize * 4;
|
||||
size *= 4;
|
||||
#endif
|
||||
const Tin* input0Data = (const Tin*)inputRaw0;
|
||||
const Tin* input1Data = (const Tin*)inputRaw1;
|
||||
Tout* outputData = (Tout*)outputRaw;
|
||||
|
||||
|
||||
float inp0 = 0, inp1 = 0, output = 0;
|
||||
#ifdef MNN_USE_SSE
|
||||
const uint8_t zeroPoint = 128;
|
||||
const int zeroPoint = 128;
|
||||
const int maxValue = 255;
|
||||
const int minValue = 0;
|
||||
const uint8_t* inputData0 = (uint8_t*)inputRaw0;
|
||||
const uint8_t* inputData1 = (uint8_t*)inputRaw1;
|
||||
uint8_t* outputData = (uint8_t*)outputRaw;
|
||||
#else
|
||||
const uint8_t zeroPoint = 0;
|
||||
const int zeroPoint = 0;
|
||||
const int maxValue = 127;
|
||||
const int minValue = -128;
|
||||
const int8_t* inputData0 = (int8_t*)inputRaw0;
|
||||
const int8_t* inputData1 = (int8_t*)inputRaw1;
|
||||
int8_t* outputData = (int8_t*)outputRaw;
|
||||
#endif
|
||||
if (needBroadcast == 0) { // data count == 1, not only mean scalar input, maybe of shape (1, 1, 1, ...,1)
|
||||
for (int i = 0; i < input1DataCount; i++) {
|
||||
inp0 = static_cast<float>((int8_t)(inputRaw0[0] - zeroPoint)) * inputScale0[i];
|
||||
inp1 = static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i];
|
||||
for (int i = 0; i < size; ++i) {
|
||||
if (needBroadcast == 0) {
|
||||
inp0 = (inputData0[0]- zeroPoint) * inputScale0[i];
|
||||
inp1 = (inputData1[i]- zeroPoint) * inputScale1[i];
|
||||
output = f(inp0, inp1);
|
||||
outputData[i] = (Tout)(output * outputScale[i] + zeroPoint);
|
||||
}
|
||||
} else if (needBroadcast == 1) {
|
||||
for (int i = 0; i < input0DataCount; i++) {
|
||||
inp0 = static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i];
|
||||
inp1 = static_cast<float>((int8_t)(inputRaw1[0] - zeroPoint)) * inputScale1[i];
|
||||
} else if (needBroadcast == 1) {
|
||||
inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
|
||||
inp1 = (inputData1[0] - zeroPoint) * inputScale1[i];
|
||||
output = f(inp0, inp1);
|
||||
outputData[i] = (Tout)(output * outputScale[i] + zeroPoint);
|
||||
}
|
||||
} else { // both input contains more than one element,which means no scalar input
|
||||
for (int i = 0; i < input0DataCount; i++) {
|
||||
inp0 = static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i];
|
||||
inp1 = static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i];
|
||||
} else {
|
||||
inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
|
||||
inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
|
||||
output = f(inp0, inp1);
|
||||
outputData[i] = (Tout)(output * outputScale[i] + zeroPoint);
|
||||
}
|
||||
int value = (int)roundf(output * outputScale[i]) + zeroPoint;
|
||||
if (value > maxValue) {
|
||||
value = maxValue;
|
||||
}
|
||||
if (value < minValue) {
|
||||
value = minValue;
|
||||
}
|
||||
outputData[i] = value;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
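For reference, the rewritten scalar int8 path above follows the usual dequantize / requantize pattern: subtract the zero point, scale to float, apply the binary op, then rescale, round, add the zero point back, and clamp to the representable range. A simplified standalone sketch of that arithmetic (the zero point, clamp bounds, and the use of addition as the binary op are illustrative choices; the real code selects them per build and per operator):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

// One element of a quantized binary add, mirroring the scalar int8 path.
int8_t quantizedAdd(int8_t a, int8_t b, float scaleA, float scaleB, float outScale) {
    const int zeroPoint = 0;                     // SSE builds use uint8 data with zeroPoint = 128
    const int minValue = -128, maxValue = 127;
    const float fa = (a - zeroPoint) * scaleA;   // dequantize
    const float fb = (b - zeroPoint) * scaleB;
    const float out = fa + fb;                   // the binary op in float
    int value = static_cast<int>(std::round(out * outScale)) + zeroPoint;
    value = std::min(maxValue, std::max(minValue, value));  // clamp to int8 range
    return static_cast<int8_t>(value);
}
```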
@ -104,9 +104,16 @@ float CPURuntime::onGetMemoryInMB() {
|
|||
auto staticMemoryInMB = mStaticAllocator->totalSize() / 1024.0f / 1024.0f;
|
||||
return staticMemoryInMB;
|
||||
}
|
||||
|
||||
|
||||
|
||||
bool CPURuntime::onCheckInfo(Backend::Info& info) const {
|
||||
#ifdef MNN_USE_THREAD_POOL
|
||||
int threadNumber = mThreadNumber;
|
||||
if (mTaskIndex < 0) {
|
||||
threadNumber = 1;
|
||||
}
|
||||
info.numThread = threadNumber;
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
||||
Backend* CPURuntime::onCreate(const BackendConfig* config) const {
|
||||
auto precision = mPrecision;
|
||||
|
|
|
@ -31,6 +31,8 @@ public:
|
|||
}
|
||||
void onConcurrencyBegin() const;
|
||||
void onConcurrencyEnd() const;
|
||||
virtual bool onCheckInfo(Backend::Info& info) const override;
|
||||
|
||||
|
||||
private:
|
||||
std::shared_ptr<BufferAllocator> mStaticAllocator;
|
||||
|
|
|
@ -35,13 +35,12 @@ ErrorCode CPUBinaryInt8::onResize(const std::vector<Tensor*>& inputs, const std:
|
|||
}
|
||||
MNN_ASSERT(mTotalSize == ((CPUBackend*)backend())->getTensorSize(outputs[0]));
|
||||
|
||||
std::vector<float> scale0(mTotalSize), scale1(mTotalSize), outputScale(mTotalSize);
|
||||
std::fill(scale0.begin(), scale0.end(), TensorUtils::getDescribe(inputs[0])->quantAttr->scale);
|
||||
std::fill(scale1.begin(), scale1.end(), TensorUtils::getDescribe(inputs[1])->quantAttr->scale);
|
||||
std::fill(outputScale.begin(), outputScale.end(), 1 / TensorUtils::getDescribe(outputs[0])->quantAttr->scale);
|
||||
mInputQuant0 = scale0;
|
||||
mInputQuant1 = scale1;
|
||||
mOutputQuant = outputScale;
|
||||
mInputQuant0.resize(mTotalSize);
|
||||
mInputQuant1.resize(mTotalSize);
|
||||
mOutputQuant.resize(mTotalSize);
|
||||
std::fill(mInputQuant0.begin(), mInputQuant0.end(), TensorUtils::getDescribe(inputs[0])->quantAttr->scale);
|
||||
std::fill(mInputQuant1.begin(), mInputQuant1.end(), TensorUtils::getDescribe(inputs[1])->quantAttr->scale);
|
||||
std::fill(mOutputQuant.begin(), mOutputQuant.end(), 1 / TensorUtils::getDescribe(outputs[0])->quantAttr->scale);
|
||||
|
||||
if(mActivationType == 1 && outputs[0]->getType().code == halide_type_float) {
|
||||
mActivationExe.reset(new CPURelu(backend(), 0.0));
|
||||
|
@ -56,15 +55,10 @@ ErrorCode CPUBinaryInt8::onExecute(const std::vector<Tensor*>& inputs, const std
|
|||
auto output = outputs[0];
|
||||
|
||||
auto schedule = ((CPUBackend*)backend())->multiThreadDivide(mTotalSize);
|
||||
#ifdef MNN_USE_SSE
|
||||
auto input0Ptr = input->host<uint8_t>();
|
||||
auto input1Ptr = input1->host<uint8_t>();
|
||||
auto outputPtr = outputs[0]->host<uint8_t>();
|
||||
#else
|
||||
|
||||
auto input0Ptr = input->host<int8_t>();
|
||||
auto input1Ptr = input1->host<int8_t>();
|
||||
auto outputPtr = outputs[0]->host<int8_t>();
|
||||
#endif
|
||||
|
||||
int inpBytes = 1;
|
||||
int outBytes = 1;
|
||||
|
@ -90,7 +84,7 @@ ErrorCode CPUBinaryInt8::onExecute(const std::vector<Tensor*>& inputs, const std
|
|||
#ifdef MNN_USE_NEON
|
||||
mProc(out, inp0, inp1, scale0, scale1, scaleDst, realSize / 4, mNeedBroadcastIndex);
|
||||
#else
|
||||
mProc((int8_t*)out, (int8_t*)inp0, (int8_t*)inp1, scale0, scale1, scaleDst, realSize, mNeedBroadcastIndex);
|
||||
mProc(out, inp0, inp1, scale0, scale1, scaleDst, realSize, mNeedBroadcastIndex);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
|
|
@ -40,19 +40,21 @@ public:
|
|||
};
|
||||
class CPUConvolution : public Execution {
|
||||
public:
|
||||
struct ResourceDequantizeInfo {
|
||||
int bits = 32;
|
||||
std::shared_ptr<Tensor> mScaleBias;
|
||||
std::vector<int8_t> mLowBitWeightMap;
|
||||
};
|
||||
struct Resource {
|
||||
std::shared_ptr<Tensor> mWeight;
|
||||
std::shared_ptr<Tensor> mBias;
|
||||
ResourceDequantizeInfo mDequantize;
|
||||
Backend* backend;
|
||||
bool copyBiasAlign(const float* bias, int outputCount);
|
||||
~ Resource() {
|
||||
if (nullptr != mBias) {
|
||||
backend->onReleaseBuffer(mBias.get(), Backend::STATIC);
|
||||
}
|
||||
if (nullptr != mWeight) {
|
||||
backend->onReleaseBuffer(mWeight.get(), Backend::STATIC);
|
||||
}
|
||||
}
|
||||
int hU;
|
||||
int lU;
|
||||
int lP;
|
||||
int hP;
|
||||
};
|
||||
struct ResourceInt8 {
|
||||
std::vector<int> mInt8WeightKernelSum;
|
||||
|
|
|
@ -19,7 +19,6 @@
|
|||
#include <vector>
|
||||
#include "../CPURuntime.hpp"
|
||||
#include "common/MemoryFormater.h"
|
||||
#include "common/CommonCompute.hpp"
|
||||
// TODO: Find better way to optimize it
|
||||
#include "../CPUBinary.hpp"
|
||||
#include "../CPUUnary.hpp"
|
||||
|
@ -174,107 +173,6 @@ void MNNUnpackC2Common(T* dst, const T* src, size_t area, size_t depth, int* are
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
source: source matrix is h x l
|
||||
transpose: if false, export compressed matrix as h x l, other export as l x h.
|
||||
*/
|
||||
void MNNPackForSparseMatMul_B(float* dest, unsigned int* NNZMap, int* dataOffsetMap, int sparseBlockOC, const float* source, size_t h, size_t l, const int eP, bool transpose) {
|
||||
// 1. in convolution, source B layout is OC x (KH * KW * IC),
|
||||
// the dest layout of weight is BCSC(block compressed sparse colum) format, which is OC(!=0) x (KH*KW*IC!=0), as a canceled result, just do BCSR, transpose should be false.
|
||||
// 2. in ordinary sparse MatMul, transpose is corresponding to BCSR or BCSC
|
||||
|
||||
// BCSR
|
||||
if (transpose) {
|
||||
int rowOffset = 0;
|
||||
for (int i = 0; i < l; i += 1) {
|
||||
*NNZMap = 0;
|
||||
for(int j = 0; j < h; j += sparseBlockOC) {
|
||||
if(!MNN::CommonCompute::checkAllZeros(source + j * l + i, l, sparseBlockOC, 1)) {
|
||||
*dest = *(source + j * l + l);
|
||||
dest++;
|
||||
*NNZMap = *NNZMap + 1;
|
||||
*dataOffsetMap = rowOffset;
|
||||
dataOffsetMap++;
|
||||
rowOffset = 0;
|
||||
}
|
||||
rowOffset += eP;
|
||||
}
|
||||
NNZMap++;
|
||||
rowOffset -= h * eP;
|
||||
}
|
||||
} else { // BCSC
|
||||
int columOffset = 0;
|
||||
int i = 0;
|
||||
for (; i + sparseBlockOC <= h; i += sparseBlockOC) {
|
||||
*NNZMap = 0;
|
||||
for(int j = 0; j < l; j += 1) {
|
||||
if (!MNN::CommonCompute::checkAllZeros(source, l, sparseBlockOC, 1)) {
|
||||
for (int ioc = 0; ioc < sparseBlockOC; ioc++) {
|
||||
*dest = *(source + ioc * l);
|
||||
dest++;
|
||||
}
|
||||
*NNZMap = *NNZMap + 1;
|
||||
*dataOffsetMap = columOffset;
|
||||
dataOffsetMap++;
|
||||
columOffset = 0;
|
||||
}
|
||||
columOffset += eP;
|
||||
source++;
|
||||
}
|
||||
NNZMap++;
|
||||
source += l * (sparseBlockOC - 1);
|
||||
columOffset -= l * eP;
|
||||
}
|
||||
|
||||
for (; i < h; i++) {
|
||||
*NNZMap = 0;
|
||||
for(int j = 0; j < l; j++) {
|
||||
if (*source != 0.0f) {
|
||||
*dest = *source;
|
||||
dest++;
|
||||
*NNZMap = *NNZMap + 1;
|
||||
*dataOffsetMap = columOffset;
|
||||
dataOffsetMap++;
|
||||
columOffset = 0;
|
||||
}
|
||||
columOffset += eP;
|
||||
source++;
|
||||
}
|
||||
NNZMap++;
|
||||
columOffset -= l * eP;
|
||||
}
|
||||
|
||||
*dataOffsetMap = columOffset; //
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
void MNNGetOptimalBlockShape(size_t& weightNNZElement, size_t& weightBlockNumber, const float* source, int sparseBlockOC, size_t h, size_t l) {
|
||||
size_t nnzBlock = 0;
|
||||
size_t nnzTail = 0;
|
||||
int ocEven = (h / sparseBlockOC) * sparseBlockOC;
|
||||
size_t ioc = 0;
|
||||
for (; ioc < ocEven; ioc += sparseBlockOC) {
|
||||
for (size_t i = 0; i < l; i++) {
|
||||
bool isZero = MNN::CommonCompute::checkAllZeros(source, l, sparseBlockOC, 1);
|
||||
nnzBlock += !isZero;
|
||||
source++;
|
||||
}
|
||||
source += (sparseBlockOC - 1) * l;
|
||||
}
|
||||
for (; ioc < h; ioc++) {
|
||||
for (size_t i = 0; i < l; i++) {
|
||||
bool isZero = (*source) == 0.0f;
|
||||
nnzTail += !isZero;
|
||||
source++;
|
||||
}
|
||||
}
|
||||
weightNNZElement = nnzBlock * sparseBlockOC + nnzTail;
|
||||
weightBlockNumber = nnzBlock + nnzTail;
|
||||
return;
|
||||
}
|
||||
|
||||
#ifndef MNN_USE_NEON
|
||||
|
||||
void MNNGetMatMulPackMode(int* eP, int *lP, int* hP) {
|
||||
|
@ -2875,8 +2773,6 @@ void MNNCoreFunctionInit() {
|
|||
gCoreFunction->MNNPackedMatMulRemain = MNNPackedMatMulRemain;
|
||||
|
||||
gCoreFunction->MNNGetSparseMatMulPackMode = MNNGetSparseMatMulPackMode;
|
||||
gCoreFunction->MNNPackForSparseMatMul_B = MNNPackForSparseMatMul_B; // sparse packing B
|
||||
gCoreFunction->MNNGetOptimalBlockShape = MNNGetOptimalBlockShape;
|
||||
gCoreFunction->MNNAdjustOptimalSparseKernel = _MNNAdjustOptimalSparseKernel;
|
||||
|
||||
gCoreFunction->MNNComputeMatMulForE_1 = MNNComputeMatMulForE_1;
|
||||
|
@ -2995,4 +2891,4 @@ void MNNPackC2Origin(double* dst, const double* src, size_t area, size_t depth,
|
|||
areaOffset,
|
||||
};
|
||||
MNNPackC2(dst, src, area, depth, offset);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -198,10 +198,6 @@ struct CoreFunctions {
|
|||
MNNBinaryExecute(*MNNSelectBinaryFunctionForFloat)(int opType);
|
||||
MNNUnaryExecute(*MNNSelectUnaryFunctionForFloat)(int opType, int precisionMode);
|
||||
|
||||
// sparse matrix multiply
|
||||
void(*MNNPackForSparseMatMul_B)(float* dest, unsigned int* NNZMap, int* dataOffsetMap, int sparseBlockOC, const float* source, size_t h, size_t l, const int eP, bool transpose);
|
||||
void(*MNNGetOptimalBlockShape)(size_t& weightNNZElement, size_t& weightBlockNumber, const float* source, int sparseBlockOC, size_t h, size_t l);
|
||||
|
||||
// B matrix is sparsed
|
||||
typedef void(*MNNPackedSparseMatMul)(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, unsigned int* NNZMap, int* dataOffsetMap);
|
||||
void(*MNNAdjustOptimalSparseKernel)(int& sparseBlockOC, MNNPackedSparseMatMul& packedSparseMatMul);
|
||||
|
|
|
@ -26,29 +26,25 @@ namespace MNN {
|
|||
|
||||
static Execution* _createUnit(const Tensor* input, const Tensor* output, Backend* backend,
|
||||
const Convolution2D* conv2d, const float* originWeight, size_t originWeightSize,
|
||||
const float* bias, size_t biasSize) {
|
||||
const float* bias, size_t biasSize, std::shared_ptr<ConvolutionCommon::Int8Common> weightQuantInfo, bool supportSparse) {
|
||||
auto cpuBackend = (CPUBackend*)backend;
|
||||
bool lowMemory = cpuBackend->memoryMode() == BackendConfig::Memory_Low;
|
||||
auto common = conv2d->common();
|
||||
#ifdef MNN_USE_ONEDNN
|
||||
return OneDNN::createConvolution(common, backend, originWeight, originWeightSize, bias, biasSize);
|
||||
#endif
|
||||
|
||||
#ifdef MNN_USE_SPARSE_COMPUTE
|
||||
|
||||
auto core = static_cast<CPUBackend*>(backend)->functions();
|
||||
int bytes = core->bytes;
|
||||
#ifdef MNN_USE_SSE
|
||||
const bool onlySSENotAVX = core->pack == 4; // no backend of only sse without avx2 or avx512
|
||||
#else
|
||||
const bool onlySSENotAVX = false;
|
||||
#endif
|
||||
if (!onlySSENotAVX && bytes == 4 && conv2d->sparseParameter()) {
|
||||
if (SparseConvolutionTiledExecutor::shouldUseSparseConvolution(originWeightSize, conv2d->sparseParameter())) {
|
||||
return new SparseConvolutionTiledExecutor(common, backend, originWeight, originWeightSize,
|
||||
if (conv2d->sparseParameter() && nullptr != weightQuantInfo.get()) {
|
||||
if (supportSparse) {
|
||||
return new SparseConvolutionTiledExecutor(common, backend, weightQuantInfo->quan,
|
||||
conv2d->sparseParameter(), bias, biasSize);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
if (lowMemory || originWeightSize == 0) {
|
||||
return new DenseConvolutionTiledExecutor(common, backend, originWeight, originWeightSize, bias, biasSize, weightQuantInfo);
|
||||
}
|
||||
bool fastWay = common->kernelY() == 1 && common->kernelX() == 1
|
||||
&& output->width() == input->width() && output->height() == input->height()
|
||||
&& common->strideX() == 1 && common->strideY() == 1;
|
||||
|
@ -56,16 +52,12 @@ static Execution* _createUnit(const Tensor* input, const Tensor* output, Backend
|
|||
return new Convolution1x1Strassen(common, backend, originWeight, originWeightSize, bias, biasSize);
|
||||
}
|
||||
if (!ConvolutionWinogradBridge::canUseWinograd(common)) {
|
||||
return new DenseConvolutionTiledExecutor(common, backend, originWeight, originWeightSize, bias, biasSize);
|
||||
}
|
||||
auto cpuBackend = (CPUBackend*)backend;
|
||||
if (cpuBackend->memoryMode() == BackendConfig::Memory_Low) {
|
||||
return new DenseConvolutionTiledExecutor(common, backend, originWeight, originWeightSize, bias, biasSize);
|
||||
return new DenseConvolutionTiledExecutor(common, backend, originWeight, originWeightSize, bias, biasSize, nullptr);
|
||||
}
|
||||
PerfConfig convPerfconfig = DenseConvolutionTiledExecutor::bestTileConvolutionConfig(common, input, output, cpuBackend->threadNumber(), backend);
|
||||
auto winogradConfig = ConvolutionWinogradBridge::bestWinogradUnit(common, input, output, cpuBackend->threadNumber(), backend, convPerfconfig);
|
||||
if (winogradConfig.unit <= 1) {
|
||||
return new DenseConvolutionTiledExecutor(common, backend, originWeight, originWeightSize, bias, biasSize);
|
||||
return new DenseConvolutionTiledExecutor(common, backend, originWeight, originWeightSize, bias, biasSize, nullptr);
|
||||
}
|
||||
return ConvolutionWinogradBridge::createWinogradImpl(common, input, output, backend, originWeight, originWeightSize, bias, biasSize,
|
||||
winogradConfig);
|
||||
|
@ -78,22 +70,39 @@ Execution* ConvolutionFloatFactory::create(const std::vector<Tensor*>& inputs, c
|
|||
// Multi Input
|
||||
return new ConvolutionTiledExecutorMultiInput(conv2d->common(), backend);
|
||||
}
|
||||
bool lowMemory = static_cast<CPUBackend*>(backend)->memoryMode() == BackendConfig::Memory_Low && static_cast<CPUBackend*>(backend)->functions()->bytes == 4;
|
||||
const float* originWeight = nullptr;
|
||||
const float* originBias = nullptr;
|
||||
int originWeightSize = 0;
|
||||
int originBiasSize = 0;
|
||||
std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
|
||||
std::unique_ptr<Tensor> externalWeightTensor, externalBiasTensor;
|
||||
bool supportSparse = false;
|
||||
#ifdef MNN_USE_SPARSE_COMPUTE
|
||||
auto core = static_cast<CPUBackend*>(backend)->functions();
|
||||
int bytes = core->bytes;
|
||||
#ifdef MNN_USE_SSE
|
||||
const bool onlySSENotAVX = core->pack == 4; // no backend of only sse without avx2 or avx512
|
||||
#else
|
||||
const bool onlySSENotAVX = false;
|
||||
#endif
|
||||
supportSparse = !onlySSENotAVX && bytes == 4;
|
||||
#endif
|
||||
if (nullptr != conv2d->quanParameter()) {
|
||||
quanCommon = ConvolutionCommon::load(conv2d->quanParameter());
|
||||
bool forceFloat = false;
|
||||
if (!supportSparse && conv2d->quanParameter()->index() != nullptr) {
|
||||
// The weight is storage as float sparse, but the backend don't support sparse compute, expand it
|
||||
forceFloat = true;
|
||||
}
|
||||
quanCommon = ConvolutionCommon::load(conv2d->quanParameter(), forceFloat, lowMemory);
|
||||
if (nullptr == quanCommon) {
|
||||
MNN_ERROR("Memory not Enough, can't extract IDST Convolution: %s \n", op->name()->c_str());
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
if (quanCommon->weightFloat.get() == nullptr) {
|
||||
if (conv2d->quanParameter()->has_scaleInt()) {
|
||||
if (backend->type() != MNN_FORWARD_CPU) {
|
||||
// From BF16
|
||||
// From BF16 / FP16
|
||||
return nullptr;
|
||||
}
|
||||
return ConvolutionIntFactory::create(inputs[0], outputs[0], op, backend, quanCommon.get());
|
||||
|
@ -114,7 +123,7 @@ Execution* ConvolutionFloatFactory::create(const std::vector<Tensor*>& inputs, c
|
|||
return nullptr;
|
||||
}
|
||||
auto common = conv2d->common();
|
||||
if (nullptr == originWeight) {
|
||||
if (nullptr == originWeight && nullptr != op->main_as_Convolution2D()->weight()) {
|
||||
originWeight = op->main_as_Convolution2D()->weight()->data();
|
||||
originWeightSize = op->main_as_Convolution2D()->weight()->size();
|
||||
}
|
||||
|
@ -130,7 +139,7 @@ Execution* ConvolutionFloatFactory::create(const std::vector<Tensor*>& inputs, c
|
|||
MNN_ASSERT(group > 0);
|
||||
if (1 == group) {
|
||||
return _createUnit(inputs[0], outputs[0], backend, conv2d, originWeight, originWeightSize,
|
||||
originBias, originBiasSize);
|
||||
originBias, originBiasSize, quanCommon, supportSparse);
|
||||
}
|
||||
// TODO: Use Geometry to split
|
||||
// Split
|
||||
|
@ -144,7 +153,7 @@ Execution* ConvolutionFloatFactory::create(const std::vector<Tensor*>& inputs, c
|
|||
for (int i = 0; i < group; ++i) {
|
||||
auto newConvolution =
|
||||
_createUnit(emptyInput.get(), emptyOutput.get(), backend, conv2d, originWeight + groupWeightSize * i,
|
||||
groupWeightSize, conv2d->bias()->data() + groupOutputCount * i, groupOutputCount);
|
||||
groupWeightSize, conv2d->bias()->data() + groupOutputCount * i, groupOutputCount, quanCommon, supportSparse);
|
||||
subConvolution.push_back(std::shared_ptr<Execution>(newConvolution));
|
||||
}
|
||||
return new ConvolutionGroup(backend, subConvolution);
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
// Created by MNN on 2018/07/16.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#include <math.h>
|
||||
#include "DenseConvolutionTiledExecutor.hpp"
|
||||
#include <MNN/AutoTime.hpp>
|
||||
#include "backend/cpu/CPUBackend.hpp"
|
||||
|
@ -19,6 +19,7 @@
|
|||
#include "common/MemoryFormater.h"
|
||||
#define PARAMETERSIZE 6
|
||||
|
||||
#define MNN_ALLOC_MEMORY_INDIRECTLY
|
||||
using Vec4 = MNN::Math::Vec<float, 4>;
|
||||
namespace MNN {
|
||||
|
||||
|
@ -27,10 +28,86 @@ void DenseConvolutionTiledExecutor::initWeight(float *dest, const float *source,
|
|||
function->MNNPackForMatMul_B(dest, cache, outputCount, kernelSize * depth, true);
|
||||
|
||||
}
|
||||
static bool _initQuantizeResource(std::shared_ptr<ConvolutionCommon::Int8Common> int8Info, std::shared_ptr<CPUConvolution::Resource> resource, int hU, int hP, int lU, int lP, int outputCount, int srcChannel, int kernelSize) {
|
||||
int weightLength = hU * lU * hP * lP;
|
||||
resource->mWeight.reset(Tensor::createDevice<uint8_t>(
|
||||
{weightLength}));
|
||||
auto res = resource->backend->onAcquireBuffer(resource->mWeight.get(), Backend::STATIC);
|
||||
if (!res) {
|
||||
return false;
|
||||
}
|
||||
resource->mDequantize.bits = 8;
|
||||
resource->lU = lU;
|
||||
resource->hU = hU;
|
||||
resource->lP = lP;
|
||||
resource->hP = hP;
|
||||
// Reorder weight
|
||||
MNN_ASSERT(lP == 1);
|
||||
auto dstWInt8 = resource->mWeight->host<int8_t>();
|
||||
auto srcWInt8 = int8Info->weight.get();
|
||||
for (int y=0; y<outputCount; ++y) {
|
||||
int yo = y / hP;
|
||||
int yi = y % hP;
|
||||
auto srcY = srcWInt8 + y * srcChannel * kernelSize;
|
||||
auto dstY = dstWInt8 + yo * lP * hP * lU + yi;
|
||||
for (int iz=0; iz<srcChannel; ++iz) {
|
||||
for (int k=0; k<kernelSize; ++k) {
|
||||
int sx = iz * kernelSize + k;
|
||||
int dx = iz + k * srcChannel;
|
||||
dstY[dx * hP] = srcY[sx];
|
||||
}
|
||||
}
|
||||
}
|
||||
// Save scale bias
|
||||
resource->mDequantize.mScaleBias.reset(MNN::Tensor::createDevice<float>({hU * hP * 2}));
|
||||
res = resource->backend->onAcquireBuffer(resource->mDequantize.mScaleBias.get(), Backend::STATIC);
|
||||
if (!res) {
|
||||
return false;
|
||||
}
|
||||
auto alphaPtr = resource->mDequantize.mScaleBias->host<float>();
|
||||
auto biasPtr = resource->mDequantize.mScaleBias->host<float>() + hU * hP;
|
||||
::memset(alphaPtr, 0, 2 * hU * hP * sizeof(float));
|
||||
int h = int8Info->alpha.size();
|
||||
if (int8Info->asymmetric) {
|
||||
h = h / 2;
|
||||
for (int i=0; i<h; ++i) {
|
||||
alphaPtr[i] = int8Info->alpha.get()[2 * i + 1];
|
||||
biasPtr[i] = int8Info->alpha.get()[2 * i];
|
||||
}
|
||||
} else {
|
||||
for (int i=0; i<h; ++i) {
|
||||
alphaPtr[i] = int8Info->alpha.get()[i];
|
||||
}
|
||||
}
|
||||
if (int8Info->canUseInt4) {
|
||||
MNN_ASSERT(weightLength % 2 == 0);
|
||||
weightLength = UP_DIV(weightLength, 2);
|
||||
resource->mDequantize.bits = 4;
|
||||
resource->mDequantize.mLowBitWeightMap = int8Info->weightMap;
|
||||
std::shared_ptr<MNN::Tensor> weightLow(Tensor::createDevice<uint8_t>(
|
||||
{weightLength}));
|
||||
auto res = resource->backend->onAcquireBuffer(weightLow.get(), Backend::STATIC);
|
||||
if (!res) {
|
||||
return false;
|
||||
}
|
||||
auto srcPtr = resource->mWeight->host<int8_t>();
|
||||
auto dstPtr = weightLow->host<uint8_t>();
|
||||
for (int i=0; i<weightLength; ++i) {
|
||||
int s0 = srcPtr[2 * i + 0];
|
||||
int s1 = srcPtr[2 * i + 1];
|
||||
s0 = int8Info->weightReverseMap[(int)s0 + 128];
|
||||
s1 = int8Info->weightReverseMap[(int)s1 + 128];
|
||||
int d = s0 * 16 + s1;
|
||||
dstPtr[i] = d;
|
||||
}
|
||||
resource->mWeight = weightLow;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
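The nibble packing used in `_initQuantizeResource` above stores two 4-bit weight indices per byte (`d = s0 * 16 + s1`), and the dequantize path later splits them apart again with `d / 16` and `d % 16`. A standalone round-trip sketch of just that packing step (the weight-map lookups of the real code are omitted; indices are assumed to already be in the 0..15 range):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Pack pairs of 4-bit indices (0..15) into bytes, high nibble first.
std::vector<uint8_t> packInt4(const std::vector<uint8_t>& idx) {
    assert(idx.size() % 2 == 0);
    std::vector<uint8_t> packed(idx.size() / 2);
    for (size_t i = 0; i < packed.size(); ++i) {
        packed[i] = static_cast<uint8_t>(idx[2 * i] * 16 + idx[2 * i + 1]);
    }
    return packed;
}

// Unpack back to the original index sequence.
std::vector<uint8_t> unpackInt4(const std::vector<uint8_t>& packed) {
    std::vector<uint8_t> idx(packed.size() * 2);
    for (size_t i = 0; i < packed.size(); ++i) {
        idx[2 * i]     = packed[i] / 16;   // high nibble
        idx[2 * i + 1] = packed[i] % 16;   // low nibble
    }
    return idx;
}
```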
||||
DenseConvolutionTiledExecutor::DenseConvolutionTiledExecutor(const Convolution2DCommon* common, Backend* b,
|
||||
const float* originWeight, size_t originWeightSize,
|
||||
const float* bias, size_t biasSize)
|
||||
const float* bias, size_t biasSize, std::shared_ptr<ConvolutionCommon::Int8Common> int8Info)
|
||||
: ConvolutionTiledExecutor(b, bias, biasSize) {
|
||||
|
||||
auto outputCount = (int)biasSize;
|
||||
|
@ -38,22 +115,40 @@ DenseConvolutionTiledExecutor::DenseConvolutionTiledExecutor(const Convolution2D
|
|||
auto core = static_cast<CPUBackend*>(b)->functions();
|
||||
int bytes = core->bytes;
|
||||
core->MNNGetMatMulPackMode(&eP, &lP, &hP);
|
||||
bool useInt8Weight = 0 == originWeightSize;
|
||||
if (useInt8Weight) {
|
||||
MNN_ASSERT(nullptr != int8Info.get());
|
||||
originWeightSize = int8Info->weight.size();
|
||||
}
|
||||
// Don't use common->inputCount for old model common->inputCount is zero
|
||||
auto srcCount = (int)originWeightSize / outputCount / common->kernelX() / common->kernelY();
|
||||
auto lSize = srcCount * common->kernelX() * common->kernelY();
|
||||
mResource->mWeight.reset(Tensor::createDevice<uint8_t>(
|
||||
{UP_DIV(outputCount, hP) * UP_DIV(lSize, lP) * hP * lP * bytes}));
|
||||
std::shared_ptr<Tensor> cache(Tensor::createDevice<uint8_t>({outputCount * srcCount * common->kernelX() * common->kernelY() * (int)sizeof(float)})); // cache must be float
|
||||
|
||||
mValid = mValid && backend()->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC);
|
||||
mValid = mValid && backend()->onAcquireBuffer(cache.get(), Backend::STATIC);
|
||||
if (!mValid) {
|
||||
return;
|
||||
auto hU = UP_DIV(outputCount, hP);
|
||||
auto lU = UP_DIV(lSize, lP);
|
||||
if (useInt8Weight) {
|
||||
// Quantize weight to int8
|
||||
auto allocSuccess = _initQuantizeResource(int8Info, mResource, hU, hP, lU, lP, outputCount, srcCount, common->kernelX() * common->kernelY());
|
||||
if (!allocSuccess) {
|
||||
mValid = false;
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
mResource->mWeight.reset(Tensor::createDevice<uint8_t>(
|
||||
{hU * lU * hP * lP * bytes}));
|
||||
mValid = mValid && backend()->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC);
|
||||
if (!mValid) {
|
||||
return;
|
||||
}
|
||||
std::shared_ptr<Tensor> cache(Tensor::createDevice<uint8_t>({outputCount * srcCount * common->kernelX() * common->kernelY() * (int)sizeof(float)})); // cache must be float
|
||||
mValid = mValid && backend()->onAcquireBuffer(cache.get(), Backend::STATIC);
|
||||
if (!mValid) {
|
||||
return;
|
||||
}
|
||||
initWeight(mResource->mWeight->host<float>(), originWeight, cache->host<float>(), srcCount, outputCount, common->kernelX() * common->kernelY(), core);
|
||||
// MNN_PRINT("srcCount:%d, outputCount:%d, dense weight matrix tile:", srcCount, outputCount);
|
||||
// formatMatrix(mResource->mWeight->host<float>(), {UP_DIV(outputCount, hP), lSize, hP});
|
||||
backend()->onReleaseBuffer(cache.get(), Backend::STATIC);
|
||||
}
|
||||
initWeight(mResource->mWeight->host<float>(), originWeight, cache->host<float>(), srcCount, outputCount, common->kernelX() * common->kernelY(), core);
|
||||
// MNN_PRINT("srcCount:%d, outputCount:%d, dense weight matrix tile:", srcCount, outputCount);
|
||||
// formatMatrix(mResource->mWeight->host<float>(), {UP_DIV(outputCount, hP), lSize, hP});
|
||||
backend()->onReleaseBuffer(cache.get(), Backend::STATIC);
|
||||
mProxy.reset(new DenseConvolutionTiledImpl(common, b));
|
||||
}
|
||||
|
||||
|
@ -77,6 +172,121 @@ bool DenseConvolutionTiledExecutor::onClone(Backend* bn, const Op* op, Execution
|
|||
return true;
|
||||
}
|
||||
|
||||
ErrorCode DenseConvolutionTiledExecutor::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
bool needDequantize = mResource->mDequantize.bits <= 8;
|
||||
if (needDequantize) {
|
||||
#ifndef MNN_ALLOC_MEMORY_INDIRECTLY
|
||||
auto res = backend()->onAcquireBuffer(mWeightCache.weight.get(), Backend::STATIC);
|
||||
if (!res) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
if (nullptr != mWeightCache.weightInt8) {
|
||||
res = backend()->onAcquireBuffer(mWeightCache.weightInt8.get(), Backend::STATIC);
|
||||
if (!res) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
auto hU = mResource->hU;
|
||||
auto hP = mResource->hP;
|
||||
auto mid = mResource->lU * mResource->lP;
|
||||
auto srcInt8 = mResource->mWeight->host<int8_t>();
|
||||
if (mResource->mDequantize.bits == 4) {
|
||||
int weightLength = hU * hP * mid;
|
||||
weightLength = UP_DIV(weightLength, 2);
|
||||
auto srcPtr = mResource->mWeight->host<uint8_t>();
|
||||
auto dstPtr = mWeightCache.weightInt8->host<int8_t>();
|
||||
for (int i=0; i<weightLength; ++i) {
|
||||
int d = srcPtr[i];
|
||||
int s0 = d / 16;
|
||||
int s1 = d % 16;
|
||||
s0 = mResource->mDequantize.mLowBitWeightMap[s0];
|
||||
s1 = mResource->mDequantize.mLowBitWeightMap[s1];
|
||||
dstPtr[2 * i + 0] = s0;
|
||||
dstPtr[2 * i + 1] = s1;
|
||||
}
|
||||
srcInt8 = mWeightCache.weightInt8->host<int8_t>();
|
||||
}
|
||||
auto alpha = mResource->mDequantize.mScaleBias->host<float>();
|
||||
auto bias = mResource->mDequantize.mScaleBias->host<float>() + hU * hP;
|
||||
auto dstFloat = mWeightCache.weight->host<float>();
|
||||
for (int yo=0; yo<hU; ++yo) {
|
||||
auto dstY = dstFloat + yo * mid * hP;
|
||||
auto srcY = srcInt8 + yo * mid * hP;
|
||||
auto k = alpha + yo * hP;
|
||||
auto b = bias + yo * hP;
|
||||
for (int x=0; x<mid; ++x) {
|
||||
auto dstX = dstY + x * hP;
|
||||
auto srcX = srcY + x * hP;
|
||||
for (int yi=0; yi<hP; ++yi) {
|
||||
dstX[yi] = srcX[yi] * k[yi] + b[yi];
|
||||
}
|
||||
}
|
||||
}
|
||||
#ifndef MNN_ALLOC_MEMORY_INDIRECTLY
|
||||
if (mWeightCache.weightInt8 != nullptr) {
|
||||
backend()->onReleaseBuffer(mWeightCache.weightInt8.get(), Backend::STATIC);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
auto code = mProxy->onExecute(mInputs, outputs);
|
||||
#ifndef MNN_ALLOC_MEMORY_INDIRECTLY
|
||||
if (needDequantize) {
|
||||
backend()->onReleaseBuffer(mWeightCache.weight.get(), Backend::STATIC);
|
||||
}
|
||||
((Runtime*)(static_cast<CPUBackend*>(backend())->getRuntime()))->onGabageCollect(0);
|
||||
#endif
|
||||
return code;
|
||||
}
|
||||
ErrorCode DenseConvolutionTiledExecutor::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
mInputs = {inputs[0], mResource->mWeight.get(), mResource->mBias.get()};
|
||||
bool needDequantize = mResource->mDequantize.bits <= 8;
|
||||
if (needDequantize) {
|
||||
if (mWeightCache.weight == nullptr) {
|
||||
int weightLength = mResource->hU * mResource->lU * mResource->hP * mResource->lP;
|
||||
mWeightCache.weight.reset(new Tensor);
|
||||
mWeightCache.weight->buffer().type = halide_type_of<float>();
|
||||
TensorUtils::getDescribe(mWeightCache.weight.get())->dimensionFormat = MNN_DATA_FORMAT_NCHW;
|
||||
mWeightCache.weight->buffer().dimensions = 1;
|
||||
mWeightCache.weight->setLength(0, weightLength);
|
||||
if (mWeightCache.weightInt8 == nullptr && mResource->mDequantize.bits == 4) {
|
||||
mWeightCache.weightInt8.reset(new Tensor);
|
||||
mWeightCache.weightInt8->buffer().type = halide_type_of<int8_t>();
|
||||
mWeightCache.weightInt8->buffer().dimensions = 1;
|
||||
mWeightCache.weightInt8->setLength(0, weightLength);
|
||||
TensorUtils::getDescribe(mWeightCache.weightInt8.get())->dimensionFormat = MNN_DATA_FORMAT_NCHW;
|
||||
}
|
||||
}
|
||||
mInputs[1] = mWeightCache.weight.get();
|
||||
#ifdef MNN_ALLOC_MEMORY_INDIRECTLY
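// Note (added for clarity, not part of the diff): under indirect allocation the
// dequantize buffers come from the DYNAMIC pool during onResize: the int8 staging
// tensor is acquired first, then the float weight tensor, and the staging tensor is
// released immediately so its memory can be reused, while the float weight stays
// alive until after the proxy's onResize.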
|
||||
bool res = false;
|
||||
if (nullptr != mWeightCache.weightInt8) {
|
||||
res = backend()->onAcquireBuffer(mWeightCache.weightInt8.get(), Backend::DYNAMIC);
|
||||
if (!res) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
}
|
||||
res = backend()->onAcquireBuffer(mWeightCache.weight.get(), Backend::DYNAMIC);
|
||||
if (!res) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
if (nullptr != mWeightCache.weightInt8) {
|
||||
backend()->onReleaseBuffer(mWeightCache.weightInt8.get(), Backend::DYNAMIC);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
auto code = mProxy->onResize(mInputs, outputs);
|
||||
if (NO_ERROR != code) {
|
||||
return code;
|
||||
}
|
||||
if (needDequantize) {
|
||||
#ifdef MNN_ALLOC_MEMORY_INDIRECTLY
|
||||
backend()->onReleaseBuffer(mWeightCache.weight.get(), Backend::DYNAMIC);
|
||||
#endif
|
||||
}
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
ErrorCode ConvolutionTiledExecutorMultiInput::onExecute(const std::vector<Tensor*>& inputs,
|
||||
const std::vector<Tensor*>& outputs) {
|
||||
int depth = inputs[1]->channel();
|
||||
@ -34,25 +34,25 @@ protected:
|
|||
class DenseConvolutionTiledExecutor : public ConvolutionTiledExecutor {
|
||||
public:
|
||||
DenseConvolutionTiledExecutor(const Convolution2DCommon *common, Backend *b, const float *originWeight,
|
||||
size_t originWeightSize, const float *bias, size_t biasSize);
|
||||
size_t originWeightSize, const float *bias, size_t biasSize, std::shared_ptr<ConvolutionCommon::Int8Common>);
|
||||
|
||||
DenseConvolutionTiledExecutor(std::shared_ptr<CPUConvolution::Resource> res, const Convolution2DCommon *common, Backend* b);
|
||||
virtual ~DenseConvolutionTiledExecutor();
|
||||
|
||||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override {
|
||||
return mProxy->onExecute(inputs, outputs);
|
||||
}
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override {
|
||||
mInputs = {inputs[0], mResource->mWeight.get(), mResource->mBias.get()};
|
||||
return mProxy->onResize(mInputs, outputs);
|
||||
}
|
||||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override;
|
||||
void initWeight(float *dest, const float *source, float* cache, int depth, int outputCount, int kernelSize, const CoreFunctions* function);
|
||||
static PerfConfig bestTileConvolutionConfig(const Convolution2DCommon *common, const Tensor *inputTensor,
|
||||
const Tensor *outputTensor, int threadNumber, Backend* b) {
|
||||
return DenseConvolutionTiledImpl::bestTileConvolutionConfig(common, inputTensor, outputTensor, threadNumber, b);
|
||||
}
|
||||
struct DequantizeCache {
|
||||
std::shared_ptr<MNN::Tensor> weight;
|
||||
std::shared_ptr<MNN::Tensor> weightInt8;
|
||||
};
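// Note (added for clarity, not part of the diff): `weight` holds the weights
// dequantized back to float for the tiled convolution, while `weightInt8` is only a
// staging buffer used when the stored weights are 4-bit and must first be expanded
// to int8.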
|
||||
protected:
|
||||
DequantizeCache mWeightCache;
|
||||
std::shared_ptr<DenseConvolutionTiledImpl> mProxy;
|
||||
};
|
||||
|
||||
|
|
|
@ -1577,130 +1577,255 @@ void MNNMaxPoolInt8(int8_t* dst, int8_t* src, size_t outputWidth, size_t inputWi
|
|||
void MNNBinaryAddInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast) {
|
||||
float sum = 0;
|
||||
#ifdef MNN_USE_SSE
|
||||
const uint8_t zeroPoint = 128;
|
||||
const int zeroPoint = 128;
|
||||
const int maxValue = 255;
|
||||
const int minValue = 0;
|
||||
const uint8_t* inputData0 = (uint8_t*)inputRaw0;
|
||||
const uint8_t* inputData1 = (uint8_t*)inputRaw1;
|
||||
uint8_t* outputData = (uint8_t*)outputRaw;
|
||||
#else
|
||||
const uint8_t zeroPoint = 0;
|
||||
const int zeroPoint = 0;
|
||||
const int maxValue = 127;
|
||||
const int minValue = -128;
|
||||
const int8_t* inputData0 = inputRaw0;
|
||||
const int8_t* inputData1 = inputRaw1;
|
||||
int8_t* outputData = outputRaw;
|
||||
#endif
|
||||
for (int i = 0; i < elementSize; ++i) {
|
||||
if (needBroadcast == 0) {
|
||||
sum = static_cast<float>((int8_t)(inputRaw0[0] - zeroPoint)) * inputScale0[i] + static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i];
|
||||
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i];
|
||||
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
|
||||
sum = inp0 + inp1;
|
||||
} else if (needBroadcast == 1) {
|
||||
sum = static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i] + static_cast<float>((int8_t)(inputRaw1[0] - zeroPoint)) * inputScale1[i];
|
||||
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
|
||||
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i];
|
||||
sum = inp0 + inp1;
|
||||
} else {
|
||||
sum = static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i] + static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i];
|
||||
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
|
||||
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
|
||||
sum = inp0 + inp1;
|
||||
}
|
||||
float value = sum * outputScale[i];
|
||||
outputRaw[i] = static_cast<uint8_t>(std::max(std::min(value, 127.0f), -127.0f)) + zeroPoint;
|
||||
int value = (int)roundf(sum * outputScale[i]) + zeroPoint;
|
||||
if (value > maxValue) {
|
||||
value = maxValue;
|
||||
}
|
||||
if (value < minValue) {
|
||||
value = minValue;
|
||||
}
|
||||
outputData[i] = value;
|
||||
}
|
||||
}
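// Note (added for clarity, not part of the diff): MNNBinaryAddInt8 above and the
// Sub/Mul/Min/Max/Sqd kernels below share the same requantization pattern: dequantize
// both operands with their per-element scales, apply the op in float, then round, add
// the zero point and clamp. On x86 (MNN_USE_SSE) MNN stores int8 activations as uint8
// with a +128 offset, which is why that path uses zeroPoint = 128 and clamps to
// [0, 255] instead of [-128, 127].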
|
||||
|
||||
void MNNBinarySubInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast) {
|
||||
float res = 0;
|
||||
#ifdef MNN_USE_SSE
|
||||
const uint8_t zeroPoint = 128;
|
||||
const int zeroPoint = 128;
|
||||
const int maxValue = 255;
|
||||
const int minValue = 0;
|
||||
const uint8_t* inputData0 = (uint8_t*)inputRaw0;
|
||||
const uint8_t* inputData1 = (uint8_t*)inputRaw1;
|
||||
uint8_t* outputData = (uint8_t*)outputRaw;
|
||||
#else
|
||||
const uint8_t zeroPoint = 0;
|
||||
const int zeroPoint = 0;
|
||||
const int maxValue = 127;
|
||||
const int minValue = -128;
|
||||
const int8_t* inputData0 = inputRaw0;
|
||||
const int8_t* inputData1 = inputRaw1;
|
||||
int8_t* outputData = outputRaw;
|
||||
#endif
|
||||
for (int i = 0; i < elementSize; ++i) {
|
||||
if (needBroadcast == 0) {
|
||||
res = static_cast<float>((int8_t)(inputRaw0[0] - zeroPoint)) * inputScale0[i] - static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i];
|
||||
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i];
|
||||
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
|
||||
res = inp0 - inp1;
|
||||
} else if (needBroadcast == 1) {
|
||||
res = static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i] - static_cast<float>((int8_t)(inputRaw1[0] - zeroPoint)) * inputScale1[i];
|
||||
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
|
||||
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i];
|
||||
res = inp0 - inp1;
|
||||
} else {
|
||||
res = static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i] - static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i];
|
||||
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
|
||||
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
|
||||
res = inp0 - inp1;
|
||||
}
|
||||
float value = res * outputScale[i];
|
||||
outputRaw[i] = static_cast<uint8_t>(std::max(std::min(value, 127.0f), -127.0f)) + zeroPoint;
|
||||
int value = (int)roundf(res * outputScale[i]) + zeroPoint;
|
||||
if (value > maxValue) {
|
||||
value = maxValue;
|
||||
}
|
||||
if (value < minValue) {
|
||||
value = minValue;
|
||||
}
|
||||
outputData[i] = value;
|
||||
}
|
||||
}
|
||||
|
||||
void MNNBinaryMulInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast) {
|
||||
float res = 0;
|
||||
#ifdef MNN_USE_SSE
|
||||
const uint8_t zeroPoint = 128;
|
||||
const int zeroPoint = 128;
|
||||
const int maxValue = 255;
|
||||
const int minValue = 0;
|
||||
const uint8_t* inputData0 = (uint8_t*)inputRaw0;
|
||||
const uint8_t* inputData1 = (uint8_t*)inputRaw1;
|
||||
uint8_t* outputData = (uint8_t*)outputRaw;
|
||||
#else
|
||||
const uint8_t zeroPoint = 0;
|
||||
const int zeroPoint = 0;
|
||||
const int maxValue = 127;
|
||||
const int minValue = -128;
|
||||
const int8_t* inputData0 = inputRaw0;
|
||||
const int8_t* inputData1 = inputRaw1;
|
||||
int8_t* outputData = outputRaw;
|
||||
#endif
|
||||
for (int i = 0; i < elementSize; ++i) {
|
||||
if (needBroadcast == 0) {
|
||||
res = static_cast<float>((int8_t)(inputRaw0[0] - zeroPoint)) * inputScale0[i] * static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i];
|
||||
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i];
|
||||
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
|
||||
res = inp0 * inp1;
|
||||
} else if (needBroadcast == 1) {
|
||||
res = static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i] * static_cast<float>((int8_t)(inputRaw1[0] - zeroPoint)) * inputScale1[i];
|
||||
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
|
||||
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i];
|
||||
res = inp0 * inp1;
|
||||
} else {
|
||||
res = static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i] * static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i];
|
||||
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
|
||||
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
|
||||
res = inp0 * inp1;
|
||||
}
|
||||
float value = res * outputScale[i];
|
||||
outputRaw[i] = static_cast<uint8_t>(std::max(std::min(value, 127.0f), -127.0f)) + zeroPoint;
|
||||
int value = (int)roundf(res * outputScale[i]) + zeroPoint;
|
||||
if (value > maxValue) {
|
||||
value = maxValue;
|
||||
}
|
||||
if (value < minValue) {
|
||||
value = minValue;
|
||||
}
|
||||
outputData[i] = value;
|
||||
}
|
||||
}
|
||||
|
||||
void MNNBinaryMinInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast) {
|
||||
float res = 0;
|
||||
#ifdef MNN_USE_SSE
|
||||
const uint8_t zeroPoint = 128;
|
||||
const int zeroPoint = 128;
|
||||
const int maxValue = 255;
|
||||
const int minValue = 0;
|
||||
const uint8_t* inputData0 = (uint8_t*)inputRaw0;
|
||||
const uint8_t* inputData1 = (uint8_t*)inputRaw1;
|
||||
uint8_t* outputData = (uint8_t*)outputRaw;
|
||||
#else
|
||||
const uint8_t zeroPoint = 0;
|
||||
const int zeroPoint = 0;
|
||||
const int maxValue = 127;
|
||||
const int minValue = -128;
|
||||
const int8_t* inputData0 = inputRaw0;
|
||||
const int8_t* inputData1 = inputRaw1;
|
||||
int8_t* outputData = outputRaw;
|
||||
#endif
|
||||
for (int i = 0; i < elementSize; ++i) {
|
||||
if (needBroadcast == 0) {
|
||||
res = std::min(static_cast<float>((int8_t)(inputRaw0[0] - zeroPoint)) * inputScale0[i], static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i]);
|
||||
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i];
|
||||
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
|
||||
res = std::min(inp0, inp1);
|
||||
} else if (needBroadcast == 1) {
|
||||
res = std::min(static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i], static_cast<float>((int8_t)(inputRaw1[0] - zeroPoint)) * inputScale1[i]);
|
||||
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
|
||||
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i];
|
||||
res = std::min(inp0, inp1);
|
||||
} else {
|
||||
res = std::min(static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i], static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i]);
|
||||
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
|
||||
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
|
||||
res = std::min(inp0, inp1);
|
||||
}
|
||||
float value = res * outputScale[i];
|
||||
outputRaw[i] = static_cast<uint8_t>(std::max(std::min(value, 127.0f), -127.0f)) + zeroPoint;
|
||||
int value = (int)roundf(res * outputScale[i]) + zeroPoint;
|
||||
if (value > maxValue) {
|
||||
value = maxValue;
|
||||
}
|
||||
if (value < minValue) {
|
||||
value = minValue;
|
||||
}
|
||||
outputData[i] = value;
|
||||
}
|
||||
}
|
||||
|
||||
void MNNBinaryMaxInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast) {
|
||||
float res = 0;
|
||||
#ifdef MNN_USE_SSE
|
||||
const uint8_t zeroPoint = 128;
|
||||
const int zeroPoint = 128;
|
||||
const int maxValue = 255;
|
||||
const int minValue = 0;
|
||||
const uint8_t* inputData0 = (uint8_t*)inputRaw0;
|
||||
const uint8_t* inputData1 = (uint8_t*)inputRaw1;
|
||||
uint8_t* outputData = (uint8_t*)outputRaw;
|
||||
#else
|
||||
const uint8_t zeroPoint = 0;
|
||||
const int zeroPoint = 0;
|
||||
const int maxValue = 127;
|
||||
const int minValue = -128;
|
||||
const int8_t* inputData0 = inputRaw0;
|
||||
const int8_t* inputData1 = inputRaw1;
|
||||
int8_t* outputData = outputRaw;
|
||||
#endif
|
||||
for (int i = 0; i < elementSize; ++i) {
|
||||
if (needBroadcast == 0) {
|
||||
res = std::max(static_cast<float>((int8_t)(inputRaw0[0] - zeroPoint)) * inputScale0[i], static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i]);
|
||||
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i];
|
||||
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
|
||||
res = std::max(inp0, inp1);
|
||||
} else if (needBroadcast == 1) {
|
||||
res = std::max(static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i], static_cast<float>((int8_t)(inputRaw1[0] - zeroPoint)) * inputScale1[i]);
|
||||
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
|
||||
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i];
|
||||
res = std::max(inp0, inp1);
|
||||
} else {
|
||||
res = std::max(static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i], static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i]);
|
||||
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
|
||||
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
|
||||
res = std::max(inp0, inp1);
|
||||
}
|
||||
float value = res * outputScale[i];
|
||||
outputRaw[i] = static_cast<uint8_t>(std::max(std::min(value, 127.0f), -127.0f)) + zeroPoint;
|
||||
int value = (int)roundf(res * outputScale[i]) + zeroPoint;
|
||||
if (value > maxValue) {
|
||||
value = maxValue;
|
||||
}
|
||||
if (value < minValue) {
|
||||
value = minValue;
|
||||
}
|
||||
outputData[i] = value;
|
||||
}
|
||||
}
|
||||
void MNNBinarySqdInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, const float* inputScale0, const float* inputScale1, const float* outputScale, int elementSize, int needBroadcast) {
|
||||
float res = 0, inp0 = 0, inp1 = 0;
|
||||
float res = 0;
|
||||
#ifdef MNN_USE_SSE
|
||||
const uint8_t zeroPoint = 128;
|
||||
const int zeroPoint = 128;
|
||||
const int maxValue = 255;
|
||||
const int minValue = 0;
|
||||
const uint8_t* inputData0 = (uint8_t*)inputRaw0;
|
||||
const uint8_t* inputData1 = (uint8_t*)inputRaw1;
|
||||
uint8_t* outputData = (uint8_t*)outputRaw;
|
||||
#else
|
||||
const uint8_t zeroPoint = 0;
|
||||
const int zeroPoint = 0;
|
||||
const int maxValue = 127;
|
||||
const int minValue = -128;
|
||||
const int8_t* inputData0 = inputRaw0;
|
||||
const int8_t* inputData1 = inputRaw1;
|
||||
int8_t* outputData = outputRaw;
|
||||
#endif
|
||||
for (int i = 0; i < elementSize; ++i) {
|
||||
if (needBroadcast == 0) {
|
||||
inp0 = static_cast<float>((int8_t)(inputRaw0[0] - zeroPoint)) * inputScale0[i];
|
||||
inp1 = static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i];
|
||||
float inp0 = (inputData0[0] - zeroPoint) * inputScale0[i];
|
||||
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
|
||||
res = (inp0 - inp1) * (inp0 - inp1);
|
||||
} else if (needBroadcast == 1) {
|
||||
inp0 = static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i];
|
||||
inp1 = static_cast<float>((int8_t)(inputRaw1[0] - zeroPoint)) * inputScale1[i];
|
||||
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
|
||||
float inp1 = (inputData1[0] - zeroPoint) * inputScale1[i];
|
||||
res = (inp0 - inp1) * (inp0 - inp1);
|
||||
} else {
|
||||
inp0 = static_cast<float>((int8_t)(inputRaw0[i] - zeroPoint)) * inputScale0[i];
|
||||
inp1 = static_cast<float>((int8_t)(inputRaw1[i] - zeroPoint)) * inputScale1[i];
|
||||
float inp0 = (inputData0[i] - zeroPoint) * inputScale0[i];
|
||||
float inp1 = (inputData1[i] - zeroPoint) * inputScale1[i];
|
||||
res = (inp0 - inp1) * (inp0 - inp1);
|
||||
}
|
||||
float value = res * outputScale[i];
|
||||
outputRaw[i] = static_cast<uint8_t>(std::max(std::min(value, 127.0f), -127.0f)) + zeroPoint;
|
||||
int value = (int)roundf(res * outputScale[i]) + zeroPoint;
|
||||
if (value > maxValue) {
|
||||
value = maxValue;
|
||||
}
|
||||
if (value < minValue) {
|
||||
value = minValue;
|
||||
}
|
||||
outputData[i] = value;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
#endif // #ifndef MNN_USE_NEON
|
||||
#ifndef MNN_USE_SSE
|
||||
|
||||
|
|
|
@ -17,16 +17,155 @@
|
|||
#include "math/Vec.hpp"
|
||||
#include "core/BufferAllocator.hpp"
|
||||
#include "common/MemoryFormater.h"
|
||||
#include "common/CommonCompute.hpp"
|
||||
|
||||
using Vec4 = MNN::Math::Vec<float, 4>;
|
||||
namespace MNN {
|
||||
|
||||
/*
|
||||
source: source matrix is h x l
|
||||
transpose: if false, export compressed matrix as h x l, other export as l x h.
|
||||
*/
|
||||
|
||||
static int _fillIndex(int32_t* targetIndexes, uint32_t begin, uint32_t end, const uint32_t* indexes, uint32_t indexSize, int indexStart) {
|
||||
int mid = -1;
|
||||
int current = -1;
|
||||
for (int i=indexStart; i<indexSize; ++i) {
|
||||
if (indexes[i] >= begin) {
|
||||
mid = i;
|
||||
current = indexes[i];
|
||||
break;
|
||||
}
|
||||
}
|
||||
uint32_t number = end - begin;
|
||||
for (uint32_t i=0; i<number; ++i) {
|
||||
targetIndexes[i] = -1;
|
||||
}
|
||||
auto offset = current - begin;
|
||||
do {
|
||||
if (current < begin || current >= end) {
|
||||
break;
|
||||
}
|
||||
targetIndexes[current - begin] = mid;
|
||||
mid++;
|
||||
if (mid >= indexSize) {
|
||||
break;
|
||||
}
|
||||
current = indexes[mid];
|
||||
} while (true);
|
||||
return mid;
|
||||
}
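// Worked example (added for clarity, not part of the diff): with indexes = {2, 5, 7}
// (sorted positions of the nonzero weights) and the window [begin, end) = [4, 8),
// _fillIndex writes targetIndexes = {-1, 1, -1, 2}: slot 5 - 4 points at indexes[1],
// slot 7 - 4 at indexes[2], every other slot stays -1, and the returned value 3 is the
// start hint for the next window.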
|
||||
|
||||
static void MNNGetOptimalBlockShape(size_t& weightNNZElement, size_t& weightBlockNumber, const uint32_t* indexes, uint32_t indexSize, int sparseBlockOC, size_t h, size_t l) {
|
||||
size_t nnzBlock = 0;
|
||||
size_t nnzTail = 0;
|
||||
int ocEven = (h / sparseBlockOC) * sparseBlockOC;
|
||||
std::vector<int32_t> tempIndexes(sparseBlockOC * l);
|
||||
size_t ioc = 0;
|
||||
int offset = 0;
|
||||
for (; ioc < ocEven; ioc += sparseBlockOC) {
|
||||
offset = _fillIndex(tempIndexes.data(), ioc * l, (ioc+sparseBlockOC) * l, indexes, indexSize, offset);
|
||||
for (size_t i = 0; i < l; i++) {
|
||||
bool allZero = true;
|
||||
for (int u=0; u<sparseBlockOC; ++u) {
|
||||
if (tempIndexes[u*l + i] >= 0) {
|
||||
allZero = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!allZero) {
|
||||
nnzBlock++;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (; ioc < h; ioc++) {
|
||||
offset = _fillIndex(tempIndexes.data(), ioc * l, (ioc+1) * l, indexes, indexSize, offset);
|
||||
for (size_t i = 0; i < l; i++) {
|
||||
if (tempIndexes[i] >= 0) {
|
||||
nnzTail++;
|
||||
}
|
||||
}
|
||||
}
|
||||
weightNNZElement = nnzBlock * sparseBlockOC + nnzTail;
|
||||
weightBlockNumber = nnzBlock + nnzTail;
|
||||
return;
|
||||
}
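// Worked example (added for clarity, not part of the diff): with h = 5 output channels
// and sparseBlockOC = 4, rows 0-3 form one block group and row 4 is handled as tail.
// If 3 of the l columns contain at least one nonzero inside the block group and the
// tail row has 2 nonzeros, then nnzBlock = 3 and nnzTail = 2, so
// weightNNZElement = 3 * 4 + 2 = 14 stored values and weightBlockNumber = 3 + 2 = 5
// stored (block-)columns.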
|
||||
static void MNNPackForSparseMatMul_B(float* dest, unsigned int* NNZMap, int* dataOffsetMap, int sparseBlockOC, const float* source, const uint32_t* indexes, uint32_t indexSize, size_t h, size_t ic, size_t kernelSize, const int eP) {
|
||||
// 1. In convolution, the source B matrix layout is OC x (KH * KW * IC).
//    The dest weight layout would be BCSC (block compressed sparse column), i.e. OC(!=0) x (KH*KW*IC!=0);
//    as the two cancel out, we just do BCSR here, so transpose should be false.
// 2. In an ordinary sparse MatMul, transpose selects between BCSR and BCSC.
|
||||
auto l = ic * kernelSize;
|
||||
|
||||
int columOffset = 0;
|
||||
int i = 0;
|
||||
std::vector<int32_t> tempIndexes(sparseBlockOC * l);
|
||||
int offset = 0;
|
||||
for (; i + sparseBlockOC <= h; i += sparseBlockOC) {
|
||||
*NNZMap = 0;
|
||||
offset = _fillIndex(tempIndexes.data(), i * l, (i+sparseBlockOC) * l, indexes, indexSize, offset);
|
||||
// Original weight layout is [oc, ic, kernelSize]; the packed weight order is [oc, kernelSize, ic]
|
||||
for (int x=0; x<kernelSize; ++x) {
|
||||
for (int y=0; y<ic; ++y) {
|
||||
auto j = y * kernelSize + x;
|
||||
bool allZero = true;
|
||||
for (int u=0; u<sparseBlockOC; ++u) {
|
||||
if (tempIndexes[u*l + j] >= 0) {
|
||||
allZero = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!allZero) {
|
||||
for (int ioc = 0; ioc < sparseBlockOC; ioc++) {
|
||||
auto index = tempIndexes[ioc*l + j];
|
||||
if (index >= 0) {
|
||||
*dest = source[index];
|
||||
} else {
|
||||
*dest = 0.0f;
|
||||
}
|
||||
dest++;
|
||||
}
|
||||
*NNZMap = *NNZMap + 1;
|
||||
*dataOffsetMap = columOffset;
|
||||
dataOffsetMap++;
|
||||
columOffset = 0;
|
||||
}
|
||||
columOffset += eP;
|
||||
}
|
||||
}
|
||||
NNZMap++;
|
||||
columOffset -= l * eP;
|
||||
}
|
||||
|
||||
for (; i < h; i++) {
|
||||
*NNZMap = 0;
|
||||
offset = _fillIndex(tempIndexes.data(), i * l, (i+1) * l, indexes, indexSize, offset);
|
||||
for (int x=0; x<kernelSize; ++x) {
|
||||
for (int y=0; y<ic; ++y) {
|
||||
auto j = y * kernelSize + x;
|
||||
auto index = tempIndexes[j];
|
||||
if (index >= 0) {
|
||||
*dest = source[index];
|
||||
dest++;
|
||||
*NNZMap = *NNZMap + 1;
|
||||
*dataOffsetMap = columOffset;
|
||||
dataOffsetMap++;
|
||||
columOffset = 0;
|
||||
}
|
||||
columOffset += eP;
|
||||
}
|
||||
}
|
||||
NNZMap++;
|
||||
columOffset -= l * eP;
|
||||
}
|
||||
|
||||
*dataOffsetMap = columOffset; // trailing entry: leftover offset after the last stored column
|
||||
return;
|
||||
}
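// Note (added for clarity, not part of the diff): the packed output is a BCSR-style
// triple. `dest` holds the stored values (sparseBlockOC values per stored block column,
// zero-filled where a row of a partially empty block has no nonzero), NNZMap[r] counts
// the stored columns of each block row and then of each tail row, and dataOffsetMap
// stores, per stored column, the eP-scaled distance to advance in the packed input
// since the previously stored column, with one extra trailing entry for the leftover
// offset after the last stored column.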
|
||||
void SparseConvolutionTiledExecutor::initWeight(float* dest, unsigned int* NNZMap, int* dataOffsetMap,
|
||||
int sparseBlockOC, const float* source, float* cache, int depth,
|
||||
int sparseBlockOC, const float* source, const uint32_t* indexes, uint32_t indexSize, int depth,
|
||||
int outputCount, int kernelSize, int eP, size_t weightNNZElement,
|
||||
size_t weightBlockNumber, const CoreFunctions* function) {
|
||||
ConvolutionTiledExecutor::initWeight(source, cache, depth, outputCount, kernelSize, function);
|
||||
function->MNNPackForSparseMatMul_B(dest, NNZMap, dataOffsetMap, sparseBlockOC, cache, outputCount, kernelSize * depth, eP, false);
|
||||
MNNPackForSparseMatMul_B(dest, NNZMap, dataOffsetMap, sparseBlockOC, source, indexes, indexSize, outputCount, depth, kernelSize, eP);
|
||||
|
||||
// MNN_PRINT("\nBCSR origin weight:");
|
||||
// formatMatrix(source, {outputCount, kernelSize * depth});
|
||||
|
@ -40,13 +179,13 @@ void SparseConvolutionTiledExecutor::initWeight(float* dest, unsigned int* NNZMa
|
|||
|
||||
|
||||
SparseConvolutionTiledExecutor::SparseConvolutionTiledExecutor(const Convolution2DCommon *common, Backend* b,
|
||||
const float* originWeight, size_t originWeightSize, const SparseCommon* sparseCommon,
|
||||
const IDSTQuan* weight, const SparseCommon* sparseCommon,
|
||||
const float* bias, size_t biasSize)
|
||||
: ConvolutionTiledExecutor(b, bias, biasSize) {
|
||||
|
||||
auto outputCount = (int)biasSize;
|
||||
// Don't use common->inputCount: for old models common->inputCount is zero
|
||||
auto lSize = originWeightSize / outputCount;
|
||||
auto lSize = weight->weightSize() / outputCount;
|
||||
auto srcCount = lSize / (common->kernelX() * common->kernelY());
|
||||
|
||||
int eP, lP, hP;
|
||||
|
@ -64,7 +203,7 @@ SparseConvolutionTiledExecutor::SparseConvolutionTiledExecutor(const Convolution
|
|||
if (optimalSparseBlockOC != sparseBlockOC) {
|
||||
size_t optimalWeightNNZElement = weightNNZElement;
|
||||
size_t optimalWeightBlockNumber = weightBlockNumber;
|
||||
core->MNNGetOptimalBlockShape(optimalWeightNNZElement, optimalWeightBlockNumber, originWeight, optimalSparseBlockOC, outputCount, lSize);
|
||||
MNNGetOptimalBlockShape(optimalWeightNNZElement, optimalWeightBlockNumber, weight->index()->data(), weight->index()->size(), optimalSparseBlockOC, outputCount, lSize);
|
||||
MNN_ASSERT(sparseBlockOC == 1 || sparseBlockOC == 2 || sparseBlockOC == 4 || sparseBlockOC == 8);
|
||||
// MNN_PRINT("caution: sparsity changed!!!\nsparseBlockOC:%d -> %d weightNNZElement:%zu -> %zu, weightBlockNumber:%zu -> %zu, outputCount:%d, divide:%d, tail:%d\n",
|
||||
// sparseBlockOC, optimalSparseBlockOC, weightNNZElement, optimalWeightNNZElement, weightBlockNumber, optimalWeightBlockNumber, outputCount, outputCount / optimalSparseBlockOC, outputCount % optimalSparseBlockOC);
|
||||
|
@ -72,26 +211,25 @@ SparseConvolutionTiledExecutor::SparseConvolutionTiledExecutor(const Convolution
|
|||
weightNNZElement = optimalWeightNNZElement;
|
||||
weightBlockNumber = optimalWeightBlockNumber;
|
||||
}
|
||||
MNN_ASSERT(weightNNZElement > 0);
|
||||
MNN_ASSERT(weightBlockNumber > 0);
|
||||
|
||||
mSparseIndexData.reset(new SparseIndexData(sparseBlockOC, weightNNZElement, weightBlockNumber, backend()));
|
||||
|
||||
mResource->mWeight.reset(Tensor::createDevice<uint8_t>(
|
||||
{ static_cast<int>(weightNNZElement + 1) * bytes })); // one more element in case of weight are all zeros
|
||||
std::shared_ptr<Tensor> cache(Tensor::createDevice<uint8_t>({static_cast<int>(outputCount * lSize * sizeof(float))})); // cache must be float
|
||||
|
||||
mSparseIndexData->mNNZMap.reset(Tensor::createDevice<unsigned int>({outputCount / sparseBlockOC + outputCount % sparseBlockOC}));
|
||||
mSparseIndexData->mDataOffsetMap.reset(Tensor::createDevice<int>({static_cast<int>(weightBlockNumber + 1)}));
|
||||
|
||||
mValid = backend()->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC);
|
||||
mValid = mValid && backend()->onAcquireBuffer(cache.get(), Backend::STATIC);
|
||||
mValid = mValid && backend()->onAcquireBuffer(mSparseIndexData->mNNZMap.get(), Backend::STATIC);
|
||||
mValid = mValid && backend()->onAcquireBuffer(mSparseIndexData->mDataOffsetMap.get(), Backend::STATIC);
|
||||
if (!mValid) {
|
||||
return;
|
||||
}
|
||||
|
||||
initWeight(mResource->mWeight->host<float>(), mSparseIndexData->mNNZMap->host<unsigned int>(), mSparseIndexData->mDataOffsetMap->host<int>(), sparseBlockOC, originWeight, cache->host<float>(), srcCount, outputCount, common->kernelX() * common->kernelY(), eP, weightNNZElement, weightBlockNumber, core);
|
||||
backend()->onReleaseBuffer(cache.get(), Backend::STATIC);
|
||||
initWeight(mResource->mWeight->host<float>(), mSparseIndexData->mNNZMap->host<unsigned int>(), mSparseIndexData->mDataOffsetMap->host<int>(), sparseBlockOC, weight->alpha()->data(), weight->index()->data(), weight->index()->size(), srcCount, outputCount, common->kernelX() * common->kernelY(), eP, weightNNZElement, weightBlockNumber, core);
|
||||
mProxy.reset(new SparseConvolutionTiledImpl(common, packedSparseMatmul, sparseBlockOC, b));
|
||||
}
|
||||
|
||||
|
|
|
@ -67,8 +67,7 @@ public:
|
|||
|
||||
class SparseConvolutionTiledExecutor : public ConvolutionTiledExecutor {
|
||||
public:
|
||||
SparseConvolutionTiledExecutor(const Convolution2DCommon *common, Backend *b, const float *originWeight,
|
||||
size_t originWeightSize, const SparseCommon* sparseCommon, const float *bias, size_t biasSize);
|
||||
SparseConvolutionTiledExecutor(const Convolution2DCommon *common, Backend *b, const IDSTQuan* weight, const SparseCommon* sparseCommon, const float *bias, size_t biasSize);
|
||||
|
||||
SparseConvolutionTiledExecutor(std::shared_ptr<CPUConvolution::Resource> res, std::shared_ptr<SparseIndexData> mSparseIndexData,
|
||||
const Convolution2DCommon *common, MNNPackedSparseMatMul packedSparseMatmul, int sparseBlockOC, Backend *b);
|
||||
|
@ -84,24 +83,9 @@ public:
|
|||
virtual bool onClone(Backend *bn, const Op *op, Execution **dst) override;
|
||||
|
||||
void initWeight(float *dest, unsigned int *NNZMap, int *dataOffsetMap, int sparseBlockOC, const float *source,
|
||||
float *cache, int depth, int outputCount, int kernelSize, int eP, size_t weightNNZElement,
|
||||
const uint32_t* indexes, uint32_t indexSize, int depth, int outputCount, int kernelSize, int eP, size_t weightNNZElement,
|
||||
size_t weightBlockNumber, const CoreFunctions *function);
|
||||
|
||||
static bool shouldUseSparseConvolution(size_t originWeightSize, const SparseCommon* sparseCommon) {
|
||||
auto sparseBlockOC = sparseCommon->args()->LookupByKey("sparseBlockOC")->i();
|
||||
size_t weightNNZElement = sparseCommon->args()->LookupByKey("NNZElement")->i();
|
||||
return shouldUseSparseConvolution((originWeightSize - weightNNZElement) / ((double)originWeightSize), sparseBlockOC);
|
||||
}
|
||||
static bool inline shouldUseSparseConvolution(float sparsity, int sparseBlockOC) {
|
||||
std::vector<float> thresholds = getSparsityThreshold();
|
||||
return sparsity > thresholds[std::min(std::max(sparseBlockOC, 0), (int)thresholds.size() - 1)];
|
||||
}
|
||||
static inline std::vector<float> getSparsityThreshold() {
|
||||
|
||||
// sparsity threshold values, when sparseBlockOC is
|
||||
// {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
|
||||
return {1.f, 0.6f, 0.5f, 0.4f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f};
|
||||
}
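// Usage example (added for clarity, not part of the diff):
//   shouldUseSparseConvolution(0.7f, 4) -> 0.7 > thresholds[4] (0.3f) -> true
//   shouldUseSparseConvolution(0.5f, 1) -> 0.5 > thresholds[1] (0.6f) -> false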
|
||||
protected:
|
||||
std::shared_ptr<SparseConvolutionTiledImpl> mProxy;
|
||||
std::shared_ptr<SparseIndexData> mSparseIndexData;
|
||||
|
@ -110,4 +94,4 @@ protected:
|
|||
#undef RELEASE_BUFFER_HINT
|
||||
} // namespace MNN
|
||||
|
||||
#endif /* SparseConvolutionTiledExecutor_hpp */
|
||||
#endif /* SparseConvolutionTiledExecutor_hpp */
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
set(CUDA_MIN_VERSION "7.0")
|
||||
set(CUDA_MIN_VERSION "8.0")
|
||||
find_package(CUDA ${CUDA_MIN_VERSION})
|
||||
|
||||
set (EXTRA_LIBS "")
|
||||
|
@ -21,6 +21,16 @@ if(CUDA_FOUND)
|
|||
include(${CMAKE_CURRENT_SOURCE_DIR}/SelectCudaComputeArch.cmake)
|
||||
CUDA_SELECT_NVCC_ARCH_FLAGS(CUDA_ARCH_FLAGS ${CUDA_ARCHS})
|
||||
|
||||
list(LENGTH CUDA_ARCH_FLAGS_readable_code arch_count)
|
||||
# Current Supported Arch List
|
||||
IF (${arch_count} EQUAL 1)
|
||||
set(support_archs 60 61 62 70 72 75 80 86)
|
||||
list(FIND support_archs ${CUDA_ARCH_FLAGS_readable_code} list_index)
|
||||
IF (${list_index} EQUAL -1)
|
||||
message(FATAL_ERROR "Please add your own sm arch ${CUDA_ARCH_FLAGS_readable_code} to CmakeLists.txt!")
|
||||
ENDIF()
|
||||
ENDIF()
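# Example (added for clarity, not part of the diff): configuring on a machine whose only
# detected GPU is sm_86 gives arch_count == 1 and CUDA_ARCH_FLAGS_readable_code == 86,
# which is found in support_archs, so configuration continues; an unlisted value such as
# 89 would hit the FATAL_ERROR above.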
|
||||
|
||||
IF ((CUDA_VERSION VERSION_GREATER "8.0") OR (CUDA_VERSION VERSION_EQUAL "8.0"))
|
||||
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60")
|
||||
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61")
|
||||
|
@ -41,6 +51,27 @@ if(CUDA_FOUND)
|
|||
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_86,code=sm_86")
|
||||
ENDIF()
|
||||
|
||||
# Limit minimum cuda version for each archs
|
||||
IF (${arch_count} EQUAL 1)
|
||||
IF ((CUDA_ARCH_FLAGS_readable_code VERSION_GREATER "80") OR (CUDA_ARCH_FLAGS_readable_code VERSION_EQUAL "80"))
|
||||
IF (CUDA_VERSION VERSION_LESS "11.2")
|
||||
message(FATAL_ERROR "Please update cuda version to 11.2 or higher!")
|
||||
ENDIF()
|
||||
ENDIF()
|
||||
|
||||
IF ((CUDA_ARCH_FLAGS_readable_code VERSION_GREATER "75") OR (CUDA_ARCH_FLAGS_readable_code VERSION_EQUAL "75"))
|
||||
IF (CUDA_VERSION VERSION_LESS "10.2")
|
||||
message(FATAL_ERROR "Please update cuda version to 10.2 or higher!")
|
||||
ENDIF()
|
||||
ENDIF()
|
||||
|
||||
IF ((CUDA_ARCH_FLAGS_readable_code VERSION_GREATER "70") OR (CUDA_ARCH_FLAGS_readable_code VERSION_EQUAL "70"))
|
||||
IF (CUDA_VERSION VERSION_LESS "10.1")
|
||||
message(FATAL_ERROR "Please update cuda version to 10.1 or higher!")
|
||||
ENDIF()
|
||||
ENDIF()
|
||||
ENDIF()
|
||||
|
||||
message(STATUS "Enabling CUDA support (version: ${CUDA_VERSION_STRING},"
|
||||
" archs: ${CUDA_ARCH_FLAGS_readable})")
|
||||
else()
|
||||
|
|
|
@ -36,9 +36,9 @@
|
|||
# - "Auto" detects local machine GPU compute arch at runtime.
|
||||
# - "Common" and "All" cover common and entire subsets of architectures
|
||||
# ARCH_AND_PTX : NAME | NUM.NUM | NUM.NUM(NUM.NUM) | NUM.NUM+PTX
|
||||
# NAME: Fermi Kepler Maxwell Kepler+Tegra Kepler+Tesla Maxwell+Tegra Pascal Volta Turing
|
||||
# NAME: Kepler Maxwell Kepler+Tegra Kepler+Tesla Maxwell+Tegra Pascal Volta Turing Ampere
|
||||
# NUM: Any number. Only those pairs are currently accepted by NVCC though:
|
||||
# 2.0 2.1 3.0 3.2 3.5 3.7 5.0 5.2 5.3 6.0 6.2 7.0 7.2 7.5
|
||||
# 3.5 3.7 5.0 5.2 5.3 6.0 6.2 7.0 7.2 7.5 8.0
|
||||
# Returns LIST of flags to be added to CUDA_NVCC_FLAGS in ${out_variable}
|
||||
# Additionally, sets ${out_variable}_readable to the resulting numeric list
|
||||
# Example:
|
||||
|
@ -58,39 +58,19 @@ endif()
|
|||
# See: https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#gpu-feature-list
|
||||
|
||||
# This list will be used for CUDA_ARCH_NAME = All option
|
||||
set(CUDA_KNOWN_GPU_ARCHITECTURES "")
|
||||
|
||||
# CUDA 9.X and later do not support the Fermi architecture anymore.
|
||||
if(CUDA_VERSION VERSION_LESS "9.0")
|
||||
list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Fermi")
|
||||
endif()
|
||||
list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Kepler" "Maxwell")
|
||||
set(CUDA_KNOWN_GPU_ARCHITECTURES "Kepler" "Maxwell")
|
||||
|
||||
# This list will be used for CUDA_ARCH_NAME = Common option (enabled by default)
|
||||
set(CUDA_COMMON_GPU_ARCHITECTURES "3.0" "3.5" "5.0")
|
||||
|
||||
if(CUDA_VERSION VERSION_LESS "7.0")
|
||||
set(CUDA_LIMIT_GPU_ARCHITECTURE "5.2")
|
||||
endif()
|
||||
set(CUDA_COMMON_GPU_ARCHITECTURES "3.5" "5.0")
|
||||
|
||||
# This list is used to filter CUDA archs when autodetecting
|
||||
set(CUDA_ALL_GPU_ARCHITECTURES "3.0" "3.2" "3.5" "5.0")
|
||||
|
||||
if(CUDA_VERSION VERSION_EQUAL "7.0" OR CUDA_VERSION VERSION_GREATER "7.0")
|
||||
list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Kepler+Tegra" "Kepler+Tesla" "Maxwell+Tegra")
|
||||
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "5.2")
|
||||
|
||||
if(CUDA_VERSION VERSION_LESS "8.0")
|
||||
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "5.2+PTX")
|
||||
set(CUDA_LIMIT_GPU_ARCHITECTURE "6.0")
|
||||
endif()
|
||||
endif()
|
||||
set(CUDA_ALL_GPU_ARCHITECTURES "3.5" "5.0")
|
||||
|
||||
if(CUDA_VERSION VERSION_EQUAL "8.0" OR CUDA_VERSION VERSION_GREATER "8.0")
|
||||
list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Pascal")
|
||||
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "6.0" "6.1")
|
||||
list(APPEND CUDA_ALL_GPU_ARCHITECTURES "6.0" "6.1" "6.2")
|
||||
|
||||
|
||||
if(CUDA_VERSION VERSION_LESS "9.0")
|
||||
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "6.1+PTX")
|
||||
set(CUDA_LIMIT_GPU_ARCHITECTURE "7.0")
|
||||
|
@ -101,22 +81,58 @@ if(CUDA_VERSION VERSION_EQUAL "9.0" OR CUDA_VERSION VERSION_GREATER "9.0")
|
|||
list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Volta")
|
||||
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "7.0" "7.0+PTX")
|
||||
list(APPEND CUDA_ALL_GPU_ARCHITECTURES "7.0" "7.0+PTX" "7.2" "7.2+PTX")
|
||||
|
||||
if(CUDA_VERSION VERSION_LESS "10.0")
|
||||
set(CUDA_LIMIT_GPU_ARCHITECTURE "8.0")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(CUDA_VERSION VERSION_GREATER "10.5")
|
||||
list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Ampere")
|
||||
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.0")
|
||||
list(APPEND CUDA_ALL_GPU_ARCHITECTURES "8.0")
|
||||
|
||||
if(CUDA_VERSION VERSION_LESS "11.1")
|
||||
set(CUDA_LIMIT_GPU_ARCHITECTURE "8.0")
|
||||
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.0+PTX")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(CUDA_VERSION VERSION_EQUAL "10.0" OR CUDA_VERSION VERSION_GREATER "10.0")
|
||||
list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Turing")
|
||||
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "7.5" "7.5+PTX")
|
||||
list(APPEND CUDA_ALL_GPU_ARCHITECTURES "7.5" "7.5+PTX")
|
||||
|
||||
|
||||
if(CUDA_VERSION VERSION_LESS "11.0")
|
||||
set(CUDA_LIMIT_GPU_ARCHITECTURE "9.0")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(NOT CUDA_VERSION VERSION_LESS "11.1")
|
||||
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.6")
|
||||
list(APPEND CUDA_ALL_GPU_ARCHITECTURES "8.6")
|
||||
set(CUDA_LIMIT_GPU_ARCHITECUTRE "8.6")
|
||||
|
||||
if(CUDA_VERSION VERSION_LESS "11.8")
|
||||
set(CUDA_LIMIT_GPU_ARCHITECTURE "8.9")
|
||||
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.6+PTX")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(NOT CUDA_VERSION VERSION_LESS "11.8")
|
||||
list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Ada")
|
||||
list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Hopper")
|
||||
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.9")
|
||||
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "9.0")
|
||||
list(APPEND CUDA_ALL_GPU_ARCHITECTURES "8.9")
|
||||
list(APPEND CUDA_ALL_GPU_ARCHITECTURES "9.0")
|
||||
|
||||
if(CUDA_VERSION VERSION_LESS "12.0")
|
||||
set(CUDA_LIMIT_GPU_ARCHITECTURE "9.0")
|
||||
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.9+PTX")
|
||||
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "9.0+PTX")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
################################################################################################
|
||||
# A function for automatic detection of GPUs installed (if autodetection is enabled)
|
||||
# Usage:
|
||||
|
@ -175,7 +191,8 @@ function(CUDA_DETECT_INSTALLED_GPUS OUT_VARIABLE)
|
|||
set(CUDA_GPU_DETECT_OUTPUT_FILTERED "")
|
||||
separate_arguments(CUDA_GPU_DETECT_OUTPUT)
|
||||
foreach(ITEM IN ITEMS ${CUDA_GPU_DETECT_OUTPUT})
|
||||
if(CUDA_LIMIT_GPU_ARCHITECTURE AND (ITEM VERSION_EQUAL CUDA_LIMIT_GPU_ARCHITECTURE OR ITEM VERSION_GREATER CUDA_LIMIT_GPU_ARCHITECTURE))
|
||||
if(CUDA_LIMIT_GPU_ARCHITECTURE AND (ITEM VERSION_GREATER CUDA_LIMIT_GPU_ARCHITECTURE OR
|
||||
ITEM VERSION_EQUAL CUDA_LIMIT_GPU_ARCHITECTURE))
|
||||
list(GET CUDA_COMMON_GPU_ARCHITECTURES -1 NEWITEM)
|
||||
string(APPEND CUDA_GPU_DETECT_OUTPUT_FILTERED " ${NEWITEM}")
|
||||
else()
|
||||
|
@ -228,14 +245,10 @@ function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable)
|
|||
set(arch_ptx ${arch_bin})
|
||||
else()
|
||||
# Look for it in our list of known architectures
|
||||
if(${arch_name} STREQUAL "Fermi")
|
||||
set(arch_bin 2.0 "2.1(2.0)")
|
||||
elseif(${arch_name} STREQUAL "Kepler+Tegra")
|
||||
set(arch_bin 3.2)
|
||||
elseif(${arch_name} STREQUAL "Kepler+Tesla")
|
||||
if(${arch_name} STREQUAL "Kepler+Tesla")
|
||||
set(arch_bin 3.7)
|
||||
elseif(${arch_name} STREQUAL "Kepler")
|
||||
set(arch_bin 3.0 3.5)
|
||||
set(arch_bin 3.5)
|
||||
set(arch_ptx 3.5)
|
||||
elseif(${arch_name} STREQUAL "Maxwell+Tegra")
|
||||
set(arch_bin 5.3)
|
||||
|
@ -245,12 +258,25 @@ function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable)
|
|||
elseif(${arch_name} STREQUAL "Pascal")
|
||||
set(arch_bin 6.0 6.1)
|
||||
set(arch_ptx 6.1)
|
||||
elseif(${arch_name} STREQUAL "Volta+Tegra")
|
||||
set(arch_bin 7.2)
|
||||
elseif(${arch_name} STREQUAL "Volta")
|
||||
set(arch_bin 7.0 7.0)
|
||||
set(arch_ptx 7.0)
|
||||
elseif(${arch_name} STREQUAL "Turing")
|
||||
set(arch_bin 7.5)
|
||||
set(arch_ptx 7.5)
|
||||
elseif(${arch_name} STREQUAL "Ampere+Tegra")
|
||||
set(arch_bin 8.7)
|
||||
elseif(${arch_name} STREQUAL "Ampere")
|
||||
set(arch_bin 8.0 8.6)
|
||||
set(arch_ptx 8.0 8.6)
|
||||
elseif(${arch_name} STREQUAL "Ada")
|
||||
set(arch_bin 8.9)
|
||||
set(arch_ptx 8.9)
|
||||
elseif(${arch_name} STREQUAL "Hopper")
|
||||
set(arch_bin 9.0)
|
||||
set(arch_ptx 9.0)
|
||||
else()
|
||||
message(SEND_ERROR "Unknown CUDA Architecture Name ${arch_name} in CUDA_SELECT_NVCC_ARCH_FLAGS")
|
||||
endif()
|
||||
|
@ -282,17 +308,20 @@ function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable)
|
|||
|
||||
set(nvcc_flags "")
|
||||
set(nvcc_archs_readable "")
|
||||
set(nvcc_archs_code "")
|
||||
|
||||
# Tell NVCC to add binaries for the specified GPUs
|
||||
foreach(arch ${cuda_arch_bin})
|
||||
if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)")
|
||||
# User explicitly specified ARCH for the concrete CODE
|
||||
list(APPEND nvcc_flags " -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1}")
|
||||
list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1})
|
||||
list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1})
|
||||
list(APPEND nvcc_archs_code ${CMAKE_MATCH_1})
|
||||
else()
|
||||
# User didn't explicitly specify ARCH for the concrete CODE, we assume ARCH=CODE
|
||||
list(APPEND nvcc_flags " -gencode arch=compute_${arch},code=sm_${arch}")
|
||||
list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch})
|
||||
list(APPEND nvcc_archs_readable sm_${arch})
|
||||
list(APPEND nvcc_archs_code ${arch})
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
|
@ -305,4 +334,5 @@ function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable)
|
|||
string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}")
|
||||
set(${out_variable} ${nvcc_flags} PARENT_SCOPE)
|
||||
set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE)
|
||||
endfunction()
|
||||
set(${out_variable}_readable_code ${nvcc_archs_code} PARENT_SCOPE)
|
||||
endfunction()
|
|
@ -215,7 +215,8 @@ using GemmTensor_F16_F16_Linear_AlignCuda_Sm75 = cutlass::gemm::device::Gemm<
|
|||
cutlass::gemm::GemmShape<16, 8, 8>,
|
||||
EpilogueCudaOp_F16_Linear,
|
||||
SwizzleThreadBlock,
|
||||
NumStages>;
|
||||
NumStages,
|
||||
128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;
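// Note (added for clarity, not part of the diff): the three parameters appended to this
// and the following Gemm typedefs are, in CUTLASS's device::Gemm template order, the A
// and B operand alignments in elements (a 128-bit access divided by the element width)
// and SplitKSerial = true, which enables the serial split-K path exercised by the
// split_k_slices arguments in MatMulExecution.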
|
||||
|
||||
using GemmTensor_F16_F16_Linear_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
|
||||
cutlass::half_t,
|
||||
|
@ -232,7 +233,8 @@ using GemmTensor_F16_F16_Linear_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
|
|||
cutlass::gemm::GemmShape<16, 8, 8>,
|
||||
EpilogueTensorOp_F16_Linear,
|
||||
SwizzleThreadBlock,
|
||||
NumStages>;
|
||||
NumStages,
|
||||
128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;
|
||||
|
||||
using GemmTensor_F16_F32_Linear_AlignCuda_Sm75 = cutlass::gemm::device::Gemm<
|
||||
cutlass::half_t,
|
||||
|
@ -249,7 +251,8 @@ using GemmTensor_F16_F32_Linear_AlignCuda_Sm75 = cutlass::gemm::device::Gemm<
|
|||
cutlass::gemm::GemmShape<16, 8, 8>,
|
||||
EpilogueCudaOp_F32_Linear,
|
||||
SwizzleThreadBlock,
|
||||
NumStages>;
|
||||
NumStages,
|
||||
128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;
|
||||
|
||||
using GemmTensor_F16_F32_Linear_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
|
||||
cutlass::half_t,
|
||||
|
@ -266,7 +269,8 @@ using GemmTensor_F16_F32_Linear_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
|
|||
cutlass::gemm::GemmShape<16, 8, 8>,
|
||||
EpilogueTensorOp_F32_Linear,
|
||||
SwizzleThreadBlock,
|
||||
NumStages>;
|
||||
NumStages,
|
||||
128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;
|
||||
|
||||
using GemmCuda_F32_F32_Linear_AlignCuda = cutlass::gemm::device::Gemm<
|
||||
float,
|
||||
|
@ -334,7 +338,8 @@ using GemmTensor_F32_F32_Linear_AlignCuda_Sm75 = cutlass::gemm::device::Gemm<
|
|||
cutlass::gemm::GemmShape<16, 8, 8>,
|
||||
EpilogueCudaOp_F32_Linear,
|
||||
SwizzleThreadBlock,
|
||||
NumStages>;
|
||||
NumStages,
|
||||
128 / cutlass::sizeof_bits<float>::value, 128 / cutlass::sizeof_bits<float>::value, true>;
|
||||
|
||||
using GemmTensor_F32_F32_Linear_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
|
||||
float,
|
||||
|
@ -351,7 +356,8 @@ using GemmTensor_F32_F32_Linear_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
|
|||
cutlass::gemm::GemmShape<16, 8, 8>,
|
||||
EpilogueTensorOp_F32_Linear,
|
||||
SwizzleThreadBlock,
|
||||
NumStages>;
|
||||
NumStages,
|
||||
128 / cutlass::sizeof_bits<float>::value, 128 / cutlass::sizeof_bits<float>::value, true>;
|
||||
|
||||
using GemmCuda_F16_F16_Relu_AlignCuda = cutlass::gemm::device::Gemm<
|
||||
cutlass::half_t,
|
||||
|
@ -470,7 +476,8 @@ using GemmTensor_F16_F16_Relu_AlignCuda_Sm75 = cutlass::gemm::device::Gemm<
|
|||
cutlass::gemm::GemmShape<16, 8, 8>,
|
||||
EpilogueCudaOp_F16_Relu,
|
||||
SwizzleThreadBlock,
|
||||
NumStages>;
|
||||
NumStages,
|
||||
128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;
|
||||
|
||||
using GemmTensor_F16_F16_Relu_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
|
||||
cutlass::half_t,
|
||||
|
@ -487,7 +494,8 @@ using GemmTensor_F16_F16_Relu_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
|
|||
cutlass::gemm::GemmShape<16, 8, 8>,
|
||||
EpilogueTensorOp_F16_Relu,
|
||||
SwizzleThreadBlock,
|
||||
NumStages>;
|
||||
NumStages,
|
||||
128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;
|
||||
|
||||
using GemmTensor_F16_F32_Relu_AlignCuda_Sm75 = cutlass::gemm::device::Gemm<
|
||||
cutlass::half_t,
|
||||
|
@ -504,7 +512,8 @@ using GemmTensor_F16_F32_Relu_AlignCuda_Sm75 = cutlass::gemm::device::Gemm<
|
|||
cutlass::gemm::GemmShape<16, 8, 8>,
|
||||
EpilogueCudaOp_F32_Relu,
|
||||
SwizzleThreadBlock,
|
||||
NumStages>;
|
||||
NumStages,
|
||||
128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;
|
||||
|
||||
using GemmTensor_F16_F32_Relu_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
|
||||
cutlass::half_t,
|
||||
|
@ -521,7 +530,8 @@ using GemmTensor_F16_F32_Relu_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
|
|||
cutlass::gemm::GemmShape<16, 8, 8>,
|
||||
EpilogueTensorOp_F32_Relu,
|
||||
SwizzleThreadBlock,
|
||||
NumStages>;
|
||||
NumStages,
|
||||
128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;
|
||||
|
||||
using GemmCuda_F32_F32_Relu_AlignCuda = cutlass::gemm::device::Gemm<
|
||||
float,
|
||||
|
@ -589,7 +599,8 @@ using GemmTensor_F32_F32_Relu_AlignCuda_Sm75 = cutlass::gemm::device::Gemm<
|
|||
cutlass::gemm::GemmShape<16, 8, 8>,
|
||||
EpilogueCudaOp_F32_Relu,
|
||||
SwizzleThreadBlock,
|
||||
NumStages>;
|
||||
NumStages,
|
||||
128 / cutlass::sizeof_bits<float>::value, 128 / cutlass::sizeof_bits<float>::value, true>;
|
||||
|
||||
using GemmTensor_F32_F32_Relu_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
|
||||
float,
|
||||
|
@ -606,7 +617,8 @@ using GemmTensor_F32_F32_Relu_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
|
|||
cutlass::gemm::GemmShape<16, 8, 8>,
|
||||
EpilogueTensorOp_F32_Relu,
|
||||
SwizzleThreadBlock,
|
||||
NumStages>;
|
||||
NumStages,
|
||||
128 / cutlass::sizeof_bits<float>::value, 128 / cutlass::sizeof_bits<float>::value, true>;
|
||||
|
||||
using GemmCuda_F16_F16_Relu6_AlignCuda = cutlass::gemm::device::Gemm<
|
||||
cutlass::half_t,
|
||||
|
@ -725,7 +737,8 @@ using GemmTensor_F16_F16_Relu6_AlignCuda_Sm75 = cutlass::gemm::device::Gemm<
|
|||
cutlass::gemm::GemmShape<16, 8, 8>,
|
||||
EpilogueCudaOp_F16_Relu6,
|
||||
SwizzleThreadBlock,
|
||||
NumStages>;
|
||||
NumStages,
|
||||
128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;
|
||||
|
||||
using GemmTensor_F16_F16_Relu6_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
|
||||
cutlass::half_t,
|
||||
|
@ -742,7 +755,8 @@ using GemmTensor_F16_F16_Relu6_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
|
|||
cutlass::gemm::GemmShape<16, 8, 8>,
|
||||
EpilogueTensorOp_F16_Relu6,
|
||||
SwizzleThreadBlock,
|
||||
NumStages>;
|
||||
NumStages,
|
||||
128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;
|
||||
|
||||
using GemmTensor_F16_F32_Relu6_AlignCuda_Sm75 = cutlass::gemm::device::Gemm<
|
||||
cutlass::half_t,
|
||||
|
@ -759,7 +773,8 @@ using GemmTensor_F16_F32_Relu6_AlignCuda_Sm75 = cutlass::gemm::device::Gemm<
|
|||
cutlass::gemm::GemmShape<16, 8, 8>,
|
||||
EpilogueCudaOp_F32_Relu6,
|
||||
SwizzleThreadBlock,
|
||||
NumStages>;
|
||||
NumStages,
|
||||
128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;
|
||||
|
||||
using GemmTensor_F16_F32_Relu6_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
|
||||
cutlass::half_t,
|
||||
|
@ -776,7 +791,8 @@ using GemmTensor_F16_F32_Relu6_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
|
|||
cutlass::gemm::GemmShape<16, 8, 8>,
|
||||
EpilogueTensorOp_F32_Relu6,
|
||||
SwizzleThreadBlock,
|
||||
NumStages>;
|
||||
NumStages,
|
||||
128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;
|
||||
|
||||
using GemmCuda_F32_F32_Relu6_AlignCuda = cutlass::gemm::device::Gemm<
|
||||
float,
|
||||
|
@ -844,7 +860,8 @@ using GemmTensor_F32_F32_Relu6_AlignCuda_Sm75 = cutlass::gemm::device::Gemm<
|
|||
cutlass::gemm::GemmShape<16, 8, 8>,
|
||||
EpilogueCudaOp_F32_Relu6,
|
||||
SwizzleThreadBlock,
|
||||
NumStages>;
|
||||
NumStages,
|
||||
128 / cutlass::sizeof_bits<float>::value, 128 / cutlass::sizeof_bits<float>::value, true>;
|
||||
|
||||
using GemmTensor_F32_F32_Relu6_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
|
||||
float,
|
||||
|
@ -861,8 +878,9 @@ using GemmTensor_F32_F32_Relu6_AlignTensor_Sm75 = cutlass::gemm::device::Gemm<
|
|||
cutlass::gemm::GemmShape<16, 8, 8>,
|
||||
EpilogueTensorOp_F32_Relu6,
|
||||
SwizzleThreadBlock,
|
||||
NumStages>;
|
||||
NumStages,
|
||||
128 / cutlass::sizeof_bits<float>::value, 128 / cutlass::sizeof_bits<float>::value, true>;
|
||||
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
|
|
@ -4,36 +4,6 @@ namespace CUDA {
|
|||
|
||||
#define CUDA_KERNEL_LOOP(i, n) for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
|
||||
|
||||
#define FINAL_MASK 0xffffffff
|
||||
|
||||
template <typename T>
|
||||
__inline__ __device__
|
||||
T warpReduceSum(T val)
|
||||
{
|
||||
for(int mask = 16; mask > 0; mask >>= 1)
|
||||
val += __shfl_xor_sync(FINAL_MASK, val, mask, 32);
|
||||
return val;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__inline__ __device__
|
||||
T blockReduceSum(T val)
|
||||
{
|
||||
static __shared__ T shared[32];
|
||||
int lane = threadIdx.x & 0x1f;
|
||||
int wid = threadIdx.x >> 5;
|
||||
|
||||
val = warpReduceSum<T>(val);
|
||||
|
||||
if(lane == 0)
|
||||
shared[wid] = val;
|
||||
__syncthreads();
|
||||
|
||||
val = (threadIdx.x < (blockDim.x >> 5 )) ? shared[lane] : (T)0.0f;
|
||||
val = warpReduceSum(val);
|
||||
return val;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__
|
||||
void input_layernorm(T* out, const T* input, const float* gamma, const float* beta, int m, int n, const float epsilon, int sumPerKnl)
|
||||
|
@ -10,7 +10,7 @@
|
|||
#define LayerNormExecution_hpp
|
||||
|
||||
#include "core/Execution.hpp"
|
||||
|
||||
#include "MNNCUDAFunction.cuh"
|
||||
#include <vector>
|
||||
#include "backend/cuda/core/CUDABackend.hpp"
|
||||
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
#ifndef MNNCUDAFunction_cuh
|
||||
#define MNNCUDAFunction_cuh
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
struct DivModFast {
|
||||
DivModFast(int d = 1)
|
||||
{
|
||||
|
@ -35,4 +37,68 @@ struct DivModFast {
|
|||
uint32_t l_; // ceil(log2(d_))
|
||||
uint32_t m_; // m' in the paper
|
||||
};
|
||||
|
||||
|
||||
#define FINAL_MASK 0xffffffff
|
||||
|
||||
template <typename T>
|
||||
__inline__ __device__
|
||||
T warpReduceSum(T val)
|
||||
{
|
||||
for(int mask = 16; mask > 0; mask >>= 1) {
|
||||
val += __shfl_xor_sync(FINAL_MASK, val, mask, 32);
|
||||
}
|
||||
return val;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__inline__ __device__
|
||||
T blockReduceSum(T val)
|
||||
{
|
||||
static __shared__ T shared[32];
|
||||
int lane = threadIdx.x & 0x1f;
|
||||
int wid = threadIdx.x >> 5;
|
||||
|
||||
val = warpReduceSum<T>(val);
|
||||
|
||||
if(lane == 0) {
|
||||
shared[wid] = val;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
val = (threadIdx.x < (blockDim.x >> 5 )) ? shared[lane] : (T)0.0f;
|
||||
val = warpReduceSum(val);
|
||||
return val;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__inline__ __device__
|
||||
T warpReduceMax(T val)
|
||||
{
|
||||
for(int mask = 16; mask > 0; mask >>= 1) {
|
||||
val = max(val, __shfl_xor_sync(FINAL_MASK, val, mask, 32));
|
||||
}
|
||||
return val;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__inline__ __device__
|
||||
T blockReduceMax(T val)
|
||||
{
|
||||
static __shared__ T shared[32];
|
||||
int lane = threadIdx.x & 0x1f;
|
||||
int wid = threadIdx.x >> 5;
|
||||
|
||||
val = warpReduceMax<T>(val);
|
||||
|
||||
if(lane == 0) {
|
||||
shared[wid] = val;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
val = (threadIdx.x < (blockDim.x >> 5 )) ? shared[lane] : (T)0.0f;
|
||||
val = warpReduceMax(val);
|
||||
return val;
|
||||
}
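// Illustrative usage sketch (added for clarity, not part of the diff; the kernel name
// and launch shape are hypothetical): each block sums its grid-stride slice of `in`
// with blockReduceSum and thread 0 writes one partial sum per block. Assumes blockDim.x
// is a multiple of 32 and at most 1024, as the shared[32] buffer implies.
template <typename T>
__global__ void blockSumKernel(T* out, const T* in, int n) {
    T v = (T)0.0f;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
        v += in[i];
    }
    v = blockReduceSum<T>(v);
    if (threadIdx.x == 0) {
        out[blockIdx.x] = v; // partial sums; reduce these in a second pass or on the host
    }
}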
|
||||
|
||||
#endif
|
|
@ -425,59 +425,109 @@ void MatMulExecution::setArguments(const std::vector<Tensor *> &inputs, const st
|
|||
cutlass_check(status);
|
||||
} else {
|
||||
if(hAlignment) {
|
||||
typename GemmBatchedTensor_F16_F16_Linear_AlignTensor_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
|
||||
{(ElementInput_F16 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
|
||||
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]* mAs), // batch_stride_A
|
||||
{(ElementInput_F16 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
|
||||
(int64_t)(mGemmInfo.elhPad[1] * mGemmInfo.elh[2]* mBs), // batch_stride_B
|
||||
{(ElementOutput_F16 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
|
||||
(int64_t)(0), // batch_stride_bias
|
||||
{(ElementOutput_F16 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
|
||||
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elh[2]), // batch_stride_C
|
||||
{alpha, beta}, // <- tuple of alpha and beta
|
||||
mBatch}; // batch_count
|
||||
if(mConvertGemmSplitK) {
|
||||
int split_k_slices = 16;
|
||||
typename GemmTensor_F16_F16_Linear_AlignTensor_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
|
||||
{(ElementInput_F16 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
|
||||
{(ElementInput_F16 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
|
||||
{(ElementOutput_F16 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
|
||||
{(ElementOutput_F16 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
|
||||
{alpha, beta}, // <- tuple of alpha and beta
|
||||
split_k_slices}; // <- k-dimension split factor
|
||||
size_t workspace_size = GemmTensor_F16_F16_Linear_AlignTensor_Sm75::get_workspace_size(arguments);
|
||||
|
||||
size_t workspace_size = GemmBatchedTensor_F16_F16_Linear_AlignTensor_Row_Column_Sm75::get_workspace_size(arguments);
|
||||
if(workspace_size != 0) {
|
||||
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
|
||||
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
|
||||
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
|
||||
}
|
||||
|
||||
if(workspace_size != 0) {
|
||||
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
|
||||
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
|
||||
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
|
||||
cutlass::Status status = mGemmF16F16LnAlign8Sm75.can_implement(arguments);
|
||||
cutlass_check(status);
|
||||
|
||||
// Initialize CUTLASS kernel with arguments and workspace pointer
|
||||
status = mGemmF16F16LnAlign8Sm75.initialize(arguments, (uint8_t *)mWorkspace);
|
||||
cutlass_check(status);
|
||||
} else {
|
||||
typename GemmBatchedTensor_F16_F16_Linear_AlignTensor_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
|
||||
{(ElementInput_F16 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
|
||||
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]* mAs), // batch_stride_A
|
||||
{(ElementInput_F16 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
|
||||
(int64_t)(mGemmInfo.elhPad[1] * mGemmInfo.elh[2]* mBs), // batch_stride_B
|
||||
{(ElementOutput_F16 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
|
||||
(int64_t)(0), // batch_stride_bias
|
||||
{(ElementOutput_F16 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
|
||||
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elh[2]), // batch_stride_C
|
||||
{alpha, beta}, // <- tuple of alpha and beta
|
||||
mBatch}; // batch_count
|
||||
|
||||
size_t workspace_size = GemmBatchedTensor_F16_F16_Linear_AlignTensor_Row_Column_Sm75::get_workspace_size(arguments);
|
||||
|
||||
if(workspace_size != 0) {
|
||||
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
|
||||
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
|
||||
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
|
||||
}
|
||||
// Check the problem size is supported or not
|
||||
cutlass::Status status = mGemmBatchedF16F16LnAlign8RCSm75.can_implement(arguments);
|
||||
cutlass_check(status);
|
||||
|
||||
// Initialize CUTLASS kernel with arguments and workspace pointer
|
||||
status = mGemmBatchedF16F16LnAlign8RCSm75.initialize(arguments, (uint8_t *)mWorkspace);
|
||||
cutlass_check(status);
|
||||
}
|
||||
// Check the problem size is supported or not
|
||||
cutlass::Status status = mGemmBatchedF16F16LnAlign8RCSm75.can_implement(arguments);
|
||||
cutlass_check(status);
|
||||
|
||||
// Initialize CUTLASS kernel with arguments and workspace pointer
|
||||
status = mGemmBatchedF16F16LnAlign8RCSm75.initialize(arguments, (uint8_t *)mWorkspace);
|
||||
cutlass_check(status);
|
||||
} else {
|
||||
typename GemmBatchedTensor_F16_F16_Linear_AlignCuda_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
|
||||
{(ElementInput_F16 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
|
||||
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]* mAs), // batch_stride_A
|
||||
{(ElementInput_F16 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
|
||||
(int64_t)(mGemmInfo.elhPad[1] * mGemmInfo.elh[2]* mBs), // batch_stride_B
|
||||
{(ElementOutput_F16 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
|
||||
(int64_t)(0), // batch_stride_bias
|
||||
{(ElementOutput_F16 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
|
||||
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elh[2]), // batch_stride_C
|
||||
{alpha, beta}, // <- tuple of alpha and beta
|
||||
mBatch}; // batch_count
|
||||
if(mConvertGemmSplitK) {
|
||||
int split_k_slices = 16;
|
||||
typename GemmTensor_F16_F16_Linear_AlignCuda_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
|
||||
{(ElementInput_F16 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
|
||||
{(ElementInput_F16 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
|
||||
{(ElementOutput_F16 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
|
||||
{(ElementOutput_F16 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
|
||||
{alpha, beta}, // <- tuple of alpha and beta
|
||||
split_k_slices}; // <- k-dimension split factor
|
||||
size_t workspace_size = GemmTensor_F16_F16_Linear_AlignCuda_Sm75::get_workspace_size(arguments);
|
||||
|
||||
size_t workspace_size = GemmBatchedTensor_F16_F16_Linear_AlignCuda_Row_Column_Sm75::get_workspace_size(arguments);
|
||||
if(workspace_size != 0) {
|
||||
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
|
||||
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
|
||||
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
|
||||
}
|
||||
|
||||
if(workspace_size != 0) {
|
||||
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
|
||||
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
|
||||
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
|
||||
cutlass::Status status = mGemmF16F16LnAlign1Sm75.can_implement(arguments);
|
||||
cutlass_check(status);
|
||||
|
||||
// Initialize CUTLASS kernel with arguments and workspace pointer
|
||||
status = mGemmF16F16LnAlign1Sm75.initialize(arguments, (uint8_t *)mWorkspace);
|
||||
cutlass_check(status);
|
||||
} else {
|
||||
typename GemmBatchedTensor_F16_F16_Linear_AlignCuda_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
|
||||
{(ElementInput_F16 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
|
||||
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]* mAs), // batch_stride_A
|
||||
{(ElementInput_F16 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
|
||||
(int64_t)(mGemmInfo.elhPad[1] * mGemmInfo.elh[2]* mBs), // batch_stride_B
|
||||
{(ElementOutput_F16 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
|
||||
(int64_t)(0), // batch_stride_bias
|
||||
{(ElementOutput_F16 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
|
||||
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elh[2]), // batch_stride_C
|
||||
{alpha, beta}, // <- tuple of alpha and beta
|
||||
mBatch}; // batch_count
|
||||
|
||||
size_t workspace_size = GemmBatchedTensor_F16_F16_Linear_AlignCuda_Row_Column_Sm75::get_workspace_size(arguments);
|
||||
|
||||
if(workspace_size != 0) {
|
||||
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
|
||||
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
|
||||
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
|
||||
}
|
||||
// Check the problem size is supported or not
|
||||
cutlass::Status status = mGemmBatchedF16F16LnAlign1RCSm75.can_implement(arguments);
|
||||
cutlass_check(status);
|
||||
|
||||
// Initialize CUTLASS kernel with arguments and workspace pointer
|
||||
status = mGemmBatchedF16F16LnAlign1RCSm75.initialize(arguments, (uint8_t *)mWorkspace);
|
||||
cutlass_check(status);
|
||||
}
|
||||
// Check the problem size is supported or not
|
||||
cutlass::Status status = mGemmBatchedF16F16LnAlign1RCSm75.can_implement(arguments);
|
||||
cutlass_check(status);
|
||||
|
||||
// Initialize CUTLASS kernel with arguments and workspace pointer
|
||||
status = mGemmBatchedF16F16LnAlign1RCSm75.initialize(arguments, (uint8_t *)mWorkspace);
|
||||
cutlass_check(status);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -541,63 +591,31 @@ void MatMulExecution::setArguments(const std::vector<Tensor *> &inputs, const st
|
|||
} else {
|
||||
if(hAlignment) {
|
||||
if(mNeedConvertMatAB) {
|
||||
typename GemmBatchedTensor_F16_F32_Linear_AlignTensor_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
|
||||
{(ElementInput_F16 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
|
||||
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]* mAs), // batch_stride_A
|
||||
{(ElementInput_F16 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
|
||||
(int64_t)(mGemmInfo.elhPad[1] * mGemmInfo.elh[2]* mBs), // batch_stride_B
|
||||
{(ElementOutput_F32 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
|
||||
(int64_t)(0), // batch_stride_bias
|
||||
{(ElementOutput_F32 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
|
||||
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elh[2]), // batch_stride_C
|
||||
{alpha, beta}, // <- tuple of alpha and beta
|
||||
mBatch}; // batch_count
|
||||
|
||||
size_t workspace_size = GemmBatchedTensor_F16_F32_Linear_AlignTensor_Row_Column_Sm75::get_workspace_size(arguments);
|
||||
|
||||
if(workspace_size != 0) {
|
||||
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
|
||||
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
|
||||
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
|
||||
}
|
||||
// Check the problem size is supported or not
|
||||
cutlass::Status status = mGemmBatchedF16F32LnAlign8RCSm75.can_implement(arguments);
|
||||
cutlass_check(status);
|
||||
|
||||
// Initialize CUTLASS kernel with arguments and workspace pointer
|
||||
status = mGemmBatchedF16F32LnAlign8RCSm75.initialize(arguments, (uint8_t *)mWorkspace);
|
||||
cutlass_check(status);
|
||||
} else {
|
||||
typename GemmBatchedTensor_F32_F32_Linear_AlignTensor_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
|
||||
{(ElementInput_F32 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
|
||||
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]* mAs), // batch_stride_A
|
||||
{(ElementInput_F32 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
|
||||
(int64_t)(mGemmInfo.elhPad[1] * mGemmInfo.elh[2]* mBs), // batch_stride_B
|
||||
{(ElementOutput_F32 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
|
||||
(int64_t)(0), // batch_stride_bias
|
||||
{(ElementOutput_F32 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
|
||||
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elh[2]), // batch_stride_C
|
||||
{alpha, beta}, // <- tuple of alpha and beta
|
||||
mBatch}; // batch_count
|
||||
|
||||
size_t workspace_size = GemmBatchedTensor_F32_F32_Linear_AlignTensor_Row_Column_Sm75::get_workspace_size(arguments);
|
||||
|
||||
if(workspace_size != 0) {
|
||||
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
|
||||
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
|
||||
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
|
||||
}
|
||||
// Check the problem size is supported or not
|
||||
cutlass::Status status = mGemmBatchedF32F32LnAlign8RCSm75.can_implement(arguments);
|
||||
cutlass_check(status);
|
||||
|
||||
// Initialize CUTLASS kernel with arguments and workspace pointer
|
||||
status = mGemmBatchedF32F32LnAlign8RCSm75.initialize(arguments, (uint8_t *)mWorkspace);
|
||||
cutlass_check(status);
|
||||
}
|
||||
} else {
|
||||
if(mNeedConvertMatAB) {
|
||||
typename GemmBatchedTensor_F16_F32_Linear_AlignCuda_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
|
||||
if(mConvertGemmSplitK) {
|
||||
int split_k_slices = 16;
|
||||
typename GemmTensor_F16_F32_Linear_AlignTensor_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
|
||||
{(ElementInput_F16 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
|
||||
{(ElementInput_F16 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
|
||||
{(ElementOutput_F32 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
|
||||
{(ElementOutput_F32 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
|
||||
{alpha, beta}, // <- tuple of alpha and beta
|
||||
split_k_slices}; // <- k-dimension split factor
|
||||
size_t workspace_size = GemmTensor_F16_F32_Linear_AlignTensor_Sm75::get_workspace_size(arguments);
|
||||
|
||||
if(workspace_size != 0) {
|
||||
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
|
||||
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
|
||||
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
|
||||
}
|
||||
|
||||
cutlass::Status status = mGemmF16F32LnAlign8Sm75.can_implement(arguments);
|
||||
cutlass_check(status);
|
||||
|
||||
// Initialize CUTLASS kernel with arguments and workspace pointer
|
||||
status = mGemmF16F32LnAlign8Sm75.initialize(arguments, (uint8_t *)mWorkspace);
|
||||
cutlass_check(status);
|
||||
} else {
|
||||
typename GemmBatchedTensor_F16_F32_Linear_AlignTensor_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
|
||||
{(ElementInput_F16 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
|
||||
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]* mAs), // batch_stride_A
|
||||
{(ElementInput_F16 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
|
||||
|
@ -609,47 +627,179 @@ void MatMulExecution::setArguments(const std::vector<Tensor *> &inputs, const st
|
|||
{alpha, beta}, // <- tuple of alpha and beta
|
||||
mBatch}; // batch_count
|
||||
|
||||
size_t workspace_size = GemmBatchedTensor_F16_F32_Linear_AlignCuda_Row_Column_Sm75::get_workspace_size(arguments);
|
||||
size_t workspace_size = GemmBatchedTensor_F16_F32_Linear_AlignTensor_Row_Column_Sm75::get_workspace_size(arguments);
|
||||
|
||||
if(workspace_size != 0) {
|
||||
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
|
||||
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
|
||||
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
|
||||
if(workspace_size != 0) {
|
||||
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
|
||||
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
|
||||
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
|
||||
}
|
||||
// Check the problem size is supported or not
|
||||
cutlass::Status status = mGemmBatchedF16F32LnAlign8RCSm75.can_implement(arguments);
|
||||
cutlass_check(status);
|
||||
|
||||
// Initialize CUTLASS kernel with arguments and workspace pointer
|
||||
status = mGemmBatchedF16F32LnAlign8RCSm75.initialize(arguments, (uint8_t *)mWorkspace);
|
||||
cutlass_check(status);
|
||||
}
|
||||
// Check the problem size is supported or not
|
||||
cutlass::Status status = mGemmBatchedF16F32LnAlign1RCSm75.can_implement(arguments);
|
||||
cutlass_check(status);
|
||||
|
||||
// Initialize CUTLASS kernel with arguments and workspace pointer
|
||||
status = mGemmBatchedF16F32LnAlign1RCSm75.initialize(arguments, (uint8_t *)mWorkspace);
|
||||
cutlass_check(status);
|
||||
} else {
|
||||
typename GemmBatchedTensor_F32_F32_Linear_AlignCuda_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
|
||||
{(ElementInput_F32 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
|
||||
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]* mAs), // batch_stride_A
|
||||
{(ElementInput_F32 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
|
||||
(int64_t)(mGemmInfo.elhPad[1] * mGemmInfo.elh[2]* mBs), // batch_stride_B
|
||||
{(ElementOutput_F32 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
|
||||
(int64_t)(0), // batch_stride_bias
|
||||
{(ElementOutput_F32 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
|
||||
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elh[2]), // batch_stride_C
|
||||
{alpha, beta}, // <- tuple of alpha and beta
|
||||
mBatch}; // batch_count
|
||||
if(mConvertGemmSplitK) {
|
||||
int split_k_slices = 16;
|
||||
typename GemmTensor_F32_F32_Linear_AlignTensor_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
|
||||
{(ElementInput_F32 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
|
||||
{(ElementInput_F32 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
|
||||
{(ElementOutput_F32 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
|
||||
{(ElementOutput_F32 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
|
||||
{alpha, beta}, // <- tuple of alpha and beta
|
||||
split_k_slices}; // <- k-dimension split factor
|
||||
size_t workspace_size = GemmTensor_F32_F32_Linear_AlignTensor_Sm75::get_workspace_size(arguments);
|
||||
|
||||
if(workspace_size != 0) {
|
||||
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
|
||||
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
|
||||
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
|
||||
}
|
||||
|
||||
cutlass::Status status = mGemmF32F32LnAlign8Sm75.can_implement(arguments);
|
||||
cutlass_check(status);
|
||||
|
||||
// Initialize CUTLASS kernel with arguments and workspace pointer
|
||||
status = mGemmF32F32LnAlign8Sm75.initialize(arguments, (uint8_t *)mWorkspace);
|
||||
cutlass_check(status);
|
||||
} else {
|
||||
typename GemmBatchedTensor_F32_F32_Linear_AlignTensor_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
|
||||
{(ElementInput_F32 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
|
||||
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]* mAs), // batch_stride_A
|
||||
{(ElementInput_F32 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
|
||||
(int64_t)(mGemmInfo.elhPad[1] * mGemmInfo.elh[2]* mBs), // batch_stride_B
|
||||
{(ElementOutput_F32 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
|
||||
(int64_t)(0), // batch_stride_bias
|
||||
{(ElementOutput_F32 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
|
||||
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elh[2]), // batch_stride_C
|
||||
{alpha, beta}, // <- tuple of alpha and beta
|
||||
mBatch}; // batch_count
|
||||
|
||||
size_t workspace_size = GemmBatchedTensor_F32_F32_Linear_AlignCuda_Row_Column_Sm75::get_workspace_size(arguments);
|
||||
size_t workspace_size = GemmBatchedTensor_F32_F32_Linear_AlignTensor_Row_Column_Sm75::get_workspace_size(arguments);
|
||||
|
||||
if(workspace_size != 0) {
|
||||
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
|
||||
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
|
||||
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
|
||||
if(workspace_size != 0) {
|
||||
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
|
||||
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
|
||||
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
|
||||
}
|
||||
// Check the problem size is supported or not
|
||||
cutlass::Status status = mGemmBatchedF32F32LnAlign8RCSm75.can_implement(arguments);
|
||||
cutlass_check(status);
|
||||
|
||||
// Initialize CUTLASS kernel with arguments and workspace pointer
|
||||
status = mGemmBatchedF32F32LnAlign8RCSm75.initialize(arguments, (uint8_t *)mWorkspace);
|
||||
cutlass_check(status);
|
||||
}
|
||||
// Check the problem size is supported or not
|
||||
cutlass::Status status = mGemmBatchedF32F32LnAlign1RCSm75.can_implement(arguments);
|
||||
cutlass_check(status);
|
||||
}
|
||||
} else {
|
||||
if(mNeedConvertMatAB) {
|
||||
if(mConvertGemmSplitK) {
|
||||
int split_k_slices = 16;
|
||||
typename GemmTensor_F16_F32_Linear_AlignCuda_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
|
||||
{(ElementInput_F16 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
|
||||
{(ElementInput_F16 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
|
||||
{(ElementOutput_F32 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
|
||||
{(ElementOutput_F32 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
|
||||
{alpha, beta}, // <- tuple of alpha and beta
|
||||
split_k_slices}; // <- k-dimension split factor
|
||||
size_t workspace_size = GemmTensor_F16_F32_Linear_AlignCuda_Sm75::get_workspace_size(arguments);
|
||||
|
||||
if(workspace_size != 0) {
|
||||
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
|
||||
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
|
||||
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
|
||||
}
|
||||
|
||||
cutlass::Status status = mGemmF16F32LnAlign1Sm75.can_implement(arguments);
|
||||
cutlass_check(status);
|
||||
|
||||
// Initialize CUTLASS kernel with arguments and workspace pointer
|
||||
status = mGemmF16F32LnAlign1Sm75.initialize(arguments, (uint8_t *)mWorkspace);
|
||||
cutlass_check(status);
|
||||
} else {
|
||||
typename GemmBatchedTensor_F16_F32_Linear_AlignCuda_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
|
||||
{(ElementInput_F16 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
|
||||
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]* mAs), // batch_stride_A
|
||||
{(ElementInput_F16 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
|
||||
(int64_t)(mGemmInfo.elhPad[1] * mGemmInfo.elh[2]* mBs), // batch_stride_B
|
||||
{(ElementOutput_F32 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
|
||||
(int64_t)(0), // batch_stride_bias
|
||||
{(ElementOutput_F32 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
|
||||
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elh[2]), // batch_stride_C
|
||||
{alpha, beta}, // <- tuple of alpha and beta
|
||||
mBatch}; // batch_count
|
||||
|
||||
// Initialize CUTLASS kernel with arguments and workspace pointer
|
||||
status = mGemmBatchedF32F32LnAlign1RCSm75.initialize(arguments, (uint8_t *)mWorkspace);
|
||||
cutlass_check(status);
|
||||
size_t workspace_size = GemmBatchedTensor_F16_F32_Linear_AlignCuda_Row_Column_Sm75::get_workspace_size(arguments);
|
||||
|
||||
if(workspace_size != 0) {
|
||||
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
|
||||
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
|
||||
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
|
||||
}
|
||||
// Check the problem size is supported or not
|
||||
cutlass::Status status = mGemmBatchedF16F32LnAlign1RCSm75.can_implement(arguments);
|
||||
cutlass_check(status);
|
||||
|
||||
// Initialize CUTLASS kernel with arguments and workspace pointer
|
||||
status = mGemmBatchedF16F32LnAlign1RCSm75.initialize(arguments, (uint8_t *)mWorkspace);
|
||||
cutlass_check(status);
|
||||
}
|
||||
} else {
|
||||
if(mConvertGemmSplitK) {
|
||||
int split_k_slices = 16;
|
||||
typename GemmTensor_F32_F32_Linear_AlignCuda_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
|
||||
{(ElementInput_F32 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
|
||||
{(ElementInput_F32 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
|
||||
{(ElementOutput_F32 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
|
||||
{(ElementOutput_F32 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
|
||||
{alpha, beta}, // <- tuple of alpha and beta
|
||||
split_k_slices}; // <- k-dimension split factor
|
||||
size_t workspace_size = GemmTensor_F32_F32_Linear_AlignCuda_Sm75::get_workspace_size(arguments);
|
||||
|
||||
if(workspace_size != 0) {
|
||||
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
|
||||
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
|
||||
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
|
||||
}
|
||||
|
||||
cutlass::Status status = mGemmF32F32LnAlign1Sm75.can_implement(arguments);
|
||||
cutlass_check(status);
|
||||
|
||||
// Initialize CUTLASS kernel with arguments and workspace pointer
|
||||
status = mGemmF32F32LnAlign1Sm75.initialize(arguments, (uint8_t *)mWorkspace);
|
||||
cutlass_check(status);
|
||||
} else {
|
||||
typename GemmBatchedTensor_F32_F32_Linear_AlignCuda_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
|
||||
{(ElementInput_F32 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
|
||||
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]* mAs), // batch_stride_A
|
||||
{(ElementInput_F32 *)mTempMatB, mGemmInfo.elhPad[1]}, // Ptr + ldm
|
||||
(int64_t)(mGemmInfo.elhPad[1] * mGemmInfo.elh[2]* mBs), // batch_stride_B
|
||||
{(ElementOutput_F32 *)mBiasPtr, 0}, // Ptr + ldm if ldm = 0, vector,
|
||||
(int64_t)(0), // batch_stride_bias
|
||||
{(ElementOutput_F32 *)C->deviceId(), mGemmInfo.elh[2]}, // Ptr + ldm
|
||||
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elh[2]), // batch_stride_C
|
||||
{alpha, beta}, // <- tuple of alpha and beta
|
||||
mBatch}; // batch_count
|
||||
|
||||
size_t workspace_size = GemmBatchedTensor_F32_F32_Linear_AlignCuda_Row_Column_Sm75::get_workspace_size(arguments);
|
||||
|
||||
if(workspace_size != 0) {
|
||||
workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
|
||||
mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
|
||||
mWorkspace = (void *)workspaceTensor.get()->buffer().device;
|
||||
}
|
||||
// Check the problem size is supported or not
|
||||
cutlass::Status status = mGemmBatchedF32F32LnAlign1RCSm75.can_implement(arguments);
|
||||
cutlass_check(status);
|
||||
|
||||
                // Initialize CUTLASS kernel with arguments and workspace pointer
                status = mGemmBatchedF32F32LnAlign1RCSm75.initialize(arguments, (uint8_t *)mWorkspace);
                cutlass_check(status);
            }
        }
    }
}
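// --- Illustrative sketch only; not part of this commit. ---
// Every branch above repeats the same CUTLASS setup and differs only in the
// generated kernel typedef: fp16 vs. fp32 output, Align8 ("AlignTensor") vs.
// Align1 ("AlignCuda"), and batched GEMM vs. a single split-K GEMM
// (mConvertGemmSplitK is set when mBatch == 1 and the K dimension is very
// large, with split_k_slices = 16). A helper like the one below is an
// assumption used to show the shared pattern, not code from this change.
template <typename GemmKernel>
static void setupCutlassGemm(GemmKernel& gemm, const typename GemmKernel::Arguments& arguments,
                             std::shared_ptr<Tensor>& workspaceTensor, Backend* backend, void*& workspace) {
    // 1. Ask CUTLASS how much scratch memory this kernel needs for the problem.
    size_t workspace_size = GemmKernel::get_workspace_size(arguments);
    if (workspace_size != 0) {
        // 2. Back it with a STATIC device buffer owned by the backend.
        workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
        backend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
        workspace = (void *)workspaceTensor.get()->buffer().device;
    }
    // 3. Check the problem size is supported, then bind arguments + workspace.
    cutlass::Status status = gemm.can_implement(arguments);
    cutlass_check(status);
    status = gemm.initialize(arguments, (uint8_t *)workspace);
    cutlass_check(status);
}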
@ -695,7 +845,7 @@ ErrorCode MatMulExecution::onResize(const std::vector<Tensor *> &inputs, const s
|
|||
mNeedBTempBuffer = (needBTranspose || !lAlignment) || mFp16Fp32MixInfer;
|
||||
mNeedConvertMatAB = (mNeedATempBuffer || mNeedBTempBuffer);
|
||||
|
||||
//MNN_PRINT("trAtrB:%d-%d, tmpAB:%d-%d inps:%d, bwlh:%d-%d-%d-%d\n", mTransposeA, mTransposeB, mNeedATempBuffer, mNeedBTempBuffer, inputs.size(), mBatch, mGemmInfo.elh[0], mGemmInfo.elh[1], mGemmInfo.elh[2]);
|
||||
// MNN_PRINT("trAtrB:%d-%d, tmpAB:%d-%d inps:%d, bwlh:%d-%d-%d-%d\n", mTransposeA, mTransposeB, mNeedATempBuffer, mNeedBTempBuffer, inputs.size(), mBatch, mGemmInfo.elh[0], mGemmInfo.elh[1], mGemmInfo.elh[2]);
|
||||
|
||||
auto pool = static_cast<CUDABackend*>(backend())->getBufferPool();
|
||||
std::pair<void*, size_t> bufferAData, bufferBData;
|
||||
|
@ -730,6 +880,7 @@ ErrorCode MatMulExecution::onResize(const std::vector<Tensor *> &inputs, const s
|
|||
}
|
||||
//printf("MatMulAB:%p-%p-%p-%p\n", A->host<void*>(), A->deviceId(), B->host<void*>(), B->deviceId());
|
||||
|
||||
mConvertGemmSplitK = ((mBatch == 1) && (mGemmInfo.elhPad[1] >= 16384));
|
||||
// Set Cutlass Param Arguments
|
||||
mResizeSetArgument = (mTempMatA != nullptr && mTempMatB != nullptr && C->deviceId() != 0);
|
||||
if(mResizeSetArgument) {
|
||||
|
@ -855,19 +1006,39 @@ ErrorCode MatMulExecution::onExecute(const std::vector<Tensor *> &inputs, const
|
|||
} else {
|
||||
if(hAlignment) {
|
||||
if(mNeedConvertMatAB) {
|
||||
cutlass::Status status = mGemmBatchedF16F32LnAlign8RCSm75();
|
||||
cutlass_check(status);
|
||||
if(mConvertGemmSplitK) {
|
||||
cutlass::Status status = mGemmF16F32LnAlign8Sm75();
|
||||
cutlass_check(status);
|
||||
} else {
|
||||
cutlass::Status status = mGemmBatchedF16F32LnAlign8RCSm75();
|
||||
cutlass_check(status);
|
||||
}
|
||||
} else {
|
||||
cutlass::Status status = mGemmBatchedF32F32LnAlign8RCSm75();
|
||||
cutlass_check(status);
|
||||
if(mConvertGemmSplitK) {
|
||||
cutlass::Status status = mGemmF32F32LnAlign8Sm75();
|
||||
cutlass_check(status);
|
||||
} else {
|
||||
cutlass::Status status = mGemmBatchedF32F32LnAlign8RCSm75();
|
||||
cutlass_check(status);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if(mNeedConvertMatAB) {
|
||||
cutlass::Status status = mGemmBatchedF16F32LnAlign1RCSm75();
|
||||
cutlass_check(status);
|
||||
if(mConvertGemmSplitK) {
|
||||
cutlass::Status status = mGemmF16F32LnAlign1Sm75();
|
||||
cutlass_check(status);
|
||||
} else {
|
||||
cutlass::Status status = mGemmBatchedF16F32LnAlign1RCSm75();
|
||||
cutlass_check(status);
|
||||
}
|
||||
} else {
|
||||
cutlass::Status status = mGemmBatchedF32F32LnAlign1RCSm75();
|
||||
cutlass_check(status);
|
||||
if(mConvertGemmSplitK) {
|
||||
cutlass::Status status = mGemmF32F32LnAlign1Sm75();
|
||||
cutlass_check(status);
|
||||
} else {
|
||||
cutlass::Status status = mGemmBatchedF32F32LnAlign1RCSm75();
|
||||
cutlass_check(status);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -878,15 +1049,25 @@ ErrorCode MatMulExecution::onExecute(const std::vector<Tensor *> &inputs, const
|
|||
cutlass_check(status);
|
||||
} else {
|
||||
if(hAlignment) {
|
||||
cutlass::Status status = mGemmBatchedF16F16LnAlign8RCSm75();
|
||||
cutlass_check(status);
|
||||
if(mConvertGemmSplitK) {
|
||||
cutlass::Status status = mGemmF16F16LnAlign8Sm75();
|
||||
cutlass_check(status);
|
||||
} else {
|
||||
cutlass::Status status = mGemmBatchedF16F16LnAlign8RCSm75();
|
||||
cutlass_check(status);
|
||||
}
|
||||
} else {
|
||||
cutlass::Status status = mGemmBatchedF16F16LnAlign1RCSm75();
|
||||
cutlass_check(status);
|
||||
if(mConvertGemmSplitK) {
|
||||
cutlass::Status status = mGemmF16F16LnAlign1Sm75();
|
||||
cutlass_check(status);
|
||||
} else {
|
||||
cutlass::Status status = mGemmBatchedF16F16LnAlign1RCSm75();
|
||||
cutlass_check(status);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// printf("normal:%d rrlayout:%d convertab:%d halign:%d\n", mFp16Fp32MixInfer, mUseRRLayout, mNeedConvertMatAB, hAlignment);
|
||||
return NO_ERROR;
|
||||
}
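// --- Illustrative summary only; not part of this commit. ---
// With the new mConvertGemmSplitK flag, onExecute() resolves to one of the
// member kernels along three axes: output precision (F16_F16 / F16_F32 / F32_F32),
// alignment (hAlignment -> Align8, otherwise Align1), and shape (split-K single
// GEMM when mBatch == 1 and the padded K dimension is very large, batched GEMM
// otherwise). Two assumed example shapes:
//   batch = 1, padded K = 20480 -> mConvertGemmSplitK == true; with hAlignment
//       the fp16 path runs mGemmF16F16LnAlign8Sm75() (split-K GEMM).
//   batch = 8, padded K = 1024  -> mConvertGemmSplitK == false; without
//       hAlignment the fp16 path runs mGemmBatchedF16F16LnAlign1RCSm75().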
|
||||
|
||||
|
|
|
@ -12,6 +12,7 @@
|
|||
#include "backend/cuda/core/CUDABackend.hpp"
|
||||
#include "MNNCUDADefine.hpp"
|
||||
#include "CutlassGemmBatchedParam.hpp"
|
||||
#include "CutlassGemmParam.hpp"
|
||||
#include "MNNCUDAFunction.cuh"
|
||||
|
||||
namespace MNN {
|
||||
|
@ -34,12 +35,18 @@ private:
|
|||
|
||||
    std::shared_ptr<Tensor> mBiasTensor;
    GemmBatchedTensor_F16_F16_Linear_AlignCuda_Row_Column_Sm75 mGemmBatchedF16F16LnAlign1RCSm75;
    GemmTensor_F16_F16_Linear_AlignCuda_Sm75 mGemmF16F16LnAlign1Sm75;
    GemmBatchedTensor_F32_F32_Linear_AlignCuda_Row_Column_Sm75 mGemmBatchedF32F32LnAlign1RCSm75;
    GemmTensor_F32_F32_Linear_AlignCuda_Sm75 mGemmF32F32LnAlign1Sm75;
    GemmBatchedTensor_F16_F32_Linear_AlignCuda_Row_Column_Sm75 mGemmBatchedF16F32LnAlign1RCSm75;
    GemmTensor_F16_F32_Linear_AlignCuda_Sm75 mGemmF16F32LnAlign1Sm75;

    GemmBatchedTensor_F16_F16_Linear_AlignTensor_Row_Column_Sm75 mGemmBatchedF16F16LnAlign8RCSm75;
    GemmTensor_F16_F16_Linear_AlignTensor_Sm75 mGemmF16F16LnAlign8Sm75;
    GemmBatchedTensor_F32_F32_Linear_AlignTensor_Row_Column_Sm75 mGemmBatchedF32F32LnAlign8RCSm75;
    GemmTensor_F32_F32_Linear_AlignTensor_Sm75 mGemmF32F32LnAlign8Sm75;
    GemmBatchedTensor_F16_F32_Linear_AlignTensor_Row_Column_Sm75 mGemmBatchedF16F32LnAlign8RCSm75;
    GemmTensor_F16_F32_Linear_AlignTensor_Sm75 mGemmF16F32LnAlign8Sm75;

    GemmBatchedTensor_F16_F16_Linear_AlignTensor_Row_Row_Sm75 mGemmBatchedF16F16LnAlign8RRSm75;
    GemmBatchedTensor_F32_F32_Linear_AlignTensor_Row_Row_Sm75 mGemmBatchedF32F32LnAlign8RRSm75;
|
||||
|
@ -69,6 +76,7 @@ private:
|
|||
    bool mFp16Infer = false;
    bool mFp32Infer = false;
    bool mFp16Fp32MixInfer = false;
    bool mConvertGemmSplitK = false;
};
} // namespace CUDA
} // namespace MNN
|
||||
|
|
|
@ -190,7 +190,7 @@ void RasterBlit(uint8_t* output, const uint8_t* input, const int32_t* size, cons
|
|||
DivModFast sy(size[1]);
|
||||
DivModFast sx(size[2]);
|
||||
|
||||
//printf("%d-%d-%d, %d-%d-%d,-%d-%d-%d\n", size[0], size[1], size[2], srcStride[0], srcStride[1], srcStride[2], dstStride[0], dstStride[1], dstStride[2]);
|
||||
// MNN_PRINT("blit info size:%d-%d-%d, srcStride:%d-%d-%d, dstStride:%d-%d-%d\n", size[0], size[1], size[2], srcStride[0], srcStride[1], srcStride[2], dstStride[0], dstStride[1], dstStride[2]);
|
||||
if(bytes == 4 && count > 16384 && size[2] % 2 == 0 && srcStride[2] == 1 && dstStride[2] == 1) {
|
||||
//printf("%d-%d-%d, %d-%d-%d,-%d-%d-%d\n\n", size[0], size[1], size[2], srcStride[0], srcStride[1], srcStride[2], dstStride[0], dstStride[1], dstStride[2]);
|
||||
count /= 2;
|
||||
|
|
|
@ -168,7 +168,18 @@ static bool _equalSizeStride(const Tensor::InsideDescribe::Region& slice0, const
|
|||
return true;
|
||||
}
|
||||
|
||||
static bool _directBlitC4(const Tensor::InsideDescribe::Region& slice0, const Tensor::InsideDescribe::Region& slice1) {
|
||||
static bool _directBlitC4(const Tensor::InsideDescribe::Region& slice0, const Tensor::InsideDescribe::Region& slice1, Tensor* tensor) {
|
||||
if(tensor->dimensions() < 2) {
|
||||
return false;
|
||||
}
|
||||
if(slice0.src.stride[1] == tensor->width() && slice0.src.stride[0] == tensor->width() * tensor->height()) {
|
||||
// area pack for fast blit only
|
||||
return false;
|
||||
}
|
||||
if(slice1.src.stride[1] == tensor->width() && slice1.src.stride[0] == tensor->width() * tensor->height()) {
|
||||
// area pack for fast blit only
|
||||
return false;
|
||||
}
|
||||
if(slice0.size[1] % PACK_NUMBER != 0 || slice0.size[0] != 1) {
|
||||
return false;
|
||||
}
|
||||
|
@ -242,7 +253,7 @@ ErrorCode RasterExecution::onResize(const std::vector<Tensor *> &____inputs, con
|
|||
mFast = false;
|
||||
break;
|
||||
}
|
||||
if(!_directBlitC4(slice0, slice)) {
|
||||
if(!_directBlitC4(slice0, slice, output)) {
|
||||
mFast = false;
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -2,15 +2,86 @@
|
|||
namespace MNN {
namespace CUDA {

template<typename T>
static void callSumFunc(const T* input, T* output, ReduceParam* param, CUDARuntime* runtime) {
    int inside = param->inside;
    int outside = param->outside;
    int axis = param->axis;
    int count = outside * inside;

    if(axis % 256 == 0 || axis >= 768) {
        int calc_multi_num = (axis + 255) / 256;
        SUM_REDUCE_AXIS<<<count, 256>>>(input, output, outside, axis, inside, 256, calc_multi_num);
        checkKernelErrors;
    } else if(axis >= 32) {
        int calc_multi_num = (axis + 63) / 64;
        SUM_REDUCE_AXIS<<<count, 64>>>(input, output, outside, axis, inside, 64, calc_multi_num);
        checkKernelErrors;
    } else {
        int block_num = runtime->blocks_num(count);
        int threads_num = runtime->threads_num();
        SUM_NAIVE<<<block_num, threads_num>>>(input, output, outside, axis, inside);
        checkKernelErrors;
    }
}
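// Worked example of the launch heuristic above (illustrative; the axis values
// are assumed, the arithmetic follows callSumFunc exactly):
//   axis = 1024 (1024 % 256 == 0) -> one 256-thread block per output element,
//       calc_multi_num = (1024 + 255) / 256 = 4, grid = outside * inside blocks.
//   axis = 100 (>= 32)            -> 64-thread blocks, calc_multi_num = (100 + 63) / 64 = 2.
//   axis = 8 (< 32)               -> block-wide reduction is not worth it; fall back to
//       SUM_NAIVE, a grid-stride loop with roughly one thread per output element.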
|
||||
|
||||
template<typename T>
|
||||
static void callMeanFunc(const T* input, T* output, ReduceParam* param, CUDARuntime* runtime) {
|
||||
int inside = param->inside;
|
||||
int outside = param->outside;
|
||||
int axis = param->axis;
|
||||
int count = outside * inside;
|
||||
|
||||
int block_num = runtime->blocks_num(count);
|
||||
int threads_num = runtime->threads_num();
|
||||
MEAN<<<block_num, threads_num>>>(input, output, outside, axis, inside);
|
||||
checkKernelErrors;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static void callMaxFunc(const T* input, T* output, ReduceParam* param, CUDARuntime* runtime) {
|
||||
int inside = param->inside;
|
||||
int outside = param->outside;
|
||||
int axis = param->axis;
|
||||
int count = outside * inside;
|
||||
|
||||
int block_num = runtime->blocks_num(count);
|
||||
int threads_num = runtime->threads_num();
|
||||
MAXIMUM<<<block_num, threads_num>>>(input, output, outside, axis, inside);
|
||||
checkKernelErrors;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static void callMinFunc(const T* input, T* output, ReduceParam* param, CUDARuntime* runtime) {
|
||||
int inside = param->inside;
|
||||
int outside = param->outside;
|
||||
int axis = param->axis;
|
||||
int count = outside * inside;
|
||||
|
||||
int block_num = runtime->blocks_num(count);
|
||||
int threads_num = runtime->threads_num();
|
||||
MINIMUM<<<block_num, threads_num>>>(input, output, outside, axis, inside);
|
||||
checkKernelErrors;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static void callProdFunc(const T* input, T* output, ReduceParam* param, CUDARuntime* runtime) {
|
||||
int inside = param->inside;
|
||||
int outside = param->outside;
|
||||
int axis = param->axis;
|
||||
int count = outside * inside;
|
||||
|
||||
int block_num = runtime->blocks_num(count);
|
||||
int threads_num = runtime->threads_num();
|
||||
PROD<<<block_num, threads_num>>>(input, output, outside, axis, inside);
|
||||
checkKernelErrors;
|
||||
}
|
||||
|
||||
ReductionExecution::ReductionExecution(ReductionType opType, int axis, Backend *backend) : Execution(backend) {
|
||||
mType = opType;
|
||||
mAxis = axis;
|
||||
auto staticPool = static_cast<CUDABackend*>(backend)->getStaticBufferPool();
|
||||
mParam = staticPool->alloc(sizeof(ReduceParam));
|
||||
}
|
||||
ReductionExecution::~ ReductionExecution() {
|
||||
auto staticPool = static_cast<CUDABackend*>(backend())->getStaticBufferPool();
|
||||
staticPool->free(mParam);
|
||||
}
|
||||
|
||||
ErrorCode ReductionExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
|
@ -27,9 +98,7 @@ ErrorCode ReductionExecution::onResize(const std::vector<Tensor *> &inputs, cons
|
|||
mCpuParam.inside = inside;
|
||||
mCpuParam.outside = outside;
|
||||
mCpuParam.axis = axis;
|
||||
cuda_check(cudaMemcpy((uint8_t*)mParam.first + mParam.second, &mCpuParam, sizeof(ReduceParam), cudaMemcpyHostToDevice));
|
||||
|
||||
//MNN_PRINT("Reduction axis_idx:%d, outside:%d, axis:%d, inside:%d\n", mAxis, outside, axis, inside);
|
||||
// MNN_PRINT("Reduction axis_idx:%d, outside:%d, axis:%d, inside:%d\n", mAxis, outside, axis, inside);
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
|
@ -37,47 +106,46 @@ ErrorCode ReductionExecution::onExecute(const std::vector<Tensor *> &inputs, con
|
|||
auto input = (void*)inputs[0]->deviceId();
|
||||
auto output = (void*)outputs[0]->deviceId();
|
||||
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
|
||||
int inside = mCpuParam.inside;;
|
||||
int inside = mCpuParam.inside;
|
||||
int outside = mCpuParam.outside;
|
||||
int count = inside * outside;
|
||||
int block_num = runtime->blocks_num(count);
|
||||
int threads_num = runtime->threads_num();
|
||||
auto param = (ReduceParam*)((uint8_t*)mParam.first + mParam.second);
|
||||
if (inputs[0]->getType() == halide_type_of<float>()) {
|
||||
if (static_cast<CUDABackend*>(backend())->useFp16()) {
|
||||
switch (mType) {
|
||||
case ReductionType_MEAN:
|
||||
MEAN<<<block_num, threads_num>>>((const half*)input, (half*)output, param);
|
||||
callMeanFunc((const half*)input, (half*)output, &mCpuParam, runtime);
|
||||
return NO_ERROR;
|
||||
case ReductionType_SUM:
|
||||
SUM<<<block_num, threads_num>>>((const half*)input, (half*)output, param);
|
||||
callSumFunc((const half*)input, (half*)output, &mCpuParam, runtime);
|
||||
return NO_ERROR;
|
||||
case ReductionType_MINIMUM:
|
||||
MINIMUM<<<block_num, threads_num>>>((const half*)input, (half*)output, param);
|
||||
callMinFunc((const half*)input, (half*)output, &mCpuParam, runtime);
|
||||
return NO_ERROR;
|
||||
case ReductionType_MAXIMUM:
|
||||
MAXIMUM<<<block_num, threads_num>>>((const half*)input, (half*)output, param);
|
||||
callMaxFunc((const half*)input, (half*)output, &mCpuParam, runtime);
|
||||
return NO_ERROR;
|
||||
case ReductionType_PROD:
|
||||
PROD<<<block_num, threads_num>>>((const half*)input, (half*)output, param);
|
||||
callProdFunc((const half*)input, (half*)output, &mCpuParam, runtime);
|
||||
return NO_ERROR;
|
||||
}
|
||||
} else {
|
||||
switch (mType) {
|
||||
case ReductionType_MEAN:
|
||||
MEAN<<<block_num, threads_num>>>((const float*)input, (float*)output, param);
|
||||
callMeanFunc((const float*)input, (float*)output, &mCpuParam, runtime);
|
||||
return NO_ERROR;
|
||||
case ReductionType_SUM:
|
||||
SUM<<<block_num, threads_num>>>((const float*)input, (float*)output, param);
|
||||
callSumFunc((const float*)input, (float*)output, &mCpuParam, runtime);
|
||||
return NO_ERROR;
|
||||
case ReductionType_MINIMUM:
|
||||
MINIMUM<<<block_num, threads_num>>>((const float*)input, (float*)output, param);
|
||||
callMinFunc((const float*)input, (float*)output, &mCpuParam, runtime);
|
||||
return NO_ERROR;
|
||||
case ReductionType_MAXIMUM:
|
||||
MAXIMUM<<<block_num, threads_num>>>((const float*)input, (float*)output, param);
|
||||
callMaxFunc((const float*)input, (float*)output, &mCpuParam, runtime);
|
||||
return NO_ERROR;
|
||||
case ReductionType_PROD:
|
||||
PROD<<<block_num, threads_num>>>((const float*)input, (float*)output, param);
|
||||
callProdFunc((const float*)input, (float*)output, &mCpuParam, runtime);
|
||||
return NO_ERROR;
|
||||
}
|
||||
}
|
||||
|
@ -88,25 +156,26 @@ ErrorCode ReductionExecution::onExecute(const std::vector<Tensor *> &inputs, con
|
|||
MNN_ASSERT(inputs[0]->getType() == halide_type_of<int32_t>());
|
||||
switch (mType) {
|
||||
case ReductionType_MEAN:
|
||||
MEAN<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
|
||||
callMeanFunc((const int32_t*)input, (int32_t*)output, &mCpuParam, runtime);
|
||||
return NO_ERROR;
|
||||
case ReductionType_SUM:
|
||||
SUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
|
||||
callSumFunc((const int32_t*)input, (int32_t*)output, &mCpuParam, runtime);
|
||||
// SUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
|
||||
return NO_ERROR;
|
||||
case ReductionType_MINIMUM:
|
||||
MINIMUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
|
||||
callMinFunc((const int32_t*)input, (int32_t*)output, &mCpuParam, runtime);
|
||||
return NO_ERROR;
|
||||
case ReductionType_MAXIMUM:
|
||||
MAXIMUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
|
||||
callMaxFunc((const int32_t*)input, (int32_t*)output, &mCpuParam, runtime);
|
||||
return NO_ERROR;
|
||||
case ReductionType_PROD:
|
||||
PROD<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
|
||||
callProdFunc((const int32_t*)input, (int32_t*)output, &mCpuParam, runtime);
|
||||
return NO_ERROR;
|
||||
case ReductionType_ANY:
|
||||
MAXIMUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
|
||||
callMaxFunc((const int32_t*)input, (int32_t*)output, &mCpuParam, runtime);
|
||||
return NO_ERROR;
|
||||
case ReductionType_ALL:
|
||||
MINIMUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
|
||||
callMinFunc((const int32_t*)input, (int32_t*)output, &mCpuParam, runtime);
|
||||
return NO_ERROR;
|
||||
}
|
||||
MNN_ASSERT(false);
|
||||
|
|
|
@ -25,7 +25,6 @@ private:
|
|||
ReductionType mType;
|
||||
int mAxis;
|
||||
ReduceParam mCpuParam;
|
||||
std::pair<void*, int> mParam;
|
||||
};
|
||||
} // namespace CUDA
|
||||
} // namespace MNN
|
||||
|
|
|
@ -1,91 +1,143 @@
|
|||
#ifndef ReductionTemplate_cuh
|
||||
#define ReductionTemplate_cuh
|
||||
|
||||
#include "MNNCUDAFunction.cuh"
|
||||
struct ReduceParam {
|
||||
int inside;
|
||||
int axis;
|
||||
int outside;
|
||||
};
|
||||
template <typename T>
|
||||
__global__ void SUM(const T *input, T *output, const ReduceParam* param) {
|
||||
int count = param->inside * param->outside;
|
||||
__global__ void SUM_NAIVE(const T *input, T *output,
|
||||
const int outside,
|
||||
const int axis,
|
||||
const int inside
|
||||
) {
|
||||
int count = inside * outside;
|
||||
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
|
||||
int y = i / param->inside;
|
||||
int x = i % param->inside;
|
||||
int y = i / inside;
|
||||
int x = i % inside;
|
||||
float sumValue = 0.0;
|
||||
int axis = param->axis;
|
||||
const T* basicInput = input + y * param->axis * param->inside + x;
|
||||
const T* basicInput = input + y * axis * inside + x;
|
||||
for (int v=0; v<axis; ++v) {
|
||||
sumValue += (float)basicInput[v * param->inside];
|
||||
sumValue += (float)basicInput[v * inside];
|
||||
}
|
||||
output[y * param->inside + x] = (T)sumValue;
|
||||
output[y * inside + x] = (T)sumValue;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void MEAN(const T *input, T *output, const ReduceParam* param) {
|
||||
int count = param->inside * param->outside;
|
||||
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
|
||||
int y = i / param->inside;
|
||||
int x = i % param->inside;
|
||||
float sumValue = 0.0;
|
||||
int axis = param->axis;
|
||||
const T* basicInput = input + y * param->axis * param->inside + x;
|
||||
for (int v=0; v<axis; ++v) {
|
||||
sumValue += (float)basicInput[v * param->inside];
|
||||
__global__ void SUM_REDUCE_AXIS(const T *input, T *output,
|
||||
const int outside,
|
||||
const int axis,
|
||||
const int inside,
|
||||
const int per_block_size,
|
||||
const int calc_multi_num
|
||||
) {
|
||||
int idx_outside = blockIdx.x / inside;
|
||||
int idx_inside = blockIdx.x - idx_outside * inside;
|
||||
|
||||
const T* src = input + idx_outside * axis * inside + idx_inside;
|
||||
int tid = threadIdx.x;
|
||||
|
||||
float local_src = 0.0;
|
||||
__shared__ float sumValue;
|
||||
for(int i=0; i<calc_multi_num; i++) {
|
||||
if(tid + i * per_block_size < axis) {
|
||||
local_src += (float)(src[(tid + i * per_block_size) * inside]);
|
||||
}
|
||||
output[y * param->inside + x] = (T)(sumValue / (float)param->axis);
|
||||
}
|
||||
float maxRes = blockReduceSum<float>(local_src);
|
||||
if(tid == 0)
|
||||
sumValue = maxRes;
|
||||
__syncthreads();
|
||||
|
||||
output[idx_outside * inside + idx_inside] = (T)sumValue;
|
||||
return;
|
||||
}
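// Launch sketch (illustrative; it mirrors callSumFunc in ReductionExecution.cu):
// one block per (outside, inside) output element; each thread accumulates
// calc_multi_num strided elements of the reduced axis and blockReduceSum folds
// the per-thread partials.
//
//   int count = outside * inside;
//   int calc_multi_num = (axis + 255) / 256;
//   SUM_REDUCE_AXIS<<<count, 256>>>(input, output, outside, axis, inside, 256, calc_multi_num);
//   checkKernelErrors;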
|
||||
|
||||
|
||||
template <typename T>
|
||||
__global__ void MEAN(const T *input, T *output,
|
||||
const int outside,
|
||||
const int axis,
|
||||
const int inside
|
||||
) {
|
||||
int count = inside * outside;
|
||||
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
|
||||
int y = i / inside;
|
||||
int x = i % inside;
|
||||
float sumValue = 0.0;
|
||||
|
||||
const T* basicInput = input + y * axis * inside + x;
|
||||
for (int v=0; v<axis; ++v) {
|
||||
sumValue += (float)basicInput[v * inside];
|
||||
}
|
||||
output[y * inside + x] = (T)(sumValue / (float)axis);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void MINIMUM(const T *input, T *output, const ReduceParam* param) {
|
||||
int count = param->inside * param->outside;
|
||||
__global__ void MINIMUM(const T *input, T *output,
|
||||
const int outside,
|
||||
const int axis,
|
||||
const int inside
|
||||
) {
|
||||
int count = inside * outside;
|
||||
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
|
||||
int y = i / param->inside;
|
||||
int x = i % param->inside;
|
||||
int axis = param->axis;
|
||||
const T* basicInput = input + y * param->axis * param->inside + x;
|
||||
int y = i / inside;
|
||||
int x = i % inside;
|
||||
|
||||
const T* basicInput = input + y * axis * inside + x;
|
||||
float res = (float)basicInput[0];
|
||||
for (int v=1; v<axis; ++v) {
|
||||
res = min((float)basicInput[v * param->inside], res);
|
||||
res = min((float)basicInput[v * inside], res);
|
||||
}
|
||||
output[y * param->inside + x] = (T)res;
|
||||
output[y * inside + x] = (T)res;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void MAXIMUM(const T *input, T *output, const ReduceParam* param) {
|
||||
int count = param->inside * param->outside;
|
||||
__global__ void MAXIMUM(const T *input, T *output,
|
||||
const int outside,
|
||||
const int axis,
|
||||
const int inside
|
||||
) {
|
||||
int count = inside * outside;
|
||||
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
|
||||
int y = i / param->inside;
|
||||
int x = i % param->inside;
|
||||
const T* basicInput = input + y * param->axis * param->inside + x;
|
||||
int axis = param->axis;
|
||||
int y = i / inside;
|
||||
int x = i % inside;
|
||||
const T* basicInput = input + y * axis * inside + x;
|
||||
|
||||
float res = (float)basicInput[0];
|
||||
for (int v=1; v<axis; ++v) {
|
||||
res = max((float)basicInput[v * param->inside], res);
|
||||
res = max((float)basicInput[v * inside], res);
|
||||
}
|
||||
output[y * param->inside + x] = (T)res;
|
||||
output[y * inside + x] = (T)res;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void PROD(const T *input, T *output, const ReduceParam* param) {
|
||||
int count = param->inside * param->outside;
|
||||
__global__ void PROD(const T *input, T *output,
|
||||
const int outside,
|
||||
const int axis,
|
||||
const int inside
|
||||
) {
|
||||
int count = inside * outside;
|
||||
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
|
||||
int y = i / param->inside;
|
||||
int x = i % param->inside;
|
||||
int axis = param->axis;
|
||||
int y = i / inside;
|
||||
int x = i % inside;
|
||||
|
||||
float sumValue = 1.0;
|
||||
const T* basicInput = input + y * param->axis * param->inside + x;
|
||||
const T* basicInput = input + y * axis * inside + x;
|
||||
for (int v=0; v<axis; ++v) {
|
||||
sumValue *= (float)basicInput[v * param->inside];
|
||||
sumValue *= (float)basicInput[v * inside];
|
||||
}
|
||||
output[y * param->inside + x] = (T)sumValue;
|
||||
output[y * inside + x] = (T)sumValue;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
|
|
@ -30,62 +30,6 @@ __global__ void SOFTMAX(const T *input, T *output,
|
|||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__inline__ __device__
|
||||
T warpReduceSum(T val)
|
||||
{
|
||||
for(int mask = 16; mask > 0; mask >>= 1)
|
||||
val += __shfl_xor_sync(0xffffffff, val, mask, 32);
|
||||
return val;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__inline__ __device__
|
||||
T warpReduceMax(T val)
|
||||
{
|
||||
for(int mask = 16; mask > 0; mask >>= 1)
|
||||
val = max(val, __shfl_xor_sync(0xffffffff, val, mask, 32));
|
||||
return val;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__inline__ __device__
|
||||
T blockReduceSum(T val)
|
||||
{
|
||||
static __shared__ T shared[32];
|
||||
int lane = threadIdx.x & 0x1f;
|
||||
int wid = threadIdx.x >> 5;
|
||||
|
||||
val = warpReduceSum<T>(val);
|
||||
|
||||
if(lane == 0)
|
||||
shared[wid] = val;
|
||||
__syncthreads();
|
||||
|
||||
val = (threadIdx.x < (blockDim.x >> 5 )) ? shared[lane] : (T)0.0f;
|
||||
val = warpReduceSum(val);
|
||||
return val;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__inline__ __device__
|
||||
T blockReduceMax(T val)
|
||||
{
|
||||
static __shared__ T shared[32];
|
||||
int lane = threadIdx.x & 0x1f;
|
||||
int wid = threadIdx.x >> 5;
|
||||
|
||||
val = warpReduceMax<T>(val);
|
||||
|
||||
if(lane == 0)
|
||||
shared[wid] = val;
|
||||
__syncthreads();
|
||||
|
||||
val = (threadIdx.x < (blockDim.x >> 5 )) ? shared[lane] : (T)0.0f;
|
||||
val = warpReduceMax(val);
|
||||
return val;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void SOFTMAX_WARP_32(const T *input, T *output,
|
||||
const int inside,
|
||||
|
|
|
@ -11,6 +11,7 @@
|
|||
|
||||
#include <vector>
#include "ReductionTemplate.cuh"
#include "MNNCUDAFunction.cuh"
#include "backend/cuda/core/CUDABackend.hpp"
#include <float.h>
|
||||
|
||||
|
|
|
@ -143,7 +143,11 @@ def generateGemmFile(headfile):
|
|||
|
||||
hpp += out_align + out_precision_name + epilogue_name + ",\n "
|
||||
hpp += "SwizzleThreadBlock,\n "
|
||||
hpp += "NumStages>;\n\n"
|
||||
hpp += "NumStages"
|
||||
if sm_name == "_Sm75":
|
||||
hpp += ",\n 128 / cutlass::sizeof_bits<" + element_input_precision + ">::value, 128 / cutlass::sizeof_bits<" + element_input_precision + ">::value, true>;\n\n"
|
||||
else :
|
||||
hpp += ">;\n\n"
|
||||
|
||||
hpp += "}\n}\n#endif"
|
||||
with open(headfile, "w") as f:
|
||||
|
|
|
@ -428,20 +428,6 @@ Execution* OpenCLBackend::onCreate(const std::vector<Tensor*>& inputs, const std
|
|||
valid = false;
|
||||
break;
|
||||
}
|
||||
|
||||
//input in raster not used, origin instead
|
||||
auto des = TensorUtils::getDescribe(t)->regions;
|
||||
for(auto region : des)
|
||||
{
|
||||
auto tensor = region.origin;
|
||||
auto tensorShape = OpenCL::tensorShapeFormat(tensor);
|
||||
int originHeight = tensorShape[0] * tensorShape[1];
|
||||
int originWidth = tensorShape[2] * UP_DIV(tensorShape[3], 4);
|
||||
if (originHeight > maxImageSize.at(0) || originWidth > maxImageSize.at(1)) {
|
||||
valid = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (auto t : outputs) {
|
||||
auto tensorShape = OpenCL::tensorShapeFormat(t);
|
||||
|
|
|
@ -123,15 +123,8 @@ OpenCLRuntime::OpenCLRuntime(const BackendConfig::PrecisionMode precision, const
|
|||
isSetWorkGroupAttribute = true;
|
||||
} else if (deviceVendor.find("Intel") != std::string::npos) {
|
||||
mGpuType = INTEL;
|
||||
std::string opencl_c_version = mFirstGPUDevicePtr->getInfo<CL_DEVICE_OPENCL_C_VERSION>();
|
||||
int version = 0;
|
||||
for (auto s : opencl_c_version) {
|
||||
if (s >= '0' && s <= '9') {
|
||||
version += (s - '0');
|
||||
version *= 10;
|
||||
}
|
||||
}
|
||||
if (version >= 120) {
|
||||
const std::string extensions = mFirstGPUDevicePtr->getInfo<CL_DEVICE_EXTENSIONS>();
|
||||
if (extensions.find("cl_intel_subgroups") != std::string::npos) {
|
||||
mSupportedIntelSubgroup = true;
|
||||
uint32_t execution_units_count = mFirstGPUDevicePtr->getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
|
||||
uint32_t num_threads_per_eu = mFirstGPUDevicePtr->getInfo<CL_DEVICE_NUM_THREADS_PER_EU_INTEL>();
|
||||
|
|
|
@ -16,10 +16,8 @@ namespace MNN {
|
|||
namespace OpenCL {

BinaryBufExecution::BinaryBufExecution(const std::vector<Tensor *> &inputs, const std::string &compute, const MNN::Op *op, Backend *backend)
    : CommonExecution(backend), mCompute(compute) {
    : CommonExecution(backend, op), mCompute(compute) {
    mBuildOptions.emplace("-DOPERATOR=" + compute);
    mOp = op;
    mOpType = op->type();
}

uint32_t BinaryBufExecution::realSize(const Tensor* tensor) {
|
||||
|
|
|
@ -0,0 +1,351 @@
|
|||
//
|
||||
// LoopBufExecution.cpp
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2019/02/28.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifndef MNN_OPENCL_BUFFER_CLOSED
|
||||
|
||||
#include "backend/opencl/execution/buffer/LoopBufExecution.hpp"
|
||||
#include "core/Macro.h"
|
||||
#include "core/TensorUtils.hpp"
|
||||
|
||||
namespace MNN {
|
||||
namespace OpenCL {
|
||||
|
||||
static void _TileOrPackTensor(Tensor *input, Tensor *output, cl::Kernel& kernel, cl::NDRange &globalWorkSize,
|
||||
cl::NDRange &localWorkSize, const int Width, const int Height, const int Channel,
|
||||
const int Batch, OpenCLRuntime *runTime, const std::string &KernelName, const std::set<std::string> &buildOptions) {
|
||||
kernel = runTime->buildKernel("loop_buf", KernelName, buildOptions);
|
||||
uint32_t mMaxWorkGroupSize = static_cast<uint32_t>(runTime->getMaxWorkGroupSize(kernel));
|
||||
std::vector<uint32_t> mGlobalWorkSize = {(uint32_t)(Width * Height), (uint32_t)(UP_DIV(Channel, 4)), (uint32_t)(Batch)};
|
||||
|
||||
uint32_t index = 0;
|
||||
kernel.setArg(index++, mGlobalWorkSize[0]);
|
||||
kernel.setArg(index++, mGlobalWorkSize[1]);
|
||||
kernel.setArg(index++, mGlobalWorkSize[2]);
|
||||
kernel.setArg(index++, openCLBuffer(input));
|
||||
kernel.setArg(index++, openCLBuffer(output));
|
||||
kernel.setArg(index++, Width);
|
||||
kernel.setArg(index++, Height);
|
||||
kernel.setArg(index++, Channel);
|
||||
|
||||
std::vector<uint32_t> mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, runTime, KernelName, kernel).first;
|
||||
|
||||
globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]};
|
||||
localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]};
|
||||
}
|
||||
|
||||
static void _setTensorStack(std::vector<Tensor *> &result, const std::vector<Tensor *> &inputs,
|
||||
const std::vector<Tensor *> &outputs, const LoopParam *loop) {
|
||||
if (loop->inputIndexes() != nullptr) {
|
||||
for (int i = 0; i < loop->inputIndexes()->size(); ++i) {
|
||||
result[loop->inputIndexes()->data()[i]] = inputs[i];
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < loop->outputIndexes()->size(); ++i) {
|
||||
result[loop->outputIndexes()->data()[i]] = outputs[i];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
LoopGatherBufExecution::LoopGatherBufExecution(const LoopParam *loop, const MNN::Op *op, Backend *bn)
|
||||
: CommonExecution(bn, op) {
|
||||
mLoop = loop;
|
||||
mTensors.resize(mLoop->tensorNumber());
|
||||
auto cmd = loop->commands()->GetAs<RegionCommand>(0);
|
||||
}
|
||||
ErrorCode LoopGatherBufExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
    OpenCLBackend *mOpenCLBackend = (OpenCLBackend *)backend();
    auto runTime = mOpenCLBackend->getOpenCLRuntime();
    _setTensorStack(mTensors, inputs, outputs, mLoop);
    mUnits.clear();
    mOffsetTensors.clear();
    mTmpTensors.resize(2);
    int x = cmd->size()->data()[0];
    int y = cmd->size()->data()[1];
    int z = cmd->size()->data()[2];
    int n = mLoop->loopNumber();

    auto srcStride = cmd->view()->GetAs<View>(1)->stride()->data();
    auto dstStride = cmd->view()->GetAs<View>(0)->stride()->data();
    for (int i = 0; i < 3; ++i) {
        mStride_src[i] = srcStride[i];
        mStride_dst[i] = dstStride[i];
    }

    mStride_src[3] = cmd->view()->GetAs<View>(1)->offset();
    mStride_dst[3] = cmd->view()->GetAs<View>(0)->offset();
    ::memcpy(mStep, cmd->steps()->data(), cmd->steps()->size() * sizeof(int));
    ::memcpy(mIter, cmd->iterIndexes()->data(), cmd->iterIndexes()->size() * sizeof(int));

    // tile input
    {
        auto input = mTensors[cmd->indexes()->data()[1]];
        std::vector<int> Shape = tensorShapeFormat(input);
        const int Channel = Shape.at(3);
        const int Width = Shape.at(2);
        const int Height = Shape.at(1);
        const int Batch = Shape.at(0);
        mTmpTensors[1] = std::make_shared<Tensor>(Tensor::createDevice<float>(std::vector<int>{Batch, Channel, Height, Width}));
        mOpenCLBackend->onAcquireBuffer(mTmpTensors[1].get(), Backend::DYNAMIC);

        Unit unit;
        _TileOrPackTensor(mTensors[cmd->indexes()->data()[1]], mTmpTensors[1].get(), unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height, Channel, Batch, runTime, "tile_buf", mBuildOptions);
        mUnits.emplace_back(unit);
    }

    for(int i = 0; i < cmd->iterIndexes()->size(); ++i){
        if (mIter[i] >= 0) {
            auto input = mTensors[cmd->iterIndexes()->data()[i]];
            std::vector<int> Shape = tensorShapeFormat(input);
            const int Channel = Shape.at(3);
            const int Width = Shape.at(2);
            const int Height = Shape.at(1);
            const int Batch = Shape.at(0);
            mOffsetTensors.emplace_back(std::make_shared<Tensor>(Tensor::createDevice<float>(std::vector<int>{Batch, Channel, Height, Width})));
            mOpenCLBackend->onAcquireBuffer(mOffsetTensors.back().get(), Backend::DYNAMIC);

            Unit unit;
            _TileOrPackTensor(input, mOffsetTensors.back().get(), unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height, Channel, Batch, runTime, "tile_buf", mBuildOptions);
            mUnits.emplace_back(unit);
        }
    }

    // gather
    {
        mTmpTensors[0] = std::make_shared<Tensor>(Tensor::createDevice<float>(std::vector<int>{n, z, y, x}));
        mOpenCLBackend->onAcquireBuffer(mTmpTensors[0].get(), Backend::DYNAMIC);
        int offset_index = 0;

        Unit unit;
        std::string KernelName = "batch_gather_buf";
        unit.kernel = runTime->buildKernel("loop_buf", KernelName, mBuildOptions);
        uint32_t mMaxWorkGroupSize = static_cast<uint32_t>(runTime->getMaxWorkGroupSize(unit.kernel));
        std::vector<uint32_t> mGlobalWorkSize = {(uint32_t)(x * y), (uint32_t)(z), (uint32_t)(n)};

        uint32_t index = 0;
        unit.kernel.setArg(index++, mGlobalWorkSize[0]);
        unit.kernel.setArg(index++, mGlobalWorkSize[1]);
        unit.kernel.setArg(index++, mGlobalWorkSize[2]);
        unit.kernel.setArg(index++, openCLBuffer(mTmpTensors[0].get()));
        unit.kernel.setArg(index++, openCLBuffer(mTmpTensors[1].get()));
        for (int i = 0; i < cmd->iterIndexes()->size(); ++i) {
            if (mIter[i] >= 0) {
                unit.kernel.setArg(index++, openCLBuffer(mOffsetTensors[offset_index++].get()));
            } else {
                unit.kernel.setArg(index++, openCLBuffer(mTensors[cmd->indexes()->data()[1]]));
            }
        }
        unit.kernel.setArg(index++, x);
        unit.kernel.setArg(index++, sizeof(mStride_src), mStride_src);
        unit.kernel.setArg(index++, sizeof(mStride_dst), mStride_dst);
        unit.kernel.setArg(index++, sizeof(mStep), mStep);
        unit.kernel.setArg(index++, sizeof(mIter), mIter);

        std::vector<uint32_t> mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, runTime, KernelName, unit.kernel).first;

        unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]};
        unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]};
        mUnits.emplace_back(unit);
    }

    //pack output
    {
        auto output = mTensors[cmd->indexes()->data()[0]];
        std::vector<int> Shape = tensorShapeFormat(output);
        const int Channel = Shape.at(3);
        const int Width = Shape.at(2);
        const int Height = Shape.at(1);
        const int Batch = Shape.at(0);
        Unit unit;
        _TileOrPackTensor(mTmpTensors[0].get(), mTensors[cmd->indexes()->data()[0]], unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height, Channel, Batch, runTime, "pack_buf", mBuildOptions);
        mUnits.emplace_back(unit);
    }

    for (int i = 0; i < mTmpTensors.size(); ++i) {
        mOpenCLBackend->onReleaseBuffer(mTmpTensors[i].get(), Backend::DYNAMIC);
    }
    for (int i = 0; i < mOffsetTensors.size(); ++i) {
        mOpenCLBackend->onReleaseBuffer(mOffsetTensors[i].get(), Backend::DYNAMIC);
    }

    return NO_ERROR;
}

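// Batched MatMul inside a loop: bias presence and the transpose flags are read
// once from the RegionCommand's MatMul op at construction time.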
LoopBatchMatMulBufExecution::LoopBatchMatMulBufExecution(const LoopParam *loop, const MNN::Op *op, Backend *bn)
    : CommonExecution(bn, op) {
    mLoop = loop;
    mTensors.resize(mLoop->tensorNumber());
    auto cmd = loop->commands()->GetAs<RegionCommand>(0);
    mHasBias = cmd->indexes()->size() > 3;
    mTransposeA = cmd->op()->main_as_MatMul()->transposeA();
    mTransposeB = cmd->op()->main_as_MatMul()->transposeB();
}
ErrorCode LoopBatchMatMulBufExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
    OpenCLBackend *mOpenCLBackend = (OpenCLBackend *)backend();
    auto runTime = mOpenCLBackend->getOpenCLRuntime();
    _setTensorStack(mTensors, inputs, outputs, mLoop);

    mOffset[0] = cmd->view()->GetAs<View>(0)->offset();
    mOffset[1] = cmd->view()->GetAs<View>(1)->offset();
    mOffset[2] = cmd->view()->GetAs<View>(2)->offset();
    mUnits.clear();
    mOffsetTensors.clear();
    mTmpTensors.resize(3);
    if (mHasBias) {
        mTmpTensors.resize(4);
        mOffset[3] = cmd->view()->GetAs<View>(3)->offset();
    }

    ::memcpy(mStep, cmd->steps()->data(), cmd->steps()->size() * sizeof(int));
    ::memcpy(mIter, cmd->iterIndexes()->data(), cmd->iterIndexes()->size() * sizeof(int));
    int e = cmd->size()->data()[0];
    int l = cmd->size()->data()[1];
    int h = cmd->size()->data()[2];
    int n = mLoop->loopNumber();

    // tile input
    for (int i = 1; i < cmd->indexes()->size(); ++i) {
        auto input = mTensors[cmd->indexes()->data()[i]];
        std::vector<int> Shape = tensorShapeFormat(input);
        const int Channel = Shape.at(3);
        const int Width = Shape.at(2);
        const int Height = Shape.at(1);
        const int Batch = Shape.at(0);
        mTmpTensors[i] = std::make_shared<Tensor>(Tensor::createDevice<float>(std::vector<int>{Batch, Channel, Height, Width}));
        mOpenCLBackend->onAcquireBuffer(mTmpTensors[i].get(), Backend::DYNAMIC);

        Unit unit;
        _TileOrPackTensor(input, mTmpTensors[i].get(), unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height, Channel, Batch, runTime, "tile_buf", mBuildOptions);
        mUnits.emplace_back(unit);
    }

    for(int i = 0; i < cmd->iterIndexes()->size(); ++i){
        if (mIter[i] >= 0) {
            auto input = mTensors[cmd->iterIndexes()->data()[i]];
            std::vector<int> Shape = tensorShapeFormat(input);
            const int Channel = Shape.at(3);
            const int Width = Shape.at(2);
            const int Height = Shape.at(1);
            const int Batch = Shape.at(0);
            mOffsetTensors.emplace_back(std::make_shared<Tensor>(Tensor::createDevice<float>(std::vector<int>{Batch, Channel, Height, Width})));
            mOpenCLBackend->onAcquireBuffer(mOffsetTensors.back().get(), Backend::DYNAMIC);

            Unit unit;
            _TileOrPackTensor(input, mOffsetTensors.back().get(), unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height, Channel, Batch, runTime, "tile_buf", mBuildOptions);
            mUnits.emplace_back(unit);
        }
    }

    // matmul
    {
        mTmpTensors[0] = std::make_shared<Tensor>(Tensor::createDevice<float>(std::vector<int>{1, n, e, h}));
        mOpenCLBackend->onAcquireBuffer(mTmpTensors[0].get(), Backend::DYNAMIC);
        int offset_index = 0;

        Unit unit;
        std::string KernelName = "batch_matmul_buf";
        if (mHasBias) {
            mBuildOptions.emplace("-DBIAS");
        }
        if (mTransposeA) {
            mBuildOptions.emplace("-DTRANSPOSE_A");
        }
        if (mTransposeB) {
            mBuildOptions.emplace("-DTRANSPOSE_B");
        }
        unit.kernel = runTime->buildKernel("loop_buf", KernelName, mBuildOptions);
        uint32_t mMaxWorkGroupSize = static_cast<uint32_t>(runTime->getMaxWorkGroupSize(unit.kernel));
        std::vector<uint32_t> mGlobalWorkSize = {(uint32_t)(h), (uint32_t)(e), (uint32_t)(n)};

        uint32_t index = 0;
        unit.kernel.setArg(index++, mGlobalWorkSize[0]);
        unit.kernel.setArg(index++, mGlobalWorkSize[1]);
        unit.kernel.setArg(index++, mGlobalWorkSize[2]);
        unit.kernel.setArg(index++, openCLBuffer(mTmpTensors[0].get()));
        unit.kernel.setArg(index++, openCLBuffer(mTmpTensors[1].get()));
        unit.kernel.setArg(index++, openCLBuffer(mTmpTensors[2].get()));
        if (mHasBias) {
            unit.kernel.setArg(index++, openCLBuffer(mTmpTensors[3].get()));
        }
        for (int i = 0; i < cmd->iterIndexes()->size(); ++i) {
            if (mIter[i] >= 0) {
                unit.kernel.setArg(index++, openCLBuffer(mOffsetTensors[offset_index++].get()));
            } else {
                unit.kernel.setArg(index++, openCLBuffer(mTensors[cmd->indexes()->data()[1]]));
            }
        }
        unit.kernel.setArg(index++, e);
        unit.kernel.setArg(index++, l);
        unit.kernel.setArg(index++, h);
        unit.kernel.setArg(index++, sizeof(mOffset), mOffset);
        unit.kernel.setArg(index++, sizeof(mIter), mIter);
        unit.kernel.setArg(index++, sizeof(mStep), mStep);

        std::vector<uint32_t> mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, runTime, KernelName, unit.kernel).first;

        unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]};
        unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]};
        mUnits.emplace_back(unit);
    }

    //pack output
    {
        auto output = mTensors[cmd->indexes()->data()[0]];
        std::vector<int> Shape = tensorShapeFormat(output);
        const int Channel = Shape.at(3);
        const int Width = Shape.at(2);
        const int Height = Shape.at(1);
        const int Batch = Shape.at(0);
        Unit unit;
        _TileOrPackTensor(mTmpTensors[0].get(), output, unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height, Channel, Batch, runTime, "pack_buf", mBuildOptions);
        mUnits.emplace_back(unit);
    }

    for (int i = 0; i < cmd->indexes()->size(); ++i) {
        mOpenCLBackend->onReleaseBuffer(mTmpTensors[i].get(), Backend::DYNAMIC);
    }
    for (int i = 0; i < mOffsetTensors.size(); ++i) {
        mOpenCLBackend->onReleaseBuffer(mOffsetTensors[i].get(), Backend::DYNAMIC);
    }

    return NO_ERROR;
}

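// Registered for OpType_While on the buffer path. Only single-command loops are
// handled: a fuse-free UnaryOp command maps to gather, a parallel MatMul command
// maps to batch matmul; everything else returns nullptr.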
class LoopBufCreator : public OpenCLBackend::Creator {
public:
    virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
                                const MNN::Op *op, Backend *backend) const override {
        auto loop = op->main_as_LoopParam();
        if (nullptr == loop || loop->commands() == nullptr) {
            return nullptr;
        }
        if (nullptr != loop->initCommand()) {
            return nullptr;
        }
        // Make Tensor Stack
        if (1 == loop->commands()->size()) {
            auto cmd = loop->commands()->GetAs<RegionCommand>(0);
            auto subop = cmd->op();
            if (OpType_UnaryOp == subop->type() && nullptr == subop->main() && cmd->fuse() < 0) {
                return new LoopGatherBufExecution(loop, op, backend);
            }
            if (OpType_MatMul == subop->type() && loop->parallel()) {
                return new LoopBatchMatMulBufExecution(loop, op, backend);
            }
        }
        return nullptr;
    }
};

OpenCLCreatorRegister<LoopBufCreator> __LoopBuf_op(OpType_While, BUFFER);

} // namespace OpenCL
} // namespace MNN
#endif /* MNN_OPENCL_BUFFER_CLOSED */

@@ -0,0 +1,60 @@
//
//  LoopBufExecution.hpp
//  MNN
//
//  Created by MNN on 2023/04/23.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#ifndef MNN_OPENCL_BUFFER_CLOSED

#ifndef LoopBufExecution_hpp
#define LoopBufExecution_hpp

#include "backend/opencl/execution/image/CommonExecution.hpp"

namespace MNN {
namespace OpenCL {

class LoopGatherBufExecution : public CommonExecution {
public:
    LoopGatherBufExecution(const LoopParam *loop, const MNN::Op *op, Backend *bn);
    virtual ~LoopGatherBufExecution() = default;
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;

private:
    const LoopParam *mLoop;
    std::vector<Tensor *> mTensors;
    std::vector<std::shared_ptr<Tensor>> mTmpTensors;
    std::vector<std::shared_ptr<Tensor>> mOffsetTensors;
    int mStride_src[4];
    int mStride_dst[4];
    int mStep[2];
    int mIter[2];
    std::set<std::string> mBuildOptions;
};

class LoopBatchMatMulBufExecution : public CommonExecution {
public:
    LoopBatchMatMulBufExecution(const LoopParam *loop, const MNN::Op *op, Backend *bn);
    virtual ~LoopBatchMatMulBufExecution() = default;
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;

private:
    const LoopParam *mLoop;
    std::vector<Tensor *> mTensors;
    std::vector<std::shared_ptr<Tensor>> mTmpTensors;
    std::vector<std::shared_ptr<Tensor>> mOffsetTensors;
    int mOffset[4];
    int mStep[4];
    int mIter[4];
    bool mHasBias = false;
    bool mTransposeA = false;
    bool mTransposeB = false;
    std::set<std::string> mBuildOptions;
};

} // namespace OpenCL
} // namespace MNN
#endif /* LoopBufExecution_hpp */
#endif /* MNN_OPENCL_BUFFER_CLOSED */

@@ -18,10 +18,8 @@ namespace MNN {
namespace OpenCL {

RasterBufExecution::RasterBufExecution(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend)
    : CommonExecution(backend) {
    : CommonExecution(backend, op) {
    mOpenCLBackend = (OpenCLBackend *)backend;
    mOp = op;
    mOpType = op->type();
    //nothing to do
}

@@ -15,7 +15,7 @@
namespace MNN {
namespace OpenCL {

ReductionBufExecution::ReductionBufExecution(const MNN::Op* op, Backend* backend) : CommonExecution(backend) {
ReductionBufExecution::ReductionBufExecution(const MNN::Op* op, Backend* backend) : CommonExecution(backend, op) {
#ifdef LOG_VERBOSE
    MNN_PRINT("start ReductionBufExecution init !\n");
#endif

@@ -46,7 +46,6 @@ ReductionBufExecution::ReductionBufExecution(const MNN::Op* op, Backend* backend
            MNN_ASSERT(false);
            break;
    }
    mOp = op;
#ifdef LOG_VERBOSE
    MNN_PRINT("end ReductionBufExecution init !\n");
#endif

@@ -70,20 +69,20 @@ ErrorCode ReductionBufExecution::onResize(const std::vector<Tensor *> &inputs, c
    std::set<std::string> buildOption;
    switch (mReductType) {
        case 0:
            buildOption.emplace("-DOPERATE=num+in");
            buildOption.emplace("-DOPERATE(a,b)=(a+b)");
            buildOption.emplace("-DGET_AVG");
            break;
        case 1:
            buildOption.emplace("-DOPERATE=max(num,in)");
            buildOption.emplace("-DOPERATE(a,b)=max(a,b)");
            break;
        case 2:
            buildOption.emplace("-DOPERATE=min(num,in)");
            buildOption.emplace("-DOPERATE(a,b)=min(a,b)");
            break;
        case 3:
            buildOption.emplace("-DOPERATE=num*in");
            buildOption.emplace("-DOPERATE(a,b)=(a*b)");
            break;
        case 4:
            buildOption.emplace("-DOPERATE=num+in");
            buildOption.emplace("-DOPERATE(a,b)=(a+b)");
            break;
        default:
            MNN_ASSERT(false);

@@ -103,6 +102,7 @@ ErrorCode ReductionBufExecution::onResize(const std::vector<Tensor *> &inputs, c
    mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[0]));
    mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[1]));
    mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[2]));
    mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[3]));

    return NO_ERROR;
}

@@ -15,7 +15,7 @@ namespace MNN {
namespace OpenCL {

ReluBufExecution::ReluBufExecution(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend)
    : CommonExecution(backend) {
    : CommonExecution(backend, op) {
    mOpenCLBackend = static_cast<OpenCLBackend *>(backend);
    auto mPreluParamPtr = op->main_as_PRelu();
    int preluSize = mPreluParamPtr->slopeCount();

@@ -50,9 +50,6 @@ ReluBufExecution::ReluBufExecution(const std::vector<Tensor *> &inputs, const MN
        MNN_ERROR("Map error preluDataPtrCL == nullptr \n");
    }
    mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(preluBuffer, preluDataPtrCL);

    mOp = op;
    mOpType = op->type();
}

ReluBufExecution::~ReluBufExecution() {

@@ -0,0 +1,160 @@
#ifdef MNN_SUPPORT_FP16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

__constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

__kernel void batch_matmul(__private int global_dim0, __private int global_dim1, __private int global_dim2,
                           __global FLOAT* output, __global FLOAT* input_A, __global FLOAT* input_B,
#ifdef BIAS
                           __global FLOAT* input_C,
#endif
                           __global FLOAT* offset_O, __global FLOAT* offset_A, __global FLOAT* offset_B,
#ifdef BIAS
                           __global FLOAT* offset_C,
#endif
                           __private const int e,
                           __private const int l,
                           __private const int h,
                           __private const int4 offsets,
                           __private const int4 iters,
                           __private const int4 steps) {
    int3 pos = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));

    if (pos.x < global_dim0 && pos.y < global_dim1 && pos.z < global_dim2) {
        int4 index = (int4)(pos.z);
        if (iters.x >= 0) {
            index.x = (int)(offset_O[pos.z]);
        }
        if (iters.y >= 0) {
            index.y = (int)(offset_A[pos.z]);
        }
        if (iters.z >= 0) {
            index.z = (int)(offset_B[pos.z]);
        }
#ifdef BIAS
        if (iters.w >= 0) {
            index.w = (int)(offset_C[pos.z]);
        }
#endif
        int4 offset = index * steps + offsets;

#if TRANSPOSE_A
        __global FLOAT* A_ptr = input_A + offset.y + pos.y;
#else
        __global FLOAT* A_ptr = input_A + offset.y + pos.y * l;
#endif

#if TRANSPOSE_B
        __global FLOAT* B_ptr = input_B + offset.z + pos.x * l;
#else
        __global FLOAT* B_ptr = input_B + offset.z + pos.x;
#endif

#ifdef BIAS
        FLOAT value = input_C[offset.w + pos.x];
#else
        FLOAT value = 0;
#endif

        for(int i = 0; i < l; ++i){
#if TRANSPOSE_A
            FLOAT value_a = A_ptr[i * e];
#else
            FLOAT value_a = A_ptr[i];
#endif

#if TRANSPOSE_B
            FLOAT value_b = B_ptr[i];
#else
            FLOAT value_b = B_ptr[i * h];
#endif

            value = mad(value_a, value_b, value);
        }

        output[offset.x + pos.y * h + pos.x] = value;
    }
}

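// tile: read an NC4HW4 image and scatter it into a plain NCHW float buffer,
// one FLOAT4 (up to four channels) per work item.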
__kernel void tile(__private int global_dim0, __private int global_dim1, __private int global_dim2,
                   __read_only image2d_t input,
                   __global FLOAT* output,
                   __private const int width,
                   __private const int height,
                   __private const int channel){
    int3 pos = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));
    if (pos.x < global_dim0 && pos.y < global_dim1 && pos.z < global_dim2) {
        const int w = pos.x % width;
        const int h = pos.x / width;
        const int c = pos.y << 2;

        const int x_dst_pitch = 1;
        const int y_dst_pitch = x_dst_pitch * width;
        const int c_dst_pitch = y_dst_pitch * height;
        const int b_dst_pitch = c_dst_pitch * channel;
        __global FLOAT* dst_ptr = output + pos.z * b_dst_pitch + c * c_dst_pitch + h * y_dst_pitch + w * x_dst_pitch;

        FLOAT4 value = RI_F(input, SAMPLER, (int2)(pos.y * width + w, pos.z * height + h));
        dst_ptr[0] = value.x;
        if(c + 1 >= channel)return;
        dst_ptr[c_dst_pitch] = value.y;
        if(c + 2 >= channel)return;
        dst_ptr[2 * c_dst_pitch] = value.z;
        if(c + 3 >= channel)return;
        dst_ptr[3 * c_dst_pitch] = value.w;
    }
}

__kernel void pack(__private int global_dim0, __private int global_dim1, __private int global_dim2,
                   __global FLOAT* input,
                   __write_only image2d_t output,
                   __private const int width,
                   __private const int height,
                   __private const int channel){
    int3 pos = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));
    if (pos.x < global_dim0 && pos.y < global_dim1 && pos.z < global_dim2) {
        const int w = pos.x % width;
        const int h = pos.x / width;
        const int c = pos.y << 2;

        const int x_src_pitch = 1;
        const int y_src_pitch = x_src_pitch * width;
        const int c_src_pitch = y_src_pitch * height;
        const int b_src_pitch = c_src_pitch * channel;
        __global FLOAT* src_ptr = input + pos.z * b_src_pitch + c * c_src_pitch + h * y_src_pitch + w * x_src_pitch;
        FLOAT4 value = (FLOAT4)0;
        FLOAT *value_ptr = (FLOAT*)&value;
        for(int i = 0; i < 4 && (i + c < channel); ++i){
            value_ptr[i] = src_ptr[i * c_src_pitch];
        }
        WI_F(output, (int2)(pos.y * width + w, pos.z * height + h), value);
    }
}

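// batch_gather: copy one element per work item from src to dst using the per-loop
// strides; offset_dst/offset_src are only read when the matching iter index is >= 0.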
__kernel void batch_gather(__private int global_dim0, __private int global_dim1, __private int global_dim2,
                           __global FLOAT* output, __global FLOAT* input,
                           __global FLOAT* offset_dst, __global FLOAT* offset_src,
                           __private const int x_size,
                           __private const int4 stride_src,
                           __private const int4 stride_dst,
                           __private const int2 steps,
                           __private const int2 iters) {
    int3 pos = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));

    if (pos.x < global_dim0 && pos.y < global_dim1 && pos.z < global_dim2) {

        int x = pos.x % x_size;
        int y = pos.x / x_size;

        int2 index = (int2)(pos.z, pos.z);
        if (iters.x >= 0) {
            index.x = (int)(offset_dst[pos.z]);
        }
        if (iters.y >= 0) {
            index.y = (int)(offset_src[pos.z]);
        }
        int2 offset = index * steps;
        output[offset.x + stride_dst.w + x * stride_dst.x + y * stride_dst.y + pos.y * stride_dst.z] = input[offset.y + stride_src.w + x * stride_src.x + y * stride_src.y + pos.y * stride_src.z];
    }
}

@@ -0,0 +1,164 @@
#ifdef MNN_SUPPORT_FP16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

__kernel void batch_matmul_buf(__private int global_dim0, __private int global_dim1, __private int global_dim2,
                               __global FLOAT* output, __global FLOAT* input_A, __global FLOAT* input_B,
#ifdef BIAS
                               __global FLOAT* input_C,
#endif
                               __global FLOAT* offset_O, __global FLOAT* offset_A, __global FLOAT* offset_B,
#ifdef BIAS
                               __global FLOAT* offset_C,
#endif
                               __private const int e,
                               __private const int l,
                               __private const int h,
                               __private const int4 offsets,
                               __private const int4 iters,
                               __private const int4 steps) {
    int3 pos = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));

    if (pos.x < global_dim0 && pos.y < global_dim1 && pos.z < global_dim2) {
        int4 index = (int4)(pos.z);
        if (iters.x >= 0) {
            index.x = (int)(offset_O[pos.z]);
        }
        if (iters.y >= 0) {
            index.y = (int)(offset_A[pos.z]);
        }
        if (iters.z >= 0) {
            index.z = (int)(offset_B[pos.z]);
        }
#ifdef BIAS
        if (iters.w >= 0) {
            index.w = (int)(offset_C[pos.z]);
        }
#endif
        int4 offset = index * steps + offsets;

#if TRANSPOSE_A
        __global FLOAT* A_ptr = input_A + offset.y + pos.y;
#else
        __global FLOAT* A_ptr = input_A + offset.y + pos.y * l;
#endif

#if TRANSPOSE_B
        __global FLOAT* B_ptr = input_B + offset.z + pos.x * l;
#else
        __global FLOAT* B_ptr = input_B + offset.z + pos.x;
#endif

#ifdef BIAS
        FLOAT value = input_C[offset.w + pos.x];
#else
        FLOAT value = 0;
#endif

        for(int i = 0; i < l; ++i){
#if TRANSPOSE_A
            FLOAT value_a = A_ptr[i * e];
#else
            FLOAT value_a = A_ptr[i];
#endif

#if TRANSPOSE_B
            FLOAT value_b = B_ptr[i];
#else
            FLOAT value_b = B_ptr[i * h];
#endif

            value = mad(value_a, value_b, value);
        }

        output[offset.x + pos.y * h + pos.x] = value;
    }
}

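// tile_buf: unpack an NC4HW4 buffer into a plain NCHW buffer, one FLOAT4
// (up to four channels) per work item.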
__kernel void tile_buf(__private int global_dim0, __private int global_dim1, __private int global_dim2,
                       __global FLOAT* input, __global FLOAT* output,
                       __private const int width,
                       __private const int height,
                       __private const int channel){
    int3 pos = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));
    if (pos.x < global_dim0 && pos.y < global_dim1 && pos.z < global_dim2) {
        const int w = pos.x % width;
        const int h = pos.x / width;
        const int c = pos.y << 2;
        const int x_src_pitch = 4;
        const int y_src_pitch = x_src_pitch * width;
        const int c_src_pitch = y_src_pitch * height;
        const int b_src_pitch = c_src_pitch * ((channel + 3) / 4);

        const int x_dst_pitch = 1;
        const int y_dst_pitch = x_dst_pitch * width;
        const int c_dst_pitch = y_dst_pitch * height;
        const int b_dst_pitch = c_dst_pitch * channel;
        __global FLOAT* dst_ptr = output + pos.z * b_dst_pitch + c * c_dst_pitch + h * y_dst_pitch + w * x_dst_pitch;

        FLOAT4 value = vload4(0, input + pos.z * b_src_pitch + pos.y * c_src_pitch + h * y_src_pitch + w * x_src_pitch);
        dst_ptr[0] = value.x;
        if(c + 1 >= channel)return;
        dst_ptr[c_dst_pitch] = value.y;
        if(c + 2 >= channel)return;
        dst_ptr[2 * c_dst_pitch] = value.z;
        if(c + 3 >= channel)return;
        dst_ptr[3 * c_dst_pitch] = value.w;
    }
}

__kernel void pack_buf(__private int global_dim0, __private int global_dim1, __private int global_dim2,
                       __global FLOAT* input, __global FLOAT* output,
                       __private const int width,
                       __private const int height,
                       __private const int channel){
    int3 pos = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));
    if (pos.x < global_dim0 && pos.y < global_dim1 && pos.z < global_dim2) {
        const int w = pos.x % width;
        const int h = pos.x / width;
        const int c = pos.y << 2;
        const int x_dst_pitch = 4;
        const int y_dst_pitch = x_dst_pitch * width;
        const int c_dst_pitch = y_dst_pitch * height;
        const int b_dst_pitch = c_dst_pitch * ((channel + 3) / 4);

        const int x_src_pitch = 1;
        const int y_src_pitch = x_src_pitch * width;
        const int c_src_pitch = y_src_pitch * height;
        const int b_src_pitch = c_src_pitch * channel;
        __global FLOAT* src_ptr = input + pos.z * b_src_pitch + c * c_src_pitch + h * y_src_pitch + w * x_src_pitch;
        FLOAT4 value = (FLOAT4)0;
        FLOAT *value_ptr = (FLOAT*)&value;
        for(int i = 0; i < 4 && (i + c < channel); ++i){
            value_ptr[i] = src_ptr[i * c_src_pitch];
        }
        vstore4(value, 0, output + pos.z * b_dst_pitch + pos.y * c_dst_pitch + h * y_dst_pitch + w * x_dst_pitch);
    }
}

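// batch_gather_buf: buffer variant of batch_gather; copies one element per work
// item between the tiled NCHW buffers using the per-loop strides and, when the
// matching iter index is >= 0, the offset tensors.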
__kernel void batch_gather_buf(__private int global_dim0, __private int global_dim1, __private int global_dim2,
                               __global FLOAT* output, __global FLOAT* input,
                               __global FLOAT* offset_dst, __global FLOAT* offset_src,
                               __private const int x_size,
                               __private const int4 stride_src,
                               __private const int4 stride_dst,
                               __private const int2 steps,
                               __private const int2 iters) {
    int3 pos = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));

    if (pos.x < global_dim0 && pos.y < global_dim1 && pos.z < global_dim2) {

        int x = pos.x % x_size;
        int y = pos.x / x_size;

        int2 index = (int2)(pos.z, pos.z);
        if (iters.x >= 0) {
            index.x = (int)(offset_dst[pos.z]);
        }
        if (iters.y >= 0) {
            index.y = (int)(offset_src[pos.z]);
        }
        int2 offset = index * steps;
        output[offset.x + stride_dst.w + x * stride_dst.x + y * stride_dst.y + pos.y * stride_dst.z] = input[offset.y + stride_src.w + x * stride_src.x + y * stride_src.y + pos.y * stride_src.z];
    }
}
File diff suppressed because one or more lines are too long
@@ -19,34 +19,44 @@ __kernel void reduct_general_mean(GLOBAL_SIZE_2_DIMS
        __write_only image2d_t output,
        __private const int batch,
        __private const int height,
        __private const int width
        __private const int width,
        __private const int channel
        ) {
    const int batch_idx = get_global_id(0);
    const int width_idx = get_global_id(1);

    FLOAT sum = 0;
    FLOAT4 sum = 0;
    for (int h = 0; h < height; h++) {
        FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
        sum = sum + in.x;
        sum = sum + in;
    }
    WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum/height, 0.0, 0.0, 0.0));
    FLOAT* sum_ptr = (FLOAT*)&sum;
    for(int i = 1; i < channel; ++i){
        sum.x += sum_ptr[i];
    }
    WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum.x/(height*channel), 0.0, 0.0, 0.0));
}
__kernel void reduct_general_sum(GLOBAL_SIZE_2_DIMS
        __read_only image2d_t input,
        __write_only image2d_t output,
        __private const int batch,
        __private const int height,
        __private const int width
        __private const int width,
        __private const int channel
        ) {
    const int batch_idx = get_global_id(0);
    const int width_idx = get_global_id(1);

    FLOAT sum = 0;
    FLOAT4 sum = 0;
    for (int h = 0; h < height; h++) {
        FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
        sum = sum + in.x;
        sum = sum + in;
    }
    FLOAT* sum_ptr = (FLOAT*)&sum;
    for(int i = 1; i < channel; ++i){
        sum.x += sum_ptr[i];
    }
    WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum, 0.0, 0.0, 0.0));
    WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum.x, 0.0, 0.0, 0.0));
}

__kernel void reduct_general_max(GLOBAL_SIZE_2_DIMS

@@ -54,17 +64,22 @@ __kernel void reduct_general_max(GLOBAL_SIZE_2_DIMS
        __write_only image2d_t output,
        __private const int batch,
        __private const int height,
        __private const int width
        __private const int width,
        __private const int channel
        ) {
    const int batch_idx = get_global_id(0);
    const int width_idx = get_global_id(1);

    FLOAT sum = -MAXFLOAT;
    FLOAT4 sum = (FLOAT4)-MAXFLOAT;
    for (int h = 0; h < height; h++) {
        FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
        sum = max(sum, in.x);
        sum = max(sum, in);
    }
    WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum, 0.0, 0.0, 0.0));
    FLOAT* sum_ptr = (FLOAT*)&sum;
    for(int i = 1; i < channel; ++i){
        sum.x = max(sum.x, sum_ptr[i]);
    }
    WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum.x, 0.0, 0.0, 0.0));
}

__kernel void reduct_general_min(GLOBAL_SIZE_2_DIMS

@@ -72,17 +87,22 @@ __kernel void reduct_general_min(GLOBAL_SIZE_2_DIMS
        __write_only image2d_t output,
        __private const int batch,
        __private const int height,
        __private const int width
        __private const int width,
        __private const int channel
        ) {
    const int batch_idx = get_global_id(0);
    const int width_idx = get_global_id(1);

    FLOAT sum = MAXFLOAT;
    FLOAT4 sum = (FLOAT4)MAXFLOAT;
    for (int h = 0; h < height; h++) {
        FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
        sum = min(sum, in.x);
        sum = min(sum, in);
    }
    WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum, 0.0, 0.0, 0.0));
    FLOAT* sum_ptr = (FLOAT*)&sum;
    for(int i = 1; i < channel; ++i){
        sum.x = min(sum.x, sum_ptr[i]);
    }
    WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum.x, 0.0, 0.0, 0.0));
}

__kernel void reduct_general_mul(GLOBAL_SIZE_2_DIMS

@@ -90,17 +110,22 @@ __kernel void reduct_general_mul(GLOBAL_SIZE_2_DIMS
        __write_only image2d_t output,
        __private const int batch,
        __private const int height,
        __private const int width
        __private const int width,
        __private const int channel
        ) {
    const int batch_idx = get_global_id(0);
    const int width_idx = get_global_id(1);

    FLOAT sum = 1.0;
    FLOAT4 sum = (FLOAT4)1.0;
    for (int h = 0; h < height; h++) {
        FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
        sum = sum * in.x;
        sum = sum * in;
    }
    WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum, 0.0, 0.0, 0.0));
    FLOAT* sum_ptr = (FLOAT*)&sum;
    for(int i = 1; i < channel; ++i){
        sum.x *= sum_ptr[i];
    }
    WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum.x, 0.0, 0.0, 0.0));
}

__kernel void reduct_general_mean_local(GLOBAL_SIZE_2_DIMS

@@ -108,21 +133,27 @@ __kernel void reduct_general_mean_local(GLOBAL_SIZE_2_DIMS
        __write_only image2d_t output,
        __private const int batch,
        __private const int height,
        __private const int width
        __private const int width,
        __private const int channel
        ) {
    const int batch_idx = get_global_id(1);
    const int width_idx = get_global_id(2);

    const int idx = get_local_id(0);
    FLOAT local sum[256];
    sum[idx] = 0.0;
    FLOAT4 out = (FLOAT4)0.0;
    const int reduce_num = get_local_size(0);

    for (int h = idx; h < height; h+=reduce_num) {
        FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
        sum[idx] = sum[idx] + in.x;
        out = out + in;
    }

    FLOAT* out_ptr = (FLOAT*)&out;
    for(int i = 1; i < channel; ++i){
        out.x += out_ptr[i];
    }
    sum[idx] = out.x;

    barrier(CLK_LOCAL_MEM_FENCE);
    for(int i = reduce_num/2; i > 0; i /= 2){
        if (idx < i)

@@ -130,7 +161,8 @@ __kernel void reduct_general_mean_local(GLOBAL_SIZE_2_DIMS
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (idx == 0) {
        WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum[0]/height, 0.0, 0.0, 0.0));

        WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum[0]/(height*channel), 0.0, 0.0, 0.0));
    }
}
__kernel void reduct_general_sum_local(GLOBAL_SIZE_2_DIMS

@@ -138,22 +170,27 @@ __kernel void reduct_general_sum_local(GLOBAL_SIZE_2_DIMS
        __write_only image2d_t output,
        __private const int batch,
        __private const int height,
        __private const int width
        __private const int width,
        __private const int channel
        ) {
    const int batch_idx = get_global_id(1);
    const int width_idx = get_global_id(2);

    const int idx = get_local_id(0);
    FLOAT local sum[256];
    sum[idx] = 0.0;

    FLOAT4 out = (FLOAT4)0.0;
    const int reduce_num = get_local_size(0);

    for (int h = idx; h < height; h+=reduce_num) {
        FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
        sum[idx] = sum[idx] + in.x;
        out = out + in;
    }

    FLOAT* out_ptr = (FLOAT*)&out;
    for(int i = 1; i < channel; ++i){
        out.x += out_ptr[i];
    }
    sum[idx] = out.x;

    barrier(CLK_LOCAL_MEM_FENCE);
    for(int i = reduce_num/2; i > 0; i /= 2){
        if (idx < i)

@@ -170,20 +207,26 @@ __kernel void reduct_general_max_local(GLOBAL_SIZE_2_DIMS
        __write_only image2d_t output,
        __private const int batch,
        __private const int height,
        __private const int width
        __private const int width,
        __private const int channel
        ) {
    const int batch_idx = get_global_id(1);
    const int width_idx = get_global_id(2);

    const int idx = get_local_id(0);
    FLOAT local sum[256];
    sum[idx] = -MAXFLOAT;
    FLOAT4 out = (FLOAT4)(-MAXFLOAT);
    const int reduce_num = get_local_size(0);

    for (int h = idx; h < height; h+=reduce_num) {
        FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
        sum[idx] = max(sum[idx], in.x);
        out = max(out, in);
    }
    FLOAT* out_ptr = (FLOAT*)&out;
    for(int i = 1; i < channel; ++i){
        out.x = max(out.x, out_ptr[i]);
    }
    sum[idx] = out.x;

    barrier(CLK_LOCAL_MEM_FENCE);
    for(int i = reduce_num/2; i > 0; i /= 2){

@@ -202,22 +245,28 @@ __kernel void reduct_general_min_local(GLOBAL_SIZE_2_DIMS
        __write_only image2d_t output,
        __private const int batch,
        __private const int height,
        __private const int width
        __private const int width,
        __private const int channel
        ) {
    const int batch_idx = get_global_id(1);
    const int width_idx = get_global_id(2);

    const int idx = get_local_id(0);
    FLOAT local sum[256];
    sum[idx] = MAXFLOAT;
    FLOAT4 out = (FLOAT4)(MAXFLOAT);

    const int reduce_num = get_local_size(0);

    for (int h = idx; h < height; h+=reduce_num) {
        FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
        sum[idx] = min(sum[idx], in.x);
        out = min(out, in);
    }

    FLOAT* out_ptr = (FLOAT*)&out;
    for(int i = 1; i < channel; ++i){
        out.x = min(out.x, out_ptr[i]);
    }
    sum[idx] = out.x;

    barrier(CLK_LOCAL_MEM_FENCE);
    for(int i = reduce_num/2; i > 0; i /= 2){
        if (idx < i)

@@ -234,21 +283,27 @@ __kernel void reduct_general_mul_local(GLOBAL_SIZE_2_DIMS
        __write_only image2d_t output,
        __private const int batch,
        __private const int height,
        __private const int width
        __private const int width,
        __private const int channel
        ) {
    const int batch_idx = get_global_id(1);
    const int width_idx = get_global_id(2);

    const int idx = get_local_id(0);
    FLOAT local sum[256];
    sum[idx] = 1.0;
    FLOAT4 out = (FLOAT4)1.0;

    const int reduce_num = get_local_size(0);

    for (int h = idx; h < height; h+=reduce_num) {
        FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
        sum[idx] = sum[idx] * in.x;
        out = out * in;
    }
    FLOAT* out_ptr = (FLOAT*)&out;
    for(int i = 1; i < channel; ++i){
        out.x *= out_ptr[i];
    }
    sum[idx] = out.x;

    barrier(CLK_LOCAL_MEM_FENCE);
    for(int i = reduce_num/2; i > 0; i /= 2){

@@ -14,21 +14,26 @@ __kernel void reduct_buf(GLOBAL_SIZE_2_DIMS
        __global FLOAT* output,
        __private const int batch,
        __private const int height,
        __private const int width
        __private const int width,
        __private const int channel
        ) {
    const int batch_idx = get_global_id(0);
    const int width_idx = get_global_id(1);

    const int inp_offset = ((batch_idx * height + 0) * width + width_idx)*4;
    FLOAT num = input[inp_offset];
    FLOAT4 out = vload4(0, input + inp_offset);
    for (int h = 1; h < height; h++) {
        FLOAT in = input[inp_offset + h*width*4];
        num = OPERATE;
        FLOAT4 in = vload4(0, input + inp_offset + h*width*4);
        out = OPERATE(out, in);
    }
    FLOAT* out_ptr = (FLOAT*)&out;
    for(int c = 1; c < channel; ++c){
        out.x = OPERATE(out.x, out_ptr[c]);
    }

#ifdef GET_AVG
    num = num / height;
    out.x = out.x / (height * channel);
#endif
    const int out_offset = batch_idx * width + width_idx;
    vstore4((FLOAT4)(num, 0.0, 0.0, 0.0), out_offset, output);
    vstore4((FLOAT4)(out.x, 0.0, 0.0, 0.0), out_offset, output);
}

@@ -10,7 +10,9 @@
namespace MNN {
namespace OpenCL {

CommonExecution::CommonExecution(Backend *backend) : Execution(backend) {
CommonExecution::CommonExecution(Backend *backend, const MNN::Op *Op)
    : Execution(backend), mOp(Op) {
    mOpType = Op->type();
}
ErrorCode CommonExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    auto runtime = ((OpenCLBackend *)backend())->getOpenCLRuntime();

@@ -15,7 +15,7 @@ namespace OpenCL {

class CommonExecution : public Execution {
public:
    CommonExecution(Backend *backend);
    CommonExecution(Backend *backend, const MNN::Op *Op);
    virtual ~CommonExecution() = default;

    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;

@@ -15,7 +15,7 @@
namespace MNN {
namespace OpenCL {

Conv2DBackPropFilter::Conv2DBackPropFilter(const MNN::Op *op, Backend *backend) : CommonExecution(backend) {
Conv2DBackPropFilter::Conv2DBackPropFilter(const MNN::Op *op, Backend *backend) : CommonExecution(backend, op) {
    auto common = op->main_as_Convolution2D()->common();
    mStrides = {common->strideY(), common->strideX()};
    mDilations = {common->dilateY(), common->dilateX()};

@@ -25,8 +25,6 @@ Conv2DBackPropFilter::Conv2DBackPropFilter(const MNN::Op *op, Backend *backend)
    if (common->padMode() == PadMode_VALID) {
        mPaddings[0] = mPaddings[1] = 0;
    }
    mOp = op;
    mOpType = op->type();
}

Conv2DBackPropFilter::~Conv2DBackPropFilter() {

@@ -28,10 +28,8 @@ static string swapComputeIn0In1(const string& computeOrigin) {
}

EltwiseExecution::EltwiseExecution(const std::vector<Tensor *> &inputs, const std::string &compute, const MNN::Op *op, Backend *backend)
    : CommonExecution(backend), mCompute(compute) {
    : CommonExecution(backend, op), mCompute(compute) {
    mBuildOptions.emplace("-DOPERATOR=" + compute);
    mOp = op;
    mOpType = op->type();
}

uint32_t EltwiseExecution::realSize(const Tensor* tensor) {

@@ -0,0 +1,370 @@
//
//  LoopExecution.cpp
//  MNN
//
//  Created by MNN on 2019/02/28.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "backend/opencl/execution/image/LoopExecution.hpp"
#include "core/Macro.h"
#include "core/TensorUtils.hpp"

namespace MNN {
namespace OpenCL {

static void _TileTensor(Tensor *input, cl::Buffer *output, cl::Kernel& kernel, cl::NDRange &globalWorkSize,
                        cl::NDRange &localWorkSize, const int Width, const int Height, const int Channel,
                        const int Batch, OpenCLRuntime *runTime, const std::set<std::string> &buildOptions) {
    kernel = runTime->buildKernel("loop", "tile", buildOptions);
    uint32_t mMaxWorkGroupSize = static_cast<uint32_t>(runTime->getMaxWorkGroupSize(kernel));
    std::vector<uint32_t> mGlobalWorkSize = {(uint32_t)(Width * Height), (uint32_t)(UP_DIV(Channel, 4)), (uint32_t)(Batch)};

    uint32_t index = 0;
    kernel.setArg(index++, mGlobalWorkSize[0]);
    kernel.setArg(index++, mGlobalWorkSize[1]);
    kernel.setArg(index++, mGlobalWorkSize[2]);
    kernel.setArg(index++, openCLImage(input));
    kernel.setArg(index++, *output);
    kernel.setArg(index++, Width);
    kernel.setArg(index++, Height);
    kernel.setArg(index++, Channel);

    std::vector<uint32_t> mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, runTime, "tile", kernel).first;

    globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]};
    localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]};
}

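// _PackTensor is the inverse of _TileTensor: it reads the plain NCHW buffer and
// writes it back into the NC4HW4 image via the "pack" kernel.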
static void _PackTensor(cl::Buffer *input, Tensor *output, cl::Kernel& kernel, cl::NDRange &globalWorkSize,
                        cl::NDRange &localWorkSize, const int Width, const int Height, const int Channel,
                        const int Batch, OpenCLRuntime *runTime, const std::set<std::string> &buildOptions) {
    kernel = runTime->buildKernel("loop", "pack", buildOptions);
    uint32_t mMaxWorkGroupSize = static_cast<uint32_t>(runTime->getMaxWorkGroupSize(kernel));
    std::vector<uint32_t> mGlobalWorkSize = {(uint32_t)(Width * Height), (uint32_t)(UP_DIV(Channel, 4)), (uint32_t)(Batch)};

    uint32_t index = 0;
    kernel.setArg(index++, mGlobalWorkSize[0]);
    kernel.setArg(index++, mGlobalWorkSize[1]);
    kernel.setArg(index++, mGlobalWorkSize[2]);
    kernel.setArg(index++, *input);
    kernel.setArg(index++, openCLImage(output));
    kernel.setArg(index++, Width);
    kernel.setArg(index++, Height);
    kernel.setArg(index++, Channel);

    std::vector<uint32_t> mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, runTime, "pack", kernel).first;

    globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]};
    localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]};
}

static void _setTensorStack(std::vector<Tensor *> &result, const std::vector<Tensor *> &inputs,
                            const std::vector<Tensor *> &outputs, const LoopParam *loop) {
    if (loop->inputIndexes() != nullptr) {
        for (int i = 0; i < loop->inputIndexes()->size(); ++i) {
            result[loop->inputIndexes()->data()[i]] = inputs[i];
        }
    }
    for (int i = 0; i < loop->outputIndexes()->size(); ++i) {
        result[loop->outputIndexes()->data()[i]] = outputs[i];
    }
}


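// Image-backend gather loop: temporaries are taken from the backend's buffer pool
// (half- or float-sized depending on FP16 support) instead of device tensors.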
LoopGatherExecution::LoopGatherExecution(const LoopParam *loop, const MNN::Op *op, Backend *bn)
    : CommonExecution(bn, op) {
    mLoop = loop;
    mTensors.resize(mLoop->tensorNumber());
    auto cmd = loop->commands()->GetAs<RegionCommand>(0);
    mOpType = op->type();
}
ErrorCode LoopGatherExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
    OpenCLBackend *mOpenCLBackend = (OpenCLBackend *)backend();
    auto runTime = mOpenCLBackend->getOpenCLRuntime();
    auto bufferPool = mOpenCLBackend->getBufferPool();
    auto bufferUnitSize = runTime->isSupportedFP16() ? sizeof(half_float::half) : sizeof(float);
    _setTensorStack(mTensors, inputs, outputs, mLoop);
    mUnits.clear();
    mOffsetBuffers.clear();
    mTmpBuffers.resize(2);
    int x = cmd->size()->data()[0];
    int y = cmd->size()->data()[1];
    int z = cmd->size()->data()[2];
    int n = mLoop->loopNumber();

    auto srcStride = cmd->view()->GetAs<View>(1)->stride()->data();
    auto dstStride = cmd->view()->GetAs<View>(0)->stride()->data();
    for (int i = 0; i < 3; ++i) {
        mStride_src[i] = srcStride[i];
        mStride_dst[i] = dstStride[i];
    }

    mStride_src[3] = cmd->view()->GetAs<View>(1)->offset();
    mStride_dst[3] = cmd->view()->GetAs<View>(0)->offset();
    ::memcpy(mStep, cmd->steps()->data(), cmd->steps()->size() * sizeof(int));
    ::memcpy(mIter, cmd->iterIndexes()->data(), cmd->iterIndexes()->size() * sizeof(int));

    // tile input
    {
        auto input = mTensors[cmd->indexes()->data()[1]];
        std::vector<int> Shape = tensorShapeFormat(input);
        const int Channel = Shape.at(3);
        const int Width = Shape.at(2);
        const int Height = Shape.at(1);
        const int Batch = Shape.at(0);
        mTmpBuffers[1] = bufferPool->alloc(input->elementSize() * bufferUnitSize);

        Unit unit;
        _TileTensor(mTensors[cmd->indexes()->data()[1]], mTmpBuffers[1], unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height, Channel, Batch, runTime, mBuildOptions);
        mUnits.emplace_back(unit);
    }

    for(int i = 0; i < cmd->iterIndexes()->size(); ++i){
        if (mIter[i] >= 0) {
            auto input = mTensors[cmd->iterIndexes()->data()[i]];
            std::vector<int> Shape = tensorShapeFormat(input);
            const int Channel = Shape.at(3);
            const int Width = Shape.at(2);
            const int Height = Shape.at(1);
            const int Batch = Shape.at(0);
            mOffsetBuffers.emplace_back(bufferPool->alloc(input->elementSize() * bufferUnitSize));

            Unit unit;
            _TileTensor(input, mOffsetBuffers.back(), unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height, Channel, Batch, runTime, mBuildOptions);
            mUnits.emplace_back(unit);
        }
    }

    // gather
    {
        mTmpBuffers[0] = bufferPool->alloc(n * z * y * x * bufferUnitSize);
        int offset_index = 0;
        Unit unit;
        std::string KernelName = "batch_gather";
        unit.kernel = runTime->buildKernel("loop", KernelName, mBuildOptions);
        uint32_t mMaxWorkGroupSize = static_cast<uint32_t>(runTime->getMaxWorkGroupSize(unit.kernel));
        std::vector<uint32_t> mGlobalWorkSize = {(uint32_t)(x * y), (uint32_t)(z), (uint32_t)(n)};

        uint32_t index = 0;
        unit.kernel.setArg(index++, mGlobalWorkSize[0]);
        unit.kernel.setArg(index++, mGlobalWorkSize[1]);
        unit.kernel.setArg(index++, mGlobalWorkSize[2]);
        unit.kernel.setArg(index++, *mTmpBuffers[0]);
        unit.kernel.setArg(index++, *mTmpBuffers[1]);
        for (int i = 0; i < cmd->iterIndexes()->size(); ++i) {
            if (mIter[i] >= 0) {
                unit.kernel.setArg(index++, *mOffsetBuffers[offset_index++]);
            } else {
                unit.kernel.setArg(index++, *mTmpBuffers[0]);
            }
        }
        unit.kernel.setArg(index++, x);
        unit.kernel.setArg(index++, sizeof(mStride_src), mStride_src);
        unit.kernel.setArg(index++, sizeof(mStride_dst), mStride_dst);
        unit.kernel.setArg(index++, sizeof(mStep), mStep);
        unit.kernel.setArg(index++, sizeof(mIter), mIter);

        std::vector<uint32_t> mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, runTime, KernelName, unit.kernel).first;

        unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]};
        unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]};
        mUnits.emplace_back(unit);
    }

    //pack output
    {
        auto output = mTensors[cmd->indexes()->data()[0]];
        std::vector<int> Shape = tensorShapeFormat(output);
        const int Channel = Shape.at(3);
        const int Width = Shape.at(2);
        const int Height = Shape.at(1);
        const int Batch = Shape.at(0);
        Unit unit;
        _PackTensor(mTmpBuffers[0], mTensors[cmd->indexes()->data()[0]], unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height, Channel, Batch, runTime, mBuildOptions);
        mUnits.emplace_back(unit);
    }

    for (int i = 0; i < mTmpBuffers.size(); ++i) {
        bufferPool->recycle(mTmpBuffers[i]);
    }
    for (int i = 0; i < mOffsetBuffers.size(); ++i) {
        bufferPool->recycle(mOffsetBuffers[i]);
    }

    return NO_ERROR;
}


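// Image-backend batched MatMul loop; mirrors LoopBatchMatMulBufExecution but tiles
// the image tensors into pooled cl::Buffer temporaries before running batch_matmul.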
LoopBatchMatMulExecution::LoopBatchMatMulExecution(const LoopParam *loop, const MNN::Op *op, Backend *bn)
    : CommonExecution(bn, op) {
    mLoop = loop;
    mTensors.resize(mLoop->tensorNumber());
    auto cmd = loop->commands()->GetAs<RegionCommand>(0);
    mHasBias = cmd->indexes()->size() > 3;
    mTransposeA = cmd->op()->main_as_MatMul()->transposeA();
    mTransposeB = cmd->op()->main_as_MatMul()->transposeB();
}
ErrorCode LoopBatchMatMulExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
    OpenCLBackend *mOpenCLBackend = (OpenCLBackend *)backend();
    auto runTime = mOpenCLBackend->getOpenCLRuntime();
    auto bufferPool = mOpenCLBackend->getBufferPool();
    auto bufferUnitSize = runTime->isSupportedFP16() ? sizeof(half_float::half) : sizeof(float);
    _setTensorStack(mTensors, inputs, outputs, mLoop);

    mOffset[0] = cmd->view()->GetAs<View>(0)->offset();
    mOffset[1] = cmd->view()->GetAs<View>(1)->offset();
    mOffset[2] = cmd->view()->GetAs<View>(2)->offset();
    mUnits.clear();
    mOffsetBuffers.clear();
    mTmpBuffers.resize(3);
    if (mHasBias) {
        mTmpBuffers.resize(4);
        mOffset[3] = cmd->view()->GetAs<View>(3)->offset();
    }

    ::memcpy(mStep, cmd->steps()->data(), cmd->steps()->size() * sizeof(int));
    ::memcpy(mIter, cmd->iterIndexes()->data(), cmd->iterIndexes()->size() * sizeof(int));
    int e = cmd->size()->data()[0];
    int l = cmd->size()->data()[1];
    int h = cmd->size()->data()[2];
    int n = mLoop->loopNumber();

    // tile input
    for (int i = 1; i < cmd->indexes()->size(); ++i) {
        auto input = mTensors[cmd->indexes()->data()[i]];
        std::vector<int> Shape = tensorShapeFormat(input);
        const int Channel = Shape.at(3);
        const int Width = Shape.at(2);
        const int Height = Shape.at(1);
        const int Batch = Shape.at(0);
        mTmpBuffers[i] = bufferPool->alloc(input->elementSize() * bufferUnitSize);

        Unit unit;
        _TileTensor(input, mTmpBuffers[i], unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height, Channel, Batch, runTime, mBuildOptions);
        mUnits.emplace_back(unit);
    }

    for(int i = 0; i < cmd->iterIndexes()->size(); ++i){
        if (mIter[i] >= 0) {
            auto input = mTensors[cmd->iterIndexes()->data()[i]];
            std::vector<int> Shape = tensorShapeFormat(input);
            const int Channel = Shape.at(3);
            const int Width = Shape.at(2);
            const int Height = Shape.at(1);
            const int Batch = Shape.at(0);
            mOffsetBuffers.emplace_back(bufferPool->alloc(input->elementSize() * bufferUnitSize));

            Unit unit;
            _TileTensor(input, mOffsetBuffers.back(), unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height, Channel, Batch, runTime, mBuildOptions);
            mUnits.emplace_back(unit);
        }
    }

    // matmul
    {
        mTmpBuffers[0] = bufferPool->alloc(n * e * h * bufferUnitSize);
        int offset_index = 0;

        Unit unit;
        std::string KernelName = "batch_matmul";
        if (mHasBias) {
            mBuildOptions.emplace("-DBIAS");
        }
        if (mTransposeA) {
            mBuildOptions.emplace("-DTRANSPOSE_A");
        }
        if (mTransposeB) {
            mBuildOptions.emplace("-DTRANSPOSE_B");
        }
        unit.kernel = runTime->buildKernel("loop", KernelName, mBuildOptions);
        uint32_t mMaxWorkGroupSize = static_cast<uint32_t>(runTime->getMaxWorkGroupSize(unit.kernel));
        std::vector<uint32_t> mGlobalWorkSize = {(uint32_t)(h), (uint32_t)(e), (uint32_t)(n)};

        uint32_t index = 0;
        unit.kernel.setArg(index++, mGlobalWorkSize[0]);
        unit.kernel.setArg(index++, mGlobalWorkSize[1]);
        unit.kernel.setArg(index++, mGlobalWorkSize[2]);
        unit.kernel.setArg(index++, *mTmpBuffers[0]);
        unit.kernel.setArg(index++, *mTmpBuffers[1]);
        unit.kernel.setArg(index++, *mTmpBuffers[2]);
        if (mHasBias) {
            unit.kernel.setArg(index++, *mTmpBuffers[3]);
        }
        for (int i = 0; i < cmd->iterIndexes()->size(); ++i) {
            if (mIter[i] >= 0) {
                unit.kernel.setArg(index++, *mOffsetBuffers[offset_index++]);
            } else {
                unit.kernel.setArg(index++, *mTmpBuffers[0]);
            }
        }
        unit.kernel.setArg(index++, e);
        unit.kernel.setArg(index++, l);
        unit.kernel.setArg(index++, h);
        unit.kernel.setArg(index++, sizeof(mOffset), mOffset);
        unit.kernel.setArg(index++, sizeof(mIter), mIter);
        unit.kernel.setArg(index++, sizeof(mStep), mStep);

        std::vector<uint32_t> mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, runTime, KernelName, unit.kernel).first;

        unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]};
        unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]};
        mUnits.emplace_back(unit);
    }

    //pack output
    {
        auto output = mTensors[cmd->indexes()->data()[0]];
        std::vector<int> Shape = tensorShapeFormat(output);
        const int Channel = Shape.at(3);
        const int Width = Shape.at(2);
        const int Height = Shape.at(1);
        const int Batch = Shape.at(0);
        Unit unit;
        _PackTensor(mTmpBuffers[0], output, unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height, Channel, Batch, runTime, mBuildOptions);
        mUnits.emplace_back(unit);
    }

    for (int i = 0; i < mTmpBuffers.size(); ++i) {
        bufferPool->recycle(mTmpBuffers[i]);
    }
    for (int i = 0; i < mOffsetBuffers.size(); ++i) {
        bufferPool->recycle(mOffsetBuffers[i]);
    }

    return NO_ERROR;
}

class LoopCreator : public OpenCLBackend::Creator {
|
||||
public:
|
||||
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
|
||||
const MNN::Op *op, Backend *backend) const override {
|
||||
auto loop = op->main_as_LoopParam();
|
||||
if (nullptr == loop || loop->commands() == nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
if (nullptr != loop->initCommand()) {
|
||||
return nullptr;
|
||||
}
|
||||
// Make Tensor Stack
|
||||
if (1 == loop->commands()->size()) {
|
||||
auto cmd = loop->commands()->GetAs<RegionCommand>(0);
|
||||
auto subop = cmd->op();
|
||||
if (OpType_UnaryOp == subop->type() && nullptr == subop->main() && cmd->fuse() < 0) {
|
||||
return new LoopGatherExecution(loop, op, backend);
|
||||
}
|
||||
if (OpType_MatMul == subop->type() && loop->parallel()) {
|
||||
return new LoopBatchMatMulExecution(loop, op, backend);
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
};
|
||||
|
||||
OpenCLCreatorRegister<LoopCreator> __Loop_op(OpType_While, IMAGE);
|
||||
|
||||
} // namespace OpenCL
|
||||
} // namespace MNN
|
|
@ -0,0 +1,58 @@
|
|||
//
|
||||
// LoopExecution.hpp
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2023/05/04.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
|
||||
#ifndef LoopExecution_hpp
|
||||
#define LoopExecution_hpp
|
||||
|
||||
#include "backend/opencl/execution/image/CommonExecution.hpp"
|
||||
|
||||
namespace MNN {
|
||||
namespace OpenCL {
|
||||
|
||||
class LoopGatherExecution : public CommonExecution {
|
||||
public:
|
||||
LoopGatherExecution(const LoopParam *loop, const MNN::Op *op, Backend *bn);
|
||||
virtual ~LoopGatherExecution() = default;
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
|
||||
private:
|
||||
const LoopParam *mLoop;
|
||||
std::vector<Tensor *> mTensors;
|
||||
std::vector<cl::Buffer *> mTmpBuffers;
|
||||
std::vector<cl::Buffer *> mOffsetBuffers;
|
||||
int mStride_src[4];
|
||||
int mStride_dst[4];
|
||||
int mStep[2];
|
||||
int mIter[2];
|
||||
std::set<std::string> mBuildOptions;
|
||||
};
|
||||
|
||||
class LoopBatchMatMulExecution : public CommonExecution {
|
||||
public:
|
||||
LoopBatchMatMulExecution(const LoopParam *loop, const MNN::Op *op, Backend *bn);
|
||||
virtual ~LoopBatchMatMulExecution() = default;
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
|
||||
private:
|
||||
const LoopParam *mLoop;
|
||||
std::vector<Tensor *> mTensors;
|
||||
std::vector<cl::Buffer*> mTmpBuffers;
|
||||
std::vector<cl::Buffer*> mOffsetBuffers;
|
||||
int mOffset[4];
|
||||
int mStep[4];
|
||||
int mIter[4];
|
||||
bool mHasBias = false;
|
||||
bool mTransposeA = false;
|
||||
bool mTransposeB = false;
|
||||
std::set<std::string> mBuildOptions;
|
||||
};
|
||||
|
||||
} // namespace OpenCL
|
||||
} // namespace MNN
|
||||
#endif /* LoopExecution_hpp */
|
|
@ -15,7 +15,7 @@
|
|||
namespace MNN {
|
||||
namespace OpenCL {
|
||||
|
||||
MultiInputDWConvExecution::MultiInputDWConvExecution(const MNN::Op *op, Backend *backend) : CommonExecution(backend) {
|
||||
MultiInputDWConvExecution::MultiInputDWConvExecution(const MNN::Op *op, Backend *backend) : CommonExecution(backend, op) {
|
||||
auto common = op->main_as_Convolution2D()->common();
|
||||
mPadMode = common->padMode();
|
||||
mStrides = {common->strideY(), common->strideX()};
|
||||
|
@ -25,8 +25,6 @@ MultiInputDWConvExecution::MultiInputDWConvExecution(const MNN::Op *op, Backend
|
|||
}
|
||||
isRelu = common->relu();
|
||||
isRelu6 = common->relu6();
|
||||
mOp = op;
|
||||
mOpType = op->type();
|
||||
}
|
||||
|
||||
MultiInputDWConvExecution::~MultiInputDWConvExecution() {
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
namespace MNN {
|
||||
namespace OpenCL {
|
||||
|
||||
MultiInputDWDeconvExecution::MultiInputDWDeconvExecution(const MNN::Op *op, Backend *backend) : CommonExecution(backend) {
|
||||
MultiInputDWDeconvExecution::MultiInputDWDeconvExecution(const MNN::Op *op, Backend *backend) : CommonExecution(backend, op) {
|
||||
auto common = op->main_as_Convolution2D()->common();
|
||||
|
||||
mStrides = {common->strideY(), common->strideX()};
|
||||
|
@ -30,8 +30,6 @@ MultiInputDWDeconvExecution::MultiInputDWDeconvExecution(const MNN::Op *op, Back
|
|||
|
||||
isRelu = common->relu();
|
||||
isRelu6 = common->relu6();
|
||||
mOp = op;
|
||||
mOpType = op->type();
|
||||
}
|
||||
|
||||
MultiInputDWDeconvExecution::~MultiInputDWDeconvExecution() {
|
||||
|
|
|
@ -17,10 +17,8 @@ namespace OpenCL {
|
|||
|
||||
|
||||
RasterExecution::RasterExecution(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend)
|
||||
: CommonExecution(backend) {
|
||||
: CommonExecution(backend, op) {
|
||||
mOpenCLBackend = (OpenCLBackend *)backend;
|
||||
mOp = op;
|
||||
mOpType = op->type();
|
||||
//nothing to do
|
||||
}
|
||||
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
namespace MNN {
|
||||
namespace OpenCL {
|
||||
|
||||
ReductionExecution::ReductionExecution(const MNN::Op* op, Backend* backend) : CommonExecution(backend) {
|
||||
ReductionExecution::ReductionExecution(const MNN::Op* op, Backend* backend) : CommonExecution(backend, op) {
|
||||
#ifdef LOG_VERBOSE
|
||||
MNN_PRINT("start ReductionExecution init !\n");
|
||||
#endif
|
||||
|
@ -44,7 +44,6 @@ ReductionExecution::ReductionExecution(const MNN::Op* op, Backend* backend) : Co
|
|||
MNN_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
mOp = op;
|
||||
#ifdef LOG_VERBOSE
|
||||
MNN_PRINT("end ReductionExecution init !\n");
|
||||
#endif
|
||||
|
@ -89,7 +88,7 @@ ErrorCode ReductionExecution::onResize(const std::vector<Tensor *> &inputs, cons
|
|||
break;
|
||||
}
|
||||
} else { //useLocal
|
||||
uint32_t global_x;
|
||||
uint32_t global_x = 8;
|
||||
int size = inputShape[1];
|
||||
if (size >= 1024) {
|
||||
global_x = 256;
|
||||
|
@ -144,6 +143,7 @@ ErrorCode ReductionExecution::onResize(const std::vector<Tensor *> &inputs, cons
|
|||
mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[0]));
|
||||
mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[1]));
|
||||
mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[2]));
|
||||
mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[3]));
|
||||
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
|
|
@ -14,7 +14,7 @@ namespace MNN {
|
|||
namespace OpenCL {
|
||||
|
||||
ReluExecution::ReluExecution(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend)
|
||||
: CommonExecution(backend) {
|
||||
: CommonExecution(backend, op) {
|
||||
auto mOpenCLBackend = static_cast<OpenCLBackend *>(backend);
|
||||
auto mPreluParamPtr = op->main_as_PRelu();
|
||||
int preluSize = mPreluParamPtr->slopeCount();
|
||||
|
@ -50,8 +50,6 @@ ReluExecution::ReluExecution(const std::vector<Tensor *> &inputs, const MNN::Op
|
|||
mOpenCLBackend->onAcquireBuffer(mPreluParam.get(), Backend::STATIC);
|
||||
copyBufferToImage(mOpenCLBackend->getOpenCLRuntime(), preluBuffer, openCLImage(mPreluParam.get()),
|
||||
UP_DIV(preluSize, 4), 1);
|
||||
mOp = op;
|
||||
mOpType = op->type();
|
||||
}
|
||||
ReluExecution::~ReluExecution() {
|
||||
backend()->onReleaseBuffer(mPreluParam.get(), Backend::STATIC);
|
||||
|
|
|
@ -13,8 +13,7 @@
|
|||
namespace MNN {
|
||||
namespace OpenCL {
|
||||
|
||||
TrainableParamExecution::TrainableParamExecution(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend) : CommonExecution(backend), mOp(op), mInitialized(false) {
|
||||
mOp = op;
|
||||
TrainableParamExecution::TrainableParamExecution(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend) : CommonExecution(backend, op), mInitialized(false) {
|
||||
}
|
||||
|
||||
TrainableParamExecution::~TrainableParamExecution() {
|
||||
|
|
|
@ -84,7 +84,41 @@ public:
|
|||
}
|
||||
return true;
|
||||
}
|
||||
static bool compressFloatWeightToSparse(MNN::OpT* op) {
|
||||
auto opType = op->type;
|
||||
auto param = op->main.AsConvolution2D();
|
||||
if (param->sparseParameter.get() == nullptr) {
|
||||
return false;
|
||||
}
|
||||
// Encode for sparse float weight
|
||||
size_t weightSize = param->weight.size();
|
||||
|
||||
if (weightSize > std::numeric_limits<uint32_t>().max()) {
|
||||
MNN_ERROR("The weightSize exceed uint32_t, can't compress the sparse weight\n");
|
||||
return false;
|
||||
}
|
||||
param->quanParameter.reset(new IDSTQuanT);
|
||||
size_t validSize = 0;
|
||||
std::vector<uint32_t> indexes;
|
||||
std::vector<float> newWeights;
|
||||
|
||||
for (size_t i=0; i<weightSize; ++i) {
|
||||
if (param->weight[i] != 0.0f) {
|
||||
indexes.emplace_back(i);
|
||||
newWeights.emplace_back(param->weight[i]);
|
||||
}
|
||||
}
|
||||
// If empty, Add Single weight to avoid error, runtime can't extract full sparse convolution
|
||||
if (indexes.empty()) {
|
||||
indexes.emplace_back(0);
|
||||
newWeights.emplace_back(0.0f);
|
||||
}
|
||||
param->weight.clear();
|
||||
param->quanParameter->alpha = std::move(newWeights);
|
||||
param->quanParameter->weightSize = (uint32_t)weightSize;
|
||||
param->quanParameter->index = std::move(indexes);
|
||||
return true;
|
||||
}
|
||||
};
|
||||
} // namespace MNN
|
||||
|
||||
|
|
|
@ -256,6 +256,10 @@ public:
|
|||
virtual int onGetRuntimeStatus(RuntimeStatus statusEnum) const {
|
||||
return 0;
|
||||
}
|
||||
// If the info user set can't be match by runtime, return false and set real info
|
||||
virtual bool onCheckInfo(Backend::Info& info) const {
|
||||
return true;
|
||||
}
|
||||
struct OpInfo {
|
||||
bool initCostLong;
|
||||
float exeutionCost; // In ms
|
||||
|
|
|
@ -8,12 +8,13 @@
|
|||
|
||||
#include "ConvolutionCommon.hpp"
|
||||
#include <math.h>
|
||||
#include "backend/cpu/compute/CommonOptFunction.h"
|
||||
#include "half.hpp"
|
||||
namespace MNN {
|
||||
static inline void *MNNMemoryAllocAlignZeroAlign(size_t size) {
|
||||
return MNNMemoryCallocAlign(size, MNN_MEMORY_ALIGN_DEFAULT);
|
||||
}
|
||||
static int ReadBlobDim(unsigned char *&myfile, unsigned short *shape, int shapeBufCnt) {
|
||||
static int ReadBlobDim(unsigned char *&myfile, unsigned int* shape, int shapeBufCnt, bool useInt32) {
|
||||
int uSize = myfile[0];
|
||||
myfile++;
|
||||
if (uSize > 4) {
|
||||
|
@ -24,8 +25,16 @@ static int ReadBlobDim(unsigned char *&myfile, unsigned short *shape, int shapeB
|
|||
if (copyLength > shapeBufCnt) {
|
||||
copyLength = shapeBufCnt;
|
||||
}
|
||||
::memcpy(shape, myfile, sizeof(unsigned short) * copyLength);
|
||||
myfile += copyLength * sizeof(unsigned short);
|
||||
if (useInt32) {
|
||||
::memcpy(shape, myfile, sizeof(unsigned int) * copyLength);
|
||||
myfile += copyLength * sizeof(unsigned int);
|
||||
} else {
|
||||
auto myfileint16 = (uint16_t*)myfile;
|
||||
for (int i=0; i<copyLength; ++i) {
|
||||
shape[i] = myfileint16[i];
|
||||
}
|
||||
myfile += copyLength * sizeof(unsigned short);
|
||||
}
|
||||
return copyLength;
|
||||
}
|
||||
|
||||
|
@ -176,18 +185,17 @@ static void StreamSizeRead(void *dst, int unit, size_t count, unsigned char *&fi
|
|||
file += (unit * count);
|
||||
}
|
||||
|
||||
static int8_t *ReadQuanData_c(unsigned char *&s, uint32_t *len) {
|
||||
static int8_t *ReadQuanData_c(unsigned char *&s, size_t* len, ConvolutionCommon::Int8Common* result, bool shapeInt32) {
|
||||
int8_t *blob = nullptr;
|
||||
int8_t *samples = nullptr;
|
||||
uint8_t *idxBuf = nullptr;
|
||||
uint8_t *idxBytes = nullptr;
|
||||
uint32_t dataCnt = 1;
|
||||
|
||||
do {
|
||||
// blob shape
|
||||
unsigned short shape[64] = {0};
|
||||
uint32_t shapeDim = (uint32_t)ReadBlobDim(s, shape, 64);
|
||||
if (shapeDim == 0 || shapeDim > 64)
|
||||
unsigned int shape[32] = {0};
|
||||
uint32_t shapeDim = (uint32_t)ReadBlobDim(s, shape, 32, shapeInt32);
|
||||
if (shapeDim == 0 || shapeDim > 32)
|
||||
break;
|
||||
for (uint32_t i = 0; i < shapeDim; i++)
|
||||
dataCnt *= shape[i];
|
||||
|
@ -198,7 +206,8 @@ static int8_t *ReadQuanData_c(unsigned char *&s, uint32_t *len) {
|
|||
if (0 == sampleCnt) {
|
||||
sampleCnt = 256;
|
||||
}
|
||||
samples = (int8_t *)MNNMemoryAllocAlignZeroAlign(sampleCnt);
|
||||
result->weightMap.resize(sampleCnt);
|
||||
auto samples = result->weightMap.data();
|
||||
if (samples == nullptr)
|
||||
break;
|
||||
StreamSizeRead(samples, 1, sampleCnt, s);
|
||||
|
@ -238,8 +247,6 @@ static int8_t *ReadQuanData_c(unsigned char *&s, uint32_t *len) {
|
|||
}
|
||||
} while (0);
|
||||
|
||||
if (samples != nullptr)
|
||||
MNNMemoryFreeAlign(samples);
|
||||
if (idxBuf != nullptr)
|
||||
MNNMemoryFreeAlign(idxBuf);
|
||||
if (idxBytes != nullptr)
|
||||
|
@ -249,9 +256,9 @@ static int8_t *ReadQuanData_c(unsigned char *&s, uint32_t *len) {
|
|||
return blob;
|
||||
}
|
||||
|
||||
static int8_t *ReadSparseQuanData_c(unsigned char *&myfile, uint32_t *len, const flatbuffers::Vector<float> *alpha) {
|
||||
static int8_t *ReadSparseQuanData_c(unsigned char *&myfile, size_t* len, const flatbuffers::Vector<float> *alpha, ConvolutionCommon::Int8Common* result, bool useInt32) {
|
||||
// MNN_ERROR("sparse:%d\n", 1);
|
||||
unsigned short shape[64] = {0};
|
||||
unsigned int shape[32];
|
||||
uint32_t ucMapSize = 0;
|
||||
PSIMPLE_SET setWeight = CreateSimpleSet(256);
|
||||
if (setWeight == nullptr) {
|
||||
|
@ -262,8 +269,8 @@ static int8_t *ReadSparseQuanData_c(unsigned char *&myfile, uint32_t *len, const
|
|||
unsigned char iIdxNeedBits;
|
||||
int8_t *blob = nullptr;
|
||||
// 1. weights blob shape(unsigned int32)
|
||||
int ShapeDim = ReadBlobDim(myfile, shape, 64);
|
||||
int Size = sizeof(int8_t);
|
||||
int ShapeDim = ReadBlobDim(myfile, shape, 32, useInt32);
|
||||
size_t Size = sizeof(int8_t);
|
||||
for (int i = 0; i < ShapeDim; i++)
|
||||
Size *= shape[i];
|
||||
blob = (int8_t *)MNNMemoryAllocAlignZeroAlign((size_t)Size);
|
||||
|
@ -295,11 +302,13 @@ static int8_t *ReadSparseQuanData_c(unsigned char *&myfile, uint32_t *len, const
|
|||
if (0 == ucMapSize) {
|
||||
ucMapSize = 256;
|
||||
}
|
||||
result->weightMap.resize(ucMapSize);
|
||||
// 6. valueset(signed char * valueset_size)
|
||||
for (int i = 0; i < ucMapSize; i++) {
|
||||
int8_t tmp;
|
||||
StreamSizeRead(&tmp, 1, 1, myfile);
|
||||
InsertSimpleSet(setWeight, tmp);
|
||||
result->weightMap[i] = tmp;
|
||||
}
|
||||
SimpleRank(setWeight->UniSet, setWeight->CurUniCnt, 1);
|
||||
// map<unsigned char, signed char> mapWeight;
|
||||
|
@ -367,14 +376,61 @@ static int8_t *ReadSparseQuanData_c(unsigned char *&myfile, uint32_t *len, const
|
|||
}
|
||||
std::shared_ptr<ConvolutionCommon::Int8Common> ConvolutionCommon::load(const IDSTQuan *quan, bool forceFloat, bool forceInt8) {
|
||||
auto result = std::make_shared<Int8Common>();
|
||||
uint32_t weightLength = 0;
|
||||
result->quan = quan;
|
||||
if (quan->index() != nullptr) {
|
||||
if (forceFloat) {
|
||||
// Expand sparse to dense
|
||||
result->weightFloat.reset(quan->weightSize());
|
||||
if (nullptr == result->weightFloat.get()) {
|
||||
return nullptr;
|
||||
}
|
||||
::memset(result->weightFloat.get(), 0, quan->weightSize() * sizeof(float));
|
||||
auto index = quan->index()->data();
|
||||
auto indexSize = quan->index()->size();
|
||||
if (nullptr == quan->alpha() || quan->alpha()->size() != indexSize) {
|
||||
MNN_ERROR("The model is error, don't has alpha but has index\n");
|
||||
return nullptr;
|
||||
}
|
||||
auto weightRaw = quan->alpha()->data();
|
||||
for (uint32_t i=0; i<indexSize; ++i) {
|
||||
result->weightFloat.get()[index[i]] = weightRaw[i];
|
||||
}
|
||||
} // Otherwise needn't treat, just return result with quan info
|
||||
return result;
|
||||
}
|
||||
size_t weightLength = 0;
|
||||
int8_t *buffer = nullptr;
|
||||
auto originBuffer = (unsigned char *)quan->buffer()->data();
|
||||
if (1 == quan->type()) {
|
||||
buffer = ReadQuanData_c(originBuffer, &weightLength);
|
||||
buffer = ReadQuanData_c(originBuffer, &weightLength, result.get(), quan->shapeInt32());
|
||||
}
|
||||
if (2 == quan->type()) {
|
||||
buffer = ReadSparseQuanData_c(originBuffer, &weightLength, quan->alpha());
|
||||
buffer = ReadSparseQuanData_c(originBuffer, &weightLength, quan->alpha(), result.get(), quan->shapeInt32());
|
||||
}
|
||||
if (result->weightMap.size() > 0 && result->weightMap.size() <= 16) {
|
||||
// Compute Remap for int4
|
||||
result->canUseInt4 = true;
|
||||
result->weightReverseMap.resize(256);
|
||||
::memset(result->weightReverseMap.data(), 0, 256 * sizeof(int8_t));
|
||||
for (int i=0; i<result->weightMap.size(); ++i) {
|
||||
int value = result->weightMap[i];
|
||||
value = value + 128;
|
||||
result->weightReverseMap[value] = i;
|
||||
}
|
||||
#ifdef MNN_TEST_REMAPQUANT
|
||||
// Test reverse
|
||||
std::vector<int8_t> originBuffer(weightLength);
|
||||
for (int i=0; i<weightLength; ++i) {
|
||||
originBuffer[i] = buffer[i];
|
||||
buffer[i] = result->weightReverseMap[(int)buffer[i] + 128];
|
||||
}
|
||||
for (int i=0; i<weightLength; ++i) {
|
||||
buffer[i] = result->weightMap[buffer[i]];
|
||||
}
|
||||
for (int i=0; i<weightLength; ++i) {
|
||||
MNN_ASSERT(buffer[i] == originBuffer[i]);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
// read fp16 data
|
||||
if (3 == quan->type()) {
|
||||
|
@ -406,13 +462,41 @@ std::shared_ptr<ConvolutionCommon::Int8Common> ConvolutionCommon::load(const IDS
|
|||
}
|
||||
result->weight.set(buffer, weightLength);
|
||||
}
|
||||
result->quan = quan;
|
||||
result->alpha.reset(quan->alpha()->size());
|
||||
if (nullptr == result->alpha.get()) {
|
||||
MNN_PRINT("Alloc memory error for extract idst int8\n");
|
||||
return nullptr;
|
||||
}
|
||||
::memcpy(result->alpha.get(), quan->alpha()->data(), quan->alpha()->size() * sizeof(float));
|
||||
{
|
||||
int outputCount = 0;
|
||||
bool oldType4 = (quan->type() == 4 && quan->aMin() == 0 && std::abs(quan->quantScale()) < 1e-6);
|
||||
if (quan->readType() != 0 || oldType4) {
|
||||
result->asymmetric = true;
|
||||
outputCount = result->alpha.size() / 2;
|
||||
} else {
|
||||
result->asymmetric = false;
|
||||
outputCount = result->alpha.size(); // backward compability with previous symmetric quantization
|
||||
}
|
||||
if (result->asymmetric) {
|
||||
// clampMin is minVal in asymmetric quant, clampMin = -(2^(bit))
|
||||
// and old version clampMin is -128
|
||||
float clampMin = quan->aMin() == 0 ? -128 : quan->aMin();
|
||||
for (int o = 0; o < outputCount; ++o) {
|
||||
result->alpha.get()[2 * o] = result->alpha.get()[2 * o] - clampMin * result->alpha.get()[2 * o + 1];
|
||||
}
|
||||
}
|
||||
if (!quan->has_scaleInt()) {
|
||||
float extraFactor = quan->quantScale();
|
||||
// for old type 4 models, their quan->quantScale is 0. which will introduce a bug here
|
||||
if (oldType4) {
|
||||
extraFactor = 1.0f;
|
||||
}
|
||||
for (int o=0; o<result->alpha.size(); ++o) {
|
||||
result->alpha.get()[o] *= extraFactor;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (forceInt8) {
|
||||
return result;
|
||||
}
|
||||
|
@ -424,42 +508,30 @@ std::shared_ptr<ConvolutionCommon::Int8Common> ConvolutionCommon::load(const IDS
|
|||
return nullptr;
|
||||
}
|
||||
int outputCount = 0;
|
||||
bool oldType4 = (quan->type() == 4 && quan->aMin() == 0 && std::abs(quan->quantScale()) < 1e-6);
|
||||
if (quan->readType() != 0 || oldType4) {
|
||||
outputCount = result->alpha.size() / 2;
|
||||
if (result->asymmetric) {
|
||||
outputCount = result->alpha.size() / 2;
|
||||
} else {
|
||||
outputCount = result->alpha.size(); // backward compability with previous symmetric quantization
|
||||
outputCount = result->alpha.size();
|
||||
}
|
||||
int partWeightSize = weightLength / outputCount;
|
||||
for (int o = 0; o < outputCount; ++o) {
|
||||
float min = 0.0f;
|
||||
float alpha = 0.0f;
|
||||
if (result->asymmetric) {
|
||||
min = result->alpha.get()[2*o];
|
||||
alpha = result->alpha.get()[2*o+1];
|
||||
} else {
|
||||
alpha = result->alpha.get()[o];
|
||||
}
|
||||
auto dstW = result->weightFloat.get() + o * partWeightSize;
|
||||
auto srcW = result->weight.get() + o * partWeightSize;
|
||||
float extraFactor = quan->quantScale();
|
||||
// for old type 4 models, their quan->quantScale is 0. which will introduce a bug here
|
||||
if (oldType4) {
|
||||
extraFactor = 1.0f;
|
||||
}
|
||||
if (result->alpha.size() == 2 * outputCount) {
|
||||
float min = result->alpha.get()[2*o];
|
||||
float alpha = result->alpha.get()[2*o+1];
|
||||
// clampMin is minVal in asymmetric quant, clampMin = -(2^(bit))
|
||||
// and old version clampMin is -128
|
||||
float clampMin = quan->aMin() == 0 ? -128 : quan->aMin();
|
||||
for (int j = 0; j < partWeightSize; ++j) {
|
||||
dstW[j] = (( (float)srcW[j] - clampMin ) * alpha + min) * extraFactor;
|
||||
}
|
||||
} else {
|
||||
float alpha = result->alpha.get()[o];
|
||||
for (int j = 0; j < partWeightSize; ++j) {
|
||||
dstW[j] = ((float)srcW[j]) * alpha * extraFactor;
|
||||
}
|
||||
for (int v=0; v < partWeightSize; ++v) {
|
||||
dstW[v] = (float)srcW[v] * alpha + min;
|
||||
}
|
||||
}
|
||||
|
||||
result->weight.release();
|
||||
result->alpha.release();
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
|
|
@ -19,6 +19,10 @@ public:
|
|||
AutoStorage<float> alpha;
|
||||
AutoStorage<float> weightFloat;
|
||||
const IDSTQuan* quan;
|
||||
bool asymmetric;
|
||||
std::vector<int8_t> weightMap;
|
||||
std::vector<uint8_t> weightReverseMap;
|
||||
bool canUseInt4 = false;
|
||||
};
|
||||
static std::shared_ptr<Int8Common> load(const IDSTQuan* quan, bool forceFloat = false, bool forceInt8 = false);
|
||||
static void getConvParameters(std::shared_ptr<ConvolutionCommon::Int8Common> *quanCommon, const MNN::Convolution2D *conv2d, const float** originWeight, int* originWeightSize);
|
||||
|
|
|
@ -189,6 +189,7 @@ Pipeline::Pipeline(Schedule::PipelineInfo&& info, bool allocInput, bool outputSt
|
|||
#else
|
||||
{
|
||||
#endif
|
||||
rt->onCheckInfo(info.first.info);
|
||||
mRuntime = rt;
|
||||
mCpuRuntime = cpuRt;
|
||||
mTuneAttr = tune;
|
||||
|
|
|
@ -266,7 +266,16 @@ bool Session::getInfo(Interpreter::SessionInfoCode code, void* ptr) const {
|
|||
} else {
|
||||
*dst = 0;
|
||||
}
|
||||
return true;
|
||||
} break;
|
||||
case Interpreter::THREAD_NUMBER: {
|
||||
auto dst = (int*)ptr;
|
||||
if (mPipelines.empty()) {
|
||||
break;
|
||||
}
|
||||
*dst = mPipelines[0]->getPipelineInfo().first.info.numThread;
|
||||
return true;
|
||||
}
|
||||
// TODO: Support other debug info
|
||||
default:
|
||||
break;
|
||||
|
|
|
@ -399,17 +399,21 @@ bool TensorUtils::isDepthToSpaceRegions(const Tensor* output) {
|
|||
}
|
||||
|
||||
// compute offset through region
|
||||
static inline int offsetCompute(Tensor::InsideDescribe::Region reg, int offset, bool backward) {
|
||||
static inline int offsetCompute(const Tensor::InsideDescribe::Region& reg, int offset, bool backward) {
|
||||
Tensor::InsideDescribe::View src;
|
||||
Tensor::InsideDescribe::View dst;
|
||||
if (backward) {
|
||||
auto tmp = reg.src;
|
||||
reg.src = reg.dst;
|
||||
reg.dst = tmp;
|
||||
src = reg.dst;
|
||||
dst = reg.src;
|
||||
} else {
|
||||
src = reg.src;
|
||||
dst = reg.dst;
|
||||
}
|
||||
int res = 0;
|
||||
for (int i = 0; i < 3; i++) {
|
||||
if (reg.size[i] > 1) {
|
||||
res += offset / reg.src.stride[i] * reg.dst.stride[i];
|
||||
offset %= reg.src.stride[i];
|
||||
res += offset / src.stride[i] * dst.stride[i];
|
||||
offset %= src.stride[i];
|
||||
}
|
||||
}
|
||||
return res;
|
||||
|
@ -461,6 +465,23 @@ bool TensorUtils::refTensorContent(Tensor* dst, const Tensor* src) {
|
|||
return needMalloc;
|
||||
}
|
||||
|
||||
static bool _RegionValid(int* stride, int offset, int* size, int sizeNum, size_t limitSize) {
|
||||
int maxOffset = offset;
|
||||
int minOffset = offset;
|
||||
// Check start and end
|
||||
for (int i=0; i<sizeNum; ++i) {
|
||||
if (stride[i] > 0) {
|
||||
maxOffset += (stride[i] * (size[i] - 1));
|
||||
} else {
|
||||
minOffset += (stride[i] * (size[i] - 1));
|
||||
}
|
||||
}
|
||||
if (minOffset < 0 || maxOffset >= limitSize) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// fuse srcRegion and dstRegion to dstRegion if return true
|
||||
bool TensorUtils::fuseRegion(Tensor::InsideDescribe::Region& srcReg, Tensor::InsideDescribe::Region& dstReg) {
|
||||
// src data isnot full data of dst
|
||||
|
@ -573,6 +594,14 @@ bool TensorUtils::fuseRegion(Tensor::InsideDescribe::Region& srcReg, Tensor::Ins
|
|||
}
|
||||
// set final size and set expandIdx if expand val is 1
|
||||
int expandIdx = -1;
|
||||
int newSrcOffset = offsetCompute(srcReg, dstReg.src.offset - srcReg.dst.offset, true) + srcReg.src.offset;
|
||||
if (nullptr != srcReg.origin) {
|
||||
bool valid = _RegionValid(newSrc, newSrcOffset, dstSize, dstNum, TensorUtils::getRawSize(srcReg.origin));
|
||||
if (!valid) {
|
||||
// Exceed src range
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (dstNum > sizeNum) {
|
||||
for (int i = 2; i >= 0; i--) {
|
||||
if (i < dstNum) {
|
||||
|
@ -654,7 +683,7 @@ bool TensorUtils::fuseRegion(Tensor::InsideDescribe::Region& srcReg, Tensor::Ins
|
|||
}
|
||||
}
|
||||
dstReg.origin = srcReg.origin;
|
||||
dstReg.src.offset = offsetCompute(srcReg, dstReg.src.offset - srcReg.dst.offset, true) + srcReg.src.offset;
|
||||
dstReg.src.offset = newSrcOffset;
|
||||
return true;
|
||||
}
|
||||
void TensorUtils::adjustTensorForCompability(Tensor* newTensor) {
|
||||
|
@ -680,70 +709,6 @@ Tensor::DimensionType TensorUtils::getDimType(const Tensor* t) {
|
|||
return Tensor::TENSORFLOW;
|
||||
}
|
||||
|
||||
halide_type_t TensorUtils::DataTypeToHalideType(DataType t) {
|
||||
switch (t) {
|
||||
case DataType_DT_DOUBLE:
|
||||
case DataType_DT_FLOAT:
|
||||
return halide_type_of<float>();
|
||||
case DataType_DT_BFLOAT16:
|
||||
return halide_type_t(halide_type_float, 16);
|
||||
case DataType_DT_QINT32:
|
||||
case DataType_DT_INT32:
|
||||
case DataType_DT_BOOL:
|
||||
case DataType_DT_INT64:
|
||||
return halide_type_of<int32_t>();
|
||||
case DataType_DT_QINT8:
|
||||
case DataType_DT_INT8:
|
||||
return halide_type_of<int8_t>();
|
||||
case DataType_DT_QUINT8:
|
||||
case DataType_DT_UINT8:
|
||||
return halide_type_of<uint8_t>();
|
||||
case DataType_DT_QUINT16:
|
||||
case DataType_DT_UINT16:
|
||||
return halide_type_of<uint16_t>();
|
||||
case DataType_DT_QINT16:
|
||||
case DataType_DT_INT16:
|
||||
return halide_type_of<int16_t>();
|
||||
case DataType_DT_STRING:
|
||||
default:
|
||||
MNN_PRINT("Unsupported data type!");
|
||||
MNN_ASSERT(false);
|
||||
return halide_type_of<float>();
|
||||
}
|
||||
}
|
||||
|
||||
DataType TensorUtils::HaildeTypeToDataType(halide_type_t t) {
|
||||
if (t == halide_type_of<int8_t>()) {
|
||||
return DataType_DT_INT8;
|
||||
}
|
||||
if (t == halide_type_of<int16_t>()) {
|
||||
return DataType_DT_INT16;
|
||||
}
|
||||
if (t == halide_type_of<int32_t>()) {
|
||||
return DataType_DT_INT32;
|
||||
}
|
||||
if (t == halide_type_of<int64_t>()) {
|
||||
return DataType_DT_INT64;
|
||||
}
|
||||
if (t == halide_type_of<uint8_t>()) {
|
||||
return DataType_DT_UINT8;
|
||||
}
|
||||
if (t == halide_type_of<uint16_t>()) {
|
||||
return DataType_DT_UINT16;
|
||||
}
|
||||
if (t == halide_type_t(halide_type_float, 16)) {
|
||||
return DataType_DT_BFLOAT16;
|
||||
}
|
||||
if (t == halide_type_of<float>()) {
|
||||
return DataType_DT_FLOAT;
|
||||
}
|
||||
if (t == halide_type_of<double>()) {
|
||||
return DataType_DT_DOUBLE;
|
||||
}
|
||||
MNN_PRINT("Unsupported data type!");
|
||||
MNN_ASSERT(false);
|
||||
return DataType_DT_INVALID;
|
||||
}
|
||||
std::vector<float> TensorUtils::getQuantInfo(const Tensor* t) {
|
||||
float scale = getDescribe(t)->quantAttr ? getDescribe(t)->quantAttr->scale : 0.0f;
|
||||
float zero = getDescribe(t)->quantAttr ? getDescribe(t)->quantAttr->zero : 0.0f;
|
||||
|
|
|
@ -163,8 +163,6 @@ public:
|
|||
static bool fuseRegion(Tensor::InsideDescribe::Region& srcReg, Tensor::InsideDescribe::Region& dstReg);
|
||||
static void adjustTensorForCompability(Tensor* t);
|
||||
static Tensor::DimensionType getDimType(const Tensor* t);
|
||||
static halide_type_t DataTypeToHalideType(DataType t);
|
||||
static DataType HaildeTypeToDataType(halide_type_t t);
|
||||
static std::vector<float> getQuantInfo(const Tensor* t);
|
||||
|
||||
static size_t getRawSize(const Tensor* t);
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#include <algorithm>
|
||||
#include "geometry/GeometryComputer.hpp"
|
||||
#include "core/TensorUtils.hpp"
|
||||
namespace MNN {
|
||||
|
@ -21,7 +22,6 @@ public:
|
|||
MNN_ASSERT(input->dimensions() >= 1);
|
||||
MNN_ASSERT(output->dimensions() == input->dimensions());
|
||||
auto originTensor = input;
|
||||
int basicOffset = 0;
|
||||
int shape[MNN_MAX_TENSOR_DIM];
|
||||
if (op->type() == OpType_Permute) {
|
||||
auto shapeValue = op->main_as_Permute()->dims();
|
||||
|
@ -53,6 +53,7 @@ public:
|
|||
continue;
|
||||
}
|
||||
if (axis - preAxis == 1) {
|
||||
// Fuse dimension if possible
|
||||
inputShape[inputShapeSize - 1] *= len;
|
||||
} else {
|
||||
if (preAxis >= 0) {
|
||||
|
@ -89,7 +90,18 @@ public:
|
|||
stride *= inputShape[i];
|
||||
}
|
||||
}
|
||||
int basicStride = 1;
|
||||
// Sort inputShapeSize from small to large
|
||||
if (inputShapeSize > 3) {
|
||||
for (int i=0; i<inputShapeSize; ++i) {
|
||||
for (int j=i+1; j<inputShapeSize; ++j) {
|
||||
if (inputShape[i] > inputShape[j]) {
|
||||
std::swap(inputShape[i], inputShape[j]);
|
||||
std::swap(inputStrides[i], inputStrides[j]);
|
||||
std::swap(outputStrides[i], outputStrides[j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Compute inside, outside, axis
|
||||
int inside = 1;
|
||||
int insideStride = 0;
|
||||
|
@ -99,18 +111,24 @@ public:
|
|||
int axisStride = 0;
|
||||
int breakAxis = -1;
|
||||
int remainSize = 1;
|
||||
int outputInsideStride = 0;
|
||||
int outputAxisStride = 0;
|
||||
int outputOutsideStride = 0;
|
||||
{
|
||||
if (inputShapeSize >= 1) {
|
||||
inside = inputShape[inputShapeSize-1];
|
||||
insideStride = inputStrides[inputShapeSize-1];
|
||||
outputInsideStride = outputStrides[inputShapeSize-1];
|
||||
}
|
||||
if (inputShapeSize >= 2) {
|
||||
axis = inputShape[inputShapeSize-2];
|
||||
axisStride = inputStrides[inputShapeSize-2];
|
||||
outputAxisStride = outputStrides[inputShapeSize-2];
|
||||
}
|
||||
if (inputShapeSize >= 3) {
|
||||
outside = inputShape[inputShapeSize-3];
|
||||
outsideStride = inputStrides[inputShapeSize-3];
|
||||
outputOutsideStride = outputStrides[inputShapeSize-3];
|
||||
breakAxis = inputShapeSize - 3;
|
||||
for (int i = 0; i < inputShapeSize - 3; ++i) {
|
||||
remainSize *= inputShape[i];
|
||||
|
@ -130,24 +148,26 @@ public:
|
|||
for (int indice = 0; indice < remainSize; ++indice) {
|
||||
int value = indice;
|
||||
int inputOffset = 0;
|
||||
int outputOffset = 0;
|
||||
for (int i = 0; i < breakAxis; ++i) {
|
||||
auto coordinate = value / mod[i];
|
||||
inputOffset += coordinate * inputStrides[i];
|
||||
outputOffset += coordinate * outputStrides[i];
|
||||
value = value % mod[i];
|
||||
}
|
||||
Tensor::InsideDescribe::Region& slice = outputDes->regions[indice];
|
||||
slice.src.offset = inputOffset + basicOffset;
|
||||
slice.src.stride[0] = outsideStride * basicStride;
|
||||
slice.src.offset = inputOffset;
|
||||
slice.src.stride[0] = outsideStride;
|
||||
slice.size[0] = outside;
|
||||
slice.src.stride[1] = axisStride * basicStride;
|
||||
slice.src.stride[1] = axisStride;
|
||||
slice.size[1] = axis;
|
||||
slice.src.stride[2] = insideStride * basicStride;
|
||||
slice.src.stride[2] = insideStride;
|
||||
slice.size[2] = inside;
|
||||
slice.origin = originTensor;
|
||||
slice.dst.offset = indice * outside * axis * inside;
|
||||
slice.dst.stride[0] = axis * inside;
|
||||
slice.dst.stride[1] = inside;
|
||||
slice.dst.stride[2] = 1;
|
||||
slice.dst.offset = outputOffset;
|
||||
slice.dst.stride[0] = outputOutsideStride;
|
||||
slice.dst.stride[1] = outputAxisStride;
|
||||
slice.dst.stride[2] = outputInsideStride;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -67,6 +67,8 @@ public:
|
|||
};
|
||||
for (int i = 0; i < N; i++) {
|
||||
Region src, dst;
|
||||
src.origin = nullptr;
|
||||
dst.origin = nullptr;
|
||||
::memcpy(&src, data[3 * i], 44);
|
||||
::memcpy(&dst, data[3 * i + 1], 44);
|
||||
bool fused = TensorUtils::fuseRegion(src, dst);
|
||||
|
|
|
@ -68,6 +68,7 @@ public:
|
|||
}
|
||||
|
||||
virtual bool run(int precision) {
|
||||
int numberThread = 0;
|
||||
MNN::BackendConfig bnConfig;
|
||||
auto exe = Executor::newExecutor(MNN_FORWARD_CPU, bnConfig, 1);
|
||||
ExecutorScope scope(exe);
|
||||
|
@ -77,10 +78,31 @@ public:
|
|||
auto y = _ReduceSum(_Multiply(x, x), {});
|
||||
::memset(x->writeMap<float>(), 0, x->getInfo()->size * sizeof(float));
|
||||
y->readMap<float>();
|
||||
auto res = Executor::getComputeInfo(y->expr().first, MNN::Interpreter::THREAD_NUMBER, &numberThread);
|
||||
if (numberThread != 4 || res == false) {
|
||||
FUNC_PRINT(1);
|
||||
return false;
|
||||
}
|
||||
|
||||
exe->setGlobalExecutorConfig(MNN_FORWARD_CPU, bnConfig, 4);
|
||||
::memset(x->writeMap<float>(), 0, x->getInfo()->size * sizeof(float));
|
||||
y->readMap<float>();
|
||||
res = Executor::getComputeInfo(y->expr().first, MNN::Interpreter::THREAD_NUMBER, &numberThread);
|
||||
if (numberThread != 4 || res == false) {
|
||||
FUNC_PRINT(1);
|
||||
return false;
|
||||
}
|
||||
exe->setGlobalExecutorConfig(MNN_FORWARD_CPU, bnConfig, 1);
|
||||
// Reset x, y
|
||||
x = _Input({1, 3, 224, 224}, NC4HW4);
|
||||
y = _ReduceSum(_Multiply(x, x), {});
|
||||
::memset(x->writeMap<float>(), 0, x->getInfo()->size * sizeof(float));
|
||||
y->readMap<float>();
|
||||
res = Executor::getComputeInfo(y->expr().first, MNN::Interpreter::THREAD_NUMBER, &numberThread);
|
||||
if (numberThread != 1 || res == false) {
|
||||
FUNC_PRINT(1);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
|
|
@ -689,9 +689,18 @@ public:
|
|||
auto bufferOutput = builderOutput.GetBufferPointer();
|
||||
std::shared_ptr<Interpreter> net(Interpreter::createFromBuffer((void*)bufferOutput, sizeOutput), Interpreter::destroy);
|
||||
ScheduleConfig config;
|
||||
config.numThread = 1;
|
||||
int runTime = 5;
|
||||
auto s0 = net->createSession(config);
|
||||
{
|
||||
AUTOTIME;
|
||||
for (int t = 0; t < runTime; ++t) {
|
||||
net->runSession(s0);
|
||||
}
|
||||
}
|
||||
net->releaseSession(s0);
|
||||
config.numThread = 4;
|
||||
auto s1 = net->createSession(config);
|
||||
int runTime = 10;
|
||||
{
|
||||
AUTOTIME;
|
||||
for (int t = 0; t < runTime; ++t) {
|
||||
|
@ -699,7 +708,6 @@ public:
|
|||
}
|
||||
}
|
||||
net->releaseSession(s1);
|
||||
net = nullptr;
|
||||
std::vector<std::thread> allThreads;
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
allThreads.emplace_back(std::thread([runTime, i, bufferOutput, sizeOutput] {
|
||||
|
@ -722,6 +730,31 @@ public:
|
|||
for (auto& t : allThreads) {
|
||||
t.join();
|
||||
}
|
||||
for (int i=0; i<3; ++i) {
|
||||
auto rt = Interpreter::createRuntime({config});
|
||||
auto s0 = net->createSession(config, rt);
|
||||
auto s1 = net->createSession(config, rt);
|
||||
int numberThread = 0;
|
||||
net->getSessionInfo(s0, MNN::Interpreter::THREAD_NUMBER, &numberThread);
|
||||
if (numberThread != 4) {
|
||||
FUNC_PRINT(i);
|
||||
return false;
|
||||
}
|
||||
net->getSessionInfo(s1, MNN::Interpreter::THREAD_NUMBER, &numberThread);
|
||||
if (numberThread != 4) {
|
||||
FUNC_PRINT(i);
|
||||
return false;
|
||||
}
|
||||
{
|
||||
AUTOTIME;
|
||||
for (int t = 0; t < runTime; ++t) {
|
||||
net->runSession(s0);
|
||||
}
|
||||
}
|
||||
net->releaseSession(s0);
|
||||
net->releaseSession(s1);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
virtual bool run(int precision) {
|
||||
|
|
|
@ -42,6 +42,7 @@ int main(int argc, char* argv[]) {
|
|||
MNN::BackendConfig config;
|
||||
config.precision = (MNN::BackendConfig::PrecisionMode)precision;
|
||||
MNN::Express::Executor::getGlobalExecutor()->setGlobalExecutorConfig(type, config, thread);
|
||||
FUNC_PRINT(thread);
|
||||
precisionInTestUtil = getTestPrecision(type, config.precision, MNN::Express::Executor::getGlobalExecutor()->getCurrentRuntimeStatus(MNN::STATUS_SUPPORT_FP16));
|
||||
MNN_PRINT("After update, precision in TestUtil:%d\n", precisionInTestUtil);
|
||||
}
|
||||
|
|
|
@ -19,7 +19,8 @@ static void fillFloat(float* dst, int h, int w, ConvertFP32 functor, float offse
|
|||
for (int y = 0; y < h; ++y) {
|
||||
auto dstY = dst + w * y;
|
||||
for (int x = 0; x < w; ++x) {
|
||||
dstY[x] = functor((float)x * 0.1f + (float)y + offset);
|
||||
int temp = (x + y) % 31;
|
||||
dstY[x] = functor(((float)temp + offset) * 0.01f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -38,7 +39,7 @@ static bool checkMatMul(const float* C, const float* A, const float* B, int e, i
|
|||
}
|
||||
expected = functor(expected);
|
||||
auto diff = fabsf(expected - computed);
|
||||
if (diff > 0.1f) {
|
||||
if (diff / fabsf(expected) > 0.005f) {
|
||||
MNN_PRINT("%f -> %f\n", expected, computed);
|
||||
res = false;
|
||||
}
|
||||
|
@ -270,6 +271,50 @@ public:
|
|||
}
|
||||
}
|
||||
}
|
||||
// BatchMatMul batch = 1 with large K
|
||||
{
|
||||
std::vector<std::vector<int>> values = {
|
||||
{16, 262144, 15},
|
||||
{3, 262144, 16}
|
||||
};
|
||||
for(auto value : values) {
|
||||
e = value[0];
|
||||
l = value[1];
|
||||
h = value[2];
|
||||
|
||||
std::unique_ptr<MNN::OpT> op(new MNN::OpT);
|
||||
op->type = MNN::OpType_BatchMatMul;
|
||||
op->main.type = MNN::OpParameter_BatchMatMulParam;
|
||||
op->main.value = new MNN::BatchMatMulParamT;
|
||||
auto param = op->main.AsBatchMatMulParam();
|
||||
param->adjX = false;
|
||||
param->adjY = true;
|
||||
|
||||
int batch = 1;
|
||||
auto x0 = _Input({}, NHWC, halide_type_of<float>());
|
||||
auto x1 = _Input({}, NHWC, halide_type_of<float>());
|
||||
x0->resize({batch, h, l});
|
||||
x1->resize({batch, l, e});
|
||||
auto x0Ptr = x0->writeMap<float>();
|
||||
auto x1Ptr = x1->writeMap<float>();
|
||||
for (int b = 0; b < batch; ++b) {
|
||||
fillFloat(x0Ptr + b * h * l, h, l, FP32Converter[precision], (float)b * 10);
|
||||
fillFloat(x1Ptr + b * e * l, l, e, FP32Converter[precision], (float)b * 10);
|
||||
}
|
||||
auto tranposeB = _Transpose(x1, {0, 2, 1});
|
||||
auto y = Variable::create(Expr::create(op.get(), {x0, tranposeB}));
|
||||
|
||||
auto yPtr = y->readMap<float>();
|
||||
for (int b = 0; b < batch; ++b) {
|
||||
auto res = checkMatMul(yPtr + b * e * h, x0Ptr + b * h * l, x1Ptr + b * e * l, e, l, h, FP32Converter[precision]);
|
||||
if (!res) {
|
||||
FUNC_PRINT(1);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
|
|
@ -71,6 +71,7 @@ protected:
|
|||
for (int i = 0; i < size_out; ++i) {
|
||||
auto error = (int32_t)data_out[i] - (int32_t)gotOutput[i];
|
||||
if (error * error > 1) {
|
||||
MNN_PRINT("Error case = %d:\n", i);
|
||||
MNN_PRINT("%s Test error: compute result=%d, right value=%d\n", name.c_str(), (int32_t)gotOutput[i], (int32_t)data_out[i]);
|
||||
return false;
|
||||
}
|
||||
|
@ -88,7 +89,7 @@ class AddTest : public BinaryTestCommon {
|
|||
public:
|
||||
virtual ~AddTest() = default;
|
||||
virtual bool run(int precision) {
|
||||
return test<float, float>(_Add, "AddTest", 0.01,
|
||||
return test<float, float>(MNN::Express::_Add, "AddTest", 0.01,
|
||||
{-1.0, -2.0, -3.0, -4.0}, {1.0, 2.0, 3.0, 4.0}, {0.0, 0.0, 0.0, 0.0},
|
||||
{4}, {4}, {4});
|
||||
}
|
||||
|
@ -101,7 +102,7 @@ class AddInt8Test : public BinaryTestCommon {
|
|||
vector<float> inp2 = {1.1, 2.2, 3.3, 4.6}, inp1 = {2};
|
||||
vector<float> rightResult = {3.1, 4.2, 5.3, 6.6};
|
||||
|
||||
return test<float, float>(_Add, "AddInt8Test", 0.01, inp1, inp2, rightResult, {1}, {4}, {4}, {0.4, 0.4, 0.4},
|
||||
return test<float, float>(MNN::Express::_Add, "AddInt8Test", 0.01, inp1, inp2, rightResult, {1}, {4}, {4}, {0.4, 0.4, 0.4},
|
||||
{0., 0., 0.});
|
||||
}
|
||||
};
|
||||
|
@ -110,7 +111,7 @@ class SubtractTest : public BinaryTestCommon {
|
|||
public:
|
||||
virtual ~SubtractTest() = default;
|
||||
virtual bool run(int precision) {
|
||||
return test<float, float>(_Subtract, "SubtractTest", 0.01,
|
||||
return test<float, float>(MNN::Express::_Subtract, "SubtractTest", 0.01,
|
||||
{-1.0, -2.0, -3.0, -4.0}, {1.0, 2.0, 3.0, 4.0}, {-2.0, -4.0, -6.0, -8.0},
|
||||
{4}, {4}, {4});
|
||||
}
|
||||
|
@ -119,11 +120,11 @@ class SubtractInt8Test : public BinaryTestCommon {
|
|||
public:
|
||||
virtual ~SubtractInt8Test() = default;
|
||||
virtual bool run(int precision) {
|
||||
vector<float> inp1 = {1.1, 2.2, 3.3, 4.6, 1.1, 2.2, 3.3, 4.6,1.1, 2.2, 3.3, 4.6,1.1, 2.2, 3.3, 4.6}, inp2 = {5.7};
|
||||
vector<float> rightResult = {-4.6, -3.5, -2.4, -1.1, -4.6, -3.5, -2.4, -1.1, -4.6, -3.5, -2.4,
|
||||
vector<float> inp1 = {7.0, 28.2, 3.3, 4.6, 1.1, 2.2, 3.3, 4.6,1.1, 2.2, 3.3, 4.6,1.1, 2.2, 3.3, 4.6}, inp2 = {5.7};
|
||||
vector<float> rightResult = {1.3, 22.5, -2.4, -1.1, -4.6, -3.5, -2.4, -1.1, -4.6, -3.5, -2.4,
|
||||
-1.1, -4.6, -3.5, -2.4, -1.1};
|
||||
|
||||
return test<float, float>(_Subtract, "SubtractInt8Test", 0.01, inp1, inp2, rightResult,
|
||||
return test<float, float>(MNN::Express::_Subtract, "SubtractInt8Test", 0.01, inp1, inp2, rightResult,
|
||||
{4, 4}, {1}, {4, 4}, {0.4, 0.4, 0.4}, {0., 0., 0.});
|
||||
}
|
||||
};
|
||||
|
@ -132,7 +133,7 @@ class MultiplyTest : public BinaryTestCommon {
|
|||
public:
|
||||
virtual ~MultiplyTest() = default;
|
||||
virtual bool run(int precision) {
|
||||
return test<float, float>(_Multiply, "MultiplyTest", 0.01,
|
||||
return test<float, float>(MNN::Express::_Multiply, "MultiplyTest", 0.01,
|
||||
{-1.0, -2.0, -3.0, -4.0}, {1.0, 2.0, 3.0, 4.0}, {-1.0, -4.0, -9.0, -16.0},
|
||||
{4}, {4}, {4});
|
||||
}
|
||||
|
@ -143,7 +144,7 @@ public:
|
|||
virtual bool run(int precision) {
|
||||
vector<float> inp1 = {1.1, 2.2, 3.3, 4.6}, inp2 = {5.7, 2.5, 0.25, 0.43};
|
||||
vector<float> rightResult = {6.27 , 5.5 , 0.825, 1.978};
|
||||
return test<float, float>(_Multiply, "MultiplyInt8Test", 0.01, inp1, inp2, rightResult,
|
||||
return test<float, float>(MNN::Express::_Multiply, "MultiplyInt8Test", 0.01, inp1, inp2, rightResult,
|
||||
{4}, {4}, {4}, {0.4, 0.4, 0.16}, {0., 0., 0.});
|
||||
}
|
||||
};
|
||||
|
@ -152,7 +153,7 @@ class DivideTest : public BinaryTestCommon {
|
|||
public:
|
||||
virtual ~DivideTest() = default;
|
||||
virtual bool run(int precision) {
|
||||
return test<float, float>(_Divide, "DivideTest", 0.01,
|
||||
return test<float, float>(MNN::Express::_Divide, "DivideTest", 0.01,
|
||||
{-1.0, -2.0, -3.0, -4.0}, {2.0, 4.0, 6.0, 8.0}, {-0.5, -0.5, -0.5, -0.5},
|
||||
{4}, {4}, {4});
|
||||
}
|
||||
|
@ -163,7 +164,7 @@ public:
|
|||
virtual bool run(int precision) {
|
||||
vector<float> inp1 = {1.1, 2.2, 3.3, 4.6}, inp2 = {5.7, 2.5, 2.6, 1.88};
|
||||
vector<float> rightResult = {0.19298, 0.88, 1.269, 2.4468};
|
||||
return test<float, float>(_Divide, "DivideInt8Test", 0.01, inp1, inp2, rightResult,
|
||||
return test<float, float>(MNN::Express::_Divide, "DivideInt8Test", 0.01, inp1, inp2, rightResult,
|
||||
{4}, {4}, {4}, {0.4, 0.4, 1.0}, {0., 0., 0.});
|
||||
}
|
||||
};
|
||||
|
@ -173,7 +174,7 @@ public:
|
|||
virtual ~PowTest() = default;
|
||||
virtual bool run(int precision) {
|
||||
float errorScale = precision <= MNN::BackendConfig::Precision_High ? 1 : 10;
|
||||
return test<float, float>(_Pow, "PowTest", 0.01 * errorScale,
|
||||
return test<float, float>(MNN::Express::_Pow, "PowTest", 0.01 * errorScale,
|
||||
{-1.0, -2.0, -3.0, -4.0}, {2.0, 4.0, 6.0, 4.0}, {1.0, 16.0, 729.0, 256.0},
|
||||
{4}, {4}, {4});
|
||||
}
|
||||
|
@ -182,10 +183,10 @@ class PowInt8Test : public BinaryTestCommon {
|
|||
public:
|
||||
virtual ~PowInt8Test() = default;
|
||||
virtual bool run(int precision) {
|
||||
vector<float> inp1 = {-1.0, -2.0, -3.0, -4.0}, inp2 = {2.0, 4.0, 2, 4.0};
|
||||
vector<float> rightResult = {1, 16, 8, 0};
|
||||
return test<float, float>(_Pow, "PowInt8Test", 0.01, inp1, inp2, rightResult,
|
||||
{4}, {4}, {4}, {1.0, 1.0, 1.0}, {0., 0., 0.});
|
||||
vector<float> inp1 = {-1.0, -2.0, -3.0, -4.0}, inp2 = {2.0, 4.0, 3, 4.0};
|
||||
vector<float> rightResult = {1, 16, -27.0, 256};
|
||||
return test<float, float>(MNN::Express::_Pow, "PowInt8Test", 0.01, inp1, inp2, rightResult,
|
||||
{4}, {4}, {4}, {1.0, 1.0, 3.0}, {0., 0., 0.});
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -193,7 +194,7 @@ class MinimumTest : public BinaryTestCommon {
|
|||
public:
|
||||
virtual ~MinimumTest() = default;
|
||||
virtual bool run(int precision) {
|
||||
return test<float, float>(_Minimum, "MinimumTest", 0.01,
|
||||
return test<float, float>(MNN::Express::_Minimum, "MinimumTest", 0.01,
|
||||
{-1.0, -2.0, -3.0, -4.0}, {1.0, 2.0, 3.0, 4.0}, {-1.0, -2.0, -3.0, -4.0},
|
||||
{4}, {4}, {4});
|
||||
}
|
||||
|
@ -204,7 +205,7 @@ public:
|
|||
virtual bool run(int precision) {
|
||||
vector<float> inp1 = {-1.2, -5.0, 8, 10}, inp2 = {9.3, 3.1, 11.0, 2.9};
|
||||
vector<float> rightResult = {-1.2, -5.0, 8, 2.9};
|
||||
return test<float, float>(_Minimum, "MinimumInt8Test", 0.01, inp1, inp2, rightResult,
|
||||
return test<float, float>(MNN::Express::_Minimum, "MinimumInt8Test", 0.01, inp1, inp2, rightResult,
|
||||
{4}, {4}, {4}, {0.4, 0.4, 0.4}, {0., 0., 0.});
|
||||
}
|
||||
};
|
||||
|
@ -224,7 +225,7 @@ public:
|
|||
virtual bool run(int precision) {
|
||||
vector<float> inp1 = {-1, -5, 8, 10}, inp2 = {9};
|
||||
vector<float> rightResult = {9, 9, 9, 10};
|
||||
return test<float, float>(_Maximum, "MaximumInt8Test", 0.01, inp1, inp2, rightResult,
|
||||
return test<float, float>(MNN::Express::_Maximum, "MaximumInt8Test", 0.01, inp1, inp2, rightResult,
|
||||
{4}, {1}, {4}, {0.4, 0.4, 0.4}, {0., 0., 0.});
|
||||
}
|
||||
};
|
||||
|
@ -233,7 +234,7 @@ class BiasAddTest : public BinaryTestCommon {
|
|||
public:
|
||||
virtual ~BiasAddTest() = default;
|
||||
virtual bool run(int precision) {
|
||||
return test<float, float>(_BiasAdd, "BiasAddTest", 0.01,
|
||||
return test<float, float>(MNN::Express::_BiasAdd, "BiasAddTest", 0.01,
|
||||
{-1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0},
|
||||
{1.0, 2.0},
|
||||
{0.0, 0.0, -2.0, -2.0, -4.0, -4.0, -6.0, -6.0},
|
||||
|
@ -244,7 +245,7 @@ class GreaterTest : public BinaryTestCommon {
|
|||
public:
|
||||
virtual ~GreaterTest() = default;
|
||||
virtual bool run(int precision) {
|
||||
return test<float, int>(_Greater, "GreaterTest", 0,
|
||||
return test<float, int>(MNN::Express::_Greater, "GreaterTest", 0,
|
||||
{1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0},
|
||||
{3.0, 4.0},
|
||||
{0, 0, 0, 0, 1, 1, 1, 1},
|
||||
|
@ -255,7 +256,7 @@ class GreaterEqualTest : public BinaryTestCommon {
|
|||
public:
|
||||
virtual ~GreaterEqualTest() = default;
|
||||
virtual bool run(int precision) {
|
||||
return test<float, int>(_GreaterEqual, "GreaterEqualTest", 0,
|
||||
return test<float, int>(MNN::Express::_GreaterEqual, "GreaterEqualTest", 0,
|
||||
{1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0},
|
||||
{3.0, 4.0},
|
||||
{0, 0, 1, 1, 1, 1, 1, 1},
|
||||
|
@ -266,7 +267,7 @@ class LessTest : public BinaryTestCommon {
|
|||
public:
|
||||
virtual ~LessTest() = default;
|
||||
virtual bool run(int precision) {
|
||||
return test<float, int>(_Less, "LessTest", 0,
|
||||
return test<float, int>(MNN::Express::_Less, "LessTest", 0,
|
||||
{1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0},
|
||||
{3.0, 4.0},
|
||||
{1, 1, 0, 0, 0, 0, 0, 0},
|
||||
|
@ -277,7 +278,7 @@ class FloorDivTest : public BinaryTestCommon {
|
|||
public:
|
||||
virtual ~FloorDivTest() = default;
|
||||
virtual bool run(int precision) {
|
||||
return test<float, float>(_FloorDiv, "FloorDivTest", 0.01,
|
||||
return test<float, float>(MNN::Express::_FloorDiv, "FloorDivTest", 0.01,
|
||||
{-1.0, -2.0, -3.0, -4.0, 5.0, 6.0, 7.0, 8.1},
|
||||
{3.0, 4.0},
|
||||
{-1.0, -1.0, -1.0, -1.0, 1.0, 1.0, 2.0, 2.0},
|
||||
|
@ -290,7 +291,7 @@ public:
|
|||
virtual bool run(int precision) {
|
||||
vector<float> inp1 = {-3.98, 17.5, 25.4, 6.7}, inp2 = {3};
|
||||
vector<float> rightResult = {-2, 5, 8, 2};
|
||||
return test<float, float>(_FloorDiv, "FloorDivInt8Test", 0.01, inp1, inp2, rightResult,
|
||||
return test<float, float>(MNN::Express::_FloorDiv, "FloorDivInt8Test", 0.01, inp1, inp2, rightResult,
|
||||
{4}, {1}, {4}, {0.4, 0.4, 1}, {0., 0., 0.});
|
||||
}
|
||||
};
|
||||
|
@ -327,7 +328,7 @@ public:
|
|||
z[i + j * 2] = FP32Converter[precision](fmodf(FP32Converter[precision](x[i+j*2]), FP32Converter[precision](y[i])));
|
||||
}
|
||||
}
|
||||
return test<float, float>(_Mod, "ModTestFloat", 0,
|
||||
return test<float, float>(MNN::Express::_Mod, "ModTestFloat", 0,
|
||||
x,y,z,
|
||||
{4, 2}, {2}, {4, 2});
|
||||
}
|
||||
|
@ -336,7 +337,7 @@ class SquaredDifferenceTest : public BinaryTestCommon {
|
|||
public:
|
||||
virtual ~SquaredDifferenceTest() = default;
|
||||
virtual bool run(int precision) {
|
||||
return test<float, float>(_SquaredDifference, "SquaredDifferenceTest", 0.01,
|
||||
return test<float, float>(MNN::Express::_SquaredDifference, "SquaredDifferenceTest", 0.01,
|
||||
{-1.0, -2.0, -3.0, -4.0, 5.0, 6.0, 7.0, 8.001},
|
||||
{3.0, 4.0},
|
||||
{16.0, 36.0, 36.0, 64.0, 4.0, 4.0, 16.0, 16.0},
|
||||
|
@ -349,7 +350,7 @@ public:
|
|||
virtual bool run(int precision) {
|
||||
vector<float> inp1 = {-1, -2, -3, -4, 5, 6, 7, 8, -1, -2, -3, -4, 5, 6, 7, 8, -1, -2, -3, -4, 5, 6, 7, 8, -1, -2, -3, -4, 5, 6, 7, 8}, inp2 = {3};
|
||||
vector<float> rightResult = {16, 25, 36, 49, 4, 9, 16, 25, 16, 25, 36, 49, 4, 9, 16, 25, 16, 25, 36, 49, 4, 9, 16, 25, 16, 25, 36, 49, 4, 9, 16, 25};
|
||||
return test<float, float>(_SquaredDifference, "SquaredDifferenceInt8Test", 0.01, inp1, inp2, rightResult,
|
||||
return test<float, float>(MNN::Express::_SquaredDifference, "SquaredDifferenceInt8Test", 0.01, inp1, inp2, rightResult,
|
||||
{8, 4}, {1}, {8, 4}, {1, 1, 1}, {0., 0., 0.});
|
||||
}
|
||||
};
|
||||
|
@ -358,7 +359,7 @@ class EqualTest : public BinaryTestCommon {
|
|||
public:
|
||||
virtual ~EqualTest() = default;
|
||||
virtual bool run(int precision) {
|
||||
return test<float, int>(_Equal, "EqualTest", 0,
|
||||
return test<float, int>(MNN::Express::_Equal, "EqualTest", 0,
|
||||
{1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0},
|
||||
{3.0, 4.0},
|
||||
{0, 0, 1, 1, 0, 0, 0, 0},
|
||||
|
@ -380,7 +381,7 @@ class FloorModTest : public BinaryTestCommon {
|
|||
public:
|
||||
virtual ~FloorModTest() = default;
|
||||
virtual bool run(int precision) {
|
||||
return test<float, float>(_FloorMod, "FloorModTest", 0.01,
|
||||
return test<float, float>(MNN::Express::_FloorMod, "FloorModTest", 0.01,
|
||||
{-1.0f, -2.0f, -3.0f, -4.0f, 5.0f, 6.0f, 7.0f, 8.1f},
|
||||
{3.0f, 4.0f},
|
||||
{2.0f, 2.0f, 0.0f, 0.0f, 2.0f, 2.0f, 1.0f, 0.1f},
|
||||
|
@ -391,7 +392,7 @@ class FloorModInt8Test : public BinaryTestCommon {
|
|||
public:
|
||||
virtual ~FloorModInt8Test() = default;
|
||||
virtual bool run(int precision) {
|
||||
return test<float, float>(_FloorMod, "FloorModInt8Test", 0.01,
|
||||
return test<float, float>(MNN::Express::_FloorMod, "FloorModInt8Test", 0.01,
|
||||
{-1, -3, 5, 7},
|
||||
{3.0f}, {2, 0, 2, 1},
|
||||
{4}, {1}, {4}, {0.3, 0.3, 0.3}, {0., 0., 0.});
|
||||
|
@ -401,7 +402,7 @@ class Atan2Test : public BinaryTestCommon {
|
|||
public:
|
||||
virtual ~Atan2Test() = default;
|
||||
virtual bool run(int precision) {
|
||||
return test<float, float>(_Atan2, "Atan2Test", 0.01,
|
||||
return test<float, float>(MNN::Express::_Atan2, "Atan2Test", 0.01,
|
||||
{-1.0, -2.0, -3.0, -4.0, 5.0, 6.0, 7.0, 8.0},
|
||||
{3.0, -4.0},
|
||||
{-0.32175055, -2.67794504, -0.7853982, -2.35619449, 1.0303768, 2.15879893, 1.1659045, 2.03444394},
|
||||
|
@ -412,7 +413,7 @@ class Atan2Int8Test : public BinaryTestCommon {
|
|||
public:
|
||||
virtual ~Atan2Int8Test() = default;
|
||||
virtual bool run(int precision) {
|
||||
return test<float, float>(_Atan2, "Atan2Int8Test", 0.01,
|
||||
return test<float, float>(MNN::Express::_Atan2, "Atan2Int8Test", 0.01,
|
||||
{-1, -3, 5, 7},
|
||||
{3}, {-1, 0, 2, 1},
|
||||
{4}, {1}, {4}, {1, 1, 1}, {0., 0., 0.});
|
||||
|
@ -523,7 +524,7 @@ public:
|
|||
virtual bool run(int precision) {
|
||||
vector<int> data_x(8, 1), data_y(8, 1), data_out(64, 2);
|
||||
vector<int> shape_x = {4, 1, 2, 1}, shape_y = {2, 1, 4}, shape_out = {4, 2, 2, 4};
|
||||
return test<int, int>(_Add, "BinaryBroadcastShapeTest", 0,
|
||||
return test<int, int>(MNN::Express::_Add, "BinaryBroadcastShapeTest", 0,
|
||||
data_x, data_y, data_out, shape_x, shape_y, shape_out);
|
||||
}
|
||||
};
|
||||
|
@ -546,7 +547,7 @@ public:
|
|||
data_out[j + i * 560] = func(data_x[j] - data_y[j + i * 560]);
|
||||
}
|
||||
}
|
||||
return test<float, float>(_Subtract, "SubtractBroastTest", 0.01,
|
||||
return test<float, float>(MNN::Express::_Subtract, "SubtractBroastTest", 0.01,
|
||||
data_x, data_y, data_out, shape_x, shape_y, shape_out);
|
||||
}
|
||||
};
|
||||
|
|
|
@ -212,9 +212,13 @@ VARP _Conv(std::vector<float>&& weight, std::vector<float>&& bias, VARP x, INTS
|
|||
conv2D->common->kernelY = kernelSize[1];
|
||||
conv2D->common->relu6 = relu6;
|
||||
conv2D->common->relu = relu;
|
||||
MNN_ASSERT(weight.size() == channel[1] * (channel[0] / group) * kernelSize[0] * kernelSize[1]);
|
||||
conv2D->weight = std::move(weight);
|
||||
MNN_ASSERT(bias.size() == channel[1]);
|
||||
conv2D->bias = std::move(bias);
|
||||
if (sparese) {
|
||||
size_t weightNNZElement, weightBlockNumber = 0;
|
||||
CommonCompute::statisticWeightSparsity(weightNNZElement, weightBlockNumber, weight.data(), bias.size(), weight.size() / bias.size(), sparseBlockOC);
|
||||
CommonCompute::statisticWeightSparsity(weightNNZElement, weightBlockNumber, conv2D->weight.data(), conv2D->bias.size(), conv2D->weight.size() / conv2D->bias.size(), sparseBlockOC);
|
||||
|
||||
std::unique_ptr<MNN::AttributeT> arg1(new MNN::AttributeT);
|
||||
arg1->key = "sparseBlockOC";
|
||||
|
@ -250,11 +254,8 @@ VARP _Conv(std::vector<float>&& weight, std::vector<float>&& bias, VARP x, INTS
|
|||
auto sparseComPtr = flatbuffers::GetRoot<MNN::SparseCommon>(builder.GetBufferPointer())->UnPack();
|
||||
|
||||
conv2D->sparseParameter.reset(sparseComPtr);
|
||||
CommonCompute::compressFloatWeightToSparse(convOp.get());
|
||||
}
|
||||
MNN_ASSERT(weight.size() == channel[1] * (channel[0] / group) * kernelSize[0] * kernelSize[1]);
|
||||
conv2D->weight = std::move(weight);
|
||||
MNN_ASSERT(bias.size() == channel[1]);
|
||||
conv2D->bias = std::move(bias);
|
||||
return (Variable::create(Expr::create(convOp.get(), {x})));
|
||||
}
|
||||
|
||||
|
|
|
@ -6,12 +6,22 @@
// Copyright © 2018, Alibaba Group Holding Limited
//

#include <algorithm>
#include "CommonUtils.hpp"
#include "common/CommonCompute.hpp"
#include "backend/cpu/compute/SparseConvolutionTiledExecutor.hpp"

using namespace MNN;
static inline std::vector<float> getSparsityThreshold() {
    // sparsity threshold values when the sparse block size is
    // {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
    return {1.f, 0.6f, 0.5f, 0.4f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f, 0.3f};
}
static bool inline shouldUseSparseConvolution(float sparsity, int sparseBlockOC) {
    std::vector<float> thresholds = getSparsityThreshold();
    return sparsity > thresholds[std::min(std::max(sparseBlockOC, 0), (int)thresholds.size() - 1)];
}
void AddSparseInfo(std::unique_ptr<MNN::OpT>& op, Compression::Pipeline proto) {
    auto prune_algo_type = MNN::SparseAlgo_RANDOM;
    int sparseBlockOC = 1;

@ -41,10 +51,10 @@ void AddSparseInfo(std::unique_ptr<MNN::OpT>& op, Compression::Pipeline proto) {
            size_t weightNNZElement, weightBlockNumber = 0;
            CommonCompute::statisticWeightSparsity(weightNNZElement, weightBlockNumber, param->weight.data(), biasSize, weightSize / biasSize, sparseBlockOC);
            float sparsity = 1. - double(weightNNZElement) / weightSize;
            // MNN_PRINT(" opname [%s] sparsity is:%f\n", op->name.c_str(), sparsity);
            if (!SparseConvolutionTiledExecutor::shouldUseSparseConvolution(sparsity, sparseBlockOC)) {
            if (!shouldUseSparseConvolution(sparsity, sparseBlockOC)) {
                return;
            }
            // MNN_PRINT(" opname [%s] sparsity is:%f, use sparse\n", op->name.c_str(), sparsity);

            MNN::AttributeT* arg1(new MNN::AttributeT);
            arg1->key = "sparseBlockOC";

@ -74,6 +84,7 @@ void AddSparseInfo(std::unique_ptr<MNN::OpT>& op, Compression::Pipeline proto) {
            argsVector.emplace_back(sparseArg3);
            argsVector.emplace_back(sparseArg4);

            // sparseArgs need sorted table, can't use obj interface
            auto sparseArgs = builder.CreateVectorOfSortedTables<MNN::Attribute>(&argsVector);
            auto sparseCom = MNN::CreateSparseCommon(builder, prune_algo_type, sparseArgs);
            builder.Finish(sparseCom);

@ -81,6 +92,10 @@ void AddSparseInfo(std::unique_ptr<MNN::OpT>& op, Compression::Pipeline proto) {

            param->sparseParameter.reset(sparseComPtr);

            delete arg1;
            delete arg2;
            delete arg3;
            delete arg4;
            break;
        }
        default:

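The converter change above decides per op whether sparse metadata is worth emitting by comparing measured weight sparsity against a block-size-dependent threshold. The sketch below is a self-contained illustration of that gating idea on plain arrays; it is not MNN's `CommonCompute::statisticWeightSparsity`, and the block-counting convention used here is an assumption chosen to keep the example short.

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Scan blocks of `blockOC` consecutive output channels at the same kernel
// position; a block counts as zero only if every element in it is zero.
// Trailing channels that do not fill a whole block are ignored in this sketch.
static void statisticSparsity(size_t& nnzElement, size_t& blockNumber,
                              const float* weight, int outputCount, int reduceDim, int blockOC) {
    nnzElement = 0;
    blockNumber = 0;
    for (int oc = 0; oc + blockOC <= outputCount; oc += blockOC) {
        for (int r = 0; r < reduceDim; ++r) {
            bool allZero = true;
            for (int b = 0; b < blockOC; ++b) {
                if (weight[(oc + b) * reduceDim + r] != 0.f) {
                    allZero = false;
                }
            }
            ++blockNumber;
            if (!allZero) {
                nnzElement += blockOC;
            }
        }
    }
}

static bool shouldUseSparse(float sparsity, int sparseBlockOC) {
    // Same shape as the table in the hunk above: larger blocks tolerate lower sparsity.
    std::vector<float> thresholds = {1.f, 0.6f, 0.5f, 0.4f, 0.3f};
    int idx = std::min(std::max(sparseBlockOC, 0), (int)thresholds.size() - 1);
    return sparsity > thresholds[idx];
}

// sparsity = 1 - nnz / total, then gate on the block-dependent threshold.
bool decide(const std::vector<float>& weight, int outputCount, int reduceDim, int blockOC) {
    size_t nnz = 0, blocks = 0;
    statisticSparsity(nnz, blocks, weight.data(), outputCount, reduceDim, blockOC);
    float sparsity = 1.f - float(nnz) / float(weight.size());
    return shouldUseSparse(sparsity, blockOC);
}
```
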
@ -0,0 +1,367 @@
|
|||
//
|
||||
// ChannelPruneConvert.cpp
|
||||
// MNNConverter
|
||||
//
|
||||
// Created by MNN on 2023/05/05.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#include "CommonUtils.hpp"
|
||||
#include "MNN/expr/ExprCreator.hpp"
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <algorithm>
|
||||
|
||||
using namespace MNN;
|
||||
using namespace MNN::Express;
|
||||
using namespace std;
|
||||
|
||||
// TODO: add more unsafe ops
|
||||
static std::vector<MNN::OpType> unSafeOpTypes = {
|
||||
OpType_BroadcastTo, OpType_BatchToSpaceND, OpType_Concat, OpType_LSTM, OpType_LSTMBlockCell, OpType_Reshape, OpType_Resize,
|
||||
OpType_RNN, OpType_RNNSequenceGRU, OpType_ScatterNd, OpType_Slice, OpType_SliceTf, OpType_SpaceToBatchND, OpType_Raster,
|
||||
};
|
||||
|
||||
struct TensorMaskInfo {
|
||||
std::vector<int> mask; // per-channel 1 or 0
|
||||
std::string oriConvName;
|
||||
};
|
||||
|
||||
std::vector<MNN::OpT*> findUserOps(int outputIndex, std::unique_ptr<MNN::NetT>& netT, SubGraphProtoT* subgraph) {
|
||||
std::vector<MNN::OpT*> userOps;
|
||||
if (subgraph) {
|
||||
for (auto& subOp : subgraph->nodes) {
|
||||
for (int inputIndex : subOp->inputIndexes) {
|
||||
if (inputIndex == outputIndex) {
|
||||
userOps.push_back(subOp.get());
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (auto& netOp : netT->oplists) {
|
||||
for (int inputIndex : netOp->inputIndexes) {
|
||||
if (inputIndex == outputIndex) {
|
||||
userOps.push_back(netOp.get());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return userOps;
|
||||
}
|
||||
|
||||
// do the actual channel prune on weights and bias
|
||||
void channelPrune(std::unique_ptr<MNN::OpT>& op, std::unique_ptr<MNN::NetT>& netT, SubGraphProtoT* subgraph, std::map<std::string, TensorMaskInfo>& tensorMaskInfo) {
|
||||
auto opType = op->type;
|
||||
if (opType != OpType_Convolution && opType != OpType_ConvolutionDepthwise && opType != OpType_Deconvolution && opType != OpType_DeconvolutionDepthwise && opType != OpType_BatchNorm) {
|
||||
return;
|
||||
}
|
||||
if (op->inputIndexes.size() != 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
int inputIndex = op->inputIndexes[0];
|
||||
int outputIndex = op->outputIndexes[0];
|
||||
std::string inputTensorName = subgraph ? subgraph->tensors[inputIndex] : netT->tensorName[inputIndex];
|
||||
std::string outputTensorName = subgraph ? subgraph->tensors[outputIndex] : netT->tensorName[outputIndex];
|
||||
|
||||
std::vector<int> inputMask = tensorMaskInfo[inputTensorName].mask;
|
||||
int inputMaskSum = 0;
|
||||
for (int i = 0; i < inputMask.size(); i++) {
|
||||
inputMaskSum += inputMask[i];
|
||||
}
|
||||
|
||||
if (opType == OpType_BatchNorm) {
|
||||
if (!(inputMaskSum < inputMask.size())) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto bnParams = op->main.AsBatchNorm();
|
||||
auto slopFloat = bnParams->slopeData;
|
||||
auto biasFloat = bnParams->biasData;
|
||||
auto meanFloat = bnParams->meanData;
|
||||
auto varianceFloat = bnParams->varData;
|
||||
|
||||
bnParams->slopeData.clear();
|
||||
bnParams->biasData.clear();
|
||||
bnParams->meanData.clear();
|
||||
bnParams->varData.clear();
|
||||
|
||||
for (int i = 0; i < varianceFloat.size(); i++) {
|
||||
if (inputMask[i] == 1) {
|
||||
bnParams->slopeData.push_back(slopFloat[i]);
|
||||
bnParams->biasData.push_back(biasFloat[i]);
|
||||
bnParams->meanData.push_back(meanFloat[i]);
|
||||
bnParams->varData.push_back(varianceFloat[i]);
|
||||
}
|
||||
}
|
||||
bnParams->channels = inputMaskSum;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
auto convParams = op->main.AsConvolution2D();
|
||||
auto weightFloat = convParams->weight;
|
||||
auto biasFloat = convParams->bias;
|
||||
auto& common = convParams->common;
|
||||
|
||||
int ko = common->outputCount;
|
||||
int ki = common->inputCount / common->group;
|
||||
int kh = common->kernelY;
|
||||
int kw = common->kernelX;
|
||||
|
||||
std::vector<int> opMask;
|
||||
for (auto info : tensorMaskInfo) {
|
||||
if (op->name == info.second.oriConvName) {
|
||||
opMask = info.second.mask;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
int opMaskSum = 0;
|
||||
for (int i = 0; i < opMask.size(); i++) {
|
||||
opMaskSum += opMask[i];
|
||||
}
|
||||
|
||||
if (opMaskSum < opMask.size()) {
|
||||
convParams->weight.clear();
|
||||
convParams->bias.clear();
|
||||
|
||||
for (int i = 0; i < ko; i++) {
|
||||
int offset = i * ki * kh * kw;
|
||||
if (opMask[i] == 1) {
|
||||
for (int j = 0; j < ki * kh * kw; j++) {
|
||||
convParams->weight.emplace_back(weightFloat[offset + j]);
|
||||
}
|
||||
convParams->bias.emplace_back(biasFloat[i]);
|
||||
}
|
||||
}
|
||||
common->outputCount = opMaskSum;
|
||||
}
|
||||
|
||||
if (inputMaskSum < inputMask.size()) {
|
||||
auto weightFloat = convParams->weight;
|
||||
convParams->weight.clear();
|
||||
|
||||
int ko = common->outputCount;
|
||||
int ki = common->inputCount / common->group;
|
||||
int kh = common->kernelY;
|
||||
int kw = common->kernelX;
|
||||
|
||||
for (int i = 0; i < ko; i++) {
|
||||
for (int j = 0; j < ki; j++) {
|
||||
int offset = i * ki * kh * kw + j * kh * kw;
|
||||
if (inputMask[j] == 1) {
|
||||
for (int k = 0; k < kh * kw; k++) {
|
||||
convParams->weight.emplace_back(weightFloat[offset + k]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
common->inputCount = inputMaskSum;
|
||||
|
||||
// we will not do prune for depthwise, its channel pruning only depends on its input tensor's pruning
|
||||
if (opType == OpType_ConvolutionDepthwise || opType == OpType_DeconvolutionDepthwise) {
|
||||
common->outputCount = inputMaskSum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// propagate and analyze prune mask info in model
|
||||
void analyzePruneInfo(std::unique_ptr<MNN::OpT>& op, std::unique_ptr<MNN::NetT>& netT, SubGraphProtoT* subgraph, std::map<std::string, TensorMaskInfo>& tensorMaskInfo, std::set<std::string>& notSafeConvNames) {
|
||||
auto opType = op->type;
|
||||
auto inputIndices = op->inputIndexes;
|
||||
if (inputIndices.size() == 0) {
|
||||
return;
|
||||
}
|
||||
auto outputIndices = op->outputIndexes;
|
||||
std::vector<std::string> inputTensorNames;
|
||||
for (int i = 0; i < inputIndices.size(); i++) {
|
||||
inputTensorNames.push_back(subgraph ? subgraph->tensors[inputIndices[i]] : netT->tensorName[inputIndices[i]]);
|
||||
}
|
||||
std::vector<std::string> outputTensorNames;
|
||||
for (int i = 0; i < outputIndices.size(); i++) {
|
||||
outputTensorNames.push_back(subgraph ? subgraph->tensors[outputIndices[i]] : netT->tensorName[outputIndices[i]]);
|
||||
}
|
||||
|
||||
if (opType == OpType_Convolution || opType == OpType_Deconvolution) {
|
||||
if (inputIndices.size() == 1) {
|
||||
auto convParams = op->main.AsConvolution2D();
|
||||
auto weightFloat = convParams->weight;
|
||||
auto biasFloat = convParams->bias;
|
||||
auto& common = convParams->common;
|
||||
|
||||
const int ko = common->outputCount;
|
||||
const int ki = common->inputCount / common->group;
|
||||
const int kh = common->kernelY;
|
||||
const int kw = common->kernelX;
|
||||
|
||||
VARP weightVar = _Const(weightFloat.data(), {ko, ki, kh, kw}, NCHW);
|
||||
|
||||
VARP weightMask = _Greater(_ReduceSum(_Abs(weightVar), {1, 2, 3}), _Scalar<float>(1e-6));
|
||||
VARP maskSum = _ReduceSum(weightMask);
|
||||
auto maskInfo = weightMask->getInfo();
|
||||
auto maskPtr = weightMask->readMap<int>();
|
||||
|
||||
if (maskSum->readMap<int>()[0] == maskInfo->size) {
|
||||
return;
|
||||
}
|
||||
|
||||
// conv has pruned, propagate its mask down
|
||||
tensorMaskInfo[outputTensorNames[0]].oriConvName = op->name;
|
||||
for (int i = 0; i < maskInfo->size; i++) {
|
||||
tensorMaskInfo[outputTensorNames[0]].mask.push_back(maskPtr[i]);
|
||||
}
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<MNN::OpType>::iterator iter;
|
||||
iter = std::find(unSafeOpTypes.begin(), unSafeOpTypes.end(), opType);
|
||||
// not safe op and num_outputs > 1 op are not safe
|
||||
if ((iter != unSafeOpTypes.end()) || (outputTensorNames.size() > 1)) {
|
||||
for (auto name : inputTensorNames) {
|
||||
if (!tensorMaskInfo[name].oriConvName.empty()) {
|
||||
// so that input tensor mask's oriConv op is not safe
|
||||
notSafeConvNames.insert(tensorMaskInfo[name].oriConvName);
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// when a mask is propagated to the output, its oriConv ops are not safe
|
||||
std::vector<MNN::OpT*> userOps = findUserOps(outputIndices[0], netT, subgraph);
|
||||
if (userOps.size() == 0) {
|
||||
for (auto name : inputTensorNames) {
|
||||
if (!tensorMaskInfo[name].oriConvName.empty()) {
|
||||
notSafeConvNames.insert(tensorMaskInfo[name].oriConvName);
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// if the op has more than one input (including const input)
|
||||
// we need its input tensor's masks are all from one oriConv op
|
||||
if (inputIndices.size() > 1) {
|
||||
std::string oriConvName;
|
||||
std::string oriTensorName;
|
||||
for (auto name : inputTensorNames) {
|
||||
if (!tensorMaskInfo[name].oriConvName.empty()) {
|
||||
oriConvName = tensorMaskInfo[name].oriConvName;
|
||||
oriTensorName = name;
|
||||
}
|
||||
}
|
||||
if (oriConvName.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
// oriConvName is not empty
|
||||
bool unsafe = false;
|
||||
for (auto name : inputTensorNames) {
|
||||
auto tOriName = tensorMaskInfo[name].oriConvName;
|
||||
if ((tOriName != oriConvName) && (!tOriName.empty())) {
|
||||
unsafe = true;
|
||||
}
|
||||
}
|
||||
|
||||
// if unsafe, all its input tensor mask's oriConvs are not safe
|
||||
if (unsafe) {
|
||||
for (auto name : inputTensorNames) {
|
||||
auto tOriName = tensorMaskInfo[name].oriConvName;
|
||||
if (!tOriName.empty()) {
|
||||
notSafeConvNames.insert(tOriName);
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// if safe, propagate mask down
|
||||
tensorMaskInfo[outputTensorNames[0]].oriConvName = oriConvName;
|
||||
tensorMaskInfo[outputTensorNames[0]].mask = tensorMaskInfo[oriTensorName].mask;
|
||||
return;
|
||||
}
|
||||
|
||||
// for 1 input and 1 output safe op, propagate mask down
|
||||
tensorMaskInfo[outputTensorNames[0]].oriConvName = tensorMaskInfo[inputTensorNames[0]].oriConvName;
|
||||
tensorMaskInfo[outputTensorNames[0]].mask = tensorMaskInfo[inputTensorNames[0]].mask;
|
||||
}
|
||||
|
||||
void channelPruneConvert(std::unique_ptr<MNN::NetT>& netT, MNN::Compression::Pipeline proto) {
|
||||
bool filterPruned = false;
|
||||
for (const auto& algo : proto.algo()) {
|
||||
if (algo.type() == Compression::CompressionAlgo::PRUNE) {
|
||||
auto prune_type = algo.prune_params().type();
|
||||
auto prune_algo_type = MNN::SparseAlgo(prune_type);
|
||||
if (prune_type == Compression::PruneParams_PruneType_FILTER) {
|
||||
filterPruned = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!filterPruned) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::map<std::string, TensorMaskInfo> netMaskInfo;
|
||||
for (auto tensorName : netT->tensorName) {
|
||||
netMaskInfo[tensorName] = TensorMaskInfo();
|
||||
}
|
||||
|
||||
std::set<std::string> notSafeConvNames;
|
||||
for (auto& op : netT->oplists) {
|
||||
analyzePruneInfo(op, netT, nullptr, netMaskInfo, notSafeConvNames);
|
||||
}
|
||||
|
||||
std::set<std::string>::iterator iter;
|
||||
if (!notSafeConvNames.empty()) {
|
||||
for (auto& info : netMaskInfo) {
|
||||
iter = std::find(notSafeConvNames.begin(), notSafeConvNames.end(), info.second.oriConvName);
|
||||
if (iter != notSafeConvNames.end()) {
|
||||
for (int i = 0; i < info.second.mask.size(); i++) {
|
||||
if (info.second.mask[i] == 0) {
|
||||
info.second.mask[i] = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (auto& op : netT->oplists) {
|
||||
channelPrune(op, netT, nullptr, netMaskInfo);
|
||||
}
|
||||
|
||||
|
||||
for (auto& subgraph : netT->subgraphs) {
|
||||
std::map<std::string, TensorMaskInfo> subgraphMaskInfo;
|
||||
for (auto tensorName : subgraph->tensors) {
|
||||
subgraphMaskInfo[tensorName] = TensorMaskInfo();
|
||||
}
|
||||
|
||||
std::set<std::string> notSafeConvNames;
|
||||
for (auto& op : subgraph->nodes) {
|
||||
analyzePruneInfo(op, netT, subgraph.get(), subgraphMaskInfo, notSafeConvNames);
|
||||
}
|
||||
|
||||
std::set<std::string>::iterator iter;
|
||||
if (!notSafeConvNames.empty()) {
|
||||
for (auto& info : subgraphMaskInfo) {
|
||||
iter = std::find(notSafeConvNames.begin(), notSafeConvNames.end(), info.second.oriConvName);
|
||||
if (iter != notSafeConvNames.end()) {
|
||||
for (int i = 0; i < info.second.mask.size(); i++) {
|
||||
if (info.second.mask[i] == 0) {
|
||||
info.second.mask[i] = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (auto& op : subgraph->nodes) {
|
||||
channelPrune(op, netT, subgraph.get(), subgraphMaskInfo);
|
||||
}
|
||||
}
|
||||
}
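The new `ChannelPruneConvert.cpp` pass above derives a per-output-channel keep mask from each convolution's weights and then physically drops the pruned channels and the matching input channels downstream. A minimal sketch of just the mask step, assuming plain OIHW-ordered weights and no graph propagation (the helper name and epsilon are illustrative):

```cpp
#include <cmath>
#include <vector>

// Mark output channel o as kept (1) if any weight in its filter has magnitude
// above a small epsilon; filters zeroed out by filter pruning get mask 0 and
// can be removed from weight/bias along with the channel.
std::vector<int> filterKeepMask(const std::vector<float>& weight,
                                int ko, int ki, int kh, int kw, float eps = 1e-6f) {
    std::vector<int> mask(ko, 0);
    const int filterSize = ki * kh * kw;
    for (int o = 0; o < ko; ++o) {
        float absSum = 0.f;
        for (int j = 0; j < filterSize; ++j) {
            absSum += std::fabs(weight[o * filterSize + j]);
        }
        mask[o] = (absSum > eps) ? 1 : 0;
    }
    return mask;
}
```

In the converter itself the same test is expressed with Express ops (`_Greater(_ReduceSum(_Abs(weightVar), {1, 2, 3}), _Scalar<float>(1e-6))`), and a mask is only applied when every consumer of the tensor is considered safe for channel pruning.
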
@ -24,5 +24,6 @@ void addSparseInfo(std::unique_ptr<MNN::NetT>& netT, MNN::Compression::Pipeline
void fullQuantAndCoding(std::unique_ptr<MNN::NetT>& netT, MNN::Compression::Pipeline proto);
void weightQuantAndCoding(std::unique_ptr<MNN::NetT>& netT, const modelConfig& config);
void addUUID(std::unique_ptr<MNN::NetT>& netT, MNN::Compression::Pipeline proto);
void channelPruneConvert(std::unique_ptr<MNN::NetT>& netT, MNN::Compression::Pipeline proto);

#endif // COMMMON_UTILS_HPP

@ -7,6 +7,7 @@
//

#include "CommonUtils.hpp"
#include "common/CommonCompute.hpp"
#include "cpp/IDSTEncoder.hpp"

static float findAbsMax(const float *weights, const int count) {

@ -42,17 +43,26 @@ void WeightQuantAndCoding(std::unique_ptr<MNN::OpT>& op, const modelConfig& conf
    const auto opType = op->type;
    // config.weightQuantBits only control weight quantization for float convolution
    // by default, do coding for convint8 and depthwiseconvint8, if there is any
    if ((config.weightQuantBits == 0) && (
        opType != MNN::OpType_ConvInt8 && opType != MNN::OpType_DepthwiseConvInt8)) {
        return;
    }

    if (opType != MNN::OpType_Convolution && opType != MNN::OpType_ConvolutionDepthwise &&
        opType != MNN::OpType_Deconvolution && opType != MNN::OpType_DeconvolutionDepthwise &&
        opType != MNN::OpType_ConvInt8 && opType != MNN::OpType_DepthwiseConvInt8) {
        return;
    }
    auto param = op->main.AsConvolution2D();
    auto& common = param->common;
    if (param->quanParameter.get() != nullptr) {
        return;
    }

    if (config.weightQuantBits == 0) {
        if (opType == MNN::OpType_ConvInt8 || opType == MNN::OpType_DepthwiseConvInt8) {
            // Do nothing
        } else {
            CommonCompute::compressFloatWeightToSparse(op.get());
            return;
        }
    }
    int bits = 8;
    if ((config.weightQuantBits > 0) && (
        opType != MNN::OpType_ConvInt8 && opType != MNN::OpType_DepthwiseConvInt8)) {

@ -62,12 +72,6 @@ void WeightQuantAndCoding(std::unique_ptr<MNN::OpT>& op, const modelConfig& conf
    bits = std::max(bits, 2);
    bits = std::min(bits, 8);

    auto param = op->main.AsConvolution2D();
    auto& common = param->common;
    if (param->quanParameter.get() != nullptr) {
        return;
    }

    int weightSize = param->weight.size();
    // shared weights or sth else.
    if (weightSize == 0) {

@ -48,7 +48,9 @@ int writeFb(std::unique_ptr<MNN::NetT>& netT, const std::string& MNNModelFile, c
    if (config.benchmarkModel) {
        removeParams(netT);
    }

    if (config.compressionParamsFile != "") {
        channelPruneConvert(netT, proto);
    }
    if (config.saveHalfFloat) {
        castParamsToHalf(netT);
    }

@ -43,7 +43,7 @@ message LayerQuantizeParams {
  optional int32 clamp_min = 4 [default = -128];
  optional int32 clamp_max = 5 [default = 127];
}


message WinogradParams {
  required int32 version = 1 [default = 0];
  // units_attr: {kyStart, kxStart, subKy, subKx, unitY, unitX} x N

@ -80,6 +80,7 @@ message PruneParams {
  enum PruneType {
    RANDOM = 0;
    SIMD_OC = 1;
    FILTER = 2;
  }
  optional PruneType type = 1 [default = RANDOM];
  optional LevelPrunerParams level_pruner_params = 2;

@ -359,25 +359,26 @@ const char descriptor_table_protodef_MNN_5fcompression_2eproto[] PROTOBUF_SECTIO
|
|||
"\030\003 \003(\t\"o\n\022SIMDOCPrunerParams\022\033\n\023weight_t"
|
||||
"ensor_names\030\001 \003(\t\022\024\n\014prune_ratios\030\002 \003(\002\022"
|
||||
"\023\n\013layer_names\030\003 \003(\t\022\021\n\toc_blocks\030\004 \003(\005\""
|
||||
"\366\001\n\013PruneParams\022<\n\004type\030\001 \001(\0162&.MNN.Comp"
|
||||
"\202\002\n\013PruneParams\022<\n\004type\030\001 \001(\0162&.MNN.Comp"
|
||||
"ression.PruneParams.PruneType:\006RANDOM\022\?\n"
|
||||
"\023level_pruner_params\030\002 \001(\0132\".MNN.Compres"
|
||||
"sion.LevelPrunerParams\022B\n\025simd_oc_pruner"
|
||||
"_params\030\003 \001(\0132#.MNN.Compression.SIMDOCPr"
|
||||
"unerParams\"$\n\tPruneType\022\n\n\006RANDOM\020\000\022\013\n\007S"
|
||||
"IMD_OC\020\001\"\362\001\n\017CompressionAlgo\022H\n\004type\030\001 \001"
|
||||
"(\01620.MNN.Compression.CompressionAlgo.Com"
|
||||
"pressionType:\010QUANTIZE\0225\n\014quant_params\030\002"
|
||||
" \001(\0132\037.MNN.Compression.QuantizeParams\0222\n"
|
||||
"\014prune_params\030\003 \001(\0132\034.MNN.Compression.Pr"
|
||||
"uneParams\"*\n\017CompressionType\022\014\n\010QUANTIZE"
|
||||
"\020\000\022\t\n\005PRUNE\020\001\"d\n\010Pipeline\022\026\n\007version\030\001 \002"
|
||||
"(\t:\0050.0.0\022.\n\004algo\030\002 \003(\0132 .MNN.Compressio"
|
||||
"n.CompressionAlgo\022\020\n\010mnn_uuid\030\003 \001(\t"
|
||||
"unerParams\"0\n\tPruneType\022\n\n\006RANDOM\020\000\022\013\n\007S"
|
||||
"IMD_OC\020\001\022\n\n\006FILTER\020\002\"\362\001\n\017CompressionAlgo"
|
||||
"\022H\n\004type\030\001 \001(\01620.MNN.Compression.Compres"
|
||||
"sionAlgo.CompressionType:\010QUANTIZE\0225\n\014qu"
|
||||
"ant_params\030\002 \001(\0132\037.MNN.Compression.Quant"
|
||||
"izeParams\0222\n\014prune_params\030\003 \001(\0132\034.MNN.Co"
|
||||
"mpression.PruneParams\"*\n\017CompressionType"
|
||||
"\022\014\n\010QUANTIZE\020\000\022\t\n\005PRUNE\020\001\"d\n\010Pipeline\022\026\n"
|
||||
"\007version\030\001 \002(\t:\0050.0.0\022.\n\004algo\030\002 \003(\0132 .MN"
|
||||
"N.Compression.CompressionAlgo\022\020\n\010mnn_uui"
|
||||
"d\030\003 \001(\t"
|
||||
;
|
||||
static ::PROTOBUF_NAMESPACE_ID::internal::once_flag descriptor_table_MNN_5fcompression_2eproto_once;
|
||||
const ::PROTOBUF_NAMESPACE_ID::internal::DescriptorTable descriptor_table_MNN_5fcompression_2eproto = {
|
||||
false, false, 1835, descriptor_table_protodef_MNN_5fcompression_2eproto, "MNN_compression.proto",
|
||||
false, false, 1847, descriptor_table_protodef_MNN_5fcompression_2eproto, "MNN_compression.proto",
|
||||
&descriptor_table_MNN_5fcompression_2eproto_once, nullptr, 0, 10,
|
||||
schemas, file_default_instances, TableStruct_MNN_5fcompression_2eproto::offsets,
|
||||
file_level_metadata_MNN_5fcompression_2eproto, file_level_enum_descriptors_MNN_5fcompression_2eproto, file_level_service_descriptors_MNN_5fcompression_2eproto,
|
||||
|
@ -444,6 +445,7 @@ bool PruneParams_PruneType_IsValid(int value) {
|
|||
switch (value) {
|
||||
case 0:
|
||||
case 1:
|
||||
case 2:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
|
@ -453,6 +455,7 @@ bool PruneParams_PruneType_IsValid(int value) {
|
|||
#if (__cplusplus < 201703) && (!defined(_MSC_VER) || (_MSC_VER >= 1900 && _MSC_VER < 1912))
|
||||
constexpr PruneParams_PruneType PruneParams::RANDOM;
|
||||
constexpr PruneParams_PruneType PruneParams::SIMD_OC;
|
||||
constexpr PruneParams_PruneType PruneParams::FILTER;
|
||||
constexpr PruneParams_PruneType PruneParams::PruneType_MIN;
|
||||
constexpr PruneParams_PruneType PruneParams::PruneType_MAX;
|
||||
constexpr int PruneParams::PruneType_ARRAYSIZE;
|
||||
|
|
|
@ -153,11 +153,12 @@ inline bool LayerQuantizeParams_QuantMethod_Parse(
}
enum PruneParams_PruneType : int {
  PruneParams_PruneType_RANDOM = 0,
  PruneParams_PruneType_SIMD_OC = 1
  PruneParams_PruneType_SIMD_OC = 1,
  PruneParams_PruneType_FILTER = 2
};
bool PruneParams_PruneType_IsValid(int value);
constexpr PruneParams_PruneType PruneParams_PruneType_PruneType_MIN = PruneParams_PruneType_RANDOM;
constexpr PruneParams_PruneType PruneParams_PruneType_PruneType_MAX = PruneParams_PruneType_SIMD_OC;
constexpr PruneParams_PruneType PruneParams_PruneType_PruneType_MAX = PruneParams_PruneType_FILTER;
constexpr int PruneParams_PruneType_PruneType_ARRAYSIZE = PruneParams_PruneType_PruneType_MAX + 1;

const ::PROTOBUF_NAMESPACE_ID::EnumDescriptor* PruneParams_PruneType_descriptor();

@ -1935,6 +1936,8 @@ class PruneParams final :
      PruneParams_PruneType_RANDOM;
  static constexpr PruneType SIMD_OC =
      PruneParams_PruneType_SIMD_OC;
  static constexpr PruneType FILTER =
      PruneParams_PruneType_FILTER;
  static inline bool PruneType_IsValid(int value) {
    return PruneParams_PruneType_IsValid(value);
  }

@ -18,16 +18,30 @@ using namespace MNN;

namespace IDSTEncoder {

static void WriteBlobDim(std::ostream &out, std::vector<int> dims)
static bool WriteBlobDim(std::ostream &out, std::vector<int> dims)
{
    char tmp[4];
    bool useInt32 = false;
    ((unsigned char *)tmp)[0] = (unsigned char)dims.size();
    out.write(tmp, 1);
    for (int i = 0; i < dims.size(); i++)
    {
        unsigned short tmpShort = (unsigned short)dims[i];
        out.write((const char*)(&tmpShort), 2);
    for (int i = 0; i < dims.size(); i++) {
        if (dims[i] > ((1<<16)-1)) {
            useInt32 = true;
            break;
        }
    }
    if (useInt32) {
        for (int i = 0; i < dims.size(); i++) {
            unsigned int tmpShort = (unsigned int)dims[i];
            out.write((const char*)(&tmpShort), 4);
        }
    } else {
        for (int i = 0; i < dims.size(); i++) {
            unsigned short tmpShort = (unsigned short)dims[i];
            out.write((const char*)(&tmpShort), 2);
        }
    }
    return useInt32;
}

static void FillBuffer(char *buf, unsigned int buf_len, const char *arr, unsigned int arr_len, unsigned char iNeedBits)

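With this change `WriteBlobDim` reports whether it had to fall back to 32-bit dimensions (any dimension above 65535), and `encode` records that in `idst->shapeInt32`. A reader has to branch on the same flag; the sketch below is a hypothetical decoding counterpart for illustration, not MNN's actual loader:

```cpp
#include <cstdint>
#include <istream>
#include <vector>

// Read the dimension list written by WriteBlobDim: one byte for the rank,
// then either uint32 (if shapeInt32 was set) or uint16 per dimension.
static std::vector<int> ReadBlobDim(std::istream& in, bool shapeInt32) {
    unsigned char rank = 0;
    in.read(reinterpret_cast<char*>(&rank), 1);
    std::vector<int> dims(rank);
    for (int i = 0; i < rank; ++i) {
        if (shapeInt32) {
            uint32_t v = 0;
            in.read(reinterpret_cast<char*>(&v), 4);
            dims[i] = static_cast<int>(v);
        } else {
            uint16_t v = 0;
            in.read(reinterpret_cast<char*>(&v), 2);
            dims[i] = v;
        }
    }
    return dims;
}
```
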
@ -174,7 +188,7 @@ static unsigned int GetBestMaxStep(const float* weightData, int weightSize, unsi
|
|||
return best_nnz;
|
||||
}
|
||||
|
||||
static void WriteCQBlobs(std::ostream &out, const float* weightData, const float* alphaData, int area, int channel, bool asymmetricQuantFlag)
|
||||
static void WriteCQBlobs(std::ostream &out, const float* weightData, const float* alphaData, int area, int channel, bool asymmetricQuantFlag, bool& shapeUseInt32)
|
||||
{
|
||||
//push values into buffer
|
||||
//Find int values in all blobs and check;
|
||||
|
@ -239,7 +253,7 @@ static void WriteCQBlobs(std::ostream &out, const float* weightData, const float
|
|||
{
|
||||
char tmp[100];
|
||||
//1. weights blob shape(unsigned int32)
|
||||
WriteBlobDim(out, {channel, area});
|
||||
shapeUseInt32 = WriteBlobDim(out, {channel, area});
|
||||
// 2. Avalable values Count(unsigned char)
|
||||
tmp[0] = (unsigned char)iCount;
|
||||
out.write(tmp, 1);
|
||||
|
@ -256,7 +270,7 @@ static void WriteCQBlobs(std::ostream &out, const float* weightData, const float
|
|||
delete[] buf;
|
||||
}
|
||||
|
||||
static void WriteSparseQuanBlobs(std::ostream &out, const float* weightData, const float* alphaData, int area, int channel, bool asymmetricQuantFlag)
|
||||
static void WriteSparseQuanBlobs(std::ostream &out, const float* weightData, const float* alphaData, int area, int channel, bool asymmetricQuantFlag, bool& shapeUseInt32)
|
||||
{
|
||||
std::set<int> setWeight;
|
||||
GetWeightSet(setWeight, weightData, alphaData, area, channel, asymmetricQuantFlag);
|
||||
|
@ -358,7 +372,7 @@ static void WriteSparseQuanBlobs(std::ostream &out, const float* weightData, con
|
|||
{ //write
|
||||
char tmp[100];
|
||||
// 1.weights blob shape(unsigned int32)
|
||||
WriteBlobDim(out, {channel, area});
|
||||
shapeUseInt32 = WriteBlobDim(out, {channel, area});
|
||||
// 2. nnz
|
||||
out.write((const char*) &nnz, 4);
|
||||
// 3. max_step use # bits () (unsigned char)
|
||||
|
@ -384,12 +398,14 @@ static void WriteSparseQuanBlobs(std::ostream &out, const float* weightData, con
|
|||
static std::unique_ptr<IDSTQuanT> encode(const std::vector<float>& weight, const std::vector<float>& scale, int kernelSize, int kernelNum,
|
||||
bool asymmetricQuantFlag, const int8_t* quantWeightPtr, const int clampMin) {
|
||||
std::ostringstream outputStringStreamCQ, outputStringStreamSQ;
|
||||
WriteCQBlobs(outputStringStreamCQ, weight.data(), scale.data(), kernelSize, kernelNum, asymmetricQuantFlag);
|
||||
WriteSparseQuanBlobs(outputStringStreamSQ, weight.data(), scale.data(), kernelSize, kernelNum, asymmetricQuantFlag);
|
||||
bool shapeUseInt32 = false;
|
||||
WriteCQBlobs(outputStringStreamCQ, weight.data(), scale.data(), kernelSize, kernelNum, asymmetricQuantFlag, shapeUseInt32);
|
||||
WriteSparseQuanBlobs(outputStringStreamSQ, weight.data(), scale.data(), kernelSize, kernelNum, asymmetricQuantFlag, shapeUseInt32);
|
||||
std::unique_ptr<IDSTQuanT> idst(new IDSTQuanT);
|
||||
auto cqStr = outputStringStreamCQ.str();
|
||||
auto sqStr = outputStringStreamSQ.str();
|
||||
int int8Size = kernelNum * kernelSize;
|
||||
idst->shapeInt32 = shapeUseInt32;
|
||||
if (quantWeightPtr && (int8Size <= cqStr.size() && int8Size <= sqStr.size())) {
|
||||
idst->type = 4;
|
||||
idst->aMax = kernelNum;
|
||||
|
|
|
@ -59,6 +59,7 @@ void Revert::packMNNNet() {
void Revert::initialize(float spasity, int sparseBlockOC, bool rewrite) {
    if (mMNNNet->bizCode == "benchmark" || rewrite) {
        randStart();
        bool useSparse = spasity > 0.5f;
        for (auto& op : mMNNNet->oplists) {
            const auto opType = op->type;
            switch (opType) {

@ -71,51 +72,53 @@ void Revert::initialize(float spasity, int sparseBlockOC, bool rewrite) {
|
|||
const int oc = convCommon->outputCount / convCommon->group;
|
||||
param->weight.resize(oc * weightReduceStride);
|
||||
::memset(param->weight.data(), 0, param->weight.size() * sizeof(float));
|
||||
size_t weightNNZElement, weightBlockNumber = 0;
|
||||
MNN::CommonCompute::fillRandValueAsSparsity(weightNNZElement, weightBlockNumber, param->weight.data(), oc, weightReduceStride, spasity, sparseBlockOC);
|
||||
|
||||
MNN::AttributeT* arg1(new MNN::AttributeT);
|
||||
arg1->key = "sparseBlockOC";
|
||||
arg1->i = sparseBlockOC;
|
||||
|
||||
MNN::AttributeT* arg2(new MNN::AttributeT);
|
||||
arg2->key = "sparseBlockKernel";
|
||||
arg2->i = 1;
|
||||
|
||||
MNN::AttributeT* arg3(new MNN::AttributeT);
|
||||
arg3->key = "NNZElement";
|
||||
arg3->i = weightNNZElement;
|
||||
|
||||
MNN::AttributeT* arg4(new MNN::AttributeT);
|
||||
arg4->key = "blockNumber";
|
||||
arg4->i = weightBlockNumber;
|
||||
|
||||
flatbuffers::FlatBufferBuilder builder;
|
||||
std::vector<flatbuffers::Offset<MNN::Attribute>> argsVector;
|
||||
auto sparseArg1 = MNN::CreateAttribute(builder, arg1);
|
||||
auto sparseArg2 = MNN::CreateAttribute(builder, arg2);
|
||||
auto sparseArg3 = MNN::CreateAttribute(builder, arg3);
|
||||
auto sparseArg4 = MNN::CreateAttribute(builder, arg4);
|
||||
|
||||
argsVector.emplace_back(sparseArg1);
|
||||
argsVector.emplace_back(sparseArg2);
|
||||
argsVector.emplace_back(sparseArg3);
|
||||
argsVector.emplace_back(sparseArg4);
|
||||
|
||||
auto sparseArgs = builder.CreateVectorOfSortedTables<MNN::Attribute>(&argsVector);
|
||||
MNN::SparseAlgo prune_algo_type;
|
||||
if (sparseBlockOC == 4) {
|
||||
prune_algo_type = MNN::SparseAlgo_SIMD_OC;
|
||||
} else {
|
||||
prune_algo_type = MNN::SparseAlgo_RANDOM;
|
||||
}
|
||||
auto sparseCom = MNN::CreateSparseCommon(builder, prune_algo_type, sparseArgs);
|
||||
builder.Finish(sparseCom);
|
||||
auto sparseComPtr = flatbuffers::GetRoot<MNN::SparseCommon>(builder.GetBufferPointer())->UnPack();
|
||||
param->sparseParameter.reset(sparseComPtr);
|
||||
|
||||
param->bias.resize(convCommon->outputCount);
|
||||
::memset(param->bias.data(), 0, param->bias.size() * sizeof(float));
|
||||
if (useSparse) {
|
||||
size_t weightNNZElement, weightBlockNumber = 0;
|
||||
MNN::CommonCompute::fillRandValueAsSparsity(weightNNZElement, weightBlockNumber, param->weight.data(), oc, weightReduceStride, spasity, sparseBlockOC);
|
||||
|
||||
MNN::AttributeT* arg1(new MNN::AttributeT);
|
||||
arg1->key = "sparseBlockOC";
|
||||
arg1->i = sparseBlockOC;
|
||||
|
||||
MNN::AttributeT* arg2(new MNN::AttributeT);
|
||||
arg2->key = "sparseBlockKernel";
|
||||
arg2->i = 1;
|
||||
|
||||
MNN::AttributeT* arg3(new MNN::AttributeT);
|
||||
arg3->key = "NNZElement";
|
||||
arg3->i = weightNNZElement;
|
||||
|
||||
MNN::AttributeT* arg4(new MNN::AttributeT);
|
||||
arg4->key = "blockNumber";
|
||||
arg4->i = weightBlockNumber;
|
||||
|
||||
flatbuffers::FlatBufferBuilder builder;
|
||||
std::vector<flatbuffers::Offset<MNN::Attribute>> argsVector;
|
||||
auto sparseArg1 = MNN::CreateAttribute(builder, arg1);
|
||||
auto sparseArg2 = MNN::CreateAttribute(builder, arg2);
|
||||
auto sparseArg3 = MNN::CreateAttribute(builder, arg3);
|
||||
auto sparseArg4 = MNN::CreateAttribute(builder, arg4);
|
||||
|
||||
argsVector.emplace_back(sparseArg1);
|
||||
argsVector.emplace_back(sparseArg2);
|
||||
argsVector.emplace_back(sparseArg3);
|
||||
argsVector.emplace_back(sparseArg4);
|
||||
|
||||
auto sparseArgs = builder.CreateVectorOfSortedTables<MNN::Attribute>(&argsVector);
|
||||
MNN::SparseAlgo prune_algo_type;
|
||||
if (sparseBlockOC == 4) {
|
||||
prune_algo_type = MNN::SparseAlgo_SIMD_OC;
|
||||
} else {
|
||||
prune_algo_type = MNN::SparseAlgo_RANDOM;
|
||||
}
|
||||
auto sparseCom = MNN::CreateSparseCommon(builder, prune_algo_type, sparseArgs);
|
||||
builder.Finish(sparseCom);
|
||||
auto sparseComPtr = flatbuffers::GetRoot<MNN::SparseCommon>(builder.GetBufferPointer())->UnPack();
|
||||
param->sparseParameter.reset(sparseComPtr);
|
||||
MNN::CommonCompute::compressFloatWeightToSparse(op.get());
|
||||
}
|
||||
break;
|
||||
}
|
||||
case MNN::OpType_Scale: {
|
||||
|
|
|
@ -270,9 +270,12 @@ if __name__ == '__main__':
t = TestModel(modelName)
if len(sys.argv) > 2:
if sys.argv[2] == 'DEBUG':
debugMode = len(sys.argv) > 2
print('Debug Mode: ', debugMode)
t.Debug()
message = t.Test()
print(message)
if message.find("TEST_SUCCESS") < 0:
debugMode = len(sys.argv) > 2
print('Debug Mode: ', debugMode)
t.Debug()
else:
specifyOpName = sys.argv[2]
t.TestName(specifyOpName)

@ -2,7 +2,7 @@

## Build
### Building and installing MNN
- Build MNN with the MNN_SUPPORT_TRAIN option enabled: cmake .. -DMNN_SUPPORT_TRAIN=true
- Build MNN with the MNN_BUILD_TRAIN option enabled: cmake .. -DMNN_BUILD_TRAIN=true

### Build outputs
- transformer.out

@ -11,6 +11,7 @@
- train.out
- backendTest.out
- backwardTest.out
- runTrainDemo.out


## Usage

@ -29,6 +29,35 @@ using namespace MNN::Express;
using namespace MNN::Train;
using namespace std;


VARP getLocalLearningRate(std::string pName, std::vector<std::vector<std::string>> weightNameGroups, std::vector<std::string> lrNames,
                          std::map<std::string, VARP> &lrMap, std::map<std::string, std::string> &extraInputs) {
    bool hasLocalOptConf = false;
    std::string localLrName;
    for (int ii = 0; ii < weightNameGroups.size(); ii++) {
        if (std::find(weightNameGroups[ii].begin(), weightNameGroups[ii].end(), pName) != weightNameGroups[ii].end()) {
            hasLocalOptConf = true;
            localLrName = lrNames[ii];
            break;
        }
    }
    if (!hasLocalOptConf) {
        localLrName = "LearningRate";
    }
    VARP localLearningRate;
    if (lrMap.find(localLrName) != lrMap.end()) {
        localLearningRate = lrMap[localLrName];
    } else {
        auto newLr = _Input({}, NCHW);
        newLr->setName(localLrName);
        lrMap[localLrName] = newLr;
        localLearningRate = newLr;
    }
    extraInputs[localLrName] = "float";
    return localLearningRate;
}


int main(int argc, const char* argv[]) {
    if (argc < 4) {
        MNN_PRINT("Usage: ./transformer.out temp.bin dst.bin config.json\n");

@ -54,34 +83,59 @@ int main(int argc, const char* argv[]) {
|
|||
std::vector<std::string> onlyUpdateOps;
|
||||
std::vector<std::string> stopBackPropOps;
|
||||
std::string optimizerType = "SGD";
|
||||
if (configObject.HasMember("Optimizor")) {
|
||||
auto optimizor = configObject["Optimizor"].GetObject();
|
||||
if (optimizor.HasMember("OnlyUpdateOps")) {
|
||||
auto limitArray = optimizor["OnlyUpdateOps"].GetArray();
|
||||
std::vector<std::string> fixAsConstOps;
|
||||
std::vector<std::vector<std::string>> weightNameGroups;
|
||||
std::vector<std::string> lrNames;
|
||||
if (configObject.HasMember("Optimizer")) {
|
||||
auto optimizer = configObject["Optimizer"].GetObject();
|
||||
if (optimizer.HasMember("OnlyUpdateOps")) {
|
||||
auto limitArray = optimizer["OnlyUpdateOps"].GetArray();
|
||||
for (auto vIter = limitArray.begin(); vIter != limitArray.end(); vIter++) {
|
||||
onlyUpdateOps.emplace_back(vIter->GetString());
|
||||
MNN_PRINT("will only update: %s \n", vIter->GetString());
|
||||
}
|
||||
}
|
||||
if (optimizor.HasMember("NoUpdateOps")) {
|
||||
auto limitArray = optimizor["NoUpdateOps"].GetArray();
|
||||
if (optimizer.HasMember("NoUpdateOps")) {
|
||||
auto limitArray = optimizer["NoUpdateOps"].GetArray();
|
||||
for (auto vIter = limitArray.begin(); vIter != limitArray.end(); vIter++) {
|
||||
noUpdateOps.emplace_back(vIter->GetString());
|
||||
if (onlyUpdateOps.empty())
|
||||
MNN_PRINT("will not update: %s \n", vIter->GetString());
|
||||
}
|
||||
}
|
||||
if (optimizor.HasMember("StopBackPropOps")) {
|
||||
auto limitArray = optimizor["StopBackPropOps"].GetArray();
|
||||
if (optimizer.HasMember("StopBackPropOps")) {
|
||||
auto limitArray = optimizer["StopBackPropOps"].GetArray();
|
||||
for (auto vIter = limitArray.begin(); vIter != limitArray.end(); vIter++) {
|
||||
stopBackPropOps.emplace_back(vIter->GetString());
|
||||
MNN_PRINT("will stop back prop from (also not update this op): %s \n", vIter->GetString());
|
||||
}
|
||||
}
|
||||
if (optimizor.HasMember("type")) {
|
||||
optimizerType = std::string(optimizor["type"].GetString());
|
||||
if (optimizer.HasMember("type")) {
|
||||
optimizerType = std::string(optimizer["type"].GetString());
|
||||
MNN_PRINT("optimizer type: %s\n", optimizerType.c_str());
|
||||
}
|
||||
if (optimizer.HasMember("FixAsConstOps")) {
|
||||
auto limitArray = optimizer["FixAsConstOps"].GetArray();
|
||||
for (auto vIter = limitArray.begin(); vIter != limitArray.end(); vIter++) {
|
||||
fixAsConstOps.emplace_back(vIter->GetString());
|
||||
MNN_PRINT("this op will be fixed as Const, and maybe turn to Trainable later: %s \n", vIter->GetString());
|
||||
}
|
||||
}
|
||||
if (optimizer.HasMember("ParameterOptConfig")) {
|
||||
auto pConf = optimizer["ParameterOptConfig"].GetArray();
|
||||
for (auto vIter = pConf.begin(); vIter != pConf.end(); vIter++) {
|
||||
auto conf = vIter->GetObject();
|
||||
if (conf.HasMember("WeightNames") && conf.HasMember("LrName")) {
|
||||
auto wn = conf["WeightNames"].GetArray();
|
||||
std::vector<std::string> wNames;
|
||||
for (auto wIter = wn.begin(); wIter != wn.end(); wIter++) {
|
||||
wNames.push_back(wIter->GetString());
|
||||
}
|
||||
weightNameGroups.push_back(wNames);
|
||||
lrNames.push_back(conf["LrName"].GetString());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
auto bnMomentum = new MNN::AttributeT;
|
||||
bnMomentum->f = 0.99;
|
||||
|
@ -100,6 +154,17 @@ int main(int argc, const char* argv[]) {
|
|||
inputVars = inputsOutputs.first;
|
||||
outputVars = inputsOutputs.second;
|
||||
}
|
||||
for (auto& varIter : inputVars) {
|
||||
auto var = varIter.second;
|
||||
auto varInfo = var->getInfo();
|
||||
auto vDims = varInfo->dim;
|
||||
|
||||
if (!fixAsConstOps.empty()) {
|
||||
if (std::find(fixAsConstOps.begin(), fixAsConstOps.end(), var->name()) != fixAsConstOps.end()) {
|
||||
var.fix(VARP::CONSTANT);
|
||||
}
|
||||
}
|
||||
}
|
||||
Transformer::TrainConfig trainConfig;
|
||||
trainConfig.noUpdateOps = std::move(noUpdateOps);
|
||||
trainConfig.onlyUpdateOps = std::move(onlyUpdateOps);
|
||||
|
@ -185,15 +250,19 @@ int main(int argc, const char* argv[]) {
|
|||
}
|
||||
}
|
||||
}
|
||||
auto lossInfo = loss->getInfo();
|
||||
MNN_ASSERT(nullptr != loss);
|
||||
auto gradMap = OpGrad::grad(loss, parameters, stopBackPropOps);
|
||||
// Make Update
|
||||
std::map<VARP, VARP> varUpdateMap;
|
||||
auto learningRate = _Input();
|
||||
auto learningRate = _Input({}, NCHW);
|
||||
learningRate->setName("LearningRate");
|
||||
auto weightDecay = _Input();
|
||||
auto weightDecay = _Input({}, NCHW);
|
||||
weightDecay->setName("WeightDecay");
|
||||
|
||||
std::map<std::string, VARP> lrMap;
|
||||
lrMap["LearningRate"] = learningRate;
|
||||
|
||||
auto step = _Scalar<float>(1.0f);
|
||||
step->setName("optimize_step");
|
||||
step.fix(VARP::TRAINABLE);
|
||||
|
@ -209,12 +278,13 @@ int main(int argc, const char* argv[]) {
|
|||
}
|
||||
|
||||
if (optimizerType == "SGD") {
|
||||
auto momentum = _Input();
|
||||
auto momentum = _Input({}, NCHW);
|
||||
momentum->setName("Momentum");
|
||||
extraInputs["Momentum"] = "float";
|
||||
|
||||
for (auto iter : gradMap) {
|
||||
auto p = iter.first;
|
||||
MNN_PRINT("optimize variable: %s\n", p->name().c_str());
|
||||
p.fix(VARP::TRAINABLE);
|
||||
auto grad = iter.second;
|
||||
grad->setName(p->name()+"_grad");
|
||||
|
@ -251,7 +321,9 @@ int main(int argc, const char* argv[]) {
|
|||
auto newHistory = gradWithDecay + momentum * history;
|
||||
newHistory->setName("update_" + history->name());
|
||||
|
||||
auto finalGrad = learningRate * history;
|
||||
VARP localLearningRate = getLocalLearningRate(p->name(), weightNameGroups, lrNames, lrMap, extraInputs);
|
||||
MNN_PRINT("variable: %s, lr name: %s\n", p->name().c_str(), localLearningRate->name().c_str());
|
||||
VARP finalGrad = localLearningRate * history;
|
||||
finalGrad->setName(p->name() + "_final_grad");
|
||||
|
||||
auto updateValue = _Subtract(p, finalGrad);
|
||||
|
@ -260,11 +332,11 @@ int main(int argc, const char* argv[]) {
|
|||
varUpdateMap[history] = newHistory;
|
||||
}
|
||||
} else if (optimizerType == "ADAM") {
|
||||
auto beta1 = _Input();
|
||||
auto beta1 = _Input({}, NCHW);
|
||||
beta1->setName("Beta1");
|
||||
auto beta2 = _Input();
|
||||
auto beta2 = _Input({}, NCHW);
|
||||
beta2->setName("Beta2");
|
||||
auto eps = _Input();
|
||||
auto eps = _Input({}, NCHW);
|
||||
eps->setName("Eps");
|
||||
|
||||
extraInputs["Beta1"] = "float";
|
||||
|
@ -276,6 +348,7 @@ int main(int argc, const char* argv[]) {
|
|||
|
||||
for (auto iter : gradMap) {
|
||||
auto p = iter.first;
|
||||
MNN_PRINT("optimize variable: %s\n", p->name().c_str());
|
||||
p.fix(VARP::TRAINABLE);
|
||||
auto grad = iter.second;
|
||||
grad->setName(p->name()+"_grad");
|
||||
|
@ -317,7 +390,9 @@ int main(int argc, const char* argv[]) {
|
|||
auto newHistory2 = beta2 * history2 + (_Scalar(1.0f) - beta2) * _Square(gradWithDecay);
|
||||
newHistory2->setName("update_" + history2->name());
|
||||
|
||||
auto finalGrad = learningRate * correction * (history1 / (_Sqrt(history2 + _Scalar<float>(1e-8)) + eps));
|
||||
VARP localLearningRate = getLocalLearningRate(p->name(), weightNameGroups, lrNames, lrMap, extraInputs);
|
||||
MNN_PRINT("variable: %s, lr name: %s\n", p->name().c_str(), localLearningRate->name().c_str());
|
||||
auto finalGrad = localLearningRate * correction * (history1 / (_Sqrt(history2 + _Scalar<float>(1e-8)) + eps));
|
||||
finalGrad->setName(p->name() + "_final_grad");
|
||||
|
||||
auto updateValue = _Subtract(p, finalGrad);
|
||||
|
|
|
@ -79,6 +79,11 @@ public:
|
|||
for (int i = 0; i < expr->outputSize(); ++i) {
|
||||
output[i] = Variable::create(expr, i);
|
||||
}
|
||||
int activateType = op->main_as_BinaryOp()->activationType();
|
||||
if (activateType == 1) { // relu
|
||||
auto mask = _Cast<float>(_Greater(output[0], _Scalar(0.0f)));
|
||||
outputDiff = mask * backwardOutput[0];
|
||||
}
|
||||
switch (op->main_as_BinaryOp()->opType()) {
|
||||
case BinaryOpOperation_ADD: {
|
||||
res[0] = outputDiff;
|
||||
|
|
|
@ -1,20 +1,28 @@
{
    "Train": true,
    "Loss": {
        "op": "output"
        "op": "loss"
    },
    "Optimizor": {
    "Optimizer": {
        "OnlyUpdateOps":[],
        "NoUpdateOps":[],
        "StopBackPropOps":[],
        "type": "SGD"
        "type": "SGD",
        "ParameterOptConfig":[
            {
                "WeightNames":["example_Weight1", "example_Weight2"],
                "LrName":"LearningRate2"
            },
            {
                "WeightNames":["example_Weight3"],
                "LrName":"LearningRate3"
            }
        ],
        "FixAsConstOps":[]
    },
    "BatchNorm": {
        "momentum":0.99
    },
    "Debug": {
        "L2Norm": []
    },
    "Shape": {
        "input": [1, 3, 224, 224]
    }

@ -4,8 +4,5 @@
        "OnlyUpdateOps":[],
        "NoUpdateOps":[],
        "type": "SGD"
    },
    "Debug": {
        "L2Norm": []
    }
}